diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/steps/docs.py b/.buildkite/dagster-buildkite/dagster_buildkite/steps/docs.py
index 8bbf7ed182143..43aed1dcdbd64 100644
--- a/.buildkite/dagster-buildkite/dagster_buildkite/steps/docs.py
+++ b/.buildkite/dagster-buildkite/dagster_buildkite/steps/docs.py
@@ -6,39 +6,12 @@ def docs_steps() -> List[dict]:
     return [
-        # No docs tests for now
-        # StepBuilder("docs validate-libraries")
-        # .run("pip install -e python_modules/automation", "dagster-docs validate-libraries")
-        # .on_integration_image(SupportedPython.V3_7)
-        # .build(),
-        # StepBuilder("docs next build tests")
-        # .run(
-        #     "pip install -e python_modules/automation",
-        #     "pip install -r docs-requirements.txt -qqq",
-        #     "cd docs",
-        #     "make NODE_ENV=production VERSION=master full_docs_build",
-        # )
-        # .on_integration_image(SupportedPython.V3_7)
-        # .build(),
-        # StepBuilder("docs next tests")
-        # .run(
-        #     "pip install -e python_modules/automation",
-        #     "pip install -r docs-requirements.txt -qqq",
-        #     "cd docs",
-        #     "make buildnext",
-        #     "cd next",
-        #     "yarn test",
-        # )
-        # .on_integration_image(SupportedPython.V3_7)
-        # .build(),
-        # StepBuilder(":coverage: docs")
+        # TODO: Yuhan to fix
+        # StepBuilder("docs sphinx build")
         # .run(
-        #     "make install_dev_python_modules",
         #     "pip install -e python_modules/automation",
         #     "pip install -r docs-requirements.txt -qqq",
-        #     "cd docs",
-        #     "make updateindex",
-        #     "pytest -vv test_doc_build.py",
+        #     "pushd docs; make build",
         #     "git diff --exit-code",
         # )
         # .on_integration_image(SupportedPython.V3_7)
diff --git a/docs/Makefile b/docs/Makefile
index c81ad051269af..9fe2425f6e47c 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1,2 +1,2 @@
 build:
-	pushd sphinx; popd; python pack_json.py
\ No newline at end of file
+	cd sphinx; make clean; make json; cd ..; python pack_json.py
\ No newline at end of file
diff --git a/docs/next/content/api/modules.json b/docs/next/content/api/modules.json
new file mode 100644
index 0000000000000..42abd0914a3e4
--- /dev/null
+++ b/docs/next/content/api/modules.json
@@ -0,0 +1,1480 @@
+{
+  "": {
+    "abc": {
+      "alabaster_version": "0.7.12",
+      "body": "
\n# Copyright 2007 Google, Inc. All Rights Reserved.\n# Licensed to PSF under a Contributor Agreement.\n\n"""Abstract Base Classes (ABCs) according to PEP 3119."""\n\n\ndef abstractmethod(funcobj):\n """A decorator indicating abstract methods.\n\n Requires that the metaclass is ABCMeta or derived from it. A\n class that has a metaclass derived from ABCMeta cannot be\n instantiated unless all of its abstract methods are overridden.\n The abstract methods can be called using any of the normal\n 'super' call mechanisms. abstractmethod() may be used to declare\n abstract methods for properties and descriptors.\n\n Usage:\n\n class C(metaclass=ABCMeta):\n @abstractmethod\n def my_abstract_method(self, ...):\n ...\n """\n funcobj.__isabstractmethod__ = True\n return funcobj\n\n\nclass abstractclassmethod(classmethod):\n """A decorator indicating abstract classmethods.\n\n Deprecated, use 'classmethod' with 'abstractmethod' instead.\n """\n\n __isabstractmethod__ = True\n\n def __init__(self, callable):\n callable.__isabstractmethod__ = True\n super().__init__(callable)\n\n\nclass abstractstaticmethod(staticmethod):\n """A decorator indicating abstract staticmethods.\n\n Deprecated, use 'staticmethod' with 'abstractmethod' instead.\n """\n\n __isabstractmethod__ = True\n\n def __init__(self, callable):\n callable.__isabstractmethod__ = True\n super().__init__(callable)\n\n\nclass abstractproperty(property):\n """A decorator indicating abstract properties.\n\n Deprecated, use 'property' with 'abstractmethod' instead.\n """\n\n __isabstractmethod__ = True\n\n\ntry:\n from _abc import (get_cache_token, _abc_init, _abc_register,\n _abc_instancecheck, _abc_subclasscheck, _get_dump,\n _reset_registry, _reset_caches)\nexcept ImportError:\n from _py_abc import ABCMeta, get_cache_token\n ABCMeta.__module__ = 'abc'\nelse:\n class ABCMeta(type):\n """Metaclass for defining Abstract Base Classes (ABCs).\n\n Use this metaclass to create an ABC. An ABC can be subclassed\n directly, and then acts as a mix-in class. You can also register\n unrelated concrete classes (even built-in classes) and unrelated\n ABCs as 'virtual subclasses' -- these and their descendants will\n be considered subclasses of the registering ABC by the built-in\n issubclass() function, but the registering ABC won't show up in\n their MRO (Method Resolution Order) nor will method\n implementations defined by the registering ABC be callable (not\n even via super()).\n """\n def __new__(mcls, name, bases, namespace, **kwargs):\n cls = super().__new__(mcls, name, bases, namespace, **kwargs)\n _abc_init(cls)\n return cls\n\n def register(cls, subclass):\n """Register a virtual subclass of an ABC.\n\n Returns the subclass, to allow usage as a class decorator.\n """\n return _abc_register(cls, subclass)\n\n def __instancecheck__(cls, instance):\n """Override for isinstance(instance, cls)."""\n return _abc_instancecheck(cls, instance)\n\n def __subclasscheck__(cls, subclass):\n """Override for issubclass(subclass, cls)."""\n return _abc_subclasscheck(cls, subclass)\n\n def _dump_registry(cls, file=None):\n """Debug helper to print the ABC registry."""\n print(f"Class: {cls.__module__}.{cls.__qualname__}", file=file)\n print(f"Inv. 
counter: {get_cache_token()}", file=file)\n (_abc_registry, _abc_cache, _abc_negative_cache,\n _abc_negative_cache_version) = _get_dump(cls)\n print(f"_abc_registry: {_abc_registry!r}", file=file)\n print(f"_abc_cache: {_abc_cache!r}", file=file)\n print(f"_abc_negative_cache: {_abc_negative_cache!r}", file=file)\n print(f"_abc_negative_cache_version: {_abc_negative_cache_version!r}",\n file=file)\n\n def _abc_registry_clear(cls):\n """Clear the registry (for debugging or testing)."""\n _reset_registry(cls)\n\n def _abc_caches_clear(cls):\n """Clear the caches (for debugging or testing)."""\n _reset_caches(cls)\n\n\nclass ABC(metaclass=ABCMeta):\n """Helper class that provides a standard way to create an ABC using\n inheritance.\n """\n __slots__ = ()\n
\n"""\nThe typing module: Support for gradual typing as defined by PEP 484.\n\nAt large scale, the structure of the module is following:\n* Imports and exports, all public names should be explicitly added to __all__.\n* Internal helper functions: these should never be used in code outside this module.\n* _SpecialForm and its instances (special forms): Any, NoReturn, ClassVar, Union, Optional\n* Two classes whose instances can be type arguments in addition to types: ForwardRef and TypeVar\n* The core of internal generics API: _GenericAlias and _VariadicGenericAlias, the latter is\n currently only used by Tuple and Callable. All subscripted types like X[int], Union[int, str],\n etc., are instances of either of these classes.\n* The public counterpart of the generics API consists of two classes: Generic and Protocol.\n* Public helper functions: get_type_hints, overload, cast, no_type_check,\n no_type_check_decorator.\n* Generic aliases for collections.abc ABCs and few additional protocols.\n* Special types: NewType, NamedTuple, TypedDict (may be added soon).\n* Wrapper submodules for re and io related types.\n"""\n\nfrom abc import abstractmethod, ABCMeta\nimport collections\nimport collections.abc\nimport contextlib\nimport functools\nimport operator\nimport re as stdlib_re # Avoid confusion with the re we export.\nimport sys\nimport types\nfrom types import WrapperDescriptorType, MethodWrapperType, MethodDescriptorType\n\n# Please keep __all__ alphabetized within each category.\n__all__ = [\n # Super-special typing primitives.\n 'Any',\n 'Callable',\n 'ClassVar',\n 'Final',\n 'ForwardRef',\n 'Generic',\n 'Literal',\n 'Optional',\n 'Protocol',\n 'Tuple',\n 'Type',\n 'TypeVar',\n 'Union',\n\n # ABCs (from collections.abc).\n 'AbstractSet', # collections.abc.Set.\n 'ByteString',\n 'Container',\n 'ContextManager',\n 'Hashable',\n 'ItemsView',\n 'Iterable',\n 'Iterator',\n 'KeysView',\n 'Mapping',\n 'MappingView',\n 'MutableMapping',\n 'MutableSequence',\n 'MutableSet',\n 'Sequence',\n 'Sized',\n 'ValuesView',\n 'Awaitable',\n 'AsyncIterator',\n 'AsyncIterable',\n 'Coroutine',\n 'Collection',\n 'AsyncGenerator',\n 'AsyncContextManager',\n\n # Structural checks, a.k.a. protocols.\n 'Reversible',\n 'SupportsAbs',\n 'SupportsBytes',\n 'SupportsComplex',\n 'SupportsFloat',\n 'SupportsIndex',\n 'SupportsInt',\n 'SupportsRound',\n\n # Concrete collection types.\n 'ChainMap',\n 'Counter',\n 'Deque',\n 'Dict',\n 'DefaultDict',\n 'List',\n 'OrderedDict',\n 'Set',\n 'FrozenSet',\n 'NamedTuple', # Not really a type.\n 'TypedDict', # Not really a type.\n 'Generator',\n\n # One-off things.\n 'AnyStr',\n 'cast',\n 'final',\n 'get_args',\n 'get_origin',\n 'get_type_hints',\n 'NewType',\n 'no_type_check',\n 'no_type_check_decorator',\n 'NoReturn',\n 'overload',\n 'runtime_checkable',\n 'Text',\n 'TYPE_CHECKING',\n]\n\n# The pseudo-submodules 're' and 'io' are part of the public\n# namespace, but excluded from __all__ because they might stomp on\n# legitimate imports of those modules.\n\n\ndef _type_check(arg, msg, is_argument=True):\n """Check that the argument is a type, and return it (internal helper).\n\n As a special case, accept None and return type(None) instead. Also wrap strings\n into ForwardRef instances. 
Consider several corner cases, for example plain\n special forms like Union are not valid, while Union[int, str] is OK, etc.\n The msg argument is a human-readable error message, e.g::\n\n "Union[arg, ...]: arg should be a type."\n\n We append the repr() of the actual value (truncated to 100 chars).\n """\n invalid_generic_forms = (Generic, Protocol)\n if is_argument:\n invalid_generic_forms = invalid_generic_forms + (ClassVar, Final)\n\n if arg is None:\n return type(None)\n if isinstance(arg, str):\n return ForwardRef(arg)\n if (isinstance(arg, _GenericAlias) and\n arg.__origin__ in invalid_generic_forms):\n raise TypeError(f"{arg} is not valid as type argument")\n if (isinstance(arg, _SpecialForm) and arg not in (Any, NoReturn) or\n arg in (Generic, Protocol)):\n raise TypeError(f"Plain {arg} is not valid as type argument")\n if isinstance(arg, (type, TypeVar, ForwardRef)):\n return arg\n if not callable(arg):\n raise TypeError(f"{msg} Got {arg!r:.100}.")\n return arg\n\n\ndef _type_repr(obj):\n """Return the repr() of an object, special-casing types (internal helper).\n\n If obj is a type, we return a shorter version than the default\n type.__repr__, based on the module and qualified name, which is\n typically enough to uniquely identify a type. For everything\n else, we fall back on repr(obj).\n """\n if isinstance(obj, type):\n if obj.__module__ == 'builtins':\n return obj.__qualname__\n return f'{obj.__module__}.{obj.__qualname__}'\n if obj is ...:\n return('...')\n if isinstance(obj, types.FunctionType):\n return obj.__name__\n return repr(obj)\n\n\ndef _collect_type_vars(types):\n """Collect all type variable contained in types in order of\n first appearance (lexicographic order). For example::\n\n _collect_type_vars((T, List[S, T])) == (T, S)\n """\n tvars = []\n for t in types:\n if isinstance(t, TypeVar) and t not in tvars:\n tvars.append(t)\n if isinstance(t, _GenericAlias) and not t._special:\n tvars.extend([t for t in t.__parameters__ if t not in tvars])\n return tuple(tvars)\n\n\ndef _subs_tvars(tp, tvars, subs):\n """Substitute type variables 'tvars' with substitutions 'subs'.\n These two must have the same length.\n """\n if not isinstance(tp, _GenericAlias):\n return tp\n new_args = list(tp.__args__)\n for a, arg in enumerate(tp.__args__):\n if isinstance(arg, TypeVar):\n for i, tvar in enumerate(tvars):\n if arg == tvar:\n new_args[a] = subs[i]\n else:\n new_args[a] = _subs_tvars(arg, tvars, subs)\n if tp.__origin__ is Union:\n return Union[tuple(new_args)]\n return tp.copy_with(tuple(new_args))\n\n\ndef _check_generic(cls, parameters):\n """Check correct count for parameters of a generic cls (internal helper).\n This gives a nice error message in case of count mismatch.\n """\n if not cls.__parameters__:\n raise TypeError(f"{cls} is not a generic class")\n alen = len(parameters)\n elen = len(cls.__parameters__)\n if alen != elen:\n raise TypeError(f"Too {'many' if alen > elen else 'few'} parameters for {cls};"\n f" actual {alen}, expected {elen}")\n\n\ndef _remove_dups_flatten(parameters):\n """An internal helper for Union creation and substitution: flatten Unions\n among parameters, then remove duplicates.\n """\n # Flatten out Union[Union[...], ...].\n params = []\n for p in parameters:\n if isinstance(p, _GenericAlias) and p.__origin__ is Union:\n params.extend(p.__args__)\n elif isinstance(p, tuple) and len(p) > 0 and p[0] is Union:\n params.extend(p[1:])\n else:\n params.append(p)\n # Weed out strict duplicates, preserving the first of each occurrence.\n 
all_params = set(params)\n if len(all_params) < len(params):\n new_params = []\n for t in params:\n if t in all_params:\n new_params.append(t)\n all_params.remove(t)\n params = new_params\n assert not all_params, all_params\n return tuple(params)\n\n\n_cleanups = []\n\n\ndef _tp_cache(func):\n """Internal wrapper caching __getitem__ of generic types with a fallback to\n original function for non-hashable arguments.\n """\n cached = functools.lru_cache()(func)\n _cleanups.append(cached.cache_clear)\n\n @functools.wraps(func)\n def inner(*args, **kwds):\n try:\n return cached(*args, **kwds)\n except TypeError:\n pass # All real errors (not unhashable args) are raised below.\n return func(*args, **kwds)\n return inner\n\n\ndef _eval_type(t, globalns, localns):\n """Evaluate all forward reverences in the given type t.\n For use of globalns and localns see the docstring for get_type_hints().\n """\n if isinstance(t, ForwardRef):\n return t._evaluate(globalns, localns)\n if isinstance(t, _GenericAlias):\n ev_args = tuple(_eval_type(a, globalns, localns) for a in t.__args__)\n if ev_args == t.__args__:\n return t\n res = t.copy_with(ev_args)\n res._special = t._special\n return res\n return t\n\n\nclass _Final:\n """Mixin to prohibit subclassing"""\n\n __slots__ = ('__weakref__',)\n\n def __init_subclass__(self, /, *args, **kwds):\n if '_root' not in kwds:\n raise TypeError("Cannot subclass special typing classes")\n\nclass _Immutable:\n """Mixin to indicate that object should not be copied."""\n\n def __copy__(self):\n return self\n\n def __deepcopy__(self, memo):\n return self\n\n\nclass _SpecialForm(_Final, _Immutable, _root=True):\n """Internal indicator of special typing constructs.\n See _doc instance attribute for specific docs.\n """\n\n __slots__ = ('_name', '_doc')\n\n def __new__(cls, *args, **kwds):\n """Constructor.\n\n This only exists to give a better error message in case\n someone tries to subclass a special typing object (not a good idea).\n """\n if (len(args) == 3 and\n isinstance(args[0], str) and\n isinstance(args[1], tuple)):\n # Close enough.\n raise TypeError(f"Cannot subclass {cls!r}")\n return super().__new__(cls)\n\n def __init__(self, name, doc):\n self._name = name\n self._doc = doc\n\n def __eq__(self, other):\n if not isinstance(other, _SpecialForm):\n return NotImplemented\n return self._name == other._name\n\n def __hash__(self):\n return hash((self._name,))\n\n def __repr__(self):\n return 'typing.' 
+ self._name\n\n def __reduce__(self):\n return self._name\n\n def __call__(self, *args, **kwds):\n raise TypeError(f"Cannot instantiate {self!r}")\n\n def __instancecheck__(self, obj):\n raise TypeError(f"{self} cannot be used with isinstance()")\n\n def __subclasscheck__(self, cls):\n raise TypeError(f"{self} cannot be used with issubclass()")\n\n @_tp_cache\n def __getitem__(self, parameters):\n if self._name in ('ClassVar', 'Final'):\n item = _type_check(parameters, f'{self._name} accepts only single type.')\n return _GenericAlias(self, (item,))\n if self._name == 'Union':\n if parameters == ():\n raise TypeError("Cannot take a Union of no types.")\n if not isinstance(parameters, tuple):\n parameters = (parameters,)\n msg = "Union[arg, ...]: each arg must be a type."\n parameters = tuple(_type_check(p, msg) for p in parameters)\n parameters = _remove_dups_flatten(parameters)\n if len(parameters) == 1:\n return parameters[0]\n return _GenericAlias(self, parameters)\n if self._name == 'Optional':\n arg = _type_check(parameters, "Optional[t] requires a single type.")\n return Union[arg, type(None)]\n if self._name == 'Literal':\n # There is no '_type_check' call because arguments to Literal[...] are\n # values, not types.\n return _GenericAlias(self, parameters)\n raise TypeError(f"{self} is not subscriptable")\n\n\nAny = _SpecialForm('Any', doc=\n """Special type indicating an unconstrained type.\n\n - Any is compatible with every type.\n - Any assumed to have all methods.\n - All values assumed to be instances of Any.\n\n Note that all the above statements are true from the point of view of\n static type checkers. At runtime, Any should not be used with instance\n or class checks.\n """)\n\nNoReturn = _SpecialForm('NoReturn', doc=\n """Special type indicating functions that never return.\n Example::\n\n from typing import NoReturn\n\n def stop() -> NoReturn:\n raise Exception('no way')\n\n This type is invalid in other positions, e.g., ``List[NoReturn]``\n will fail in static type checkers.\n """)\n\nClassVar = _SpecialForm('ClassVar', doc=\n """Special type construct to mark class variables.\n\n An annotation wrapped in ClassVar indicates that a given\n attribute is intended to be used as a class variable and\n should not be set on instances of that class. Usage::\n\n class Starship:\n stats: ClassVar[Dict[str, int]] = {} # class variable\n damage: int = 10 # instance variable\n\n ClassVar accepts only types and cannot be further subscribed.\n\n Note that ClassVar is not a class itself, and should not\n be used with isinstance() or issubclass().\n """)\n\nFinal = _SpecialForm('Final', doc=\n """Special typing construct to indicate final names to type checkers.\n\n A final name cannot be re-assigned or overridden in a subclass.\n For example:\n\n MAX_SIZE: Final = 9000\n MAX_SIZE += 1 # Error reported by type checker\n\n class Connection:\n TIMEOUT: Final[int] = 10\n\n class FastConnector(Connection):\n TIMEOUT = 1 # Error reported by type checker\n\n There is no runtime checking of these properties.\n """)\n\nUnion = _SpecialForm('Union', doc=\n """Union type; Union[X, Y] means either X or Y.\n\n To define a union, use e.g. Union[int, str]. 
Details:\n - The arguments must be types and there must be at least one.\n - None as an argument is a special case and is replaced by\n type(None).\n - Unions of unions are flattened, e.g.::\n\n Union[Union[int, str], float] == Union[int, str, float]\n\n - Unions of a single argument vanish, e.g.::\n\n Union[int] == int # The constructor actually returns int\n\n - Redundant arguments are skipped, e.g.::\n\n Union[int, str, int] == Union[int, str]\n\n - When comparing unions, the argument order is ignored, e.g.::\n\n Union[int, str] == Union[str, int]\n\n - You cannot subclass or instantiate a union.\n - You can use Optional[X] as a shorthand for Union[X, None].\n """)\n\nOptional = _SpecialForm('Optional', doc=\n """Optional type.\n\n Optional[X] is equivalent to Union[X, None].\n """)\n\nLiteral = _SpecialForm('Literal', doc=\n """Special typing form to define literal types (a.k.a. value types).\n\n This form can be used to indicate to type checkers that the corresponding\n variable or function parameter has a value equivalent to the provided\n literal (or one of several literals):\n\n def validate_simple(data: Any) -> Literal[True]: # always returns True\n ...\n\n MODE = Literal['r', 'rb', 'w', 'wb']\n def open_helper(file: str, mode: MODE) -> str:\n ...\n\n open_helper('/some/path', 'r') # Passes type check\n open_helper('/other/path', 'typo') # Error in type checker\n\n Literal[...] cannot be subclassed. At runtime, an arbitrary value\n is allowed as type argument to Literal[...], but type checkers may\n impose restrictions.\n """)\n\n\nclass ForwardRef(_Final, _root=True):\n """Internal wrapper to hold a forward reference."""\n\n __slots__ = ('__forward_arg__', '__forward_code__',\n '__forward_evaluated__', '__forward_value__',\n '__forward_is_argument__')\n\n def __init__(self, arg, is_argument=True):\n if not isinstance(arg, str):\n raise TypeError(f"Forward reference must be a string -- got {arg!r}")\n try:\n code = compile(arg, '<string>', 'eval')\n except SyntaxError:\n raise SyntaxError(f"Forward reference must be an expression -- got {arg!r}")\n self.__forward_arg__ = arg\n self.__forward_code__ = code\n self.__forward_evaluated__ = False\n self.__forward_value__ = None\n self.__forward_is_argument__ = is_argument\n\n def _evaluate(self, globalns, localns):\n if not self.__forward_evaluated__ or localns is not globalns:\n if globalns is None and localns is None:\n globalns = localns = {}\n elif globalns is None:\n globalns = localns\n elif localns is None:\n localns = globalns\n self.__forward_value__ = _type_check(\n eval(self.__forward_code__, globalns, localns),\n "Forward references must evaluate to types.",\n is_argument=self.__forward_is_argument__)\n self.__forward_evaluated__ = True\n return self.__forward_value__\n\n def __eq__(self, other):\n if not isinstance(other, ForwardRef):\n return NotImplemented\n if self.__forward_evaluated__ and other.__forward_evaluated__:\n return (self.__forward_arg__ == other.__forward_arg__ and\n self.__forward_value__ == other.__forward_value__)\n return self.__forward_arg__ == other.__forward_arg__\n\n def __hash__(self):\n return hash(self.__forward_arg__)\n\n def __repr__(self):\n return f'ForwardRef({self.__forward_arg__!r})'\n\n\nclass TypeVar(_Final, _Immutable, _root=True):\n """Type variable.\n\n Usage::\n\n T = TypeVar('T') # Can be anything\n A = TypeVar('A', str, bytes) # Must be str or bytes\n\n Type variables exist primarily for the benefit of static type\n checkers. 
They serve as the parameters for generic types as well\n as for generic function definitions. See class Generic for more\n information on generic types. Generic functions work as follows:\n\n def repeat(x: T, n: int) -> List[T]:\n '''Return a list containing n references to x.'''\n return [x]*n\n\n def longest(x: A, y: A) -> A:\n '''Return the longest of two strings.'''\n return x if len(x) >= len(y) else y\n\n The latter example's signature is essentially the overloading\n of (str, str) -> str and (bytes, bytes) -> bytes. Also note\n that if the arguments are instances of some subclass of str,\n the return type is still plain str.\n\n At runtime, isinstance(x, T) and issubclass(C, T) will raise TypeError.\n\n Type variables defined with covariant=True or contravariant=True\n can be used to declare covariant or contravariant generic types.\n See PEP 484 for more details. By default generic types are invariant\n in all type variables.\n\n Type variables can be introspected. e.g.:\n\n T.__name__ == 'T'\n T.__constraints__ == ()\n T.__covariant__ == False\n T.__contravariant__ = False\n A.__constraints__ == (str, bytes)\n\n Note that only type variables defined in global scope can be pickled.\n """\n\n __slots__ = ('__name__', '__bound__', '__constraints__',\n '__covariant__', '__contravariant__')\n\n def __init__(self, name, *constraints, bound=None,\n covariant=False, contravariant=False):\n self.__name__ = name\n if covariant and contravariant:\n raise ValueError("Bivariant types are not supported.")\n self.__covariant__ = bool(covariant)\n self.__contravariant__ = bool(contravariant)\n if constraints and bound is not None:\n raise TypeError("Constraints cannot be combined with bound=...")\n if constraints and len(constraints) == 1:\n raise TypeError("A single constraint is not allowed")\n msg = "TypeVar(name, constraint, ...): constraints must be types."\n self.__constraints__ = tuple(_type_check(t, msg) for t in constraints)\n if bound:\n self.__bound__ = _type_check(bound, "Bound must be a type.")\n else:\n self.__bound__ = None\n def_mod = sys._getframe(1).f_globals['__name__'] # for pickling\n if def_mod != 'typing':\n self.__module__ = def_mod\n\n def __repr__(self):\n if self.__covariant__:\n prefix = '+'\n elif self.__contravariant__:\n prefix = '-'\n else:\n prefix = '~'\n return prefix + self.__name__\n\n def __reduce__(self):\n return self.__name__\n\n\n# Special typing constructs Union, Optional, Generic, Callable and Tuple\n# use three special attributes for internal bookkeeping of generic types:\n# * __parameters__ is a tuple of unique free type parameters of a generic\n# type, for example, Dict[T, T].__parameters__ == (T,);\n# * __origin__ keeps a reference to a type that was subscripted,\n# e.g., Union[T, int].__origin__ == Union, or the non-generic version of\n# the type.\n# * __args__ is a tuple of all arguments used in subscripting,\n# e.g., Dict[T, int].__args__ == (T, int).\n\n\n# Mapping from non-generic type names that have a generic alias in typing\n# but with a different name.\n_normalize_alias = {'list': 'List',\n 'tuple': 'Tuple',\n 'dict': 'Dict',\n 'set': 'Set',\n 'frozenset': 'FrozenSet',\n 'deque': 'Deque',\n 'defaultdict': 'DefaultDict',\n 'type': 'Type',\n 'Set': 'AbstractSet'}\n\ndef _is_dunder(attr):\n return attr.startswith('__') and attr.endswith('__')\n\n\nclass _GenericAlias(_Final, _root=True):\n """The central part of internal API.\n\n This represents a generic version of type 'origin' with type arguments 'params'.\n There are two kind of these 
aliases: user defined and special. The special ones\n are wrappers around builtin collections and ABCs in collections.abc. These must\n have 'name' always set. If 'inst' is False, then the alias can't be instantiated,\n this is used by e.g. typing.List and typing.Dict.\n """\n def __init__(self, origin, params, *, inst=True, special=False, name=None):\n self._inst = inst\n self._special = special\n if special and name is None:\n orig_name = origin.__name__\n name = _normalize_alias.get(orig_name, orig_name)\n self._name = name\n if not isinstance(params, tuple):\n params = (params,)\n self.__origin__ = origin\n self.__args__ = tuple(... if a is _TypingEllipsis else\n () if a is _TypingEmpty else\n a for a in params)\n self.__parameters__ = _collect_type_vars(params)\n self.__slots__ = None # This is not documented.\n if not name:\n self.__module__ = origin.__module__\n\n @_tp_cache\n def __getitem__(self, params):\n if self.__origin__ in (Generic, Protocol):\n # Can't subscript Generic[...] or Protocol[...].\n raise TypeError(f"Cannot subscript already-subscripted {self}")\n if not isinstance(params, tuple):\n params = (params,)\n msg = "Parameters to generic types must be types."\n params = tuple(_type_check(p, msg) for p in params)\n _check_generic(self, params)\n return _subs_tvars(self, self.__parameters__, params)\n\n def copy_with(self, params):\n # We don't copy self._special.\n return _GenericAlias(self.__origin__, params, name=self._name, inst=self._inst)\n\n def __repr__(self):\n if (self._name != 'Callable' or\n len(self.__args__) == 2 and self.__args__[0] is Ellipsis):\n if self._name:\n name = 'typing.' + self._name\n else:\n name = _type_repr(self.__origin__)\n if not self._special:\n args = f'[{", ".join([_type_repr(a) for a in self.__args__])}]'\n else:\n args = ''\n return (f'{name}{args}')\n if self._special:\n return 'typing.Callable'\n return (f'typing.Callable'\n f'[[{", ".join([_type_repr(a) for a in self.__args__[:-1]])}], '\n f'{_type_repr(self.__args__[-1])}]')\n\n def __eq__(self, other):\n if not isinstance(other, _GenericAlias):\n return NotImplemented\n if self.__origin__ != other.__origin__:\n return False\n if self.__origin__ is Union and other.__origin__ is Union:\n return frozenset(self.__args__) == frozenset(other.__args__)\n return self.__args__ == other.__args__\n\n def __hash__(self):\n if self.__origin__ is Union:\n return hash((Union, frozenset(self.__args__)))\n return hash((self.__origin__, self.__args__))\n\n def __call__(self, *args, **kwargs):\n if not self._inst:\n raise TypeError(f"Type {self._name} cannot be instantiated; "\n f"use {self._name.lower()}() instead")\n result = self.__origin__(*args, **kwargs)\n try:\n result.__orig_class__ = self\n except AttributeError:\n pass\n return result\n\n def __mro_entries__(self, bases):\n if self._name: # generic version of an ABC or built-in class\n res = []\n if self.__origin__ not in bases:\n res.append(self.__origin__)\n i = bases.index(self)\n if not any(isinstance(b, _GenericAlias) or issubclass(b, Generic)\n for b in bases[i+1:]):\n res.append(Generic)\n return tuple(res)\n if self.__origin__ is Generic:\n if Protocol in bases:\n return ()\n i = bases.index(self)\n for b in bases[i+1:]:\n if isinstance(b, _GenericAlias) and b is not self:\n return ()\n return (self.__origin__,)\n\n def __getattr__(self, attr):\n # We are careful for copy and pickle.\n # Also for simplicity we just don't relay all dunder names\n if '__origin__' in self.__dict__ and not _is_dunder(attr):\n return 
getattr(self.__origin__, attr)\n raise AttributeError(attr)\n\n def __setattr__(self, attr, val):\n if _is_dunder(attr) or attr in ('_name', '_inst', '_special'):\n super().__setattr__(attr, val)\n else:\n setattr(self.__origin__, attr, val)\n\n def __instancecheck__(self, obj):\n return self.__subclasscheck__(type(obj))\n\n def __subclasscheck__(self, cls):\n if self._special:\n if not isinstance(cls, _GenericAlias):\n return issubclass(cls, self.__origin__)\n if cls._special:\n return issubclass(cls.__origin__, self.__origin__)\n raise TypeError("Subscripted generics cannot be used with"\n " class and instance checks")\n\n def __reduce__(self):\n if self._special:\n return self._name\n\n if self._name:\n origin = globals()[self._name]\n else:\n origin = self.__origin__\n if (origin is Callable and\n not (len(self.__args__) == 2 and self.__args__[0] is Ellipsis)):\n args = list(self.__args__[:-1]), self.__args__[-1]\n else:\n args = tuple(self.__args__)\n if len(args) == 1 and not isinstance(args[0], tuple):\n args, = args\n return operator.getitem, (origin, args)\n\n\nclass _VariadicGenericAlias(_GenericAlias, _root=True):\n """Same as _GenericAlias above but for variadic aliases. Currently,\n this is used only by special internal aliases: Tuple and Callable.\n """\n def __getitem__(self, params):\n if self._name != 'Callable' or not self._special:\n return self.__getitem_inner__(params)\n if not isinstance(params, tuple) or len(params) != 2:\n raise TypeError("Callable must be used as "\n "Callable[[arg, ...], result].")\n args, result = params\n if args is Ellipsis:\n params = (Ellipsis, result)\n else:\n if not isinstance(args, list):\n raise TypeError(f"Callable[args, result]: args must be a list."\n f" Got {args}")\n params = (tuple(args), result)\n return self.__getitem_inner__(params)\n\n @_tp_cache\n def __getitem_inner__(self, params):\n if self.__origin__ is tuple and self._special:\n if params == ():\n return self.copy_with((_TypingEmpty,))\n if not isinstance(params, tuple):\n params = (params,)\n if len(params) == 2 and params[1] is ...:\n msg = "Tuple[t, ...]: t must be a type."\n p = _type_check(params[0], msg)\n return self.copy_with((p, _TypingEllipsis))\n msg = "Tuple[t0, t1, ...]: each t must be a type."\n params = tuple(_type_check(p, msg) for p in params)\n return self.copy_with(params)\n if self.__origin__ is collections.abc.Callable and self._special:\n args, result = params\n msg = "Callable[args, result]: result must be a type."\n result = _type_check(result, msg)\n if args is Ellipsis:\n return self.copy_with((_TypingEllipsis, result))\n msg = "Callable[[arg, ...], result]: each arg must be a type."\n args = tuple(_type_check(arg, msg) for arg in args)\n params = args + (result,)\n return self.copy_with(params)\n return super().__getitem__(params)\n\n\nclass Generic:\n """Abstract base class for generic types.\n\n A generic type is typically declared by inheriting from\n this class parameterized with one or more type variables.\n For example, a generic mapping type might be defined as::\n\n class Mapping(Generic[KT, VT]):\n def __getitem__(self, key: KT) -> VT:\n ...\n # Etc.\n\n This class can then be used as follows::\n\n def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:\n try:\n return mapping[key]\n except KeyError:\n return default\n """\n __slots__ = ()\n _is_protocol = False\n\n def __new__(cls, *args, **kwds):\n if cls in (Generic, Protocol):\n raise TypeError(f"Type {cls.__name__} cannot be instantiated; "\n "it can be used only as 
a base class")\n if super().__new__ is object.__new__ and cls.__init__ is not object.__init__:\n obj = super().__new__(cls)\n else:\n obj = super().__new__(cls, *args, **kwds)\n return obj\n\n @_tp_cache\n def __class_getitem__(cls, params):\n if not isinstance(params, tuple):\n params = (params,)\n if not params and cls is not Tuple:\n raise TypeError(\n f"Parameter list to {cls.__qualname__}[...] cannot be empty")\n msg = "Parameters to generic types must be types."\n params = tuple(_type_check(p, msg) for p in params)\n if cls in (Generic, Protocol):\n # Generic and Protocol can only be subscripted with unique type variables.\n if not all(isinstance(p, TypeVar) for p in params):\n raise TypeError(\n f"Parameters to {cls.__name__}[...] must all be type variables")\n if len(set(params)) != len(params):\n raise TypeError(\n f"Parameters to {cls.__name__}[...] must all be unique")\n else:\n # Subscripting a regular Generic subclass.\n _check_generic(cls, params)\n return _GenericAlias(cls, params)\n\n def __init_subclass__(cls, *args, **kwargs):\n super().__init_subclass__(*args, **kwargs)\n tvars = []\n if '__orig_bases__' in cls.__dict__:\n error = Generic in cls.__orig_bases__\n else:\n error = Generic in cls.__bases__ and cls.__name__ != 'Protocol'\n if error:\n raise TypeError("Cannot inherit from plain Generic")\n if '__orig_bases__' in cls.__dict__:\n tvars = _collect_type_vars(cls.__orig_bases__)\n # Look for Generic[T1, ..., Tn].\n # If found, tvars must be a subset of it.\n # If not found, tvars is it.\n # Also check for and reject plain Generic,\n # and reject multiple Generic[...].\n gvars = None\n for base in cls.__orig_bases__:\n if (isinstance(base, _GenericAlias) and\n base.__origin__ is Generic):\n if gvars is not None:\n raise TypeError(\n "Cannot inherit from Generic[...] multiple types.")\n gvars = base.__parameters__\n if gvars is not None:\n tvarset = set(tvars)\n gvarset = set(gvars)\n if not tvarset <= gvarset:\n s_vars = ', '.join(str(t) for t in tvars if t not in gvarset)\n s_args = ', '.join(str(g) for g in gvars)\n raise TypeError(f"Some type variables ({s_vars}) are"\n f" not listed in Generic[{s_args}]")\n tvars = gvars\n cls.__parameters__ = tuple(tvars)\n\n\nclass _TypingEmpty:\n """Internal placeholder for () or []. Used by TupleMeta and CallableMeta\n to allow empty list/tuple in specific places, without allowing them\n to sneak in where prohibited.\n """\n\n\nclass _TypingEllipsis:\n """Internal placeholder for ... (ellipsis)."""\n\n\n_TYPING_INTERNALS = ['__parameters__', '__orig_bases__', '__orig_class__',\n '_is_protocol', '_is_runtime_protocol']\n\n_SPECIAL_NAMES = ['__abstractmethods__', '__annotations__', '__dict__', '__doc__',\n '__init__', '__module__', '__new__', '__slots__',\n '__subclasshook__', '__weakref__']\n\n# These special attributes will be not collected as protocol members.\nEXCLUDED_ATTRIBUTES = _TYPING_INTERNALS + _SPECIAL_NAMES + ['_MutableMapping__marker']\n\n\ndef _get_protocol_attrs(cls):\n """Collect protocol members from a protocol class objects.\n\n This includes names actually defined in the class dictionary, as well\n as names that appear in annotations. 
Special names (above) are skipped.\n """\n attrs = set()\n for base in cls.__mro__[:-1]: # without object\n if base.__name__ in ('Protocol', 'Generic'):\n continue\n annotations = getattr(base, '__annotations__', {})\n for attr in list(base.__dict__.keys()) + list(annotations.keys()):\n if not attr.startswith('_abc_') and attr not in EXCLUDED_ATTRIBUTES:\n attrs.add(attr)\n return attrs\n\n\ndef _is_callable_members_only(cls):\n # PEP 544 prohibits using issubclass() with protocols that have non-method members.\n return all(callable(getattr(cls, attr, None)) for attr in _get_protocol_attrs(cls))\n\n\ndef _no_init(self, *args, **kwargs):\n if type(self)._is_protocol:\n raise TypeError('Protocols cannot be instantiated')\n\n\ndef _allow_reckless_class_cheks():\n """Allow instnance and class checks for special stdlib modules.\n\n The abc and functools modules indiscriminately call isinstance() and\n issubclass() on the whole MRO of a user class, which may contain protocols.\n """\n try:\n return sys._getframe(3).f_globals['__name__'] in ['abc', 'functools']\n except (AttributeError, ValueError): # For platforms without _getframe().\n return True\n\n\n_PROTO_WHITELIST = {\n 'collections.abc': [\n 'Callable', 'Awaitable', 'Iterable', 'Iterator', 'AsyncIterable',\n 'Hashable', 'Sized', 'Container', 'Collection', 'Reversible',\n ],\n 'contextlib': ['AbstractContextManager', 'AbstractAsyncContextManager'],\n}\n\n\nclass _ProtocolMeta(ABCMeta):\n # This metaclass is really unfortunate and exists only because of\n # the lack of __instancehook__.\n def __instancecheck__(cls, instance):\n # We need this method for situations where attributes are\n # assigned in __init__.\n if ((not getattr(cls, '_is_protocol', False) or\n _is_callable_members_only(cls)) and\n issubclass(instance.__class__, cls)):\n return True\n if cls._is_protocol:\n if all(hasattr(instance, attr) and\n # All *methods* can be blocked by setting them to None.\n (not callable(getattr(cls, attr, None)) or\n getattr(instance, attr) is not None)\n for attr in _get_protocol_attrs(cls)):\n return True\n return super().__instancecheck__(instance)\n\n\nclass Protocol(Generic, metaclass=_ProtocolMeta):\n """Base class for protocol classes.\n\n Protocol classes are defined as::\n\n class Proto(Protocol):\n def meth(self) -> int:\n ...\n\n Such classes are primarily used with static type checkers that recognize\n structural subtyping (static duck-typing), for example::\n\n class C:\n def meth(self) -> int:\n return 0\n\n def func(x: Proto) -> int:\n return x.meth()\n\n func(C()) # Passes static type check\n\n See PEP 544 for details. 
Protocol classes decorated with\n @typing.runtime_checkable act as simple-minded runtime protocols that check\n only the presence of given attributes, ignoring their type signatures.\n Protocol classes can be generic, they are defined as::\n\n class GenProto(Protocol[T]):\n def meth(self) -> T:\n ...\n """\n __slots__ = ()\n _is_protocol = True\n _is_runtime_protocol = False\n\n def __init_subclass__(cls, *args, **kwargs):\n super().__init_subclass__(*args, **kwargs)\n\n # Determine if this is a protocol or a concrete subclass.\n if not cls.__dict__.get('_is_protocol', False):\n cls._is_protocol = any(b is Protocol for b in cls.__bases__)\n\n # Set (or override) the protocol subclass hook.\n def _proto_hook(other):\n if not cls.__dict__.get('_is_protocol', False):\n return NotImplemented\n\n # First, perform various sanity checks.\n if not getattr(cls, '_is_runtime_protocol', False):\n if _allow_reckless_class_cheks():\n return NotImplemented\n raise TypeError("Instance and class checks can only be used with"\n " @runtime_checkable protocols")\n if not _is_callable_members_only(cls):\n if _allow_reckless_class_cheks():\n return NotImplemented\n raise TypeError("Protocols with non-method members"\n " don't support issubclass()")\n if not isinstance(other, type):\n # Same error message as for issubclass(1, int).\n raise TypeError('issubclass() arg 1 must be a class')\n\n # Second, perform the actual structural compatibility check.\n for attr in _get_protocol_attrs(cls):\n for base in other.__mro__:\n # Check if the members appears in the class dictionary...\n if attr in base.__dict__:\n if base.__dict__[attr] is None:\n return NotImplemented\n break\n\n # ...or in annotations, if it is a sub-protocol.\n annotations = getattr(base, '__annotations__', {})\n if (isinstance(annotations, collections.abc.Mapping) and\n attr in annotations and\n issubclass(other, Generic) and other._is_protocol):\n break\n else:\n return NotImplemented\n return True\n\n if '__subclasshook__' not in cls.__dict__:\n cls.__subclasshook__ = _proto_hook\n\n # We have nothing more to do for non-protocols...\n if not cls._is_protocol:\n return\n\n # ... otherwise check consistency of bases, and prohibit instantiation.\n for base in cls.__bases__:\n if not (base in (object, Generic) or\n base.__module__ in _PROTO_WHITELIST and\n base.__name__ in _PROTO_WHITELIST[base.__module__] or\n issubclass(base, Generic) and base._is_protocol):\n raise TypeError('Protocols can only inherit from other'\n ' protocols, got %r' % base)\n cls.__init__ = _no_init\n\n\ndef runtime_checkable(cls):\n """Mark a protocol class as a runtime protocol.\n\n Such protocol can be used with isinstance() and issubclass().\n Raise TypeError if applied to a non-protocol class.\n This allows a simple-minded structural check very similar to\n one trick ponies in collections.abc such as Iterable.\n For example::\n\n @runtime_checkable\n class Closable(Protocol):\n def close(self): ...\n\n assert isinstance(open('/some/file'), Closable)\n\n Warning: this will check only the presence of the required methods,\n not their type signatures!\n """\n if not issubclass(cls, Generic) or not cls._is_protocol:\n raise TypeError('@runtime_checkable can be only applied to protocol classes,'\n ' got %r' % cls)\n cls._is_runtime_protocol = True\n return cls\n\n\ndef cast(typ, val):\n """Cast a value to a type.\n\n This returns the value unchanged. 
To the type checker this\n signals that the return value has the designated type, but at\n runtime we intentionally don't check anything (we want this\n to be as fast as possible).\n """\n return val\n\n\ndef _get_defaults(func):\n """Internal helper to extract the default arguments, by name."""\n try:\n code = func.__code__\n except AttributeError:\n # Some built-in functions don't have __code__, __defaults__, etc.\n return {}\n pos_count = code.co_argcount\n arg_names = code.co_varnames\n arg_names = arg_names[:pos_count]\n defaults = func.__defaults__ or ()\n kwdefaults = func.__kwdefaults__\n res = dict(kwdefaults) if kwdefaults else {}\n pos_offset = pos_count - len(defaults)\n for name, value in zip(arg_names[pos_offset:], defaults):\n assert name not in res\n res[name] = value\n return res\n\n\n_allowed_types = (types.FunctionType, types.BuiltinFunctionType,\n types.MethodType, types.ModuleType,\n WrapperDescriptorType, MethodWrapperType, MethodDescriptorType)\n\n\ndef get_type_hints(obj, globalns=None, localns=None):\n """Return type hints for an object.\n\n This is often the same as obj.__annotations__, but it handles\n forward references encoded as string literals, and if necessary\n adds Optional[t] if a default value equal to None is set.\n\n The argument may be a module, class, method, or function. The annotations\n are returned as a dictionary. For classes, annotations include also\n inherited members.\n\n TypeError is raised if the argument is not of a type that can contain\n annotations, and an empty dictionary is returned if no annotations are\n present.\n\n BEWARE -- the behavior of globalns and localns is counterintuitive\n (unless you are familiar with how eval() and exec() work). The\n search order is locals first, then globals.\n\n - If no dict arguments are passed, an attempt is made to use the\n globals from obj (or the respective module's globals for classes),\n and these are also used as the locals. 
If the object does not appear\n to have globals, an empty dictionary is used.\n\n - If one dict argument is passed, it is used for both globals and\n locals.\n\n - If two dict arguments are passed, they specify globals and\n locals, respectively.\n """\n\n if getattr(obj, '__no_type_check__', None):\n return {}\n # Classes require a special treatment.\n if isinstance(obj, type):\n hints = {}\n for base in reversed(obj.__mro__):\n if globalns is None:\n base_globals = sys.modules[base.__module__].__dict__\n else:\n base_globals = globalns\n ann = base.__dict__.get('__annotations__', {})\n for name, value in ann.items():\n if value is None:\n value = type(None)\n if isinstance(value, str):\n value = ForwardRef(value, is_argument=False)\n value = _eval_type(value, base_globals, localns)\n hints[name] = value\n return hints\n\n if globalns is None:\n if isinstance(obj, types.ModuleType):\n globalns = obj.__dict__\n else:\n nsobj = obj\n # Find globalns for the unwrapped object.\n while hasattr(nsobj, '__wrapped__'):\n nsobj = nsobj.__wrapped__\n globalns = getattr(nsobj, '__globals__', {})\n if localns is None:\n localns = globalns\n elif localns is None:\n localns = globalns\n hints = getattr(obj, '__annotations__', None)\n if hints is None:\n # Return empty annotations for something that _could_ have them.\n if isinstance(obj, _allowed_types):\n return {}\n else:\n raise TypeError('{!r} is not a module, class, method, '\n 'or function.'.format(obj))\n defaults = _get_defaults(obj)\n hints = dict(hints)\n for name, value in hints.items():\n if value is None:\n value = type(None)\n if isinstance(value, str):\n value = ForwardRef(value)\n value = _eval_type(value, globalns, localns)\n if name in defaults and defaults[name] is None:\n value = Optional[value]\n hints[name] = value\n return hints\n\n\ndef get_origin(tp):\n """Get the unsubscripted version of a type.\n\n This supports generic types, Callable, Tuple, Union, Literal, Final and ClassVar.\n Return None for unsupported types. 
Examples::\n\n get_origin(Literal[42]) is Literal\n get_origin(int) is None\n get_origin(ClassVar[int]) is ClassVar\n get_origin(Generic) is Generic\n get_origin(Generic[T]) is Generic\n get_origin(Union[T, int]) is Union\n get_origin(List[Tuple[T, T]][int]) == list\n """\n if isinstance(tp, _GenericAlias):\n return tp.__origin__\n if tp is Generic:\n return Generic\n return None\n\n\ndef get_args(tp):\n """Get type arguments with all substitutions performed.\n\n For unions, basic simplifications used by Union constructor are performed.\n Examples::\n get_args(Dict[str, int]) == (str, int)\n get_args(int) == ()\n get_args(Union[int, Union[T, int], str][int]) == (int, str)\n get_args(Union[int, Tuple[T, int]][str]) == (int, Tuple[str, int])\n get_args(Callable[[], T][int]) == ([], int)\n """\n if isinstance(tp, _GenericAlias):\n res = tp.__args__\n if get_origin(tp) is collections.abc.Callable and res[0] is not Ellipsis:\n res = (list(res[:-1]), res[-1])\n return res\n return ()\n\n\ndef no_type_check(arg):\n """Decorator to indicate that annotations are not type hints.\n\n The argument must be a class or function; if it is a class, it\n applies recursively to all methods and classes defined in that class\n (but not to methods defined in its superclasses or subclasses).\n\n This mutates the function(s) or class(es) in place.\n """\n if isinstance(arg, type):\n arg_attrs = arg.__dict__.copy()\n for attr, val in arg.__dict__.items():\n if val in arg.__bases__ + (arg,):\n arg_attrs.pop(attr)\n for obj in arg_attrs.values():\n if isinstance(obj, types.FunctionType):\n obj.__no_type_check__ = True\n if isinstance(obj, type):\n no_type_check(obj)\n try:\n arg.__no_type_check__ = True\n except TypeError: # built-in classes\n pass\n return arg\n\n\ndef no_type_check_decorator(decorator):\n """Decorator to give another decorator the @no_type_check effect.\n\n This wraps the decorator with something that wraps the decorated\n function in @no_type_check.\n """\n\n @functools.wraps(decorator)\n def wrapped_decorator(*args, **kwds):\n func = decorator(*args, **kwds)\n func = no_type_check(func)\n return func\n\n return wrapped_decorator\n\n\ndef _overload_dummy(*args, **kwds):\n """Helper for @overload to raise when called."""\n raise NotImplementedError(\n "You should not call an overloaded function. "\n "A series of @overload-decorated functions "\n "outside a stub module should always be followed "\n "by an implementation that is not @overload-ed.")\n\n\ndef overload(func):\n """Decorator for overloaded functions/methods.\n\n In a stub file, place two or more stub definitions for the same\n function in a row, each decorated with @overload. For example:\n\n @overload\n def utf8(value: None) -> None: ...\n @overload\n def utf8(value: bytes) -> bytes: ...\n @overload\n def utf8(value: str) -> bytes: ...\n\n In a non-stub file (i.e. a regular .py file), do the same but\n follow it with an implementation. The implementation should *not*\n be decorated with @overload. 
For example:\n\n @overload\n def utf8(value: None) -> None: ...\n @overload\n def utf8(value: bytes) -> bytes: ...\n @overload\n def utf8(value: str) -> bytes: ...\n def utf8(value):\n # implementation goes here\n """\n return _overload_dummy\n\n\ndef final(f):\n """A decorator to indicate final methods and final classes.\n\n Use this decorator to indicate to type checkers that the decorated\n method cannot be overridden, and decorated class cannot be subclassed.\n For example:\n\n class Base:\n @final\n def done(self) -> None:\n ...\n class Sub(Base):\n def done(self) -> None: # Error reported by type checker\n ...\n\n @final\n class Leaf:\n ...\n class Other(Leaf): # Error reported by type checker\n ...\n\n There is no runtime checking of these properties.\n """\n return f\n\n\n# Some unconstrained type variables. These are used by the container types.\n# (These are not for export.)\nT = TypeVar('T') # Any type.\nKT = TypeVar('KT') # Key type.\nVT = TypeVar('VT') # Value type.\nT_co = TypeVar('T_co', covariant=True) # Any type covariant containers.\nV_co = TypeVar('V_co', covariant=True) # Any type covariant containers.\nVT_co = TypeVar('VT_co', covariant=True) # Value type covariant containers.\nT_contra = TypeVar('T_contra', contravariant=True) # Ditto contravariant.\n# Internal type variable used for Type[].\nCT_co = TypeVar('CT_co', covariant=True, bound=type)\n\n# A useful type variable with constraints. This represents string types.\n# (This one *is* for export!)\nAnyStr = TypeVar('AnyStr', bytes, str)\n\n\n# Various ABCs mimicking those in collections.abc.\ndef _alias(origin, params, inst=True):\n return _GenericAlias(origin, params, special=True, inst=inst)\n\nHashable = _alias(collections.abc.Hashable, ()) # Not generic.\nAwaitable = _alias(collections.abc.Awaitable, T_co)\nCoroutine = _alias(collections.abc.Coroutine, (T_co, T_contra, V_co))\nAsyncIterable = _alias(collections.abc.AsyncIterable, T_co)\nAsyncIterator = _alias(collections.abc.AsyncIterator, T_co)\nIterable = _alias(collections.abc.Iterable, T_co)\nIterator = _alias(collections.abc.Iterator, T_co)\nReversible = _alias(collections.abc.Reversible, T_co)\nSized = _alias(collections.abc.Sized, ()) # Not generic.\nContainer = _alias(collections.abc.Container, T_co)\nCollection = _alias(collections.abc.Collection, T_co)\nCallable = _VariadicGenericAlias(collections.abc.Callable, (), special=True)\nCallable.__doc__ = \\\n """Callable type; Callable[[int], str] is a function of (int) -> str.\n\n The subscription syntax must always be used with exactly two\n values: the argument list and the return type. The argument list\n must be a list of types or ellipsis; the return type must be a single type.\n\n There is no syntax to indicate optional or keyword arguments,\n such function types are rarely used as callback types.\n """\nAbstractSet = _alias(collections.abc.Set, T_co)\nMutableSet = _alias(collections.abc.MutableSet, T)\n# NOTE: Mapping is only covariant in the value type.\nMapping = _alias(collections.abc.Mapping, (KT, VT_co))\nMutableMapping = _alias(collections.abc.MutableMapping, (KT, VT))\nSequence = _alias(collections.abc.Sequence, T_co)\nMutableSequence = _alias(collections.abc.MutableSequence, T)\nByteString = _alias(collections.abc.ByteString, ()) # Not generic\nTuple = _VariadicGenericAlias(tuple, (), inst=False, special=True)\nTuple.__doc__ = \\\n """Tuple type; Tuple[X, Y] is the cross-product type of X and Y.\n\n Example: Tuple[T1, T2] is a tuple of two elements corresponding\n to type variables T1 and T2. 
Tuple[int, float, str] is a tuple\n of an int, a float and a string.\n\n To specify a variable-length tuple of homogeneous type, use Tuple[T, ...].\n """\nList = _alias(list, T, inst=False)\nDeque = _alias(collections.deque, T)\nSet = _alias(set, T, inst=False)\nFrozenSet = _alias(frozenset, T_co, inst=False)\nMappingView = _alias(collections.abc.MappingView, T_co)\nKeysView = _alias(collections.abc.KeysView, KT)\nItemsView = _alias(collections.abc.ItemsView, (KT, VT_co))\nValuesView = _alias(collections.abc.ValuesView, VT_co)\nContextManager = _alias(contextlib.AbstractContextManager, T_co)\nAsyncContextManager = _alias(contextlib.AbstractAsyncContextManager, T_co)\nDict = _alias(dict, (KT, VT), inst=False)\nDefaultDict = _alias(collections.defaultdict, (KT, VT))\nOrderedDict = _alias(collections.OrderedDict, (KT, VT))\nCounter = _alias(collections.Counter, T)\nChainMap = _alias(collections.ChainMap, (KT, VT))\nGenerator = _alias(collections.abc.Generator, (T_co, T_contra, V_co))\nAsyncGenerator = _alias(collections.abc.AsyncGenerator, (T_co, T_contra))\nType = _alias(type, CT_co, inst=False)\nType.__doc__ = \\\n """A special construct usable to annotate class objects.\n\n For example, suppose we have the following classes::\n\n class User: ... # Abstract base for User classes\n class BasicUser(User): ...\n class ProUser(User): ...\n class TeamUser(User): ...\n\n And a function that takes a class argument that's a subclass of\n User and returns an instance of the corresponding class::\n\n U = TypeVar('U', bound=User)\n def new_user(user_class: Type[U]) -> U:\n user = user_class()\n # (Here we could write the user object to a database)\n return user\n\n joe = new_user(BasicUser)\n\n At this point the type checker knows that joe has type BasicUser.\n """\n\n\n@runtime_checkable\nclass SupportsInt(Protocol):\n """An ABC with one abstract method __int__."""\n __slots__ = ()\n\n @abstractmethod\n def __int__(self) -> int:\n pass\n\n\n@runtime_checkable\nclass SupportsFloat(Protocol):\n """An ABC with one abstract method __float__."""\n __slots__ = ()\n\n @abstractmethod\n def __float__(self) -> float:\n pass\n\n\n@runtime_checkable\nclass SupportsComplex(Protocol):\n """An ABC with one abstract method __complex__."""\n __slots__ = ()\n\n @abstractmethod\n def __complex__(self) -> complex:\n pass\n\n\n@runtime_checkable\nclass SupportsBytes(Protocol):\n """An ABC with one abstract method __bytes__."""\n __slots__ = ()\n\n @abstractmethod\n def __bytes__(self) -> bytes:\n pass\n\n\n@runtime_checkable\nclass SupportsIndex(Protocol):\n """An ABC with one abstract method __index__."""\n __slots__ = ()\n\n @abstractmethod\n def __index__(self) -> int:\n pass\n\n\n@runtime_checkable\nclass SupportsAbs(Protocol[T_co]):\n """An ABC with one abstract method __abs__ that is covariant in its return type."""\n __slots__ = ()\n\n @abstractmethod\n def __abs__(self) -> T_co:\n pass\n\n\n@runtime_checkable\nclass SupportsRound(Protocol[T_co]):\n """An ABC with one abstract method __round__ that is covariant in its return type."""\n __slots__ = ()\n\n @abstractmethod\n def __round__(self, ndigits: int = 0) -> T_co:\n pass\n\n\ndef _make_nmtuple(name, types):\n msg = "NamedTuple('Name', [(f0, t0), (f1, t1), ...]); each t must be a type"\n types = [(n, _type_check(t, msg)) for n, t in types]\n nm_tpl = collections.namedtuple(name, [n for n, t in types])\n # Prior to PEP 526, only _field_types attribute was assigned.\n # Now __annotations__ are used and _field_types is deprecated (remove in 3.9)\n 
nm_tpl.__annotations__ = nm_tpl._field_types = dict(types)\n try:\n nm_tpl.__module__ = sys._getframe(2).f_globals.get('__name__', '__main__')\n except (AttributeError, ValueError):\n pass\n return nm_tpl\n\n\n# attributes prohibited to set in NamedTuple class syntax\n_prohibited = ('__new__', '__init__', '__slots__', '__getnewargs__',\n '_fields', '_field_defaults', '_field_types',\n '_make', '_replace', '_asdict', '_source')\n\n_special = ('__module__', '__name__', '__annotations__')\n\n\nclass NamedTupleMeta(type):\n\n def __new__(cls, typename, bases, ns):\n if ns.get('_root', False):\n return super().__new__(cls, typename, bases, ns)\n types = ns.get('__annotations__', {})\n nm_tpl = _make_nmtuple(typename, types.items())\n defaults = []\n defaults_dict = {}\n for field_name in types:\n if field_name in ns:\n default_value = ns[field_name]\n defaults.append(default_value)\n defaults_dict[field_name] = default_value\n elif defaults:\n raise TypeError("Non-default namedtuple field {field_name} cannot "\n "follow default field(s) {default_names}"\n .format(field_name=field_name,\n default_names=', '.join(defaults_dict.keys())))\n nm_tpl.__new__.__annotations__ = dict(types)\n nm_tpl.__new__.__defaults__ = tuple(defaults)\n nm_tpl._field_defaults = defaults_dict\n # update from user namespace without overriding special namedtuple attributes\n for key in ns:\n if key in _prohibited:\n raise AttributeError("Cannot overwrite NamedTuple attribute " + key)\n elif key not in _special and key not in nm_tpl._fields:\n setattr(nm_tpl, key, ns[key])\n return nm_tpl\n\n\nclass NamedTuple(metaclass=NamedTupleMeta):\n """Typed version of namedtuple.\n\n Usage in Python versions >= 3.6::\n\n class Employee(NamedTuple):\n name: str\n id: int\n\n This is equivalent to::\n\n Employee = collections.namedtuple('Employee', ['name', 'id'])\n\n The resulting class has an extra __annotations__ attribute, giving a\n dict that maps field names to types. 
(The field names are also in\n the _fields attribute, which is part of the namedtuple API.)\n Alternative equivalent keyword syntax is also accepted::\n\n Employee = NamedTuple('Employee', name=str, id=int)\n\n In Python versions <= 3.5 use::\n\n Employee = NamedTuple('Employee', [('name', str), ('id', int)])\n """\n _root = True\n\n def __new__(*args, **kwargs):\n if not args:\n raise TypeError('NamedTuple.__new__(): not enough arguments')\n cls, *args = args # allow the "cls" keyword be passed\n if args:\n typename, *args = args # allow the "typename" keyword be passed\n elif 'typename' in kwargs:\n typename = kwargs.pop('typename')\n import warnings\n warnings.warn("Passing 'typename' as keyword argument is deprecated",\n DeprecationWarning, stacklevel=2)\n else:\n raise TypeError("NamedTuple.__new__() missing 1 required positional "\n "argument: 'typename'")\n if args:\n try:\n fields, = args # allow the "fields" keyword be passed\n except ValueError:\n raise TypeError(f'NamedTuple.__new__() takes from 2 to 3 '\n f'positional arguments but {len(args) + 2} '\n f'were given') from None\n elif 'fields' in kwargs and len(kwargs) == 1:\n fields = kwargs.pop('fields')\n import warnings\n warnings.warn("Passing 'fields' as keyword argument is deprecated",\n DeprecationWarning, stacklevel=2)\n else:\n fields = None\n\n if fields is None:\n fields = kwargs.items()\n elif kwargs:\n raise TypeError("Either list of fields or keywords"\n " can be provided to NamedTuple, not both")\n return _make_nmtuple(typename, fields)\n __new__.__text_signature__ = '($cls, typename, fields=None, /, **kwargs)'\n\n\ndef _dict_new(cls, /, *args, **kwargs):\n return dict(*args, **kwargs)\n\n\ndef _typeddict_new(cls, typename, fields=None, /, *, total=True, **kwargs):\n if fields is None:\n fields = kwargs\n elif kwargs:\n raise TypeError("TypedDict takes either a dict or keyword arguments,"\n " but not both")\n\n ns = {'__annotations__': dict(fields), '__total__': total}\n try:\n # Setting correct module is necessary to make typed dict classes pickleable.\n ns['__module__'] = sys._getframe(1).f_globals.get('__name__', '__main__')\n except (AttributeError, ValueError):\n pass\n\n return _TypedDictMeta(typename, (), ns)\n\n\ndef _check_fails(cls, other):\n # Typed dicts are only for static structural subtyping.\n raise TypeError('TypedDict does not support instance and class checks')\n\n\nclass _TypedDictMeta(type):\n def __new__(cls, name, bases, ns, total=True):\n """Create new typed dict class object.\n\n This method is called directly when TypedDict is subclassed,\n or via _typeddict_new when TypedDict is instantiated. This way\n TypedDict supports all three syntax forms described in its docstring.\n Subclasses and instances of TypedDict return actual dictionaries\n via _dict_new.\n """\n ns['__new__'] = _typeddict_new if name == 'TypedDict' else _dict_new\n tp_dict = super(_TypedDictMeta, cls).__new__(cls, name, (dict,), ns)\n\n anns = ns.get('__annotations__', {})\n msg = "TypedDict('Name', {f0: t0, f1: t1, ...}); each t must be a type"\n anns = {n: _type_check(tp, msg) for n, tp in anns.items()}\n for base in bases:\n anns.update(base.__dict__.get('__annotations__', {}))\n tp_dict.__annotations__ = anns\n if not hasattr(tp_dict, '__total__'):\n tp_dict.__total__ = total\n return tp_dict\n\n __instancecheck__ = __subclasscheck__ = _check_fails\n\n\nclass TypedDict(dict, metaclass=_TypedDictMeta):\n """A simple typed namespace. 
At runtime it is equivalent to a plain dict.\n\n TypedDict creates a dictionary type that expects all of its\n instances to have a certain set of keys, where each key is\n associated with a value of a consistent type. This expectation\n is not checked at runtime but is only enforced by type checkers.\n Usage::\n\n class Point2D(TypedDict):\n x: int\n y: int\n label: str\n\n a: Point2D = {'x': 1, 'y': 2, 'label': 'good'} # OK\n b: Point2D = {'z': 3, 'label': 'bad'} # Fails type check\n\n assert Point2D(x=1, y=2, label='first') == dict(x=1, y=2, label='first')\n\n The type info can be accessed via Point2D.__annotations__. TypedDict\n supports two additional equivalent forms::\n\n Point2D = TypedDict('Point2D', x=int, y=int, label=str)\n Point2D = TypedDict('Point2D', {'x': int, 'y': int, 'label': str})\n\n The class syntax is only supported in Python 3.6+, while two other\n syntax forms work for Python 2.7 and 3.2+\n """\n\n\ndef NewType(name, tp):\n """NewType creates simple unique types with almost zero\n runtime overhead. NewType(name, tp) is considered a subtype of tp\n by static type checkers. At runtime, NewType(name, tp) returns\n a dummy function that simply returns its argument. Usage::\n\n UserId = NewType('UserId', int)\n\n def name_by_id(user_id: UserId) -> str:\n ...\n\n UserId('user') # Fails type check\n\n name_by_id(42) # Fails type check\n name_by_id(UserId(42)) # OK\n\n num = UserId(5) + 1 # type: int\n """\n\n def new_type(x):\n return x\n\n new_type.__name__ = name\n new_type.__supertype__ = tp\n return new_type\n\n\n# Python-version-specific alias (Python 2: unicode; Python 3: str)\nText = str\n\n\n# Constant that's True when type checking, but False here.\nTYPE_CHECKING = False\n\n\nclass IO(Generic[AnyStr]):\n """Generic base class for TextIO and BinaryIO.\n\n This is an abstract, generic version of the return of open().\n\n NOTE: This does not distinguish between the different possible\n classes (text vs. binary, read vs. write vs. read/write,\n append-only, unbuffered). The TextIO and BinaryIO subclasses\n below capture the distinctions between text vs. 
binary, which is\n pervasive in the interface; however we currently do not offer a\n way to track the other distinctions in the type system.\n """\n\n __slots__ = ()\n\n @property\n @abstractmethod\n def mode(self) -> str:\n pass\n\n @property\n @abstractmethod\n def name(self) -> str:\n pass\n\n @abstractmethod\n def close(self) -> None:\n pass\n\n @abstractmethod\n def closed(self) -> bool:\n pass\n\n @abstractmethod\n def fileno(self) -> int:\n pass\n\n @abstractmethod\n def flush(self) -> None:\n pass\n\n @abstractmethod\n def isatty(self) -> bool:\n pass\n\n @abstractmethod\n def read(self, n: int = -1) -> AnyStr:\n pass\n\n @abstractmethod\n def readable(self) -> bool:\n pass\n\n @abstractmethod\n def readline(self, limit: int = -1) -> AnyStr:\n pass\n\n @abstractmethod\n def readlines(self, hint: int = -1) -> List[AnyStr]:\n pass\n\n @abstractmethod\n def seek(self, offset: int, whence: int = 0) -> int:\n pass\n\n @abstractmethod\n def seekable(self) -> bool:\n pass\n\n @abstractmethod\n def tell(self) -> int:\n pass\n\n @abstractmethod\n def truncate(self, size: int = None) -> int:\n pass\n\n @abstractmethod\n def writable(self) -> bool:\n pass\n\n @abstractmethod\n def write(self, s: AnyStr) -> int:\n pass\n\n @abstractmethod\n def writelines(self, lines: List[AnyStr]) -> None:\n pass\n\n @abstractmethod\n def __enter__(self) -> 'IO[AnyStr]':\n pass\n\n @abstractmethod\n def __exit__(self, type, value, traceback) -> None:\n pass\n\n\nclass BinaryIO(IO[bytes]):\n """Typed version of the return of open() in binary mode."""\n\n __slots__ = ()\n\n @abstractmethod\n def write(self, s: Union[bytes, bytearray]) -> int:\n pass\n\n @abstractmethod\n def __enter__(self) -> 'BinaryIO':\n pass\n\n\nclass TextIO(IO[str]):\n """Typed version of the return of open() in text mode."""\n\n __slots__ = ()\n\n @property\n @abstractmethod\n def buffer(self) -> BinaryIO:\n pass\n\n @property\n @abstractmethod\n def encoding(self) -> str:\n pass\n\n @property\n @abstractmethod\n def errors(self) -> Optional[str]:\n pass\n\n @property\n @abstractmethod\n def line_buffering(self) -> bool:\n pass\n\n @property\n @abstractmethod\n def newlines(self) -> Any:\n pass\n\n @abstractmethod\n def __enter__(self) -> 'TextIO':\n pass\n\n\nclass io:\n """Wrapper namespace for IO generic classes."""\n\n __all__ = ['IO', 'TextIO', 'BinaryIO']\n IO = IO\n TextIO = TextIO\n BinaryIO = BinaryIO\n\n\nio.__name__ = __name__ + '.io'\nsys.modules[io.__name__] = io\n\nPattern = _alias(stdlib_re.Pattern, AnyStr)\nMatch = _alias(stdlib_re.Match, AnyStr)\n\nclass re:\n """Wrapper namespace for re type aliases."""\n\n __all__ = ['Pattern', 'Match']\n Pattern = Pattern\n Match = Match\n\n\nre.__name__ = __name__ + '.re'\nsys.modules[re.__name__] = re\n
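The typing docstrings above describe ``NewType``, ``NamedTuple``, and ``TypedDict`` mostly in terms of their runtime behaviour. A minimal sketch of that behaviour, assuming Python 3.8+; the names below are illustrative and not part of this diff:

.. code-block:: python

    from typing import NamedTuple, NewType, TypedDict

    UserId = NewType("UserId", int)      # identity function at runtime
    assert UserId(5) + 1 == 6            # behaves exactly like the wrapped int

    class Employee(NamedTuple):          # typed namedtuple; fields land in __annotations__
        name: str
        id: int = 0                      # defaulted fields must follow non-default ones

    class Point2D(TypedDict):            # plain dict at runtime; keys only checked statically
        x: int
        y: int
        label: str

    assert Employee("alice").id == 0
    assert Point2D(x=1, y=2, label="origin") == {"x": 1, "y": 2, "label": "origin"}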
\n[docs]class ConfigSchema:\n """This is a placeholder type. Any time that it appears in documentation, it means that any of\n the following types are acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@solid(config_schema=int)``\n * ``@solid(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@solid(config_schema=list)``\n * ``@solid(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )\n
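Since ``ConfigSchema`` is only a placeholder for the accepted schema forms, a short sketch of the equivalences listed above may help; ``process_word`` is a hypothetical solid and not part of this change:

.. code-block:: python

    from dagster import Array, Field, Shape, solid

    @solid(config_schema={"word": str, "repeats": int})   # bare dict -> Shape
    def process_word(context):
        return context.solid_config["word"] * context.solid_config["repeats"]

    # The shorthand above is equivalent to spelling the Shape out explicitly:
    explicit_shape = Shape({"word": Field(str), "repeats": Field(int)})

    # Bare one-element lists resolve to Array, recursively:
    word_array = Array(str)                       # same as [str]
    nested = Array(Shape({"some_config": str}))   # same as [{'some_config': str}]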
\nfrom enum import Enum as PythonEnum\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.serdes import whitelist_for_serdes\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n ANY = "ANY"\n SCALAR = "SCALAR"\n ENUM = "ENUM"\n\n SELECTOR = "SELECTOR"\n STRICT_SHAPE = "STRICT_SHAPE"\n PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n SCALAR_UNION = "SCALAR_UNION"\n\n @staticmethod\n def has_fields(kind):\n check.inst_param(kind, "kind", ConfigTypeKind)\n return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n # Closed generic types\n ARRAY = "ARRAY"\n NONEABLE = "NONEABLE"\n\n @staticmethod\n def is_closed_generic(kind):\n check.inst_param(kind, "kind", ConfigTypeKind)\n return (\n kind == ConfigTypeKind.ARRAY\n or kind == ConfigTypeKind.NONEABLE\n or kind == ConfigTypeKind.SCALAR_UNION\n )\n\n @staticmethod\n def is_shape(kind):\n check.inst_param(kind, "kind", ConfigTypeKind)\n return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n @staticmethod\n def is_selector(kind):\n check.inst_param(kind, "kind", ConfigTypeKind)\n return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n """\n The class backing DagsterTypes as they are used processing configuration data.\n """\n\n def __init__(\n self, key, kind, given_name=None, description=None, type_params=None,\n ):\n\n self.key = check.str_param(key, "key")\n self.kind = check.inst_param(kind, "kind", ConfigTypeKind)\n self.given_name = check.opt_str_param(given_name, "given_name")\n self._description = check.opt_str_param(description, "description")\n self.type_params = (\n check.list_param(type_params, "type_params", of_type=ConfigType)\n if type_params\n else None\n )\n\n @property\n def description(self):\n return self._description\n\n @staticmethod\n def from_builtin_enum(builtin_enum):\n check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n return _CONFIG_MAP[builtin_enum]\n\n def post_process(self, value):\n """\n Implement this in order to take a value provided by the user\n and perform computation on it. This can be done to coerce data types,\n fetch things from the environment (e.g. environment variables), or\n to do custom validation. If the value is not valid, throw a\n PostProcessingError. 
Otherwise return the coerced value.\n """\n return value\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n INT = "INT"\n STRING = "STRING"\n FLOAT = "FLOAT"\n BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n def __init__(self, key, given_name, scalar_kind, **kwargs):\n self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n super(ConfigScalar, self).__init__(\n key, given_name=given_name, kind=ConfigTypeKind.SCALAR, **kwargs\n )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n def __init__(self, scalar_kind, description=None):\n super(BuiltinConfigScalar, self).__init__(\n key=type(self).__name__,\n given_name=type(self).__name__,\n scalar_kind=scalar_kind,\n description=description,\n )\n\n\nclass Int(BuiltinConfigScalar):\n def __init__(self):\n super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n def __init__(self):\n super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n def __init__(self):\n super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass Float(BuiltinConfigScalar):\n def __init__(self):\n super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n def post_process(self, value):\n return float(value)\n\n\nclass Any(ConfigType):\n def __init__(self):\n super(Any, self).__init__(\n key="Any", given_name="Any", kind=ConfigTypeKind.ANY,\n )\n\n\n[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type):\n from .field import resolve_to_config_type\n\n self.inner_type = resolve_to_config_type(inner_type)\n super(Noneable, self).__init__(\n key="Noneable.{inner_type}".format(inner_type=self.inner_type.key),\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )\n\n\n[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type):\n from .field import resolve_to_config_type\n\n self.inner_type = resolve_to_config_type(inner_type)\n super(Array, self).__init__(\n key="Array.{inner_type}".format(inner_type=self.inner_type.key),\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @property\n def description(self):\n return "List of {inner_type}".format(inner_type=self.key)\n\n\n[docs]class EnumValue:\n """Define an entry in a :py:class:`Enum`.\n\n Args:\n config_value (str):\n The string representation of the config to accept when passed.\n python_value (Optional[Any]):\n The python value to convert the enum entry in to. 
Defaults to the ``config_value``.\n description (Optional[str]):\n A human-readable description of the enum entry.\n\n """\n\n def __init__(self, config_value, python_value=None, description=None):\n self.config_value = check.str_param(config_value, "config_value")\n self.python_value = config_value if python_value is None else python_value\n self.description = check.opt_str_param(description, "description")\n\n\n[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @solid(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name, enum_values):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.list_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value):\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(\n (\n "Should never reach this. config_value should be pre-validated. "\n "Got {config_value}"\n ).format(config_value=value)\n )\n\n @classmethod\n def from_python_enum(cls, enum, name=None):\n """\n Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. code-block:: python\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @solid(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n # ...\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])\n\n\n[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. 
For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n solids:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. code-block:: yaml\n\n solids:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self, scalar_type, non_scalar_schema, _key=None,\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = resolve_to_config_type(scalar_type)\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", "ScalarUnion.{}-{}".format(self.scalar_type.key, self.non_scalar_type.key)\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )\n\n\nConfigAnyInstance = Any()\nConfigBoolInstance = Bool()\nConfigFloatInstance = Float()\nConfigIntInstance = Int()\nConfigStringInstance = String()\n_CONFIG_MAP = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed("Scalar {} is not supported".format(type_name))\n return _CONFIG_MAP_BY_NAME[type_name]\n
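A hedged usage sketch of the ``Noneable`` and ``Enum`` types documented above; the ``Color`` enum and ``select_color`` solid are hypothetical:

.. code-block:: python

    import enum

    from dagster import Enum, Field, Noneable, solid

    class Color(enum.Enum):
        RED = enum.auto()
        GREEN = enum.auto()
        BLUE = enum.auto()

    @solid(
        config_schema={
            "color": Field(Enum.from_python_enum(Color)),
            # Noneable accepts either a string or null in config
            "label": Field(Noneable(str), is_required=False, default_value=None),
        }
    )
    def select_color(context):
        # post_process maps the validated config string back to the Python enum member
        return context.solid_config["color"], context.solid_config["label"]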
\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster.serdes import serialize_value\nfrom dagster.utils import is_enum_value\nfrom dagster.utils.typing_api import is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, all_optional_type\n\n\ndef _is_config_type_class(obj):\n return isinstance(obj, type) and issubclass(obj, ConfigType)\n\n\ndef helpful_list_error_string():\n return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n values in the dictionary get resolved by the same rules, recursively.\n\n4. A bare python list of length one which itself is config type.\n Becomes Array with list element as an argument.\n"""\n\n\ndef resolve_to_config_type(dagster_type):\n from .field_utils import convert_fields_to_dict_type\n\n # Short circuit if it's already a Config Type\n if isinstance(dagster_type, ConfigType):\n return dagster_type\n\n if isinstance(dagster_type, dict):\n return convert_fields_to_dict_type(dagster_type)\n\n if isinstance(dagster_type, list):\n if len(dagster_type) != 1:\n raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n inner_type = resolve_to_config_type(dagster_type[0])\n\n if not inner_type:\n raise DagsterInvalidDefinitionError(\n "Invalid member of array specification: {value} in list {the_list}".format(\n value=repr(dagster_type[0]), the_list=dagster_type\n )\n )\n return Array(inner_type)\n\n from dagster.core.types.dagster_type import DagsterType, List, ListType\n from dagster.core.types.python_set import Set, _TypedPythonSet\n from dagster.core.types.python_tuple import Tuple, _TypedPythonTuple\n\n if _is_config_type_class(dagster_type):\n check.param_invariant(\n False,\n "dagster_type",\n "Cannot pass a config type class to resolve_to_config_type. Got {dagster_type}".format(\n dagster_type=dagster_type\n ),\n )\n\n if isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType):\n raise DagsterInvalidDefinitionError(\n "You have passed a DagsterType class {dagster_type} to the config system. "\n "The DagsterType and config schema systems are separate. "\n "Valid config values are:\\n{desc}".format(\n dagster_type=repr(dagster_type), desc=VALID_CONFIG_DESC,\n )\n )\n\n if is_typing_type(dagster_type):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed in {dagster_type} to the config system. Types from "\n "the typing module in python are not allowed in the config system. "\n "You must use types that are imported from dagster or primitive types "\n "such as bool, int, etc."\n ).format(dagster_type=dagster_type)\n )\n\n if dagster_type is List or isinstance(dagster_type, ListType):\n raise DagsterInvalidDefinitionError(\n "Cannot use List in the context of config. " + helpful_list_error_string()\n )\n\n if dagster_type is Set or isinstance(dagster_type, _TypedPythonSet):\n raise DagsterInvalidDefinitionError(\n "Cannot use Set in the context of a config field. 
" + helpful_list_error_string()\n )\n\n if dagster_type is Tuple or isinstance(dagster_type, _TypedPythonTuple):\n raise DagsterInvalidDefinitionError(\n "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n )\n\n if isinstance(dagster_type, DagsterType):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed an instance of DagsterType {type_name} to the config "\n "system (Repr of type: {dagster_type}). "\n "The DagsterType and config schema systems are separate. "\n "Valid config values are:\\n{desc}"\n ).format(\n type_name=dagster_type.display_name,\n dagster_type=repr(dagster_type),\n desc=VALID_CONFIG_DESC,\n ),\n )\n\n # If we are passed here either:\n # 1) We have been passed a python builtin\n # 2) We have been a dagster wrapping type that needs to be convert its config variant\n # e.g. dagster.List\n # 2) We have been passed an invalid thing. We return False to signify this. It is\n # up to callers to report a reasonable error.\n\n from dagster.primitive_mapping import (\n remap_python_builtin_for_config,\n is_supported_config_python_builtin,\n )\n\n if is_supported_config_python_builtin(dagster_type):\n return remap_python_builtin_for_config(dagster_type)\n\n if dagster_type is None:\n return ConfigAnyInstance\n if BuiltinEnum.contains(dagster_type):\n return ConfigType.from_builtin_enum(dagster_type)\n\n # This means that this is an error and we are return False to a callsite\n # We do the error reporting there because those callsites have more context\n return False\n\n\ndef has_implicit_default(config_type):\n if config_type.kind == ConfigTypeKind.NONEABLE:\n return True\n\n return all_optional_type(config_type)\n\n\n[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n pipeline execution time through the config system. Config fields can be set on solids, on\n loaders and materializers for custom, and on other pluggable components of the system, such as\n resources, loggers, and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. 
If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n\n .. code-block:: python\n\n @solid(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.solid_config['word'] * context.solid_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n (\n "Attempted to pass {value_repr} to a Field that expects a valid "\n "dagster type usable in config (e.g. Dict, Int, String et al)."\n ).format(value_repr=repr(config))\n )\n return config_type\n\n def __init__(\n self, config, default_value=FIELD_NO_DEFAULT_PROVIDED, is_required=None, description=None,\n ):\n from .validate import validate_config\n from .post_process import resolve_defaults\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self.description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. 
One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values],\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.", evr.errors, default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.", evr.errors, None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @property\n def is_required(self):\n return self._is_required\n\n @property\n def default_provided(self):\n """Was a default value provided\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @property\n def default_value(self):\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @property\n def default_value_as_json_str(self):\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self):\n return ("Field({config_type}, default={default}, is_required={is_required})").format(\n config_type=self.config_type,\n default="@"\n if self._default_value == FIELD_NO_DEFAULT_PROVIDED\n else self._default_value,\n is_required=self.is_required,\n )\n\n\ndef check_opt_field_param(obj, param_name):\n return check.opt_inst_param(obj, param_name, Field)\n
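As a sketch of the enum-default rule enforced in ``Field.__init__`` above (defaults for enum-typed fields must be the config-value string, not the Python enum member); all names below are illustrative:

.. code-block:: python

    from dagster import Enum, EnumValue, Field, solid

    cowboy_type = Enum("CowboyType", [EnumValue("good"), EnumValue("bad"), EnumValue("ugly")])

    @solid(
        config_schema={
            # OK: the default is the config value 'good', not a Python enum member
            "mood": Field(cowboy_type, default_value="good", is_required=False),
            "word": Field(str, description="I am a word."),
        }
    )
    def resolve_standoff(context):
        return (context.solid_config["mood"], context.solid_config["word"])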
\n# encoding: utf-8\nimport hashlib\nfrom typing import Any, Dict\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import ConfigType, ConfigTypeKind\n\n\ndef all_optional_type(config_type):\n check.inst_param(config_type, "config_type", ConfigType)\n\n if ConfigTypeKind.is_shape(config_type.kind):\n for field in config_type.fields.values():\n if field.is_required:\n return False\n return True\n\n if ConfigTypeKind.is_selector(config_type.kind):\n if len(config_type.fields) == 1:\n for field in config_type.fields.values():\n if field.is_required:\n return False\n return True\n\n return False\n\n\nclass __FieldValueSentinel:\n pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n def __init__(self, fields, **kwargs):\n self.fields = expand_fields_dict(fields)\n super(_ConfigHasFields, self).__init__(**kwargs)\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n if key in FIELD_HASH_CACHE:\n return FIELD_HASH_CACHE[key]\n\n defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n\n FIELD_HASH_CACHE[key] = defined_cls_inst\n return defined_cls_inst\n\n\ndef _add_hash(m, string):\n m.update(string.encode("utf-8"))\n\n\ndef _compute_fields_hash(fields, description):\n\n m = hashlib.sha1() # so that hexdigest is 40, not 64 bytes\n if description:\n _add_hash(m, ":description: " + description)\n\n for field_name in sorted(list(fields.keys())):\n field = fields[field_name]\n _add_hash(m, ":fieldname:" + field_name)\n if field.default_provided:\n _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n _add_hash(m, ":is_required: " + str(field.is_required))\n _add_hash(m, ":type_key: " + field.config_type.key)\n if field.description:\n _add_hash(m, ":description: " + field.description)\n\n return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description):\n return "Shape." + _compute_fields_hash(fields, description)\n\n\n[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n """\n\n def __new__(\n cls, fields, description=None,\n ):\n return _memoize_inst_in_field_cache(\n cls, Shape, _define_shape_key_hash(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description),\n description=description,\n fields=fields,\n )\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + _compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. 
Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @solid(config_schema=Field(Permissive({'required': Field(String)})))\n def partially_specified_config(context) -> List:\n return sorted(list(context.solid_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + _compute_fields_hash(fields, description=description)\n\n\n[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @solid(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.solid_config:\n return 'Aloha {whom}!'.format(whom=context.solid_config['haw']['whom'])\n if 'cn' in context.solid_config:\n return '\u4f60\u597d\uff0c{whom}!'.format(whom=context.solid_config['cn']['whom'])\n if 'en' in context.solid_config:\n return 'Hello, {whom}!'.format(whom=context.solid_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls, Selector, _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field):\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or resolve_to_config_type(\n potential_field\n )\n\n\ndef convert_fields_to_dict_type(fields):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(original_root, fields, stack):\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields):\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(original_root, fields, stack):\n 
check.dict_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root, the_list, stack):\n from .config_type import Array\n\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef convert_potential_field(potential_field):\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root, potential_type, stack):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, dict):\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(original_root, potential_field, stack):\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n
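A small sketch contrasting ``Shape`` and ``Permissive`` as described above; the solid names are hypothetical and the two solids are otherwise identical:

.. code-block:: python

    from dagster import Field, Permissive, Shape, String, solid

    @solid(config_schema=Shape({"required": Field(String)}))
    def strict_config(context):
        # any key other than 'required' fails config validation
        return context.solid_config

    @solid(config_schema=Permissive({"required": Field(String)}))
    def loose_config(context):
        # extra keys are allowed and passed through untouched
        return sorted(context.solid_config.items())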
\nimport os\n\nfrom dagster import check\n\nfrom .config_type import ScalarUnion\nfrom .errors import PostProcessingError\nfrom .field_utils import Selector\n\nVALID_STRING_SOURCE_TYPES = (str, dict)\n\n\ndef _ensure_env_variable(var):\n check.str_param(var, "var")\n value = os.getenv(var)\n if value is None:\n raise PostProcessingError(\n (\n 'You have attempted to fetch the environment variable "{var}" '\n "which is not set. In order for this execution to succeed it "\n "must be set in this environment."\n ).format(var=var)\n )\n return value\n\n\nclass StringSourceType(ScalarUnion):\n def __init__(self):\n super(StringSourceType, self).__init__(\n scalar_type=str, non_scalar_schema=Selector({"env": str}), _key="StringSourceType",\n )\n\n def post_process(self, value):\n check.param_invariant(isinstance(value, VALID_STRING_SOURCE_TYPES), "value")\n\n if not isinstance(value, dict):\n return value\n\n key, cfg = list(value.items())[0]\n check.invariant(key == "env", "Only valid key is env")\n return str(_ensure_env_variable(cfg))\n\n\nclass IntSourceType(ScalarUnion):\n def __init__(self):\n super(IntSourceType, self).__init__(\n scalar_type=int, non_scalar_schema=Selector({"env": str}), _key="IntSourceType",\n )\n\n def post_process(self, value):\n check.param_invariant(isinstance(value, (dict, int)), "value", "Should be pre-validated")\n\n if not isinstance(value, dict):\n return value\n\n check.invariant(len(value) == 1, "Selector should have one entry")\n\n key, cfg = list(value.items())[0]\n check.invariant(key == "env", "Only valid key is env")\n value = _ensure_env_variable(cfg)\n try:\n return int(value)\n except ValueError:\n raise PostProcessingError(\n (\n 'Value "{value}" stored in env variable "{var}" cannot be '\n "coerced into an int."\n ).format(value=value, var=cfg)\n )\n\n\nStringSource = StringSourceType()\nIntSource = IntSourceType()\n
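A hedged sketch of the ``{"env": ...}`` form handled by ``StringSourceType.post_process`` above; ``my_resource`` and the ``MY_TOKEN`` variable it might read are hypothetical:

.. code-block:: python

    from dagster import Field, StringSource, resource

    @resource(config_schema={"token": Field(StringSource)})
    def my_resource(init_context):
        # config may be a literal string, or {"env": "MY_TOKEN"} to read the
        # MY_TOKEN environment variable during post-processing
        return init_context.resource_config["token"]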
\nfrom collections import namedtuple\nfrom typing import Any, Callable, Dict, Optional, Union\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.primitive_mapping import is_supported_config_python_builtin\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\n\ndef is_callable_valid_config_arg(config: Dict[str, Any]) -> bool:\n return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n[docs]class ConfigMapping(namedtuple("_ConfigMapping", "config_fn config_schema")):\n """Defines a config mapping for a composite solid.\n\n By specifying a config mapping function, you can override the configuration for the child\n solids contained within a composite solid.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the composite solid, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the composite solid to the config\n that will be provided to the child solids.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the composite config to a config appropriate for the child solids.\n config_schema (ConfigSchema): The schema of the composite config.\n """\n\n def __new__(\n cls,\n config_fn: Callable[[Union[Any, Dict[str, Any]]], Union[Any, Dict[str, Any]]],\n config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n )\n
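A minimal sketch of constructing a ``ConfigMapping`` directly, pairing a schema with a mapping function as the docstring above describes; ``simplify_config`` and the child solid name are hypothetical:

.. code-block:: python

    from dagster import ConfigMapping

    def simplify_config(cfg):
        # map composite-level config onto the child solid, keyed by solid name
        return {"child_solid": {"config": {"word": cfg["word"], "repeats": 1}}}

    mapping = ConfigMapping(config_fn=simplify_config, config_schema={"word": str})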
\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom typing import Any, Callable, Dict, Optional, Union\n\nfrom dagster import Field, check\nfrom dagster.config.evaluate_value_result import EvaluateValueResult\nfrom dagster.core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom .definition_config_schema import (\n ConfiguredDefinitionConfigSchema,\n IDefinitionConfigSchema,\n convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n @abstractproperty\n def config_schema(self) -> IDefinitionConfigSchema:\n raise NotImplementedError()\n\n @property\n def has_config_field(self) -> bool:\n return self.config_schema is not None and bool(self.config_schema.as_field())\n\n @property\n def config_field(self) -> Optional[Field]:\n return None if not self.config_schema else self.config_schema.as_field()\n\n @abstractmethod\n def copy_for_configured(\n self,\n name: Optional[str],\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n config_or_config_fn: Union[Any, Callable[[Any], Any]],\n ):\n raise NotImplementedError()\n\n def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n """\n Applies user-provided config mapping functions to the given configuration and validates the\n results against the respective config schema.\n\n Expects incoming config to be validated and have fully-resolved values (StringSource values\n resolved, Enum types hydrated, etc.) via process_config() during EnvironmentConfig\n construction and CompositeSolid config mapping.\n\n Args:\n config (Any): A validated and resolved configuration dictionary matching this object's\n config_schema\n\n Returns (EvaluateValueResult):\n If successful, the value is a validated and resolved configuration dictionary for the\n innermost wrapped object after applying the config mapping transformation function.\n """\n # If schema is on a mapped schema this is the innermost resource (base case),\n # so we aren't responsible for validating against anything farther down.\n # Returns an EVR for type consistency with config_mapping_fn.\n return (\n self.config_schema.resolve_config(config)\n if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n else EvaluateValueResult.for_value(config)\n )\n\n def configured(\n self,\n config_or_config_fn: Any,\n config_schema: Optional[Dict[str, Any]] = None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ):\n """\n Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n that its input must satisfy.\n name (Optional[str]): Name of the new definition. If not specified, inherits the name\n of the definition being configured. Note: some definitions (e.g. ResourceDefinition)\n are unnamed and this will error if a name is passed.\n description (Optional[str]): Name of the new definition. 
If not specified, inherits the name\n of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(name, description, new_config_schema, config_or_config_fn)\n\n def _name_for_configured_node(\n self,\n old_name: Optional[str],\n new_name: Optional[str],\n original_config_or_config_fn: Optional[Callable],\n ) -> Optional[str]:\n fn_name = (\n original_config_or_config_fn.__name__\n if callable(original_config_or_config_fn)\n else None\n )\n name = new_name or fn_name\n if not name:\n raise DagsterInvalidDefinitionError(\n 'Missing string param "name" while attempting to configure the node '\n '"{node_name}". When configuring a node, you must specify a name for the '\n "resulting node definition as a keyword param or use `configured` in decorator "\n "form. For examples, visit https://docs.dagster.io/overview/configuration#configured.".format(\n node_name=old_name,\n )\n )\n return name\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> Any:\n from dagster.core.definitions.composition import CallableNode\n\n check.param_invariant(\n not isinstance(configurable, CallableNode),\n "configurable",\n (\n "You have invoked `configured` on a CallableNode (an intermediate type), which is "\n "produced by aliasing or tagging a solid definition. To configure a solid, you must "\n "call `configured` on either a SolidDefinition and CompositeSolidDefinition. To fix "\n "this error, make sure to call `configured` on the definition object *before* using "\n "the `tag` or `alias` methods. For usage examples, see "\n "https://docs.dagster.io/overview/configuration#configured"\n ),\n )\n check.inst_param(\n configurable,\n "configurable",\n ConfigurableDefinition,\n (\n "Only the following types can be used with the `configured` method: ResourceDefinition, "\n "ExecutorDefinition, CompositeSolidDefinition, SolidDefinition, LoggerDefinition, "\n "and IntermediateStorageDefinition. For usage examples of "\n "`configured`, see https://docs.dagster.io/overview/configuration#configured"\n ),\n )\n\n\n[docs]def configured(\n configurable: ConfigurableDefinition,\n config_schema: Optional[Dict[str, Any]] = None,\n **kwargs: Any,\n):\n """\n A decorator that makes it easy to create a function-configured version of an object.\n The following definition types can be configured using this function:\n\n * :py:class:`CompositeSolidDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`IntermediateStorageDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`SolidDefinition`\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. 
code-block:: python\n\n dev_s3 = configured(s3_resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(s3_resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(s3_resource, {'bucket_prefix': str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n """\n _check_configurable_param(configurable)\n\n def _configured(config_or_config_fn):\n return configurable.configured(\n config_schema=config_schema, config_or_config_fn=config_or_config_fn, **kwargs\n )\n\n return _configured\n
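The examples above cover the ``configured`` function; the method form on a ``ConfigurableDefinition`` is sketched below under the assumption that ``greet`` is a named definition (solids require a name for the configured copy):

.. code-block:: python

    from dagster import Field, String, solid

    @solid(config_schema={"whom": Field(String)})
    def greet(context):
        return "Hello, {}!".format(context.solid_config["whom"])

    # Bakes config into a new SolidDefinition; solids are named definitions,
    # so the configured copy needs a name of its own.
    greet_world = greet.configured({"whom": "world"}, name="greet_world")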
\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Dict, List, Optional, Union\n\nfrom dagster import check\nfrom dagster.core.definitions.definition_config_schema import (\n convert_user_facing_definition_config_schema,\n)\n\nfrom ..composition import do_composition\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..solid import CompositeSolidDefinition\n\n\nclass _CompositeSolid:\n def __init__(\n self,\n name: Optional[str] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n description: Optional[str] = None,\n config_schema: Any = None,\n config_fn: Optional[Callable[[dict], dict]] = None,\n ):\n self.name = check.opt_str_param(name, "name")\n self.input_defs = check.opt_nullable_list_param(input_defs, "input_defs", InputDefinition)\n self.output_defs = check.opt_nullable_list_param(output_defs, "output", OutputDefinition)\n self.description = check.opt_str_param(description, "description")\n\n self.config_schema = convert_user_facing_definition_config_schema(config_schema)\n self.config_fn = check.opt_callable_param(config_fn, "config_fn")\n\n def __call__(self, fn: Callable[..., Any]):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n (\n input_mappings,\n output_mappings,\n dependencies,\n solid_defs,\n config_mapping,\n positional_inputs,\n ) = do_composition(\n "@composite_solid",\n self.name,\n fn,\n self.input_defs,\n self.output_defs,\n self.config_schema,\n self.config_fn,\n ignore_output_from_composition_fn=False,\n )\n\n composite_def = CompositeSolidDefinition(\n name=self.name,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n dependencies=dependencies,\n solid_defs=solid_defs,\n description=self.description,\n config_mapping=config_mapping,\n positional_inputs=positional_inputs,\n )\n update_wrapper(composite_def, fn)\n return composite_def\n\n\n[docs]def composite_solid(\n name: Union[Optional[str], Callable[..., Any]] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n description: Optional[str] = None,\n config_schema: Optional[Dict[str, Any]] = None,\n config_fn: Optional[Callable[[dict], dict]] = None,\n) -> _CompositeSolid:\n """Create a composite solid with the specified parameters from the decorated composition\n function.\n\n Using this decorator allows you to build up the dependency graph of the composite by writing a\n function that invokes solids and passes the output to other solids. This is similar to the use\n of the :py:func:`@pipeline <pipeline>` decorator, with the additional ability to remap inputs,\n outputs, and config across the composite boundary.\n\n Args:\n name (Optional[str]): Name for the new composite solid. 
Must be unique within any\n :py:class:`PipelineDefinition` using the solid.\n description (Optional[str]): Human-readable description of the new composite solid.\n input_defs (Optional[List[InputDefinition]]): Input definitions for the composite solid.\n If not provided explicitly, these will be inferred from typehints.\n\n Uses of these inputs in the body of the decorated composition function will be used to\n infer the appropriate set of :py:class:`InputMappings <InputMapping>` passed to the\n underlying :py:class:`CompositeSolidDefinition`.\n output_defs (Optional[List[OutputDefinition]]): Output definitions for the composite solid.\n If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`CompositeSolidDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n config_schema (Optional[ConfigSchema]): The schema for the config. Must be combined with the\n `config_fn` argument in order to transform this config into the config for the contained\n solids.\n config_fn (Callable[[dict], dict]): By specifying a config mapping\n function, you can override the configuration for the child solids contained within this\n composite solid.\n\n Config mappings require the configuration field to be specified as ``config_schema``, which\n will be exposed as the configuration field for the composite solid, as well as a\n configuration mapping function, ``config_fn``, which maps the config provided to the\n composite solid to the config that will be provided to the child solids.\n\n Examples:\n\n .. code-block:: python\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n @composite_solid\n def add_two(num: int) -> int:\n adder_1 = add_one.alias('adder_1')\n adder_2 = add_one.alias('adder_2')\n\n return adder_2(adder_1(num))\n\n """\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(config_fn is None)\n return _CompositeSolid()(name)\n\n return _CompositeSolid(\n name=name,\n input_defs=input_defs,\n output_defs=output_defs,\n description=description,\n config_schema=config_schema,\n config_fn=config_fn,\n )\n
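A hedged sketch of the ``config_schema``/``config_fn`` pair described above, remapping composite-level config onto a child solid; all names are hypothetical:

.. code-block:: python

    from dagster import Field, String, composite_solid, solid

    @solid(config_schema={"word": Field(String)})
    def echo(context):
        return context.solid_config["word"]

    def _shout_to_echo(cfg):
        # composite-level config -> config for the child solid, keyed by its name
        return {"echo": {"config": {"word": cfg["shout"].upper()}}}

    @composite_solid(config_schema={"shout": Field(String)}, config_fn=_shout_to_echo)
    def shouter():
        return echo()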
\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import split_function_parameters, validate_decorated_fn_positionals\nfrom ..events import HookExecutionResult\nfrom ..hook import HookDefinition\n\n\nclass _Hook:\n def __init__(self, name=None, required_resource_keys=None):\n self.name = check.opt_str_param(name, "name")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n expected_positionals = ["context", "event_list"]\n fn_positionals, _ = split_function_parameters(fn, expected_positionals)\n missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "'{hook_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. Hook functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context' and "\n "a second positional parameter named 'event_list'.".format(\n hook_name=fn.__name__, missing_param=missing_positional\n )\n )\n\n hook_def = HookDefinition(\n name=self.name, hook_fn=fn, required_resource_keys=self.required_resource_keys,\n )\n update_wrapper(hook_def, fn)\n return hook_def\n\n\ndef event_list_hook(name=None, required_resource_keys=None):\n """Create a generic hook with the specified parameters from the decorated function.\n\n This decorator is currently used internally by Dagster machinery to support success_hook and\n failure_hook.\n\n The user-defined hook function requires two parameters:\n - A `context` object is passed as the first parameter. The context is an instance of\n :py:class:`context <HookContext>`, and provides access to system\n information, such as loggers (context.log), resources (context.resources), the solid\n (context.solid) and its execution step (context.step) which triggers this hook.\n - An `event_list` object is passed as the second paramter. It provides the full event list of the\n associated execution step.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. code-block:: python\n\n @event_list_hook(required_resource_keys={'slack'})\n def slack_on_materializations(context, event_list):\n for event in event_list:\n if event.event_type == DagsterEventType.STEP_MATERIALIZATION:\n message = '{solid} has materialized an asset {key}.'.format(\n solid=context.solid.name,\n key=event.asset_key\n )\n # send a slack message every time a materialization event occurs\n context.resources.slack.send_message(message)\n\n\n """\n # This case is for when decorator is used bare, without arguments.\n # e.g. @event_list_hook versus @event_list_hook()\n if callable(name):\n check.invariant(required_resource_keys is None)\n return _Hook()(name)\n\n return _Hook(name=name, required_resource_keys=required_resource_keys)\n\n\n[docs]def success_hook(name=None, required_resource_keys=None):\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. 
code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_on_success(context):\n message = 'solid {} succeeded'.format(context.solid.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn):\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n fn_positionals, _ = split_function_parameters(fn, expected_positionals)\n missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "@success_hook '{hook_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. Hook functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n hook_name=fn.__name__, missing_param=missing_positional\n )\n )\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(_name, required_resource_keys)\n def _success_hook(context, event_list):\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper\n\n\n[docs]def failure_hook(name=None, required_resource_keys=None):\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_on_failure(context):\n message = 'solid {} failed'.format(context.solid.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn):\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n fn_positionals, _ = split_function_parameters(fn, expected_positionals)\n missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "@failure_hook '{hook_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. Hook functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n hook_name=fn.__name__, missing_param=missing_positional\n )\n )\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(_name, required_resource_keys)\n def _failure_hook(context, event_list):\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper\n
\nfrom functools import update_wrapper, wraps\n\nfrom dagster import check\nfrom dagster.core.types.dagster_type import DagsterTypeKind\n\nfrom ..events import Output\nfrom ..inference import infer_input_definitions_for_lambda_solid, infer_output_definitions\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..solid import SolidDefinition\nfrom .solid import validate_solid_fn\n\n\nclass _LambdaSolid:\n def __init__(self, name=None, input_defs=None, output_def=None, description=None):\n self.name = check.opt_str_param(name, "name")\n self.input_defs = check.opt_nullable_list_param(input_defs, "input_defs", InputDefinition)\n self.output_def = check.opt_inst_param(output_def, "output_def", OutputDefinition)\n self.description = check.opt_str_param(description, "description")\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n input_defs = (\n self.input_defs\n if self.input_defs is not None\n else infer_input_definitions_for_lambda_solid(self.name, fn)\n )\n output_def = (\n self.output_def\n if self.output_def is not None\n else infer_output_definitions("@lambda_solid", self.name, fn)[0]\n )\n\n positional_inputs = validate_solid_fn("@lambda_solid", self.name, fn, input_defs)\n compute_fn = _create_lambda_solid_compute_wrapper(fn, input_defs, output_def)\n\n solid_def = SolidDefinition(\n name=self.name,\n input_defs=input_defs,\n output_defs=[output_def],\n compute_fn=compute_fn,\n description=self.description,\n positional_inputs=positional_inputs,\n )\n update_wrapper(solid_def, fn)\n return solid_def\n\n\n[docs]def lambda_solid(name=None, description=None, input_defs=None, output_def=None):\n """Create a simple solid from the decorated function.\n\n This shortcut allows the creation of simple solids that do not require\n configuration and whose implementations do not require a\n :py:class:`context <SystemComputeExecutionContext>`.\n\n Lambda solids take any number of inputs and produce a single output.\n\n Inputs can be defined using :class:`InputDefinition` and passed to the ``input_defs`` argument\n of this decorator, or inferred from the type signature of the decorated function.\n\n The single output can be defined using :class:`OutputDefinition` and passed as the\n ``output_def`` argument of this decorator, or its type can be inferred from the type signature\n of the decorated function.\n\n The body of the decorated function should return a single value, which will be yielded as the\n solid's output.\n\n Args:\n name (str): Name of solid.\n description (str): Solid description.\n input_defs (List[InputDefinition]): List of input_defs.\n output_def (OutputDefinition): The output of the solid. Defaults to\n :class:`OutputDefinition() <OutputDefinition>`.\n\n Examples:\n\n .. 
code-block:: python\n\n @lambda_solid\n def hello_world():\n return 'hello'\n\n @lambda_solid(\n input_defs=[InputDefinition('foo', str)],\n output_def=OutputDefinition(str)\n )\n def hello_world(foo):\n # explicitly type and name inputs and outputs\n return foo\n\n @lambda_solid\n def hello_world(foo: str) -> str:\n # same as above inferred from signature\n return foo\n\n """\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(description is None)\n return _LambdaSolid(output_def=output_def)(name)\n\n return _LambdaSolid(\n name=name, input_defs=input_defs, output_def=output_def, description=description\n )\n\n\ndef _create_lambda_solid_compute_wrapper(fn, input_defs, output_def):\n check.callable_param(fn, "fn")\n check.list_param(input_defs, "input_defs", of_type=InputDefinition)\n check.inst_param(output_def, "output_def", OutputDefinition)\n\n input_names = [\n input_def.name\n for input_def in input_defs\n if not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n ]\n\n @wraps(fn)\n def compute(_context, input_defs):\n kwargs = {}\n for input_name in input_names:\n kwargs[input_name] = input_defs[input_name]\n\n result = fn(**kwargs)\n yield Output(value=result, output_name=output_def.name)\n\n return compute\n
\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..hook import HookDefinition\nfrom ..input import InputDefinition\nfrom ..mode import ModeDefinition\nfrom ..output import OutputDefinition\nfrom ..pipeline import PipelineDefinition\nfrom ..preset import PresetDefinition\n\n\nclass _Pipeline:\n def __init__(\n self,\n name=None,\n mode_defs=None,\n preset_defs=None,\n description=None,\n tags=None,\n hook_defs=None,\n input_defs=None,\n output_defs=None,\n config_schema=None,\n config_fn=None,\n ):\n self.name = check.opt_str_param(name, "name")\n self.mode_definitions = check.opt_list_param(mode_defs, "mode_defs", ModeDefinition)\n self.preset_definitions = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)\n self.description = check.opt_str_param(description, "description")\n self.tags = check.opt_dict_param(tags, "tags")\n self.hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self.input_defs = check.opt_nullable_list_param(\n input_defs, "input_defs", of_type=InputDefinition\n )\n self.did_pass_outputs = output_defs is not None\n self.output_defs = check.opt_nullable_list_param(\n output_defs, "output_defs", of_type=OutputDefinition\n )\n self.config_schema = config_schema\n self.config_fn = check.opt_callable_param(config_fn, "config_fn")\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n from dagster.core.definitions.decorators.composite_solid import do_composition\n\n (\n input_mappings,\n output_mappings,\n dependencies,\n solid_defs,\n config_mapping,\n positional_inputs,\n ) = do_composition(\n "@pipeline",\n self.name,\n fn,\n self.input_defs,\n self.output_defs,\n self.config_schema,\n self.config_fn,\n ignore_output_from_composition_fn=not self.did_pass_outputs,\n )\n\n pipeline_def = PipelineDefinition(\n name=self.name,\n dependencies=dependencies,\n solid_defs=solid_defs,\n mode_defs=self.mode_definitions,\n preset_defs=self.preset_definitions,\n description=self.description,\n tags=self.tags,\n hook_defs=self.hook_defs,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config_mapping=config_mapping,\n positional_inputs=positional_inputs,\n )\n update_wrapper(pipeline_def, fn)\n return pipeline_def\n\n\n[docs]def pipeline(\n name=None,\n description=None,\n mode_defs=None,\n preset_defs=None,\n tags=None,\n hook_defs=None,\n input_defs=None,\n output_defs=None,\n config_schema=None,\n config_fn=None,\n):\n """Create a pipeline with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up the dependency graph of the pipeline by writing a\n function that invokes solids and passes the output to other solids.\n\n Args:\n name (Optional[str]): The name of the pipeline. Must be unique within any\n :py:class:`RepositoryDefinition` containing the pipeline.\n description (Optional[str]): A human-readable description of the pipeline.\n mode_defs (Optional[List[ModeDefinition]]): The set of modes in which this pipeline can\n operate. Modes are used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline. Modes can be used, e.g., to vary\n available resource and logging implementations between local test and production runs.\n preset_defs (Optional[List[PresetDefinition]]): A set of preset collections of configuration\n options that may be used to execute a pipeline. 
A preset consists of an environment\n dict, an optional subset of solids to execute, and a mode selection. Presets can be used\n to ship common combinations of options to pipeline end users in Python code, and can\n be selected by tools like Dagit.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the pipeline.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n hook_defs (Optional[Set[HookDefinition]]): A set of hook definitions applied to the\n pipeline. When a hook is applied to a pipeline, it will be attached to all solid\n instances within the pipeline.\n\n Example:\n\n .. code-block:: python\n\n @solid(output_defs=[OutputDefinition(int, "two"), OutputDefinition(int, "four")])\n def emit_two_four(_) -> int:\n yield Output(2, "two")\n yield Output(4, "four")\n\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n\n @lambda_solid\n def mult_two(num: int) -> int:\n return num * 2\n\n\n @pipeline\n def math_pipeline():\n two, four = emit_two_four()\n add_one(two)\n mult_two(four)\n """\n\n if input_defs is not None:\n experimental_arg_warning("input_defs", "pipeline")\n\n if output_defs is not None:\n experimental_arg_warning("output_defs", "pipeline")\n\n if config_schema is not None:\n experimental_arg_warning("config_schema", "pipeline")\n\n if config_fn is not None:\n experimental_arg_warning("config_fn", "pipeline")\n\n if callable(name):\n check.invariant(description is None)\n return _Pipeline()(name)\n\n return _Pipeline(\n name=name,\n mode_defs=mode_defs,\n preset_defs=preset_defs,\n description=description,\n tags=tags,\n hook_defs=hook_defs,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n config_fn=config_fn,\n )\n
\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom ..job import JobDefinition\nfrom ..partition import PartitionSetDefinition\nfrom ..pipeline import PipelineDefinition\nfrom ..repository import VALID_REPOSITORY_DATA_DICT_KEYS, RepositoryData, RepositoryDefinition\nfrom ..schedule import ScheduleDefinition\n\n\nclass _Repository:\n def __init__(self, name=None, description=None):\n self.name = check.opt_str_param(name, "name")\n self.description = check.opt_str_param(description, "description")\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n repository_definitions = fn()\n\n if not (\n isinstance(repository_definitions, list)\n or isinstance(repository_definitions, dict)\n or isinstance(repository_definitions, RepositoryData)\n ):\n raise DagsterInvalidDefinitionError(\n "Bad return value of type {type_} from repository construction function: must "\n "return list, dict, or RepositoryData. See the @repository decorator docstring for "\n "details and examples".format(type_=type(repository_definitions)),\n )\n\n if isinstance(repository_definitions, list):\n bad_definitions = []\n for i, definition in enumerate(repository_definitions):\n if not (\n isinstance(definition, PipelineDefinition)\n or isinstance(definition, PartitionSetDefinition)\n or isinstance(definition, ScheduleDefinition)\n or isinstance(definition, JobDefinition)\n ):\n bad_definitions.append((i, type(definition)))\n if bad_definitions:\n raise DagsterInvalidDefinitionError(\n "Bad return value from repository construction function: all elements of list "\n "must be of type PipelineDefinition, PartitionSetDefinition, "\n "ScheduleDefinition, or JobDefinition. Got {bad_definitions_formatted}.".format(\n bad_definitions_formatted=", ".join(\n [\n "value of type {type_} at index {i}".format(type_=type_, i=i)\n for i, type_ in bad_definitions\n ]\n )\n )\n )\n repository_data = RepositoryData.from_list(repository_definitions)\n\n elif isinstance(repository_definitions, dict):\n if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n raise DagsterInvalidDefinitionError(\n "Bad return value from repository construction function: dict must not contain "\n "keys other than {{'pipelines', 'partition_sets', 'schedules', 'jobs'}}: found "\n "{bad_keys}".format(\n bad_keys=", ".join(\n [\n "'{key}'".format(key=key)\n for key in repository_definitions.keys()\n if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n ]\n )\n )\n )\n repository_data = RepositoryData.from_dict(repository_definitions)\n elif isinstance(repository_definitions, RepositoryData):\n repository_data = repository_definitions\n\n repository_def = RepositoryDefinition(\n name=self.name, description=self.description, repository_data=repository_data\n )\n\n update_wrapper(repository_def, fn)\n return repository_def\n\n\n[docs]def repository(name=None, description=None):\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should one of:\n\n 1. ``List[Union[PipelineDefinition, PartitionSetDefinition, ScheduleDefinition]]``. Use this\n form when you have no need to lazy load pipelines or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. 
code-block:: python\n\n {\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. An object of type :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n\n Example:\n\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @solid(config_schema={'n': Field(Int)})\n def return_n(context):\n return context.solid_config['n']\n\n @pipeline(name='simple_pipeline')\n def simple_pipeline():\n return_n()\n\n simple_partition_set = PartitionSetDefinition(\n name='simple_partition_set',\n pipeline_name='simple_pipeline',\n partition_fn=lambda: range(10),\n run_config_fn_for_partition=(\n lambda partition: {\n 'solids': {'return_n': {'config': {'n': partition}}}\n }\n ),\n )\n\n simple_schedule = simple_partition_set.create_schedule_definition(\n schedule_name='simple_daily_10_pm_schedule',\n cron_schedule='0 22 * * *',\n )\n\n @repository\n def simple_repository():\n return [simple_pipeline, simple_partition_set, simple_schedule]\n\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_pipeline():\n @pipeline(name='expensive_pipeline')\n def expensive_pipeline():\n for i in range(10000):\n return_n.alias('return_n_{i}'.format(i=i))()\n\n return expensive_pipeline\n\n expensive_partition_set = PartitionSetDefinition(\n name='expensive_partition_set',\n pipeline_name='expensive_pipeline',\n partition_fn=lambda: range(10),\n run_config_fn_for_partition=(\n lambda partition: {\n 'solids': {\n 'return_n_{i}'.format(i=i): {'config': {'n': partition}}\n for i in range(10000)\n }\n }\n ),\n )\n\n def make_expensive_schedule():\n return expensive_partition_set.create_schedule_definition(\n schedule_name='expensive_schedule',\n cron_schedule='0 22 * * *',\n )\n\n @repository\n def lazy_loaded_repository():\n return {\n 'pipelines': {'expensive_pipeline': make_expensive_pipeline},\n 'partition_sets': {\n 'expensive_partition_set': expensive_partition_set\n },\n 'schedules': {'expensive_schedule': make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs pipelines from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_pipeline(self, pipeline_name):\n return self._construct_pipeline_def_from_yaml_file(\n self._yaml_file_for_pipeline_name(pipeline_name)\n )\n\n ...\n\n 
@repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n\n """\n if callable(name):\n check.invariant(description is None)\n\n return _Repository()(name)\n\n return _Repository(name=name, description=description)\n
\nimport datetime\nimport warnings\n\nimport pendulum\nfrom dagster import check\nfrom dagster.core.definitions.partition import PartitionSetDefinition\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.utils.partitions import (\n DEFAULT_DATE_FORMAT,\n DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,\n DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,\n DEFAULT_MONTHLY_FORMAT,\n create_offset_partition_selector,\n schedule_partition_range,\n)\n\nfrom ..mode import DEFAULT_MODE_NAME\nfrom ..schedule import ScheduleDefinition\n\n# Error messages are long\n# pylint: disable=C0301\n\n\n[docs]def schedule(\n cron_schedule,\n pipeline_name,\n name=None,\n tags=None,\n tags_fn=None,\n solid_selection=None,\n mode="default",\n should_execute=None,\n environment_vars=None,\n execution_timezone=None,\n):\n """Create a schedule.\n\n The decorated function will be called as the ``run_config_fn`` of the underlying\n :py:class:`~dagster.ScheduleDefinition` and should take a\n :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment\n dict for the scheduled execution.\n\n Args:\n cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,\n ``'45 23 * * 6'`` for a schedule that runs at 11:45 PM every Saturday.\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleExecutionContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[[ScheduleExecutionContext], bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run. 
Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n """\n\n def inner(fn):\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n return ScheduleDefinition(\n name=schedule_name,\n cron_schedule=cron_schedule,\n pipeline_name=pipeline_name,\n run_config_fn=fn,\n tags=tags,\n tags_fn=tags_fn,\n solid_selection=solid_selection,\n mode=mode,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n )\n\n return inner\n\n\n[docs]def monthly_schedule(\n pipeline_name,\n start_date,\n name=None,\n execution_day_of_month=1,\n execution_time=datetime.time(0, 0),\n tags_fn_for_date=None,\n solid_selection=None,\n mode="default",\n should_execute=None,\n environment_vars=None,\n end_date=None,\n execution_timezone=None,\n):\n """Create a schedule that runs monthly.\n\n The decorated function will be called as the ``run_config_fn`` of the underlying\n :py:class:`~dagster.ScheduleDefinition` and should take a\n :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment\n dict for the scheduled execution.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_day_of_month (int): The day of the month on which to run the schedule (must be\n between 0 and 31).\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run. 
Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.str_param(pipeline_name, "pipeline_name")\n check.int_param(execution_day_of_month, "execution_day")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n\n if (\n start_date.day != 1\n or start_date.hour != 0\n or start_date.minute != 0\n or start_date.second != 0\n ):\n warnings.warn(\n "`start_date` must be at the beginning of the first day of the month for a monthly "\n "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "\n "at a specific time within the month. For example, to run the schedule at 3AM on the "\n "23rd of each month starting in October, your schedule definition would look like:"\n """\n@monthly_schedule(\n start_date=datetime.datetime(2020, 10, 1),\n execution_day_of_month=23,\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_day_of_month <= 0 or execution_day_of_month > 31:\n raise DagsterInvalidDefinitionError(\n "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "\n "between 1 and 31".format(execution_day_of_month)\n )\n\n cron_schedule = "{minute} {hour} {day} * *".format(\n minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_month\n )\n\n fmt = DEFAULT_MONTHLY_FORMAT\n\n execution_time_to_partition_fn = (\n lambda d: pendulum.instance(d)\n .replace(hour=0, minute=0)\n .subtract(months=1, days=execution_day_of_month - 1)\n )\n\n partition_fn = schedule_partition_range(\n start_date,\n end=end_date,\n cron_schedule=cron_schedule,\n fmt=fmt,\n timezone=execution_timezone,\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n )\n\n def inner(fn):\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value = lambda partition: {}\n if tags_fn_for_date:\n tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name,\n partition_fn=partition_fn,\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n )\n\n return partition_set.create_schedule_definition(\n schedule_name,\n cron_schedule,\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=execution_time_to_partition_fn\n ),\n execution_timezone=execution_timezone,\n )\n\n return inner\n\n\n[docs]def weekly_schedule(\n pipeline_name,\n start_date,\n name=None,\n execution_day_of_week=0,\n execution_time=datetime.time(0, 0),\n tags_fn_for_date=None,\n solid_selection=None,\n mode="default",\n should_execute=None,\n environment_vars=None,\n end_date=None,\n 
execution_timezone=None,\n):\n """Create a schedule that runs weekly.\n\n The decorated function will be called as the ``run_config_fn`` of the underlying\n :py:class:`~dagster.ScheduleDefinition` and should take a\n :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment\n dict for the scheduled execution.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_day_of_week (int): The day of the week on which to run the schedule. Must be\n between 0 (Sunday) and 6 (Saturday).\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.str_param(pipeline_name, "pipeline_name")\n check.int_param(execution_day_of_week, "execution_day_of_week")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n\n if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of a day for a weekly schedule. "\n "Use `execution_time` to execute the schedule at a specific time of day. For example, "\n "to run the schedule at 3AM each Tuesday starting on 10/20/2020, your schedule "\n "definition would look like:"\n """\n@weekly_schedule(\n start_date=datetime.datetime(2020, 10, 20),\n execution_day_of_week=1,\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_day_of_week < 0 or execution_day_of_week >= 7:\n raise DagsterInvalidDefinitionError(\n "`execution_day_of_week={}` is not valid for weekly schedule. 
Execution day must be "\n "between 0 [Sunday] and 6 [Saturday]".format(execution_day_of_week)\n )\n\n cron_schedule = "{minute} {hour} * * {day}".format(\n minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_week\n )\n\n fmt = DEFAULT_DATE_FORMAT\n\n day_difference = (execution_day_of_week - (start_date.weekday() + 1)) % 7\n\n execution_time_to_partition_fn = (\n lambda d: pendulum.instance(d)\n .replace(hour=0, minute=0)\n .subtract(weeks=1, days=day_difference)\n )\n\n partition_fn = schedule_partition_range(\n start_date,\n end=end_date,\n cron_schedule=cron_schedule,\n fmt=fmt,\n timezone=execution_timezone,\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n )\n\n def inner(fn):\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value = lambda partition: {}\n if tags_fn_for_date:\n tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name,\n partition_fn=partition_fn,\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n )\n\n return partition_set.create_schedule_definition(\n schedule_name,\n cron_schedule,\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n ),\n execution_timezone=execution_timezone,\n )\n\n return inner\n\n\n[docs]def daily_schedule(\n pipeline_name,\n start_date,\n name=None,\n execution_time=datetime.time(0, 0),\n tags_fn_for_date=None,\n solid_selection=None,\n mode="default",\n should_execute=None,\n environment_vars=None,\n end_date=None,\n execution_timezone=None,\n):\n """Create a schedule that runs daily.\n\n The decorated function will be called as the ``run_config_fn`` of the underlying\n :py:class:`~dagster.ScheduleDefinition` and should take a\n :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment\n dict for the scheduled execution.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the\n schedule should execute). 
Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n """\n check.str_param(pipeline_name, "pipeline_name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_str_param(name, "name")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(execution_timezone, "execution_timezone")\n\n if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of a day for a daily schedule. "\n "Use `execution_time` to execute the schedule at a specific time of day. For example, "\n "to run the schedule at 3AM each day starting on 10/20/2020, your schedule "\n "definition would look like:"\n """\n@daily_schedule(\n start_date=datetime.datetime(2020, 10, 20),\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n cron_schedule = "{minute} {hour} * * *".format(\n minute=execution_time.minute, hour=execution_time.hour\n )\n\n fmt = DEFAULT_DATE_FORMAT\n\n execution_time_to_partition_fn = (\n lambda d: pendulum.instance(d).replace(hour=0, minute=0).subtract(days=1,)\n )\n\n partition_fn = schedule_partition_range(\n start_date,\n end=end_date,\n cron_schedule=cron_schedule,\n fmt=fmt,\n timezone=execution_timezone,\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n )\n\n def inner(fn):\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value = lambda partition: {}\n if tags_fn_for_date:\n tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name,\n partition_fn=partition_fn,\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n )\n\n return partition_set.create_schedule_definition(\n schedule_name,\n cron_schedule,\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n ),\n execution_timezone=execution_timezone,\n )\n\n return inner\n\n\n[docs]def hourly_schedule(\n pipeline_name,\n start_date,\n name=None,\n execution_time=datetime.time(0, 0),\n tags_fn_for_date=None,\n solid_selection=None,\n mode="default",\n should_execute=None,\n environment_vars=None,\n end_date=None,\n execution_timezone=None,\n):\n """Create a schedule that runs hourly.\n\n The decorated function will be called as the ``run_config_fn`` of the underlying\n :py:class:`~dagster.ScheduleDefinition` and should take a\n 
:py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment\n dict for the scheduled execution.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create. By default, this will be the name\n of the decorated function.\n execution_time (datetime.time): The time at which to execute the schedule. Only the minutes\n component will be respected -- the hour should be 0, and will be ignored if it is not 0.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.str_param(pipeline_name, "pipeline_name")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n\n if start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of the hour for an hourly schedule. "\n "Use `execution_time` to execute the schedule at a specific time within the hour. For "\n "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "\n "on 10/20/2020, your schedule definition would look like:"\n """\n@hourly_schedule(\n start_date=datetime.datetime(2020, 10, 20, 3),\n execution_time=datetime.time(0, 15)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_time.hour != 0:\n warnings.warn(\n "Hourly schedule {schedule_name} created with:\\n"\n "\\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."\n "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "\n "will run on the {minute} mark for the previous hour interval. Replace "\n "datetime.time(hour={hour}, minute={minute}, ...) 
with "\n "datetime.time(minute={minute}, ...) to fix this warning."\n )\n\n cron_schedule = "{minute} * * * *".format(minute=execution_time.minute)\n\n fmt = (\n DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE\n if execution_timezone\n else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n )\n\n execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(\n hours=1, minutes=(execution_time.minute - start_date.minute) % 60\n )\n\n partition_fn = schedule_partition_range(\n start_date,\n end=end_date,\n cron_schedule=cron_schedule,\n fmt=fmt,\n timezone=execution_timezone,\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n )\n\n def inner(fn):\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value = lambda partition: {}\n if tags_fn_for_date:\n tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name,\n partition_fn=partition_fn,\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n )\n\n return partition_set.create_schedule_definition(\n schedule_name,\n cron_schedule,\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=execution_time_to_partition_fn,\n ),\n execution_timezone=execution_timezone,\n )\n\n return inner\n
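\n# Illustrative sketch, not part of the generated source above: the schedule decorators in this\n# module do not include usage examples in their docstrings. The pipeline name "my_pipeline" and\n# the solid config layout below are hypothetical.\nimport datetime\n\nfrom dagster import daily_schedule, schedule\n\n\n@schedule(cron_schedule="45 23 * * 6", pipeline_name="my_pipeline", name="my_saturday_schedule")\ndef my_saturday_schedule(_context):\n    # The decorated function returns the run config for each scheduled execution.\n    return {"solids": {"process_data": {"config": {"mode": "full"}}}}\n\n\n@daily_schedule(\n    pipeline_name="my_pipeline",\n    start_date=datetime.datetime(2020, 10, 20),\n    execution_time=datetime.time(3, 0),\n)\ndef my_daily_schedule(date):\n    # Partition-based decorators pass the partition's date to the decorated function.\n    return {"solids": {"process_data": {"config": {"date": date.strftime("%Y-%m-%d")}}}}\n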
\nimport inspect\n\nfrom dagster import check\nfrom dagster.core.definitions.sensor import RunRequest, SensorDefinition, SkipReason\nfrom dagster.core.errors import DagsterInvariantViolationError\n\n\n[docs]def sensor(pipeline_name, name=None, solid_selection=None, mode=None):\n """\n Creates a sensor where the decorated function is used as the sensor's evaluation function. The\n decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Yield multiple `RunRequest` objects.\n 3. Return or yield a `SkipReason` object, providing a descriptive message of why no runs were\n requested.\n 4. Return or yield nothing (skipping without providing a reason).\n\n The decorated function takes a :py:class:`~dagster.SensorExecutionContext` as its only argument.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute for runs requested by this sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute for runs for this sensor e.g.\n ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing runs for this sensor.\n (default: 'default')\n """\n check.opt_str_param(name, "name")\n\n def inner(fn):\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(context):\n result = fn(context)\n\n if inspect.isgenerator(result):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n (\n "Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n "{result} of type {type_}. Should only return SkipReason or "\n "RunRequest objects."\n ).format(sensor_name=sensor_name, result=result, type_=type(result))\n )\n\n return SensorDefinition(\n name=sensor_name,\n pipeline_name=pipeline_name,\n evaluation_fn=_wrapped_fn,\n solid_selection=solid_selection,\n mode=mode,\n )\n\n return inner\n
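\n# Illustrative sketch, not part of the generated source above: the @sensor docstring has no usage\n# example. The trigger path and pipeline name are hypothetical; RunRequest and SkipReason are\n# imported from the same module used above.\nimport os\n\nfrom dagster.core.definitions.sensor import RunRequest, SkipReason\n\n\n@sensor(pipeline_name="process_file_pipeline")\ndef trigger_file_sensor(_context):\n    # Skip (with a reason) when the trigger file is absent; otherwise request a run.\n    if not os.path.exists("/tmp/trigger"):\n        yield SkipReason("No file found at /tmp/trigger.")\n        return\n    yield RunRequest(run_key="/tmp/trigger", run_config={})\n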
\nimport inspect\nfrom functools import update_wrapper, wraps\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.core.types.dagster_type import DagsterTypeKind\n\nfrom ...decorator_utils import (\n InvalidDecoratedFunctionInfo,\n positional_arg_name_list,\n split_function_parameters,\n validate_decorated_fn_input_args,\n validate_decorated_fn_positionals,\n)\nfrom ..events import AssetMaterialization, ExpectationResult, Materialization, Output\nfrom ..inference import infer_input_definitions_for_solid, infer_output_definitions\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..solid import SolidDefinition\n\n\nclass _Solid:\n def __init__(\n self,\n name=None,\n input_defs=None,\n output_defs=None,\n description=None,\n required_resource_keys=None,\n config_schema=None,\n tags=None,\n version=None,\n ):\n self.name = check.opt_str_param(name, "name")\n self.input_defs = check.opt_nullable_list_param(input_defs, "input_defs", InputDefinition)\n self.output_defs = check.opt_nullable_list_param(\n output_defs, "output_defs", OutputDefinition\n )\n\n self.description = check.opt_str_param(description, "description")\n\n # these will be checked within SolidDefinition\n self.required_resource_keys = required_resource_keys\n self.tags = tags\n self.version = version\n\n # config will be checked within SolidDefinition\n self.config_schema = config_schema\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n input_defs = (\n self.input_defs\n if self.input_defs is not None\n else infer_input_definitions_for_solid(self.name, fn)\n )\n output_defs = (\n self.output_defs\n if self.output_defs is not None\n else infer_output_definitions("@solid", self.name, fn)\n )\n\n positional_inputs = validate_solid_fn("@solid", self.name, fn, input_defs, ["context"])\n compute_fn = _create_solid_compute_wrapper(fn, input_defs, output_defs)\n\n solid_def = SolidDefinition(\n name=self.name,\n input_defs=input_defs,\n output_defs=output_defs,\n compute_fn=compute_fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n tags=self.tags,\n positional_inputs=positional_inputs,\n version=self.version,\n )\n update_wrapper(solid_def, fn)\n return solid_def\n\n\n[docs]def solid(\n name=None,\n description=None,\n input_defs=None,\n output_defs=None,\n config_schema=None,\n required_resource_keys=None,\n tags=None,\n version=None,\n):\n """Create a solid with the specified parameters from the decorated function.\n\n This shortcut simplifies the core :class:`SolidDefinition` API by exploding arguments into\n kwargs of the decorated compute function and omitting additional parameters when they are not\n needed.\n\n Input and output definitions will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the solid's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. 
Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n Args:\n name (Optional[str]): Name of solid. Must be unique within any :py:class:`PipelineDefinition`\n using the solid.\n description (Optional[str]): Human-readable description of this solid.\n input_defs (Optional[List[InputDefinition]]):\n List of input definitions. Inferred from typehints if not provided.\n output_defs (Optional[List[OutputDefinition]]):\n List of output definitions. Inferred from typehints if not provided.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available as context.solid_config.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n version (Optional[str]): (Experimental) The version of the solid's compute_fn. Two solids should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n\n\n Examples:\n\n .. code-block:: python\n\n @solid\n def hello_world(_context):\n print('hello')\n\n @solid\n def hello_world(_context):\n return {'foo': 'bar'}\n\n @solid\n def hello_world(_context):\n return Output(value={'foo': 'bar'})\n\n @solid\n def hello_world(_context):\n yield Output(value={'foo': 'bar'})\n\n @solid\n def hello_world(_context, foo):\n return foo\n\n @solid(\n input_defs=[InputDefinition("foo", str)],\n output_defs=[OutputDefinition(str)]\n )\n def hello_world(_context, foo):\n # explicitly type and name inputs and outputs\n return foo\n\n @solid\n def hello_world(_context, foo: str) -> str:\n # same as above inferred from signature\n return foo\n\n @solid\n def hello_world(context, foo):\n context.log.info('log something')\n return foo\n\n @solid(\n config_schema={'str_value': Field(str)}\n )\n def hello_world(context, foo):\n # context.solid_config is a dictionary with 'str_value' key\n return foo + context.solid_config['str_value']\n\n """\n # This case is for when decorator is used bare, without arguments. e.g. 
@solid versus @solid()\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Solid()(name)\n\n return _Solid(\n name=name,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n )\n\n\ndef _create_solid_compute_wrapper(fn, input_defs, output_defs):\n check.callable_param(fn, "fn")\n check.list_param(input_defs, "input_defs", of_type=InputDefinition)\n check.list_param(output_defs, "output_defs", of_type=OutputDefinition)\n\n input_names = [\n input_def.name\n for input_def in input_defs\n if not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n ]\n\n @wraps(fn)\n def compute(context, input_defs):\n kwargs = {}\n for input_name in input_names:\n kwargs[input_name] = input_defs[input_name]\n\n result = fn(context, **kwargs)\n\n if inspect.isgenerator(result):\n yield from result\n else:\n if isinstance(result, (AssetMaterialization, Materialization, ExpectationResult)):\n raise DagsterInvariantViolationError(\n (\n "Error in solid {solid_name}: If you are returning an AssetMaterialization "\n "or an ExpectationResult from solid you must yield them to avoid "\n "ambiguity with an implied result from returning a value.".format(\n solid_name=context.solid.name\n )\n )\n )\n\n if isinstance(result, Output):\n yield result\n elif len(output_defs) == 1:\n if result is None and output_defs[0].is_required is False:\n context.log.warn(\n 'Value "None" returned for non-required output "{output_name}". '\n "This value will be passed to downstream solids. For conditional execution use\\n"\n ' yield Output(value, "{output_name}")\\n'\n "when you want the downstream solids to execute, "\n "and do not yield it when you want downstream solids to skip.".format(\n output_name=output_defs[0].name\n )\n )\n yield Output(value=result, output_name=output_defs[0].name)\n elif result is not None:\n if not output_defs:\n raise DagsterInvariantViolationError(\n (\n "Error in solid {solid_name}: Unexpectedly returned output {result} "\n "of type {type_}. Solid is explicitly defined to return no "\n "results."\n ).format(solid_name=context.solid.name, result=result, type_=type(result))\n )\n\n raise DagsterInvariantViolationError(\n (\n "Error in solid {solid_name}: Solid unexpectedly returned "\n "output {result} of type {type_}. 
Should "\n "be a generator, containing or yielding "\n "{n_results} results: {{{expected_results}}}."\n ).format(\n solid_name=context.solid.name,\n result=result,\n type_=type(result),\n n_results=len(output_defs),\n expected_results=", ".join(\n [\n "'{result_name}': {dagster_type}".format(\n result_name=output_def.name,\n dagster_type=output_def.dagster_type,\n )\n for output_def in output_defs\n ]\n ),\n )\n )\n\n return compute\n\n\ndef validate_solid_fn(\n decorator_name, fn_name, compute_fn, input_defs, expected_positionals=None, exclude_nothing=True\n):\n check.str_param(decorator_name, "decorator_name")\n check.str_param(fn_name, "fn_name")\n check.callable_param(compute_fn, "compute_fn")\n check.list_param(input_defs, "input_defs", of_type=InputDefinition)\n expected_positionals = check.opt_list_param(\n expected_positionals, "expected_positionals", of_type=str\n )\n if exclude_nothing:\n names = set(\n inp.name for inp in input_defs if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name for inp in input_defs if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n names = set(inp.name for inp in input_defs)\n nothing_names = set()\n\n # Currently being super strict about naming. Might be a good idea to relax. Starting strict.\n fn_positionals, input_args = split_function_parameters(compute_fn, expected_positionals)\n\n # Validate Positional Parameters\n missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "{decorator_name} '{solid_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. Solid functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n decorator_name=decorator_name, solid_name=fn_name, missing_param=missing_positional\n )\n )\n\n # Validate non positional parameters\n invalid_function_info = validate_decorated_fn_input_args(names, input_args)\n if invalid_function_info:\n if invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES["vararg"]:\n raise DagsterInvalidDefinitionError(\n "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "\n "'{param}'. Solid functions should only have keyword arguments that match "\n "input names and a first positional parameter named 'context'.".format(\n decorator_name=decorator_name,\n solid_name=fn_name,\n param=invalid_function_info.param,\n )\n )\n elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES["missing_name"]:\n if invalid_function_info.param in nothing_names:\n raise DagsterInvalidDefinitionError(\n "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is "\n "one of the solid input_defs of type 'Nothing' which should not be included since "\n "no data will be passed for it. ".format(\n decorator_name=decorator_name,\n solid_name=fn_name,\n param=invalid_function_info.param,\n )\n )\n else:\n raise DagsterInvalidDefinitionError(\n "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not "\n "one of the solid input_defs. 
Solid functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n decorator_name=decorator_name,\n solid_name=fn_name,\n param=invalid_function_info.param,\n )\n )\n elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES["extra"]:\n undeclared_inputs_printed = ", '".join(invalid_function_info.missing_names)\n raise DagsterInvalidDefinitionError(\n "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "\n "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "\n "should only have keyword arguments that match input names and a first positional "\n "parameter named 'context'.".format(\n decorator_name=decorator_name,\n solid_name=fn_name,\n undeclared_inputs_printed=undeclared_inputs_printed,\n )\n )\n\n return positional_arg_name_list(input_args)\n
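The compute wrapper above encodes the rule spelled out in the ``@solid`` docstring: a solid with a single output definition may return a plain value, which the wrapper wraps in an ``Output``, while a solid with several output definitions (or one that also yields other events) must yield explicit, named ``Output`` events. A minimal sketch under that assumption (solid and pipeline names are illustrative; imports are from the public ``dagster`` package):

.. code-block:: python

    from dagster import Output, OutputDefinition, execute_pipeline, pipeline, solid


    @solid
    def single_output(_context) -> int:
        # The wrapper turns this bare return into Output(value=1, output_name="result").
        return 1


    @solid(output_defs=[OutputDefinition(int, "a"), OutputDefinition(int, "b")])
    def multi_output(_context):
        # With more than one output definition, values must be yielded as named Outputs;
        # a bare return raises DagsterInvariantViolationError in the wrapper above.
        yield Output(1, "a")
        yield Output(2, "b")


    @solid
    def add(_context, x: int, y: int) -> int:
        return x + y


    @pipeline
    def wrapper_example_pipeline():
        a, _b = multi_output()
        add(single_output(), a)


    if __name__ == "__main__":
        assert execute_pipeline(wrapper_example_pipeline).success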
\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict, namedtuple\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.serdes import whitelist_for_serdes\nfrom dagster.utils import frozentags\n\nfrom .hook import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\n\n[docs]class SolidInvocation(namedtuple("Solid", "name alias tags hook_defs")):\n """Identifies an instance of a solid in a pipeline dependency structure.\n\n Args:\n name (str): Name of the solid of which this is an instance.\n alias (Optional[str]): Name specific to this instance of the solid. Necessary when there are\n multiple instances of the same solid.\n tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n set on the solid definition.\n hook_defs (Optional[Set[HookDefinition]]): A set of hook definitions applied to the\n solid instance.\n\n Examples:\n\n .. code-block:: python\n\n pipeline = PipelineDefinition(\n solid_defs=[solid_1, solid_2]\n dependencies={\n SolidInvocation('solid_1', alias='other_name') : {\n 'input_name' : DependencyDefinition('solid_1'),\n },\n 'solid_2' : {\n 'input_name': DependencyDefinition('other_name'),\n },\n }\n )\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`PipelineDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@pipeline <pipeline>` API:\n\n .. code-block:: python\n\n @pipeline\n def pipeline():\n other_name = solid_1.alias('other_name')\n solid_2(other_name(solid_1))\n\n """\n\n def __new__(cls, name, alias=None, tags=None, hook_defs=None):\n name = check.str_param(name, "name")\n alias = check.opt_str_param(alias, "alias")\n tags = frozentags(check.opt_dict_param(tags, "tags", value_type=str, key_type=str))\n hook_defs = frozenset(check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition))\n return super(cls, SolidInvocation).__new__(cls, name, alias, tags, hook_defs)\n\n\nclass Solid:\n """\n Solid invocation within a pipeline. Defined by its name inside the pipeline.\n\n Attributes:\n name (str):\n Name of the solid inside the pipeline. 
Must be unique per-pipeline.\n definition (NodeDefinition):\n Definition of the Node.\n """\n\n def __init__(self, name, definition, graph_definition, tags=None, hook_defs=None):\n from .graph import GraphDefinition\n from .solid import NodeDefinition\n\n self.name = check.str_param(name, "name")\n self.definition = check.inst_param(definition, "definition", NodeDefinition)\n self.graph_definition = check.inst_param(\n graph_definition, "graph_definition", GraphDefinition,\n )\n self._additional_tags = validate_tags(tags)\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n input_handles = {}\n for name, input_def in self.definition.input_dict.items():\n input_handles[name] = SolidInputHandle(self, input_def)\n\n self._input_handles = input_handles\n\n output_handles = {}\n for name, output_def in self.definition.output_dict.items():\n output_handles[name] = SolidOutputHandle(self, output_def)\n\n self._output_handles = output_handles\n\n def input_handles(self):\n return self._input_handles.values()\n\n def output_handles(self):\n return self._output_handles.values()\n\n def input_handle(self, name):\n check.str_param(name, "name")\n return self._input_handles[name]\n\n def output_handle(self, name):\n check.str_param(name, "name")\n return self._output_handles[name]\n\n def has_input(self, name):\n return self.definition.has_input(name)\n\n def input_def_named(self, name):\n return self.definition.input_def_named(name)\n\n def has_output(self, name):\n return self.definition.has_output(name)\n\n def output_def_named(self, name):\n return self.definition.output_def_named(name)\n\n @property\n def is_composite(self):\n from .graph import GraphDefinition\n\n return isinstance(self.definition, GraphDefinition)\n\n @property\n def input_dict(self):\n return self.definition.input_dict\n\n @property\n def output_dict(self):\n return self.definition.output_dict\n\n @property\n def tags(self):\n return self.definition.tags.updated_with(self._additional_tags)\n\n def container_maps_input(self, input_name):\n return (\n self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n is not None\n )\n\n def container_mapped_input(self, input_name):\n return self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n\n def container_maps_fan_in_input(self, input_name, fan_in_index):\n return (\n self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n is not None\n )\n\n def container_mapped_fan_in_input(self, input_name, fan_in_index):\n return self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n\n @property\n def hook_defs(self):\n return self._hook_defs\n\n\n@whitelist_for_serdes\nclass SolidHandle(namedtuple("_SolidHandle", "name parent")):\n def __new__(cls, name, parent):\n return super(SolidHandle, cls).__new__(\n cls, check.str_param(name, "name"), check.opt_inst_param(parent, "parent", SolidHandle),\n )\n\n def __str__(self):\n return self.to_string()\n\n @property\n def path(self):\n """Return a list representation of the handle.\n\n Inverse of SolidHandle.from_path.\n\n Returns:\n List[str]:\n """\n path = []\n cur = self\n while cur:\n path.append(cur.name)\n cur = cur.parent\n path.reverse()\n return path\n\n def to_string(self) -> str:\n """Return a unique string representation of the handle.\n\n Inverse of SolidHandle.from_string.\n """\n return self.parent.to_string() + "." 
+ self.name if self.parent else self.name\n\n def is_or_descends_from(self, handle):\n """Check if the handle is or descends from another handle.\n\n Args:\n handle (SolidHandle): The handle to check against.\n\n Returns:\n bool:\n """\n check.inst_param(handle, "handle", SolidHandle)\n\n for idx in range(len(handle.path)):\n if idx >= len(self.path):\n return False\n if self.path[idx] != handle.path[idx]:\n return False\n return True\n\n def pop(self, ancestor):\n """Return a copy of the handle with some of its ancestors pruned.\n\n Args:\n ancestor (SolidHandle): Handle to an ancestor of the current handle.\n\n Returns:\n SolidHandle:\n\n Example:\n\n .. code-block:: python\n\n handle = SolidHandle('baz', SolidHandle('bar', SolidHandle('foo', None)))\n ancestor = SolidHandle('bar', SolidHandle('foo', None))\n assert handle.pop(ancestor) == SolidHandle('baz', None)\n """\n\n check.inst_param(ancestor, "ancestor", SolidHandle)\n check.invariant(\n self.is_or_descends_from(ancestor),\n "Handle {handle} does not descend from {ancestor}".format(\n handle=self.to_string(), ancestor=ancestor.to_string()\n ),\n )\n\n return SolidHandle.from_path(self.path[len(ancestor.path) :])\n\n def with_ancestor(self, ancestor):\n """Returns a copy of the handle with an ancestor grafted on.\n\n Args:\n ancestor (SolidHandle): Handle to the new ancestor.\n\n Returns:\n SolidHandle:\n\n Example:\n\n .. code-block:: python\n\n handle = SolidHandle('baz', SolidHandle('bar', SolidHandle('foo', None)))\n ancestor = SolidHandle('quux' None)\n assert handle.with_ancestor(ancestor) == SolidHandle(\n 'baz', SolidHandle('bar', SolidHandle('foo', SolidHandle('quux', None)))\n )\n """\n check.opt_inst_param(ancestor, "ancestor", SolidHandle)\n\n return SolidHandle.from_path((ancestor.path if ancestor else []) + self.path)\n\n @staticmethod\n def from_path(path):\n check.list_param(path, "path", of_type=str)\n\n cur = None\n while len(path) > 0:\n cur = SolidHandle(name=path.pop(0), parent=cur)\n return cur\n\n @staticmethod\n def from_string(handle_str):\n check.str_param(handle_str, "handle_str")\n\n path = handle_str.split(".")\n return SolidHandle.from_path(path)\n\n @classmethod\n def from_dict(cls, dict_repr):\n """This method makes it possible to load a potentially nested SolidHandle after a\n roundtrip through json.loads(json.dumps(SolidHandle._asdict()))"""\n\n check.dict_param(dict_repr, "dict_repr", key_type=str)\n check.invariant(\n "name" in dict_repr, "Dict representation of SolidHandle must have a 'name' key"\n )\n check.invariant(\n "parent" in dict_repr, "Dict representation of SolidHandle must have a 'parent' key"\n )\n\n if isinstance(dict_repr["parent"], (list, tuple)):\n dict_repr["parent"] = SolidHandle.from_dict(\n {"name": dict_repr["parent"][0], "parent": dict_repr["parent"][1],}\n )\n\n return SolidHandle(**{k: dict_repr[k] for k in ["name", "parent"]})\n\n\nclass SolidInputHandle(namedtuple("_SolidInputHandle", "solid input_def")):\n def __new__(cls, solid, input_def):\n return super(SolidInputHandle, cls).__new__(\n cls,\n check.inst_param(solid, "solid", Solid),\n check.inst_param(input_def, "input_def", InputDefinition),\n )\n\n def _inner_str(self):\n return struct_to_string(\n "SolidInputHandle", solid_name=self.solid.name, input_name=self.input_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.solid.name, self.input_def.name))\n\n def __eq__(self, other):\n return self.solid.name == 
other.solid.name and self.input_def.name == other.input_def.name\n\n @property\n def solid_name(self):\n return self.solid.name\n\n @property\n def input_name(self):\n return self.input_def.name\n\n\nclass SolidOutputHandle(namedtuple("_SolidOutputHandle", "solid output_def")):\n def __new__(cls, solid, output_def):\n return super(SolidOutputHandle, cls).__new__(\n cls,\n check.inst_param(solid, "solid", Solid),\n check.inst_param(output_def, "output_def", OutputDefinition),\n )\n\n def _inner_str(self):\n return struct_to_string(\n "SolidOutputHandle", solid_name=self.solid.name, output_name=self.output_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.solid.name, self.output_def.name))\n\n def __eq__(self, other):\n return self.solid.name == other.solid.name and self.output_def.name == other.output_def.name\n\n def describe(self):\n return f"{self.solid_name}:{self.output_def.name}"\n\n @property\n def solid_name(self):\n return self.solid.name\n\n @property\n def is_dynamic(self):\n return self.output_def.is_dynamic\n\n\nclass InputToOutputHandleDict(defaultdict):\n def __init__(self):\n defaultdict.__init__(self, list)\n\n def __getitem__(self, key):\n check.inst_param(key, "key", SolidInputHandle)\n return defaultdict.__getitem__(self, key)\n\n def __setitem__(self, key, val):\n check.inst_param(key, "key", SolidInputHandle)\n if not (isinstance(val, SolidOutputHandle) or isinstance(val, list)):\n check.failed(\n "Value must be SolidOutoutHandle or List[SolidOutputHandle], got {val}".format(\n val=type(val)\n )\n )\n\n return defaultdict.__setitem__(self, key, val)\n\n\ndef _create_handle_dict(solid_dict, dep_dict):\n from .composition import MappedInputPlaceholder\n\n check.dict_param(solid_dict, "solid_dict", key_type=str, value_type=Solid)\n check.two_dim_dict_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict = InputToOutputHandleDict()\n\n for solid_name, input_dict in dep_dict.items():\n from_solid = solid_dict[solid_name]\n for input_name, dep_def in input_dict.items():\n if dep_def.is_multi():\n handles = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(solid_dict[inner_dep.solid].output_handle(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n "Unexpected MultiDependencyDefinition dependencies type {}".format(\n inner_dep\n )\n )\n\n handle_dict[from_solid.input_handle(input_name)] = handles\n\n else:\n handle_dict[from_solid.input_handle(input_name)] = solid_dict[\n dep_def.solid\n ].output_handle(dep_def.output)\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(solids, dep_dict):\n return DependencyStructure(list(dep_dict.keys()), _create_handle_dict(solids, dep_dict))\n\n def __init__(self, solid_names, handle_dict):\n self._solid_names = solid_names\n self._handle_dict = check.inst_param(handle_dict, "handle_dict", InputToOutputHandleDict)\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). 
Without this, this can become O(N^2) where N is solid\n # count during the GraphQL query in particular\n\n # solid_name => input_handle => list[output_handle]\n self._solid_input_index = defaultdict(dict)\n\n # solid_name => output_handle => list[input_handle]\n self._solid_output_index = defaultdict(lambda: defaultdict(list))\n\n # solid_name => dynamic output_handle\n self._solid_dynamic_index = {}\n\n for input_handle, output_handle_or_list in self._handle_dict.items():\n if isinstance(output_handle_or_list, list): # fan-in dep\n output_handle_list = []\n for handle in output_handle_or_list:\n if not isinstance(handle, SolidOutputHandle):\n continue\n\n if handle.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "\n f'Problematic dependency on dynamic output "{handle.describe()}".'\n )\n if self._solid_dynamic_index.get(handle.solid_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "\n f'Problematic dependency on output "{handle.describe()}", downstream of '\n f'"{self._solid_dynamic_index[handle.solid_name].describe()}".'\n )\n\n output_handle_list.append(handle)\n\n else: # singular dep\n output_handle = output_handle_or_list\n if output_handle.is_dynamic:\n self._validate_and_set_dynamic_output(input_handle, output_handle)\n\n if self._solid_dynamic_index.get(output_handle.solid_name):\n self._validate_and_set_dynamic_output(\n input_handle, self._solid_dynamic_index[output_handle.solid_name]\n )\n\n output_handle_list = [output_handle]\n\n self._solid_input_index[input_handle.solid.name][input_handle] = output_handle_list\n for output_handle in output_handle_list:\n self._solid_output_index[output_handle.solid.name][output_handle].append(\n input_handle\n )\n\n def _validate_and_set_dynamic_output(self, input_handle, output_handle):\n """Helper function for populating _solid_dynamic_index"""\n\n if not input_handle.solid.definition.input_supports_dynamic_output_dep(\n input_handle.input_name\n ):\n raise DagsterInvalidDefinitionError(\n f'Solid "{input_handle.solid_name}" cannot be downstream of dynamic output '\n f'"{output_handle.describe()}" since input "{input_handle.input_name}" maps to a solid '\n "that is already downstream of another dynamic output. Solids cannot be downstream of more "\n "than one dynamic output"\n )\n\n if self._solid_dynamic_index.get(input_handle.solid_name) is None:\n self._solid_dynamic_index[input_handle.solid_name] = output_handle\n return\n\n if self._solid_dynamic_index[input_handle.solid_name] != output_handle:\n raise DagsterInvalidDefinitionError(\n f'Solid "{input_handle.solid_name}" cannot be downstream of more than one dynamic output. '\n f'It is downstream of both "{output_handle.describe()}" and '\n f'"{self._solid_dynamic_index[input_handle.solid_name].describe()}"'\n )\n\n def all_upstream_outputs_from_solid(self, solid_name):\n check.str_param(solid_name, "solid_name")\n\n # flatten out all outputs that feed into the inputs of this solid\n return [\n output_handle\n for output_handle_list in self._solid_input_index[solid_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_solid(self, solid_name):\n """\n Returns a Dict[SolidInputHandle, List[SolidOutputHandle]] that encodes\n where all the the inputs are sourced from upstream. 
Usually the\n List[SolidOutputHandle] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(solid_name, "solid_name")\n return self._solid_input_index[solid_name]\n\n def output_to_downstream_inputs_for_solid(self, solid_name):\n """\n Returns a Dict[SolidOutputHandle, List[SolidInputHandle]] that\n represents all the downstream inputs for each output in the\n dictionary\n """\n check.str_param(solid_name, "solid_name")\n return self._solid_output_index[solid_name]\n\n def has_singular_dep(self, solid_input_handle):\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n return isinstance(self._handle_dict.get(solid_input_handle), SolidOutputHandle)\n\n def get_singular_dep(self, solid_input_handle):\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep = self._handle_dict[solid_input_handle]\n check.invariant(\n isinstance(dep, SolidOutputHandle),\n "Cannot call get_singular_dep when dep is not singular, got {dep}".format(\n dep=type(dep)\n ),\n )\n return dep\n\n def has_multi_deps(self, solid_input_handle):\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n return isinstance(self._handle_dict.get(solid_input_handle), list)\n\n def get_multi_deps(self, solid_input_handle):\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep = self._handle_dict[solid_input_handle]\n check.invariant(\n isinstance(dep, list),\n "Cannot call get_multi_dep when dep is singular, got {dep}".format(dep=type(dep)),\n )\n return dep\n\n def has_deps(self, solid_input_handle):\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n return solid_input_handle in self._handle_dict\n\n def get_deps_list(self, solid_input_handle):\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n check.invariant(self.has_deps(solid_input_handle))\n if self.has_singular_dep(solid_input_handle):\n return [self.get_singular_dep(solid_input_handle)]\n else:\n return self.get_multi_deps(solid_input_handle)\n\n def input_handles(self):\n return list(self._handle_dict.keys())\n\n def items(self):\n return self._handle_dict.items()\n\n def get_upstream_dynamic_handle_for_solid(self, solid_name):\n return self._solid_dynamic_index.get(solid_name)\n\n def debug_str(self):\n if not self.items():\n return "DependencyStructure: EMPTY"\n\n debug = "DependencyStructure: \\n"\n for in_handle, out_handle in self.items():\n debug += " {out_solid}.{out_name} ---> {in_solid}.{in_name}\\n".format(\n out_solid=out_handle.solid.name,\n out_name=out_handle.output_def.name,\n in_name=in_handle.input_def.name,\n in_solid=in_handle.solid.name,\n )\n return debug\n\n\nclass IDependencyDefinition(ABC): # pylint: disable=no-init\n @abstractmethod\n def get_solid_dependencies(self):\n pass\n\n @abstractmethod\n def is_multi(self):\n pass\n\n\n[docs]class DependencyDefinition(\n namedtuple("_DependencyDefinition", "solid output description"), IDependencyDefinition\n):\n """Represents an edge in the DAG of solid instances forming a pipeline.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a pipeline whose keys represent the dependent solid and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of solid_b depends on the output named 'result' of\n solid_a, this structure will look as follows:\n\n .. 
code-block:: python\n\n dependency_structure = {\n 'solid_b': {\n 'input': DependencyDefinition('solid_a', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`PipelineDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@pipeline <pipeline>` API:\n\n .. code-block:: python\n\n @pipeline\n def pipeline():\n solid_b(solid_a())\n\n\n Args:\n solid (str): The name of the solid that is depended on, that is, from which the value\n passed between the two solids originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n """\n\n def __new__(cls, solid, output=DEFAULT_OUTPUT, description=None):\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(solid, "solid"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_solid_dependencies(self):\n return [self]\n\n def is_multi(self):\n return False\n\n\n[docs]class MultiDependencyDefinition(\n namedtuple("_MultiDependencyDefinition", "dependencies"), IDependencyDefinition\n):\n """Represents a fan-in edge in the DAG of solid instances forming a pipeline.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a pipeline whose keys represent the dependent solid and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of solid_c depends on the outputs named 'result' of\n solid_a and solid_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'solid_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('solid_a', 'result'),\n DependencyDefinition('solid_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`PipelineDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@pipeline <pipeline>` API:\n\n .. code-block:: python\n\n @pipeline\n def pipeline():\n solid_c(solid_a(), solid_b())\n\n Args:\n solid (str): The name of the solid that is depended on, that is, from which the value\n passed between the two solids originates.\n output (Optional[str]): The name of the output that is depended on. 
(default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n """\n\n def __new__(cls, dependencies):\n from .composition import MappedInputPlaceholder\n\n deps = check.list_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.solid + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n 'Duplicate dependencies on solid "{dep.solid}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition.".format(dep=dep)\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed("Unexpected dependencies entry {}".format(dep))\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n def get_solid_dependencies(self):\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]\n\n def is_multi(self):\n return True\n\n def get_dependencies_and_mappings(self):\n return self.dependencies\n
\nimport os\nimport re\nimport warnings\nfrom collections import namedtuple\nfrom enum import Enum\n\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterInvalidAssetKey\nfrom dagster.serdes import Persistable, whitelist_for_persistence\n\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\n\ndef last_file_comp(path):\n return os.path.basename(os.path.normpath(path))\n\n\nASSET_KEY_REGEX = re.compile("^[a-zA-Z0-9_.-]+$") # alphanumeric, _, -, .\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_STRUCTURED_DELIMITER = "."\n\n\ndef validate_asset_key_string(s):\n if not s or not ASSET_KEY_REGEX.match(s):\n raise DagsterInvalidAssetKey()\n\n return s\n\n\ndef parse_asset_key_string(s):\n return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n[docs]@whitelist_for_persistence\nclass AssetKey(namedtuple("_AssetKey", "path"), Persistable):\n """ Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n @solid\n def emit_metadata_solid(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata_entries=[\n EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")\n ],\n )\n\n @solid\n def structured_asset_key_solid(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata_entries=[\n EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")\n ],\n )\n\n @solid\n def structured_asset_key_solid_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata_entries=[\n EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")\n ],\n )\n\n Args:\n path (str|str[]|str()): String, list of strings, or tuple of strings. 
A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path=None):\n if isinstance(path, str):\n path = [path]\n elif isinstance(path, list):\n path = check.list_param(path, "path", of_type=str)\n else:\n path = check.tuple_param(path, "path", of_type=str)\n\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return "AssetKey({})".format(self.path)\n\n def __repr__(self):\n return "AssetKey({})".format(self.path)\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n return self.to_string() == other.to_string()\n\n def to_string(self, legacy=False):\n if not self.path:\n return None\n if legacy:\n return ASSET_KEY_STRUCTURED_DELIMITER.join(self.path)\n return seven.json.dumps(self.path)\n\n @staticmethod\n def from_db_string(asset_key_string):\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path, legacy=False):\n check.list_param(path, "path", of_type=str)\n if legacy:\n return ASSET_KEY_STRUCTURED_DELIMITER.join(path)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(asset_key):\n if asset_key and asset_key.get("path"):\n return AssetKey(asset_key.get("path"))\n return None\n\n\n[docs]@whitelist_for_persistence\nclass EventMetadataEntry(\n namedtuple("_EventMetadataEntry", "label description entry_data"), Persistable\n):\n """The standard structure for describing metadata for Dagster events.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in Dagit and other tooling.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n entry_data (Union[(Union[TextMetadataEntryData, UrlMetadataEntryData, PathMetadataEntryData, JsonMetadataEntryData, MarkdownMetadataEntryData, FloatMetadataEntryData, IntMetadataEntryData]):\n Typed metadata entry data. The different types allow for customized display in tools\n like dagit.\n """\n\n def __new__(cls, label, description, entry_data):\n return super(EventMetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(entry_data, "entry_data", EntryDataUnion),\n )\n\n[docs] @staticmethod\n def text(text, label, description=None):\n """Static constructor for a metadata entry containing text as\n :py:class:`TextMetadataEntryData`. For example:\n\n .. 
code-block:: python\n\n @solid\n def emit_metadata_solid(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[\n EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")\n ],\n )\n\n Args:\n text (Optional[str]): The text of this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return EventMetadataEntry(label, description, TextMetadataEntryData(text))\n\n[docs] @staticmethod\n def url(url, label, description=None):\n """Static constructor for a metadata entry containing a URL as\n :py:class:`UrlMetadataEntryData`. For example:\n\n .. code-block:: python\n\n @solid\n def emit_metadata_solid(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata_entries=[\n EventMetadataEntry.url(\n "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n ),\n ],\n )\n\n Args:\n url (Optional[str]): The URL contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return EventMetadataEntry(label, description, UrlMetadataEntryData(url))\n\n[docs] @staticmethod\n def path(path, label, description=None):\n """Static constructor for a metadata entry containing a path as\n :py:class:`PathMetadataEntryData`. For example:\n\n .. code-block:: python\n\n @solid\n def emit_metadata_solid(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[EventMetadataEntry.path("path/to/file", label="filepath")],\n )\n\n Args:\n path (Optional[str]): The path contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return EventMetadataEntry(label, description, PathMetadataEntryData(path))\n\n[docs] @staticmethod\n def fspath(path, label=None, description=None):\n """Static constructor for a metadata entry containing a filesystem path as\n :py:class:`PathMetadataEntryData`. For example:\n\n .. code-block:: python\n\n @solid\n def emit_metadata_solid(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[EventMetadataEntry.fspath("path/to/file")],\n )\n\n Args:\n path (Optional[str]): The path contained by this metadata entry.\n label (str): Short display label for this metadata entry. Defaults to the\n base name of the path.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return EventMetadataEntry.path(\n path, label if label is not None else last_file_comp(path), description\n )\n\n[docs] @staticmethod\n def json(data, label, description=None):\n """Static constructor for a metadata entry containing JSON data as\n :py:class:`JsonMetadataEntryData`. For example:\n\n .. 
code-block:: python\n\n @solid\n def emit_metadata_solid(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata_entries=[\n EventMetadataEntry.json(\n label="metadata", data={"missing_columns": missing_things},\n )\n ],\n )\n\n Args:\n data (Optional[Dict[str, Any]]): The JSON data contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return EventMetadataEntry(label, description, JsonMetadataEntryData(data))\n\n[docs] @staticmethod\n def md(md_str, label, description=None):\n """Static constructor for a metadata entry containing markdown data as\n :py:class:`MarkdownMetadataEntryData`. For example:\n\n .. code-block:: python\n\n @solid\n def emit_metadata_solid(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata_entries=[EventMetadataEntry.md(md_str=md_str)],\n )\n\n Args:\n md_str (Optional[str]): The markdown contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return EventMetadataEntry(label, description, MarkdownMetadataEntryData(md_str))\n\n @staticmethod\n def python_artifact(python_artifact, label, description=None):\n check.callable_param(python_artifact, "python_artifact")\n return EventMetadataEntry(\n label,\n description,\n PythonArtifactMetadataEntryData(python_artifact.__module__, python_artifact.__name__),\n )\n\n[docs] @staticmethod\n def float(value, label, description=None):\n """Static constructor for a metadata entry containing float as\n :py:class:`FloatMetadataEntryData`. For example:\n\n .. code-block:: python\n\n @solid\n def emit_metadata_solid(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[EventMetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n )\n\n Args:\n value (Optional[float]): The float value contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return EventMetadataEntry(label, description, FloatMetadataEntryData(value))\n\n[docs] @staticmethod\n def int(value, label, description=None):\n """Static constructor for a metadata entry containing int as\n :py:class:`IntMetadataEntryData`. For example:\n\n .. 
code-block:: python\n\n @solid\n def emit_metadata_solid(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[EventMetadataEntry.int(len(df), "number of rows")],\n )\n\n Args:\n value (Optional[int]): The int value contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return EventMetadataEntry(label, description, IntMetadataEntryData(value))\n\n\n[docs]@whitelist_for_persistence\nclass TextMetadataEntryData(namedtuple("_TextMetadataEntryData", "text"), Persistable):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text):\n return super(TextMetadataEntryData, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )\n\n\n[docs]@whitelist_for_persistence\nclass UrlMetadataEntryData(namedtuple("_UrlMetadataEntryData", "url"), Persistable):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url):\n return super(UrlMetadataEntryData, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )\n\n\n[docs]@whitelist_for_persistence\nclass PathMetadataEntryData(namedtuple("_PathMetadataEntryData", "path"), Persistable):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string.\n """\n\n def __new__(cls, path):\n return super(PathMetadataEntryData, cls).__new__(\n cls, check.opt_str_param(path, "path", default="")\n )\n\n\n[docs]@whitelist_for_persistence\nclass JsonMetadataEntryData(namedtuple("_JsonMetadataEntryData", "data"), Persistable):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Optional[Dict[str, Any]]): The JSON data.\n """\n\n def __new__(cls, data):\n return super(JsonMetadataEntryData, cls).__new__(\n cls, check.opt_dict_param(data, "data", key_type=str)\n )\n\n\n[docs]@whitelist_for_persistence\nclass MarkdownMetadataEntryData(namedtuple("_MarkdownMetadataEntryData", "md_str"), Persistable):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str):\n return super(MarkdownMetadataEntryData, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )\n\n\n@whitelist_for_persistence\nclass PythonArtifactMetadataEntryData(\n namedtuple("_PythonArtifactMetadataEntryData", "module name"), Persistable\n):\n def __new__(cls, module, name):\n return super(PythonArtifactMetadataEntryData, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )\n\n\n[docs]@whitelist_for_persistence\nclass FloatMetadataEntryData(namedtuple("_FloatMetadataEntryData", "value"), Persistable):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value):\n return super(FloatMetadataEntryData, cls).__new__(\n cls, check.opt_float_param(value, "value")\n )\n\n\n[docs]@whitelist_for_persistence\nclass IntMetadataEntryData(namedtuple("_IntMetadataEntryData", "value"), Persistable):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value):\n return super(IntMetadataEntryData, cls).__new__(cls, check.opt_int_param(value, "value"))\n\n\nEntryDataUnion = (\n TextMetadataEntryData,\n UrlMetadataEntryData,\n 
PathMetadataEntryData,\n JsonMetadataEntryData,\n MarkdownMetadataEntryData,\n PythonArtifactMetadataEntryData,\n FloatMetadataEntryData,\n IntMetadataEntryData,\n)\n\n\n[docs]class Output(namedtuple("_Output", "value output_name")):\n """Event corresponding to one of a solid's outputs.\n\n Solid compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining a solid using the\n :py:class:`SolidDefinition` API directly.\n\n Outputs are values produced by solids that will be consumed by downstream solids in a pipeline.\n They are type-checked at solid boundaries when their corresponding :py:class:`OutputDefinition`\n or the downstream :py:class:`InputDefinition` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding output definition. (default:\n "result")\n """\n\n def __new__(cls, value, output_name=DEFAULT_OUTPUT):\n return super(Output, cls).__new__(cls, value, check.str_param(output_name, "output_name"),)\n\n\n[docs]class DynamicOutput(namedtuple("_DynamicOutput", "value mapping_key output_name")):\n """\n (Experimental) Variant of :py:class:`Output` used to support mapping. Each DynamicOutput\n produced by a solid will result in the downstream dag being cloned to run on that individual\n value. Each DynamicOutput must have a unique mapping_key to distinguish it.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n output_name (Optional[str]):\n Name of the corresponding output definition. (default: "result")\n """\n\n def __new__(cls, value, mapping_key, output_name=DEFAULT_OUTPUT):\n\n return super(DynamicOutput, cls).__new__(\n cls,\n value,\n check_valid_name(check.str_param(mapping_key, "mapping_key")),\n check.str_param(output_name, "output_name"),\n )\n\n\n[docs]@whitelist_for_persistence\nclass AssetMaterialization(\n namedtuple("_AssetMaterialization", "asset_key description metadata_entries partition"),\n Persistable,\n):\n """Event indicating that a solid has materialized an asset.\n\n Solid compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. 
Unlike outputs, asset materializations can not be passed to other\n solids, and their persistence is controlled by solid logic, rather than by the Dagster\n framework.\n\n Solid authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in Dagit.\n\n Args:\n asset_key (str|List[str]|AssetKey): A key to identify the materialized asset across pipeline\n runs\n description (Optional[str]): A longer human-radable description of the materialized value.\n metadata_entries (Optional[List[EventMetadataEntry]]): Arbitrary metadata about the\n materialized value.\n partition (Optional[str]): The name of the partition that was materialized.\n """\n\n def __new__(cls, asset_key, description=None, metadata_entries=None, partition=None):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n elif isinstance(asset_key, list):\n check.is_list(asset_key, of_type=str)\n asset_key = AssetKey(asset_key)\n else:\n check.is_tuple(asset_key, of_type=str)\n asset_key = AssetKey(asset_key)\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata_entries=check.opt_list_param(\n metadata_entries, metadata_entries, of_type=EventMetadataEntry\n ),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self):\n return " ".join(self.asset_key.path)\n\n[docs] @staticmethod\n def file(path, description=None, asset_key=None):\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=asset_key,\n description=description,\n metadata_entries=[EventMetadataEntry.fspath(path)],\n )\n\n\n@whitelist_for_persistence\nclass Materialization(\n namedtuple("_Materialization", "label description metadata_entries asset_key partition"),\n Persistable,\n):\n """Event indicating that a solid has materialized a value.\n\n Solid compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. 
Unlike outputs, materializations can not be passed to other solids,\n and their persistence is controlled by solid logic, rather than by the Dagster framework.\n\n Solid authors should use these events to organize metadata about the side effects of their\n computations to enable downstream tooling like artifact catalogues and diff tools.\n\n Args:\n label (str): A short display name for the materialized value.\n description (Optional[str]): A longer human-radable description of the materialized value.\n metadata_entries (Optional[List[EventMetadataEntry]]): Arbitrary metadata about the\n materialized value.\n asset_key (Optional[str|AssetKey]): An optional parameter to identify the materialized asset\n across pipeline runs\n partition (Optional[str]): The name of the partition that was materialized.\n """\n\n def __new__(\n cls,\n label=None,\n description=None,\n metadata_entries=None,\n asset_key=None,\n partition=None,\n skip_deprecation_warning=False,\n ):\n if asset_key and isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n if not label:\n check.param_invariant(\n asset_key and asset_key.path,\n "label",\n "Either label or asset_key with a path must be provided",\n )\n label = asset_key.to_string()\n\n if not skip_deprecation_warning:\n warnings.warn("`Materialization` is deprecated; use `AssetMaterialization` instead.")\n\n return super(Materialization, cls).__new__(\n cls,\n label=check.str_param(label, "label"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=check.opt_list_param(\n metadata_entries, metadata_entries, of_type=EventMetadataEntry\n ),\n asset_key=asset_key,\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @staticmethod\n def file(path, description=None, asset_key=None):\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n return Materialization(\n label=last_file_comp(path),\n description=description,\n metadata_entries=[EventMetadataEntry.fspath(path)],\n asset_key=asset_key,\n )\n\n @classmethod\n def from_storage_dict(cls, storage_dict):\n # override the default `from_storage_dict` implementation in order to skip the deprecation\n # warning for historical Materialization events, loaded from event_log storage\n return Materialization.__new__(cls, skip_deprecation_warning=True, **storage_dict)\n\n\n[docs]@whitelist_for_persistence\nclass ExpectationResult(\n namedtuple("_ExpectationResult", "success label description metadata_entries"), Persistable\n):\n """Event corresponding to a data quality test.\n\n Solid compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. 
Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata_entries (Optional[List[EventMetadataEntry]]): Arbitrary metadata about the\n expectation.\n """\n\n def __new__(cls, success, label=None, description=None, metadata_entries=None):\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=check.opt_list_param(\n metadata_entries, metadata_entries, of_type=EventMetadataEntry\n ),\n )\n\n\n[docs]@whitelist_for_persistence\nclass TypeCheck(namedtuple("_TypeCheck", "success description metadata_entries"), Persistable):\n """Event corresponding to a successful typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Solid compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata_entries (Optional[List[EventMetadataEntry]]): Arbitrary metadata about the\n type check.\n """\n\n def __new__(cls, success, description=None, metadata_entries=None):\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=check.opt_list_param(\n metadata_entries, metadata_entries, of_type=EventMetadataEntry\n ),\n )\n\n\n[docs]class Failure(Exception):\n """Event indicating solid failure.\n\n Raise events of this type from within solid compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata_entries (Optional[List[EventMetadataEntry]]): Arbitrary metadata about the\n failure.\n """\n\n def __init__(self, description=None, metadata_entries=None):\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=EventMetadataEntry\n )\n\n\n[docs]class RetryRequested(Exception):\n """\n An exception to raise from a solid to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The max number of retries this step should attempt before failing\n seconds_to_wait (Optional[int]):\n Seconds to wait before restarting the step after putting the step in\n to the up_for_retry state\n\n Example:\n\n .. 
code-block:: python\n\n @solid\n def flakes():\n try:\n flakey_operation()\n except:\n raise RetryRequested(max_retries=3)\n """\n\n def __init__(self, max_retries=1, seconds_to_wait=None):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_int_param(seconds_to_wait, "seconds_to_wait")\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n namedtuple(\n "_ObjectStoreOperation",\n "op key dest_key obj serialization_strategy_name object_store_name value_name version mapping_key",\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op,\n key,\n dest_key=None,\n obj=None,\n serialization_strategy_name=None,\n object_store_name=None,\n value_name=None,\n version=None,\n mapping_key=None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(namedtuple("_HookExecutionResult", "hook_name is_skipped")):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otheriwse.\n """\n\n def __new__(cls, hook_name, is_skipped=None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=check.opt_bool_param(is_skipped, "is_skipped", default=False),\n )\n
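The event classes above are yielded from solid compute functions; per the ``Output`` docstring, once a solid yields any other event it must also wrap its output in an explicit ``Output``. A minimal sketch under the docstrings above (the asset key, path, and metadata values are illustrative):

.. code-block:: python

    from dagster import (
        AssetKey,
        AssetMaterialization,
        EventMetadataEntry,
        ExpectationResult,
        Output,
        solid,
    )


    @solid
    def persist_rows(_context, rows: list):
        path = "/tmp/rows.json"  # illustrative location; write `rows` here in real code

        yield AssetMaterialization(
            asset_key=AssetKey(["warehouse", "rows"]),
            description="Rows persisted to disk.",
            metadata_entries=[
                EventMetadataEntry.fspath(path),
                EventMetadataEntry.int(len(rows), "row count"),
            ],
        )
        yield ExpectationResult(
            success=len(rows) > 0,
            label="non_empty",
            description="At least one row was produced.",
        )
        # Because other events are yielded, the output must be wrapped explicitly.
        yield Output(path)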
\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.builtins import Int\nfrom dagster.config.field import Field\nfrom dagster.core.definitions.configurable import ConfigurableDefinition\nfrom dagster.core.definitions.reconstructable import ReconstructablePipeline\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.execution.retries import Retries, get_retries_config\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\n\n[docs]class ExecutorDefinition(ConfigurableDefinition):\n """\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n """\n\n def __init__(\n self, name, config_schema=None, executor_creation_fn=None, description=None,\n ):\n self._name = check.str_param(name, "name")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @property\n def name(self):\n return self._name\n\n @property\n def description(self):\n return self._description\n\n @property\n def config_schema(self):\n return self._config_schema\n\n @property\n def executor_creation_fn(self):\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema, _):\n return ExecutorDefinition(\n name=name or self.name,\n config_schema=config_schema,\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n )\n\n\n[docs]def executor(name=None, config_schema=None):\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`.\n """\n if callable(name):\n check.invariant(config_schema is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(name=name, config_schema=config_schema)\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name, config_schema=self.config_schema, executor_creation_fn=fn,\n )\n\n update_wrapper(executor_def, wrapped=fn)\n\n return executor_def\n\n\n[docs]@executor(\n name="in_process",\n config_schema={\n "retries": get_retries_config(),\n "marker_to_close": Field(str, is_required=False),\n },\n)\ndef in_process_executor(init_context):\n """The default in-process executor.\n\n In most Dagster environments, this will be the default executor. It is available by default on\n any :py:class:`ModeDefinition` that does not provide custom executors. To select it explicitly,\n include the following top-level fragment in config:\n\n .. 
code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n from dagster.core.executor.init import InitExecutorContext\n from dagster.core.executor.in_process import InProcessExecutor\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=Retries.from_config(init_context.executor_config.get("retries", {"enabled": {}})),\n marker_to_close=init_context.executor_config.get("marker_to_close"),\n )\n\n\n[docs]@executor(\n name="multiprocess",\n config_schema={\n "max_concurrent": Field(Int, is_required=False, default_value=0),\n "retries": get_retries_config(),\n },\n)\ndef multiprocess_executor(init_context):\n """The default multiprocess executor.\n\n This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`\n that does not provide custom executors. To select the multiprocess executor, include a fragment\n such as the following in your config:\n\n .. code-block:: yaml\n\n execution:\n multiprocess:\n config:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n from dagster.core.executor.init import InitExecutorContext\n from dagster.core.executor.multiprocess import MultiprocessExecutor\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n\n check_cross_process_constraints(init_context)\n\n return MultiprocessExecutor(\n pipeline=init_context.pipeline,\n max_concurrent=init_context.executor_config["max_concurrent"],\n retries=Retries.from_config(init_context.executor_config["retries"]),\n )\n\n\ndefault_executors = [in_process_executor, multiprocess_executor]\n\n\ndef check_cross_process_constraints(init_context):\n from dagster.core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n\n _check_intra_process_pipeline(init_context.pipeline)\n _check_non_ephemeral_instance(init_context.instance)\n _check_persistent_storage_requirement(\n init_context.pipeline.get_definition(),\n init_context.mode_def,\n init_context.intermediate_storage_def,\n )\n\n\ndef _check_intra_process_pipeline(pipeline):\n if not isinstance(pipeline, ReconstructablePipeline):\n raise DagsterUnmetExecutorRequirementsError(\n 'You have attempted to use an executor that uses multiple processes with the pipeline "{name}" '\n "that is not reconstructable. Pipelines must be loaded in a way that allows dagster to reconstruct "\n "them in a new process. 
This means: \\n"\n " * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\\n"\n " * loading the pipeline through the reconstructable() function\\n".format(\n name=pipeline.get_definition().name\n )\n )\n\n\ndef _all_outputs_non_mem_io_managers(pipeline_def, mode_def):\n """Returns true if every output definition in the pipeline uses an IO manager that's not\n the mem_io_manager.\n\n If true, this indicates that it's OK to execute steps in their own processes, because their\n outputs will be available to other processes.\n """\n # pylint: disable=comparison-with-callable\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n output_defs = [\n output_def\n for solid_def in pipeline_def.all_solid_defs\n for output_def in solid_def.output_defs\n ]\n for output_def in output_defs:\n if mode_def.resource_defs[output_def.io_manager_key] == mem_io_manager:\n return False\n\n return True\n\n\ndef _check_persistent_storage_requirement(pipeline_def, mode_def, intermediate_storage_def):\n """We prefer to store outputs with IO managers, but will fall back to intermediate storage\n if an IO manager isn't set.\n """\n if not (\n _all_outputs_non_mem_io_managers(pipeline_def, mode_def)\n or (intermediate_storage_def and intermediate_storage_def.is_persistent)\n ):\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes, but your pipeline "\n "includes solid outputs that will not be stored somewhere where other processes can"\n "retrieve them. "\n "Please make sure that your pipeline definition includes a ModeDefinition whose "\n 'resource_keys assign the "io_manager" key to an IOManager resource '\n "that stores outputs outside of the process, such as the fs_io_manager."\n )\n\n\ndef _check_non_ephemeral_instance(instance):\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an "\n "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "\n "execution between multiple processes. You can configure your default instance "\n "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs."\n )\n
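The ``@executor`` decorator documented above is also the entry point for user-defined executors: the decorated function receives an :py:class:`InitExecutorContext` and must return an :py:class:`Executor`. A minimal sketch that delegates to the same in-process ``Executor`` implementation this module constructs; the ``my_in_process`` name, the ``verbose`` config field, and the mode wiring are illustrative assumptions, not part of the source.

.. code-block:: python

    from dagster import Field, ModeDefinition, executor
    from dagster.core.execution.retries import Retries
    from dagster.core.executor.in_process import InProcessExecutor


    @executor(
        name="my_in_process",  # assumed name, for illustration only
        config_schema={"verbose": Field(bool, is_required=False, default_value=False)},
    )
    def my_in_process_executor(init_context):
        # Reuse the Executor implementation behind the built-in in_process_executor,
        # leaving retries at their default (enabled) configuration.
        return InProcessExecutor(
            retries=Retries.from_config({"enabled": {}}),
            marker_to_close=None,
        )


    # Executors become selectable once listed on a mode.
    mode = ModeDefinition(executor_defs=[my_in_process_executor])

Selecting the custom executor in run config then mirrors the ``execution: in_process:`` fragments shown in the docstrings above, using the executor's name as the key.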
\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import (\n BuiltinScalarDagsterType,\n DagsterType,\n resolve_dagster_type,\n)\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .utils import check_valid_name\n\n\nclass _NoValueSentinel:\n pass\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name, dagster_type, default_value):\n if default_value is not _NoValueSentinel:\n if dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n "Setting a default_value is invalid on InputDefinitions of type Nothing"\n )\n\n if isinstance(dagster_type, BuiltinScalarDagsterType):\n type_check = dagster_type.type_check_scalar_value(default_value)\n if not type_check.success:\n raise DagsterInvalidDefinitionError(\n (\n "Type check failed for the default_value of InputDefinition "\n "{input_name} of type {dagster_type}. "\n "Received value {value} of type {type}"\n ).format(\n input_name=input_name,\n dagster_type=dagster_type.display_name,\n value=default_value,\n type=type(default_value),\n ),\n )\n\n return default_value\n\n\n[docs]class InputDefinition:\n """Defines an argument to a solid's compute function.\n\n Inputs may flow from previous solids' outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n name (str): Name of the input.\n dagster_type (Optional[Any]): The type of this input. Users should provide one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type`, or\n :py:func:`PythonObjectDagsterType`, or a Python type. 
Defaults to :py:class:`Any`.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n root_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`RootInputManager` used for loading this input when it is not connected to an\n upstream output.\n metadata (Optional[Dict[str, Any]]): (Experimental) A dict of metadata for the input.\n """\n\n def __init__(\n self,\n name,\n dagster_type=None,\n description=None,\n default_value=_NoValueSentinel,\n root_manager_key=None,\n metadata=None,\n ):\n ""\n self._name = check_valid_name(name)\n\n self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n self._description = check.opt_str_param(description, "description")\n\n self._default_value = _check_default_value(self._name, self._dagster_type, default_value)\n\n if root_manager_key:\n experimental_arg_warning("root_manager_key", "InputDefinition")\n\n self._root_manager_key = check.opt_str_param(root_manager_key, "root_manager_key")\n\n if metadata:\n experimental_arg_warning("metadata", "InputDefinition")\n\n self._metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n @property\n def name(self):\n return self._name\n\n @property\n def dagster_type(self):\n return self._dagster_type\n\n @property\n def description(self):\n return self._description\n\n @property\n def has_default_value(self):\n return self._default_value is not _NoValueSentinel\n\n @property\n def default_value(self):\n check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n return self._default_value\n\n @property\n def root_manager_key(self):\n return self._root_manager_key\n\n @property\n def metadata(self):\n return self._metadata\n\n[docs] def mapping_to(self, solid_name, input_name, fan_in_index=None):\n """Create an input mapping to an input of a child solid.\n\n In a CompositeSolidDefinition, you can use this helper function to construct\n an :py:class:`InputMapping` to the input of a child solid.\n\n Args:\n solid_name (str): The name of the child solid to which to map this input.\n input_name (str): The name of the child solid' input to which to map this input.\n fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n Examples:\n\n .. 
code-block:: python\n\n input_mapping = InputDefinition('composite_input', Int).mapping_to(\n 'child_solid', 'int_input'\n )\n """\n check.str_param(solid_name, "solid_name")\n check.str_param(input_name, "input_name")\n check.opt_int_param(fan_in_index, "fan_in_index")\n\n if fan_in_index is not None:\n maps_to = FanInInputPointer(solid_name, input_name, fan_in_index)\n else:\n maps_to = InputPointer(solid_name, input_name)\n return InputMapping(self, maps_to)\n\n\nclass InputPointer(namedtuple("_InputPointer", "solid_name input_name")):\n def __new__(cls, solid_name, input_name):\n return super(InputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.str_param(input_name, "input_name"),\n )\n\n\nclass FanInInputPointer(namedtuple("_FanInInputPointer", "solid_name input_name fan_in_index")):\n def __new__(cls, solid_name, input_name, fan_in_index):\n return super(FanInInputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.str_param(input_name, "input_name"),\n check.int_param(fan_in_index, "fan_in_index"),\n )\n\n\n[docs]class InputMapping(namedtuple("_InputMapping", "definition maps_to")):\n """Defines an input mapping for a composite solid.\n\n Args:\n definition (InputDefinition): Defines the input to the composite solid.\n solid_name (str): The name of the child solid onto which to map the input.\n input_name (str): The name of the input to the child solid onto which to map the input.\n """\n\n def __new__(cls, definition, maps_to):\n return super(InputMapping, cls).__new__(\n cls,\n check.inst_param(definition, "definition", InputDefinition),\n check.inst_param(maps_to, "maps_to", (InputPointer, FanInInputPointer)),\n )\n\n @property\n def maps_to_fan_in(self):\n return isinstance(self.maps_to, FanInInputPointer)\n
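``mapping_to`` is easiest to see on a ``CompositeSolidDefinition``, where it connects a composite-level :py:class:`InputDefinition` to a child solid's input. A minimal sketch under assumed names (``add_one``, ``wraps_add_one``, and ``composite_num`` are illustrative):

.. code-block:: python

    from dagster import CompositeSolidDefinition, InputDefinition, Int, lambda_solid


    @lambda_solid(input_defs=[InputDefinition("num", Int, default_value=0)])
    def add_one(num):
        return num + 1


    # Map the composite-level input "composite_num" onto add_one's "num" input.
    wraps_add_one = CompositeSolidDefinition(
        name="wraps_add_one",
        solid_defs=[add_one],
        input_mappings=[InputDefinition("composite_num", Int).mapping_to("add_one", "num")],
    )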
\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.core.definitions.configurable import ConfigurableDefinition\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\nfrom .utils import check_valid_name\n\n\n[docs]class IntermediateStorageDefinition(ConfigurableDefinition):\n """Defines intermediate data storage behaviors.\n\n Args:\n name (str): Name of the storage mode.\n is_persistent (bool): Whether the storage is persistent in a way that can cross process/node\n boundaries. Re-execution with, for example, the multiprocess executor, or with\n dagster-airflow, requires a persistent storage mode.\n required_resource_keys(Optional[Set[str]]): The resources that this storage needs at runtime to function.\n config_schema (Optional[ConfigSchema]): The schema for the storage's configuration schema.\n Configuration data passed in this schema will be made available to the\n ``intermediate_storage_creation_fn`` under ``init_context.intermediate_storage_config``.\n intermediate_storage_creation_fn: (Callable[[InitIntermediateStorageContext], IntermediateStorage])\n Called to construct the storage. This function should consume the init context and emit\n a :py:class:`IntermediateStorage`.\n """\n\n def __init__(\n self,\n name,\n is_persistent,\n required_resource_keys,\n config_schema=None,\n intermediate_storage_creation_fn=None,\n description=None,\n ):\n self._name = check_valid_name(name)\n self._is_persistent = check.bool_param(is_persistent, "is_persistent")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._intermediate_storage_creation_fn = check.opt_callable_param(\n intermediate_storage_creation_fn, "intermediate_storage_creation_fn"\n )\n self._required_resource_keys = frozenset(\n check.set_param(\n required_resource_keys if required_resource_keys else set(),\n "required_resource_keys",\n of_type=str,\n )\n )\n self._description = check.opt_str_param(description, "description")\n\n @property\n def name(self):\n return self._name\n\n @property\n def description(self):\n return self._description\n\n @property\n def is_persistent(self):\n return self._is_persistent\n\n @property\n def config_schema(self):\n return self._config_schema\n\n @property\n def intermediate_storage_creation_fn(self):\n return self._intermediate_storage_creation_fn\n\n @property\n def required_resource_keys(self):\n return self._required_resource_keys\n\n def copy_for_configured(self, name, description, config_schema, _):\n return IntermediateStorageDefinition(\n name=name or self.name,\n is_persistent=self.is_persistent,\n required_resource_keys=self.required_resource_keys,\n config_schema=config_schema,\n intermediate_storage_creation_fn=self.intermediate_storage_creation_fn,\n description=description or self.description,\n )\n\n\n[docs]def intermediate_storage(\n required_resource_keys=None, name=None, is_persistent=True, config_schema=None\n):\n """Creates an intermediate storage definition\n\n The decorated function will be passed as the ``intermediate_storage_creation_fn`` to a\n :py:class:`IntermediateStorageDefinition`.\n\n Args:\n name (str): The name of the intermediate storage.\n is_persistent (bool): Whether the storage is persistent in a way that can cross process/node\n boundaries. 
Re-execution with, for example, the multiprocess executor, or with\n dagster-airflow, requires a persistent storage mode.\n required_resource_keys (Optional[Set[str]]):\n The resources that this storage needs at runtime to function.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.intermediate_storage_config`.\n """\n\n if callable(name):\n check.invariant(is_persistent is True)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n return _IntermediateStorageDecoratorCallable()(name)\n\n return _IntermediateStorageDecoratorCallable(\n name=name,\n is_persistent=is_persistent,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n )\n\n\nclass _IntermediateStorageDecoratorCallable:\n def __init__(\n self, name=None, is_persistent=True, config_schema=None, required_resource_keys=None\n ):\n self.name = check.opt_str_param(name, "name")\n self.is_persistent = check.bool_param(is_persistent, "is_persistent")\n self.config_schema = config_schema # will be checked in definition\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n storage_def = IntermediateStorageDefinition(\n name=self.name,\n is_persistent=self.is_persistent,\n config_schema=self.config_schema,\n intermediate_storage_creation_fn=fn,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(storage_def, wrapped=fn)\n\n return storage_def\n
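Intermediate storage definitions are consumed through ``ModeDefinition.intermediate_storage_defs`` (see the mode module below) and selected at run time. A sketch of that wiring, using the built-in definitions this codebase imports from ``dagster.core.storage.system_storage``; the pipeline and solid names and the ``intermediate_storage: filesystem:`` run-config fragment are assumptions for illustration.

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline, solid
    from dagster.core.storage.system_storage import default_intermediate_storage_defs


    @solid
    def emit_value(_):
        return 1


    @pipeline(
        mode_defs=[ModeDefinition(intermediate_storage_defs=default_intermediate_storage_defs)]
    )
    def storage_example_pipeline():
        emit_value()


    # Persist intermediates to the filesystem instead of keeping them in memory
    # (assumed run-config section; the defaults here are "in_memory" and "filesystem").
    execute_pipeline(
        storage_example_pipeline,
        run_config={"intermediate_storage": {"filesystem": {}}},
    )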
\nfrom collections import namedtuple\nfrom enum import Enum\n\nfrom dagster import check\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom .mode import DEFAULT_MODE_NAME\nfrom .utils import check_valid_name\n\n\n@whitelist_for_serdes\nclass JobType(Enum):\n SCHEDULE = "SCHEDULE"\n SENSOR = "SENSOR"\n\n\nclass JobContext:\n """Context for generating the execution parameters for an JobDefinition at runtime.\n\n An instance of this class is made available as the first argument to the JobDefinition\n functions: run_config_fn, tags_fn\n\n Attributes:\n instance (DagsterInstance): The instance configured to launch the job\n """\n\n __slots__ = ["_instance"]\n\n def __init__(self, instance):\n from dagster.core.instance import DagsterInstance\n\n self._instance = check.inst_param(instance, "instance", DagsterInstance)\n\n @property\n def instance(self):\n return self._instance\n\n\n[docs]@whitelist_for_serdes\nclass SkipReason(namedtuple("_SkipReason", "skip_message")):\n """\n Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in dagit for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message=None):\n return super(SkipReason, cls).__new__(\n cls, skip_message=check.opt_str_param(skip_message, "skip_message")\n )\n\n\n[docs]@whitelist_for_serdes\nclass RunRequest(namedtuple("_RunRequest", "run_key run_config tags")):\n """\n Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (str | None): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Dict]): The environment config that parameterizes the run execution to\n be launched, as a dict.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n """\n\n def __new__(cls, run_key, run_config=None, tags=None):\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_dict_param(run_config, "run_config"),\n tags=check.opt_dict_param(tags, "tags"),\n )\n\n\nclass JobDefinition:\n """Defines a job, which describes a series of runs for a particular pipeline. These runs are\n grouped by job_name, using tags.\n\n Args:\n name (str): The name of this job.\n pipeline_name (str): The name of the pipeline to execute.\n mode (Optional[str]): The mode to apply when executing this pipeline. (default: 'default')\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute. e.g. 
``['*some_solid+', 'other_solid']``\n """\n\n __slots__ = [\n "_name",\n "_job_type",\n "_pipeline_name",\n "_tags_fn",\n "_run_config_fn",\n "_mode",\n "_solid_selection",\n ]\n\n def __init__(\n self, name, job_type, pipeline_name, mode="default", solid_selection=None,\n ):\n self._name = check_valid_name(name)\n self._job_type = check.inst_param(job_type, "job_type", JobType)\n self._pipeline_name = check.str_param(pipeline_name, "pipeline_name")\n self._mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n self._solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n\n @property\n def name(self):\n return self._name\n\n @property\n def pipeline_name(self):\n return self._pipeline_name\n\n @property\n def job_type(self):\n return self._job_type\n\n @property\n def solid_selection(self):\n return self._solid_selection\n\n @property\n def mode(self):\n return self._mode\n
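An evaluation function that produces the :py:class:`RunRequest` and :py:class:`SkipReason` values described above typically looks like the sketch below. The file-polling logic, run-config shape, and solid name are purely hypothetical, and wiring the function into a ``SensorDefinition`` or ``ScheduleDefinition`` is out of scope here.

.. code-block:: python

    import os

    from dagster.core.definitions.job import RunRequest, SkipReason


    def evaluate_new_files(_context, directory="/tmp/incoming"):  # hypothetical source of work
        paths = sorted(os.listdir(directory)) if os.path.isdir(directory) else []
        if not paths:
            return SkipReason("No new files found in {}.".format(directory))

        return [
            RunRequest(
                run_key=path,  # at most one run per unique key across evaluations
                run_config={"solids": {"process_file": {"config": {"path": path}}}},
                tags={"source": "file_poller"},
            )
            for path in paths
        ]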
\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.configurable import ConfigurableDefinition\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\n\n[docs]class LoggerDefinition(ConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are pipeline-scoped logging handlers, which will be automatically invoked whenever\n solids in a pipeline log messages.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within solid compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self, logger_fn, config_schema=None, description=None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n @property\n def logger_fn(self):\n return self._logger_fn\n\n @property\n def config_schema(self):\n return self._config_schema\n\n @property\n def description(self):\n return self._description\n\n def copy_for_configured(self, name, description, config_schema, _):\n check.invariant(name is None, "LoggerDefinitions do not have names")\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )\n\n\n[docs]def logger(config_schema=None, description=None):\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=config_schema)\n\n def _wrap(logger_fn):\n return LoggerDefinition(\n logger_fn=logger_fn, config_schema=config_schema, description=description,\n )\n\n return _wrap\n
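A custom logger built with the ``@logger`` decorator above returns a standard :py:class:`python:logging.Logger`; config declared in ``config_schema`` arrives on ``init_context.logger_config``. A minimal sketch (the ``console`` key, formatter string, and ``log_level`` field are illustrative):

.. code-block:: python

    import logging

    from dagster import Field, ModeDefinition, logger


    @logger(config_schema={"log_level": Field(str, is_required=False, default_value="INFO")})
    def console_logger(init_context):
        level = init_context.logger_config["log_level"]

        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s - %(message)s"))

        custom_logger = logging.Logger("console", level)
        custom_logger.addHandler(handler)
        return custom_logger


    # Attach under a key so solids log through it whenever this mode is selected.
    mode = ModeDefinition(logger_defs={"console": console_logger})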
\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.core.definitions.executor import ExecutorDefinition, default_executors\nfrom dagster.loggers import default_loggers\nfrom dagster.utils.merger import merge_dicts\n\nfrom .logger import LoggerDefinition\nfrom .resource import ResourceDefinition\nfrom .utils import check_valid_name\n\nDEFAULT_MODE_NAME = "default"\n\n\n[docs]class ModeDefinition(\n namedtuple(\n "_ModeDefinition",\n "name resource_defs loggers executor_defs description intermediate_storage_defs",\n )\n):\n """Define a mode in which a pipeline can operate.\n\n A mode provides pipelines with a set of resource implementations, loggers, system storages,\n and executors.\n\n Args:\n name (Optional[str]): The name of the mode. Must be unique within the\n :py:class:`PipelineDefinition` to which the mode is attached. (default: "default").\n resource_defs (Optional[Dict[str, ResourceDefinition]]): A dictionary of string resource\n keys to their implementations. Individual solids may require resources to be present by\n these keys.\n logger_defs (Optional[Dict[str, LoggerDefinition]]): A dictionary of string logger\n identifiers to their implementations.\n executor_defs (Optional[List[ExecutorDefinition]]): The set of executors available when\n executing in this mode. By default, this will be the 'in_process' and 'multiprocess'\n executors (:py:data:`~dagster.default_executors`).\n description (Optional[str]): A human-readable description of the mode.\n intermediate_storage_defs (Optional[List[IntermediateStorageDefinition]]): The set of intermediate storage\n options available when executing in this mode. By default, this will be the 'in_memory'\n and 'filesystem' system storages.\n """\n\n def __new__(\n cls,\n name=None,\n resource_defs=None,\n logger_defs=None,\n executor_defs=None,\n description=None,\n intermediate_storage_defs=None,\n ):\n from dagster.core.storage.system_storage import default_intermediate_storage_defs\n\n from .intermediate_storage import IntermediateStorageDefinition\n\n check.opt_dict_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n if resource_defs and "io_manager" in resource_defs:\n resource_defs_with_defaults = resource_defs\n else:\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n resource_defs_with_defaults = merge_dicts(\n {"io_manager": mem_io_manager}, resource_defs or {}\n )\n\n return super(ModeDefinition, cls).__new__(\n cls,\n name=check_valid_name(name) if name else DEFAULT_MODE_NAME,\n resource_defs=resource_defs_with_defaults,\n loggers=(\n check.opt_dict_param(\n logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition\n )\n or default_loggers()\n ),\n intermediate_storage_defs=check.list_param(\n intermediate_storage_defs\n if intermediate_storage_defs\n else default_intermediate_storage_defs,\n "intermediate_storage_defs",\n of_type=IntermediateStorageDefinition,\n ),\n executor_defs=check.list_param(\n executor_defs if executor_defs else default_executors,\n "executor_defs",\n of_type=ExecutorDefinition,\n ),\n description=check.opt_str_param(description, "description"),\n )\n\n @property\n def resource_key_set(self):\n return frozenset(self.resource_defs.keys())\n\n def get_intermediate_storage_def(self, name):\n check.str_param(name, "name")\n for intermediate_storage_def in self.intermediate_storage_defs:\n if intermediate_storage_def.name == name:\n return intermediate_storage_def\n\n check.failed("{} storage definition not 
found".format(name))\n\n @staticmethod\n def from_resources(resources, name=None):\n check.dict_param(resources, "resources", key_type=str)\n\n return ModeDefinition(\n name=name,\n resource_defs={\n resource_name: ResourceDefinition.hardcoded_resource(resource)\n for resource_name, resource in resources.items()\n },\n )\n
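``ModeDefinition.from_resources``, defined just above, hard-codes plain Python values as resources, which is convenient for tests. A small sketch under assumed names (``api_token``, ``fetch_greeting``, and the ``test`` mode are illustrative):

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline, solid


    @solid(required_resource_keys={"api_token"})
    def fetch_greeting(context):
        return "hello, {}".format(context.resources.api_token)


    # Each value is wrapped via ResourceDefinition.hardcoded_resource under the hood.
    test_mode = ModeDefinition.from_resources({"api_token": "fake-token"}, name="test")


    @pipeline(mode_defs=[test_mode])
    def greeting_pipeline():
        fetch_greeting()


    result = execute_pipeline(greeting_pipeline, mode="test")
    assert result.success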
\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.core.types.dagster_type import resolve_dagster_type\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\n\n[docs]class OutputDefinition:\n """Defines an output from a solid's compute function.\n\n Solids can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many solids have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Output definitions may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Any]): The type of this output. Users should provide one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type`, or\n :py:func:`PythonObjectDagsterType`, or a Python type. Defaults to :py:class:`Any`.\n name (Optional[str]): Name of the output. (default: "result")\n description (Optional[str]): Human-readable description of the output.\n is_required (Optional[bool]): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): (Experimental) A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n """\n\n def __init__(\n self,\n dagster_type=None,\n name=None,\n description=None,\n is_required=None,\n io_manager_key=None,\n metadata=None,\n ):\n self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n self._dagster_type = resolve_dagster_type(dagster_type)\n self._description = check.opt_str_param(description, "description")\n self._is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self._manager_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="io_manager"\n )\n if metadata:\n experimental_arg_warning("metadata", "OutputDefinition")\n self._metadata = metadata\n\n @property\n def name(self):\n return self._name\n\n @property\n def dagster_type(self):\n return self._dagster_type\n\n @property\n def description(self):\n return self._description\n\n @property\n def optional(self):\n return not self._is_required\n\n @property\n def is_required(self):\n return self._is_required\n\n @property\n def io_manager_key(self):\n return self._manager_key\n\n @property\n def metadata(self):\n return self._metadata\n\n @property\n def is_dynamic(self):\n return False\n\n[docs] def mapping_from(self, solid_name, output_name=None):\n """Create an output mapping from an output of a child solid.\n\n In a CompositeSolidDefinition, you can use this helper function to construct\n an :py:class:`OutputMapping` from the output of a child solid.\n\n Args:\n solid_name (str): The name of the child solid from which to map this output.\n input_name (str): The name of the child solid's output from which to map this output.\n\n Examples:\n\n .. 
code-block:: python\n\n output_mapping = OutputDefinition(Int).mapping_from('child_solid')\n """\n return OutputMapping(self, OutputPointer(solid_name, output_name))\n\n\n[docs]class DynamicOutputDefinition(OutputDefinition):\n """\n (EXPERIMENTAL) Variant of :py:class:`OutputDefinition` for an output that will dynamically\n alter the graph at runtime. Each copy of :py:class:`DynamicOutput` corresponding to this\n definition that is yielded from the solid will create a copy of the downstream graph.\n """\n\n @property\n def is_dynamic(self):\n return True\n\n\nclass OutputPointer(namedtuple("_OutputPointer", "solid_name output_name")):\n def __new__(cls, solid_name, output_name=None):\n return super(OutputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n )\n\n\n[docs]class OutputMapping(namedtuple("_OutputMapping", "definition maps_from")):\n """Defines an output mapping for a composite solid.\n\n Args:\n definition (OutputDefinition): Defines the output of the composite solid.\n solid_name (str): The name of the child solid from which to map the output.\n output_name (str): The name of the child solid's output from which to map the output.\n """\n\n def __new__(cls, definition, maps_from):\n return super(OutputMapping, cls).__new__(\n cls,\n check.inst_param(definition, "definition", OutputDefinition),\n check.inst_param(maps_from, "maps_from", OutputPointer),\n )\n
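With multiple :py:class:`OutputDefinition`\ s a solid can no longer return a bare value; each result is yielded as a named ``Output``, and ``is_required=False`` allows one of them to be skipped. A minimal sketch (the solid and output names are illustrative):

.. code-block:: python

    from dagster import InputDefinition, Int, Output, OutputDefinition, solid


    @solid(
        input_defs=[InputDefinition("num", Int)],
        output_defs=[
            OutputDefinition(Int, name="plus_one"),
            OutputDefinition(Int, name="minus_one", is_required=False),
        ],
    )
    def branch(_, num):
        yield Output(num + 1, output_name="plus_one")
        if num > 0:
            # Optional output: only emitted for positive inputs.
            yield Output(num - 1, output_name="minus_one")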
\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.core.definitions.schedule import ScheduleDefinition, ScheduleExecutionContext\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus, PipelineRunsFilter\nfrom dagster.core.storage.tags import check_tags\nfrom dagster.utils import merge_dicts\n\nfrom .mode import DEFAULT_MODE_NAME\nfrom .utils import check_valid_name\n\n\ndef by_name(partition):\n return partition.name\n\n\n[docs]class Partition(namedtuple("_Partition", ("value name"))):\n """\n Partition is the representation of a logical slice across an axis of a pipeline's work\n\n Args:\n value (Any): The object for this partition\n name (str): Name for this partition\n """\n\n def __new__(cls, value=None, name=None):\n return super(Partition, cls).__new__(\n cls, name=check.opt_str_param(name, "name", str(value)), value=value\n )\n\n\ndef last_empty_partition(context, partition_set_def):\n check.inst_param(context, "context", ScheduleExecutionContext)\n partition_set_def = check.inst_param(\n partition_set_def, "partition_set_def", PartitionSetDefinition\n )\n partitions = partition_set_def.get_partitions()\n if not partitions:\n return None\n selected = None\n for partition in reversed(partitions):\n filters = PipelineRunsFilter.for_partition(partition_set_def, partition)\n matching = context.instance.get_runs(filters)\n if not any(run.status == PipelineRunStatus.SUCCESS for run in matching):\n selected = partition\n break\n return selected\n\n\ndef first_partition(context, partition_set_def=None):\n check.inst_param(context, "context", ScheduleExecutionContext)\n partition_set_def = check.inst_param(\n partition_set_def, "partition_set_def", PartitionSetDefinition\n )\n\n partitions = partition_set_def.get_partitions()\n if not partitions:\n return None\n\n return partitions[0]\n\n\n[docs]class PartitionSetDefinition(\n namedtuple(\n "_PartitionSetDefinition",\n (\n "name pipeline_name partition_fn solid_selection mode "\n "user_defined_run_config_fn_for_partition user_defined_tags_fn_for_partition"\n ),\n )\n):\n """\n Defines a partition set, representing the set of slices making up an axis of a pipeline\n\n Args:\n name (str): Name for this partition set\n pipeline_name (str): The name of the pipeline definition\n partition_fn (Callable[void, List[Partition]]): User-provided function to define the set of\n valid partition objects.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with this partition. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this partition. 
(default: 'default')\n run_config_fn_for_partition (Callable[[Partition], [Dict]]): A\n function that takes a :py:class:`~dagster.Partition` and returns the run\n configuration that parameterizes the execution for this partition, as a dict\n tags_fn_for_partition (Callable[[Partition], Optional[dict[str, str]]]): A function that\n takes a :py:class:`~dagster.Partition` and returns a list of key value pairs that will\n be added to the generated run for this partition.\n """\n\n def __new__(\n cls,\n name,\n pipeline_name,\n partition_fn,\n solid_selection=None,\n mode=None,\n run_config_fn_for_partition=lambda _partition: {},\n tags_fn_for_partition=lambda _partition: {},\n ):\n def _wrap(x):\n if isinstance(x, Partition):\n return x\n if isinstance(x, str):\n return Partition(x)\n raise DagsterInvalidDefinitionError(\n "Expected <Partition> | <str>, received {type}".format(type=type(x))\n )\n\n return super(PartitionSetDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n partition_fn=lambda: [\n _wrap(x) for x in check.callable_param(partition_fn, "partition_fn")()\n ],\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n mode=check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME),\n user_defined_run_config_fn_for_partition=check.callable_param(\n run_config_fn_for_partition, "run_config_fn_for_partition"\n ),\n user_defined_tags_fn_for_partition=check.callable_param(\n tags_fn_for_partition, "tags_fn_for_partition"\n ),\n )\n\n def run_config_for_partition(self, partition):\n return self.user_defined_run_config_fn_for_partition(partition)\n\n def tags_for_partition(self, partition):\n user_tags = self.user_defined_tags_fn_for_partition(partition)\n check_tags(user_tags, "user_tags")\n\n tags = merge_dicts(user_tags, PipelineRun.tags_for_partition_set(self, partition))\n\n return tags\n\n def get_partitions(self):\n return self.partition_fn()\n\n def get_partition(self, name):\n for partition in self.get_partitions():\n if partition.name == name:\n return partition\n\n check.failed("Partition name {} not found!".format(name))\n\n def get_partition_names(self):\n return [part.name for part in self.get_partitions()]\n\n def create_schedule_definition(\n self,\n schedule_name,\n cron_schedule,\n partition_selector,\n should_execute=None,\n environment_vars=None,\n execution_timezone=None,\n ):\n """Create a ScheduleDefinition from a PartitionSetDefinition.\n\n Arguments:\n schedule_name (str): The name of the schedule.\n cron_schedule (str): A valid cron string for the schedule\n partition_selector (Callable[ScheduleExecutionContext, PartitionSetDefinition],\n Partition): Function that determines the partition to use at a given execution time.\n For time-based partition sets, will likely be either `identity_partition_selector` or a\n selector returned by `create_offset_partition_selector`.\n should_execute (Optional[function]): Function that runs at schedule execution time that\n determines whether a schedule should execute. Defaults to a function that always returns\n ``True``.\n environment_vars (Optional[dict]): The environment variables to set for the schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run. 
Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n\n Returns:\n ScheduleDefinition: The generated ScheduleDefinition for the partition selector\n """\n\n check.str_param(schedule_name, "schedule_name")\n check.str_param(cron_schedule, "cron_schedule")\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.callable_param(partition_selector, "partition_selector")\n check.opt_str_param(execution_timezone, "execution_timezone")\n\n def _should_execute_wrapper(context):\n check.inst_param(context, "context", ScheduleExecutionContext)\n selected_partition = partition_selector(context, self)\n\n if not selected_partition or not selected_partition.name in self.get_partition_names():\n return False\n elif not should_execute:\n return True\n else:\n return should_execute(context)\n\n def _run_config_fn_wrapper(context):\n check.inst_param(context, "context", ScheduleExecutionContext)\n selected_partition = partition_selector(context, self)\n if not selected_partition or not selected_partition.name in self.get_partition_names():\n raise DagsterInvariantViolationError(\n "The partition selection function `{selector}` did not return "\n "a partition from PartitionSet {partition_set}".format(\n selector=getattr(partition_selector, "__name__", repr(partition_selector)),\n partition_set=self.name,\n )\n )\n\n return self.run_config_for_partition(selected_partition)\n\n def _tags_fn_wrapper(context):\n check.inst_param(context, "context", ScheduleExecutionContext)\n selected_partition = partition_selector(context, self)\n if not selected_partition:\n raise DagsterInvariantViolationError(\n "The partition selection function `{selector}` did not return "\n "a partition from PartitionSet {partition_set}".format(\n selector=getattr(partition_selector, "__name__", repr(partition_selector)),\n partition_set=self.name,\n )\n )\n\n return self.tags_for_partition(selected_partition)\n\n return PartitionScheduleDefinition(\n name=schedule_name,\n cron_schedule=cron_schedule,\n pipeline_name=self.pipeline_name,\n run_config_fn=_run_config_fn_wrapper,\n tags_fn=_tags_fn_wrapper,\n solid_selection=self.solid_selection,\n mode=self.mode,\n should_execute=_should_execute_wrapper,\n environment_vars=environment_vars,\n partition_set=self,\n execution_timezone=execution_timezone,\n )\n\n\nclass PartitionScheduleDefinition(ScheduleDefinition):\n __slots__ = ["_partition_set"]\n\n def __init__(\n self,\n name,\n cron_schedule,\n pipeline_name,\n tags_fn,\n solid_selection,\n mode,\n should_execute,\n environment_vars,\n partition_set,\n run_config_fn=None,\n execution_timezone=None,\n ):\n super(PartitionScheduleDefinition, self).__init__(\n name=check_valid_name(name),\n cron_schedule=cron_schedule,\n pipeline_name=pipeline_name,\n run_config_fn=run_config_fn,\n tags_fn=tags_fn,\n solid_selection=solid_selection,\n mode=mode,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n )\n self._partition_set = check.inst_param(\n partition_set, "partition_set", PartitionSetDefinition\n )\n\n def get_partition_set(self):\n return self._partition_set\n
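Putting the pieces of this module together, a partition set pairs a ``partition_fn`` with per-partition run config and can then be driven by a schedule through ``create_schedule_definition``. A sketch using the ``last_empty_partition`` selector defined in this module; the partition values, pipeline name, solid name, and cron string are assumptions for illustration.

.. code-block:: python

    from dagster.core.definitions.partition import (
        Partition,
        PartitionSetDefinition,
        last_empty_partition,
    )


    def date_partitions():
        # Hypothetical static partitions; usually generated from a date range.
        return [Partition("2020-10-01"), Partition("2020-10-02"), Partition("2020-10-03")]


    def run_config_for_date(partition):
        return {"solids": {"ingest_for_date": {"config": {"date": partition.value}}}}


    date_partition_set = PartitionSetDefinition(
        name="date_partitions",
        pipeline_name="ingest_pipeline",  # assumed pipeline name
        partition_fn=date_partitions,
        run_config_fn_for_partition=run_config_for_date,
    )

    # Run daily against the latest partition that has no successful run yet.
    daily_ingest_schedule = date_partition_set.create_schedule_definition(
        schedule_name="daily_ingest",
        cron_schedule="0 1 * * *",
        partition_selector=last_empty_partition,
    )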
\nimport uuid\nimport warnings\n\nfrom dagster import check\nfrom dagster.core.definitions.solid import NodeDefinition\nfrom dagster.core.errors import (\n DagsterInvalidDefinitionError,\n DagsterInvalidSubsetError,\n DagsterInvariantViolationError,\n)\nfrom dagster.core.storage.output_manager import IOutputManagerDefinition\nfrom dagster.core.storage.root_input_manager import IInputManagerDefinition\nfrom dagster.core.types.dagster_type import DagsterTypeKind, construct_dagster_type_dictionary\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .config import ConfigMapping\nfrom .dependency import (\n DependencyDefinition,\n MultiDependencyDefinition,\n SolidHandle,\n SolidInvocation,\n)\nfrom .graph import GraphDefinition\nfrom .hook import HookDefinition\nfrom .mode import ModeDefinition\nfrom .preset import PresetDefinition\nfrom .solid import NodeDefinition\nfrom .utils import validate_tags\n\n\ndef _anonymous_pipeline_name():\n return "__pipeline__" + str(uuid.uuid4()).replace("-", "")\n\n\n[docs]class PipelineDefinition(GraphDefinition):\n """Defines a Dagster pipeline.\n\n A pipeline is made up of\n\n - Solids, each of which is a single functional unit of data computation.\n - Dependencies, which determine how the values produced by solids as their outputs flow from\n one solid to another. This tells Dagster how to arrange solids, and potentially multiple\n aliased instances of solids, into a directed, acyclic graph (DAG) of compute.\n - Modes, which can be used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline, and to switch between them.\n - Presets, which can be used to ship common combinations of pipeline config options in Python\n code, and to switch between them.\n\n Args:\n solid_defs (List[SolidDefinition]): The set of solids used in this pipeline.\n name (Optional[str]): The name of the pipeline. Must be unique within any\n :py:class:`RepositoryDefinition` containing the pipeline.\n description (Optional[str]): A human-readable description of the pipeline.\n dependencies (Optional[Dict[Union[str, SolidInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each solid's inputs on the outputs of\n other solids in the pipeline. Keys of the top level dict are either the string names of\n solids in the pipeline or, in the case of aliased solids,\n :py:class:`SolidInvocations <SolidInvocation>`. Values of the top level dict are\n themselves dicts, which map input names belonging to the solid or aliased solid to\n :py:class:`DependencyDefinitions <DependencyDefinition>`.\n mode_defs (Optional[List[ModeDefinition]]): The set of modes in which this pipeline can\n operate. Modes are used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline. Modes can be used, e.g., to vary available\n resource and logging implementations between local test and production runs.\n preset_defs (Optional[List[PresetDefinition]]): A set of preset collections of configuration\n options that may be used to execute a pipeline. A preset consists of an environment\n dict, an optional subset of solids to execute, and a mode selection. 
Presets can be used\n to ship common combinations of options to pipeline end users in Python code, and can\n be selected by tools like Dagit.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the pipeline.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n hook_defs (Optional[Set[HookDefinition]]): A set of hook definitions applied to the\n pipeline. When a hook is applied to a pipeline, it will be attached to all solid\n instances within the pipeline.\n\n _parent_pipeline_def (INTERNAL ONLY): Used for tracking pipelines created using solid subsets.\n\n Examples:\n\n .. code-block:: python\n\n @lambda_solid\n def return_one():\n return 1\n\n\n @solid(input_defs=[InputDefinition('num')], required_resource_keys={'op'})\n def apply_op(context, num):\n return context.resources.op(num)\n\n @resource(config_schema=Int)\n def adder_resource(init_context):\n return lambda x: x + init_context.resource_config\n\n\n add_mode = ModeDefinition(\n name='add_mode',\n resource_defs={'op': adder_resource},\n description='Mode that adds things',\n )\n\n\n add_three_preset = PresetDefinition(\n name='add_three_preset',\n run_config={'resources': {'op': {'config': 3}}},\n mode='add_mode',\n )\n\n\n pipeline_def = PipelineDefinition(\n name='basic',\n solid_defs=[return_one, apply_op],\n dependencies={'apply_op': {'num': DependencyDefinition('return_one')}},\n mode_defs=[add_mode],\n preset_defs=[add_three_preset],\n )\n """\n\n def __init__(\n self,\n solid_defs,\n name=None,\n description=None,\n dependencies=None,\n mode_defs=None,\n preset_defs=None,\n tags=None,\n hook_defs=None,\n input_mappings=None,\n output_mappings=None,\n config_mapping=None,\n positional_inputs=None,\n _parent_pipeline_def=None, # https://github.com/dagster-io/dagster/issues/2115\n ):\n if not name:\n warnings.warn(\n "Pipeline must have a name. Names will be required starting in 0.10.0 or later."\n )\n name = _anonymous_pipeline_name()\n\n # For these warnings they check truthiness because they get changed to [] higher\n # in the stack for the decorator case\n\n if input_mappings:\n experimental_arg_warning("input_mappings", "PipelineDefinition")\n\n if output_mappings:\n experimental_arg_warning("output_mappings", "PipelineDefinition")\n\n if config_mapping is not None:\n experimental_arg_warning("config_mapping", "PipelineDefinition")\n\n if positional_inputs:\n experimental_arg_warning("positional_inputs", "PipelineDefinition")\n\n super(PipelineDefinition, self).__init__(\n name=name,\n description=description,\n dependencies=dependencies,\n node_defs=solid_defs,\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config_mapping=config_mapping,\n )\n\n self._current_level_node_defs = solid_defs\n self._tags = validate_tags(tags)\n\n mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)\n\n if not mode_definitions:\n mode_definitions = [ModeDefinition()]\n\n self._mode_definitions = mode_definitions\n\n seen_modes = set()\n for mode_def in mode_definitions:\n if mode_def.name in seen_modes:\n raise DagsterInvalidDefinitionError(\n (\n 'Two modes seen with the name "{mode_name}" in "{pipeline_name}". 
'\n "Modes must have unique names."\n ).format(mode_name=mode_def.name, pipeline_name=self._name)\n )\n seen_modes.add(mode_def.name)\n\n self._dagster_type_dict = construct_dagster_type_dictionary(self._current_level_node_defs)\n\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)\n self._preset_dict = {}\n for preset in self._preset_defs:\n if preset.name in self._preset_dict:\n raise DagsterInvalidDefinitionError(\n (\n 'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '\n "PresetDefinitions must have unique names."\n ).format(name=preset.name, pipeline_name=self._name)\n )\n if preset.mode not in seen_modes:\n raise DagsterInvalidDefinitionError(\n (\n 'PresetDefinition "{name}" in "{pipeline_name}" '\n 'references mode "{mode}" which is not defined.'\n ).format(name=preset.name, pipeline_name=self._name, mode=preset.mode)\n )\n self._preset_dict[preset.name] = preset\n\n # Validate solid resource dependencies\n _validate_resource_dependencies(\n self._mode_definitions,\n self._current_level_node_defs,\n self._dagster_type_dict,\n self._solid_dict,\n self._hook_defs,\n )\n\n # Validate unsatisfied inputs can be materialized from config\n _validate_inputs(self._dependency_structure, self._solid_dict, self._mode_definitions)\n\n # Recursively explore all nodes in the this pipeline\n self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)\n self._parent_pipeline_def = check.opt_inst_param(\n _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition\n )\n self._cached_run_config_schemas = {}\n self._cached_external_pipeline = None\n\n def copy_for_configured(self, name, description, config_schema, config_or_config_fn):\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only pipelines utilizing config mapping can be pre-configured. 
The pipeline "\n '"{graph_name}" does not have a config mapping, and thus has nothing to be '\n "configured.".format(graph_name=self.name)\n )\n\n return PipelineDefinition(\n solid_defs=self._solid_defs,\n name=self._name_for_configured_node(self.name, name, config_or_config_fn),\n description=description or self.description,\n dependencies=self._dependencies,\n mode_defs=self._mode_definitions,\n preset_defs=self.preset_defs,\n hook_defs=self.hook_defs,\n input_mappings=self._input_mappings,\n output_mappings=self._output_mappings,\n config_mapping=ConfigMapping(\n self._config_mapping.config_fn, config_schema=config_schema\n ),\n positional_inputs=self.positional_inputs,\n _parent_pipeline_def=self._parent_pipeline_def,\n )\n\n def get_run_config_schema(self, mode=None):\n check.str_param(mode, "mode")\n\n mode_def = self.get_mode_definition(mode)\n\n if mode_def.name in self._cached_run_config_schemas:\n return self._cached_run_config_schemas[mode_def.name]\n\n self._cached_run_config_schemas[mode_def.name] = _create_run_config_schema(self, mode_def)\n return self._cached_run_config_schemas[mode_def.name]\n\n @property\n def mode_definitions(self):\n return self._mode_definitions\n\n @property\n def preset_defs(self):\n return self._preset_defs\n\n def _get_mode_definition(self, mode):\n check.str_param(mode, "mode")\n for mode_definition in self._mode_definitions:\n if mode_definition.name == mode:\n return mode_definition\n\n return None\n\n def get_default_mode(self):\n return self._mode_definitions[0]\n\n @property\n def is_single_mode(self):\n return len(self._mode_definitions) == 1\n\n @property\n def is_multi_mode(self):\n return len(self._mode_definitions) > 1\n\n def has_mode_definition(self, mode):\n check.str_param(mode, "mode")\n return bool(self._get_mode_definition(mode))\n\n def get_default_mode_name(self):\n return self._mode_definitions[0].name\n\n def get_mode_definition(self, mode=None):\n check.opt_str_param(mode, "mode")\n if mode is None:\n check.invariant(self.is_single_mode)\n return self.get_default_mode()\n\n mode_def = self._get_mode_definition(mode)\n\n check.invariant(\n mode_def is not None,\n "Could not find mode {mode} in pipeline {name}".format(mode=mode, name=self._name),\n )\n\n return mode_def\n\n @property\n def available_modes(self):\n return [mode_def.name for mode_def in self._mode_definitions]\n\n @property\n def display_name(self):\n """str: Display name of pipeline.\n\n Name suitable for exception messages, logging etc. 
If pipeline\n is unnamed the method will return "<<unnamed>>".\n """\n return self._name if self._name else "<<unnamed>>"\n\n @property\n def tags(self):\n return self._tags\n\n def has_dagster_type(self, name):\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name):\n check.str_param(name, "name")\n return self._dagster_type_dict[name]\n\n def all_dagster_types(self):\n return self._dagster_type_dict.values()\n\n @property\n def all_solid_defs(self):\n return list(self._all_node_defs.values())\n\n @property\n def top_level_solid_defs(self):\n return self._current_level_node_defs\n\n def solid_def_named(self, name):\n check.str_param(name, "name")\n\n check.invariant(name in self._all_node_defs, "{} not found".format(name))\n return self._all_node_defs[name]\n\n def has_solid_def(self, name):\n check.str_param(name, "name")\n return name in self._all_node_defs\n\n def get_pipeline_subset_def(self, solids_to_execute):\n return (\n self if solids_to_execute is None else _get_pipeline_subset_def(self, solids_to_execute)\n )\n\n def get_presets(self):\n return list(self._preset_dict.values())\n\n def has_preset(self, name):\n check.str_param(name, "name")\n return name in self._preset_dict\n\n def get_preset(self, name):\n check.str_param(name, "name")\n if name not in self._preset_dict:\n raise DagsterInvariantViolationError(\n (\n 'Could not find preset for "{name}". Available presets '\n 'for pipeline "{pipeline_name}" are {preset_names}.'\n ).format(\n name=name, preset_names=list(self._preset_dict.keys()), pipeline_name=self._name\n )\n )\n\n return self._preset_dict[name]\n\n def get_pipeline_snapshot(self):\n return self.get_pipeline_index().pipeline_snapshot\n\n def get_pipeline_snapshot_id(self):\n return self.get_pipeline_index().pipeline_snapshot_id\n\n def get_pipeline_index(self):\n from dagster.core.snap import PipelineSnapshot\n from dagster.core.host_representation import PipelineIndex\n\n return PipelineIndex(\n PipelineSnapshot.from_pipeline_def(self), self.get_parent_pipeline_snapshot()\n )\n\n def get_config_schema_snapshot(self):\n return self.get_pipeline_snapshot().config_schema_snapshot\n\n @property\n def is_subset_pipeline(self):\n return False\n\n @property\n def parent_pipeline_def(self):\n return None\n\n def get_parent_pipeline_snapshot(self):\n return None\n\n @property\n def solids_to_execute(self):\n return None\n\n @property\n def hook_defs(self):\n return self._hook_defs\n\n def get_all_hooks_for_handle(self, handle):\n """Gather all the hooks for the given solid from all places possibly attached with a hook.\n\n A hook can be attached to any of the following objects\n * Solid (solid invocation)\n * PipelineDefinition\n\n Args:\n handle (SolidHandle): The solid's handle\n\n Returns:\n FrozeSet[HookDefinition]\n """\n check.inst_param(handle, "handle", SolidHandle)\n hook_defs = set()\n\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n # hooks on top-level solid\n name = lineage.pop()\n solid = self.solid_named(name)\n hook_defs = hook_defs.union(solid.hook_defs)\n\n # hooks on non-top-level solids\n while lineage:\n name = lineage.pop()\n solid = solid.definition.solid_named(name)\n hook_defs = hook_defs.union(solid.hook_defs)\n\n # hooks applied to a pipeline definition will run on every solid\n hook_defs = hook_defs.union(self.hook_defs)\n\n return frozenset(hook_defs)\n\n def with_hooks(self, hook_defs):\n """Apply a set of hooks to all 
solid instances within the pipeline."""\n\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n return PipelineDefinition(\n solid_defs=self.top_level_solid_defs,\n name=self.name,\n description=self.description,\n dependencies=self.dependencies,\n mode_defs=self.mode_definitions,\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=hook_defs.union(self.hook_defs),\n _parent_pipeline_def=self._parent_pipeline_def,\n )\n\n\nclass PipelineSubsetDefinition(PipelineDefinition):\n @property\n def solids_to_execute(self):\n return frozenset(self._solid_dict.keys())\n\n @property\n def solid_selection(self):\n # we currently don't pass the real solid_selection (the solid query list) down here.\n # so in the short-term, to make the call sites cleaner, we will convert the solids to execute\n # to a list\n return list(self._solid_dict.keys())\n\n @property\n def parent_pipeline_def(self):\n return self._parent_pipeline_def\n\n def get_parent_pipeline_snapshot(self):\n return self._parent_pipeline_def.get_pipeline_snapshot()\n\n @property\n def is_subset_pipeline(self):\n return True\n\n def get_pipeline_subset_def(self, solids_to_execute):\n raise DagsterInvariantViolationError("Pipeline subsets may not be subset again.")\n\n\ndef _dep_key_of(solid):\n return SolidInvocation(solid.definition.name, solid.name)\n\n\ndef _get_pipeline_subset_def(pipeline_def, solids_to_execute):\n """\n Build a pipeline which is a subset of another pipeline.\n Only includes the solids which are in solids_to_execute.\n """\n\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.set_param(solids_to_execute, "solids_to_execute", of_type=str)\n\n for solid_name in solids_to_execute:\n if not pipeline_def.has_solid_named(solid_name):\n raise DagsterInvalidSubsetError(\n "Pipeline {pipeline_name} has no solid named {name}.".format(\n pipeline_name=pipeline_def.name, name=solid_name\n ),\n )\n\n solids = list(map(pipeline_def.solid_named, solids_to_execute))\n deps = {_dep_key_of(solid): {} for solid in solids}\n\n for solid in solids:\n for input_handle in solid.input_handles():\n if pipeline_def.dependency_structure.has_singular_dep(input_handle):\n output_handle = pipeline_def.dependency_structure.get_singular_dep(input_handle)\n if output_handle.solid.name in solids_to_execute:\n deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n elif pipeline_def.dependency_structure.has_multi_deps(input_handle):\n output_handles = pipeline_def.dependency_structure.get_multi_deps(input_handle)\n deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition(\n [\n DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n for output_handle in output_handles\n if output_handle.solid.name in solids_to_execute\n ]\n )\n\n try:\n sub_pipeline_def = PipelineSubsetDefinition(\n name=pipeline_def.name, # should we change the name for subsetted pipeline?\n solid_defs=list({solid.definition for solid in solids}),\n mode_defs=pipeline_def.mode_definitions,\n dependencies=deps,\n _parent_pipeline_def=pipeline_def,\n tags=pipeline_def.tags,\n hook_defs=pipeline_def.hook_defs,\n )\n\n return sub_pipeline_def\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. 
Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(solids_to_execute)} for pipeline "\n f"{pipeline_def.name} results in an invalid pipeline"\n ) from exc\n\n\ndef _validate_resource_dependencies(\n mode_definitions, node_defs, dagster_type_dict, solid_dict, pipeline_hook_defs\n):\n """This validation ensures that each pipeline context provides the resources that are required\n by each solid.\n """\n check.list_param(mode_definitions, "mode_definitions", of_type=ModeDefinition)\n check.list_param(node_defs, "node_defs", of_type=NodeDefinition)\n check.dict_param(dagster_type_dict, "dagster_type_dict")\n check.dict_param(solid_dict, "solid_dict")\n check.set_param(pipeline_hook_defs, "pipeline_hook_defs", of_type=HookDefinition)\n\n for mode_def in mode_definitions:\n mode_resources = set(mode_def.resource_defs.keys())\n for node_def in node_defs:\n for required_resource in node_def.required_resource_keys:\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by solid def {node_def_name}, but is not '\n 'provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n node_def_name=node_def.name,\n mode_name=mode_def.name,\n )\n )\n\n _validate_type_resource_deps_for_mode(mode_def, mode_resources, dagster_type_dict)\n\n for intermediate_storage in mode_def.intermediate_storage_defs or []:\n for required_resource in intermediate_storage.required_resource_keys:\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n "Resource '{resource}' is required by intermediate storage "\n "'{storage_name}', but is not provided by mode '{mode_name}'."\n ).format(\n resource=required_resource,\n storage_name=intermediate_storage.name,\n mode_name=mode_def.name,\n )\n )\n for solid in solid_dict.values():\n for hook_def in solid.hook_defs:\n for required_resource in hook_def.required_resource_keys:\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by hook "{hook_name}", but is not '\n 'provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n hook_name=hook_def.name,\n mode_name=mode_def.name,\n )\n )\n\n for hook_def in pipeline_hook_defs:\n for required_resource in hook_def.required_resource_keys:\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by hook "{hook_name}", but is not '\n 'provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n hook_name=hook_def.name,\n mode_name=mode_def.name,\n )\n )\n\n\ndef _validate_type_resource_deps_for_mode(mode_def, mode_resources, dagster_type_dict):\n for dagster_type in dagster_type_dict.values():\n for required_resource in dagster_type.required_resource_keys:\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by type "{type_name}", but is not '\n 'provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n type_name=dagster_type.display_name,\n mode_name=mode_def.name,\n )\n )\n if dagster_type.loader:\n for required_resource in dagster_type.loader.required_resource_keys():\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by the loader on type '\n '"{type_name}", but is 
not provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n type_name=dagster_type.display_name,\n mode_name=mode_def.name,\n )\n )\n if dagster_type.materializer:\n for required_resource in dagster_type.materializer.required_resource_keys():\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by the materializer on type '\n '"{type_name}", but is not provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n type_name=dagster_type.display_name,\n mode_name=mode_def.name,\n )\n )\n\n for plugin in dagster_type.auto_plugins:\n used_by_storage = set(\n [\n intermediate_storage_def.name\n for intermediate_storage_def in mode_def.intermediate_storage_defs\n if plugin.compatible_with_storage_def(intermediate_storage_def)\n ]\n )\n\n if used_by_storage:\n for required_resource in plugin.required_resource_keys():\n if required_resource not in mode_resources:\n raise DagsterInvalidDefinitionError(\n (\n 'Resource "{resource}" is required by the plugin "{plugin_name}"'\n ' on type "{type_name}" (used with storages {storages}), '\n 'but is not provided by mode "{mode_name}".'\n ).format(\n resource=required_resource,\n type_name=dagster_type.display_name,\n plugin_name=plugin.__name__,\n mode_name=mode_def.name,\n storages=used_by_storage,\n )\n )\n\n\ndef _validate_inputs(dependency_structure, solid_dict, mode_definitions):\n for solid in solid_dict.values():\n for handle in solid.input_handles():\n if dependency_structure.has_deps(handle):\n for mode_def in mode_definitions:\n for source_output_handle in dependency_structure.get_deps_list(handle):\n output_manager_key = source_output_handle.output_def.io_manager_key\n output_manager_def = mode_def.resource_defs[output_manager_key]\n # TODO: remove the IOutputManagerDefinition check when asset store\n # API is removed.\n if isinstance(\n output_manager_def, IOutputManagerDefinition\n ) and not isinstance(output_manager_def, IInputManagerDefinition):\n raise DagsterInvalidDefinitionError(\n f'Input "{handle.input_def.name}" of solid "{solid.name}" is '\n f'connected to output "{source_output_handle.output_def.name}" '\n f'of solid "{source_output_handle.solid.name}". In mode '\n f'"{mode_def.name}", that output does not have an output '\n f"manager that knows how to load inputs, so we don't know how "\n f"to load the input. To address this, assign an IOManager to "\n f"the upstream output."\n )\n else:\n if (\n not handle.input_def.dagster_type.loader\n and not handle.input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not handle.input_def.root_manager_key\n ):\n raise DagsterInvalidDefinitionError(\n 'Input "{input_name}" in solid "{solid_name}" is not connected to '\n "the output of a previous solid and can not be loaded from configuration, "\n "creating an impossible to execute pipeline. 
"\n "Possible solutions are:\\n"\n ' * add a dagster_type_loader for the type "{dagster_type}"\\n'\n ' * connect "{input_name}" to the output of another solid\\n'.format(\n solid_name=solid.name,\n input_name=handle.input_def.name,\n dagster_type=handle.input_def.dagster_type.display_name,\n )\n )\n\n\ndef _build_all_node_defs(node_defs):\n all_defs = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting solid definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(pipeline_def, mode_definition):\n from .environment_configs import (\n EnvironmentClassCreationData,\n construct_config_type_dictionary,\n define_environment_cls,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset pipeline, include the missing solids\n # from the original pipeline as ignored to allow execution with\n # run config that is valid for the original\n if pipeline_def.is_subset_pipeline:\n ignored_solids = [\n solid\n for solid in pipeline_def.parent_pipeline_def.solids\n if not pipeline_def.has_solid_named(solid.name)\n ]\n else:\n ignored_solids = []\n\n environment_type = define_environment_cls(\n EnvironmentClassCreationData(\n pipeline_name=pipeline_def.name,\n solids=pipeline_def.solids,\n dependency_structure=pipeline_def.dependency_structure,\n mode_definition=mode_definition,\n logger_defs=mode_definition.loggers,\n ignored_solids=ignored_solids,\n )\n )\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n pipeline_def.all_solid_defs, environment_type\n )\n\n return RunConfigSchema(\n environment_type=environment_type,\n config_type_dict_by_name=config_type_dict_by_name,\n config_type_dict_by_key=config_type_dict_by_key,\n )\n
\nfrom collections import namedtuple\n\nimport pkg_resources\nimport yaml\nfrom dagster import check\nfrom dagster.core.definitions.utils import config_from_files, config_from_yaml_strings\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.utils.merger import deep_merge_dicts\n\nfrom .mode import DEFAULT_MODE_NAME\nfrom .utils import check_valid_name\n\n\n[docs]class PresetDefinition(\n namedtuple("_PresetDefinition", "name run_config solid_selection mode tags")\n):\n """Defines a preset configuration in which a pipeline can execute.\n\n Presets can be used in Dagit to load predefined configurations into the tool.\n\n Presets may also be used from the Python API (in a script, or in test) as follows:\n\n .. code-block:: python\n\n execute_pipeline(pipeline_def, preset='example_preset')\n\n Presets may also be used with the command line tools:\n\n .. code-block:: shell\n\n $ dagster pipeline execute example_pipeline --preset example_preset\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n run_config (Optional[dict]): A dict representing the config to set with the preset.\n This is equivalent to the ``run_config`` argument to :py:func:`execute_pipeline`.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default: 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n """\n\n def __new__(\n cls, name, run_config=None, solid_selection=None, mode=None, tags=None,\n ):\n\n return super(PresetDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n run_config=run_config,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n mode=check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME),\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n )\n\n[docs] @staticmethod\n def from_files(name, config_files=None, solid_selection=None, mode=None, tags=None):\n """Static constructor for presets from YAML files.\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n config_files (Optional[List[str]]): List of paths or glob patterns for yaml files\n to load and parse as the environment config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. 
(default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML files.\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n check.str_param(name, "name")\n config_files = check.opt_list_param(config_files, "config_files")\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n\n merged = config_from_files(config_files)\n\n return PresetDefinition(name, merged, solid_selection, mode, tags)\n\n[docs] @staticmethod\n def from_yaml_strings(name, yaml_strings=None, solid_selection=None, mode=None, tags=None):\n """Static constructor for presets from YAML strings.\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n yaml_strings (Optional[List[str]]): List of yaml strings to parse as the environment\n config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n check.str_param(name, "name")\n yaml_strings = check.opt_list_param(yaml_strings, "yaml_strings", of_type=str)\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n\n merged = config_from_yaml_strings(yaml_strings)\n\n return PresetDefinition(name, merged, solid_selection, mode, tags)\n\n[docs] @staticmethod\n def from_pkg_resources(\n name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None\n ):\n """Load a preset from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n\n .. code-block:: python\n\n PresetDefinition.from_pkg_resources(\n name='local',\n mode='local',\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n pkg_resource_defs (Optional[List[(str, str)]]): List of pkg_resource modules/files to\n load as environment config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with this partition. e.g.\n ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. 
(default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n pkg_resource_defs = check.opt_list_param(\n pkg_resource_defs, "pkg_resource_defs", of_type=tuple\n )\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs} "\n f'on preset "{name}".'\n ) from err\n\n return PresetDefinition.from_yaml_strings(name, yaml_strings, solid_selection, mode, tags)\n\n[docs] def get_environment_yaml(self):\n """Get the environment dict set on a preset as YAML.\n\n Returns:\n str: The environment dict as YAML.\n """\n return yaml.dump(self.run_config or {}, default_flow_style=False)\n\n[docs] def with_additional_config(self, run_config):\n """Return a new PresetDefinition with additional config merged in to the existing config."""\n\n check.opt_nullable_dict_param(run_config, "run_config")\n if run_config is None:\n return self\n else:\n return PresetDefinition(\n name=self.name,\n solid_selection=self.solid_selection,\n mode=self.mode,\n tags=self.tags,\n run_config=deep_merge_dicts(self.run_config, run_config),\n )\n
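Since the preset module above is reference material generated from docstrings, a short usage sketch may help; the solid name and config keys below are hypothetical, while ``with_additional_config`` and ``get_environment_yaml`` are the methods documented above.

.. code-block:: python

    from dagster import PresetDefinition

    dev_preset = PresetDefinition(
        "dev",
        run_config={"solids": {"my_solid": {"config": {"verbose": True}}}},
        mode="default",
    )

    # merge additional config into a copy; the original preset is unchanged
    noisy_preset = dev_preset.with_additional_config(
        {"loggers": {"console": {"config": {"log_level": "DEBUG"}}}}
    )
    print(noisy_preset.get_environment_yaml())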
\nimport inspect\nimport os\nimport sys\nfrom collections import namedtuple\nfrom functools import lru_cache\n\nfrom dagster import check, seven\nfrom dagster.core.code_pointer import (\n CodePointer,\n CustomPointer,\n FileCodePointer,\n ModuleCodePointer,\n get_python_file_from_target,\n)\nfrom dagster.core.errors import DagsterInvalidSubsetError, DagsterInvariantViolationError\nfrom dagster.core.origin import PipelinePythonOrigin, RepositoryPythonOrigin, SchedulePythonOrigin\nfrom dagster.core.selector import parse_solid_selection\nfrom dagster.serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster.utils.backcompat import experimental\n\nfrom .pipeline_base import IPipeline\n\n\ndef get_ephemeral_repository_name(pipeline_name):\n check.str_param(pipeline_name, "pipeline_name")\n return "__repository__{pipeline_name}".format(pipeline_name=pipeline_name)\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n namedtuple("_ReconstructableRepository", "pointer container_image")\n):\n def __new__(\n cls, pointer, container_image=None,\n ):\n return super(ReconstructableRepository, cls).__new__(\n cls,\n pointer=check.inst_param(pointer, "pointer", CodePointer),\n container_image=check.opt_str_param(container_image, "container_image"),\n )\n\n @lru_cache(maxsize=1)\n def get_definition(self):\n return repository_def_from_pointer(self.pointer)\n\n def get_reconstructable_pipeline(self, name):\n return ReconstructablePipeline(self, name)\n\n def get_reconstructable_schedule(self, name):\n return ReconstructableSchedule(self, name)\n\n @classmethod\n def for_file(cls, file, fn_name, working_directory=None, container_image=None):\n if not working_directory:\n working_directory = os.getcwd()\n return cls(FileCodePointer(file, fn_name, working_directory), container_image)\n\n @classmethod\n def for_module(cls, module, fn_name, container_image=None):\n return cls(ModuleCodePointer(module, fn_name), container_image)\n\n def get_cli_args(self):\n return self.pointer.get_cli_args()\n\n def get_python_origin(self):\n return RepositoryPythonOrigin(\n executable_path=sys.executable,\n code_pointer=self.pointer,\n container_image=self.container_image,\n )\n\n def get_python_origin_id(self):\n return self.get_python_origin().get_id()\n\n\n@whitelist_for_serdes\nclass ReconstructablePipeline(\n namedtuple(\n "_ReconstructablePipeline",\n "repository pipeline_name solid_selection_str solids_to_execute",\n ),\n IPipeline,\n):\n def __new__(\n cls, repository, pipeline_name, solid_selection_str=None, solids_to_execute=None,\n ):\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n return super(ReconstructablePipeline, cls).__new__(\n cls,\n repository=check.inst_param(repository, "repository", ReconstructableRepository),\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n solid_selection_str=check.opt_str_param(solid_selection_str, "solid_selection_str"),\n solids_to_execute=solids_to_execute,\n )\n\n @property\n def solid_selection(self):\n return seven.json.loads(self.solid_selection_str) if self.solid_selection_str else None\n\n @lru_cache(maxsize=1)\n def get_definition(self):\n return (\n self.repository.get_definition()\n .get_pipeline(self.pipeline_name)\n .get_pipeline_subset_def(self.solids_to_execute)\n )\n\n def _resolve_solid_selection(self, solid_selection):\n # resolve a list of solid selection queries to a frozenset of qualified solid names\n # e.g. 
['foo_solid+'] to {'foo_solid', 'bar_solid'}\n check.list_param(solid_selection, "solid_selection", of_type=str)\n solids_to_execute = parse_solid_selection(self.get_definition(), solid_selection)\n if len(solids_to_execute) == 0:\n raise DagsterInvalidSubsetError(\n "No qualified solids to execute found for solid_selection={requested}".format(\n requested=solid_selection\n )\n )\n return solids_to_execute\n\n def get_reconstructable_repository(self):\n return self.repository\n\n def _subset_for_execution(self, solids_to_execute, solid_selection=None):\n if solids_to_execute:\n pipe = ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n solid_selection_str=seven.json.dumps(solid_selection) if solid_selection else None,\n solids_to_execute=frozenset(solids_to_execute),\n )\n else:\n pipe = ReconstructablePipeline(\n repository=self.repository, pipeline_name=self.pipeline_name,\n )\n\n pipe.get_definition() # verify the subset is correct\n return pipe\n\n def subset_for_execution(self, solid_selection):\n # take a list of solid queries and resolve the queries to names of solids to execute\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n solids_to_execute = (\n self._resolve_solid_selection(solid_selection) if solid_selection else None\n )\n\n return self._subset_for_execution(solids_to_execute, solid_selection)\n\n def subset_for_execution_from_existing_pipeline(self, solids_to_execute):\n # take a frozenset of resolved solid names from an existing pipeline\n # so there's no need to parse the selection\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n\n return self._subset_for_execution(solids_to_execute)\n\n def describe(self):\n return '"{name}" in repository ({repo})'.format(\n repo=self.repository.pointer.describe, name=self.pipeline_name\n )\n\n @staticmethod\n def for_file(python_file, fn_name):\n return bootstrap_standalone_recon_pipeline(\n FileCodePointer(python_file, fn_name, os.getcwd())\n )\n\n @staticmethod\n def for_module(module, fn_name):\n return bootstrap_standalone_recon_pipeline(ModuleCodePointer(module, fn_name))\n\n def to_dict(self):\n return pack_value(self)\n\n @staticmethod\n def from_dict(val):\n check.dict_param(val, "val")\n\n inst = unpack_value(val)\n check.invariant(\n isinstance(inst, ReconstructablePipeline),\n "Deserialized object is not instance of ReconstructablePipeline, got {type}".format(\n type=type(inst)\n ),\n )\n return inst\n\n def get_python_origin(self):\n return PipelinePythonOrigin(self.pipeline_name, self.repository.get_python_origin())\n\n def get_python_origin_id(self):\n return self.get_python_origin().get_id()\n\n\n@whitelist_for_serdes\nclass ReconstructableSchedule(namedtuple("_ReconstructableSchedule", "repository schedule_name",)):\n def __new__(\n cls, repository, schedule_name,\n ):\n return super(ReconstructableSchedule, cls).__new__(\n cls,\n repository=check.inst_param(repository, "repository", ReconstructableRepository),\n schedule_name=check.str_param(schedule_name, "schedule_name"),\n )\n\n def get_python_origin(self):\n return SchedulePythonOrigin(self.schedule_name, self.repository.get_python_origin())\n\n def get_python_origin_id(self):\n return self.get_python_origin().get_id()\n\n @lru_cache(maxsize=1)\n def get_definition(self):\n return self.repository.get_definition().get_schedule_def(self.schedule_name)\n\n\n[docs]def reconstructable(target):\n """\n Create a ReconstructablePipeline from a function that returns a 
PipelineDefinition, or a\n function decorated with :py:func:`@pipeline <dagster.pipeline>`\n\n When your pipeline must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like dagstermill), Dagster must know how to reconstruct the pipeline\n on the other side of the process boundary.\n\n This function implements a very conservative strategy for reconstructing pipelines, so that\n its behavior is easy to predict, but as a consequence it is not able to reconstruct certain\n kinds of pipelines, such as those defined by lambdas, in nested scopes (e.g., dynamically\n within a method call), or in interactive environments such as the Python REPL or Jupyter\n notebooks.\n\n If you need to reconstruct pipelines constructed in these ways, you should use\n :py:func:`build_reconstructable_pipeline` instead, which allows you to specify your own\n strategy for reconstructing a pipeline.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import PipelineDefinition, pipeline, reconstructable\n\n @pipeline\n def foo_pipeline():\n ...\n\n reconstructable_foo_pipeline = reconstructable(foo_pipeline)\n\n\n def make_bar_pipeline():\n return PipelineDefinition(...)\n\n reconstructable_bar_pipeline = reconstructable(bar_pipeline)\n """\n from dagster.core.definitions import PipelineDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, PipelineDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n "by a decorated function, got {type}.".format(type=type(target)),\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_pipeline."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n 'Reconstructable target "{target.__name__}" has a different '\n '__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_pipeline.".format(\n target=target\n )\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and inspect.getmodule(target).__name__ != "__main__"\n ):\n return ReconstructablePipeline.for_module(target.__module__, target.__name__)\n except: # pylint: disable=bare-except\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct pipelines defined in interactive environments "\n "like <stdin>, IPython, or Jupyter notebooks. 
"\n "Use a pipeline defined in a module or file instead, or "\n "use build_reconstructable_pipeline."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_pipeline(pointer)\n\n\n@experimental\ndef build_reconstructable_pipeline(\n reconstructor_module_name,\n reconstructor_function_name,\n reconstructable_args=None,\n reconstructable_kwargs=None,\n):\n """\n Create a ReconstructablePipeline.\n\n When your pipeline must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like dagstermill), Dagster must know how to reconstruct the pipeline\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing pipelines, so\n that you can reconstruct certain kinds of pipelines that are not supported by\n :py:func:`reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct pipelines constructed in these ways, use this function instead of\n :py:func:`reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the pipeline.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n pipeline.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the pipeline.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n pipeline. Values of the dict must be JSON serializable.\n\n Examples:\n\n .. code-block:: python\n\n # module: mymodule\n\n from dagster import PipelineDefinition, pipeline, build_reconstructable_pipeline\n\n class PipelineFactory:\n def make_pipeline(*args, **kwargs):\n\n @pipeline\n def _pipeline(...):\n ...\n\n return _pipeline\n\n def reconstruct_pipeline(*args):\n factory = PipelineFactory()\n return factory.make_pipeline(*args)\n\n factory = PipelineFactory()\n\n foo_pipeline_args = (...,...)\n\n foo_pipeline_kwargs = {...:...}\n\n foo_pipeline = factory.make_pipeline(*foo_pipeline_args, **foo_pipeline_kwargs)\n\n reconstructable_foo_pipeline = build_reconstructable_pipeline(\n 'mymodule',\n 'reconstruct_pipeline',\n foo_pipeline_args,\n foo_pipeline_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n\n reconstructable_args = list(check.opt_tuple_param(reconstructable_args, "reconstructable_args"))\n reconstructable_kwargs = list(\n (\n [key, value]\n for key, value in check.opt_dict_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name, reconstructor_function_name\n )\n\n pointer = CustomPointer(reconstructor_pointer, reconstructable_args, reconstructable_kwargs)\n\n pipeline_def = pipeline_def_from_pointer(pointer)\n\n return ReconstructablePipeline(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n pipeline_name=pipeline_def.name,\n )\n\n\ndef bootstrap_standalone_recon_pipeline(pointer):\n # So this actually straps the the pipeline for the sole\n # purpose of getting the pipeline name. 
If we changed ReconstructablePipeline\n # to get the pipeline on demand in order to get name, we could avoid this.\n pipeline_def = pipeline_def_from_pointer(pointer)\n return ReconstructablePipeline(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n pipeline_name=pipeline_def.name,\n )\n\n\ndef _check_is_loadable(definition):\n from .pipeline import PipelineDefinition\n from .repository import RepositoryDefinition\n\n if not isinstance(definition, (PipelineDefinition, RepositoryDefinition)):\n raise DagsterInvariantViolationError(\n (\n "Loadable attributes must be either a PipelineDefinition or a "\n "RepositoryDefinition. Got {definition}."\n ).format(definition=repr(definition))\n )\n return definition\n\n\ndef load_def_in_module(module_name, attribute):\n return def_from_pointer(CodePointer.from_module(module_name, attribute))\n\n\ndef load_def_in_package(package_name, attribute):\n return def_from_pointer(CodePointer.from_python_package(package_name, attribute))\n\n\ndef load_def_in_python_file(python_file, attribute, working_directory):\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(pointer):\n target = pointer.load_target()\n\n from .pipeline import PipelineDefinition\n from .repository import RepositoryDefinition\n\n if isinstance(target, (PipelineDefinition, RepositoryDefinition)) or not callable(target):\n return _check_is_loadable(target)\n\n # if its a function invoke it - otherwise we are pointing to a\n # artifact in module scope, likely decorator output\n\n if seven.get_args(target):\n raise DagsterInvariantViolationError(\n "Error invoking function at {target} with no arguments. "\n "Reconstructable target must be callable with no arguments".format(\n target=pointer.describe()\n )\n )\n\n return _check_is_loadable(target())\n\n\ndef pipeline_def_from_pointer(pointer):\n from .pipeline import PipelineDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, PipelineDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a PipelineDefinition. "\n "Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\ndef repository_def_from_target_def(target):\n from .pipeline import PipelineDefinition\n from .repository import RepositoryData, RepositoryDefinition\n\n # special case - we can wrap a single pipeline in a repository\n if isinstance(target, PipelineDefinition):\n # consider including pipeline name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=RepositoryData.from_list([target]),\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n else:\n return None\n\n\ndef repository_def_from_pointer(pointer):\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target)\n if not repo_def:\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a "\n "RepositoryDefinition or a PipelineDefinition. "\n "Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n return repo_def\n
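A hedged sketch of the reconstruction path documented above, assuming the snippet lives in an importable module (``my_solid`` and ``my_pipeline`` are hypothetical names): ``reconstructable`` records a module-scope code pointer rather than serializing the PipelineDefinition itself, and ``subset_for_execution`` resolves a selection query before execution.

.. code-block:: python

    from dagster import pipeline, reconstructable, solid

    @solid
    def my_solid(_):
        return 1

    @pipeline
    def my_pipeline():
        my_solid()

    # records a code pointer so the pipeline can be rebuilt across a process boundary
    recon = reconstructable(my_pipeline)
    print(recon.describe())

    # resolve a solid selection query to concrete solid names before execution
    recon_subset = recon.subset_for_execution(["my_solid"])
    assert recon_subset.solids_to_execute == frozenset({"my_solid"})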
\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.utils import merge_dicts\n\nfrom .job import JobDefinition, JobType\nfrom .partition import PartitionScheduleDefinition, PartitionSetDefinition\nfrom .pipeline import PipelineDefinition\nfrom .schedule import ScheduleDefinition\nfrom .utils import check_valid_name\n\nVALID_REPOSITORY_DATA_DICT_KEYS = {\n "pipelines",\n "partition_sets",\n "schedules",\n "jobs",\n}\n\n\nclass _CacheingDefinitionIndex:\n def __init__(self, definition_class, definition_class_name, definition_kind, definitions):\n\n for key, definition in definitions.items():\n check.invariant(\n isinstance(definition, definition_class) or callable(definition),\n "Bad definition for {definition_kind} {key}: must be {definition_class_name} or "\n "callable, got {type_}".format(\n definition_kind=definition_kind,\n key=key,\n definition_class_name=definition_class_name,\n type_=type(definition),\n ),\n )\n\n self._definition_class = definition_class\n self._definition_class_name = definition_class_name\n self._definition_kind = definition_kind\n\n self._definitions = definitions\n self._definition_cache = {}\n self._definition_names = None\n self._all_definitions = None\n\n def get_definition_names(self):\n if self._definition_names:\n return self._definition_names\n\n self._definition_names = list(self._definitions.keys())\n return self._definition_names\n\n def has_definition(self, definition_name):\n check.str_param(definition_name, "definition_name")\n\n return definition_name in self.get_definition_names()\n\n def get_all_definitions(self):\n if self._all_definitions is not None:\n return self._all_definitions\n\n self._all_definitions = list(\n sorted(\n map(self.get_definition, self.get_definition_names()),\n key=lambda definition: definition.name,\n )\n )\n return self._all_definitions\n\n def get_definition(self, definition_name):\n check.str_param(definition_name, "definition_name")\n\n if definition_name in self._definition_cache:\n return self._definition_cache[definition_name]\n\n if definition_name not in self._definitions:\n raise DagsterInvariantViolationError(\n "Could not find {definition_kind} '{definition_name}'. 
Found: "\n "{found_names}.".format(\n definition_kind=self._definition_kind,\n definition_name=definition_name,\n found_names=", ".join(\n [\n "'{found_name}'".format(found_name=found_name)\n for found_name in self.get_definition_names()\n ]\n ),\n )\n )\n\n definition_source = self._definitions[definition_name]\n\n if isinstance(definition_source, self._definition_class):\n self._definition_cache[definition_name] = definition_source\n return definition_source\n else:\n definition = definition_source()\n check.invariant(\n isinstance(definition, self._definition_class),\n "Bad constructor for {definition_kind} {definition_name}: must return "\n "{definition_class_name}, got value of type {type_}".format(\n definition_kind=self._definition_kind,\n definition_name=definition_name,\n definition_class_name=self._definition_class_name,\n type_=type(definition),\n ),\n )\n check.invariant(\n definition.name == definition_name,\n "Bad constructor for {definition_kind} '{definition_name}': name in "\n "{definition_class_name} does not match: got '{definition_def_name}'".format(\n definition_kind=self._definition_kind,\n definition_name=definition_name,\n definition_class_name=self._definition_class_name,\n definition_def_name=definition.name,\n ),\n )\n self._definition_cache[definition_name] = definition\n return definition\n\n\nclass RepositoryData:\n """Contains definitions belonging to a repository.\n\n Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass RepositoryData for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n def __init__(self, pipelines, partition_sets, schedules, jobs):\n """Constructs a new RepositoryData object.\n\n You may pass pipeline, partition_set, and schedule definitions directly, or you may pass\n callables with no arguments that will be invoked to lazily construct definitions when\n accessed by name. 
This can be helpful for performance when there are many definitions in a\n repository, or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n pipelines (Dict[str, Union[PipelineDefinition, Callable[[], PipelineDefinition]]]):\n The pipeline definitions belonging to the repository.\n partition_sets (Dict[str, Union[PartitionSetDefinition, Callable[[], PartitionSetDefinition]]]):\n The partition sets belonging to the repository.\n schedules (Dict[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n jobs (Dict[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The predefined jobs for a repository.\n\n """\n check.dict_param(pipelines, "pipelines", key_type=str)\n check.dict_param(partition_sets, "partition_sets", key_type=str)\n check.dict_param(schedules, "schedules", key_type=str)\n check.dict_param(jobs, "jobs", key_type=str)\n\n self._pipelines = _CacheingDefinitionIndex(\n PipelineDefinition, "PipelineDefinition", "pipeline", pipelines\n )\n self._schedules = _CacheingDefinitionIndex(\n ScheduleDefinition, "ScheduleDefinition", "schedule", schedules\n )\n schedule_partition_sets = [\n schedule.get_partition_set()\n for schedule in self._schedules.get_all_definitions()\n if isinstance(schedule, PartitionScheduleDefinition)\n ]\n self._partition_sets = _CacheingDefinitionIndex(\n PartitionSetDefinition,\n "PartitionSetDefinition",\n "partition set",\n merge_dicts(\n {partition_set.name: partition_set for partition_set in schedule_partition_sets},\n partition_sets,\n ),\n )\n self._jobs = _CacheingDefinitionIndex(JobDefinition, "JobDefinition", "job", jobs,)\n self._all_pipelines = None\n self._solids = None\n self._all_solids = None\n\n @staticmethod\n def from_dict(repository_definitions):\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n check.dict_param(repository_definitions, "repository_definitions", key_type=str)\n check.invariant(\n set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS),\n "Bad dict: must not contain keys other than {{{valid_keys}}}: found {bad_keys}.".format(\n valid_keys=", ".join(\n ["'{key}'".format(key=key) for key in VALID_REPOSITORY_DATA_DICT_KEYS]\n ),\n bad_keys=", ".join(\n [\n "'{key}'"\n for key in repository_definitions.keys()\n if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n ]\n ),\n ),\n )\n\n for key in VALID_REPOSITORY_DATA_DICT_KEYS:\n if key not in repository_definitions:\n repository_definitions[key] = {}\n\n return RepositoryData(**repository_definitions)\n\n @classmethod\n def from_list(cls, repository_definitions):\n """Static constructor.\n\n Args:\n repository_definition (List[Union[PipelineDefinition, PartitionSetDefinition, ScheduleDefinition]]):\n Use this constructor when you have no need to lazy load pipelines or other\n definitions.\n """\n pipelines 
= {}\n partition_sets = {}\n schedules = {}\n jobs = {}\n for definition in repository_definitions:\n if isinstance(definition, PipelineDefinition):\n if definition.name in pipelines:\n raise DagsterInvalidDefinitionError(\n "Duplicate pipeline definition found for pipeline {pipeline_name}".format(\n pipeline_name=definition.name\n )\n )\n pipelines[definition.name] = definition\n elif isinstance(definition, PartitionSetDefinition):\n if definition.name in partition_sets:\n raise DagsterInvalidDefinitionError(\n "Duplicate partition set definition found for partition set "\n "{partition_set_name}".format(partition_set_name=definition.name)\n )\n partition_sets[definition.name] = definition\n elif isinstance(definition, JobDefinition):\n if isinstance(definition, ScheduleDefinition):\n if definition.name in schedules:\n raise DagsterInvalidDefinitionError(\n "Duplicate schedule definition found for schedule {schedule_name}".format(\n schedule_name=definition.name\n )\n )\n schedules[definition.name] = definition\n if isinstance(definition, PartitionScheduleDefinition):\n partition_set_def = definition.get_partition_set()\n if (\n partition_set_def.name in partition_sets\n and partition_set_def != partition_sets[partition_set_def.name]\n ):\n raise DagsterInvalidDefinitionError(\n "Duplicate partition set definition found for partition set "\n "{partition_set_name}".format(\n partition_set_name=partition_set_def.name\n )\n )\n partition_sets[partition_set_def.name] = partition_set_def\n if definition.name in jobs:\n raise DagsterInvalidDefinitionError(\n "Duplicate job definition found for job {name}".format(name=definition.name)\n )\n jobs[definition.name] = definition\n\n return RepositoryData(\n pipelines=pipelines, partition_sets=partition_sets, schedules=schedules, jobs=jobs,\n )\n\n def get_pipeline_names(self):\n """Get the names of all pipelines in the repository.\n\n Returns:\n List[str]\n """\n return self._pipelines.get_definition_names()\n\n def has_pipeline(self, pipeline_name):\n """Check if a pipeline with a given name is present in the repository.\n\n Args:\n pipeline_name (str): The name of the pipeline.\n\n Returns:\n bool\n """\n check.str_param(pipeline_name, "pipeline_name")\n return self._pipelines.has_definition(pipeline_name)\n\n def get_all_pipelines(self):\n """Return all pipelines in the repository as a list.\n\n Note that this will construct any pipeline that has not yet been constructed.\n\n Returns:\n List[PipelineDefinition]: All pipelines in the repository.\n """\n if self._all_pipelines is not None:\n return self._all_pipelines\n\n self._all_pipelines = self._pipelines.get_all_definitions()\n self.get_all_solid_defs()\n return self._all_pipelines\n\n def get_pipeline(self, pipeline_name):\n """Get a pipeline by name.\n\n If this pipeline has not yet been constructed, only this pipeline is constructed, and will\n be cached for future calls.\n\n Args:\n pipeline_name (str): Name of the pipeline to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline definition corresponding to the given name.\n """\n\n check.str_param(pipeline_name, "pipeline_name")\n\n return self._pipelines.get_definition(pipeline_name)\n\n def get_partition_set_names(self):\n """Get the names of all partition sets in the repository.\n\n Returns:\n List[str]\n """\n return self._partition_sets.get_definition_names()\n\n def has_partition_set(self, partition_set_name):\n """Check if a partition set with a given name is present in the repository.\n\n Args:\n partition_set_name (str): The 
name of the partition set.\n\n Returns:\n bool\n """\n check.str_param(partition_set_name, "partition_set_name")\n return self._partition_sets.has_definition(partition_set_name)\n\n def get_all_partition_sets(self):\n """Return all partition sets in the repository as a list.\n\n Note that this will construct any partition set that has not yet been constructed.\n\n Returns:\n List[PartitionSetDefinition]: All partition sets in the repository.\n """\n return self._partition_sets.get_all_definitions()\n\n def get_partition_set(self, partition_set_name):\n """Get a partition set by name.\n\n If this partition set has not yet been constructed, only this partition set is constructed,\n and will be cached for future calls.\n\n Args:\n partition_set_name (str): Name of the partition set to retrieve.\n\n Returns:\n PartitionSetDefinition: The partition set definition corresponding to the given name.\n """\n\n check.str_param(partition_set_name, "partition_set_name")\n\n return self._partition_sets.get_definition(partition_set_name)\n\n def get_schedule_names(self):\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self):\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All pipelines in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name):\n """Get a schedule by name.\n\n if this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name):\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self):\n return [\n definition\n for definition in self._jobs.get_all_definitions()\n if definition.job_type == JobType.SENSOR\n ]\n\n def get_sensor(self, name):\n return self._jobs.get_definition(name)\n\n def has_sensor(self, name):\n return self._jobs.has_definition(name)\n\n def get_all_jobs(self):\n return self._jobs.get_all_definitions()\n\n def get_job(self, name):\n check.str_param(name, "name")\n return self._jobs.get_definition(name)\n\n def has_job(self, name):\n check.str_param(name, "name")\n return self._jobs.has_definition(name)\n\n def get_all_solid_defs(self):\n if self._all_solids is not None:\n return self._all_solids\n\n self._all_solids = self._construct_solid_defs()\n return list(self._all_solids.values())\n\n def has_solid(self, solid_name):\n if self._all_solids is not None:\n return solid_name in self._all_solids\n\n self._all_solids = self._construct_solid_defs()\n return solid_name in self._all_solids\n\n def _construct_solid_defs(self):\n solid_defs = {}\n solid_to_pipeline = {}\n # This looks like it should infinitely loop but the\n # memoization of _all_pipelines and _all_solids short\n # circuits that\n for pipeline in self.get_all_pipelines():\n for solid_def in pipeline.all_solid_defs:\n if solid_def.name not in solid_defs:\n solid_defs[solid_def.name] = solid_def\n solid_to_pipeline[solid_def.name] = pipeline.name\n\n if not 
solid_defs[solid_def.name] is solid_def:\n first_name, second_name = sorted(\n [solid_to_pipeline[solid_def.name], pipeline.name]\n )\n raise DagsterInvalidDefinitionError(\n (\n "Duplicate solids found in repository with name '{solid_def_name}'. "\n "Solid definition names must be unique within a repository. Solid is "\n "defined in pipeline '{first_pipeline_name}' and in pipeline "\n "'{second_pipeline_name}'."\n ).format(\n solid_def_name=solid_def.name,\n first_pipeline_name=first_name,\n second_pipeline_name=second_name,\n )\n )\n\n return solid_defs\n\n def solid_def_named(self, name):\n """Get the solid with the given name in the repository.\n\n Args:\n name (str): The name of the solid for which to retrieve the solid definition.\n\n Returns:\n SolidDefinition: The solid with the given name.\n """\n check.str_param(name, "name")\n\n if not self.has_solid(name):\n check.failed("could not find solid_def for solid {name}".format(name=name))\n\n return self._all_solids[name]\n\n\n[docs]class RepositoryDefinition:\n """Define a repository that contains a collection of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n """\n\n def __init__(\n self, name, repository_data, description=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data = check.inst_param(repository_data, "repository_data", RepositoryData)\n\n @property\n def name(self):\n return self._name\n\n @property\n def description(self):\n return self._description\n\n @property\n def pipeline_names(self):\n """List[str]: Names of all pipelines in the repository"""\n return self._repository_data.get_pipeline_names()\n\n[docs] def has_pipeline(self, name):\n """Check if a pipeline with a given name is present in the repository.\n\n Args:\n name (str): The name of the pipeline.\n\n Returns:\n bool\n """\n return self._repository_data.has_pipeline(name)\n\n[docs] def get_pipeline(self, name):\n """Get a pipeline by name.\n\n If this pipeline is present in the lazily evaluated ``pipeline_dict`` passed to the\n constructor, but has not yet been constructed, only this pipeline is constructed, and will\n be cached for future calls.\n\n Args:\n name (str): Name of the pipeline to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline definition corresponding to the given name.\n """\n return self._repository_data.get_pipeline(name)\n\n[docs] def get_all_pipelines(self):\n """Return all pipelines in the repository as a list.\n\n Note that this will construct any pipeline in the lazily evaluated ``pipeline_dict`` that\n has not yet been constructed.\n\n Returns:\n List[PipelineDefinition]: All pipelines in the repository.\n """\n return self._repository_data.get_all_pipelines()\n\n[docs] def get_all_solid_defs(self):\n """Get all the solid definitions in a repository.\n\n Returns:\n List[SolidDefinition]: All solid definitions in the repository.\n """\n return self._repository_data.get_all_solid_defs()\n\n[docs] def solid_def_named(self, name):\n """Get the solid with the given name in the repository.\n\n Args:\n name (str): The name of the solid for which to retrieve the solid definition.\n\n Returns:\n SolidDefinition: The solid with the given name.\n """\n 
check.str_param(name, "name")\n return self._repository_data.solid_def_named(name)\n\n @property\n def partition_set_defs(self):\n return self._repository_data.get_all_partition_sets()\n\n def get_partition_set_def(self, name):\n return self._repository_data.get_partition_set(name)\n\n @property\n def schedule_defs(self):\n return self._repository_data.get_all_schedules()\n\n def get_schedule_def(self, name):\n return self._repository_data.get_schedule(name)\n\n def has_schedule_def(self, name):\n return self._repository_data.has_schedule(name)\n\n @property\n def sensor_defs(self):\n return self._repository_data.get_all_sensors()\n\n def get_sensor_def(self, name):\n return self._repository_data.get_sensor(name)\n\n def has_sensor_def(self, name):\n return self._repository_data.has_sensor(name)\n\n @property\n def job_defs(self):\n return self._repository_data.get_all_jobs()\n\n def get_job_def(self, name):\n return self._repository_data.get_job(name)\n\n def has_job_def(self, name):\n return self._repository_data.has_job(name)\n
\nfrom collections import namedtuple\nfrom functools import update_wrapper\n\nfrom dagster import check, seven\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.configurable import ConfigurableDefinition\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterUnknownResourceError\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..decorator_utils import split_function_parameters, validate_decorated_fn_positionals\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\n\n[docs]class ResourceDefinition(ConfigurableDefinition):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n solids during pipeline execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n pipeline execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to solid executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.resource_config`.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys: (Optional[Set[str]]) Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n ):\n EXPECTED_POSITIONALS = ["*"]\n fn_positionals, _ = split_function_parameters(resource_fn, EXPECTED_POSITIONALS)\n missing_positional = validate_decorated_fn_positionals(fn_positionals, EXPECTED_POSITIONALS)\n\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "@resource '{resource_name}' decorated function does not have required "\n "positional parameter '{missing_param}'. 
Resource functions should only have keyword "\n "arguments that match input names and a first positional parameter.".format(\n resource_name=resource_fn.__name__, missing_param=missing_positional\n )\n )\n\n self._resource_fn = check.opt_callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n if version:\n experimental_arg_warning("version", "ResourceDefinition.__init__")\n\n @property\n def resource_fn(self):\n return self._resource_fn\n\n @property\n def config_schema(self):\n return self._config_schema\n\n @property\n def description(self):\n return self._description\n\n @property\n def version(self):\n return self._version\n\n @property\n def required_resource_keys(self):\n return self._required_resource_keys\n\n[docs] @staticmethod\n def none_resource(description=None):\n """A helper function that returns a none resource.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)\n\n[docs] @staticmethod\n def hardcoded_resource(value, description=None):\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): A hardcoded object which helps mock the resource.\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A hardcoded resource.\n """\n return ResourceDefinition(resource_fn=lambda _init_context: value, description=description)\n\n[docs] @staticmethod\n def mock_resource(description=None):\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description ([Optional[str]]): The description of the resource. 
Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n return ResourceDefinition.hardcoded_resource(\n value=seven.mock.MagicMock(), description=description\n )\n\n @staticmethod\n def string_resource(description=None):\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )\n\n def copy_for_configured(self, name, description, config_schema, _):\n check.invariant(name is None, "ResourceDefintions do not have names")\n return ResourceDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n )\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self, config_schema=None, description=None, required_resource_keys=None, version=None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n resource_def = ResourceDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(resource_def, wrapped=fn)\n\n return resource_def\n\n\n[docs]def resource(config_schema=None, description=None, required_resource_keys=None, version=None):\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n\n # This case is for when decorator is used bare, without arguments.\n # E.g. @resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn):\n return _ResourceDecoratorCallable(\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap\n\n\nclass ScopedResourcesBuilder(namedtuple("ScopedResourcesBuilder", "resource_instance_dict")):\n """There are concepts in the codebase (e.g. 
solids, system storage) that receive\n only the resources that they have specified in required_resource_keys.\n ScopedResourcesBuilder is responsible for dynamically building a class with\n only those required resources and returning an instance of that class."""\n\n def __new__(cls, resource_instance_dict=None):\n return super(ScopedResourcesBuilder, cls).__new__(\n cls,\n resource_instance_dict=check.opt_dict_param(\n resource_instance_dict, "resource_instance_dict", key_type=str\n ),\n )\n\n def build(self, required_resource_keys):\n\n """We dynamically create a type that has the resource keys as properties, to enable dotting into\n the resources from a context.\n\n For example, given:\n\n resources = {'foo': <some resource>, 'bar': <some other resource>}\n\n then this will create the type Resource(namedtuple('foo bar'))\n\n and then binds the specified resources into an instance of this object, which can be consumed\n as, e.g., context.resources.foo.\n """\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n # it is possible that the surrounding context does NOT have the required resource keys\n # because we are building a context for steps that we are not going to execute (e.g. in the\n # resume/retry case, in order to generate copy intermediates events)\n resource_instance_dict = {\n key: self.resource_instance_dict[key]\n for key in required_resource_keys\n if key in self.resource_instance_dict\n }\n\n class ScopedResources(namedtuple("Resources", list(resource_instance_dict.keys()))):\n def __getattr__(self, attr):\n raise DagsterUnknownResourceError(attr)\n\n return ScopedResources(**resource_instance_dict)\n
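# Illustrative sketch, not part of the generated dump above: how the @resource decorator and
# required_resource_keys are typically wired together. The names database_resource, query_solid,
# and my_pipeline are hypothetical; this assumes the public dagster API of this release
# (ModeDefinition, @solid, @pipeline).
from dagster import ModeDefinition, pipeline, resource, solid


@resource(config_schema={"conn_string": str})
def database_resource(init_context):
    # resource_fn receives an InitResourceContext; config is available at
    # init_context.resource_config.
    return init_context.resource_config["conn_string"]


@solid(required_resource_keys={"database"})
def query_solid(context):
    # Only resources named in required_resource_keys are bound onto context.resources,
    # per ScopedResourcesBuilder above.
    return context.resources.database


@pipeline(mode_defs=[ModeDefinition(resource_defs={"database": database_resource})])
def my_pipeline():
    query_solid()

# Run config would then supply {"resources": {"database": {"config": {"conn_string": "..."}}}}.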
\nfrom datetime import datetime\n\nimport pendulum\nfrom dagster import check\nfrom dagster.core.errors import (\n DagsterInvalidDefinitionError,\n ScheduleExecutionError,\n user_code_error_boundary,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import check_tags\nfrom dagster.utils import ensure_gen, merge_dicts\n\nfrom .job import JobContext, JobDefinition, JobType, RunRequest, SkipReason\nfrom .mode import DEFAULT_MODE_NAME\nfrom .utils import check_valid_name\n\n\n[docs]class ScheduleExecutionContext(JobContext):\n """Schedule-specific execution context.\n\n An instance of this class is made available as the first argument to various ScheduleDefinition\n functions. It is passed as the first argument to ``run_config_fn``, ``tags_fn``,\n and ``should_execute``.\n\n Attributes:\n instance (DagsterInstance): The instance configured to run the schedule\n scheduled_execution_time (datetime):\n The time in which the execution was scheduled to happen. May differ slightly\n from both the actual execution time and the time at which the run config is computed.\n Not available in all schedulers - currently only set in deployments using\n DagsterDaemonScheduler.\n """\n\n __slots__ = ["_scheduled_execution_time"]\n\n def __init__(self, instance, scheduled_execution_time):\n super(ScheduleExecutionContext, self).__init__(\n check.inst_param(instance, "instance", DagsterInstance)\n )\n self._scheduled_execution_time = check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n )\n\n @property\n def scheduled_execution_time(self):\n return self._scheduled_execution_time\n\n\n[docs]class ScheduleDefinition(JobDefinition):\n """Define a schedule that targets a pipeline\n\n Args:\n name (str): The name of the schedule to create.\n cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,\n '45 23 * * 6' for a schedule that runs at 11:45 PM every Saturday.\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n execution_fn (Callable[ScheduleExecutionContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleExecutionContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Dict]): The environment config that parameterizes this execution,\n as a dict.\n run_config_fn (Callable[[ScheduleExecutionContext], [Dict]]): A function that takes a\n ScheduleExecutionContext object and returns the environment configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``\n and ``run_config_fn``.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleExecutionContext], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleExecutionContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this schedule. 
(default: 'default')\n should_execute (Optional[Callable[[ScheduleExecutionContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[dict[str, str]]): The environment variables to set for the\n schedule\n execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works\n with DagsterDaemonScheduler, and must be set when using that scheduler.\n """\n\n __slots__ = [\n "_cron_schedule",\n "_environment_vars",\n "_execution_fn",\n "_execution_timezone",\n ]\n\n def __init__(\n self,\n name,\n cron_schedule,\n pipeline_name,\n run_config=None,\n run_config_fn=None,\n tags=None,\n tags_fn=None,\n solid_selection=None,\n mode="default",\n should_execute=None,\n environment_vars=None,\n execution_timezone=None,\n execution_fn=None,\n ):\n\n super(ScheduleDefinition, self).__init__(\n check_valid_name(name),\n job_type=JobType.SCHEDULE,\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n mode=check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME),\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n )\n\n self._cron_schedule = check.str_param(cron_schedule, "cron_schedule")\n self._environment_vars = check.opt_dict_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n else:\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n run_config_fn = check.opt_callable_param(\n run_config_fn,\n "run_config_fn",\n default=lambda _context: check.opt_dict_param(run_config, "run_config"),\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n elif tags:\n check_tags(tags, "tags")\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(tags_fn, "tags_fn", default=lambda _context: {})\n\n should_execute = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n def _execution_fn(context):\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for schedule {name}",\n ):\n if not should_execute(context):\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of run_config_fn for schedule {name}",\n ):\n evaluated_run_config = run_config_fn(context)\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = tags_fn(context)\n\n yield RunRequest(\n run_key=None, run_config=evaluated_run_config, tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.timezone(self._execution_timezone)\n except ValueError:\n raise DagsterInvalidDefinitionError(\n "Invalid execution timezone {timezone} for {schedule_name}".format(\n schedule_name=name, timezone=self._execution_timezone\n )\n )\n\n @property\n def cron_schedule(self):\n return self._cron_schedule\n\n @property\n def environment_vars(self):\n return self._environment_vars\n\n @property\n def execution_timezone(self):\n return self._execution_timezone\n\n def get_execution_data(self, context):\n check.inst_param(context, "context", ScheduleExecutionContext)\n result = list(ensure_gen(self._execution_fn(context)))\n\n if not result:\n return []\n\n if len(result) == 1:\n check.is_list(result, of_type=(RunRequest, SkipReason))\n data = result[0]\n\n if isinstance(data, SkipReason):\n return result\n check.inst(data, RunRequest)\n return [\n RunRequest(\n run_key=data.run_key,\n run_config=data.run_config,\n tags=merge_dicts(data.tags, PipelineRun.tags_for_schedule(self)),\n )\n ]\n\n check.is_list(result, of_type=RunRequest)\n\n check.invariant(\n not any(not data.run_key for data in result),\n "Schedules that return multiple RunRequests must specify a run_key in each RunRequest",\n )\n\n # clone all the run requests with the required schedule tags\n return [\n RunRequest(\n run_key=data.run_key,\n run_config=data.run_config,\n tags=merge_dicts(data.tags, PipelineRun.tags_for_schedule(self)),\n )\n for data in result\n ]\n
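# Illustrative sketch, not part of the generated dump above: a ScheduleDefinition using the
# run_config_fn / tags style described in the docstring. The pipeline name and config keys are
# hypothetical.
from dagster import ScheduleDefinition


def _run_config_fn(context):
    # context is a ScheduleExecutionContext; scheduled_execution_time may be None on schedulers
    # other than DagsterDaemonScheduler.
    return {"solids": {"process_data": {"config": {"date": str(context.scheduled_execution_time)}}}}


nightly_schedule = ScheduleDefinition(
    name="nightly",
    cron_schedule="0 23 * * *",
    pipeline_name="my_pipeline",
    run_config_fn=_run_config_fn,
    tags={"team": "data"},
    execution_timezone="US/Central",
)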
\nimport inspect\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.utils import ensure_gen\n\nfrom .job import JobContext, JobDefinition, JobType, RunRequest, SkipReason\n\n\n[docs]class SensorExecutionContext(JobContext):\n """Sensor execution context.\n\n An instance of this class is made available as the first argument to the evaluation function\n on SensorDefinition.\n\n Attributes:\n instance (DagsterInstance): The instance configured to run the schedule\n last_completion_time (float): The last time that the sensor was evaluated (UTC).\n last_run_key (str): The run key of the RunRequest most recently created by this sensor.\n """\n\n __slots__ = ["_last_completion_time", "_last_run_key"]\n\n def __init__(self, instance, last_completion_time, last_run_key):\n super(SensorExecutionContext, self).__init__(\n check.inst_param(instance, "instance", DagsterInstance),\n )\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n\n @property\n def last_completion_time(self):\n return self._last_completion_time\n\n @property\n def last_run_key(self):\n return self._last_run_key\n\n\n[docs]class SensorDefinition(JobDefinition):\n """Define a sensor that initiates a set of job runs\n\n Args:\n name (str): The name of the sensor to create.\n pipeline_name (str): The name of the pipeline to execute when the sensor fires.\n evaluation_fn (Callable[[SensorExecutionContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorExecutionContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the sensor runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this sensor. (default: 'default')\n """\n\n __slots__ = [\n "_evaluation_fn",\n ]\n\n def __init__(\n self, name, pipeline_name, evaluation_fn, solid_selection=None, mode=None,\n ):\n super(SensorDefinition, self).__init__(\n name,\n job_type=JobType.SENSOR,\n pipeline_name=pipeline_name,\n mode=mode,\n solid_selection=solid_selection,\n )\n self._evaluation_fn = check.callable_param(evaluation_fn, "evaluation_fn")\n\n def get_execution_data(self, context):\n check.inst_param(context, "context", SensorExecutionContext)\n result = list(ensure_gen(self._evaluation_fn(context)))\n\n if not result or result == [None]:\n return []\n\n if len(result) == 1:\n return check.is_list(result, of_type=(RunRequest, SkipReason))\n\n return check.is_list(result, of_type=RunRequest)\n\n\ndef wrap_sensor_evaluation(sensor_name, result):\n if inspect.isgenerator(result):\n for item in result:\n yield item\n\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{result} of type {type(result)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n
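# Illustrative sketch, not part of the generated dump above: a SensorDefinition whose evaluation_fn
# yields RunRequest or SkipReason, matching the contract enforced by get_execution_data. Assumes
# SensorDefinition, RunRequest, and SkipReason are exported from the top-level dagster package in
# this version; the file path and pipeline name are hypothetical.
import os

from dagster import RunRequest, SensorDefinition, SkipReason


def _evaluate(context):
    # context is a SensorExecutionContext; last_run_key can be used to de-duplicate requests.
    path = "/tmp/new_data.csv"
    if not os.path.exists(path):
        yield SkipReason(f"{path} does not exist")
        return
    yield RunRequest(
        run_key=path,
        run_config={"solids": {"process_file": {"config": {"path": path}}}},
    )


my_file_sensor = SensorDefinition(
    name="my_file_sensor",
    pipeline_name="my_pipeline",
    evaluation_fn=_evaluate,
)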
\nimport warnings\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .config import ConfigMapping\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\nfrom .graph import GraphDefinition\nfrom .i_solid_definition import NodeDefinition\nfrom .input import InputDefinition\nfrom .output import OutputDefinition\n\n\n[docs]class SolidDefinition(NodeDefinition):\n """\n The definition of a Solid that performs a user-defined computation.\n\n For more details on what a solid is, refer to the\n `Solid Guide <../../learn/guides/solid/solid>`_ .\n\n End users should prefer the :func:`@solid <solid>` and :func:`@lambda_solid <lambda_solid>`\n decorators. SolidDefinition is generally intended to be used by framework authors.\n\n Args:\n name (str): Name of the solid. Must be unique within any :py:class:`PipelineDefinition`\n using the solid.\n input_defs (List[InputDefinition]): Inputs of the solid.\n compute_fn (Callable): The core of the solid, the function that does the actual\n computation. The signature of this function is determined by ``input_defs``, with\n an additional injected first argument, ``context``, a collection of information provided\n by the system.\n\n This function must return a generator, which must yield one :py:class:`Output` for each\n of the solid's ``output_defs``, and additionally may yield other types of Dagster\n events, including :py:class:`Materialization` and :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the solid.\n config_schema (Optional[ConfigSchema): The schema for the config. Configuration data\n available in `init_context.solid_config`.\n description (Optional[str]): Human-readable description of the solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this\n solid.\n positional_inputs (Optional[List[str]]): The positional order of the input names if it\n differs from the order of the input definitions.\n version (Optional[str]): (Experimental) The version of the solid's compute_fn. Two solids should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n\n\n Examples:\n .. 
code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n SolidDefinition(\n name="add_one",\n input_defs=[InputDefinition("num", Int)],\n output_defs=[OutputDefinition(Int)], # default name ("result")\n compute_fn=_add_one,\n )\n """\n\n def __init__(\n self,\n name,\n input_defs,\n compute_fn,\n output_defs,\n config_schema=None,\n description=None,\n tags=None,\n required_resource_keys=None,\n positional_inputs=None,\n version=None,\n ):\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._version = check.opt_str_param(version, "version")\n if version:\n experimental_arg_warning("version", "SolidDefinition.__init__")\n\n super(SolidDefinition, self).__init__(\n name=name,\n input_defs=check.list_param(input_defs, "input_defs", InputDefinition),\n output_defs=check.list_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n @property\n def compute_fn(self):\n return self._compute_fn\n\n @property\n def config_schema(self):\n return self._config_schema\n\n @property\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n @property\n def has_config_entry(self):\n warnings.warn(\n "SolidDefinition.has_config_entry is deprecated, starting in 0.10.0, because whether "\n "the solid has configurable inputs or outputs depends on what managers are supplied "\n "for its inputs and outputs."\n )\n return self._config_schema or self.has_configurable_inputs or self.has_configurable_outputs\n\n @property\n def version(self):\n return self._version\n\n def all_dagster_types(self):\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self):\n yield self\n\n def resolve_output_to_origin(self, output_name, handle):\n return self.output_def_named(output_name), handle\n\n def input_has_default(self, input_name):\n return self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name):\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name):\n return True\n\n def copy_for_configured(self, name, description, config_schema, config_or_config_fn):\n return SolidDefinition(\n name=self._name_for_configured_node(self.name, name, config_or_config_fn),\n input_defs=self.input_defs,\n compute_fn=self.compute_fn,\n output_defs=self.output_defs,\n config_schema=config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n positional_inputs=self.positional_inputs,\n version=self.version,\n )\n\n\n[docs]class CompositeSolidDefinition(GraphDefinition):\n """The core unit of composition and abstraction, composite solids allow you to\n define a solid from a graph of solids.\n\n In the same way you would refactor a block of code in to a function to deduplicate, organize,\n or manage complexity - you can refactor solids in a pipeline in to a composite solid.\n\n Args:\n name (str): The name of this composite solid. 
Must be unique within any\n :py:class:`PipelineDefinition` using the solid.\n solid_defs (List[Union[SolidDefinition, CompositeSolidDefinition]]): The set of solid\n definitions used in this composite solid. Composites may be arbitrarily nested.\n input_mappings (Optional[List[InputMapping]]): Define the inputs to the composite solid,\n and how they map to the inputs of its constituent solids.\n output_mappings (Optional[List[OutputMapping]]): Define the outputs of the composite solid,\n and how they map from the outputs of its constituent solids.\n config_mapping (Optional[ConfigMapping]): By specifying a config mapping, you can override\n the configuration for the child solids contained within this composite solid. Config\n mappings require both a configuration field to be specified, which is exposed as the\n configuration for the composite solid, and a configuration mapping function, which\n is called to map the configuration of the composite solid into the configuration that\n is applied to any child solids.\n dependencies (Optional[Dict[Union[str, SolidInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares where each solid gets its inputs. The keys at the top\n level dict are either string names of solids or SolidInvocations. The values\n are dicts that map input names to DependencyDefinitions.\n description (Optional[str]): Human readable description of this composite solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n may expect and require certain metadata to be attached to a solid.\n positional_inputs (Optional[List[str]]): The positional order of the inputs if it\n differs from the order of the input mappings\n\n Examples:\n\n .. code-block:: python\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n add_two = CompositeSolidDefinition(\n 'add_two',\n solid_defs=[add_one],\n dependencies={\n SolidInvocation('add_one', 'adder_1'): {},\n SolidInvocation('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},\n },\n input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],\n output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],\n )\n """\n\n def __init__(\n self,\n name,\n solid_defs,\n input_mappings=None,\n output_mappings=None,\n config_mapping=None,\n dependencies=None,\n description=None,\n tags=None,\n positional_inputs=None,\n ):\n super(CompositeSolidDefinition, self).__init__(\n name=name,\n description=description,\n node_defs=solid_defs,\n dependencies=dependencies,\n tags=tags,\n positional_inputs=positional_inputs,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config_mapping=config_mapping,\n )\n\n def all_dagster_types(self):\n yield from self.all_input_output_types()\n\n for node_def in self._node_defs:\n yield from node_def.all_dagster_types()\n\n def copy_for_configured(self, name, description, config_schema, config_or_config_fn):\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only composite solids utilizing config mapping can be pre-configured. 
The solid "\n '"{graph_name}" does not have a config mapping, and thus has nothing to be '\n "configured.".format(graph_name=self.name)\n )\n\n return CompositeSolidDefinition(\n name=self._name_for_configured_node(self.name, name, config_or_config_fn),\n solid_defs=self._node_defs,\n input_mappings=self.input_mappings,\n output_mappings=self.output_mappings,\n config_mapping=ConfigMapping(\n self._config_mapping.config_fn, config_schema=config_schema,\n ),\n dependencies=self.dependencies,\n description=description or self.description,\n tags=self.tags,\n positional_inputs=self.positional_inputs,\n )\n
\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster.core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nimport traceback\nfrom contextlib import contextmanager\n\nfrom dagster import check\nfrom dagster.utils.interrupts import raise_interrupts_as\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n """\n Pipeline execution was interrupted during the execution process.\n\n Just like KeyboardInterrupt this inherits from BaseException\n as to not be accidentally caught by code that catches Exception\n and thus prevent the interpreter from exiting.\n """\n\n\n[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions."""\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False\n\n\n[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""\n\n\nclass DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a pipeline is invalid because either:\n - One or more solids in the specified subset do not exist on the pipeline.'\n - The subset produces an invalid pipeline.\n """\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, List, Optional, Selector, Shape, Permissive\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. 
A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}" + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=" Reason: {reason}.".format(reason=reason) if reason else "",\n ),\n **kwargs,\n )\n\n\n[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime."""\n\n\n[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)\n\n\n[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)\n\n\n[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)\n\n\n@contextmanager\ndef raise_execution_interrupts():\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n[docs]@contextmanager\ndef user_code_error_boundary(error_cls, msg_fn, control_flow_exceptions=None, **kwargs):\n """\n Wraps the execution of user-space code in an error boundary. This places a uniform\n policy around an user code invoked by the framework. This ensures that all user\n errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n and that the original stack trace of the user error is preserved, so that it\n can be reported without confusing framework code in the stack trace, if a\n tool author wishes to do so.\n\n Examples:\n\n .. 
code-block:: python\n\n with user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n "Error occurred during step execution"\n ):\n call_user_provided_function()\n\n """\n check.callable_param(msg_fn, "msg_fn")\n check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)\n control_flow_exceptions = tuple(\n check.opt_list_param(control_flow_exceptions, "control_flow_exceptions")\n )\n\n with raise_execution_interrupts():\n try:\n yield\n except control_flow_exceptions as cf:\n # A control flow exception has occurred and should be propagated\n raise cf\n except DagsterError as de:\n # The system has thrown an error that is part of the user-framework contract\n raise de\n except Exception as e: # pylint: disable=W0703\n # An exception has been thrown by user code and computation should cease\n # with the error reported further up the stack\n raise error_cls(\n msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n ) from e\n\n\n[docs]class DagsterUserCodeExecutionError(DagsterError):\n """\n This is the base class for any exception that is meant to wrap an\n :py:class:`~python:Exception` thrown by user code. It wraps that existing user code.\n The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n Users should not subclass this base class for their own exceptions and should instead throw\n freely from user code. User exceptions will be automatically wrapped and rethrown.\n """\n\n def __init__(self, *args, **kwargs):\n # original_exc_info should be gotten from a sys.exc_info() call at the\n # callsite inside of the exception handler. this will allow consuming\n # code to *re-raise* the user error in it's original format\n # for cleaner error reporting that does not have framework code in it\n user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n check.invariant(original_exc_info[0] is not None)\n\n super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n self.original_exc_info = original_exc_info\n\n @property\n def is_user_code_error(self):\n return True\n\n\n[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n """Indicates an error in the solid type system at runtime. E.g. 
a solid receives an\n unexpected input, or produces an output that does not match the type of the output definition.\n """\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.solid_name = check.str_param(kwargs.pop("solid_name"), "solid_name")\n self.solid_def_name = check.str_param(kwargs.pop("solid_def_name"), "solid_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)\n\n\n[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """\n Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """\n\n\n[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.CompositeSolidDefinition` during config parsing.\n """\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of an type load\n function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n """\n\n\nclass DagsterTypeMaterializationError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of an output\n materialization function defined in a :py:class:`~dagster.DagsterTypeMaterializer` during\n materialization of a custom type.\n """\n\n\n[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n # object hasattr method\n """ Indicates that an unknown resource was accessed in the body of an execution step. May often\n happen by accessing a resource in the compute function of a solid without first supplying the\n solid with the correct `required_resource_keys` argument.\n """\n\n def __init__(self, resource_name, *args, **kwargs):\n self.resource_name = check.str_param(resource_name, "resource_name")\n msg = (\n "Unknown resource `{resource_name}`. 
Specify `{resource_name}` as a required resource "\n "on the compute / config function that accessed it."\n ).format(resource_name=resource_name)\n super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)\n\n\n[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema)."""\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster.config.errors import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += "\\n Error {i_error}: {error_message}".format(\n i_error=i_error + 1, error_message=error.message\n )\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)\n\n\n[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster.core.instance.DagsterInstance` or system storage configuration.\n """\n\n\n[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with."""\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterScheduleWipeRequired(DagsterError):\n """Indicates that the user must wipe their stored schedule state."""\n\n\nclass DagsterInstanceMigrationRequired(DagsterError):\n """Indicates that the dagster instance must be migrated."""\n\n def __init__(self, msg=None, 
db_revision=None, head_revision=None, original_exc_info=None):\n super(DagsterInstanceMigrationRequired, self).__init__(\n "Instance is out of date and must be migrated{additional_msg}."\n "{revision_clause} Please run `dagster instance migrate`.{original_exception_clause}".format(\n additional_msg=" ({msg})".format(msg=msg) if msg else "",\n revision_clause=(\n " Database is at revision {db_revision}, head is "\n "{head_revision}.".format(db_revision=db_revision, head_revision=head_revision)\n if db_revision or head_revision\n else ""\n ),\n original_exception_clause=(\n "\\n\\nOriginal exception:\\n\\n{original_exception}".format(\n original_exception="".join(traceback.format_exception(*original_exc_info))\n )\n if original_exc_info\n else ""\n ),\n )\n )\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id"""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous pipeline and\n solid execution APIs (:py:func:`~dagster.execute_pipeline`, :py:func:`~dagster.execute_solid`,\n etc.), that is, typically in test, and a :py:class:`~dagster.DagsterType`'s type check fails\n by returning either ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success``\n member is ``False``.\n """\n\n def __init__(self, description=None, metadata_entries=None, dagster_type=None):\n from dagster import EventMetadataEntry, DagsterType\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=EventMetadataEntry\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n\n\n[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n "Event logs invalid for run id {}".format(run_id)\n )\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """ Error raised by invalid asset key """\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """ Error raised during the execution of a user-defined hook. """\n\n\nclass DagsterImportError(DagsterError):\n """ Import error raised while importing user-code. 
"""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When pipeline execution complete with steps in an unknown state"""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. May often happen by accessing a property\n that no longer exists after breaking changes."""\n
\n"""Structured representations of system events."""\nimport logging\nimport os\nfrom collections import namedtuple\nfrom enum import Enum\n\nfrom dagster import check\nfrom dagster.core.definitions import (\n AssetMaterialization,\n EventMetadataEntry,\n ExpectationResult,\n Materialization,\n SolidHandle,\n)\nfrom dagster.core.definitions.events import ObjectStoreOperationType\nfrom dagster.core.execution.context.system import (\n HookContext,\n SystemExecutionContext,\n SystemStepExecutionContext,\n)\nfrom dagster.core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster.core.execution.plan.outputs import StepOutputData\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.serdes import register_serdes_tuple_fallbacks, whitelist_for_serdes\nfrom dagster.utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster.utils.timing import format_duration\n\n\n[docs]class DagsterEventType(Enum):\n """The types of events that may be yielded by solid and pipeline execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n STEP_MATERIALIZATION = "STEP_MATERIALIZATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n\n PIPELINE_INIT_FAILURE = "PIPELINE_INIT_FAILURE"\n\n PIPELINE_ENQUEUED = "PIPELINE_ENQUEUED"\n PIPELINE_DEQUEUED = "PIPELINE_DEQUEUED"\n PIPELINE_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n\n PIPELINE_START = "PIPELINE_START" # Execution has started\n PIPELINE_SUCCESS = "PIPELINE_SUCCESS"\n PIPELINE_FAILURE = "PIPELINE_FAILURE"\n\n PIPELINE_CANCELING = "PIPELINE_CANCELING"\n PIPELINE_CANCELED = "PIPELINE_CANCELED"\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.STEP_MATERIALIZATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.PIPELINE_INIT_FAILURE,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.PIPELINE_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.PIPELINE_ENQUEUED,\n DagsterEventType.PIPELINE_DEQUEUED,\n DagsterEventType.PIPELINE_STARTING,\n DagsterEventType.PIPELINE_START,\n DagsterEventType.PIPELINE_SUCCESS,\n DagsterEventType.PIPELINE_INIT_FAILURE,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_CANCELING,\n DagsterEventType.PIPELINE_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\n\ndef _assert_type(method, expected_type, actual_type):\n check.invariant(\n expected_type == actual_type,\n (\n "{method} only callable when event_type is 
{expected_type}, called on {actual_type}"\n ).format(method=method, expected_type=expected_type, actual_type=actual_type),\n )\n\n\ndef _validate_event_specific_data(event_type, event_specific_data):\n from dagster.core.execution.plan.objects import StepFailureData, StepSuccessData\n from dagster.core.execution.plan.inputs import StepInputData\n\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.STEP_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, "event_specific_data", StepExpectationResultData)\n elif event_type == DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepInputData)\n elif event_type == DagsterEventType.ENGINE_EVENT:\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n\n return event_specific_data\n\n\ndef log_step_event(step_context, event):\n check.inst_param(step_context, "step_context", SystemStepExecutionContext)\n check.inst_param(event, "event", DagsterEvent)\n\n event_type = DagsterEventType(event.event_type_value)\n log_fn = step_context.log.error if event_type in FAILURE_EVENTS else step_context.log.debug\n\n log_fn(\n event.message\n or "{event_type} for step {step_key}".format(\n event_type=event_type, step_key=step_context.step.key\n ),\n dagster_event=event,\n pipeline_name=step_context.pipeline_name,\n )\n\n\ndef log_pipeline_event(pipeline_context, event, step_key):\n event_type = DagsterEventType(event.event_type_value)\n\n log_fn = (\n pipeline_context.log.error if event_type in FAILURE_EVENTS else pipeline_context.log.debug\n )\n\n log_fn(\n event.message\n or "{event_type} for pipeline {pipeline_name}".format(\n event_type=event_type, pipeline_name=pipeline_context.pipeline_name\n ),\n dagster_event=event,\n pipeline_name=pipeline_context.pipeline_name,\n step_key=step_key,\n )\n\n\ndef log_resource_event(log_manager, pipeline_name, event):\n check.inst_param(log_manager, "log_manager", DagsterLogManager)\n check.inst_param(event, "event", DagsterEvent)\n check.inst(event.event_specific_data, EngineEventData)\n\n log_fn = log_manager.error if event.event_specific_data.error else log_manager.debug\n log_fn(event.message, dagster_event=event, pipeline_name=pipeline_name, step_key=event.step_key)\n\n\n[docs]@whitelist_for_serdes\nclass DagsterEvent(\n namedtuple(\n "_DagsterEvent",\n "event_type_value pipeline_name step_handle solid_handle step_kind_value "\n "logging_tags event_specific_data message pid step_key",\n )\n):\n """Events yielded by solid and pipeline execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n pipeline_name (str)\n step_key (str)\n solid_handle (SolidHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid 
(int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(event_type, step_context, event_specific_data=None, message=None):\n\n check.inst_param(step_context, "step_context", SystemStepExecutionContext)\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_pipeline(\n event_type, pipeline_context, message=None, event_specific_data=None, step_handle=None\n ):\n check.inst_param(pipeline_context, "pipeline_context", SystemExecutionContext)\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n pipeline_name = pipeline_context.pipeline_name\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n step_key = step_handle.to_key() if step_handle else None\n log_pipeline_event(pipeline_context, event, step_key)\n\n return event\n\n @staticmethod\n def from_resource(execution_plan, log_manager, message=None, event_specific_data=None):\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n pipeline_name = execution_plan.pipeline_def.name\n event = DagsterEvent(\n DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=pipeline_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, pipeline_name, event)\n return event\n\n def __new__(\n cls,\n event_type_value,\n pipeline_name,\n step_handle=None,\n solid_handle=None,\n step_kind_value=None,\n logging_tags=None,\n event_specific_data=None,\n message=None,\n pid=None,\n # legacy\n step_key=None,\n ):\n event_type_value, event_specific_data = _handle_back_compat(\n event_type_value, event_specific_data\n )\n\n # old events may contain solid_handle but not step_handle\n if solid_handle is not None and step_handle is None:\n step_handle = StepHandle(solid_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(pipeline_name, "pipeline_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(solid_handle, "solid_handle", SolidHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_dict_param(logging_tags, "logging_tags"),\n 
_validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def solid_name(self):\n return self.solid_handle.name\n\n @property\n def event_type(self):\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @property\n def is_step_event(self):\n return self.event_type in STEP_EVENTS\n\n @property\n def is_hook_event(self):\n return self.event_type in HOOK_EVENTS\n\n @property\n def step_kind(self):\n from dagster.core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @property\n def is_step_success(self):\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @property\n def is_successful_output(self):\n return self.event_type == DagsterEventType.STEP_OUTPUT\n\n @property\n def is_step_start(self):\n return self.event_type == DagsterEventType.STEP_START\n\n @property\n def is_step_failure(self):\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @property\n def is_step_skipped(self):\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @property\n def is_step_up_for_retry(self):\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @property\n def is_step_restarted(self):\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_pipeline_success(self):\n return self.event_type == DagsterEventType.PIPELINE_SUCCESS\n\n @property\n def is_pipeline_failure(self):\n return self.event_type == DagsterEventType.PIPELINE_FAILURE\n\n @property\n def is_pipeline_init_failure(self):\n return self.event_type == DagsterEventType.PIPELINE_INIT_FAILURE\n\n @property\n def is_failure(self):\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_pipeline_event(self):\n return self.event_type in PIPELINE_EVENTS\n\n @property\n def is_engine_event(self):\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @property\n def is_handled_output(self):\n return self.event_type == DagsterEventType.HANDLED_OUTPUT\n\n @property\n def is_loaded_input(self):\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @property\n def is_step_materialization(self):\n return self.event_type == DagsterEventType.STEP_MATERIALIZATION\n\n @property\n def asset_key(self):\n if self.event_type != DagsterEventType.STEP_MATERIALIZATION:\n return None\n return self.step_materialization_data.materialization.asset_key\n\n @property\n def partition(self):\n if self.event_type != DagsterEventType.STEP_MATERIALIZATION:\n return None\n return self.step_materialization_data.materialization.partition\n\n @property\n def step_input_data(self):\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return self.event_specific_data\n\n @property\n def step_output_data(self):\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return self.event_specific_data\n\n @property\n def step_success_data(self):\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return self.event_specific_data\n\n @property\n def step_failure_data(self):\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return self.event_specific_data\n\n @property\n def step_retry_data(self):\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return self.event_specific_data\n\n @property\n 
def step_materialization_data(self):\n _assert_type(\n "step_materialization_data", DagsterEventType.STEP_MATERIALIZATION, self.event_type\n )\n return self.event_specific_data\n\n @property\n def step_expectation_result_data(self):\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return self.event_specific_data\n\n @property\n def pipeline_init_failure_data(self):\n _assert_type(\n "pipeline_init_failure_data", DagsterEventType.PIPELINE_INIT_FAILURE, self.event_type\n )\n return self.event_specific_data\n\n @property\n def pipeline_failure_data(self):\n _assert_type("pipeline_failure_data", DagsterEventType.PIPELINE_FAILURE, self.event_type)\n return self.event_specific_data\n\n @property\n def engine_event_data(self):\n _assert_type("engine_event_data", DagsterEventType.ENGINE_EVENT, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_completed_data(self):\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self):\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_skipped_data(self):\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @staticmethod\n def step_output_event(step_context, step_output_data):\n check.inst_param(step_output_data, "step_output_data", StepOutputData)\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message='Yielded output "{output_name}"{mapping_clause} of type "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=step_context.step.step_output_named(\n step_output_data.step_output_handle.output_name\n ).output_def.dagster_type.display_name,\n type_check_clause=(\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check).",\n mapping_clause=f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else "",\n ),\n )\n\n @staticmethod\n def step_failure_event(step_context, step_failure_data):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message='Execution of step "{step_key}" failed.'.format(step_key=step_context.step.key),\n )\n\n @staticmethod\n def step_retry_event(step_context, step_retry_data):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message='Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=" in {n} seconds".format(n=step_retry_data.seconds_to_wait)\n if step_retry_data.seconds_to_wait\n else "",\n ),\n )\n\n @staticmethod\n def step_input_event(step_context, step_input_data):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=step_context.step.step_input_named(\n step_input_data.input_name\n ).dagster_type.display_name,\n type_check_clause=(\n " Warning! Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check).",\n ),\n )\n\n @staticmethod\n def step_start_event(step_context):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message='Started execution of step "{step_key}".'.format(\n step_key=step_context.step.key\n ),\n )\n\n @staticmethod\n def step_restarted_event(step_context, previous_attempts):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(step_context, success):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key, duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context):\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message='Skipped execution of step "{step_key}".'.format(\n step_key=step_context.step.key\n ),\n )\n\n @staticmethod\n def step_materialization(step_context, materialization):\n check.inst_param(\n materialization, "materialization", (AssetMaterialization, Materialization)\n )\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization),\n message=materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=" 
{label}".format(label=materialization.label)\n if materialization.label\n else ""\n ),\n )\n\n @staticmethod\n def step_expectation_result(step_context, expectation_result):\n check.inst_param(expectation_result, "expectation_result", ExpectationResult)\n\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def pipeline_start(pipeline_context):\n return DagsterEvent.from_pipeline(\n DagsterEventType.PIPELINE_START,\n pipeline_context,\n message='Started execution of pipeline "{pipeline_name}".'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n )\n\n @staticmethod\n def pipeline_success(pipeline_context):\n return DagsterEvent.from_pipeline(\n DagsterEventType.PIPELINE_SUCCESS,\n pipeline_context,\n message='Finished execution of pipeline "{pipeline_name}".'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n )\n\n @staticmethod\n def pipeline_failure(pipeline_context, context_msg, error_info=None):\n\n return DagsterEvent.from_pipeline(\n DagsterEventType.PIPELINE_FAILURE,\n pipeline_context,\n message='Execution of pipeline "{pipeline_name}" failed. {context_msg}'.format(\n pipeline_name=pipeline_context.pipeline_name,\n context_msg=check.str_param(context_msg, "context_msg"),\n ),\n event_specific_data=PipelineFailureData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def pipeline_canceled(pipeline_context, error_info=None):\n return DagsterEvent.from_pipeline(\n DagsterEventType.PIPELINE_CANCELED,\n pipeline_context,\n message='Execution of pipeline "{pipeline_name}" canceled.'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n event_specific_data=PipelineCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def resource_init_start(execution_plan, log_manager, resource_keys):\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n return DagsterEvent.from_resource(\n execution_plan=check.inst_param(execution_plan, "execution_plan", ExecutionPlan),\n log_manager=check.inst_param(log_manager, "log_manager", DagsterLogManager),\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata_entries=[], marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(execution_plan, log_manager, resource_instances, resource_init_times):\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n metadata_entries = []\n for resource_key in resource_instances.keys():\n resource_obj = resource_instances[resource_key]\n resource_time = resource_init_times[resource_key]\n metadata_entries.append(\n EventMetadataEntry.python_artifact(\n resource_obj.__class__, resource_key, "Initialized in {}".format(resource_time)\n )\n )\n\n return DagsterEvent.from_resource(\n execution_plan=check.inst_param(execution_plan, "execution_plan", ExecutionPlan),\n log_manager=check.inst_param(log_manager, "log_manager", DagsterLogManager),\n message="Finished initialization of 
resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata_entries=metadata_entries, marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(execution_plan, log_manager, resource_keys, error):\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n return DagsterEvent.from_resource(\n execution_plan=check.inst_param(execution_plan, "execution_plan", ExecutionPlan),\n log_manager=check.inst_param(log_manager, "log_manager", DagsterLogManager),\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata_entries=[], marker_end="resources", error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(execution_plan, log_manager, resource_keys, error):\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n return DagsterEvent.from_resource(\n execution_plan=check.inst_param(execution_plan, "execution_plan", ExecutionPlan),\n log_manager=check.inst_param(log_manager, "log_manager", DagsterLogManager),\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata_entries=[], marker_start=None, marker_end=None, error=error,\n ),\n )\n\n @staticmethod\n def pipeline_init_failure(pipeline_name, failure_data, log_manager):\n check.inst_param(failure_data, "failure_data", PipelineInitFailureData)\n check.inst_param(log_manager, "log_manager", DagsterLogManager)\n # this failure happens trying to bring up context so can't use from_pipeline\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_INIT_FAILURE.value,\n pipeline_name=pipeline_name,\n event_specific_data=failure_data,\n message=(\n 'Pipeline failure during initialization of pipeline "{pipeline_name}". 
'\n "This may be due to a failure in initializing a resource or logger."\n ).format(pipeline_name=pipeline_name),\n pid=os.getpid(),\n )\n log_manager.error(\n event.message\n or "{event_type} for pipeline {pipeline_name}".format(\n event_type=DagsterEventType.PIPELINE_INIT_FAILURE, pipeline_name=pipeline_name\n ),\n dagster_event=event,\n pipeline_name=pipeline_name,\n )\n return event\n\n @staticmethod\n def engine_event(pipeline_context, message, event_specific_data=None, step_handle=None):\n return DagsterEvent.from_pipeline(\n DagsterEventType.ENGINE_EVENT,\n pipeline_context,\n message,\n event_specific_data=event_specific_data,\n step_handle=step_handle,\n )\n\n @staticmethod\n def object_store_operation(step_context, object_store_operation_result):\n from dagster.core.definitions.events import ObjectStoreOperation\n\n check.inst_param(\n object_store_operation_result, "object_store_operation_result", ObjectStoreOperation\n )\n\n object_store_name = (\n "{object_store_name} ".format(\n object_store_name=object_store_operation_result.object_store_name\n )\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n " using {serialization_strategy_name}".format(\n serialization_strategy_name=object_store_operation_result.serialization_strategy_name\n )\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n "Stored intermediate object for output {value_name} in "\n "{object_store_name}object store{serialization_strategy_modifier}."\n ).format(\n value_name=value_name,\n object_store_name=object_store_name,\n serialization_strategy_modifier=serialization_strategy_modifier,\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n "Retrieved intermediate object for input {value_name} in "\n "{object_store_name}object store{serialization_strategy_modifier}."\n ).format(\n value_name=value_name,\n object_store_name=object_store_name,\n serialization_strategy_modifier=serialization_strategy_modifier,\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata_entries=[\n EventMetadataEntry.path(object_store_operation_result.key, label="key")\n ],\n version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(step_context, output_name, manager_key):\n check.str_param(output_name, "output_name")\n check.str_param(manager_key, "manager_key")\n message = f'Handled output "{output_name}" using output manager ' f'"{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n 
output_name=output_name, manager_key=manager_key,\n ),\n message=message,\n )\n\n @staticmethod\n def loaded_input(\n step_context, input_name, manager_key, upstream_output_name=None, upstream_step_key=None\n ):\n\n check.str_param(input_name, "input_name")\n check.str_param(manager_key, "manager_key")\n check.opt_str_param(upstream_output_name, "upstream_output_name")\n check.opt_str_param(upstream_step_key, "upstream_step_key")\n\n message = f'Loaded input "{input_name}" using input manager ' f'"{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step ' f'"{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n ),\n message=message,\n )\n\n @staticmethod\n def hook_completed(hook_context, hook_def):\n event_type = DagsterEventType.HOOK_COMPLETED\n check.inst_param(hook_context, "hook_context", HookContext)\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=hook_context.pipeline_name,\n step_handle=hook_context.step.handle,\n solid_handle=hook_context.step.solid_handle,\n step_kind_value=hook_context.step.kind.value,\n logging_tags=hook_context.logging_tags,\n message=(\n 'Finished the execution of hook "{hook_name}" triggered for solid "{solid_name}".'\n ).format(hook_name=hook_def.name, solid_name=hook_context.solid.name),\n )\n\n hook_context.log.debug(\n event.message, dagster_event=event, pipeline_name=hook_context.pipeline_name,\n )\n\n return event\n\n @staticmethod\n def hook_errored(hook_context, error):\n event_type = DagsterEventType.HOOK_ERRORED\n check.inst_param(hook_context, "hook_context", HookContext)\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=hook_context.pipeline_name,\n step_handle=hook_context.step.handle,\n solid_handle=hook_context.step.solid_handle,\n step_kind_value=hook_context.step.kind.value,\n logging_tags=hook_context.logging_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n hook_context.log.error(\n str(error), dagster_event=event, pipeline_name=hook_context.pipeline_name,\n )\n\n return event\n\n @staticmethod\n def hook_skipped(hook_context, hook_def):\n event_type = DagsterEventType.HOOK_SKIPPED\n check.inst_param(hook_context, "hook_context", HookContext)\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=hook_context.pipeline_name,\n step_handle=hook_context.step.handle,\n solid_handle=hook_context.step.solid_handle,\n step_kind_value=hook_context.step.kind.value,\n logging_tags=hook_context.logging_tags,\n message=(\n 'Skipped the execution of hook "{hook_name}". 
It did not meet its triggering '\n 'condition during the execution of solid "{solid_name}".'\n ).format(hook_name=hook_def.name, solid_name=hook_context.solid.name),\n )\n\n hook_context.log.debug(\n event.message, dagster_event=event, pipeline_name=hook_context.pipeline_name,\n )\n\n return event\n\n\ndef get_step_output_event(events, step_key, output_name="result"):\n check.list_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(namedtuple("_StepMaterializationData", "materialization")):\n pass\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(namedtuple("_StepExpectationResultData", "expectation_result")):\n pass\n\n\n@whitelist_for_serdes\nclass ObjectStoreOperationResultData(\n namedtuple(\n "_ObjectStoreOperationResultData",\n "op value_name metadata_entries address version mapping_key",\n )\n):\n def __new__(\n cls, op, value_name, metadata_entries, address=None, version=None, mapping_key=None\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=check.opt_str_param(op, "op"),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata_entries=check.opt_list_param(metadata_entries, "metadata_entries"),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes\nclass EngineEventData(\n namedtuple("_EngineEventData", "metadata_entries error marker_start marker_end")\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(cls, metadata_entries=None, error=None, marker_start=None, marker_end=None):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", EventMetadataEntry\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(pid, step_keys_to_execute=None, marker_end=None):\n check.int_param(pid, "pid")\n check.opt_list_param(step_keys_to_execute, "step_keys_to_execute")\n return EngineEventData(\n metadata_entries=[EventMetadataEntry.text(str(pid), "pid")]\n + (\n [EventMetadataEntry.text(str(step_keys_to_execute), "step_keys")]\n if step_keys_to_execute\n else []\n ),\n marker_end=marker_end,\n )\n\n @staticmethod\n def multiprocess(pid, step_keys_to_execute=None):\n check.int_param(pid, "pid")\n check.opt_list_param(step_keys_to_execute, "step_keys_to_execute")\n return EngineEventData(\n metadata_entries=[EventMetadataEntry.text(str(pid), "pid")]\n + (\n [EventMetadataEntry.text(str(step_keys_to_execute), "step_keys")]\n if step_keys_to_execute\n else []\n )\n )\n\n @staticmethod\n def interrupted(steps_interrupted):\n check.list_param(steps_interrupted, "steps_interrupted", str)\n return EngineEventData(\n metadata_entries=[EventMetadataEntry.text(str(steps_interrupted), "steps_interrupted")]\n )\n\n @staticmethod\n def engine_error(error):\n check.inst_param(error, "error", SerializableErrorInfo)\n return EngineEventData(metadata_entries=[], 
error=error)\n\n\n@whitelist_for_serdes\nclass PipelineInitFailureData(namedtuple("_PipelineInitFailureData", "error")):\n def __new__(cls, error):\n return super(PipelineInitFailureData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass PipelineFailureData(namedtuple("_PipelineFailureData", "error")):\n def __new__(cls, error):\n return super(PipelineFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass PipelineCanceledData(namedtuple("_PipelineCanceledData", "error")):\n def __new__(cls, error):\n return super(PipelineCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(namedtuple("_HookErroredData", "error")):\n def __new__(cls, error):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo),\n )\n\n\n@whitelist_for_serdes\nclass HandledOutputData(namedtuple("_HandledOutputData", "output_name manager_key")):\n def __new__(cls, output_name, manager_key):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n )\n\n\n@whitelist_for_serdes\nclass LoadedInputData(\n namedtuple("_LoadedInputData", "input_name manager_key upstream_output_name upstream_step_key")\n):\n def __new__(cls, input_name, manager_key, upstream_output_name=None, upstream_step_key=None):\n return super(LoadedInputData, cls).__new__(\n cls,\n input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' 
'.\n# | R I P |\n# | |\n# | Synthetic |\n# | Process |\n# | Events |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass AssetStoreOperationData(\n namedtuple("_AssetStoreOperationData", "op step_key output_name asset_store_key")\n):\n pass\n\n\n@whitelist_for_serdes\nclass AssetStoreOperationType(Enum):\n # keep this around to prevent issues like https://github.com/dagster-io/dagster/issues/3533\n SET_ASSET = "SET_ASSET"\n GET_ASSET = "GET_ASSET"\n\n\ndef _handle_back_compat(event_type_value, event_specific_data):\n if event_type_value == "PIPELINE_PROCESS_START":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "PIPELINE_PROCESS_STARTED":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "PIPELINE_PROCESS_EXITED":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "ASSET_STORE_OPERATION":\n if event_specific_data.op in ("GET_ASSET", AssetStoreOperationType.GET_ASSET):\n return (\n DagsterEventType.LOADED_INPUT.value,\n LoadedInputData(\n event_specific_data.output_name, event_specific_data.asset_store_key\n ),\n )\n if event_specific_data.op in ("SET_ASSET", AssetStoreOperationType.SET_ASSET):\n return (\n DagsterEventType.HANDLED_OUTPUT.value,\n HandledOutputData(\n event_specific_data.output_name, event_specific_data.asset_store_key\n ),\n )\n else:\n return event_type_value, event_specific_data\n\n\nregister_serdes_tuple_fallbacks(\n {\n "PipelineProcessStartedData": None,\n "PipelineProcessExitedData": None,\n "PipelineProcessStartData": None,\n "AssetStoreOperationData": AssetStoreOperationData,\n }\n)\n
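# --- Editor's sketch (not part of the generated modules.json content) ---
# A minimal example of how the DagsterEvent surface documented above might be
# inspected from an event stream. The pipeline and solid names are
# hypothetical; `execute_pipeline_iterator` and the event properties used here
# (`is_step_event`, `is_failure`, `step_key`, `event_type_value`, `message`)
# are taken directly from the source shown in this file.
from dagster import execute_pipeline_iterator, pipeline, solid


@solid
def emit_one(_context):
    return 1


@pipeline
def demo_pipeline():
    emit_one()


if __name__ == "__main__":
    for event in execute_pipeline_iterator(demo_pipeline):
        # Every yielded object is a DagsterEvent; the boolean properties
        # (is_step_event, is_step_success, is_failure, ...) wrap event_type.
        if event.is_step_event:
            print(event.step_key, event.event_type_value)
        if event.is_failure:
            print("failure:", event.message)
# --- end sketch ---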
\nimport sys\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, FrozenSet, Iterator, List, Optional, Tuple, Union\n\nfrom dagster import check\nfrom dagster.core.definitions import IPipeline, PipelineDefinition\nfrom dagster.core.definitions.pipeline import PipelineSubsetDefinition\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.execution.context.system import SystemPipelineExecutionContext\nfrom dagster.core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster.core.execution.plan.plan import ExecutionPlan\nfrom dagster.core.execution.resolve_versions import resolve_memoized_execution_plan\nfrom dagster.core.execution.retries import Retries\nfrom dagster.core.instance import DagsterInstance, is_memoized_run\nfrom dagster.core.selector import parse_items_from_selection, parse_step_selection\nfrom dagster.core.storage.mem_io_manager import InMemoryIOManager\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.system_config.objects import EnvironmentConfig\nfrom dagster.core.telemetry import log_repo_stats, telemetry_wrapper\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\nfrom dagster.utils.interrupts import capture_interrupts\n\nfrom .context_creation_pipeline import (\n ExecutionContextManager,\n PipelineExecutionContextManager,\n PlanExecutionContextManager,\n scoped_pipeline_context,\n)\nfrom .results import PipelineExecutionResult\n\n## Brief guide to the execution APIs\n# | function name | operates over | sync | supports | creates new PipelineRun |\n# | | | | reexecution | in instance |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_pipeline_iterator | IPipeline | async | no | yes |\n# | execute_pipeline | IPipeline | sync | no | yes |\n# | execute_run_iterator | PipelineRun | async | (1) | no |\n# | execute_run | PipelineRun | sync | (1) | no |\n# | execute_plan_iterator | ExecutionPlan | async | (2) | no |\n# | execute_plan | ExecutionPlan | sync | (2) | no |\n# | reexecute_pipeline | IPipeline | sync | yes | yes |\n# | reexecute_pipeline_iterator | IPipeline | async | yes | yes |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the PipelineRun passed to this function. 
Specifically,\n# parent_run_id and root_run_id must be set and consistent, and if a solids_to_execute or\n# step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n pipeline: IPipeline, pipeline_run: PipelineRun, instance: DagsterInstance\n) -> Iterator[DagsterEvent]:\n check.inst_param(pipeline, "pipeline", IPipeline)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(instance, "instance", DagsterInstance)\n\n if pipeline_run.status == PipelineRunStatus.CANCELED:\n # This can happen if the run was force-terminated while it was starting\n def gen_execute_on_cancel():\n yield instance.report_engine_event(\n "Not starting execution since the run was canceled before execution could start",\n pipeline_run,\n )\n\n return gen_execute_on_cancel()\n\n check.invariant(\n pipeline_run.status == PipelineRunStatus.NOT_STARTED\n or pipeline_run.status == PipelineRunStatus.STARTING,\n desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status\n ),\n )\n\n if pipeline_run.solids_to_execute:\n pipeline_def = pipeline.get_definition()\n if isinstance(pipeline_def, PipelineSubsetDefinition):\n check.invariant(\n pipeline_run.solids_to_execute == pipeline.solids_to_execute,\n "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that conflicts "\n "with pipeline subset {pipeline_solids_to_execute}.".format(\n pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),\n solids_to_execute=str_format_set(pipeline_run.solids_to_execute),\n ),\n )\n else:\n # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n # note that when we receive the solids to execute via PipelineRun, it won't support\n # solid selection query syntax\n pipeline = pipeline.subset_for_execution_from_existing_pipeline(\n pipeline_run.solids_to_execute\n )\n execution_plan = create_execution_plan(\n pipeline,\n run_config=pipeline_run.run_config,\n mode=pipeline_run.mode,\n step_keys_to_execute=pipeline_run.step_keys_to_execute,\n )\n\n return iter(\n _ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=_pipeline_execution_iterator,\n execution_context_manager=PipelineExecutionContextManager(\n execution_plan=execution_plan,\n pipeline_run=pipeline_run,\n instance=instance,\n run_config=pipeline_run.run_config,\n raise_on_error=False,\n ),\n )\n )\n\n\ndef execute_run(\n pipeline: IPipeline,\n pipeline_run: PipelineRun,\n instance: DagsterInstance,\n raise_on_error: bool = False,\n) -> PipelineExecutionResult:\n """Executes an existing pipeline run synchronously.\n\n Synchronous version of execute_run_iterator.\n\n Args:\n pipeline (IPipeline): The pipeline to execute.\n pipeline_run (PipelineRun): The run to execute\n instance (DagsterInstance): The instance in which the run has been created.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``False``.\n\n Returns:\n PipelineExecutionResult: The result of the execution.\n """\n if isinstance(pipeline, PipelineDefinition):\n raise DagsterInvariantViolationError(\n "execute_run requires an IPipeline but received a PipelineDefinition "\n "directly instead. To support hand-off to other processes provide a "\n "ReconstructablePipeline which can be done using reconstructable(). 
For in "\n "process only execution you can use InMemoryPipeline."\n )\n\n check.inst_param(pipeline, "pipeline", IPipeline)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(instance, "instance", DagsterInstance)\n\n if pipeline_run.status == PipelineRunStatus.CANCELED:\n message = "Not starting execution since the run was canceled before execution could start"\n instance.report_engine_event(\n message, pipeline_run,\n )\n raise DagsterInvariantViolationError(message)\n\n check.invariant(\n pipeline_run.status == PipelineRunStatus.NOT_STARTED\n or pipeline_run.status == PipelineRunStatus.STARTING,\n desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status\n ),\n )\n pipeline_def = pipeline.get_definition()\n if pipeline_run.solids_to_execute:\n if isinstance(pipeline_def, PipelineSubsetDefinition):\n check.invariant(\n pipeline_run.solids_to_execute == pipeline.solids_to_execute,\n "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "\n "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(\n pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),\n solids_to_execute=str_format_set(pipeline_run.solids_to_execute),\n ),\n )\n else:\n # when `execute_run` is directly called, the sub pipeline hasn't been created\n # note that when we receive the solids to execute via PipelineRun, it won't support\n # solid selection query syntax\n pipeline = pipeline.subset_for_execution_from_existing_pipeline(\n pipeline_run.solids_to_execute\n )\n\n execution_plan = create_execution_plan(\n pipeline,\n run_config=pipeline_run.run_config,\n mode=pipeline_run.mode,\n step_keys_to_execute=pipeline_run.step_keys_to_execute,\n )\n\n if is_memoized_run(pipeline_run.tags):\n execution_plan = resolve_memoized_execution_plan(execution_plan)\n\n _execute_run_iterable = _ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=_pipeline_execution_iterator,\n execution_context_manager=PipelineExecutionContextManager(\n execution_plan=execution_plan,\n pipeline_run=pipeline_run,\n instance=instance,\n run_config=pipeline_run.run_config,\n raise_on_error=raise_on_error,\n ),\n )\n event_list = list(_execute_run_iterable)\n pipeline_context = _execute_run_iterable.pipeline_context\n\n # workaround for mem_io_manager to work in reconstruct_context, e.g. 
result.result_for_solid\n # in-memory values dict will get lost when the resource is re-initiated in reconstruct_context\n # so instead of re-initiating every single resource, we pass the resource instances to\n # reconstruct_context directly to avoid re-building from resource def.\n resource_instances_to_override = {}\n if pipeline_context: # None if we have a pipeline failure\n for (\n key,\n resource_instance,\n ) in pipeline_context.scoped_resources_builder.resource_instance_dict.items():\n if isinstance(resource_instance, InMemoryIOManager):\n resource_instances_to_override[key] = resource_instance\n\n return PipelineExecutionResult(\n pipeline.get_definition(),\n pipeline_run.run_id,\n event_list,\n lambda hardcoded_resources_arg: scoped_pipeline_context(\n execution_plan,\n pipeline_run.run_config,\n pipeline_run,\n instance,\n intermediate_storage=pipeline_context.intermediate_storage,\n resource_instances_to_override=hardcoded_resources_arg,\n ),\n resource_instances_to_override=resource_instances_to_override,\n )\n\n\n[docs]def execute_pipeline_iterator(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> Iterator[DagsterEvent]:\n """Execute a pipeline iteratively.\n\n Rather than package up the result of running a pipeline into a single object, like\n :py:func:`execute_pipeline`, this function yields the stream of events resulting from pipeline\n execution.\n\n This is intended to allow the caller to handle these events on a streaming basis in whatever\n way is appropriate.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n run_config (Optional[dict]): The environment configuration that parametrizes this run,\n as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n - ['some_solid']: select "some_solid" itself.\n - ['*some_solid']: select "some_solid" and all its ancestors (upstream dependencies).\n - ['*some_solid+++']: select "some_solid", all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select "some_solid" and all its\n ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child solids.\n instance (Optional[DagsterInstance]): The instance to execute against. 
If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Iterator[DagsterEvent]: The stream of events resulting from pipeline execution.\n """\n\n with _ephemeral_instance_if_missing(instance) as execute_instance:\n (\n pipeline,\n run_config,\n mode,\n tags,\n solids_to_execute,\n solid_selection,\n ) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n tags=tags,\n )\n\n return execute_run_iterator(pipeline, pipeline_run, execute_instance)\n\n\n@contextmanager\ndef _ephemeral_instance_if_missing(\n instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n if instance:\n yield instance\n else:\n with DagsterInstance.ephemeral() as ephemeral_instance:\n yield ephemeral_instance\n\n\n[docs]def execute_pipeline(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n """Execute a pipeline synchronously.\n\n Users will typically call this API when testing pipeline execution, or running standalone\n scripts.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n run_config (Optional[dict]): The environment configuration that parametrizes this run,\n as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. 
For example:\n - ['some_solid']: select "some_solid" itself.\n - ['*some_solid']: select "some_solid" and all its ancestors (upstream dependencies).\n - ['*some_solid+++']: select "some_solid", all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select "some_solid" and all its\n ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child solids.\n\n Returns:\n :py:class:`PipelineExecutionResult`: The result of pipeline execution.\n\n For the asynchronous version, see :py:func:`execute_pipeline_iterator`.\n """\n\n with _ephemeral_instance_if_missing(instance) as execute_instance:\n return _logged_execute_pipeline(\n pipeline,\n instance=execute_instance,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n raise_on_error=raise_on_error,\n )\n\n\n@telemetry_wrapper\ndef _logged_execute_pipeline(\n pipeline: Union[IPipeline, PipelineDefinition],\n instance: DagsterInstance,\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n (\n pipeline,\n run_config,\n mode,\n tags,\n solids_to_execute,\n solid_selection,\n ) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n )\n\n log_repo_stats(instance=instance, pipeline=pipeline, source="execute_pipeline")\n\n pipeline_run = instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n tags=tags,\n )\n\n return execute_run(pipeline, pipeline_run, instance, raise_on_error=raise_on_error)\n\n\n[docs]def reexecute_pipeline(\n pipeline: Union[IPipeline, PipelineDefinition],\n parent_run_id: str,\n run_config: Optional[dict] = None,\n step_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n instance: DagsterInstance = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n """Reexecute an existing pipeline run.\n\n Users will typically call this API when testing pipeline reexecution, or running standalone\n scripts.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n parent_run_id (str): The id of the previous run to reexecute. The run must exist in the\n instance.\n run_config (Optional[dict]): The environment configuration that parametrizes this run,\n as a dict.\n step_selection (Optional[List[str]]): A list of step selection queries (including single\n step keys) to execute. For example:\n - ['some_solid']: select the execution step "some_solid" itself.\n - ['*some_solid']: select the step "some_solid" and all its ancestors\n (upstream dependencies).\n - ['*some_solid+++']: select the step "some_solid", all its ancestors,\n and its descendants (downstream dependencies) within 3 levels down.\n - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select\n "some_solid" and all its ancestors, "other_solid_a" itself, and\n "other_solid_b" and its direct child execution steps.\n mode (Optional[str]): The name of the pipeline mode to use. 
You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n\n Returns:\n :py:class:`PipelineExecutionResult`: The result of pipeline execution.\n\n For the asynchronous version, see :py:func:`reexecute_pipeline_iterator`.\n """\n\n check.opt_list_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with _ephemeral_instance_if_missing(instance) as execute_instance:\n (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(\n pipeline=pipeline, run_config=run_config, mode=mode, preset=preset, tags=tags,\n )\n\n parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)\n check.invariant(\n parent_pipeline_run,\n "No parent run with id {parent_run_id} found in instance.".format(\n parent_run_id=parent_run_id\n ),\n )\n\n # resolve step selection DSL queries using parent execution plan snapshot\n if step_selection:\n full_plan = create_execution_plan(pipeline, parent_pipeline_run.run_config, mode)\n step_keys = parse_items_from_selection(step_selection)\n # resolve execution plan with any resolved dynamic step keys\n resolved_plan = full_plan.build_subset_plan(step_keys)\n # parse selection using all step deps\n step_keys_to_execute = parse_step_selection(\n resolved_plan.get_all_step_deps(), step_selection\n )\n else:\n step_keys_to_execute = None\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n tags=tags,\n solid_selection=parent_pipeline_run.solid_selection,\n solids_to_execute=parent_pipeline_run.solids_to_execute,\n # convert to frozenset https://github.com/dagster-io/dagster/issues/2914\n step_keys_to_execute=list(step_keys_to_execute) if step_keys_to_execute else None,\n root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,\n parent_run_id=parent_pipeline_run.run_id,\n )\n\n return execute_run(pipeline, pipeline_run, execute_instance, raise_on_error=raise_on_error)\n\n\n[docs]def reexecute_pipeline_iterator(\n pipeline: Union[IPipeline, PipelineDefinition],\n parent_run_id: str,\n run_config: Optional[dict] = None,\n step_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n instance: DagsterInstance = None,\n) -> Iterator[DagsterEvent]:\n """Reexecute a pipeline iteratively.\n\n Rather than package up the result of running a pipeline into a single object, like\n :py:func:`reexecute_pipeline`, this function yields the stream of events resulting from pipeline\n reexecution.\n\n This is intended to allow the caller to handle these events on a streaming basis in whatever\n way is appropriate.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n parent_run_id (str): The id of the previous run to reexecute. 
The run must exist in the\n instance.\n run_config (Optional[dict]): The environment configuration that parametrizes this run,\n as a dict.\n step_selection (Optional[List[str]]): A list of step selection queries (including single\n step keys) to execute. For example:\n - ['some_solid']: select the execution step "some_solid" itself.\n - ['*some_solid']: select the step "some_solid" and all its ancestors\n (upstream dependencies).\n - ['*some_solid+++']: select the step "some_solid", all its ancestors,\n and its descendants (downstream dependencies) within 3 levels down.\n - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select\n "some_solid" and all its ancestors, "other_solid_a" itself, and\n "other_solid_b" and its direct child execution steps.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Iterator[DagsterEvent]: The stream of events resulting from pipeline reexecution.\n """\n\n check.opt_list_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with _ephemeral_instance_if_missing(instance) as execute_instance:\n (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=None,\n )\n parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)\n check.invariant(\n parent_pipeline_run,\n "No parent run with id {parent_run_id} found in instance.".format(\n parent_run_id=parent_run_id\n ),\n )\n\n # resolve step selection DSL queries using parent execution plan snapshot\n if step_selection:\n parent_execution_plan_snapshot = execute_instance.get_execution_plan_snapshot(\n parent_pipeline_run.execution_plan_snapshot_id\n )\n step_keys_to_execute = parse_step_selection(\n parent_execution_plan_snapshot.step_deps, step_selection\n )\n else:\n step_keys_to_execute = None\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n tags=tags,\n solid_selection=parent_pipeline_run.solid_selection,\n solids_to_execute=parent_pipeline_run.solids_to_execute,\n # convert to frozenset https://github.com/dagster-io/dagster/issues/2914\n step_keys_to_execute=list(step_keys_to_execute) if step_keys_to_execute else None,\n root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,\n parent_run_id=parent_pipeline_run.run_id,\n )\n\n return execute_run_iterator(pipeline, pipeline_run, execute_instance)\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n pipeline_run: PipelineRun,\n instance: DagsterInstance,\n retries: Optional[Retries] = None,\n run_config: Optional[dict] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retries = check.opt_inst_param(retries, "retries", Retries, Retries.disabled_mode())\n run_config = check.opt_dict_param(run_config, 
"run_config")\n\n return iter(\n _ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n retries=retries,\n execution_plan=execution_plan,\n run_config=run_config,\n pipeline_run=pipeline_run,\n instance=instance,\n raise_on_error=False,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n instance: DagsterInstance,\n pipeline_run: PipelineRun,\n run_config: Optional[Dict] = None,\n retries: Optional[Retries] = None,\n) -> List[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_pipeline() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n run_config = check.opt_dict_param(run_config, "run_config")\n check.opt_inst_param(retries, "retries", Retries)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n run_config=run_config,\n pipeline_run=pipeline_run,\n instance=instance,\n retries=retries,\n )\n )\n\n\ndef _check_pipeline(pipeline: Union[PipelineDefinition, IPipeline]) -> IPipeline:\n # backcompat\n if isinstance(pipeline, PipelineDefinition):\n pipeline = InMemoryPipeline(pipeline)\n\n check.inst_param(pipeline, "pipeline", IPipeline)\n return pipeline\n\n\ndef create_execution_plan(\n pipeline: Union[IPipeline, PipelineDefinition],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n step_keys_to_execute: Optional[List[str]] = None,\n) -> ExecutionPlan:\n pipeline = _check_pipeline(pipeline)\n pipeline_def = pipeline.get_definition()\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n mode = check.opt_str_param(mode, "mode", default=pipeline_def.get_default_mode_name())\n check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n environment_config = EnvironmentConfig.build(pipeline_def, run_config, mode=mode)\n\n return ExecutionPlan.build(\n pipeline, environment_config, mode=mode, step_keys_to_execute=step_keys_to_execute\n )\n\n\ndef _pipeline_execution_iterator(\n pipeline_context: SystemPipelineExecutionContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. 
Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (SystemPipelineExecutionContext):\n execution_plan (ExecutionPlan):\n """\n check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n yield DagsterEvent.pipeline_start(pipeline_context)\n\n pipeline_exception_info = None\n pipeline_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in pipeline_context.executor.execute(pipeline_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n pipeline_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n raise\n except Exception: # pylint: disable=broad-except\n pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n raise # finally block will run before this is re-raised\n finally:\n if pipeline_canceled_info:\n reloaded_run = pipeline_context.instance.get_run_by_id(pipeline_context.run_id)\n if reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELING:\n event = DagsterEvent.pipeline_canceled(pipeline_context, pipeline_canceled_info)\n else:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, treating as failure.",\n pipeline_canceled_info,\n )\n elif pipeline_exception_info:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "An exception was thrown during execution.",\n pipeline_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.pipeline_failure(\n pipeline_context, "Steps failed: {}.".format(failed_steps),\n )\n else:\n event = DagsterEvent.pipeline_success(pipeline_context)\n if not generator_closed:\n yield event\n\n\nclass _ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `PipelineExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. 
`dagster.utils.EventGenerationManager`.\n """\n\n def __init__(self, execution_plan, iterator, execution_context_manager):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.pipeline_context = None\n\n def __iter__(self):\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.pipeline_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.pipeline_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan, pipeline_context=self.pipeline_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_pipeline_args(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict],\n mode: Optional[str],\n preset: Optional[str],\n tags: Optional[Dict[str, Any]],\n solid_selection: Optional[List[str]] = None,\n) -> Tuple[\n IPipeline, Optional[dict], Optional[str], Dict[str, Any], FrozenSet[str], Optional[List[str]],\n]:\n pipeline = _check_pipeline(pipeline)\n pipeline_def = pipeline.get_definition()\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n run_config = check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n check.invariant(\n not (mode is not None and preset is not None),\n "You may set only one of `mode` (got {mode}) or `preset` (got {preset}).".format(\n mode=mode, preset=preset\n ),\n )\n\n tags = check.opt_dict_param(tags, "tags", key_type=str)\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n\n if preset is not None:\n pipeline_preset = pipeline_def.get_preset(preset)\n\n if pipeline_preset.run_config is not None:\n check.invariant(\n (not run_config) or (pipeline_preset.run_config == run_config),\n "The environment set in preset '{preset}' does not agree with the environment "\n "passed in the `run_config` argument.".format(preset=preset),\n )\n\n run_config = pipeline_preset.run_config\n\n # load solid_selection from preset\n if pipeline_preset.solid_selection is not None:\n check.invariant(\n solid_selection is None or solid_selection == pipeline_preset.solid_selection,\n "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "\n "the `solid_selection` argument: {solid_selection}".format(\n preset=preset,\n preset_subset=pipeline_preset.solid_selection,\n solid_selection=solid_selection,\n ),\n )\n solid_selection = pipeline_preset.solid_selection\n\n check.invariant(\n mode is None or mode == pipeline_preset.mode,\n "Mode {mode} does not agree with the mode set in preset '{preset}': "\n 
"('{preset_mode}')".format(preset=preset, preset_mode=pipeline_preset.mode, mode=mode),\n )\n\n mode = pipeline_preset.mode\n\n tags = merge_dicts(pipeline_preset.tags, tags)\n\n if mode is not None:\n if not pipeline_def.has_mode_definition(mode):\n raise DagsterInvariantViolationError(\n (\n "You have attempted to execute pipeline {name} with mode {mode}. "\n "Available modes: {modes}"\n ).format(\n name=pipeline_def.name, mode=mode, modes=pipeline_def.available_modes,\n )\n )\n else:\n if pipeline_def.is_multi_mode:\n raise DagsterInvariantViolationError(\n (\n "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "\n "attempted to execute it without specifying a mode. Set "\n "mode property on the PipelineRun object."\n ).format(name=pipeline_def.name, modes=pipeline_def.available_modes)\n )\n mode = pipeline_def.get_default_mode_name()\n\n tags = merge_dicts(pipeline_def.tags, tags)\n\n # generate pipeline subset from the given solid_selection\n if solid_selection:\n pipeline = pipeline.subset_for_execution(solid_selection)\n\n return (\n pipeline,\n run_config,\n mode,\n tags,\n pipeline.solids_to_execute,\n solid_selection,\n )\n
\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom typing import Any, Optional\n\nfrom dagster import check\nfrom dagster.core.definitions.dependency import Solid\nfrom dagster.core.definitions.pipeline import PipelineDefinition\nfrom dagster.core.definitions.solid import SolidDefinition\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.utils.forked_pdb import ForkedPdb\n\nfrom .step import StepExecutionContext\nfrom .system import SystemComputeExecutionContext\n\n\n[docs]class AbstractComputeExecutionContext(ABC): # pylint: disable=no-init\n """Base class for solid context implemented by SolidExecutionContext and DagstermillExecutionContext"""\n\n[docs] @abstractmethod\n def has_tag(self, key) -> bool:\n """Implement this method to check if a logging tag is set."""\n\n[docs] @abstractmethod\n def get_tag(self, key: str) -> str:\n """Implement this method to get a logging tag."""\n\n @abstractproperty\n def run_id(self) -> str:\n """The run id for the context."""\n\n @abstractproperty\n def solid_def(self) -> SolidDefinition:\n """The solid definition corresponding to the execution step being executed."""\n\n @abstractproperty\n def solid(self) -> Solid:\n """The solid corresponding to the execution step being executed."""\n\n @abstractproperty\n def pipeline_def(self) -> PipelineDefinition:\n """The pipeline being executed."""\n\n @abstractproperty\n def pipeline_run(self) -> PipelineRun:\n """The PipelineRun object corresponding to the execution."""\n\n @abstractproperty\n def resources(self) -> Any:\n """Resources available in the execution context."""\n\n @abstractproperty\n def log(self) -> DagsterLogManager:\n """The log manager available in the execution context."""\n\n @abstractproperty\n def solid_config(self) -> Any:\n """The parsed config specific to this solid."""\n\n\n[docs]class SolidExecutionContext(StepExecutionContext, AbstractComputeExecutionContext):\n """The ``context`` object available to solid compute logic."""\n\n __slots__ = ["_system_compute_execution_context"]\n\n def __init__(self, system_compute_execution_context: SystemComputeExecutionContext):\n self._system_compute_execution_context = check.inst_param(\n system_compute_execution_context,\n "system_compute_execution_context",\n SystemComputeExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n super(SolidExecutionContext, self).__init__(system_compute_execution_context)\n\n @property\n def solid_config(self) -> Any:\n """The parsed config specific to this solid."""\n return self._system_compute_execution_context.solid_config\n\n @property\n def pipeline_run(self) -> PipelineRun:\n """The current PipelineRun"""\n return self._system_compute_execution_context.pipeline_run\n\n @property\n def instance(self) -> DagsterInstance:\n """The current Instance"""\n return self._system_compute_execution_context.instance\n\n @property\n def pdb(self) -> ForkedPdb:\n """Allows pdb debugging from within the solid.\n\n Example:\n\n .. code-block:: python\n\n @solid\n def debug_solid(context):\n context.pdb.set_trace()\n\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n
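# A minimal, hypothetical sketch of the SolidExecutionContext surface documented above, as seen
# from a solid's compute function (assumes the 0.10-era `@solid`/`@pipeline` decorators; the solid
# name and config values are made up).
from dagster import execute_pipeline, pipeline, solid


@solid(config_schema={"greeting": str})
def greet(context):
    # `solid_config` is the parsed config for this solid; `log` is the DagsterLogManager.
    context.log.info("run {} says {}".format(context.run_id, context.solid_config["greeting"]))
    # For interactive debugging, `context.pdb.set_trace()` drops into a ForkedPdb session.
    return context.solid_config["greeting"]


@pipeline
def hello_pipeline():
    greet()


execute_pipeline(
    hello_pipeline, run_config={"solids": {"greet": {"config": {"greeting": "hi"}}}}
)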
\nfrom collections import namedtuple\nfrom typing import Any, Dict, Optional, Set\n\nfrom dagster import check\nfrom dagster.core.definitions.pipeline import PipelineDefinition\nfrom dagster.core.definitions.resource import ResourceDefinition, ScopedResourcesBuilder\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\n\n\n[docs]class InitResourceContext(\n namedtuple(\n "InitResourceContext",\n (\n "resource_config resource_def pipeline_run log_manager resources "\n "instance_for_backwards_compat pipeline_def_for_backwards_compat"\n ),\n )\n):\n """Resource-specific initialization context.\n\n Attributes:\n resource_config (Any): The configuration data provided by the environment config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n resource_def (ResourceDefinition): The definition of the resource currently being\n constructed.\n pipeline_run (PipelineRun): The pipeline run in context.\n run_id (str): The id for this run of the pipeline.\n log_manager (DagsterLogManager): The log manager for this run of the pipeline\n resources (ScopedResources): The resources that are available to the resource that we are\n initalizing.\n """\n\n def __new__(\n cls,\n resource_config: Any,\n resource_def: ResourceDefinition,\n pipeline_run: PipelineRun,\n log_manager: Optional[DagsterLogManager] = None,\n resource_instance_dict: Optional[Dict[str, Any]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n instance_for_backwards_compat: Optional[DagsterInstance] = None,\n pipeline_def_for_backwards_compat: Optional[PipelineDefinition] = None,\n ):\n check.opt_dict_param(resource_instance_dict, "resource_instance_dict")\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n scoped_resources_builder = ScopedResourcesBuilder(resource_instance_dict)\n\n return super(InitResourceContext, cls).__new__(\n cls,\n resource_config,\n check.inst_param(resource_def, "resource_def", ResourceDefinition),\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun),\n check.opt_inst_param(log_manager, "log_manager", DagsterLogManager),\n resources=scoped_resources_builder.build(required_resource_keys),\n # The following are used internally for adapting intermediate storage defs to resources\n instance_for_backwards_compat=check.opt_inst_param(\n instance_for_backwards_compat, "instance_for_backwards_compat", DagsterInstance\n ),\n pipeline_def_for_backwards_compat=check.opt_inst_param(\n pipeline_def_for_backwards_compat,\n "pipeline_def_for_backwards_compat",\n PipelineDefinition,\n ),\n )\n\n @property\n def log(self) -> DagsterLogManager:\n return self.log_manager\n\n @property\n def run_id(self) -> str:\n return self.pipeline_run.run_id\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resource_def=self.resource_def,\n pipeline_run=self.pipeline_run,\n log_manager=self.log_manager,\n instance_for_backwards_compat=self.instance_for_backwards_compat,\n )\n
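# A minimal, hypothetical sketch of how the InitResourceContext above reaches a resource function
# declared with `@resource` (the resource key `client` and its config are made up).
from dagster import ModeDefinition, execute_pipeline, pipeline, resource, solid


@resource(config_schema={"base_url": str})
def api_client(init_context):
    # `resource_config` carries the parsed config; `run_id` and `log` come from the context.
    init_context.log.info("building client for run {}".format(init_context.run_id))
    return {"base_url": init_context.resource_config["base_url"]}


@solid(required_resource_keys={"client"})
def ping(context):
    return context.resources.client["base_url"]


@pipeline(mode_defs=[ModeDefinition(resource_defs={"client": api_client})])
def ping_pipeline():
    ping()


execute_pipeline(
    ping_pipeline,
    run_config={"resources": {"client": {"config": {"base_url": "https://example.com"}}}},
)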
\nfrom collections import namedtuple\nfrom typing import Any\n\nfrom dagster import check\nfrom dagster.core.definitions.logger import LoggerDefinition\nfrom dagster.core.definitions.pipeline import PipelineDefinition\n\n\n[docs]class InitLoggerContext(\n namedtuple("InitLoggerContext", "logger_config pipeline_def logger_def run_id")\n):\n """Logger-specific initialization context.\n\n An instance of this class is made available as the first argument to the ``logger_fn`` decorated\n by :py:func:`@logger <logger>` or set on a :py:class:`LoggerDefinition`.\n\n Users should not instantiate this class.\n\n Attributes:\n logger_config (Any): The configuration data provided by the environment config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`\n pipeline_def (PipelineDefinition): The pipeline definition currently being executed.\n logger_def (LoggerDefinition): The logger definition for the logger being constructed.\n run_id (str): The ID for this run of the pipeline.\n """\n\n def __new__(\n cls,\n logger_config: Any,\n pipeline_def: PipelineDefinition,\n logger_def: LoggerDefinition,\n run_id: str,\n ):\n return super(InitLoggerContext, cls).__new__(\n cls,\n logger_config,\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition),\n check.inst_param(logger_def, "logger_def", LoggerDefinition),\n check.str_param(run_id, "run_id"),\n )\n
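# A minimal, hypothetical sketch of a logger definition whose `logger_fn` receives the
# InitLoggerContext above (the logger key and config field are made up).
import logging

from dagster import ModeDefinition, execute_pipeline, logger, pipeline, solid


@logger(config_schema={"log_level": str})
def stdlib_logger(init_context):
    # `logger_config` is parsed against `config_schema`; `run_id` identifies the run.
    python_logger = logging.getLogger("dagster_run_{}".format(init_context.run_id))
    python_logger.setLevel(init_context.logger_config["log_level"])
    python_logger.addHandler(logging.StreamHandler())
    return python_logger


@solid
def noisy(context):
    context.log.info("hello")


@pipeline(mode_defs=[ModeDefinition(logger_defs={"stdlib": stdlib_logger})])
def logged_pipeline():
    noisy()


execute_pipeline(
    logged_pipeline, run_config={"loggers": {"stdlib": {"config": {"log_level": "INFO"}}}}
)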
\n"""\nThis module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module\n"""\nfrom collections import namedtuple\nfrom typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Set\n\nfrom dagster import check\nfrom dagster.core.definitions.hook import HookDefinition\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.pipeline import PipelineDefinition\nfrom dagster.core.definitions.pipeline_base import IPipeline\nfrom dagster.core.definitions.resource import ScopedResourcesBuilder\nfrom dagster.core.definitions.solid import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.step import ExecutionStep\nfrom dagster.core.execution.plan.utils import build_resources_for_manager\nfrom dagster.core.execution.retries import Retries\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.io_manager import IOManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster.core.system_config.objects import EnvironmentConfig\nfrom dagster.core.types.dagster_type import DagsterType, resolve_dagster_type\n\nif TYPE_CHECKING:\n from dagster.core.definitions.intermediate_storage import IntermediateStorageDefinition\n from dagster.core.definitions.dependency import Solid, SolidHandle\n from dagster.core.storage.intermediate_storage import IntermediateStorage\n from dagster.core.instance import DagsterInstance\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n\nclass SystemExecutionContextData(\n namedtuple(\n "_SystemExecutionContextData",\n (\n "pipeline_run scoped_resources_builder environment_config pipeline "\n "mode_def intermediate_storage_def instance intermediate_storage "\n "raise_on_error retries execution_plan"\n ),\n )\n):\n """\n SystemExecutionContextData is the data that remains constant throughout the entire\n execution of a pipeline or plan.\n """\n\n def __new__(\n cls,\n pipeline_run: PipelineRun,\n scoped_resources_builder: ScopedResourcesBuilder,\n environment_config: EnvironmentConfig,\n pipeline: IPipeline,\n mode_def: ModeDefinition,\n intermediate_storage_def: Optional["IntermediateStorageDefinition"],\n instance: "DagsterInstance",\n intermediate_storage: "IntermediateStorage",\n raise_on_error: bool,\n retries: Retries,\n execution_plan: "ExecutionPlan",\n ):\n from dagster.core.definitions.intermediate_storage import IntermediateStorageDefinition\n from dagster.core.storage.intermediate_storage import IntermediateStorage\n from dagster.core.instance import DagsterInstance\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n return super(SystemExecutionContextData, cls).__new__(\n cls,\n pipeline_run=check.inst_param(pipeline_run, "pipeline_run", PipelineRun),\n scoped_resources_builder=check.inst_param(\n scoped_resources_builder, "scoped_resources_builder", ScopedResourcesBuilder\n ),\n environment_config=check.inst_param(\n environment_config, "environment_config", EnvironmentConfig\n ),\n pipeline=check.inst_param(pipeline, "pipeline", 
IPipeline),\n mode_def=check.inst_param(mode_def, "mode_def", ModeDefinition),\n intermediate_storage_def=check.opt_inst_param(\n intermediate_storage_def, "intermediate_storage_def", IntermediateStorageDefinition\n ),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n intermediate_storage=check.inst_param(\n intermediate_storage, "intermediate_storage", IntermediateStorage\n ),\n raise_on_error=check.bool_param(raise_on_error, "raise_on_error"),\n retries=check.inst_param(retries, "retries", Retries),\n execution_plan=check.inst_param(execution_plan, "execution_plan", ExecutionPlan),\n )\n\n @property\n def run_id(self) -> str:\n return self.pipeline_run.run_id\n\n @property\n def run_config(self) -> dict:\n return self.environment_config.original_config_dict\n\n @property\n def pipeline_name(self) -> str:\n return self.pipeline_run.pipeline_name\n\n\nclass SystemExecutionContext:\n __slots__ = ["_execution_context_data", "_log_manager"]\n\n def __init__(\n self, execution_context_data: SystemExecutionContextData, log_manager: DagsterLogManager\n ):\n self._execution_context_data = check.inst_param(\n execution_context_data, "execution_context_data", SystemExecutionContextData\n )\n self._log_manager = check.inst_param(log_manager, "log_manager", DagsterLogManager)\n\n @property\n def pipeline_run(self) -> PipelineRun:\n return self._execution_context_data.pipeline_run\n\n @property\n def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n return self._execution_context_data.scoped_resources_builder\n\n @property\n def run_id(self) -> str:\n return self._execution_context_data.run_id\n\n @property\n def run_config(self) -> dict:\n return self._execution_context_data.run_config\n\n @property\n def environment_config(self) -> EnvironmentConfig:\n return self._execution_context_data.environment_config\n\n @property\n def pipeline(self) -> IPipeline:\n return self._execution_context_data.pipeline\n\n @property\n def pipeline_name(self) -> str:\n return self._execution_context_data.pipeline_name\n\n @property\n def mode_def(self) -> ModeDefinition:\n return self._execution_context_data.mode_def\n\n @property\n def intermediate_storage_def(self) -> "IntermediateStorageDefinition":\n return self._execution_context_data.intermediate_storage_def\n\n @property\n def instance(self) -> "DagsterInstance":\n return self._execution_context_data.instance\n\n @property\n def intermediate_storage(self):\n return self._execution_context_data.intermediate_storage\n\n @property\n def file_manager(self) -> None:\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in 0.10.0. 
"\n "Please access it via `context.resources.file_manager` instead."\n )\n\n @property\n def raise_on_error(self) -> bool:\n return self._execution_context_data.raise_on_error\n\n @property\n def retries(self) -> Retries:\n return self._execution_context_data.retries\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def logging_tags(self) -> Dict[str, str]:\n return self._log_manager.logging_tags\n\n @property\n def execution_plan(self):\n return self._execution_context_data.execution_plan\n\n def has_tag(self, key: str) -> bool:\n check.str_param(key, "key")\n return key in self.logging_tags\n\n def get_tag(self, key: str) -> Optional[str]:\n check.str_param(key, "key")\n return self.logging_tags.get(key)\n\n def for_step(self, step: ExecutionStep) -> "SystemStepExecutionContext":\n\n check.inst_param(step, "step", ExecutionStep)\n\n return SystemStepExecutionContext(\n self._execution_context_data, self._log_manager.with_tags(**step.logging_tags), step,\n )\n\n def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n return TypeCheckContext(self._execution_context_data, self.log, dagster_type)\n\n def using_io_manager(self, step_output_handle: StepOutputHandle) -> bool:\n # pylint: disable=comparison-with-callable\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n output_manager_key = self.execution_plan.get_step_output(\n step_output_handle\n ).output_def.io_manager_key\n return self.mode_def.resource_defs[output_manager_key] != mem_io_manager\n\n\nclass SystemPipelineExecutionContext(SystemExecutionContext):\n __slots__ = ["_executor"]\n\n def __init__(\n self,\n execution_context_data: SystemExecutionContextData,\n log_manager: DagsterLogManager,\n executor: Executor,\n ):\n super(SystemPipelineExecutionContext, self).__init__(execution_context_data, log_manager)\n self._executor = check.inst_param(executor, "executor", Executor)\n\n @property\n def executor(self) -> Executor:\n return self._executor\n\n\nclass SystemStepExecutionContext(SystemExecutionContext):\n __slots__ = ["_step", "_resources", "_required_resource_keys", "_step_launcher"]\n\n def __init__(\n self,\n execution_context_data: SystemExecutionContextData,\n log_manager: DagsterLogManager,\n step: ExecutionStep,\n ):\n from dagster.core.execution.resources_init import get_required_resource_keys_for_step\n\n self._step = check.inst_param(step, "step", ExecutionStep)\n super(SystemStepExecutionContext, self).__init__(execution_context_data, log_manager)\n self._required_resource_keys = get_required_resource_keys_for_step(\n step,\n execution_context_data.execution_plan,\n execution_context_data.intermediate_storage_def,\n )\n self._resources = self._execution_context_data.scoped_resources_builder.build(\n self._required_resource_keys\n )\n step_launcher_resources = [\n resource for resource in self._resources if isinstance(resource, StepLauncher)\n ]\n\n self._step_launcher: Optional[StepLauncher] = None\n if len(step_launcher_resources) > 1:\n raise DagsterInvariantViolationError(\n "Multiple required resources for solid {solid_name} have inherit StepLauncher"\n "There should be at most one step launcher resource per solid.".format(\n solid_name=step.solid_handle.name\n )\n )\n elif len(step_launcher_resources) == 1:\n self._step_launcher = step_launcher_resources[0]\n\n self._log_manager = log_manager\n\n def for_compute(self) -> "SystemComputeExecutionContext":\n return SystemComputeExecutionContext(self._execution_context_data, self.log, 
self.step)\n\n @property\n def step(self) -> ExecutionStep:\n return self._step\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n return self._step_launcher\n\n @property\n def solid_handle(self) -> "SolidHandle":\n return self._step.solid_handle\n\n @property\n def solid_def(self) -> SolidDefinition:\n return self.solid.definition\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n return self._execution_context_data.pipeline.get_definition()\n\n @property\n def solid(self) -> "Solid":\n return self.pipeline_def.get_solid(self._step.solid_handle)\n\n @property\n def resources(self) -> NamedTuple:\n return self._resources\n\n @property\n def required_resource_keys(self) -> Set[str]:\n return self._required_resource_keys\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n return HookContext(self._execution_context_data, self.log, hook_def, self.step)\n\n def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> str:\n # determine if the step is skipped\n if (\n # this is re-execution\n self.pipeline_run.parent_run_id\n # only part of the pipeline is being re-executed\n and len(self.execution_plan.step_handles_to_execute) < len(self.execution_plan.steps)\n # this step is not being executed\n and step_output_handle.step_key not in self.execution_plan.step_handles_to_execute\n ):\n return self.pipeline_run.parent_run_id\n else:\n return self.pipeline_run.run_id\n\n def get_output_context(self, step_output_handle) -> "OutputContext":\n return get_output_context(\n self.execution_plan,\n self.environment_config,\n step_output_handle,\n self._get_source_run_id(step_output_handle),\n log_manager=self._log_manager,\n step_context=self,\n )\n\n def for_input_manager(\n self,\n name: str,\n config: dict,\n metadata: Any,\n dagster_type: DagsterType,\n source_handle: Optional[StepOutputHandle] = None,\n resource_config: Any = None,\n resources: Optional[NamedTuple] = None,\n ) -> "InputContext":\n return InputContext(\n pipeline_name=self.pipeline_def.name,\n name=name,\n solid_def=self.solid_def,\n config=config,\n metadata=metadata,\n upstream_output=self.get_output_context(source_handle) if source_handle else None,\n dagster_type=dagster_type,\n log_manager=self._log_manager,\n step_context=self,\n resource_config=resource_config,\n resources=resources,\n )\n\n def using_default_intermediate_storage(self) -> bool:\n from dagster.core.storage.system_storage import mem_intermediate_storage\n\n # pylint: disable=comparison-with-callable\n return (\n self.intermediate_storage_def is None\n or self.intermediate_storage_def == mem_intermediate_storage\n )\n\n def get_io_manager(self, step_output_handle) -> IOManager:\n step_output = self.execution_plan.get_step_output(step_output_handle)\n # backcompat: if intermediate storage is specified, adapt it to object manager\n if self.using_default_intermediate_storage():\n output_manager = getattr(self.resources, step_output.output_def.io_manager_key)\n else:\n from dagster.core.storage.intermediate_storage import IntermediateStorageAdapter\n\n output_manager = IntermediateStorageAdapter(self.intermediate_storage)\n return check.inst(output_manager, IOManager)\n\n\n[docs]class SystemComputeExecutionContext(SystemStepExecutionContext):\n @property\n def solid_config(self) -> Any:\n solid_config = self.environment_config.solids.get(str(self.solid_handle))\n return solid_config.config if solid_config else None\n\n\n[docs]class 
TypeCheckContext(SystemExecutionContext):\n """The ``context`` object available to a type check function on a DagsterType.\n\n Attributes:\n log (DagsterLogManager): Centralized log dispatch from user code.\n resources (Any): An object whose attributes contain the resources available to this solid.\n run_id (str): The id of this pipeline run.\n """\n\n def __init__(\n self,\n execution_context_data: SystemExecutionContextData,\n log_manager: DagsterLogManager,\n dagster_type: DagsterType,\n ):\n super(TypeCheckContext, self).__init__(execution_context_data, log_manager)\n self._resources = self._execution_context_data.scoped_resources_builder.build(\n dagster_type.required_resource_keys\n )\n self._log_manager = log_manager\n\n @property\n def resources(self) -> NamedTuple:\n return self._resources\n\n\n[docs]class HookContext(SystemExecutionContext):\n """The ``context`` object available to a hook function on an DagsterEvent.\n\n Attributes:\n log (DagsterLogManager): Centralized log dispatch from user code.\n hook_def (HookDefinition): The hook that the context object belongs to.\n step (ExecutionStep): The compute step associated with the hook.\n solid (Solid): The solid instance associated with the hook.\n resources (NamedTuple): Resources available in the hook context.\n solid_config (Any): The parsed config specific to this solid.\n """\n\n def __init__(\n self,\n execution_context_data: SystemExecutionContextData,\n log_manager: DagsterLogManager,\n hook_def: HookDefinition,\n step: ExecutionStep,\n ):\n\n super(HookContext, self).__init__(execution_context_data, log_manager)\n self._log_manager = log_manager\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._step = check.inst_param(step, "step", ExecutionStep)\n\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = self._execution_context_data.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def step(self) -> ExecutionStep:\n return self._step\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n return self._execution_context_data.pipeline.get_definition()\n\n @property\n def solid(self) -> "Solid":\n return self.pipeline_def.get_solid(self._step.solid_handle)\n\n @property\n def resources(self) -> NamedTuple:\n return self._resources\n\n @property\n def required_resource_keys(self) -> Set[str]:\n return self._required_resource_keys\n\n @property\n def solid_config(self) -> Any:\n solid_config = self.environment_config.solids.get(str(self._step.solid_handle))\n return solid_config.config if solid_config else None\n\n\n[docs]class OutputContext(\n namedtuple(\n "_OutputContext",\n "step_key name pipeline_name run_id metadata mapping_key config solid_def dagster_type log version step_context resource_config resources",\n )\n):\n """\n The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Attributes:\n step_key (str): The step_key for the compute step that produced the output.\n name (str): The name of the output that produced the output.\n pipeline_name (str): The name of the pipeline definition.\n run_id (Optional[str]): The id of the run that produced the output.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. 
None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n solid_def (Optional[SolidDefinition]): The definition of the solid that produced the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n log (Optional[DagsterLogManager]): The log manager to use for this output.\n version (Optional[str]): (Experimental) The version of the output.\n resources (Optional[ScopedResources]): The resources required by the output manager, specified by the\n `required_resource_keys` parameter.\n """\n\n def __new__(\n cls,\n step_key: str,\n name: str,\n pipeline_name: str,\n run_id: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n mapping_key: Optional[str] = None,\n config: Any = None,\n solid_def: Optional[SolidDefinition] = None,\n dagster_type: Optional[DagsterType] = None,\n log_manager: Optional[DagsterLogManager] = None,\n version: Optional[str] = None,\n # This is used internally by the intermediate storage adapter, we don't usually expect users to mock this.\n step_context: Optional[SystemStepExecutionContext] = None,\n resource_config: Optional[Any] = None,\n resources: Optional[NamedTuple] = None,\n ):\n\n return super(OutputContext, cls).__new__(\n cls,\n step_key=check.str_param(step_key, "step_key"),\n name=check.str_param(name, "name"),\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n run_id=check.opt_str_param(run_id, "run_id"),\n metadata=check.opt_dict_param(metadata, "metadata"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n config=config,\n solid_def=check.opt_inst_param(solid_def, "solid_def", SolidDefinition),\n dagster_type=check.inst_param(\n resolve_dagster_type(dagster_type), "dagster_type", DagsterType\n ), # this allows the user to mock the context with unresolved dagster type\n log=check.opt_inst_param(log_manager, "log_manager", DagsterLogManager),\n version=check.opt_str_param(version, "version"),\n step_context=check.opt_inst_param(\n step_context, "step_context", SystemStepExecutionContext\n ),\n resource_config=resource_config,\n resources=resources,\n )\n\n[docs] def get_run_scoped_output_identifier(self) -> List[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. 
run id, step key, and output name\n """\n if self.mapping_key:\n return [self.run_id, self.step_key, self.name, self.mapping_key]\n\n return [self.run_id, self.step_key, self.name]\n\n\n[docs]class InputContext(\n namedtuple(\n "_InputContext",\n "name pipeline_name solid_def config metadata upstream_output dagster_type log step_context resource_config resources",\n )\n):\n """\n The ``context`` object available to the load_input method of :py:class:`RootInputManager`.\n\n Attributes:\n name (Optional[str]): The name of the input that we're loading.\n pipeline_name (str): The name of the pipeline.\n solid_def (Optional[SolidDefinition]): The definition of the solid that's loading the input.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n log (Optional[DagsterLogManager]): The log manager to use for this input.\n resource_config (Optional[Dict[str, Any]]): The config associated with the resource that\n initializes the RootInputManager.\n resources (ScopedResources): The resources required by the resource that initializes the\n input manager. If using the :py:func:`@root_input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n """\n\n def __new__(\n cls,\n pipeline_name: str,\n # This will be None when called from calling SolidExecutionResult.output_value\n name: Optional[str] = None,\n solid_def: Optional[SolidDefinition] = None,\n config: Any = None,\n metadata: Optional[Dict[str, Any]] = None,\n upstream_output: Optional[OutputContext] = None,\n dagster_type: Optional[DagsterType] = None,\n log_manager: Optional[DagsterLogManager] = None,\n # This is used internally by the intermediate storage adapter, we don't expect users to mock this.\n step_context: Optional[SystemStepExecutionContext] = None,\n resource_config: Any = None,\n resources: Optional[NamedTuple] = None,\n ):\n\n return super(InputContext, cls).__new__(\n cls,\n name=check.opt_str_param(name, "name"),\n pipeline_name=check.opt_str_param(pipeline_name, "pipeline_name"),\n solid_def=check.opt_inst_param(solid_def, "solid_def", SolidDefinition),\n config=config,\n metadata=metadata,\n upstream_output=check.opt_inst_param(upstream_output, "upstream_output", OutputContext),\n dagster_type=check.inst_param(\n resolve_dagster_type(dagster_type), "dagster_type", DagsterType\n ), # this allows the user to mock the context with unresolved dagster type\n log=check.opt_inst_param(log_manager, "log_manager", DagsterLogManager),\n step_context=check.opt_inst_param(\n step_context, "step_context", SystemStepExecutionContext\n ),\n resource_config=resource_config,\n resources=resources,\n )\n\n\ndef _step_output_version(\n execution_plan: "ExecutionPlan", step_output_handle: StepOutputHandle\n) -> Optional[str]:\n step_output_versions = execution_plan.resolve_step_output_versions()\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n environment_config: EnvironmentConfig,\n step_output_handle: StepOutputHandle,\n run_id: str,\n log_manager: Optional[DagsterLogManager] = None,\n step_context: 
Optional[SystemStepExecutionContext] = None,\n) -> OutputContext:\n """\n Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n from dagster.core.execution.plan.plan import ExecutionPlan\n\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(environment_config, "environment_config", EnvironmentConfig)\n check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)\n check.opt_str_param(run_id, "run_id")\n\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n solid_config = environment_config.solids.get(step.solid_handle.to_string())\n outputs_config = solid_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n io_manager_key = execution_plan.get_step_output(step_output_handle).output_def.io_manager_key\n resource_config = environment_config.resources[io_manager_key].get("config", {})\n\n resources = build_resources_for_manager(io_manager_key, step_context) if step_context else None\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n pipeline_name=execution_plan.pipeline.get_definition().name,\n run_id=run_id,\n metadata=execution_plan.get_step_output(step_output_handle).output_def.metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n solid_def=step.solid.definition,\n dagster_type=execution_plan.get_step_output(step_output_handle).output_def.dagster_type,\n log_manager=log_manager,\n version=_step_output_version(execution_plan, step_output_handle)\n if MEMOIZED_RUN_TAG in execution_plan.pipeline.get_definition().tags\n else None,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n )\n
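# A minimal, hypothetical sketch of an IOManager whose `handle_output` / `load_input` receive the
# OutputContext / InputContext built by `get_output_context` and `for_input_manager` above. The
# storage root is made up, and the 0.10-era `IOManager` / `@io_manager` API is assumed.
import os
import pickle

from dagster import IOManager, io_manager


class RunScopedPickleIOManager(IOManager):
    def _path(self, context):
        # get_run_scoped_output_identifier() -> [run_id, step_key, output_name] (plus the mapping
        # key for mapped outputs), resolving re-executions to the parent run's id.
        return os.path.join("/tmp/dagster_outputs", *context.get_run_scoped_output_identifier())

    def handle_output(self, context, obj):
        path = self._path(context)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    def load_input(self, context):
        # For step inputs, `upstream_output` is the OutputContext of the producing step.
        with open(self._path(context.upstream_output), "rb") as f:
            return pickle.load(f)


@io_manager
def run_scoped_pickle_io_manager(_init_context):
    return RunScopedPickleIOManager()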
\nfrom collections import defaultdict\n\nfrom dagster import check\nfrom dagster.core.definitions import GraphDefinition, PipelineDefinition, Solid, SolidHandle\nfrom dagster.core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent, DagsterEventType\nfrom dagster.core.execution.plan.step import StepKind\n\n\ndef _construct_events_by_step_key(event_list):\n events_by_step_key = defaultdict(list)\n for event in event_list:\n events_by_step_key[event.step_key].append(event)\n\n return dict(events_by_step_key)\n\n\nclass GraphExecutionResult:\n def __init__(\n self,\n container,\n event_list,\n reconstruct_context,\n handle=None,\n resource_instances_to_override=None,\n ):\n self.container = check.inst_param(container, "container", GraphDefinition)\n self.event_list = check.list_param(event_list, "step_event_list", of_type=DagsterEvent)\n self.reconstruct_context = check.callable_param(reconstruct_context, "reconstruct_context")\n self.handle = check.opt_inst_param(handle, "handle", SolidHandle)\n self.resource_instances_to_override = check.opt_dict_param(\n resource_instances_to_override, "resource_instances_to_override", str\n )\n self._events_by_step_key = _construct_events_by_step_key(event_list)\n\n @property\n def success(self):\n """bool: Whether all steps in the execution were successful."""\n return all([not event.is_failure for event in self.event_list])\n\n @property\n def step_event_list(self):\n """List[DagsterEvent] The full list of events generated by steps in the execution.\n\n Excludes events generated by the pipeline lifecycle, e.g., ``PIPELINE_START``.\n """\n return [event for event in self.event_list if event.is_step_event]\n\n @property\n def events_by_step_key(self):\n return self._events_by_step_key\n\n def result_for_solid(self, name):\n """Get the result of a top level solid.\n\n Args:\n name (str): The name of the top-level solid or aliased solid for which to retrieve the\n result.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of the solid\n execution within the pipeline.\n """\n if not self.container.has_solid_named(name):\n raise DagsterInvariantViolationError(\n "Tried to get result for solid '{name}' in '{container}'. No such top level "\n "solid.".format(name=name, container=self.container.name)\n )\n\n return self.result_for_handle(SolidHandle(name, None))\n\n def output_for_solid(self, handle_str, output_name=DEFAULT_OUTPUT):\n """Get the output of a solid by its solid handle string and output name.\n\n Args:\n handle_str (str): The string handle for the solid.\n output_name (str): Optional. 
The name of the output, default to DEFAULT_OUTPUT.\n\n Returns:\n The output value for the handle and output_name.\n """\n check.str_param(handle_str, "handle_str")\n check.str_param(output_name, "output_name")\n return self.result_for_handle(SolidHandle.from_string(handle_str)).output_value(output_name)\n\n @property\n def solid_result_list(self):\n """List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]: The results for each\n top level solid."""\n return [self.result_for_solid(solid.name) for solid in self.container.solids]\n\n def _result_for_handle(self, solid, handle):\n if not solid:\n raise DagsterInvariantViolationError(\n "Can not find solid handle {handle_str}.".format(handle_str=handle.to_string())\n )\n\n events_by_kind = defaultdict(list)\n\n if solid.is_composite:\n events = []\n for event in self.event_list:\n if event.is_step_event:\n if event.solid_handle.is_or_descends_from(handle.with_ancestor(self.handle)):\n events_by_kind[event.step_kind].append(event)\n events.append(event)\n\n return CompositeSolidExecutionResult(\n solid,\n events,\n events_by_kind,\n self.reconstruct_context,\n handle=handle.with_ancestor(self.handle),\n resource_instances_to_override=self.resource_instances_to_override,\n )\n else:\n for event in self.event_list:\n if event.is_step_event:\n if event.solid_handle.is_or_descends_from(handle.with_ancestor(self.handle)):\n events_by_kind[event.step_kind].append(event)\n\n return SolidExecutionResult(\n solid,\n events_by_kind,\n self.reconstruct_context,\n resource_instances_to_override=self.resource_instances_to_override,\n )\n\n def result_for_handle(self, handle):\n """Get the result of a solid by its solid handle.\n\n This allows indexing into top-level solids to retrieve the results of children of\n composite solids.\n\n Args:\n handle (Union[str,SolidHandle]): The handle for the solid.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of the given\n solid.\n """\n if isinstance(handle, str):\n handle = SolidHandle.from_string(handle)\n else:\n check.inst_param(handle, "handle", SolidHandle)\n\n solid = self.container.get_solid(handle)\n\n return self._result_for_handle(solid, handle)\n\n\n[docs]class PipelineExecutionResult(GraphExecutionResult):\n """The result of executing a pipeline.\n\n Returned by :py:func:`execute_pipeline`. 
Users should not instantiate this class.\n """\n\n def __init__(\n self,\n pipeline_def,\n run_id,\n event_list,\n reconstruct_context,\n resource_instances_to_override=None,\n ):\n self.run_id = check.str_param(run_id, "run_id")\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n super(PipelineExecutionResult, self).__init__(\n container=pipeline_def,\n event_list=event_list,\n reconstruct_context=reconstruct_context,\n resource_instances_to_override=resource_instances_to_override,\n )\n\n @property\n def pipeline_def(self):\n return self.container\n\n\n[docs]class CompositeSolidExecutionResult(GraphExecutionResult):\n """Execution result for a composite solid in a pipeline.\n\n Users should not instantiate this class.\n """\n\n def __init__(\n self,\n solid,\n event_list,\n step_events_by_kind,\n reconstruct_context,\n handle=None,\n resource_instances_to_override=None,\n ):\n check.inst_param(solid, "solid", Solid)\n check.invariant(\n solid.is_composite,\n desc="Tried to instantiate a CompositeSolidExecutionResult with a noncomposite solid",\n )\n self.solid = solid\n self.step_events_by_kind = check.dict_param(\n step_events_by_kind, "step_events_by_kind", key_type=StepKind, value_type=list\n )\n self.resource_instances_to_override = check.opt_dict_param(\n resource_instances_to_override, "resource_instances_to_override", str\n )\n super(CompositeSolidExecutionResult, self).__init__(\n container=solid.definition,\n event_list=event_list,\n reconstruct_context=reconstruct_context,\n handle=handle,\n resource_instances_to_override=resource_instances_to_override,\n )\n\n def output_values_for_solid(self, name):\n check.str_param(name, "name")\n return self.result_for_solid(name).output_values\n\n def output_values_for_handle(self, handle_str):\n check.str_param(handle_str, "handle_str")\n\n return self.result_for_handle(handle_str).output_values\n\n def output_value_for_solid(self, name, output_name=DEFAULT_OUTPUT):\n check.str_param(name, "name")\n check.str_param(output_name, "output_name")\n\n return self.result_for_solid(name).output_value(output_name)\n\n def output_value_for_handle(self, handle_str, output_name=DEFAULT_OUTPUT):\n check.str_param(handle_str, "handle_str")\n check.str_param(output_name, "output_name")\n\n return self.result_for_handle(handle_str).output_value(output_name)\n\n @property\n def output_values(self):\n values = {}\n\n for output_name in self.solid.definition.output_dict:\n output_mapping = self.solid.definition.get_output_mapping(output_name)\n\n inner_solid_values = self._result_for_handle(\n self.solid.definition.solid_named(output_mapping.maps_from.solid_name),\n SolidHandle(output_mapping.maps_from.solid_name, None),\n ).output_values\n\n if inner_solid_values is not None: # may be None if inner solid was skipped\n if output_mapping.maps_from.output_name in inner_solid_values:\n values[output_name] = inner_solid_values[output_mapping.maps_from.output_name]\n\n return values\n\n def output_value(self, output_name=DEFAULT_OUTPUT):\n check.str_param(output_name, "output_name")\n\n if not self.solid.definition.has_output(output_name):\n raise DagsterInvariantViolationError(\n "Output '{output_name}' not defined in composite solid '{solid}': "\n "{outputs_clause}. 
If you were expecting this output to be present, you may "\n "be missing an output_mapping from an inner solid to its enclosing composite "\n "solid.".format(\n output_name=output_name,\n solid=self.solid.name,\n outputs_clause="found outputs {output_names}".format(\n output_names=str(list(self.solid.definition.output_dict.keys()))\n )\n if self.solid.definition.output_dict\n else "no output mappings were defined",\n )\n )\n\n output_mapping = self.solid.definition.get_output_mapping(output_name)\n\n return self._result_for_handle(\n self.solid.definition.solid_named(output_mapping.maps_from.solid_name),\n SolidHandle(output_mapping.maps_from.solid_name, None),\n ).output_value(output_mapping.maps_from.output_name)\n\n\n[docs]class SolidExecutionResult:\n """Execution result for a leaf solid in a pipeline.\n\n Users should not instantiate this class.\n """\n\n def __init__(\n self, solid, step_events_by_kind, reconstruct_context, resource_instances_to_override=None\n ):\n check.inst_param(solid, "solid", Solid)\n check.invariant(\n not solid.is_composite,\n desc="Tried to instantiate a SolidExecutionResult with a composite solid",\n )\n self.solid = solid\n self.step_events_by_kind = check.dict_param(\n step_events_by_kind, "step_events_by_kind", key_type=StepKind, value_type=list\n )\n self.reconstruct_context = check.callable_param(reconstruct_context, "reconstruct_context")\n self.resource_instances_to_override = check.opt_dict_param(\n resource_instances_to_override, "resource_instances_to_override", str\n )\n\n @property\n def compute_input_event_dict(self):\n """Dict[str, DagsterEvent]: All events of type ``STEP_INPUT``, keyed by input name."""\n return {se.event_specific_data.input_name: se for se in self.input_events_during_compute}\n\n @property\n def input_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_INPUT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_INPUT)\n\n[docs] def get_output_event_for_compute(self, output_name="result"):\n """The ``STEP_OUTPUT`` event for the given output name.\n\n Throws if not present.\n\n Args:\n output_name (Optional[str]): The name of the output. (default: 'result')\n\n Returns:\n DagsterEvent: The corresponding event.\n """\n events = self.get_output_events_for_compute(output_name)\n check.invariant(\n len(events) == 1, "Multiple output events returned, use get_output_events_for_compute"\n )\n return events[0]\n\n @property\n def compute_output_events_dict(self):\n """Dict[str, List[DagsterEvent]]: All events of type ``STEP_OUTPUT``, keyed by output name"""\n results = defaultdict(list)\n for se in self.output_events_during_compute:\n results[se.step_output_data.output_name].append(se)\n\n return dict(results)\n\n[docs] def get_output_events_for_compute(self, output_name="result"):\n """The ``STEP_OUTPUT`` event for the given output name.\n\n Throws if not present.\n\n Args:\n output_name (Optional[str]): The name of the output. 
(default: 'result')\n\n Returns:\n List[DagsterEvent]: The corresponding events.\n """\n return self.compute_output_events_dict[output_name]\n\n @property\n def output_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_OUTPUT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_OUTPUT)\n\n @property\n def compute_step_events(self):\n """List[DagsterEvent]: All events generated by execution of the solid compute function."""\n return self.step_events_by_kind.get(StepKind.COMPUTE, [])\n\n @property\n def step_events(self):\n return self.compute_step_events\n\n @property\n def materializations_during_compute(self):\n """List[Materialization]: All materializations yielded by the solid."""\n return [\n mat_event.event_specific_data.materialization\n for mat_event in self.materialization_events_during_compute\n ]\n\n @property\n def materialization_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_MATERIALIZATION``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_MATERIALIZATION)\n\n @property\n def expectation_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_EXPECTATION_RESULT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_EXPECTATION_RESULT)\n\n def _compute_steps_of_type(self, dagster_event_type):\n return list(\n filter(lambda se: se.event_type == dagster_event_type, self.compute_step_events)\n )\n\n @property\n def expectation_results_during_compute(self):\n """List[ExpectationResult]: All expectation results yielded by the solid"""\n return [\n expt_event.event_specific_data.expectation_result\n for expt_event in self.expectation_events_during_compute\n ]\n\n[docs] def get_step_success_event(self):\n """DagsterEvent: The ``STEP_SUCCESS`` event, throws if not present."""\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_SUCCESS:\n return step_event\n\n check.failed("Step success not found for solid {}".format(self.solid.name))\n\n @property\n def compute_step_failure_event(self):\n """DagsterEvent: The ``STEP_FAILURE`` event, throws if it did not fail."""\n if self.success:\n raise DagsterInvariantViolationError(\n "Cannot call compute_step_failure_event if successful"\n )\n\n step_failure_events = self._compute_steps_of_type(DagsterEventType.STEP_FAILURE)\n check.invariant(len(step_failure_events) == 1)\n return step_failure_events[0]\n\n @property\n def success(self):\n """bool: Whether solid execution was successful."""\n any_success = False\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_FAILURE:\n return False\n if step_event.event_type == DagsterEventType.STEP_SUCCESS:\n any_success = True\n\n return any_success\n\n @property\n def skipped(self):\n """bool: Whether solid execution was skipped."""\n return all(\n [\n step_event.event_type == DagsterEventType.STEP_SKIPPED\n for step_event in self.compute_step_events\n ]\n )\n\n @property\n def output_values(self):\n """Union[None, Dict[str, Union[Any, Dict[str, Any]]]: The computed output values.\n\n Returns ``None`` if execution did not succeed.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n\n Note that accessing this property will reconstruct the pipeline context (including, e.g.,\n resources) to retrieve materialized output values.\n """\n if not self.success or 
not self.compute_step_events:\n return None\n\n results = {}\n with self.reconstruct_context(self.resource_instances_to_override) as context:\n for compute_step_event in self.compute_step_events:\n if compute_step_event.is_successful_output:\n output = compute_step_event.step_output_data\n step = context.execution_plan.get_step_by_key(compute_step_event.step_key)\n value = self._get_value(context.for_step(step), output)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if results.get(output.output_name) is None:\n results[output.output_name] = {mapping_key: value}\n else:\n results[output.output_name][mapping_key] = value\n else:\n results[output.output_name] = value\n\n return results\n\n[docs] def output_value(self, output_name=DEFAULT_OUTPUT):\n """Get a computed output value.\n\n Note that calling this method will reconstruct the pipeline context (including, e.g.,\n resources) to retrieve materialized output values.\n\n Args:\n output_name(str): The output name for which to retrieve the value. (default: 'result')\n\n Returns:\n Union[None, Any, Dict[str, Any]]: ``None`` if execution did not succeed, the output value\n in the normal case, and a dict of mapping keys to values in the mapped case.\n """\n check.str_param(output_name, "output_name")\n\n if not self.solid.definition.has_output(output_name):\n raise DagsterInvariantViolationError(\n "Output '{output_name}' not defined in solid '{solid}': found outputs "\n "{output_names}".format(\n output_name=output_name,\n solid=self.solid.name,\n output_names=str(list(self.solid.definition.output_dict.keys())),\n )\n )\n\n if not self.success:\n return None\n\n with self.reconstruct_context(self.resource_instances_to_override) as context:\n found = False\n result = None\n for compute_step_event in self.compute_step_events:\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = context.execution_plan.get_step_by_key(compute_step_event.step_key)\n value = self._get_value(context.for_step(step), output)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[mapping_key] = value\n else:\n result = value\n\n if found:\n return result\n\n raise DagsterInvariantViolationError(\n (\n "Did not find result {output_name} in solid {self.solid.name} "\n "execution result"\n ).format(output_name=output_name, self=self)\n )\n\n def _get_value(self, context, step_output_data):\n step_output_handle = step_output_data.step_output_handle\n manager = context.get_io_manager(step_output_handle)\n\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=self.solid.output_def_named(step_output_data.output_name).dagster_type,\n source_handle=step_output_handle,\n )\n )\n return res\n\n @property\n def failure_data(self):\n """Union[None, StepFailureData]: Any data corresponding to this step's failure, if it\n failed."""\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_FAILURE:\n return 
step_event.step_failure_data\n
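# A minimal, hypothetical sketch of inspecting the result classes above after an in-process
# execution (the solid and pipeline names are made up).
from dagster import execute_pipeline, pipeline, solid


@solid
def return_one(_):
    return 1


@solid
def add_one(_, x):
    return x + 1


@pipeline
def math_pipeline():
    add_one(return_one())


result = execute_pipeline(math_pipeline)  # PipelineExecutionResult

assert result.success
solid_result = result.result_for_solid("add_one")  # SolidExecutionResult
# output_value() reconstructs the pipeline context to load the materialized value.
assert solid_result.output_value() == 2
assert result.output_for_solid("add_one") == 2  # same value via the handle-string helper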
\nimport abc\n\n\n[docs]class Executor(abc.ABC): # pylint: disable=no-init\n[docs] @abc.abstractmethod\n def execute(self, pipeline_context, execution_plan):\n """\n For the given context and execution plan, orchestrate a series of sub-plan executions such that the full plan is executed.\n\n Args:\n pipeline_context (SystemPipelineExecutionContext): The pipeline execution context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of dagster events.\n """\n\n @abc.abstractproperty\n def retries(self):\n """\n The Retries state/policy for this executor instance. Executors should allow this to be\n controlled via configuration where possible.\n\n Returns: Retries\n """\n
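# A minimal, hypothetical sketch of the shape of an Executor subclass. A real executor would
# drive step execution and yield DagsterEvents (see the built-in in-process and multiprocess
# executors); this placeholder emits nothing, and `retries` simply returns whatever Retries
# value (dagster.core.execution.retries.Retries) it was constructed with.
from dagster.core.executor.base import Executor


class NoopExecutor(Executor):
    def __init__(self, retries):
        self._retries = retries

    def execute(self, pipeline_context, execution_plan):
        pipeline_context.log.info(
            "NoopExecutor received a plan with {} steps".format(len(execution_plan.steps))
        )
        return iter(())

    @property
    def retries(self):
        return self._retries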
\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.core.definitions import (\n ExecutorDefinition,\n IPipeline,\n IntermediateStorageDefinition,\n ModeDefinition,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.system_config.objects import EnvironmentConfig\n\n\n[docs]class InitExecutorContext(\n namedtuple(\n "InitExecutorContext",\n "pipeline mode_def executor_def pipeline_run environment_config "\n "executor_config intermediate_storage_def instance",\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n pipeline (IPipeline): The pipeline to be executed.\n mode_def (ModeDefinition): The mode in which the pipeline is to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n pipeline_run (PipelineRun): Configuration for this pipeline run.\n environment_config (EnvironmentConfig): The parsed environment configuration for this\n pipeline run.\n executor_config (dict): The parsed config passed to the executor.\n intermediate_storage_def (Optional[IntermediateStorageDefinition]): The intermediate storage definition.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n pipeline,\n mode_def,\n executor_def,\n pipeline_run,\n environment_config,\n executor_config,\n instance,\n intermediate_storage_def=None,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n pipeline=check.inst_param(pipeline, "pipeline", IPipeline),\n mode_def=check.inst_param(mode_def, "mode_def", ModeDefinition),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n pipeline_run=check.inst_param(pipeline_run, "pipeline_run", PipelineRun),\n environment_config=check.inst_param(\n environment_config, "environment_config", EnvironmentConfig\n ),\n executor_config=check.dict_param(executor_config, executor_config, key_type=str),\n intermediate_storage_def=check.opt_inst_param(\n intermediate_storage_def, "intermediate_storage_def", IntermediateStorageDefinition\n ),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )\n\n @property\n def pipeline_def(self):\n return self.pipeline.get_definition()\n
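# A minimal, hypothetical sketch of how executor definitions plug into a mode; each definition's
# creation function receives an InitExecutorContext like the one above when a run starts
# (assumes the `default_executors` list exported by dagster in this version).
from dagster import ModeDefinition, default_executors, execute_pipeline, pipeline, solid


@solid
def noop(_):
    return 1


@pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors)])
def exec_pipeline():
    noop()


# Selecting an executor (and its config) under the `execution` key of run config routes the
# parsed values into InitExecutorContext.executor_config for that executor's creation function.
execute_pipeline(exec_pipeline, run_config={"execution": {"in_process": {}}})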
\nimport logging\nimport os\nimport sys\nimport tempfile\nimport time\nimport warnings\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\n\nimport yaml\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.pipeline import PipelineDefinition, PipelineSubsetDefinition\nfrom dagster.core.errors import (\n DagsterInvariantViolationError,\n DagsterRunAlreadyExists,\n DagsterRunConflict,\n)\nfrom dagster.core.storage.migration.utils import upgrading_instance\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster.core.system_config.objects import EnvironmentConfig\nfrom dagster.core.utils import str_format_list\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.seven import get_current_datetime_in_utc\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\n\ndef _is_dagster_home_set():\n return bool(os.getenv("DAGSTER_HOME"))\n\n\ndef is_memoized_run(tags):\n return tags is not None and MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"\n\n\ndef _dagster_home():\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterInvariantViolationError(\n (\n "The environment variable $DAGSTER_HOME is not set. Dagster requires this "\n "environment variable to be set to an existing directory in your filesystem "\n "that contains your dagster instance configuration file (dagster.yaml).\\n"\n "You can resolve this error by exporting the environment variable."\n "For example, you can run the following command in your shell or "\n "include it in your shell configuration file:\\n"\n '\\texport DAGSTER_HOME="~/dagster_home"'\n )\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem that"\n "contains your dagster instance configuration file (dagster.yaml)."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. 
Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem that "\n "contains your dagster instance configuration file (dagster.yaml)."\n ).format(dagster_home_path)\n )\n\n return dagster_home_path\n\n\ndef _check_run_equality(pipeline_run, candidate_run):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(candidate_run, "candidate_run", PipelineRun)\n\n field_diff = {}\n for field in pipeline_run._fields:\n expected_value = getattr(pipeline_run, field)\n candidate_value = getattr(candidate_run, field)\n if expected_value != candidate_value:\n field_diff[field] = (expected_value, candidate_value)\n\n return field_diff\n\n\ndef _format_field_diff(field_diff):\n return "\\n".join(\n [\n (\n " {field_name}:\\n"\n + " Expected: {expected_value}\\n"\n + " Received: {candidate_value}"\n ).format(\n field_name=field_name,\n expected_value=expected_value,\n candidate_value=candidate_value,\n )\n for field_name, (expected_value, candidate_value,) in field_diff.items()\n ]\n )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n def __init__(self, instance):\n self._instance = instance\n super(_EventListenerLogHandler, self).__init__()\n\n def emit(self, record):\n from dagster.core.events.log import construct_event_record, StructuredLoggerMessage\n\n try:\n event = construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n )\n\n self._instance.handle_new_event(event)\n\n except Exception as e: # pylint: disable=W0703\n logging.critical("Error during instance event listen")\n logging.exception(str(e))\n raise\n\n\nclass InstanceType(Enum):\n PERSISTENT = "PERSISTENT"\n EPHEMERAL = "EPHEMERAL"\n\n\n[docs]class DagsterInstance:\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME`` if set, otherwise fallback\n to using an ephemeral in-memory set of components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for run and event log storage, you can write a ``dagster.yaml``\n such as the following:\n\n .. literalinclude:: ../../../../docs/sections/deploying/postgres_dagster.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster.core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster.core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. 
By default, this will be a\n :py:class:`dagster.core.storage.event_log.SqliteEventLogStorage`. Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (ComputeLogManager): The compute log manager handles stdout and stderr\n logging for solid compute functions. By default, this will be a\n :py:class:`dagster.core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (RunCoordinator): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n _PROCESS_TEMPDIR = None\n\n def __init__(\n self,\n instance_type,\n local_artifact_storage,\n run_storage,\n event_storage,\n compute_log_manager,\n schedule_storage=None,\n scheduler=None,\n run_coordinator=None,\n run_launcher=None,\n settings=None,\n skip_validation_checks=False,\n ref=None,\n ):\n from dagster.core.storage.compute_log_manager import ComputeLogManager\n from dagster.core.storage.event_log import EventLogStorage\n from dagster.core.storage.root import LocalArtifactStorage\n from dagster.core.storage.runs import RunStorage\n from dagster.core.storage.schedules import ScheduleStorage\n from dagster.core.scheduler import Scheduler\n from dagster.core.run_coordinator import RunCoordinator\n from dagster.core.launcher import RunLauncher\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n if self._schedule_storage and not skip_validation_checks:\n self._schedule_storage.validate_stored_schedules(self.scheduler_class)\n\n self._run_coordinator = check.inst_param(run_coordinator, "run_coordinator", RunCoordinator)\n self._run_coordinator.initialize(self)\n self._run_launcher = check.inst_param(run_launcher, "run_launcher", RunLauncher)\n self._run_launcher.initialize(self)\n\n self._settings = check.opt_dict_param(settings, "settings")\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers = defaultdict(list)\n\n # ctors\n\n @staticmethod\n def ephemeral(tempdir=None, preload=None):\n from dagster.core.run_coordinator import DefaultRunCoordinator\n from dagster.core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster.core.storage.event_log import InMemoryEventLogStorage\n from dagster.core.storage.root import LocalArtifactStorage\n from dagster.core.storage.runs 
import InMemoryRunStorage\n from dagster.core.storage.noop_compute_log_manager import NoOpComputeLogManager\n\n if tempdir is None:\n tempdir = DagsterInstance.temp_storage()\n\n return DagsterInstance(\n InstanceType.EPHEMERAL,\n local_artifact_storage=LocalArtifactStorage(tempdir),\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n )\n\n @staticmethod\n def get(fallback_storage=None):\n # 1. Use $DAGSTER_HOME to determine instance if set.\n if _is_dagster_home_set():\n return DagsterInstance.from_config(_dagster_home())\n\n # 2. If that is not set use the fallback storage directory if provided.\n # This allows us to have a nice out of the box dagit experience where runs are persisted\n # across restarts in a tempdir that gets cleaned up when the dagit watchdog process exits.\n elif fallback_storage is not None:\n return DagsterInstance.from_config(fallback_storage)\n\n # 3. If all else fails create an ephemeral in memory instance.\n else:\n return DagsterInstance.ephemeral(fallback_storage)\n\n @staticmethod\n def get_for_migration():\n return DagsterInstance.from_config(_dagster_home(), skip_validation_checks=True)\n\n @staticmethod\n def local_temp(tempdir=None, overrides=None):\n warnings.warn(\n "To create a local DagsterInstance for a test, use the instance_for_test "\n "context manager instead, which ensures that resoures are cleaned up afterwards"\n )\n\n if tempdir is None:\n tempdir = DagsterInstance.temp_storage()\n\n return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))\n\n @staticmethod\n def from_config(\n config_dir, config_filename=DAGSTER_CONFIG_YAML_FILENAME, skip_validation_checks=False\n ):\n instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)\n return DagsterInstance.from_ref(instance_ref, skip_validation_checks=skip_validation_checks)\n\n @staticmethod\n def from_ref(instance_ref, skip_validation_checks=False):\n check.inst_param(instance_ref, "instance_ref", InstanceRef)\n return DagsterInstance(\n instance_type=InstanceType.PERSISTENT,\n local_artifact_storage=instance_ref.local_artifact_storage,\n run_storage=instance_ref.run_storage,\n event_storage=instance_ref.event_storage,\n compute_log_manager=instance_ref.compute_log_manager,\n schedule_storage=instance_ref.schedule_storage,\n scheduler=instance_ref.scheduler,\n run_coordinator=instance_ref.run_coordinator,\n run_launcher=instance_ref.run_launcher,\n settings=instance_ref.settings,\n skip_validation_checks=skip_validation_checks,\n ref=instance_ref,\n )\n\n # flags\n\n @property\n def is_persistent(self):\n return self._instance_type == InstanceType.PERSISTENT\n\n @property\n def is_ephemeral(self):\n return self._instance_type == InstanceType.EPHEMERAL\n\n def get_ref(self):\n if self._ref:\n return self._ref\n\n check.failed(\n "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "\n "process communication.{dagster_home_msg}".format(\n inst_type=self._instance_type,\n dagster_home_msg="\\nDAGSTER_HOME environment variable is not set, set it to "\n "a directory on the filesystem for dagster to use for storage and cross "\n "process coordination."\n if os.getenv("DAGSTER_HOME") is None\n else "",\n )\n )\n\n @property\n def root_directory(self):\n return self._local_artifact_storage.base_dir\n\n @staticmethod\n def temp_storage():\n if 
DagsterInstance._PROCESS_TEMPDIR is None:\n DagsterInstance._PROCESS_TEMPDIR = tempfile.TemporaryDirectory()\n return DagsterInstance._PROCESS_TEMPDIR.name\n\n def _info(self, component):\n # ConfigurableClass may not have inst_data if it's a direct instantiation\n # which happens for ephemeral instances\n if isinstance(component, ConfigurableClass) and component.inst_data:\n return component.inst_data.info_dict()\n if type(component) is dict:\n return component\n return component.__class__.__name__\n\n def _info_str_for_component(self, component_name, component):\n return yaml.dump(\n {component_name: self._info(component)}, default_flow_style=False, sort_keys=False\n )\n\n def info_dict(self):\n\n settings = self._settings if self._settings else {}\n\n ret = {\n "local_artifact_storage": self._info(self._local_artifact_storage),\n "run_storage": self._info(self._run_storage),\n "event_log_storage": self._info(self._event_storage),\n "compute_logs": self._info(self._compute_log_manager),\n "schedule_storage": self._info(self._schedule_storage),\n "scheduler": self._info(self._scheduler),\n "run_coordinator": self._info(self._run_coordinator),\n "run_launcher": self._info(self._run_launcher),\n }\n ret.update(\n {\n settings_key: self._info(settings_value)\n for settings_key, settings_value in settings.items()\n }\n )\n\n return ret\n\n def info_str(self):\n return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)\n\n # schedule storage\n\n @property\n def schedule_storage(self):\n return self._schedule_storage\n\n # schedule storage\n\n @property\n def scheduler(self):\n return self._scheduler\n\n @property\n def scheduler_class(self):\n return self.scheduler.__class__.__name__ if self.scheduler else None\n\n # run coordinator\n\n @property\n def run_coordinator(self):\n return self._run_coordinator\n\n # run launcher\n\n @property\n def run_launcher(self):\n return self._run_launcher\n\n # compute logs\n\n @property\n def compute_log_manager(self):\n return self._compute_log_manager\n\n def get_settings(self, settings_key):\n check.str_param(settings_key, "settings_key")\n if self._settings and settings_key in self._settings:\n return self._settings.get(settings_key)\n return {}\n\n @property\n def telemetry_enabled(self):\n if self.is_ephemeral:\n return False\n\n dagster_telemetry_enabled_default = True\n\n telemetry_settings = self.get_settings("telemetry")\n\n if not telemetry_settings:\n return dagster_telemetry_enabled_default\n\n if "enabled" in telemetry_settings:\n return telemetry_settings["enabled"]\n else:\n return dagster_telemetry_enabled_default\n\n def upgrade(self, print_fn=lambda _: None):\n with upgrading_instance(self):\n\n print_fn("Updating run storage...")\n self._run_storage.upgrade()\n self._run_storage.build_missing_indexes()\n\n print_fn("Updating event storage...")\n self._event_storage.upgrade()\n\n print_fn("Updating schedule storage...")\n self._schedule_storage.upgrade()\n\n def optimize_for_dagit(self, statement_timeout):\n self._run_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n self._event_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n if self._schedule_storage:\n self._schedule_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n\n def reindex(self, print_fn=lambda _: None):\n print_fn("Checking for reindexing...")\n self._event_storage.reindex(print_fn)\n self._run_storage.reindex(print_fn)\n print_fn("Done.")\n\n def dispose(self):\n self._run_storage.dispose()\n 
self.run_coordinator.dispose()\n self._run_launcher.dispose()\n self._event_storage.dispose()\n self._compute_log_manager.dispose()\n\n # run storage\n\n def get_run_by_id(self, run_id):\n return self._run_storage.get_run_by_id(run_id)\n\n def get_pipeline_snapshot(self, snapshot_id):\n return self._run_storage.get_pipeline_snapshot(snapshot_id)\n\n def has_pipeline_snapshot(self, snapshot_id):\n return self._run_storage.has_pipeline_snapshot(snapshot_id)\n\n def get_historical_pipeline(self, snapshot_id):\n from dagster.core.host_representation import HistoricalPipeline\n\n snapshot = self._run_storage.get_pipeline_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_pipeline_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalPipeline(\n self._run_storage.get_pipeline_snapshot(snapshot_id), snapshot_id, parent_snapshot\n )\n\n def has_historical_pipeline(self, snapshot_id):\n return self._run_storage.has_pipeline_snapshot(snapshot_id)\n\n def get_execution_plan_snapshot(self, snapshot_id):\n return self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n def get_run_stats(self, run_id):\n return self._event_storage.get_stats_for_run(run_id)\n\n def get_run_step_stats(self, run_id, step_keys=None):\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n def get_run_tags(self):\n return self._run_storage.get_run_tags()\n\n def get_run_group(self, run_id):\n return self._run_storage.get_run_group(run_id)\n\n def create_run_for_pipeline(\n self,\n pipeline_def,\n execution_plan=None,\n run_id=None,\n run_config=None,\n mode=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n solid_selection=None,\n ):\n from dagster.core.execution.api import create_execution_plan\n from dagster.core.execution.plan.plan import ExecutionPlan\n from dagster.core.snap import snapshot_from_execution_plan\n\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that solids_to_execute is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # solid_selection is not required and will not be converted to solids_to_execute here.\n # i.e. 
this function doesn't handle solid queries.\n # solid_selection is only used to pass the user queries further down.\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n\n if solids_to_execute:\n if isinstance(pipeline_def, PipelineSubsetDefinition):\n # for the case when pipeline_def is created by IPipeline or ExternalPipeline\n check.invariant(\n solids_to_execute == pipeline_def.solids_to_execute,\n "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "\n "that conflicts with solids_to_execute arg {solids_to_execute}".format(\n pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),\n solids_to_execute=str_format_list(solids_to_execute),\n ),\n )\n else:\n # for cases when `create_run_for_pipeline` is directly called\n pipeline_def = pipeline_def.get_pipeline_subset_def(\n solids_to_execute=solids_to_execute\n )\n\n full_execution_plan = execution_plan or create_execution_plan(\n pipeline_def, run_config=run_config, mode=mode,\n )\n check.invariant(\n len(full_execution_plan.step_keys_to_execute) == len(full_execution_plan.steps)\n )\n\n if is_memoized_run(tags):\n from dagster.core.execution.resolve_versions import resolve_memoized_execution_plan\n\n if step_keys_to_execute:\n raise DagsterInvariantViolationError(\n "step_keys_to_execute parameter cannot be used in conjunction with memoized "\n "pipeline runs."\n )\n\n subsetted_execution_plan = resolve_memoized_execution_plan(\n full_execution_plan\n ) # TODO: tighter integration with existing step_keys_to_execute functionality\n step_keys_to_execute = subsetted_execution_plan.step_keys_to_execute\n else:\n subsetted_execution_plan = (\n full_execution_plan.build_subset_plan(step_keys_to_execute)\n if step_keys_to_execute\n else full_execution_plan\n )\n\n return self.create_run(\n pipeline_name=pipeline_def.name,\n run_id=run_id,\n run_config=run_config,\n mode=check.opt_str_param(mode, "mode", default=pipeline_def.get_default_mode_name()),\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n subsetted_execution_plan, pipeline_def.get_pipeline_snapshot_id()\n ),\n parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),\n )\n\n def _construct_run_with_snapshots(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n status,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n external_pipeline_origin=None,\n ):\n\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc().isoformat()\n\n check.invariant(\n not (not pipeline_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot. 
"\n "It is possible to have no execution plan snapshot since we persist runs "\n "that do not successfully compile execution plans in the scheduled case.",\n )\n\n pipeline_snapshot_id = (\n self._ensure_persisted_pipeline_snapshot(pipeline_snapshot, parent_pipeline_snapshot)\n if pipeline_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, pipeline_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and pipeline_snapshot_id\n else None\n )\n\n return PipelineRun(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_pipeline_origin=external_pipeline_origin,\n )\n\n def _ensure_persisted_pipeline_snapshot(self, pipeline_snapshot, parent_pipeline_snapshot):\n from dagster.core.snap import create_pipeline_snapshot_id, PipelineSnapshot\n\n check.inst_param(pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot)\n check.opt_inst_param(parent_pipeline_snapshot, "parent_pipeline_snapshot", PipelineSnapshot)\n\n if pipeline_snapshot.lineage_snapshot:\n if not self._run_storage.has_pipeline_snapshot(\n pipeline_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_pipeline_snapshot_id(parent_pipeline_snapshot)\n == pipeline_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(\n parent_pipeline_snapshot\n )\n check.invariant(\n pipeline_snapshot.lineage_snapshot.parent_snapshot_id\n == returned_pipeline_snapshot_id\n )\n\n pipeline_snapshot_id = create_pipeline_snapshot_id(pipeline_snapshot)\n if not self._run_storage.has_pipeline_snapshot(pipeline_snapshot_id):\n returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(\n pipeline_snapshot\n )\n check.invariant(pipeline_snapshot_id == returned_pipeline_snapshot_id)\n\n return pipeline_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self, execution_plan_snapshot, pipeline_snapshot_id, step_keys_to_execute\n ):\n from dagster.core.snap.execution_plan_snapshot import (\n ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.pipeline_snapshot_id == pipeline_snapshot_id,\n (\n "Snapshot mismatch: Snapshot ID in execution plan snapshot is "\n '"{ep_pipeline_snapshot_id}" and snapshot_id created in memory is '\n '"{pipeline_snapshot_id}"'\n ).format(\n ep_pipeline_snapshot_id=execution_plan_snapshot.pipeline_snapshot_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n ),\n )\n\n check.invariant(\n set(step_keys_to_execute) == set(execution_plan_snapshot.step_keys_to_execute)\n if step_keys_to_execute\n else set(execution_plan_snapshot.step_keys_to_execute)\n == set([step.key for step in execution_plan_snapshot.steps]),\n "We encode step_keys_to_execute twice in our stack, unfortunately. 
This check "\n "ensures that they are consistent. We check that step_keys_to_execute in the plan "\n "matches the step_keys_to_execute params if it is set. If it is not, this indicates "\n "a full execution plan, and so we verify that.",\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def create_run(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n status,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n external_pipeline_origin=None,\n ):\n\n pipeline_run = self._construct_run_with_snapshots(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n external_pipeline_origin=external_pipeline_origin,\n )\n return self._run_storage.add_run(pipeline_run)\n\n def register_managed_run(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n ):\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # PipelineRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(pipeline_run). When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n pipeline_run = self._construct_run_with_snapshots(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=PipelineRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n )\n\n def get_run():\n candidate_run = self.get_run_by_id(pipeline_run.run_id)\n\n field_diff = _check_run_equality(pipeline_run, candidate_run)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. 
Runs differ in:"\n "\\n{field_diff}".format(\n run_id=pipeline_run.run_id, field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run\n\n if self.has_run(pipeline_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(pipeline_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n def add_run(self, pipeline_run):\n return self._run_storage.add_run(pipeline_run)\n\n def handle_run_event(self, run_id, event):\n return self._run_storage.handle_run_event(run_id, event)\n\n def add_run_tags(self, run_id, new_tags):\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n def has_run(self, run_id):\n return self._run_storage.has_run(run_id)\n\n def get_runs(self, filters=None, cursor=None, limit=None):\n return self._run_storage.get_runs(filters, cursor, limit)\n\n def get_runs_count(self, filters=None):\n return self._run_storage.get_runs_count(filters)\n\n def get_run_groups(self, filters=None, cursor=None, limit=None):\n return self._run_storage.get_run_groups(filters=filters, cursor=cursor, limit=limit)\n\n def wipe(self):\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n def delete_run(self, run_id):\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)\n\n # event storage\n\n def logs_after(self, run_id, cursor):\n return self._event_storage.get_logs_for_run(run_id, cursor=cursor)\n\n def all_logs(self, run_id):\n return self._event_storage.get_logs_for_run(run_id)\n\n def watch_event_logs(self, run_id, cursor, cb):\n return self._event_storage.watch(run_id, cursor, cb)\n\n # asset storage\n\n @property\n def is_asset_aware(self):\n return self._event_storage.is_asset_aware\n\n def check_asset_aware(self):\n check.invariant(\n self.is_asset_aware,\n (\n "Asset queries can only be performed on instances with asset-aware event log "\n "storage. 
Use `instance.is_asset_aware` to verify that the instance is configured "\n "with an EventLogStorage that implements `AssetAwareEventLogStorage`"\n ),\n )\n\n def all_asset_keys(self, prefix_path=None):\n self.check_asset_aware()\n return self._event_storage.get_all_asset_keys(prefix_path)\n\n def has_asset_key(self, asset_key):\n self.check_asset_aware()\n return self._event_storage.has_asset_key(asset_key)\n\n def events_for_asset_key(\n self, asset_key, partitions=None, cursor=None, limit=None, ascending=False\n ):\n check.inst_param(asset_key, "asset_key", AssetKey)\n self.check_asset_aware()\n return self._event_storage.get_asset_events(\n asset_key, partitions, cursor, limit, ascending=ascending, include_cursor=True\n )\n\n def run_ids_for_asset_key(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n self.check_asset_aware()\n return self._event_storage.get_asset_run_ids(asset_key)\n\n def wipe_assets(self, asset_keys):\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n self.check_asset_aware()\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)\n\n # event subscriptions\n\n def get_logger(self):\n logger = logging.Logger("__event_listener")\n logger.addHandler(_EventListenerLogHandler(self))\n logger.setLevel(10)\n return logger\n\n def handle_new_event(self, event):\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.dagster_event.is_pipeline_event:\n self._run_storage.handle_run_event(run_id, event.dagster_event)\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id, cb):\n self._subscribers[run_id].append(cb)\n\n[docs] def report_engine_event(\n self, message, pipeline_run, engine_event_data=None, cls=None, step_key=None,\n ):\n """\n Report a EngineEvent that occurred outside of a pipeline execution context.\n """\n from dagster.core.events import EngineEventData, DagsterEvent, DagsterEventType\n from dagster.core.events.log import DagsterEventRecord\n\n check.class_param(cls, "cls")\n check.str_param(message, "message")\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n engine_event_data = check.opt_inst_param(\n engine_event_data, "engine_event_data", EngineEventData, EngineEventData([]),\n )\n\n if cls:\n message = "[{}] {}".format(cls.__name__, message)\n\n log_level = logging.INFO\n if engine_event_data and engine_event_data.error:\n log_level = logging.ERROR\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=pipeline_run.pipeline_name,\n message=message,\n event_specific_data=engine_event_data,\n )\n event_record = DagsterEventRecord(\n message=message,\n user_message=message,\n level=log_level,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n step_key=step_key,\n dagster_event=dagster_event,\n )\n\n self.handle_new_event(event_record)\n return dagster_event\n\n def report_run_canceling(self, run, message=None):\n\n from dagster.core.events import DagsterEvent, DagsterEventType\n from dagster.core.events.log import DagsterEventRecord\n\n check.inst_param(run, "run", PipelineRun)\n message = check.opt_str_param(message, "message", "Sending pipeline termination request.",)\n canceling_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n pipeline_name=run.pipeline_name,\n message=message,\n )\n\n event_record = DagsterEventRecord(\n message=message,\n 
user_message="",\n level=logging.INFO,\n pipeline_name=run.pipeline_name,\n run_id=run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=canceling_event,\n )\n\n self.handle_new_event(event_record)\n\n def report_run_canceled(\n self, pipeline_run, message=None,\n ):\n from dagster.core.events import DagsterEvent, DagsterEventType\n from dagster.core.events.log import DagsterEventRecord\n\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n message = check.opt_str_param(\n message,\n "mesage",\n "This pipeline run has been marked as canceled from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n pipeline_name=pipeline_run.pipeline_name,\n message=message,\n )\n event_record = DagsterEventRecord(\n message=message,\n user_message=message,\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=dagster_event,\n )\n\n self.handle_new_event(event_record)\n return dagster_event\n\n def report_run_failed(self, pipeline_run, message=None):\n from dagster.core.events import DagsterEvent, DagsterEventType\n from dagster.core.events.log import DagsterEventRecord\n\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n message = check.opt_str_param(\n message,\n "message",\n "This pipeline run has been marked as failed from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n pipeline_name=pipeline_run.pipeline_name,\n message=message,\n )\n event_record = DagsterEventRecord(\n message=message,\n user_message=message,\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=dagster_event,\n )\n\n self.handle_new_event(event_record)\n return dagster_event\n\n # directories\n\n def file_manager_directory(self, run_id):\n return self._local_artifact_storage.file_manager_dir(run_id)\n\n def intermediates_directory(self, run_id):\n return self._local_artifact_storage.intermediates_dir(run_id)\n\n def schedules_directory(self):\n return self._local_artifact_storage.schedules_dir\n\n # Runs coordinator\n\n[docs] def submit_run(self, run_id, external_pipeline):\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. 
They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n\n from dagster.core.host_representation import ExternalPipelineOrigin\n\n run = self.get_run_by_id(run_id)\n check.inst(\n run.external_pipeline_origin,\n ExternalPipelineOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self._run_coordinator.submit_run(\n run, external_pipeline=external_pipeline\n )\n except:\n from dagster.core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message, run, EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run\n\n # Run launcher\n\n[docs] def launch_run(self, run_id, external_pipeline):\n """Launch a pipeline run.\n\n This method is typically called using `instance.submit_run` rather than being invoked\n directly. This method delegates to the ``RunLauncher``, if any, configured on the instance,\n and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n the specified run. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n ``PipelineRunStatus.NOT_STARTED`` state.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n run = self.get_run_by_id(run_id)\n\n from dagster.core.events import EngineEventData, DagsterEvent, DagsterEventType\n from dagster.core.events.log import DagsterEventRecord\n\n launch_started_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n pipeline_name=run.pipeline_name,\n )\n\n event_record = DagsterEventRecord(\n message="",\n user_message="",\n level=logging.INFO,\n pipeline_name=run.pipeline_name,\n run_id=run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=launch_started_event,\n )\n\n self.handle_new_event(event_record)\n\n run = self.get_run_by_id(run_id)\n\n try:\n self._run_launcher.launch_run(self, run, external_pipeline=external_pipeline)\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message, run, EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n # Scheduler\n\n def reconcile_scheduler_state(self, external_repository):\n return self._scheduler.reconcile_scheduler_state(self, external_repository)\n\n def start_schedule_and_update_storage_state(self, external_schedule):\n return self._scheduler.start_schedule_and_update_storage_state(self, external_schedule)\n\n def stop_schedule_and_update_storage_state(self, schedule_origin_id):\n return self._scheduler.stop_schedule_and_update_storage_state(self, schedule_origin_id)\n\n def stop_schedule_and_delete_from_storage(self, schedule_origin_id):\n return self._scheduler.stop_schedule_and_delete_from_storage(self, schedule_origin_id)\n\n def running_schedule_count(self, schedule_origin_id):\n if self._scheduler:\n return self._scheduler.running_schedule_count(self, schedule_origin_id)\n return 0\n\n def scheduler_debug_info(self):\n from dagster.core.scheduler import SchedulerDebugInfo\n from dagster.core.definitions.job import JobType\n from dagster.core.scheduler.job import JobStatus\n\n errors = []\n\n schedules = []\n for schedule_state in self.all_stored_job_state(job_type=JobType.SCHEDULE):\n if schedule_state.status == JobStatus.RUNNING and not self.running_schedule_count(\n 
schedule_state.job_origin_id\n ):\n errors.append(\n "Schedule {schedule_name} is set to be running, but the scheduler is not "\n "running the schedule.".format(schedule_name=schedule_state.job_name)\n )\n elif schedule_state.status == JobStatus.STOPPED and self.running_schedule_count(\n schedule_state.job_origin_id\n ):\n errors.append(\n "Schedule {schedule_name} is set to be stopped, but the scheduler is still running "\n "the schedule.".format(schedule_name=schedule_state.job_name)\n )\n\n if self.running_schedule_count(schedule_state.job_origin_id) > 1:\n errors.append(\n "Duplicate jobs found: More than one job for schedule {schedule_name} are "\n "running on the scheduler.".format(schedule_name=schedule_state.job_name)\n )\n\n schedule_info = {\n schedule_state.job_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.job_specific_data.cron_schedule,\n "repository_pointer": schedule_state.origin.get_repo_cli_args(),\n "schedule_origin_id": schedule_state.job_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(),\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule Storage\n\n def start_sensor(self, external_sensor):\n from dagster.core.scheduler.job import JobState, JobStatus, SensorJobData\n from dagster.core.definitions.job import JobType\n\n job_state = self.get_job_state(external_sensor.get_external_origin_id())\n\n if not job_state:\n self.add_job_state(\n JobState(\n external_sensor.get_external_origin(),\n JobType.SENSOR,\n JobStatus.RUNNING,\n SensorJobData(datetime.utcnow().timestamp()),\n )\n )\n elif job_state.status != JobStatus.RUNNING:\n # set the last completed time to the modified state time\n self.update_job_state(\n job_state.with_status(JobStatus.RUNNING).with_data(\n SensorJobData(datetime.utcnow().timestamp())\n )\n )\n\n def stop_sensor(self, job_origin_id):\n from dagster.core.scheduler.job import JobStatus, SensorJobData\n\n job_state = self.get_job_state(job_origin_id)\n if job_state:\n self.update_job_state(\n job_state.with_status(JobStatus.STOPPED).with_data(\n SensorJobData(datetime.utcnow().timestamp())\n )\n )\n\n def all_stored_job_state(self, repository_origin_id=None, job_type=None):\n return self._schedule_storage.all_stored_job_state(repository_origin_id, job_type)\n\n def get_job_state(self, job_origin_id):\n return self._schedule_storage.get_job_state(job_origin_id)\n\n def add_job_state(self, job_state):\n return self._schedule_storage.add_job_state(job_state)\n\n def update_job_state(self, job_state):\n return self._schedule_storage.update_job_state(job_state)\n\n def delete_job_state(self, job_origin_id):\n return self._schedule_storage.delete_job_state(job_origin_id)\n\n def get_job_ticks(self, job_origin_id):\n return self._schedule_storage.get_job_ticks(job_origin_id)\n\n def get_latest_job_tick(self, job_origin_id):\n return self._schedule_storage.get_latest_job_tick(job_origin_id)\n\n def create_job_tick(self, job_tick_data):\n return self._schedule_storage.create_job_tick(job_tick_data)\n\n def update_job_tick(self, tick):\n return self._schedule_storage.update_job_tick(tick)\n\n def get_job_tick_stats(self, job_origin_id):\n return self._schedule_storage.get_job_tick_stats(job_origin_id)\n\n def purge_job_ticks(self, job_origin_id, 
tick_status, before):\n self._schedule_storage.purge_job_ticks(job_origin_id, tick_status, before)\n\n def wipe_all_schedules(self):\n if self._scheduler:\n self._scheduler.wipe(self)\n\n self._schedule_storage.wipe()\n\n def logs_path_for_schedule(self, schedule_origin_id):\n return self._scheduler.get_logs_path(self, schedule_origin_id)\n\n def __enter__(self):\n return self\n\n def __exit__(self, exception_type, exception_value, traceback):\n self.dispose()\n\n[docs] def get_addresses_for_step_output_versions(self, step_output_versions):\n """\n For each given step output, finds whether an output exists with the given\n version, and returns its address if it does.\n\n Args:\n step_output_versions (Dict[(str, StepOutputHandle), str]):\n (pipeline name, step output handle) -> version.\n\n Returns:\n Dict[(str, StepOutputHandle), str]: (pipeline name, step output handle) -> address.\n For each step output, an address if there is one and None otherwise.\n """\n return self._event_storage.get_addresses_for_step_output_versions(step_output_versions)\n\n # dagster daemon\n[docs] def add_daemon_heartbeat(self, daemon_heartbeat):\n """Called on a regular interval by the daemon"""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)\n\n[docs] def get_daemon_heartbeats(self):\n """Latest heartbeats of all daemon types"""\n return self._run_storage.get_daemon_heartbeats()\n\n def wipe_daemon_heartbeats(self):\n self._run_storage.wipe_daemon_heartbeats()\n
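A minimal usage sketch of the DagsterInstance constructors documented above; the import path follows the module shown in this file and the run id is a placeholder:

from dagster.core.instance import DagsterInstance

# Ephemeral, in-memory instance: storages vanish when the process exits.
ephemeral_instance = DagsterInstance.ephemeral()

# get() resolves in order: $DAGSTER_HOME/dagster.yaml if set, then the optional
# fallback storage directory, otherwise an ephemeral in-memory instance.
instance = DagsterInstance.get()

print(instance.info_str())                # YAML summary of the configured components
run = instance.get_run_by_id("<run-id>")  # placeholder run id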
\nimport os\nfrom collections import namedtuple\n\nimport yaml\nfrom dagster import check\nfrom dagster.serdes import ConfigurableClassData, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\n\ndef _runs_directory(base):\n return os.path.join(base, "history", "")\n\n\ndef compute_logs_directory(base):\n return os.path.join(base, "storage")\n\n\ndef _event_logs_directory(base):\n return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base):\n return os.path.join(base, "schedules")\n\n\ndef configurable_class_data_or_default(config_value, field_name, default):\n if config_value.get(field_name):\n return ConfigurableClassData(\n config_value[field_name]["module"],\n config_value[field_name]["class"],\n yaml.dump(config_value[field_name].get("config") or {}, default_flow_style=False),\n )\n return default\n\n\n[docs]@whitelist_for_serdes\nclass InstanceRef(\n namedtuple(\n "_InstanceRef",\n "local_artifact_storage_data run_storage_data event_storage_data compute_logs_data "\n "schedule_storage_data scheduler_data run_coordinator_data run_launcher_data settings",\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data,\n run_storage_data,\n event_storage_data,\n compute_logs_data,\n schedule_storage_data,\n scheduler_data,\n run_coordinator_data,\n run_launcher_data,\n settings,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", ConfigurableClassData\n ),\n run_storage_data=check.inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_dict_param(settings, "settings"),\n )\n\n @staticmethod\n def from_dir(base_dir, config_filename=DAGSTER_CONFIG_YAML_FILENAME, overrides=None):\n overrides = check.opt_dict_param(overrides, "overrides")\n config_value = dagster_instance_config(\n base_dir, config_filename=config_filename, overrides=overrides\n )\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value,\n "local_artifact_storage",\n ConfigurableClassData(\n "dagster.core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n )\n\n run_storage_data = configurable_class_data_or_default(\n config_value,\n "run_storage",\n ConfigurableClassData(\n "dagster.core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n ),\n )\n\n event_storage_data = configurable_class_data_or_default(\n config_value,\n "event_log_storage",\n ConfigurableClassData(\n "dagster.core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": 
_event_logs_directory(base_dir)}, default_flow_style=False),\n ),\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n ConfigurableClassData(\n "dagster.core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n )\n\n schedule_storage_data = configurable_class_data_or_default(\n config_value,\n "schedule_storage",\n ConfigurableClassData(\n "dagster.core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n ),\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value,\n "scheduler",\n ConfigurableClassData(\n "dagster.core.scheduler", "DagsterDaemonScheduler", yaml.dump({}),\n ),\n )\n\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n ConfigurableClassData(\n "dagster.core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n ConfigurableClassData("dagster", "DefaultRunLauncher", yaml.dump({}),),\n )\n\n settings_keys = {"telemetry"}\n settings = {key: config_value.get(key) for key in settings_keys}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data,\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data,\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n settings=settings,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self):\n return self.local_artifact_storage_data.rehydrate()\n\n @property\n def run_storage(self):\n return self.run_storage_data.rehydrate()\n\n @property\n def event_storage(self):\n return self.event_storage_data.rehydrate()\n\n @property\n def compute_log_manager(self):\n return self.compute_logs_data.rehydrate()\n\n @property\n def schedule_storage(self):\n return self.schedule_storage_data.rehydrate() if self.schedule_storage_data else None\n\n @property\n def scheduler(self):\n return self.scheduler_data.rehydrate() if self.scheduler_data else None\n\n @property\n def run_coordinator(self):\n return self.run_coordinator_data.rehydrate() if self.run_coordinator_data else None\n\n @property\n def run_launcher(self):\n return self.run_launcher_data.rehydrate() if self.run_launcher_data else None\n\n def to_dict(self):\n return self._asdict()\n
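A hedged sketch of InstanceRef.from_dir as defined above, assuming a scratch directory stands in for $DAGSTER_HOME and that overrides are merged into the on-disk config before the per-component defaults are applied:

import tempfile

from dagster.core.instance.ref import InstanceRef

# A scratch directory stands in for a real $DAGSTER_HOME; keys missing from
# dagster.yaml fall back to the SQLite/local defaults constructed above.
dagster_home = tempfile.mkdtemp()
ref = InstanceRef.from_dir(dagster_home, overrides={"telemetry": {"enabled": False}})

# Each *_data field is ConfigurableClassData; the matching property rehydrates
# it by importing the module and instantiating the class with its stored config.
run_storage = ref.run_storage
ref_dict = ref.to_dict()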
\nfrom abc import ABC, abstractmethod\n\n\n[docs]class RunLauncher(ABC):\n def initialize(self, instance):\n """\n Perform any initialization that depends on the surrounding DagsterInstance.\n\n Args:\n instance (DagsterInstance): The instance in which the run has been created.\n """\n\n @abstractmethod\n def launch_run(self, instance, run, external_pipeline):\n """Launch a run.\n\n This method should begin the execution of the specified run, and may emit engine events.\n Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. Typically, this method will\n not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n Args:\n instance (DagsterInstance): The instance in which the run has been created.\n run (PipelineRun): The PipelineRun to launch.\n external_pipeline (ExternalPipeline): The pipeline that is being launched (currently\n optional during migration)\n\n Returns:\n PipelineRun: The launched run.\n """\n\n @abstractmethod\n def can_terminate(self, run_id):\n """\n Can this run_id be terminated by this run launcher.\n """\n\n @abstractmethod\n def terminate(self, run_id):\n """\n Terminates a process.\n\n Returns False is the process was already terminated. Returns true if\n the process was alive and was successfully terminated\n """\n\n def dispose(self):\n """\n Do any resource cleanup that should happen when the DagsterInstance is\n cleaning itself up.\n """\n\n def join(self, timeout=30):\n pass\n
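A minimal sketch of implementing the abstract interface above; NoopRunLauncher is a hypothetical class that only records run ids rather than launching anything, and it leans on the non-abstract defaults for initialize, dispose, and join:

from dagster.core.launcher import RunLauncher

class NoopRunLauncher(RunLauncher):
    """Hypothetical launcher: records launch requests instead of starting processes."""

    def __init__(self):
        self._launched_run_ids = []

    def launch_run(self, instance, run, external_pipeline):
        # A real launcher would begin execution here and may emit engine events.
        self._launched_run_ids.append(run.run_id)
        return run

    def can_terminate(self, run_id):
        return False

    def terminate(self, run_id):
        return False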
\nimport time\nimport weakref\n\nimport grpc\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterLaunchFailedError\nfrom dagster.core.host_representation import ExternalPipeline\nfrom dagster.core.host_representation.handle import (\n GrpcServerRepositoryLocationHandle,\n ManagedGrpcPythonEnvRepositoryLocationHandle,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import GRPC_INFO_TAG\nfrom dagster.grpc.client import DagsterGrpcClient\nfrom dagster.grpc.types import (\n CanCancelExecutionRequest,\n CancelExecutionRequest,\n ExecuteExternalPipelineArgs,\n)\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.utils import merge_dicts\n\nfrom .base import RunLauncher\n\nGRPC_REPOSITORY_LOCATION_HANDLE_TYPES = (\n GrpcServerRepositoryLocationHandle,\n ManagedGrpcPythonEnvRepositoryLocationHandle,\n)\n\n\n[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers.\n """\n\n def __init__(self, inst_data=None):\n self._instance_weakref = None\n self._inst_data = inst_data\n\n # Used for test cleanup purposes only\n self._run_id_to_repository_location_handle_cache = {}\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DefaultRunLauncher(inst_data=inst_data)\n\n @property\n def _instance(self):\n return self._instance_weakref() if self._instance_weakref else None\n\n def initialize(self, instance):\n check.inst_param(instance, "instance", DagsterInstance)\n check.invariant(self._instance is None, "Must only call initialize once")\n # Store a weakref to avoid a circular reference / enable GC\n self._instance_weakref = weakref.ref(instance)\n\n def launch_run(self, instance, run, external_pipeline):\n check.inst_param(run, "run", PipelineRun)\n check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)\n\n repository_location_handle = external_pipeline.repository_handle.repository_location_handle\n\n check.inst(\n repository_location_handle,\n GRPC_REPOSITORY_LOCATION_HANDLE_TYPES,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": repository_location_handle.host},\n {"port": repository_location_handle.port}\n if repository_location_handle.port\n else {"socket": repository_location_handle.socket},\n )\n )\n },\n )\n\n res = repository_location_handle.client.start_run(\n ExecuteExternalPipelineArgs(\n pipeline_origin=external_pipeline.get_external_origin(),\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n )\n\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n self._run_id_to_repository_location_handle_cache[run.run_id] = repository_location_handle\n\n return run\n\n def _get_grpc_client_for_termination(self, run_id):\n if not self._instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"), socket=grpc_info.get("socket"), host=grpc_info.get("host")\n )\n\n def 
can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n client = self._get_grpc_client_for_termination(run_id)\n if not client:\n return False\n\n try:\n res = client.can_cancel_execution(CanCancelExecutionRequest(run_id=run_id), timeout=5)\n except grpc._channel._InactiveRpcError: # pylint: disable=protected-access\n # Server that created the run may no longer exist\n return False\n\n return res.can_cancel\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n if not self._instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request to.",\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n res = client.cancel_execution(CancelExecutionRequest(run_id=run_id))\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self._instance:\n return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_id_to_repository_location_handle_cache.keys()\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(\n "Timed out waiting for these runs to finish: {active_run_ids}".format(\n active_run_ids=repr(active_run_ids)\n )\n )\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2\n\n def cleanup_managed_grpc_servers(self):\n """Shut down any managed grpc servers that used this run launcher to start a run.\n Should only be used for teardown purposes within tests (generally it's fine for a server\n to out-live the host process, since it might be finishing an execution and will\n automatically shut itself down once it no longer receives a heartbeat from the host\n process). But in tests, gRPC servers access the DagsterInstance during execution, so we need\n to shut them down before we can safely remove the temporary directory created for the\n DagsterInstance.\n """\n for repository_location_handle in self._run_id_to_repository_location_handle_cache.values():\n if isinstance(repository_location_handle, ManagedGrpcPythonEnvRepositoryLocationHandle):\n check.invariant(\n repository_location_handle.is_cleaned_up,\n "ManagedGrpcPythonRepositoryLocationHandle was not cleaned up "\n "before test teardown. This may indicate that the handle is not "\n "being used as a contextmanager.",\n )\n repository_location_handle.grpc_server_process.wait()\n
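The non-obvious piece of DefaultRunLauncher is the tag round-trip: launch_run stores the gRPC server coordinates under GRPC_INFO_TAG, and termination later rebuilds a client from them. A hedged sketch of reading that tag back — grpc_target_for_run is a hypothetical helper, and the stdlib json module stands in for dagster.seven.json:

import json

from dagster.core.storage.tags import GRPC_INFO_TAG

def grpc_target_for_run(run):
    # The tag value is a JSON object with "host" plus either "port" or "socket",
    # mirroring what launch_run wrote and _get_grpc_client_for_termination reads.
    raw = run.tags.get(GRPC_INFO_TAG)
    if raw is None:
        return None
    info = json.loads(raw)
    return {key: info.get(key) for key in ("host", "port", "socket")}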
\nimport datetime\nimport itertools\nimport logging\nfrom collections import OrderedDict, namedtuple\n\nfrom dagster import check, seven\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.utils import frozendict, merge_dicts\nfrom dagster.utils.error import SerializableErrorInfo\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nPYTHON_LOGGING_LEVELS_MAPPING = frozendict(\n OrderedDict({"CRITICAL": 50, "ERROR": 40, "WARNING": 30, "INFO": 20, "DEBUG": 10})\n)\n\nPYTHON_LOGGING_LEVELS_ALIASES = frozendict(OrderedDict({"FATAL": "CRITICAL", "WARN": "WARNING"}))\n\nPYTHON_LOGGING_LEVELS_NAMES = frozenset(\n [\n level_name.lower()\n for level_name in sorted(\n list(PYTHON_LOGGING_LEVELS_MAPPING.keys()) + list(PYTHON_LOGGING_LEVELS_ALIASES.keys())\n )\n ]\n)\n\n\ndef _dump_value(value):\n # dump namedtuples as objects instead of arrays\n if isinstance(value, tuple) and hasattr(value, "_asdict"):\n return seven.json.dumps(value._asdict())\n\n return seven.json.dumps(value)\n\n\ndef construct_log_string(synth_props, logging_tags, message_props):\n # Handle this explicitly\n dagster_event = (\n message_props["dagster_event"]._asdict() if "dagster_event" in message_props else {}\n )\n\n event_specific_data = dagster_event.get("event_specific_data")\n stack = ""\n if hasattr(event_specific_data, "error") and isinstance(\n event_specific_data.error, SerializableErrorInfo\n ):\n stack = "\\n\\n" + event_specific_data.error.to_string()\n\n log_source_prefix = (\n "resource:%s" % logging_tags["resource_name"]\n if "resource_name" in logging_tags\n else message_props.get("pipeline_name", "system")\n )\n\n prefix = " - ".join(\n filter(\n None,\n (\n log_source_prefix,\n synth_props.get("run_id"),\n str(dagster_event["pid"]) if dagster_event.get("pid") is not None else None,\n logging_tags.get("step_key"),\n dagster_event.get("event_type_value"),\n synth_props.get("orig_message"),\n ),\n )\n )\n return prefix + stack\n\n\ndef coerce_valid_log_level(log_level):\n """Convert a log level into an integer for consumption by the low-level Python logging API."""\n if isinstance(log_level, int):\n return log_level\n check.str_param(log_level, "log_level")\n check.invariant(\n log_level.lower() in PYTHON_LOGGING_LEVELS_NAMES,\n "Bad value for log level {level}: permissible values are {levels}.".format(\n level=log_level,\n levels=", ".join(\n ["'{}'".format(level_name.upper()) for level_name in PYTHON_LOGGING_LEVELS_NAMES]\n ),\n ),\n )\n log_level = PYTHON_LOGGING_LEVELS_ALIASES.get(log_level.upper(), log_level.upper())\n return PYTHON_LOGGING_LEVELS_MAPPING[log_level]\n\n\n[docs]class DagsterLogManager(namedtuple("_DagsterLogManager", "run_id logging_tags loggers")):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers.\n\n An instance of the log manager is made available to solids as ``context.log``. Users should not\n initialize instances of the log manager directly. 
To configure custom loggers, set the\n ``logger_defs`` on a :py:class:`ModeDefinition` for a pipeline.\n\n The log manager supports standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of a solid,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __new__(cls, run_id, logging_tags, loggers):\n return super(DagsterLogManager, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n logging_tags=check.dict_param(\n logging_tags, "logging_tags", key_type=str, value_type=str\n ),\n loggers=check.list_param(loggers, "loggers", of_type=logging.Logger),\n )\n\n[docs] def with_tags(self, **new_tags):\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return self._replace(logging_tags=merge_dicts(self.logging_tags, new_tags))\n\n def _prepare_message(self, orig_message, message_props):\n check.str_param(orig_message, "orig_message")\n check.dict_param(message_props, "message_props")\n\n # These are todos to further align with the Python logging API\n check.invariant(\n "extra" not in message_props, "do not allow until explicit support is handled"\n )\n check.invariant(\n "exc_info" not in message_props, "do not allow until explicit support is handled"\n )\n\n # Reserved keys in the message_props -- these are system generated.\n check.invariant("orig_message" not in message_props, "orig_message reserved value")\n check.invariant("message" not in message_props, "message reserved value")\n check.invariant("log_message_id" not in message_props, "log_message_id reserved value")\n check.invariant("log_timestamp" not in message_props, "log_timestamp reserved value")\n\n log_message_id = make_new_run_id()\n\n log_timestamp = datetime.datetime.utcnow().isoformat()\n\n synth_props = {\n "orig_message": orig_message,\n "log_message_id": log_message_id,\n "log_timestamp": log_timestamp,\n "run_id": self.run_id,\n }\n\n # We first generate all props for the purpose of producing the semi-structured\n # log message via _kv_messsage\n all_props = dict(\n itertools.chain(synth_props.items(), self.logging_tags.items(), message_props.items())\n )\n\n # So here we use the arbitrary key DAGSTER_META_KEY to store a dictionary of\n # all the meta information that dagster injects into log message.\n # The python logging module, in its infinite wisdom, actually takes all the\n # keys in extra and unconditionally smashes them into the internal dictionary\n # of the logging.LogRecord class. 
We used a reserved key here to avoid naming\n # collisions with internal variables of the LogRecord class.\n # See __init__.py:363 (makeLogRecord) in the python 3.6 logging module source\n # for the gory details.\n return (\n construct_log_string(synth_props, self.logging_tags, message_props),\n {DAGSTER_META_KEY: all_props},\n )\n\n def _log(self, level, orig_message, message_props):\n """Invoke the underlying loggers for a given log level.\n\n Args:\n level (Union[str, int]): An integer represeting a Python logging level or one of the\n standard Python string representations of a logging level.\n orig_message (str): The log message generated in user code.\n message_props (dict): Additional properties for the structured log message.\n """\n if not self.loggers:\n return\n\n level = coerce_valid_log_level(level)\n\n message, extra = self._prepare_message(orig_message, message_props)\n\n for logger_ in self.loggers:\n logger_.log(level, message, extra=extra)\n\n[docs] def log(self, level, msg, **kwargs):\n """Invoke the underlying loggers for a given integer log level.\n\n Args:\n level (int): An integer represeting a Python logging level.\n orig_message (str): The message to log.\n """\n\n check.str_param(msg, "msg")\n check.int_param(level, "level")\n return self._log(level, msg, kwargs)\n\n[docs] def debug(self, msg, **kwargs):\n """Log at the ``logging.DEBUG`` level.\n\n The message will be automatically adorned with contextual information about the name\n of the pipeline, the name of the solid, etc., so it is generally unnecessary to include\n this type of information in the log message.\n\n You can optionally additional key-value pairs to an individual log message using the kwargs\n to this method.\n\n Args:\n msg (str): The message to log.\n **kwargs (Optional[Any]): Any additional key-value pairs for only this log message.\n """\n\n check.str_param(msg, "msg")\n return self._log(logging.DEBUG, msg, kwargs)\n\n[docs] def info(self, msg, **kwargs):\n """Log at the ``logging.INFO`` level.\n\n See :py:meth:`~DagsterLogManager.debug`.\n """\n\n check.str_param(msg, "msg")\n return self._log(logging.INFO, msg, kwargs)\n\n[docs] def warning(self, msg, **kwargs):\n """Log at the ``logging.WARNING`` level.\n\n See :py:meth:`~DagsterLogManager.debug`.\n """\n\n check.str_param(msg, "msg")\n return self._log(logging.WARNING, msg, kwargs)\n\n # Define the alias .warn()\n warn = warning\n """Alias for :py:meth:`~DagsterLogManager.warning`"""\n\n[docs] def error(self, msg, **kwargs):\n """Log at the ``logging.ERROR`` level.\n\n See :py:meth:`~DagsterLogManager.debug`.\n """\n\n check.str_param(msg, "msg")\n return self._log(logging.ERROR, msg, kwargs)\n\n[docs] def critical(self, msg, **kwargs):\n """Log at the ``logging.CRITICAL`` level.\n\n See :py:meth:`~DagsterLogManager.debug`.\n """\n check.str_param(msg, "msg")\n return self._log(logging.CRITICAL, msg, kwargs)\n\n # Define the alias .fatal()\n fatal = critical\n """Alias for :py:meth:`~DagsterLogManager.critical`"""\n
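Since `context.log` inside a solid is a DagsterLogManager, the convenience methods, the integer-level `log()` API, and `with_tags()` shown above can be exercised directly from user code. A short illustrative sketch, assuming only the standard `@solid`/`@pipeline` decorators from this release; the solid body and the extra key-value pairs are made up for the example.

# Sketch: using the DagsterLogManager exposed to solids as `context.log`.
from dagster import pipeline, solid


@solid
def greet(context):
    # Convenience methods accept arbitrary key-value pairs that become structured props.
    context.log.info("starting greet", customer="acme")
    # The integer API delegates to the `log` method of each underlying logger (10 == DEBUG).
    context.log.log(10, "debug-level message via the integer API")
    # with_tags returns a new manager with the merged tag set for the same run and loggers.
    context.log.with_tags(stage="demo").warning("running with extra tags")


@pipeline
def greet_pipeline():
    greet()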
\nimport weakref\n\nfrom dagster import DagsterInstance, check\nfrom dagster.core.host_representation import ExternalPipeline\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator\n\n\n[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher.\n """\n\n def __init__(self, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._instance_ref = None\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n def initialize(self, instance):\n check.inst_param(instance, "instance", DagsterInstance)\n # Store a weakref to avoid a circular reference / enable GC\n self._instance_ref = weakref.ref(instance)\n\n @property\n def _instance(self):\n return self._instance_ref() if self._instance_ref else None\n\n def submit_run(self, pipeline_run, external_pipeline):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)\n check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)\n\n return self._instance.launch_run(pipeline_run.run_id, external_pipeline)\n\n def can_cancel_run(self, run_id):\n return self._instance.run_launcher.can_terminate(run_id)\n\n def cancel_run(self, run_id):\n return self._instance.run_launcher.terminate(run_id)\n
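DefaultRunCoordinator.submit_run hands a NOT_STARTED run straight to `instance.launch_run`, and cancellation defers to the instance's run launcher. The sketch below shows how a coordinator gets wired to an instance; it assumes DefaultRunCoordinator is importable from dagster.core.run_coordinator.default_run_coordinator (the module shown above), and the submit call is left commented out because constructing a PipelineRun/ExternalPipeline pair is out of scope here.

# Sketch (hypothetical driver code): wiring a DefaultRunCoordinator to an instance.
# Normally DagsterInstance constructs and initializes the coordinator itself.
from dagster import DagsterInstance
from dagster.core.run_coordinator.default_run_coordinator import DefaultRunCoordinator

instance = DagsterInstance.ephemeral()  # in-memory instance, for illustration only
coordinator = DefaultRunCoordinator()
coordinator.initialize(instance)  # stores a weakref back to the instance

# pipeline_run / external_pipeline would come from run storage and a workspace:
# coordinator.submit_run(pipeline_run, external_pipeline)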
\nimport logging\nimport time\nimport weakref\n\nfrom dagster import DagsterEvent, DagsterEventType, DagsterInstance, String, check\nfrom dagster.config import Field\nfrom dagster.config.config_type import Array, Noneable\nfrom dagster.config.field_utils import Shape\nfrom dagster.core.events.log import DagsterEventRecord\nfrom dagster.core.host_representation import ExternalPipeline\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator\n\n\n[docs]class QueuedRunCoordinator(RunCoordinator, ConfigurableClass):\n """\n Sends runs to the dequeuer process via the run storage. Requires the external process to be\n alive for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs=None,\n tag_concurrency_limits=None,\n dequeue_interval_seconds=None,\n inst_data=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._instance_ref = None\n self.max_concurrent_runs = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n self.tag_concurrency_limits = check.opt_list_param(\n tag_concurrency_limits, "tag_concurrency_limits",\n )\n self.dequeue_interval_seconds = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_concurrent_runs": Field(config=int, is_required=False),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(String, is_required=False),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n ),\n "dequeue_interval_seconds": Field(config=int, is_required=False),\n }\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n )\n\n def initialize(self, instance):\n check.inst_param(instance, "instance", DagsterInstance)\n # Store a weakref to avoid a circular reference / enable GC\n self._instance_ref = weakref.ref(instance)\n\n @property\n def _instance(self):\n return self._instance_ref() if self._instance_ref else None\n\n def submit_run(self, pipeline_run, external_pipeline):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)\n check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)\n\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n pipeline_name=pipeline_run.pipeline_name,\n )\n event_record = DagsterEventRecord(\n message="",\n user_message="",\n level=logging.INFO,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=enqueued_event,\n )\n self._instance.handle_new_event(event_record)\n\n return self._instance.get_run_by_id(pipeline_run.run_id)\n\n def can_cancel_run(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n if run.status == PipelineRunStatus.QUEUED:\n return True\n else:\n return self._instance.run_launcher.can_terminate(run_id)\n\n def cancel_run(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return 
False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == PipelineRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run, message="Canceling run from the queue.",\n )\n self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)\n
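QueuedRunCoordinator does not launch anything itself: it writes a PIPELINE_ENQUEUED event and relies on the dagster-daemon dequeuer to launch the run later, subject to max_concurrent_runs and tag_concurrency_limits. The sketch below builds one via from_config_value with a config dict matching the config_type declared above; in practice the same values would live under the run_coordinator block of dagster.yaml, and the import path is assumed from the module shown here.

# Sketch: constructing a QueuedRunCoordinator from a config dict matching config_type.
from dagster.core.run_coordinator.queued_run_coordinator import QueuedRunCoordinator

config_value = {
    "max_concurrent_runs": 4,
    "tag_concurrency_limits": [
        # at most 2 concurrent runs carrying the tag database=redshift
        {"key": "database", "value": "redshift", "limit": 2},
    ],
    "dequeue_interval_seconds": 5,
}
coordinator = QueuedRunCoordinator.from_config_value(inst_data=None, config_value=config_value)
assert coordinator.max_concurrent_runs == 4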
\nimport abc\nimport os\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import IntSource\nfrom dagster.core.definitions.job import JobType\nfrom dagster.core.errors import DagsterError\nfrom dagster.core.host_representation import ExternalSchedule\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.scheduler.job import JobState, JobStatus, ScheduleJobData\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.seven import get_current_datetime_in_utc, get_timestamp_from_utc_datetime\nfrom dagster.utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n """Base class for all Dagster Scheduler errors"""\n\n\nclass DagsterScheduleReconciliationError(DagsterError):\n """Error raised during schedule state reconcilation. During reconcilation, exceptions that are\n raised when trying to start or stop a schedule are collected and passed to this wrapper exception.\n The individual exceptions can be accessed by the `errors` property. """\n\n def __init__(self, preamble, errors, *args, **kwargs):\n self.errors = errors\n\n error_msg = preamble\n error_messages = []\n for i_error, error in enumerate(self.errors):\n error_messages.append(str(error))\n error_msg += "\\n Error {i_error}: {error_message}".format(\n i_error=i_error + 1, error_message=str(error)\n )\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterScheduleReconciliationError, self).__init__(error_msg, *args, **kwargs)\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n """Errors raised when ending a job for a schedule."""\n\n\nclass SchedulerDebugInfo(\n namedtuple("SchedulerDebugInfo", "errors scheduler_config_info scheduler_info schedule_storage")\n):\n def __new__(cls, errors, scheduler_config_info, scheduler_info, schedule_storage):\n return super(SchedulerDebugInfo, cls).__new__(\n cls,\n errors=check.list_param(errors, "errors", of_type=str),\n scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n schedule_storage=check.list_param(schedule_storage, "schedule_storage", of_type=str),\n )\n\n\n[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. 
This component is responsible for interfacing with\n an external system such as cron to ensure scheduled repeated execution according.\n """\n\n def _get_schedule_state(self, instance, external_origin_id):\n schedule_state = instance.get_job_state(external_origin_id)\n if not schedule_state:\n raise DagsterScheduleDoesNotExist(\n "You have attempted to start the job for schedule id {id}, but its state is not in storage.".format(\n id=external_origin_id\n )\n )\n\n return schedule_state\n\n def _create_new_schedule_state(self, instance, external_schedule):\n schedule_state = JobState(\n external_schedule.get_external_origin(),\n JobType.SCHEDULE,\n JobStatus.STOPPED,\n ScheduleJobData(external_schedule.cron_schedule, scheduler=self.__class__.__name__),\n )\n\n instance.add_job_state(schedule_state)\n return schedule_state\n\n def reconcile_scheduler_state(self, instance, external_repository):\n """Reconcile the ExternalSchedule list from the repository and ScheduleStorage\n on the instance to ensure there is a 1-1 correlation between ExternalSchedule and\n JobStates of type JobType.SCHEDULE, where the ExternalSchedule list is the source of truth.\n\n If a new ExternalSchedule is introduced, a new JobState is added to storage with status\n JobStatus.STOPPED.\n\n For every previously existing ExternalSchedule (where target id is the primary key),\n any changes to the definition are persisted in the corresponding JobState and the status is\n left unchanged. The schedule is also restarted to make sure the external artifacts (such\n as a cron job) are up to date.\n\n For every ScheduleDefinitions that is removed, the corresponding JobState is removed from\n the storage and the corresponding job is ended.\n """\n\n schedules_to_restart = []\n for external_schedule in external_repository.get_external_schedules():\n # If a schedule already exists for schedule_def, overwrite bash script and\n # metadata file\n existing_schedule_state = instance.get_job_state(\n external_schedule.get_external_origin_id()\n )\n if existing_schedule_state:\n new_timestamp = existing_schedule_state.job_specific_data.start_timestamp\n if not new_timestamp and existing_schedule_state.status == JobStatus.RUNNING:\n new_timestamp = get_timestamp_from_utc_datetime(get_current_datetime_in_utc())\n\n # Keep the status, update target and cron schedule\n schedule_state = JobState(\n external_schedule.get_external_origin(),\n JobType.SCHEDULE,\n existing_schedule_state.status,\n ScheduleJobData(\n external_schedule.cron_schedule,\n new_timestamp,\n scheduler=self.__class__.__name__,\n ),\n )\n\n instance.update_job_state(schedule_state)\n schedules_to_restart.append((existing_schedule_state, external_schedule))\n else:\n self._create_new_schedule_state(instance, external_schedule)\n\n # Delete all existing schedules that are not in external schedules\n external_schedule_origin_ids = {\n s.get_external_origin_id() for s in external_repository.get_external_schedules()\n }\n existing_schedule_origin_ids = set(\n [\n job.job_origin_id\n for job in instance.all_stored_job_state(\n external_repository.get_external_origin_id()\n )\n if job.job_type == JobType.SCHEDULE\n ]\n )\n schedule_origin_ids_to_delete = existing_schedule_origin_ids - external_schedule_origin_ids\n\n schedule_reconciliation_errors = []\n for schedule_state, external_schedule in schedules_to_restart:\n # Restart is only needed if the schedule was previously running\n if schedule_state.status == JobStatus.RUNNING:\n try:\n self.refresh_schedule(instance, 
external_schedule)\n except DagsterSchedulerError as e:\n schedule_reconciliation_errors.append(e)\n\n if schedule_state.status == JobStatus.STOPPED:\n try:\n self.stop_schedule(instance, external_schedule.get_external_origin_id())\n except DagsterSchedulerError as e:\n schedule_reconciliation_errors.append(e)\n\n for schedule_origin_id in schedule_origin_ids_to_delete:\n try:\n instance.stop_schedule_and_delete_from_storage(schedule_origin_id)\n except DagsterSchedulerError as e:\n schedule_reconciliation_errors.append(e)\n\n if len(schedule_reconciliation_errors):\n raise DagsterScheduleReconciliationError(\n "One or more errors were encountered by the Scheduler while starting or stopping schedules. "\n "Individual error messages follow:",\n errors=schedule_reconciliation_errors,\n )\n\n def start_schedule_and_update_storage_state(self, instance, external_schedule):\n """\n Updates the status of the given schedule to `JobStatus.RUNNING` in schedule storage,\n then calls `start_schedule`.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start\n\n """\n\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n schedule_state = instance.get_job_state(external_schedule.get_external_origin_id())\n\n if not schedule_state:\n schedule_state = self._create_new_schedule_state(instance, external_schedule)\n\n if schedule_state.status == JobStatus.RUNNING:\n raise DagsterSchedulerError(\n "You have attempted to start schedule {name}, but it is already running".format(\n name=external_schedule.name\n )\n )\n\n self.start_schedule(instance, external_schedule)\n started_schedule = schedule_state.with_status(JobStatus.RUNNING).with_data(\n ScheduleJobData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n scheduler=self.__class__.__name__,\n )\n )\n instance.update_job_state(started_schedule)\n return started_schedule\n\n def stop_schedule_and_update_storage_state(self, instance, schedule_origin_id):\n """\n Updates the status of the given schedule to `JobStatus.STOPPED` in schedule storage,\n then calls `stop_schedule`.\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n schedule_state = self._get_schedule_state(instance, schedule_origin_id)\n\n self.stop_schedule(instance, schedule_origin_id)\n stopped_schedule = schedule_state.with_status(JobStatus.STOPPED).with_data(\n ScheduleJobData(\n cron_schedule=schedule_state.job_specific_data.cron_schedule,\n scheduler=self.__class__.__name__,\n )\n )\n instance.update_job_state(stopped_schedule)\n return stopped_schedule\n\n def stop_schedule_and_delete_from_storage(self, instance, schedule_origin_id):\n """\n Deletes a schedule from schedule storage, then calls `stop_schedule`.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n schedule_origin_id (string): The id of the schedule target to start running.\n """\n\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n schedule = self._get_schedule_state(instance, schedule_origin_id)\n self.stop_schedule(instance, schedule_origin_id)\n instance.delete_job_state(schedule_origin_id)\n return schedule\n\n 
def refresh_schedule(self, instance, external_schedule):\n """Refresh a running schedule. This is called when user reconciles the schedule state.\n\n By default, this method will call stop_schedule and then start_schedule but can be\n overriden. For example, in the K8s Scheduler we patch the existing cronjob\n (without stopping it) to minimize downtime.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start running.\n """\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n self.stop_schedule(instance, external_schedule.get_external_origin_id())\n self.start_schedule(instance, external_schedule)\n\n @abc.abstractmethod\n def debug_info(self):\n """Returns debug information about the scheduler\n """\n\n @abc.abstractmethod\n def start_schedule(self, instance, external_schedule):\n """Start running a schedule. This method is called by `start_schedule_and_update_storage_state`,\n which first updates the status of the schedule in schedule storage to `JobStatus.RUNNING`,\n then calls this method.\n\n For example, in the cron scheduler, this method writes a cron job to the cron tab\n for the given schedule.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start running.\n """\n\n @abc.abstractmethod\n def stop_schedule(self, instance, schedule_origin_id):\n """Stop running a schedule.\n\n This method is called by\n 1) `stop_schedule_and_update_storage_state`,\n which first updates the status of the schedule in schedule storage to `JobStatus.STOPPED`,\n then calls this method.\n 2) `stop_schedule_and_delete_from_storage`, which deletes the schedule from schedule storage\n then calls this method.\n\n For example, in the cron scheduler, this method deletes the cron job for a given scheduler\n from the cron tab.\n\n Args:\n instance (DagsterInstance): The current instance.\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n\n @abc.abstractmethod\n def running_schedule_count(self, instance, schedule_origin_id):\n """Returns the number of jobs currently running for the given schedule. 
This method is used\n for detecting when the scheduler is out of sync with schedule storage.\n\n For example, when:\n - There are duplicate jobs runnning for a single schedule\n - There are no jobs runnning for a schedule that is set to be running\n - There are still jobs running for a schedule that is set to be stopped\n\n When the scheduler and schedule storage are in sync, this method should return:\n - 1 when a schedule is set to be running\n - 0 when a schedule is set to be stopped\n\n Args:\n instance (DagsterInstance): The current instance.\n schedule_origin_id (string): The id of the schedule target to return the number of jobs for\n """\n\n @abc.abstractmethod\n def get_logs_path(self, instance, schedule_origin_id):\n """Get path to store logs for schedule\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """\n\n\n[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process.\n """\n\n def __init__(self, max_catchup_runs=None, inst_data=None):\n self.max_catchup_runs = check.opt_int_param(max_catchup_runs, "max_catchup_runs", 5)\n self._inst_data = inst_data\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"max_catchup_runs": Field(IntSource, is_required=False)}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DagsterDaemonScheduler(\n inst_data=inst_data, max_catchup_runs=config_value.get("max_catchup_runs")\n )\n\n def debug_info(self):\n return ""\n\n def start_schedule(self, instance, external_schedule):\n # Automatically picked up by the `dagster scheduler run` command\n pass\n\n def stop_schedule(self, instance, schedule_origin_id):\n # Automatically picked up by the `dagster scheduler run` command\n pass\n\n def running_schedule_count(self, instance, schedule_origin_id):\n state = instance.get_job_state(schedule_origin_id)\n if not state:\n return 0\n return 1 if state.status == JobStatus.RUNNING else 0\n\n def wipe(self, instance):\n pass\n\n def _get_or_create_logs_directory(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")\n
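DagsterDaemonScheduler leaves start_schedule/stop_schedule as no-ops because the dagster-daemon process watches stored JobState rows, and running_schedule_count just reflects the stored JobStatus. For a custom scheduler, the abstract surface shown above is what must be filled in; the subclass below is a hypothetical placeholder that tracks nothing, and it assumes Scheduler is importable from dagster.core.scheduler.

# Sketch: the minimal set of methods a custom Scheduler subclass implements,
# mirroring the abstract methods declared in the source above. Bodies are placeholders.
from dagster.core.scheduler import Scheduler


class NoopScheduler(Scheduler):
    """Hypothetical scheduler for illustration; it manages no external state."""

    def debug_info(self):
        return "noop scheduler: no external state"

    def start_schedule(self, instance, external_schedule):
        pass  # e.g. write a cron entry for external_schedule.cron_schedule here

    def stop_schedule(self, instance, schedule_origin_id):
        pass  # e.g. remove the cron entry here

    def running_schedule_count(self, instance, schedule_origin_id):
        return 0  # nothing is ever actually scheduled by this sketch

    def get_logs_path(self, instance, schedule_origin_id):
        return "/tmp/noop_scheduler/{}/scheduler.log".format(schedule_origin_id)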
\nimport atexit\nfrom abc import ABC, abstractmethod\nfrom collections import namedtuple\nfrom contextlib import contextmanager\nfrom enum import Enum\n\nfrom dagster import check\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom rx import Observable\n\nMAX_BYTES_FILE_READ = 33554432 # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304 # 4 MB\n\n\nclass ComputeIOType(Enum):\n STDOUT = "stdout"\n STDERR = "stderr"\n\n\nclass ComputeLogFileData(namedtuple("ComputeLogFileData", "path data cursor size download_url")):\n """Representation of a chunk of compute execution log data"""\n\n def __new__(cls, path, data, cursor, size, download_url):\n return super(ComputeLogFileData, cls).__new__(\n cls,\n path=check.str_param(path, "path"),\n data=check.opt_str_param(data, "data"),\n cursor=check.int_param(cursor, "cursor"),\n size=check.int_param(size, "size"),\n download_url=check.opt_str_param(download_url, "download_url"),\n )\n\n\n[docs]class ComputeLogManager(ABC):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids."""\n\n @contextmanager\n def watch(self, pipeline_run, step_key=None):\n """\n Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(pipeline_run, step_key):\n yield\n return\n\n self.on_watch_start(pipeline_run, step_key)\n with self._watch_logs(pipeline_run, step_key):\n yield\n self.on_watch_finish(pipeline_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(self, pipeline_run, step_key=None):\n """\n Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n def get_local_path(self, run_id, key, io_type):\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n\n @abstractmethod\n def is_watch_completed(self, run_id, key):\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, pipeline_run, step_key):\n """Hook called when starting to watch compute logs.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, pipeline_run, step_key):\n """Hook called when computation for a given execution step is finished.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id, key, io_type):\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _pipeline_run, _step_key):\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription):\n """Hook for managing streaming subscriptions for log data from `dagit`\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def observable(self, run_id, key, io_type, cursor=None):\n """Return an Observable which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor)\n else:\n cursor = 0\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor)\n self.on_subscribe(subscription)\n return Observable.create(subscription) # pylint: disable=E1101\n\n def dispose(self):\n pass\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written\n """\n\n def __init__(self, manager, run_id, key, io_type, cursor):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer = None\n atexit.register(self._clean)\n\n def __call__(self, observer):\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n\n def fetch(self):\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id, self.key, self.io_type, self.cursor, max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer.on_next(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self):\n if not self.observer:\n return\n self.observer.on_completed()\n\n def _clean(self):\n self.complete()\n self.observer = None\n
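ComputeLogManager.watch wraps step execution so stdout/stderr land in per-step files, and read_logs_file pages through those files in cursor-delimited ComputeLogFileData chunks (the same loop ComputeLogSubscription.fetch runs above). A sketch of reading one captured stdout stream follows; it assumes the module path dagster.core.storage.compute_log_manager and an already configured manager (for example `instance.compute_log_manager`), with run_id and step_key as placeholders.

# Sketch: paging through captured stdout for one step via read_logs_file,
# following the cursor returned in each ComputeLogFileData chunk.
from dagster.core.storage.compute_log_manager import MAX_BYTES_CHUNK_READ, ComputeIOType


def read_all_stdout(compute_log_manager, run_id, step_key):
    cursor = 0
    chunks = []
    while True:
        update = compute_log_manager.read_logs_file(
            run_id, step_key, ComputeIOType.STDOUT, cursor=cursor, max_bytes=MAX_BYTES_CHUNK_READ
        )
        if update.data:
            chunks.append(update.data)
        # Stop once a chunk comes back empty or smaller than the chunk size.
        if not update.data or len(update.data.encode("utf-8")) < MAX_BYTES_CHUNK_READ:
            break
        cursor = update.cursor
    return "".join(chunks)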
\nfrom abc import ABC, abstractmethod, abstractproperty\n\nimport pyrsistent\nfrom dagster.core.events.log import EventRecord\nfrom dagster.core.execution.stats import (\n build_run_stats_from_events,\n build_run_step_stats_from_events,\n)\n\n\nclass EventLogSequence(pyrsistent.CheckedPVector):\n __type__ = EventRecord\n\n\n[docs]class EventLogStorage(ABC):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster.core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagit`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def get_logs_for_run(self, run_id, cursor=-1):\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. (default: -1)\n """\n\n def get_stats_for_run(self, run_id):\n """Get a summary of events that have ocurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(self, run_id, step_keys=None):\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.dagster_event.step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event):\n """Store an event corresponding to a pipeline run.\n\n Args:\n run_id (str): The id of the run that generated the event.\n event (EventRecord): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id):\n """Remove events for a given run id"""\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex(self, print_fn=lambda _: None, force=False):\n """Call this method to run any data migrations, reindexing to build summary tables."""\n\n @abstractmethod\n def wipe(self):\n """Clear the log storage."""\n\n @abstractmethod\n def watch(self, run_id, start_cursor, callback):\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id, handler):\n """Call this method to stop watching."""\n\n @abstractmethod\n def has_secondary_index(self, name, run_id=None):\n """Whether the secondary index for a given name is enabled."""\n\n @abstractmethod\n def enable_secondary_index(self, name, run_id=None):\n """Call to enable the secondary index for a given name."""\n\n @abstractproperty\n def is_persistent(self):\n """bool: Whether the storage is persistent."""\n\n def dispose(self):\n """Explicit lifecycle management."""\n\n @property\n def is_asset_aware(self):\n return isinstance(self, AssetAwareEventLogStorage)\n\n def optimize_for_dagit(self, statement_timeout):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n\nclass AssetAwareEventLogStorage(ABC):\n @abstractmethod\n def has_asset_key(self, asset_key):\n pass\n\n 
@abstractmethod\n def get_all_asset_keys(self, prefix_path=None):\n pass\n\n @abstractmethod\n def get_asset_events(\n self,\n asset_key,\n partitions=None,\n cursor=None,\n limit=None,\n ascending=False,\n include_cursor=False,\n ):\n pass\n\n @abstractmethod\n def get_asset_run_ids(self, asset_key):\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key):\n """Remove asset index history from event log for given asset_key"""\n
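EventLogStorage is the abstract contract that concrete storages (configured via dagster.yaml) implement; get_stats_for_run and get_step_stats_for_run are derived from get_logs_for_run. The helper below leans only on that abstract API, so it should work against any implementation; how `storage` and `run_id` are obtained is deliberately left out of the sketch.

# Sketch: summarizing a run using only the abstract EventLogStorage API above.
def summarize_run(storage, run_id):
    events = storage.get_logs_for_run(run_id)  # all EventRecords (cursor defaults to -1)
    stats = storage.get_stats_for_run(run_id)  # aggregate PipelineRunStatsSnapshot
    step_stats = storage.get_step_stats_for_run(run_id)  # per-step RunStepKeyStatsSnapshots
    return {
        "num_events": len(events),
        "steps_succeeded": stats.steps_succeeded,
        "steps_failed": stats.steps_failed,
        "step_keys": [step.step_key for step in step_stats],
    }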
\nimport logging\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\n\nimport sqlalchemy as db\nfrom dagster import check, seven\nfrom dagster.core.definitions.events import AssetKey, Materialization\nfrom dagster.core.errors import DagsterEventLogInvalidForRun\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventRecord\nfrom dagster.core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus\nfrom dagster.serdes import deserialize_json_to_dagster_namedtuple, serialize_dagster_namedtuple\nfrom dagster.utils import datetime_as_float, utc_datetime_from_timestamp\n\nfrom ..pipeline_run import PipelineRunStatsSnapshot\nfrom .base import AssetAwareEventLogStorage, EventLogStorage\nfrom .migration import REINDEX_DATA_MIGRATIONS, SECONDARY_INDEX_ASSET_KEY\nfrom .schema import AssetKeyTable, SecondaryIndexMigrationTable, SqlEventLogStorageTable\n\n\n[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n """\n\n @abstractmethod\n def connect(self, run_id=None):\n """Context manager yielding a connection.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def reindex(self, print_fn=lambda _: None, force=False):\n """Call this method to run any data migrations, reindexing to build summary tables."""\n for migration_name, migration_fn in REINDEX_DATA_MIGRATIONS.items():\n if self.has_secondary_index(migration_name):\n if not force:\n print_fn("Skipping already reindexed summary: {}".format(migration_name))\n continue\n print_fn("Starting reindex: {}".format(migration_name))\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n print_fn("Finished reindexing: {}".format(migration_name))\n\n def prepare_insert_event(self, event):\n """ Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. 
See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values( # pylint: disable=no-value-for-parameter\n run_id=event.run_id,\n event=serialize_dagster_namedtuple(event),\n dagster_event_type=dagster_event_type,\n timestamp=utc_datetime_from_timestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def store_asset_key(self, conn, event):\n check.inst_param(event, "event", EventRecord)\n if not event.is_dagster_event or not event.dagster_event.asset_key:\n return\n\n try:\n conn.execute(\n AssetKeyTable.insert().values( # pylint: disable=no-value-for-parameter\n asset_key=event.dagster_event.asset_key.to_string()\n )\n )\n except db.exc.IntegrityError:\n pass\n\n def store_event(self, event):\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventRecord): The event to store.\n """\n check.inst_param(event, "event", EventRecord)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.connect(run_id) as conn:\n conn.execute(insert_event_statement)\n if event.is_dagster_event and event.dagster_event.asset_key:\n self.store_asset_key(conn, event)\n\n def get_logs_for_run_by_log_id(self, run_id, cursor=-1):\n check.str_param(run_id, "run_id")\n check.int_param(cursor, "cursor")\n check.invariant(\n cursor >= -1,\n "Don't know what to do with negative cursor {cursor}".format(cursor=cursor),\n )\n\n # cursor starts at 0 & auto-increment column starts at 1 so adjust\n cursor = cursor + 1\n\n query = (\n db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.id > cursor)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n\n with self.connect(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n events = {}\n try:\n for (record_id, json_str,) in results:\n events[record_id] = check.inst_param(\n deserialize_json_to_dagster_namedtuple(json_str), "event", EventRecord\n )\n except (seven.JSONDecodeError, check.CheckError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n return events\n\n def get_logs_for_run(self, run_id, cursor=-1):\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. 
(default: -1)\n """\n check.str_param(run_id, "run_id")\n check.int_param(cursor, "cursor")\n check.invariant(\n cursor >= -1,\n "Don't know what to do with negative cursor {cursor}".format(cursor=cursor),\n )\n\n events_by_id = self.get_logs_for_run_by_log_id(run_id, cursor)\n return [event for id, event in sorted(events_by_id.items(), key=lambda x: x[0])]\n\n def get_stats_for_run(self, run_id):\n check.str_param(run_id, "run_id")\n\n query = (\n db.select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .group_by("dagster_event_type")\n )\n\n with self.connect(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n if dagster_event_type:\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return PipelineRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n materializations=counts.get(DagsterEventType.STEP_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, check.CheckError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(self, run_id, step_keys=None):\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n STEP_STATS_EVENT_TYPES = [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n ]\n\n by_step_query = (\n db.select(\n [\n SqlEventLogStorageTable.c.step_key,\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n db.func.count(SqlEventLogStorageTable.c.id).label("count"),\n ]\n )\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None)\n .where(SqlEventLogStorageTable.c.dagster_event_type.in_(STEP_STATS_EVENT_TYPES))\n )\n\n if step_keys:\n by_step_query = by_step_query.where(SqlEventLogStorageTable.c.step_key.in_(step_keys))\n\n by_step_query = by_step_query.group_by(\n SqlEventLogStorageTable.c.step_key, SqlEventLogStorageTable.c.dagster_event_type,\n )\n\n with self.connect(run_id) as conn:\n results = conn.execute(by_step_query).fetchall()\n\n by_step_key = defaultdict(dict)\n for result in results:\n step_key = result.step_key\n if result.dagster_event_type == DagsterEventType.STEP_START.value:\n by_step_key[step_key]["start_time"] = (\n datetime_as_float(result.timestamp) if result.timestamp else None\n )\n by_step_key[step_key]["attempts"] = by_step_key[step_key].get("attempts", 0) + 1\n if result.dagster_event_type == DagsterEventType.STEP_RESTARTED.value:\n 
by_step_key[step_key]["attempts"] = (\n # In case we see step retarted events but not a step started event, we want to\n # only count the restarted events, since the attempt count represents\n # the number of times we have successfully started runnning the step\n by_step_key[step_key].get("attempts", 0)\n + result.count\n )\n if result.dagster_event_type == DagsterEventType.STEP_FAILURE.value:\n by_step_key[step_key]["end_time"] = (\n datetime_as_float(result.timestamp) if result.timestamp else None\n )\n by_step_key[step_key]["status"] = StepEventStatus.FAILURE\n if result.dagster_event_type == DagsterEventType.STEP_SUCCESS.value:\n by_step_key[step_key]["end_time"] = (\n datetime_as_float(result.timestamp) if result.timestamp else None\n )\n by_step_key[step_key]["status"] = StepEventStatus.SUCCESS\n if result.dagster_event_type == DagsterEventType.STEP_SKIPPED.value:\n by_step_key[step_key]["end_time"] = (\n datetime_as_float(result.timestamp) if result.timestamp else None\n )\n by_step_key[step_key]["status"] = StepEventStatus.SKIPPED\n\n materializations = defaultdict(list)\n expectation_results = defaultdict(list)\n raw_event_query = (\n db.select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None)\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n ]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.connect(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n for (json_str,) in results:\n event = check.inst_param(\n deserialize_json_to_dagster_namedtuple(json_str), "event", EventRecord\n )\n if event.dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:\n materializations[event.step_key].append(\n event.dagster_event.event_specific_data.materialization\n )\n elif event.dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n expectation_results[event.step_key].append(\n event.dagster_event.event_specific_data.expectation_result\n )\n except (seven.JSONDecodeError, check.CheckError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n return [\n RunStepKeyStatsSnapshot(\n run_id=run_id,\n step_key=step_key,\n status=value.get("status"),\n start_time=value.get("start_time"),\n end_time=value.get("end_time"),\n materializations=materializations.get(step_key),\n expectation_results=expectation_results.get(step_key),\n attempts=value.get("attempts"),\n )\n for step_key, value in by_step_key.items()\n ]\n\n def wipe(self):\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n # https://stackoverflow.com/a/54386260/324449\n with self.connect() as conn:\n conn.execute(SqlEventLogStorageTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(AssetKeyTable.delete()) # pylint: disable=no-value-for-parameter\n\n def delete_events(self, run_id):\n check.str_param(run_id, "run_id")\n\n delete_statement = SqlEventLogStorageTable.delete().where( # pylint: disable=no-value-for-parameter\n SqlEventLogStorageTable.c.run_id == run_id\n )\n removed_asset_key_query = (\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n 
.where(SqlEventLogStorageTable.c.asset_key != None)\n .group_by(SqlEventLogStorageTable.c.asset_key)\n )\n\n with self.connect(run_id) as conn:\n removed_asset_keys = [\n AssetKey.from_db_string(row[0])\n for row in conn.execute(removed_asset_key_query).fetchall()\n ]\n conn.execute(delete_statement)\n if len(removed_asset_keys) > 0:\n keys_to_check = []\n keys_to_check.extend([key.to_string() for key in removed_asset_keys])\n keys_to_check.extend([key.to_string(legacy=True) for key in removed_asset_keys])\n remaining_asset_keys = [\n AssetKey.from_db_string(row[0])\n for row in conn.execute(\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.asset_key.in_(keys_to_check))\n .group_by(SqlEventLogStorageTable.c.asset_key)\n )\n ]\n to_remove = set(removed_asset_keys) - set(remaining_asset_keys)\n if to_remove:\n keys_to_remove = []\n keys_to_remove.extend([key.to_string() for key in to_remove])\n keys_to_remove.extend([key.to_string(legacy=True) for key in to_remove])\n conn.execute(\n AssetKeyTable.delete().where( # pylint: disable=no-value-for-parameter\n AssetKeyTable.c.asset_key.in_(keys_to_remove)\n )\n )\n\n @property\n def is_persistent(self):\n return True\n\n def update_event_log_record(self, record_id, event):\n """ Utility method for migration scripts to update SQL representation of event records. """\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventRecord)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.connect(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update() # pylint: disable=no-value-for-parameter\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_dagster_namedtuple(event),\n dagster_event_type=dagster_event_type,\n timestamp=utc_datetime_from_timestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id, record_id):\n """ Utility method to test representation of the record in the SQL table. Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventRecord`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering)"""\n with self.connect(run_id=run_id) as conn:\n query = (\n db.select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name, run_id=None):\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. 
Can be used to checkpoint event_log data migrations.\n """\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.connect(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name, run_id=None):\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=name, migration_completed=datetime.now(),\n )\n with self.connect(run_id) as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n\nclass AssetAwareSqlEventLogStorage(AssetAwareEventLogStorage, SqlEventLogStorage):\n @abstractmethod\n def connect(self, run_id=None):\n pass\n\n @abstractmethod\n def upgrade(self):\n pass\n\n def _add_cursor_limit_to_query(self, query, cursor, limit, ascending=False):\n """ Helper function to deal with cursor/limit pagination args """\n try:\n cursor = int(cursor) if cursor else None\n except ValueError:\n cursor = None\n\n if cursor:\n cursor_query = db.select([SqlEventLogStorageTable.c.id]).where(\n SqlEventLogStorageTable.c.id == cursor\n )\n if ascending:\n query = query.where(SqlEventLogStorageTable.c.id > cursor_query)\n else:\n query = query.where(SqlEventLogStorageTable.c.id < cursor_query)\n\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n return query\n\n def has_asset_key(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n if self.has_secondary_index(SECONDARY_INDEX_ASSET_KEY):\n query = (\n db.select([1])\n .where(\n db.or_(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .limit(1)\n )\n else:\n query = (\n db.select([1])\n .where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n return len(results) > 0\n\n def get_all_asset_keys(self, prefix_path=None):\n lazy_migrate = False\n if not prefix_path:\n if self.has_secondary_index(SECONDARY_INDEX_ASSET_KEY):\n query = db.select([AssetKeyTable.c.asset_key])\n else:\n query = (\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.asset_key != None)\n .distinct()\n )\n\n # This is in place to migrate everyone to using the secondary index table for asset\n # keys. Performing this migration should result in a big performance boost for\n # any asset-catalog reads.\n\n # After a sufficient amount of time (>= 0.11.0?), we can remove the checks\n # for has_secondary_index(SECONDARY_INDEX_ASSET_KEY) and always read from the\n # AssetKeyTable, since we are already writing to the table. 
Tracking the conditional\n # check removal here: https://github.com/dagster-io/dagster/issues/3507\n lazy_migrate = True\n else:\n if self.has_secondary_index(SECONDARY_INDEX_ASSET_KEY):\n query = db.select([AssetKeyTable.c.asset_key]).where(\n db.or_(\n AssetKeyTable.c.asset_key.startswith(AssetKey.get_db_prefix(prefix_path)),\n AssetKeyTable.c.asset_key.startswith(\n AssetKey.get_db_prefix(prefix_path, legacy=True)\n ),\n )\n )\n else:\n query = (\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.asset_key != None)\n .where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key.startswith(\n AssetKey.get_db_prefix(prefix_path)\n ),\n SqlEventLogStorageTable.c.asset_key.startswith(\n AssetKey.get_db_prefix(prefix_path, legacy=True)\n ),\n )\n )\n .distinct()\n )\n\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n if lazy_migrate:\n # This is in place to migrate everyone to using the secondary index table for asset\n # keys. Performing this migration should result in a big performance boost for\n # any subsequent asset-catalog reads.\n self._lazy_migrate_secondary_index_asset_key(\n conn, [asset_key for (asset_key,) in results if asset_key]\n )\n return list(\n set([AssetKey.from_db_string(asset_key) for (asset_key,) in results if asset_key])\n )\n\n def _lazy_migrate_secondary_index_asset_key(self, conn, asset_keys):\n results = conn.execute(db.select([AssetKeyTable.c.asset_key])).fetchall()\n existing = [asset_key for (asset_key,) in results if asset_key]\n to_migrate = set(asset_keys) - set(existing)\n for asset_key in to_migrate:\n try:\n conn.execute(\n AssetKeyTable.insert().values( # pylint: disable=no-value-for-parameter\n asset_key=AssetKey.from_db_string(asset_key).to_string()\n )\n )\n except db.exc.IntegrityError:\n # asset key already present\n pass\n self.enable_secondary_index(SECONDARY_INDEX_ASSET_KEY)\n\n def get_asset_events(\n self,\n asset_key,\n partitions=None,\n cursor=None,\n limit=None,\n ascending=False,\n include_cursor=False,\n ):\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.opt_list_param(partitions, "partitions", of_type=str)\n query = db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]).where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n if partitions:\n query = query.where(SqlEventLogStorageTable.c.partition.in_(partitions))\n\n query = self._add_cursor_limit_to_query(query, cursor, limit, ascending=ascending)\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n events = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_json_to_dagster_namedtuple(json_str)\n if not isinstance(event_record, EventRecord):\n logging.warning(\n "Could not resolve asset event record as EventRecord for id `{}`.".format(\n row_id\n )\n )\n continue\n if include_cursor:\n events.append(tuple([row_id, event_record]))\n else:\n events.append(event_record)\n except seven.JSONDecodeError:\n logging.warning("Could not parse asset event record id `{}`.".format(row_id))\n return events\n\n def get_asset_run_ids(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n query = (\n db.select(\n [SqlEventLogStorageTable.c.run_id, db.func.max(SqlEventLogStorageTable.c.timestamp)]\n )\n .where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == 
asset_key.to_string(legacy=True),\n )\n )\n .group_by(SqlEventLogStorageTable.c.run_id,)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).desc())\n )\n\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return [run_id for (run_id, _timestamp) in results]\n\n def wipe_asset(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n event_query = db.select(\n [SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]\n ).where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n asset_key_delete = AssetKeyTable.delete().where( # pylint: disable=no-value-for-parameter\n db.or_(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n\n with self.connect() as conn:\n conn.execute(asset_key_delete)\n results = conn.execute(event_query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_json_to_dagster_namedtuple(json_str)\n if not isinstance(event_record, EventRecord):\n continue\n\n assert event_record.dagster_event.event_specific_data.materialization.asset_key\n\n dagster_event = event_record.dagster_event\n event_specific_data = dagster_event.event_specific_data\n materialization = event_specific_data.materialization\n updated_materialization = Materialization(\n label=materialization.label,\n description=materialization.description,\n metadata_entries=materialization.metadata_entries,\n asset_key=None,\n skip_deprecation_warning=True,\n )\n updated_event_specific_data = event_specific_data._replace(\n materialization=updated_materialization\n )\n updated_dagster_event = dagster_event._replace(\n event_specific_data=updated_event_specific_data\n )\n updated_record = event_record._replace(dagster_event=updated_dagster_event)\n\n # update the event_record here\n self.update_event_log_record(row_id, updated_record)\n\n except seven.JSONDecodeError:\n logging.warning("Could not parse asset event record id `{}`.".format(row_id))\n
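The `_add_cursor_limit_to_query` helper above implements id-based cursor pagination over the event log: ascending queries keep rows with ids greater than the cursor, descending queries keep rows with smaller ids, and the result is capped and ordered by timestamp. A minimal standalone sketch of the same pattern, assuming a 1.x-style SQLAlchemy API and an illustrative `event_logs` table (the names here are not the actual Dagster schema):

.. code-block:: python

    import sqlalchemy as db

    metadata = db.MetaData()
    # Illustrative stand-in for the event log table; only id/event/timestamp are modeled.
    event_logs = db.Table(
        "event_logs",
        metadata,
        db.Column("id", db.Integer, primary_key=True),
        db.Column("event", db.String),
        db.Column("timestamp", db.DateTime),
    )

    def add_cursor_limit(query, cursor=None, limit=None, ascending=False):
        # Rows are keyed by an autoincrementing id, so "events after/before this cursor"
        # reduces to a comparison against that id.
        if cursor is not None:
            column = event_logs.c.id
            query = query.where(column > cursor if ascending else column < cursor)
        if limit:
            query = query.limit(limit)
        order = event_logs.c.timestamp.asc() if ascending else event_logs.c.timestamp.desc()
        return query.order_by(order)

    # Build a query for the 25 events that precede cursor id 100, newest first.
    query = add_cursor_limit(db.select([event_logs.c.id, event_logs.c.event]), cursor=100, limit=25)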
\nimport glob\nimport logging\nimport os\nimport sqlite3\nimport threading\nimport time\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\nimport sqlalchemy as db\nfrom dagster import StringSource, check\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.core.storage.sql import (\n check_alembic_revision,\n create_engine,\n get_alembic_config,\n handle_schema_errors,\n run_alembic_upgrade,\n stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlEventLogStorage\n\n\n[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster.core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir, inst_data=None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database."""\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._watchers = defaultdict(dict)\n self._obs = Observer()\n self._obs.start()\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n def upgrade(self):\n all_run_ids = self.get_all_run_ids()\n print( # pylint: disable=print-call\n "Updating event log storage for {n_runs} runs on disk...".format(\n n_runs=len(all_run_ids)\n )\n )\n alembic_config = get_alembic_config(__file__)\n for run_id in tqdm(all_run_ids):\n with self.connect(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self):\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [os.path.splitext(os.path.basename(filename))[0] for filename in 
all_filenames]\n\n def path_for_run_id(self, run_id):\n return os.path.join(self._base_dir, "{run_id}.db".format(run_id=run_id))\n\n def conn_string_for_run_id(self, run_id):\n check.str_param(run_id, "run_id")\n return create_db_conn_string(self._base_dir, run_id)\n\n def _initdb(self, engine):\n alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n while True:\n try:\n\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db.exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagit process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n "table asset_keys already exists" in err_msg\n or "table secondary_indexes already exists" in err_msg\n or "table event_logs already exists" in err_msg\n or "database is locked" in err_msg\n or "table alembic_version already exists" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying ({retry_limit} retries left). Exception: {str_exc}".format(\n retry_limit=retry_limit, str_exc=err_msg\n )\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def connect(self, run_id=None):\n with self._db_lock:\n check.str_param(run_id, "run_id")\n\n conn_string = self.conn_string_for_run_id(run_id)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if not run_id in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(run_id)\n\n conn = engine.connect()\n\n try:\n with handle_schema_errors(\n conn,\n get_alembic_config(__file__),\n msg="SqliteEventLogStorage for run {run_id}".format(run_id=run_id),\n ):\n yield conn\n finally:\n conn.close()\n engine.dispose()\n\n def has_secondary_index(self, name, run_id=None):\n return False\n\n def enable_secondary_index(self, name, run_id=None):\n pass\n\n def wipe(self):\n for filename in (\n glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n\n def watch(self, run_id, start_cursor, callback):\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, start_cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id, handler):\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch)\n del self._watchers[run_id][handler]\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, run_id, callback, start_cursor, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, 
"callback")\n self._log_path = event_log_storage.path_for_run_id(run_id)\n self._cursor = start_cursor if start_cursor is not None else -1\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self):\n events = self._event_log_storage.get_logs_for_run(self._run_id, self._cursor)\n self._cursor += len(events)\n for event in events:\n status = self._cb(event)\n\n if (\n status == PipelineRunStatus.SUCCESS\n or status == PipelineRunStatus.FAILURE\n or status == PipelineRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, TextIO, Union\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.definitions.resource import resource\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.types.decorator import usable_as_dagster_type\nfrom dagster.utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\n\n# pylint: disable=no-init\n[docs]@usable_as_dagster_type\nclass FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @abstractproperty\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()\n\n\n[docs]@usable_as_dagster_type\nclass LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path\n\n\n[docs]class FileManager(ABC): # pylint: disable=no-init\n """Base class for all file managers in dagster.\n \n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n\n In 0.10.x, this abstraction will be deprecated in favor of the :py:class:`~dagster.IOManager`.\n """\n\n[docs] @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. 
For files that must be available across solid boundaries, use the\n :py:meth:`~dagster.core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster.core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster.core.storage.file_manager.FileManager.write`, and \n :py:meth:`~dagster.core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()\n\n[docs] @abstractmethod\n def delete_local_temp(self):\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster.core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n \n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()\n\n[docs] @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> Union[TextIO, BinaryIO]:\n """Return a file-like stream for the file handle.\n \n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()\n\n[docs] @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()\n\n[docs] @abstractmethod\n def write(\n self, file_obj: Union[TextIO, BinaryIO], mode: str = "wb", ext: str = None\n ) -> FileHandle:\n """Write the bytes contained within the given file object into the file manager.\n\n Args:\n file_obj (Union[TextIO, StringIO]): A file-like object.\n mode (Optional[str]): The mode in which to write the file into the file manager.\n Default: ``"wb"``.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()\n\n[docs] @abstractmethod\n def write_data(self, data: bytes, ext: str = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()\n\n\n[docs]@resource(config_schema={"base_dir": Field(StringSource, default_value=".", is_required=False)})\ndef local_file_manager(init_context):\n """FileManager that provides abstract access to a local filesystem.\n \n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n\n Examples:\n\n .. 
code-block:: python\n\n import tempfile\n\n from dagster import ModeDefinition, local_file_manager, pipeline, solid\n\n\n @solid(required_resource_keys={"file_manager"})\n def write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n @solid(required_resource_keys={"file_manager"})\n def read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_2) == b"bar"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "foo"\n fd.close()\n\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])\n def files_pipeline():\n read_files(write_files())\n\n """\n\n return LocalFileManager(init_context.resource_config["base_dir"])\n\n\ndef check_file_like_obj(obj):\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance, run_id):\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self):\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle):\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj:\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n with open(file_handle.path, mode) as file_obj:\n yield file_obj\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n with open(dest_file_path, mode) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
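A compact sketch exercising ``LocalFileManager`` directly, outside of a pipeline, to show the round trip through the ``FileManager`` API (the temporary base directory is illustrative):

.. code-block:: python

    import tempfile

    from dagster.core.storage.file_manager import LocalFileManager

    with tempfile.TemporaryDirectory() as base_dir:
        file_manager = LocalFileManager(base_dir)

        # write_data returns a LocalFileHandle pointing at a uuid-named file under base_dir.
        handle = file_manager.write_data(b"hello")
        assert file_manager.read_data(handle) == b"hello"

        # read() is a context manager yielding a file-like stream in "r" or "rb" mode.
        with file_manager.read(handle, mode="r") as f:
            assert f.read() == "hello"

        file_manager.delete_local_temp()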
\nimport os\nimport pickle\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.definitions.events import AssetKey, AssetMaterialization, EventMetadataEntry\nfrom dagster.core.execution.context.system import InputContext, OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL, mkdir_p\nfrom dagster.utils.backcompat import experimental\n\n\n[docs]@io_manager(config_schema={"base_dir": Field(StringSource, default_value=".", is_required=False)})\ndef fs_io_manager(init_context):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n It allows users to specify a base directory where all the step outputs will be stored. It\n serializes and deserializes output values using pickling and automatically constructs\n the filepaths for the assets.\n\n Example usage:\n\n 1. Specify a pipeline-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all solids across a pipeline.\n\n .. code-block:: python\n\n @solid\n def solid_a(context, df):\n return df\n\n @solid\n def solid_b(context, df):\n return df[:5]\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])\n def pipe():\n solid_b(solid_a())\n\n\n 2. Specify IO manager on :py:class:`OutputDefinition`, which allows the user to set\n different IO managers on different step outputs.\n\n .. code-block:: python\n\n @solid(output_defs=[OutputDefinition(io_manager_key="my_io_manager")])\n def solid_a(context, df):\n return df\n\n @solid\n def solid_b(context, df):\n return df[:5]\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={"my_io_manager": fs_io_manager})]\n )\n def pipe():\n solid_b(solid_a())\n\n """\n\n return PickledObjectFilesystemIOManager(init_context.resource_config["base_dir"])\n\n\nclass PickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context):\n """Automatically construct filepath."""\n keys = context.get_run_scoped_output_identifier()\n\n return os.path.join(self.base_dir, *keys)\n\n def handle_output(self, context, obj):\n """Pickle the data and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n\n filepath = self._get_path(context)\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n check.inst_param(context, "context", InputContext)\n\n filepath = self._get_path(context.upstream_output)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO managerthat stores and retrieves values using pickling and\n 
allow users to specify file path for outputs.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, path):\n return os.path.join(self.base_dir, path)\n\n def handle_output(self, context, obj):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path")\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.pipeline_name, context.step_key, context.name]),\n metadata_entries=[EventMetadataEntry.fspath(os.path.abspath(filepath))],\n )\n\n def load_input(self, context):\n """Unpickle the file from a given file path and Load it to a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata\n path = check.str_param(metadata.get("path"), "metadata.path")\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n[docs]@io_manager(config_schema={"base_dir": Field(StringSource, default_value=".", is_required=False)})\n@experimental\ndef custom_path_fs_io_manager(init_context):\n """Built-in IO manager that allows users to custom output file path per output definition.\n\n It also allows users to specify a base directory where all the step output will be stored in. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n @solid(\n output_defs=[\n OutputDefinition(\n io_manager_key="io_manager", metadata={"path": "path/to/sample_output"}\n )\n ]\n )\n def sample_data(context, df):\n return df[:5]\n\n @pipeline(\n mode_defs=[\n ModeDefinition(resource_defs={"io_manager": custom_path_fs_io_manager}),\n ],\n )\n def pipe():\n sample_data()\n """\n return CustomPathPickledObjectFilesystemIOManager(init_context.resource_config["base_dir"])\n
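For completeness, a runnable end-to-end sketch of the mode-level pattern from the ``fs_io_manager`` docstring above, with invented solid and pipeline names and a hypothetical ``base_dir``; it assumes ``fs_io_manager`` is exported from the top-level ``dagster`` package:

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, fs_io_manager, pipeline, solid

    @solid
    def produce(_):
        return [1, 2, 3]

    @solid
    def total(_, xs):
        return sum(xs)

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])
    def sum_pipeline():
        total(produce())

    # base_dir is resource config, so it can vary per run without touching the pipeline.
    result = execute_pipeline(
        sum_pipeline,
        run_config={"resources": {"io_manager": {"config": {"base_dir": "/tmp/dagster_outputs"}}}},
    )
    assert result.success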
\nfrom collections import namedtuple\n\nfrom dagster import check\nfrom dagster.core.definitions import (\n IntermediateStorageDefinition,\n ModeDefinition,\n PipelineDefinition,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.type_storage import TypeStoragePluginRegistry\nfrom dagster.core.system_config.objects import EnvironmentConfig\n\n\n[docs]class InitIntermediateStorageContext(\n namedtuple(\n "InitIntermediateStorageContext",\n (\n "pipeline_def mode_def intermediate_storage_def pipeline_run instance environment_config "\n "type_storage_plugin_registry resources intermediate_storage_config"\n ),\n )\n):\n """Intermediate storage-specific initialization context.\n\n Attributes:\n pipeline_def (PipelineDefinition): The definition of the pipeline in context.\n mode_def (ModeDefinition): The definition of the mode in context.\n intermediate_storage_def (IntermediateStorageDefinition): The definition of the intermediate storage to be\n constructed.\n pipeline_run (PipelineRun): The pipeline run in context.\n instance (DagsterInstance): The instance.\n environment_config (EnvironmentConfig): The environment config.\n type_storage_plugin_registry (TypeStoragePluginRegistry): Registry containing custom type\n storage plugins.\n resources (Any): Resources available in context.\n intermediate_storage_config (Dict[str, Any]): The intermediate storage-specific configuration data\n provided by the environment config. The schema for this data is defined by the\n ``config_schema`` argument to :py:class:`IntermediateStorageDefinition`.\n """\n\n def __new__(\n cls,\n pipeline_def,\n mode_def,\n intermediate_storage_def,\n pipeline_run,\n instance,\n environment_config,\n type_storage_plugin_registry,\n resources,\n intermediate_storage_config,\n ):\n return super(InitIntermediateStorageContext, cls).__new__(\n cls,\n pipeline_def=check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition),\n mode_def=check.inst_param(mode_def, "mode_def", ModeDefinition),\n intermediate_storage_def=check.inst_param(\n intermediate_storage_def, "intermediate_storage_def", IntermediateStorageDefinition\n ),\n pipeline_run=check.inst_param(pipeline_run, "pipeline_run", PipelineRun),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n environment_config=check.inst_param(\n environment_config, "environment_config", EnvironmentConfig\n ),\n type_storage_plugin_registry=check.inst_param(\n type_storage_plugin_registry,\n "type_storage_plugin_registry",\n TypeStoragePluginRegistry,\n ),\n resources=check.not_none_param(resources, "resources"),\n intermediate_storage_config=check.dict_param(\n intermediate_storage_config, intermediate_storage_config, key_type=str\n ),\n )\n
\nfrom abc import abstractmethod\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.definition_config_schema import (\n convert_user_facing_definition_config_schema,\n)\nfrom dagster.core.definitions.resource import ResourceDefinition\nfrom dagster.core.storage.input_manager import InputManager\nfrom dagster.core.storage.output_manager import IOutputManagerDefinition, OutputManager\nfrom dagster.core.storage.root_input_manager import IInputManagerDefinition\n\n\n[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store solid outputs and load them as inputs to downstream solids.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefnition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n input_config_schema=None,\n output_config_schema=None,\n ):\n self._input_config_schema = input_config_schema\n self._output_config_schema = output_config_schema\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self):\n return self._input_config_schema\n\n @property\n def output_config_schema(self):\n return self._output_config_schema\n\n def copy_for_configured(self, name, description, config_schema, _):\n check.invariant(name is None, "ResourceDefintions do not have names")\n return IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n\n[docs]class IOManager(InputManager, OutputManager):\n """\n Base class for user-provided IO managers.\n\n IOManagers are used to store solid outputs and load them as inputs to downstream solids.\n\n Extend this class to handle how objects are loaded and stored. 
Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n[docs] @abstractmethod\n def load_input(self, context):\n """User-defined method that loads an input to a solid.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """\n\n[docs] @abstractmethod\n def handle_output(self, context, obj):\n """User-defined method that stores an output of a solid.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the solid, to be stored.\n """\n\n\n[docs]def io_manager(\n config_schema=None,\n description=None,\n output_config_schema=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n):\n """\n Define an IO manager.\n\n IOManagers are used to store solid outputs and load them as inputs to downstream solids.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. 
code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @solid(output_defs=[OutputDefinition(io_manager_key="my_io_manager_key")])\n def my_solid(_):\n return do_stuff()\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={"my_io_manager_key": my_io_manager})]\n )\n def my_pipeline():\n my_solid()\n\n execute_pipeline(my_pipeline)\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn):\n return _IOManagerDecoratorCallable(\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n output_config_schema=None,\n input_config_schema=None,\n ):\n self.config_schema = convert_user_facing_definition_config_schema(config_schema)\n self.description = check.opt_str_param(description, "description")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self.version = check.opt_str_param(version, "version")\n self.output_config_schema = convert_user_facing_definition_config_schema(\n output_config_schema\n )\n self.input_config_schema = convert_user_facing_definition_config_schema(input_config_schema)\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n update_wrapper(io_manager_def, wrapped=fn)\n\n return io_manager_def\n
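A hedged sketch of the decorator in use with resource-level config, mirroring the built-in ``PickledObjectFilesystemIOManager`` above; the class and resource names are invented:

.. code-block:: python

    import os
    import pickle

    from dagster import IOManager, io_manager

    class PickleIOManager(IOManager):
        def __init__(self, base_dir):
            self.base_dir = base_dir

        def _path(self, context):
            # run id / step key / output name, the same identifier scheme fs_io_manager uses
            return os.path.join(self.base_dir, *context.get_run_scoped_output_identifier())

        def handle_output(self, context, obj):
            path = self._path(context)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "wb") as f:
                pickle.dump(obj, f)

        def load_input(self, context):
            with open(self._path(context.upstream_output), "rb") as f:
                return pickle.load(f)

    @io_manager(config_schema={"base_dir": str})
    def pickle_io_manager(init_context):
        # Resource config declared in config_schema arrives on init_context.resource_config.
        return PickleIOManager(init_context.resource_config["base_dir"])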
\nimport hashlib\nimport os\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\nfrom dagster import StringSource, check\nfrom dagster.core.execution.compute_logs import mirror_stream_to_file\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, touch_file\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom .compute_log_manager import (\n MAX_BYTES_FILE_READ,\n ComputeIOType,\n ComputeLogFileData,\n ComputeLogManager,\n ComputeLogSubscription,\n)\n\nWATCHDOG_POLLING_TIMEOUT = 2.5\n\nIO_TYPE_EXTENSION = {ComputeIOType.STDOUT: "out", ComputeIOType.STDERR: "err"}\n\nMAX_FILENAME_LENGTH = 255\n\n\n[docs]class LocalComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk.\n """\n\n def __init__(self, base_dir, inst_data=None):\n self._base_dir = base_dir\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n\n key = self.get_key(pipeline_run, step_key)\n outpath = self.get_local_path(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n errpath = self.get_local_path(pipeline_run.run_id, key, ComputeIOType.STDERR)\n with mirror_stream_to_file(sys.stdout, outpath):\n with mirror_stream_to_file(sys.stderr, errpath):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n def _run_directory(self, run_id):\n return os.path.join(self._base_dir, run_id, "compute_logs")\n\n def get_local_path(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return self._get_local_path(run_id, key, IO_TYPE_EXTENSION[io_type])\n\n def complete_artifact_path(self, run_id, key):\n return self._get_local_path(run_id, key, "complete")\n\n def _get_local_path(self, run_id, key, extension):\n filename = "{}.{}".format(key, extension)\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(key.encode("utf-8")).hexdigest(), extension)\n return os.path.join(self._run_directory(run_id), filename)\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def is_watch_completed(self, run_id, key):\n return os.path.exists(self.complete_artifact_path(run_id, key))\n\n def on_watch_start(self, 
pipeline_run, step_key):\n pass\n\n def get_key(self, pipeline_run, step_key):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or pipeline_run.pipeline_name\n\n def on_watch_finish(self, pipeline_run, step_key=None):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n key = self.get_key(pipeline_run, step_key)\n touchpath = self.complete_artifact_path(pipeline_run.run_id, key)\n touch_file(touchpath)\n\n def download_url(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return "/download/{}/{}/{}".format(run_id, key, io_type.value)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = PollingObserver(WATCHDOG_POLLING_TIMEOUT)\n self._observer.start()\n\n def _key(self, run_id, key):\n return "{}:{}".format(run_id, key)\n\n def add_subscription(self, subscription):\n check.inst_param(subscription, "subscription", ComputeLogSubscription)\n key = self._key(subscription.run_id, subscription.key)\n self._subscriptions[key].append(subscription)\n self.watch(subscription.run_id, subscription.key)\n\n def remove_all_subscriptions(self, run_id, key):\n key = self._key(run_id, key)\n for subscription in self._subscriptions.pop(key, []):\n subscription.complete()\n\n def watch(self, run_id, key):\n key = self._key(run_id, key)\n if key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_local_path(run_id, key, ComputeIOType.STDOUT),\n self._manager.get_local_path(run_id, key, ComputeIOType.STDERR),\n ]\n complete_paths = [self._manager.complete_artifact_path(run_id, key)]\n directory = os.path.dirname(self._manager.get_local_path(run_id, key, ComputeIOType.STDERR))\n\n ensure_dir(directory)\n self._watchers[key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(self, run_id, key, update_paths, complete_paths),\n str(directory),\n )\n\n def notify_subscriptions(self, run_id, key):\n key = self._key(run_id, key)\n for subscription in self._subscriptions[key]:\n subscription.fetch()\n\n def unwatch(self, run_id, key, handler):\n key = self._key(run_id, key)\n if key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[key])\n del self._watchers[key]\n\n def dispose(self):\n self._observer.stop()\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, run_id, key, update_paths, complete_paths):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.run_id, self.key)\n self.manager.unwatch(self.run_id, self.key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.run_id, self.key)\n
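A short sketch of where ``LocalComputeLogManager`` places captured streams and how a caller reads them back; the run id, step key, and base directory are placeholders, and the import paths assume these modules live under ``dagster.core.storage``:

.. code-block:: python

    from dagster.core.storage.compute_log_manager import ComputeIOType
    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    manager = LocalComputeLogManager("/tmp/dagster_home/storage")

    # Captured stdout lands at {base_dir}/{run_id}/compute_logs/{key}.out (stderr -> .err),
    # and a ".complete" touch file marks the end of the watch.
    print(manager.get_local_path("my_run_id", "my_solid.compute", ComputeIOType.STDOUT))
    print(manager.complete_artifact_path("my_run_id", "my_solid.compute"))

    # read_logs_file returns ComputeLogFileData with data=None if nothing has been captured yet.
    chunk = manager.read_logs_file("my_run_id", "my_solid.compute", ComputeIOType.STDOUT)
    print(chunk.cursor, chunk.size)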
\nfrom dagster.core.storage.io_manager import IOManager, io_manager\n\n\nclass InMemoryIOManager(IOManager):\n def __init__(self):\n self.values = {}\n\n def handle_output(self, context, obj):\n keys = tuple(context.get_run_scoped_output_identifier())\n self.values[keys] = obj\n\n def load_input(self, context):\n keys = tuple(context.upstream_output.get_run_scoped_output_identifier())\n return self.values[keys]\n\n\n[docs]@io_manager\ndef mem_io_manager(_):\n """Built-in IO manager that stores and retrieves values in memory."""\n\n return InMemoryIOManager()\n
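A tiny sketch of swapping ``mem_io_manager`` in for a test mode so step outputs stay in memory; the solid and pipeline names are made up, and it assumes ``mem_io_manager`` is exported from the top-level ``dagster`` package:

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, mem_io_manager, pipeline, solid

    @solid
    def emit(_):
        return {"rows": 3}

    @solid
    def count_rows(_, payload):
        return payload["rows"]

    @pipeline(mode_defs=[ModeDefinition("test", resource_defs={"io_manager": mem_io_manager})])
    def emit_pipeline():
        count_rows(emit())

    # Nothing is pickled to disk; values pass between steps via InMemoryIOManager.
    assert execute_pipeline(emit_pipeline, mode="test").success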
\nimport os\nimport pickle\nfrom abc import abstractmethod\n\nfrom dagster import IOManager, check, io_manager\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.utils import PICKLE_PROTOCOL, mkdir_p\nfrom dagster.utils.backcompat import experimental\n\n\n[docs]class MemoizableIOManager(IOManager):\n """\n Base class for IO manager enabled to work with memoized execution. Users should implement\n the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n ``has_output`` method, which returns a boolean representing whether a data object can be found.\n """\n\n @abstractmethod\n def has_output(self, context):\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context):\n # automatically construct filepath\n step_key = check.str_param(context.step_key, "context.step_key")\n output_name = check.str_param(context.name, "context.name")\n version = check.str_param(context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n\n filepath = self._get_path(context)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n\n filepath = self._get_path(context.upstream_output)\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n\n filepath = self._get_path(context)\n\n return os.path.exists(filepath) and not os.path.isdir(filepath)\n\n\n[docs]@io_manager(config_schema={"base_dir": Field(StringSource, default_value=".", is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It allows users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(init_context.resource_config["base_dir"])\n
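A minimal sketch of the ``MemoizableIOManager`` contract with an invented in-memory implementation (the import path is an assumption): ``has_output`` is the only addition over ``IOManager``, and memoized execution skips recomputing a step whose versioned output it reports as present.

.. code-block:: python

    from dagster.core.storage.memoizable_io_manager import MemoizableIOManager

    class InMemoryMemoizableIOManager(MemoizableIOManager):
        def __init__(self):
            self.values = {}

        def _key(self, context):
            # Same (step, output, version) identity used by the filesystem variant above.
            return (context.step_key, context.name, context.version)

        def handle_output(self, context, obj):
            self.values[self._key(context)] = obj

        def load_input(self, context):
            return self.values[self._key(context.upstream_output)]

        def has_output(self, context):
            return self._key(context) in self.values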
\nimport warnings\nfrom collections import namedtuple\nfrom enum import Enum\n\nfrom dagster import check\nfrom dagster.core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.serdes import Persistable, whitelist_for_persistence, whitelist_for_serdes\n\nfrom .tags import (\n BACKFILL_ID_TAG,\n PARTITION_NAME_TAG,\n PARTITION_SET_TAG,\n RESUME_RETRY_TAG,\n SCHEDULE_NAME_TAG,\n SENSOR_NAME_TAG,\n)\n\n\n@whitelist_for_serdes\nclass PipelineRunStatus(Enum):\n QUEUED = "QUEUED"\n NOT_STARTED = "NOT_STARTED"\n MANAGED = "MANAGED"\n STARTING = "STARTING"\n STARTED = "STARTED"\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n CANCELING = "CANCELING"\n CANCELED = "CANCELED"\n\n\n# These statuses that indicate a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n PipelineRunStatus.STARTING,\n PipelineRunStatus.STARTED,\n PipelineRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n PipelineRunStatus.QUEUED,\n PipelineRunStatus.NOT_STARTED,\n PipelineRunStatus.SUCCESS,\n PipelineRunStatus.FAILURE,\n PipelineRunStatus.MANAGED,\n PipelineRunStatus.CANCELED,\n]\n\n\n@whitelist_for_serdes\nclass PipelineRunStatsSnapshot(\n namedtuple(\n "_PipelineRunStatsSnapshot",\n (\n "run_id steps_succeeded steps_failed materializations "\n "expectations start_time end_time"\n ),\n )\n):\n def __new__(\n cls,\n run_id,\n steps_succeeded,\n steps_failed,\n materializations,\n expectations,\n start_time,\n end_time,\n ):\n return super(PipelineRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, "steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\n[docs]@whitelist_for_persistence\nclass PipelineRun(\n namedtuple(\n "_PipelineRun",\n (\n "pipeline_name run_id run_config mode solid_selection solids_to_execute "\n "step_keys_to_execute status tags root_run_id parent_run_id "\n "pipeline_snapshot_id execution_plan_snapshot_id external_pipeline_origin"\n ),\n ),\n Persistable,\n):\n """Serializable internal representation of a pipeline run, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n pipeline_name=None,\n run_id=None,\n run_config=None,\n mode=None,\n solid_selection=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot_id=None,\n execution_plan_snapshot_id=None,\n external_pipeline_origin=None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n (\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group"\n ),\n )\n # a frozenset which contains the names of the solids to execute\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n # a list of solid queries provided by the user\n # possible to be None when only solids_to_execute is set by the user directly\n check.opt_list_param(solid_selection, 
"solid_selection", of_type=str)\n check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n if status == PipelineRunStatus.QUEUED:\n check.inst_param(\n external_pipeline_origin,\n "external_pipeline_origin",\n ExternalPipelineOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return super(PipelineRun, cls).__new__(\n cls,\n pipeline_name=check.opt_str_param(pipeline_name, "pipeline_name"),\n run_id=check.opt_str_param(run_id, "run_id", default=make_new_run_id()),\n run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n mode=check.opt_str_param(mode, "mode"),\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", PipelineRunStatus, PipelineRunStatus.NOT_STARTED\n ),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, "pipeline_snapshot_id"),\n execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_pipeline_origin=check.opt_inst_param(\n external_pipeline_origin, "external_pipeline_origin", ExternalPipelineOrigin\n ),\n )\n\n @classmethod\n def from_storage_dict(cls, storage_dict):\n # called by the serdes layer, delegates to helper method with expanded kwargs\n return cls._from_storage(**storage_dict)\n\n @classmethod\n def _from_storage(\n cls,\n pipeline_name=None,\n run_id=None,\n run_config=None,\n mode=None,\n solid_selection=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot_id=None,\n execution_plan_snapshot_id=None,\n # backcompat\n environment_dict=None,\n previous_run_id=None,\n selector=None,\n solid_subset=None,\n reexecution_config=None, # pylint: disable=unused-argument\n external_pipeline_origin=None,\n **kwargs,\n ):\n\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * renamed previous_run_id -> parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n\n # back compat for environment dict => run_config\n if environment_dict:\n check.invariant(\n not run_config,\n "Cannot set both run_config and environment_dict. 
Use run_config parameter.",\n )\n run_config = environment_dict\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if previous_run_id and not (parent_run_id and root_run_id):\n parent_run_id = previous_run_id\n root_run_id = previous_run_id\n\n # back compat for selector => pipeline_name, solids_to_execute\n selector = check.opt_inst_param(selector, "selector", ExecutionSelector)\n if selector:\n check.invariant(\n pipeline_name is None or selector.name == pipeline_name,\n (\n "Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: "\n "selector was passed with pipeline {selector_pipeline}".format(\n pipeline_name=pipeline_name, selector_pipeline=selector.name\n )\n ),\n )\n if pipeline_name is None:\n pipeline_name = selector.name\n\n check.invariant(\n solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,\n (\n "Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: "\n "selector was passed with subset {selector_subset}".format(\n solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset\n )\n ),\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = (\n frozenset(selector.solid_subset) if selector.solid_subset else None\n )\n\n # back compat for solid_subset => solids_to_execute\n check.opt_list_param(solid_subset, "solid_subset", of_type=str)\n if solid_subset:\n solids_to_execute = frozenset(solid_subset)\n\n # warn about unused arguments\n if len(kwargs):\n warnings.warn(\n "Found unhandled arguments from stored PipelineRun: {args}".format(\n args=kwargs.keys()\n )\n )\n\n return cls.__new__( # pylint: disable=redundant-keyword-arg\n cls,\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_pipeline_origin=external_pipeline_origin,\n )\n\n def with_status(self, status):\n if status == PipelineRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n check.inst(\n self.external_pipeline_origin,\n ExternalPipelineOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_mode(self, mode):\n return self._replace(mode=mode)\n\n def with_tags(self, tags):\n return self._replace(tags=tags)\n\n def get_root_run_id(self):\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self):\n return self.tags.get(PARENT_RUN_ID_TAG)\n\n @property\n def is_finished(self):\n return (\n self.status == PipelineRunStatus.SUCCESS\n or self.status == PipelineRunStatus.FAILURE\n or self.status == PipelineRunStatus.CANCELED\n )\n\n @property\n def is_success(self):\n return self.status == PipelineRunStatus.SUCCESS\n\n @property\n def is_failure(self):\n return self.status == PipelineRunStatus.FAILURE or self.status == PipelineRunStatus.CANCELED\n\n @property\n def is_resume_retry(self):\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self):\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def 
tags_for_schedule(schedule):\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor):\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id):\n return {BACKFILL_ID_TAG: backfill_id}\n\n @staticmethod\n def tags_for_partition_set(partition_set, partition):\n return {PARTITION_NAME_TAG: partition.name, PARTITION_SET_TAG: partition_set.name}\n\n\n@whitelist_for_serdes\nclass PipelineRunsFilter(\n namedtuple("_PipelineRunsFilter", "run_ids pipeline_name statuses tags snapshot_id")\n):\n def __new__(cls, run_ids=None, pipeline_name=None, statuses=None, tags=None, snapshot_id=None):\n return super(PipelineRunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_list_param(run_ids, "run_ids", of_type=str),\n pipeline_name=check.opt_str_param(pipeline_name, "pipeline_name"),\n statuses=check.opt_list_param(statuses, "statuses", of_type=PipelineRunStatus),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n )\n\n @staticmethod\n def for_schedule(schedule):\n return PipelineRunsFilter(tags=PipelineRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_partition(partition_set, partition):\n return PipelineRunsFilter(tags=PipelineRun.tags_for_partition_set(partition_set, partition))\n\n @staticmethod\n def for_sensor(sensor):\n return PipelineRunsFilter(tags=PipelineRun.tags_for_sensor(sensor))\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(namedtuple("_ExecutionSelector", "name solid_subset")):\n """\n Kept here to maintain loading of PipelineRuns from when it was still alive.\n """\n\n def __new__(cls, name, solid_subset=None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=None\n if solid_subset is None\n else check.list_param(solid_subset, "solid_subset", of_type=str),\n )\n
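As a usage sketch of the run model defined above (the import path follows the cross-references elsewhere in this file; the pipeline name and tags are illustrative, not part of the module):

.. code-block:: python

    from dagster.core.storage.pipeline_run import (
        PipelineRun,
        PipelineRunStatus,
        PipelineRunsFilter,
    )

    # PipelineRun is a namedtuple, so with_status returns a copy; a fresh run
    # defaults to NOT_STARTED and a generated run_id.
    run = PipelineRun(pipeline_name="my_pipeline", tags={"team": "data"})
    started = run.with_status(PipelineRunStatus.STARTED)
    assert not started.is_finished

    # A filter that matches successful runs of the same pipeline carrying the tag.
    runs_filter = PipelineRunsFilter(
        pipeline_name="my_pipeline",
        statuses=[PipelineRunStatus.SUCCESS],
        tags={"team": "data"},
    )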
\nimport os\n\nfrom dagster import StringSource, check\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\n\n[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir, inst_data=None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def base_dir(self):\n return self._base_dir\n\n def file_manager_dir(self, run_id):\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n def intermediates_dir(self, run_id):\n return os.path.join(self.base_dir, "storage", run_id, "")\n\n @property\n def schedules_dir(self):\n return os.path.join(self.base_dir, "schedules")\n\n[docs] @staticmethod\n def from_config_value(inst_data, config_value):\n return LocalArtifactStorage(inst_data=inst_data, **config_value)\n\n\n
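A minimal sketch of how the local artifact storage lays out per-run paths; the import path and base directory here are assumptions for illustration:

.. code-block:: python

    import os

    # Assumed module path for the class defined above.
    from dagster.core.storage.root import LocalArtifactStorage

    storage = LocalArtifactStorage(base_dir="/tmp/dagster-home")
    run_id = "8c1b2f0e-0000-0000-0000-000000000000"

    # Per-run artifacts live under <base_dir>/storage/<run_id>/...
    assert storage.file_manager_dir(run_id) == os.path.join(
        "/tmp/dagster-home", "storage", run_id, "files"
    )
    assert storage.schedules_dir == os.path.join("/tmp/dagster-home", "schedules")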
\nfrom abc import abstractmethod, abstractproperty\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.definition_config_schema import (\n convert_user_facing_definition_config_schema,\n)\nfrom dagster.core.definitions.resource import ResourceDefinition\nfrom dagster.core.storage.input_manager import InputManager\nfrom dagster.utils.backcompat import experimental\n\n\nclass IInputManagerDefinition:\n @abstractproperty\n def input_config_schema(self):\n """The schema for per-input configuration for inputs that are managed by this\n input manager"""\n\n\n[docs]class RootInputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of a root input manager resource.\n\n Root input managers load solid inputs that aren't connected to upstream outputs.\n\n An RootInputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns an\n :py:class:`RootInputManager`.\n\n The easiest way to create an RootInputManagerDefinition is with the\n :py:func:`@root_input_manager <root_input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(RootInputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self):\n return self._input_config_schema\n\n def copy_for_configured(self, name, description, config_schema, _):\n check.invariant(name is None, "ResourceDefintions do not have names")\n return RootInputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )\n\n\n[docs]class RootInputManager(InputManager):\n """RootInputManagers are used to load inputs to solids at the root of a pipeline.\n\n The easiest way to define an RootInputManager is with the\n :py:func:`@root_input_manager <root_input_manager>` decorator.\n """\n\n[docs] @abstractmethod\n def load_input(self, context):\n """The user-defined read method that loads data given its metadata.\n\n Args:\n context (InputContext): The context of the step output that produces this asset.\n\n Returns:\n Any: The data object.\n """\n\n\n[docs]@experimental\ndef root_input_manager(\n config_schema=None,\n description=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n):\n """Define a root input manager.\n\n Root input managers load solid inputs that aren't connected to upstream outputs.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of a solid.\n\n The decorator produces an :py:class:`RootInputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. 
Each\n input that uses this input manager can be configured separately using this config.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. code-block:: python\n\n @root_input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @solid(input_defs=[InputDefinition("input1", root_manager_key="csv_loader_key")])\n def my_solid(_, input1):\n do_stuff(input1)\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"csv_loader_key": csv_loader})])\n def my_pipeline():\n my_solid()\n\n @root_input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @root_input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn):\n return _InputManagerDecoratorCallable(\n config_schema=config_schema,\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap\n\n\nclass RootInputManagerWrapper(RootInputManager):\n def __init__(self, load_fn):\n self._load_fn = load_fn\n\n def load_input(self, context):\n return self._load_fn(context)\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema=None,\n description=None,\n version=None,\n input_config_schema=None,\n required_resource_keys=None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn):\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return RootInputManagerWrapper(load_fn)\n\n root_input_manager_def = RootInputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(root_input_manager_def, wrapped=load_fn)\n\n return root_input_manager_def\n
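The decorator shown in the docstring is sugar over the pieces defined in this module; as a hedged sketch (the top-level imports are assumed to be exported, as the decorator is), the same manager can be wired up by hand:

.. code-block:: python

    from dagster import RootInputManager, RootInputManagerDefinition

    class ConstantLoader(RootInputManager):
        """Loads the same value for every root input bound to it."""

        def load_input(self, context):
            return 42

    # Mirrors what _InputManagerDecoratorCallable builds: a ResourceDefinition
    # whose resource_fn returns the manager instance.
    constant_loader = RootInputManagerDefinition(
        resource_fn=lambda _init_context: ConstantLoader(),
        description="Always loads 42 for root inputs bound to this manager.",
    )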
\nfrom abc import ABC, abstractmethod\n\n\n[docs]class RunStorage(ABC):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster.core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagit`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, pipeline_run):\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n pipeline_run (PipelineRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id, event):\n """Update run storage in accordance to a pipeline run related DagsterEvent\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(self, filters=None, cursor=None, limit=None):\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[PipelineRunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.PipelineRunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_runs_count(self, filters=None):\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[PipelineRunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id):\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_groups(self, filters=None, cursor=None, limit=None):\n """Return all of the run groups present in the storage that include rows matching the\n given filter.\n\n Args:\n filter (Optional[PipelineRunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.PipelineRunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n Dict[Dict[Union[PipelineRun, int]]]: Specifically, a dict of the form\n ``{'pipeline_run_id': {'runs': [PipelineRun, ...], 'count': int}, ...}``. 
The\n instances of :py:class:`~dagster.core.pipeline_run.PipelineRun` returned in this\n data structure correspond to all of the runs that would have been returned by\n calling :py:meth:`get_run_groups` with the same arguments, plus their corresponding\n root runs, if any. The keys of this structure are the run_ids of all of the root\n runs (a run with no root is its own root). The integer counts are inclusive of all\n of the root runs' children, including those that would not have been returned by\n calling :py:meth:`get_run_groups` with the same arguments, but exclusive of the root\n run itself; i.e., if a run has no children, the count will be 0.\n """\n\n # Note that we could have made the opposite decision here and filtered for root runs\n # matching a given filter, etc., rather than for child runs; so that asking for the last 5\n # run groups would give the last 5 roots and their descendants, rather than the last 5\n # children and their roots. Consider the case where we have just been retrying runs\n # belonging to a group created long ago; it makes sense to bump these to the top of the\n # interface rather than burying them deeply paginated down. Note also that this query can\n # return no more run groups than there are runs in an equivalent call to get_runs, and no\n # more than 2x total instances of PipelineRun.\n\n @abstractmethod\n def get_run_by_id(self, run_id):\n """Get a run by its id.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n Optional[PipelineRun]\n """\n\n @abstractmethod\n def get_run_tags(self):\n """Get a list of tag keys and the values that have been associated with them.\n\n Returns:\n List[Tuple[string, Set[string]]]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id, new_tags):\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id):\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n @abstractmethod\n def has_pipeline_snapshot(self, pipeline_snapshot_id):\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_pipeline_snapshot(self, pipeline_snapshot):\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n pipeline_snapshot (PipelineSnapshot)\n\n Return:\n str: The pipeline_snapshot_id\n """\n\n @abstractmethod\n def get_pipeline_snapshot(self, pipeline_snapshot_id):\n """Fetch a snapshot by ID\n\n Args:\n pipeline_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id):\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(self, execution_plan_snapshot):\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. 
This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id):\n """Fetch a snapshot by ID\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self):\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id):\n """Remove a run from storage"""\n\n @abstractmethod\n def build_missing_indexes(self, print_fn=lambda _: None, force_rebuild_all=False):\n """Call this method to run any data migrations"""\n\n def dispose(self):\n """Explicit lifecycle management."""\n\n def optimize_for_dagit(self, statement_timeout):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat):\n """Called on a regular interval by the daemon"""\n\n @abstractmethod\n def get_daemon_heartbeats(self):\n """Latest heartbeats of all daemon types"""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self):\n """Wipe all daemon heartbeats"""\n
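Because ``get_runs`` accepts a run-id cursor and a limit, callers can page through large result sets against any concrete storage; a small interface-level sketch (the helper name and page size are ours, not part of the API):

.. code-block:: python

    from dagster.core.storage.pipeline_run import PipelineRunStatus, PipelineRunsFilter

    def iter_failed_runs(storage, page_size=25):
        """Yield failed runs from any RunStorage, newest first, one page at a time."""
        failed = PipelineRunsFilter(statuses=[PipelineRunStatus.FAILURE])
        cursor = None
        while True:
            page = storage.get_runs(filters=failed, cursor=cursor, limit=page_size)
            if not page:
                break
            yield from page
            # The cursor is the run_id of the last run seen on the previous page.
            cursor = page[-1].run_id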
\nimport logging\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\n\nimport sqlalchemy as db\nfrom dagster import check\nfrom dagster.core.errors import DagsterRunAlreadyExists, DagsterSnapshotDoesNotExist\nfrom dagster.core.events import DagsterEvent, DagsterEventType\nfrom dagster.core.snap import (\n ExecutionPlanSnapshot,\n PipelineSnapshot,\n create_execution_plan_snapshot_id,\n create_pipeline_snapshot_id,\n)\nfrom dagster.core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG, ROOT_RUN_ID_TAG\nfrom dagster.serdes import deserialize_json_to_dagster_namedtuple, serialize_dagster_namedtuple\nfrom dagster.seven import JSONDecodeError\nfrom dagster.utils import merge_dicts, utc_datetime_from_timestamp\n\nfrom ..pipeline_run import PipelineRun, PipelineRunStatus, PipelineRunsFilter\nfrom .base import RunStorage\nfrom .migration import RUN_DATA_MIGRATIONS, RUN_PARTITIONS\nfrom .schema import (\n DaemonHeartbeatsTable,\n RunTagsTable,\n RunsTable,\n SecondaryIndexMigrationTable,\n SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n PIPELINE = "PIPELINE"\n EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n[docs]class SqlRunStorage(RunStorage): # pylint: disable=no-init\n """Base class for SQL based run storages\n """\n\n @abstractmethod\n def connect(self):\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def fetchone(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n row = result_proxy.fetchone()\n result_proxy.close()\n\n return row\n\n def add_run(self, pipeline_run):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n if pipeline_run.pipeline_snapshot_id and not self.has_pipeline_snapshot(\n pipeline_run.pipeline_snapshot_id\n ):\n raise DagsterSnapshotDoesNotExist(\n "Snapshot {ss_id} does not exist in run storage".format(\n ss_id=pipeline_run.pipeline_snapshot_id\n )\n )\n\n has_tags = pipeline_run.tags and len(pipeline_run.tags) > 0\n partition = pipeline_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = pipeline_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n with self.connect() as conn:\n try:\n runs_insert = RunsTable.insert().values( # pylint: disable=no-value-for-parameter\n run_id=pipeline_run.run_id,\n pipeline_name=pipeline_run.pipeline_name,\n status=pipeline_run.status.value,\n run_body=serialize_dagster_namedtuple(pipeline_run),\n snapshot_id=pipeline_run.pipeline_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n conn.execute(runs_insert)\n except db.exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n if pipeline_run.tags and len(pipeline_run.tags) > 0:\n conn.execute(\n RunTagsTable.insert(), # pylint: disable=no-value-for-parameter\n [\n dict(run_id=pipeline_run.run_id, key=k, value=v)\n for k, v in pipeline_run.tags.items()\n ],\n )\n\n return pipeline_run\n\n def handle_run_event(self, run_id, event):\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n lookup = {\n DagsterEventType.PIPELINE_START: PipelineRunStatus.STARTED,\n DagsterEventType.PIPELINE_SUCCESS: 
PipelineRunStatus.SUCCESS,\n DagsterEventType.PIPELINE_FAILURE: PipelineRunStatus.FAILURE,\n DagsterEventType.PIPELINE_INIT_FAILURE: PipelineRunStatus.FAILURE,\n DagsterEventType.PIPELINE_ENQUEUED: PipelineRunStatus.QUEUED,\n DagsterEventType.PIPELINE_STARTING: PipelineRunStatus.STARTING,\n DagsterEventType.PIPELINE_CANCELING: PipelineRunStatus.CANCELING,\n DagsterEventType.PIPELINE_CANCELED: PipelineRunStatus.CANCELED,\n }\n\n if event.event_type not in lookup:\n return\n\n run = self.get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_pipeline_status = lookup[event.event_type]\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update() # pylint: disable=no-value-for-parameter\n .where(RunsTable.c.run_id == run_id)\n .values(\n status=new_pipeline_status.value,\n run_body=serialize_dagster_namedtuple(run.with_status(new_pipeline_status)),\n update_timestamp=datetime.now(),\n )\n )\n\n def _row_to_run(self, row):\n return deserialize_json_to_dagster_namedtuple(row[0])\n\n def _rows_to_runs(self, rows):\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(self, query, cursor, limit):\n """ Helper function to deal with cursor/limit pagination args """\n\n if cursor:\n cursor_query = db.select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < cursor_query)\n\n if limit:\n query = query.limit(limit)\n\n query = query.order_by(RunsTable.c.id.desc())\n return query\n\n def _add_filters_to_query(self, query, filters):\n check.inst_param(filters, "filters", PipelineRunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.pipeline_name:\n query = query.where(RunsTable.c.pipeline_name == filters.pipeline_name)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.tags:\n query = query.where(\n db.or_(\n *(\n db.and_(RunTagsTable.c.key == key, RunTagsTable.c.value == value)\n for key, value in filters.tags.items()\n )\n )\n ).group_by(RunsTable.c.run_body, RunsTable.c.id)\n\n if len(filters.tags) > 0:\n query = query.having(db.func.count(RunsTable.c.run_id) == len(filters.tags))\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n return query\n\n def _runs_query(self, filters=None, cursor=None, limit=None, columns=None):\n\n filters = check.opt_inst_param(\n filters, "filters", PipelineRunsFilter, default=PipelineRunsFilter()\n )\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(columns, "columns")\n\n if columns is None:\n columns = ["run_body"]\n\n base_query_columns = [getattr(RunsTable.c, column) for column in columns]\n\n # If we have a tags filter, then we need to select from a joined table\n if filters.tags:\n base_query = db.select(base_query_columns).select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n else:\n base_query = db.select(base_query_columns).select_from(RunsTable)\n\n query = self._add_filters_to_query(base_query, filters)\n query = self._add_cursor_limit_to_query(query, cursor, limit)\n\n return query\n\n def get_runs(self, filters=None, cursor=None, limit=None):\n query = self._runs_query(filters, cursor, limit)\n\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def get_runs_count(self, filters=None):\n subquery = self._runs_query(filters=filters).alias("subquery")\n\n # We use an alias 
here because Postgres requires subqueries to be\n # aliased.\n subquery = subquery.alias("subquery")\n\n query = db.select([db.func.count()]).select_from(subquery)\n rows = self.fetchall(query)\n count = rows[0][0]\n return count\n\n def get_run_by_id(self, run_id):\n """Get a run by its id.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n Optional[PipelineRun]\n """\n check.str_param(run_id, "run_id")\n\n query = db.select([RunsTable.c.run_body]).where(RunsTable.c.run_id == run_id)\n rows = self.fetchall(query)\n return deserialize_json_to_dagster_namedtuple(rows[0][0]) if len(rows) else None\n\n def get_run_tags(self):\n result = defaultdict(set)\n query = db.select([RunTagsTable.c.key, RunTagsTable.c.value]).distinct(\n RunTagsTable.c.key, RunTagsTable.c.value\n )\n rows = self.fetchall(query)\n for r in rows:\n result[r[0]].add(r[1])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def add_run_tags(self, run_id, new_tags):\n check.str_param(run_id, "run_id")\n check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self.get_run_by_id(run_id)\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update() # pylint: disable=no-value-for-parameter\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_dagster_namedtuple(\n run.with_tags(merge_dicts(current_tags, new_tags))\n ),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=datetime.now(),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update() # pylint: disable=no-value-for-parameter\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(), # pylint: disable=no-value-for-parameter\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id):\n check.str_param(run_id, "run_id")\n pipeline_run = self.get_run_by_id(run_id)\n if not pipeline_run:\n return None\n\n # find root_run\n root_run_id = pipeline_run.root_run_id if pipeline_run.root_run_id else pipeline_run.run_id\n root_run = self.get_run_by_id(root_run_id)\n\n # root_run_id to run_id 1:1 mapping\n root_to_run = (\n db.select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n )\n .where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n )\n .alias("root_to_run")\n )\n # get run group\n run_group_query = (\n db.select([RunsTable.c.run_body])\n .select_from(\n root_to_run.join(\n RunsTable, root_to_run.c.run_id == RunsTable.c.run_id, isouter=True,\n )\n )\n .alias("run_group")\n )\n\n with self.connect() as conn:\n res = conn.execute(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run] + run_group)\n\n def get_run_groups(self, filters=None, cursor=None, limit=None):\n # The runs that would be returned by calling RunStorage.get_runs with the same arguments\n runs = self._runs_query(\n filters=filters, cursor=cursor, limit=limit, columns=["run_body", "run_id"]\n ).alias("runs")\n\n # Gets us the 
run_id and associated root_run_id for every run in storage that is a\n # descendant run of some root\n #\n # pseudosql:\n # with all_descendant_runs as (\n # select *\n # from run_tags\n # where key = @ROOT_RUN_ID_TAG\n # )\n\n all_descendant_runs = (\n db.select([RunTagsTable])\n .where(RunTagsTable.c.key == ROOT_RUN_ID_TAG)\n .alias("all_descendant_runs")\n )\n\n # Augment the runs in our query, for those runs that are the descendant of some root run,\n # with the root_run_id\n #\n # pseudosql:\n #\n # with runs_augmented as (\n # select\n # runs.run_id as run_id,\n # all_descendant_runs.value as root_run_id\n # from runs\n # left outer join all_descendant_runs\n # on all_descendant_runs.run_id = runs.run_id\n # )\n\n runs_augmented = (\n db.select(\n [runs.c.run_id.label("run_id"), all_descendant_runs.c.value.label("root_run_id"),]\n )\n .select_from(\n runs.join(\n all_descendant_runs,\n all_descendant_runs.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n .alias("runs_augmented")\n )\n\n # Get all the runs our query will return. This includes runs as well as their root runs.\n #\n # pseudosql:\n #\n # with runs_and_root_runs as (\n # select runs.run_id as run_id\n # from runs, runs_augmented\n # where\n # runs.run_id = runs_augmented.run_id or\n # runs.run_id = runs_augmented.root_run_id\n # )\n\n runs_and_root_runs = (\n db.select([RunsTable.c.run_id.label("run_id")])\n .select_from(runs_augmented)\n .where(\n db.or_(\n RunsTable.c.run_id == runs_augmented.c.run_id,\n RunsTable.c.run_id == runs_augmented.c.root_run_id,\n )\n )\n .distinct(RunsTable.c.run_id)\n ).alias("runs_and_root_runs")\n\n # We count the descendants of all of the runs in our query that are roots so that\n # we can accurately display when a root run has more descendants than are returned by this\n # query and afford a drill-down. 
This might be an unnecessary complication, but the\n # alternative isn't obvious -- we could go and fetch *all* the runs in any group that we're\n # going to return in this query, and then append those.\n #\n # pseudosql:\n #\n # select runs.run_body, count(all_descendant_runs.id) as child_counts\n # from runs\n # join runs_and_root_runs on runs.run_id = runs_and_root_runs.run_id\n # left outer join all_descendant_runs\n # on all_descendant_runs.value = runs_and_root_runs.run_id\n # group by runs.run_body\n # order by child_counts desc\n\n runs_and_root_runs_with_descendant_counts = (\n db.select(\n [\n RunsTable.c.run_body,\n db.func.count(all_descendant_runs.c.id).label("child_counts"),\n ]\n )\n .select_from(\n RunsTable.join(\n runs_and_root_runs, RunsTable.c.run_id == runs_and_root_runs.c.run_id\n ).join(\n all_descendant_runs,\n all_descendant_runs.c.value == runs_and_root_runs.c.run_id,\n isouter=True,\n )\n )\n .group_by(RunsTable.c.run_body)\n .order_by(db.desc(db.column("child_counts")))\n )\n\n with self.connect() as conn:\n res = conn.execute(runs_and_root_runs_with_descendant_counts).fetchall()\n\n # Postprocess: descendant runs get aggregated with their roots\n run_groups = defaultdict(lambda: {"runs": [], "count": 0})\n for (run_body, count) in res:\n row = (run_body,)\n pipeline_run = self._row_to_run(row)\n root_run_id = pipeline_run.get_root_run_id()\n if root_run_id is not None:\n run_groups[root_run_id]["runs"].append(pipeline_run)\n else:\n run_groups[pipeline_run.run_id]["runs"].append(pipeline_run)\n run_groups[pipeline_run.run_id]["count"] = count + 1\n\n return run_groups\n\n def has_run(self, run_id):\n check.str_param(run_id, "run_id")\n return bool(self.get_run_by_id(run_id))\n\n def delete_run(self, run_id):\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_pipeline_snapshot(self, pipeline_snapshot_id):\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n return self._has_snapshot_id(pipeline_snapshot_id)\n\n def add_pipeline_snapshot(self, pipeline_snapshot):\n check.inst_param(pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot)\n return self._add_snapshot(\n snapshot_id=create_pipeline_snapshot_id(pipeline_snapshot),\n snapshot_obj=pipeline_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_pipeline_snapshot(self, pipeline_snapshot_id):\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n return self._get_snapshot(pipeline_snapshot_id)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id):\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(self, execution_plan_snapshot):\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n return self._add_snapshot(\n snapshot_id=execution_plan_snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id):\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id)\n\n def _add_snapshot(self, snapshot_id, snapshot_obj, snapshot_type):\n check.str_param(snapshot_id, "snapshot_id")\n 
check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = SnapshotsTable.insert().values( # pylint: disable=no-value-for-parameter\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(\n serialize_dagster_namedtuple(snapshot_obj).encode("utf-8")\n ),\n snapshot_type=snapshot_type.value,\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def _has_snapshot_id(self, snapshot_id):\n query = db.select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id):\n query = db.select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_pipeline_snapshot_query(logging, row) if row else None\n\n def _get_partition_runs(self, partition_set_name, partition_name):\n # utility method to help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=PipelineRunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def build_missing_indexes(self, print_fn=lambda _: None, force_rebuild_all=False):\n for migration_name, migration_fn in RUN_DATA_MIGRATIONS.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n continue\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n print_fn(f"Finished data migration: {migration_name}")\n\n def has_built_index(self, migration_name):\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name):\n query = SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=migration_name, migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat):\n with self.connect() as conn:\n\n # insert, or update if already present\n try:\n conn.execute(\n DaemonHeartbeatsTable.insert().values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type.value,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n except db.exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update() # pylint: disable=no-value-for-parameter\n .where(\n DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type.value\n )\n .values( # pylint: 
disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self):\n\n with self.connect() as conn:\n rows = conn.execute(db.select(DaemonHeartbeatsTable.columns))\n heartbeats = [deserialize_json_to_dagster_namedtuple(row.body) for row in rows]\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self):\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(RunTagsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(SnapshotsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(DaemonHeartbeatsTable.delete()) # pylint: disable=no-value-for-parameter\n\n def wipe_daemon_heartbeats(self):\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete()) # pylint: disable=no-value-for-parameter\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_pipeline_snapshot_query(logger, row):\n # no checking here because sqlalchemy returns a special\n # row proxy and don't want to instance check on an internal\n # implementation detail\n\n def _warn(msg):\n logger.warning("get-pipeline-snapshot: {msg}".format(msg=msg))\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_json_to_dagster_namedtuple(decoded_str)\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
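Note that when a tags filter is present, ``_add_filters_to_query`` joins against the run-tags table and keeps only runs matching every key/value pair (the ``having count(...) == len(tags)`` clause). A usage sketch against the SQLite implementation that follows; the base directory and tags are illustrative:

.. code-block:: python

    from dagster.core.storage.pipeline_run import PipelineRunsFilter
    from dagster.core.storage.runs import SqliteRunStorage

    storage = SqliteRunStorage.from_local("/tmp/dagster-home/history")

    # Tag filters are ANDed: a run must carry both pairs to be returned.
    strict = PipelineRunsFilter(tags={"team": "data", "priority": "high"})
    print(storage.get_runs_count(filters=strict))
    for run in storage.get_runs(filters=strict, limit=10):
        print(run.run_id, run.tags)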
\nimport os\nfrom contextlib import contextmanager\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom dagster import StringSource, check\nfrom dagster.core.storage.sql import (\n check_alembic_revision,\n create_engine,\n get_alembic_config,\n handle_schema_errors,\n run_alembic_downgrade,\n run_alembic_upgrade,\n stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\nfrom sqlalchemy.pool import NullPool\n\nfrom ..schema import RunStorageSqlMetadata, RunTagsTable, RunsTable\nfrom ..sql_run_storage import SqlRunStorage\n\n\n[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster.core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string, inst_data=None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @staticmethod\n def from_local(base_dir, inst_data=None):\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n\n return SqliteRunStorage(conn_string, inst_data)\n\n @contextmanager\n def connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n with handle_schema_errors(\n conn, get_alembic_config(__file__), msg="Sqlite run storage requires migration",\n ):\n yield conn\n finally:\n conn.close()\n\n def _alembic_upgrade(self, rev="head"):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev="head"):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n def upgrade(self):\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to 
DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the move\n def _check_for_version_066_migration_and_perform(self):\n old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id):\n """ Override the default sql delete run implementation until we can get full\n support on cascading deletes """\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)\n
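Equivalent to the ``dagster.yaml`` block in the docstring, the storage can also be built programmatically for tests or scripts; the base directory is illustrative:

.. code-block:: python

    from dagster.core.storage.pipeline_run import PipelineRun
    from dagster.core.storage.runs import SqliteRunStorage

    storage = SqliteRunStorage.from_local("/tmp/dagster-home/history")

    # Runs without a pipeline snapshot can be added directly; the run_id defaults
    # to a freshly generated UUID.
    run = PipelineRun(pipeline_name="my_pipeline")
    storage.add_run(run)
    assert storage.has_run(run.run_id)
    assert storage.get_run_by_id(run.run_id).pipeline_name == "my_pipeline"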
\nimport abc\n\nfrom dagster.core.definitions.job import JobType\nfrom dagster.core.errors import DagsterScheduleWipeRequired\nfrom dagster.core.scheduler.job import JobStatus\n\n\n[docs]class ScheduleStorage(abc.ABC):\n """Abstract class for managing persistance of scheduler artifacts\n """\n\n @abc.abstractmethod\n def wipe(self):\n """Delete all schedules from storage\n """\n\n @abc.abstractmethod\n def all_stored_job_state(self, repository_origin_id=None, job_type=None):\n """Return all JobStates present in storage\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n job_type (Optional[JobType]): The JobType to scope results to\n """\n\n @abc.abstractmethod\n def get_job_state(self, job_origin_id):\n """Return the unique job with the given id\n\n Args:\n job_origin_id (str): The unique job identifier\n """\n\n @abc.abstractmethod\n def add_job_state(self, job):\n """Add a job to storage.\n\n Args:\n job (JobState): The job to add\n """\n\n @abc.abstractmethod\n def update_job_state(self, job):\n """Update a job in storage.\n\n Args:\n job (JobState): The job to update\n """\n\n @abc.abstractmethod\n def delete_job_state(self, job_origin_id):\n """Delete a job in storage.\n\n Args:\n job_origin_id (str): The id of the ExternalJob target to delete\n """\n\n @abc.abstractmethod\n def get_job_ticks(self, job_origin_id):\n """Get the ticks for a given job.\n\n Args:\n job_origin_id (str): The id of the ExternalJob target\n """\n\n @abc.abstractmethod\n def get_latest_job_tick(self, job_origin_id):\n """Get the most recent tick for a given job.\n\n Args:\n job_origin_id (str): The id of the ExternalJob target\n """\n\n @abc.abstractmethod\n def create_job_tick(self, job_tick_data):\n """Add a job tick to storage.\n\n Args:\n repository_name (str): The repository the schedule belongs to\n job_tick_data (JobTickData): The job tick to add\n """\n\n @abc.abstractmethod\n def update_job_tick(self, tick):\n """Update a job tick already in storage.\n\n Args:\n tick (ScheduleTick): The job tick to update\n """\n\n @abc.abstractmethod\n def purge_job_ticks(self, job_origin_id, tick_status, before):\n """Wipe ticks for a job for a certain status and timestamp.\n\n Args:\n job_origin_id (str): The id of the ExternalJob target to delete\n tick_status (JobTickStatus): The tick status to wipe\n before (datetime): All ticks before this datetime will get purged\n """\n\n @abc.abstractmethod\n def get_job_tick_stats(self, job_origin_id):\n """Get tick stats for a given job.\n\n Args:\n job_origin_id (str): The id of the ExternalJob target\n """\n\n @abc.abstractmethod\n def upgrade(self):\n """Perform any needed migrations\n """\n\n def optimize_for_dagit(self, statement_timeout):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n def validate_stored_schedules(self, scheduler_class):\n # Check for any running job states that reference a different scheduler,\n # prompt the user to wipe if they don't match\n stored_schedules = self.all_stored_job_state(job_type=JobType.SCHEDULE)\n\n for schedule in stored_schedules:\n if schedule.status != JobStatus.RUNNING:\n continue\n\n stored_scheduler_class = schedule.job_specific_data.scheduler\n\n if stored_scheduler_class and stored_scheduler_class != scheduler_class:\n instance_scheduler_class = scheduler_class if scheduler_class else "None"\n\n raise DagsterScheduleWipeRequired(\n f"Found a running schedule using a scheduler ({stored_scheduler_class}) "\n + 
f"that differs from the scheduler on the instance ({instance_scheduler_class}). "\n + "The most likely reason for this error is that you changed the scheduler on "\n + "your instance while it was still running schedules. "\n + "To fix this, change the scheduler on your instance back to the previous "\n + "scheduler configuration and run the command 'dagster schedule wipe'. It "\n + f"will then be safe to change back to {instance_scheduler_class}."\n )\n
\nfrom abc import abstractmethod\nfrom datetime import datetime\n\nimport sqlalchemy as db\nfrom dagster import check\nfrom dagster.core.definitions.job import JobType\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.scheduler.job import (\n JobState,\n JobTick,\n JobTickData,\n JobTickStatsSnapshot,\n JobTickStatus,\n)\nfrom dagster.serdes import deserialize_json_to_dagster_namedtuple, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .schema import JobTable, JobTickTable\n\n\n[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage\n """\n\n @abstractmethod\n def connect(self):\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(self, rows):\n return list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))\n\n def all_stored_job_state(self, repository_origin_id=None, job_type=None):\n check.opt_inst_param(job_type, "job_type", JobType)\n base_query = db.select([JobTable.c.job_body, JobTable.c.job_origin_id]).select_from(\n JobTable\n )\n\n if repository_origin_id:\n query = base_query.where(JobTable.c.repository_origin_id == repository_origin_id)\n else:\n query = base_query\n\n if job_type:\n query = query.where(JobTable.c.job_type == job_type.value)\n\n rows = self.execute(query)\n return self._deserialize_rows(rows)\n\n def get_job_state(self, job_origin_id):\n check.str_param(job_origin_id, "job_origin_id")\n\n query = (\n db.select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == job_origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None\n\n def add_job_state(self, job):\n check.inst_param(job, "job", JobState)\n with self.connect() as conn:\n try:\n conn.execute(\n JobTable.insert().values( # pylint: disable=no-value-for-parameter\n job_origin_id=job.job_origin_id,\n repository_origin_id=job.repository_origin_id,\n status=job.status.value,\n job_type=job.job_type.value,\n job_body=serialize_dagster_namedtuple(job),\n )\n )\n except db.exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"JobState {job.job_origin_id} is already present in storage"\n ) from exc\n\n return job\n\n def update_job_state(self, job):\n check.inst_param(job, "job", JobState)\n if not self.get_job_state(job.job_origin_id):\n raise DagsterInvariantViolationError(\n "JobState {id} is not present in storage".format(id=job.job_origin_id)\n )\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update() # pylint: disable=no-value-for-parameter\n .where(JobTable.c.job_origin_id == job.job_origin_id)\n .values(status=job.status.value, job_body=serialize_dagster_namedtuple(job),)\n )\n\n def delete_job_state(self, job_origin_id):\n check.str_param(job_origin_id, "job_origin_id")\n\n if not self.get_job_state(job_origin_id):\n raise DagsterInvariantViolationError(\n "JobState {id} is not present in storage".format(id=job_origin_id)\n )\n\n with self.connect() as conn:\n conn.execute(\n JobTable.delete().where( # pylint: disable=no-value-for-parameter\n JobTable.c.job_origin_id == job_origin_id\n )\n )\n\n def get_latest_job_tick(self, job_origin_id):\n check.str_param(job_origin_id, "job_origin_id")\n\n query = (\n 
db.select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .where(JobTickTable.c.job_origin_id == job_origin_id)\n .order_by(JobTickTable.c.timestamp.desc())\n .limit(1)\n )\n\n rows = self.execute(query)\n\n if len(rows) == 0:\n return None\n\n return JobTick(rows[0][0], deserialize_json_to_dagster_namedtuple(rows[0][1]))\n\n def get_job_ticks(self, job_origin_id):\n check.str_param(job_origin_id, "job_origin_id")\n\n query = (\n db.select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .where(JobTickTable.c.job_origin_id == job_origin_id)\n .order_by(JobTickTable.c.id.desc())\n )\n\n rows = self.execute(query)\n return list(\n map(lambda r: JobTick(r[0], deserialize_json_to_dagster_namedtuple(r[1])), rows)\n )\n\n def create_job_tick(self, job_tick_data):\n check.inst_param(job_tick_data, "job_tick_data", JobTickData)\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values( # pylint: disable=no-value-for-parameter\n job_origin_id=job_tick_data.job_origin_id,\n status=job_tick_data.status.value,\n type=job_tick_data.job_type.value,\n timestamp=utc_datetime_from_timestamp(job_tick_data.timestamp),\n tick_body=serialize_dagster_namedtuple(job_tick_data),\n )\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return JobTick(tick_id, job_tick_data)\n except db.exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert JobTick for job {job_tick_data.job_name} in storage"\n ) from exc\n\n def update_job_tick(self, tick):\n check.inst_param(tick, "tick", JobTick)\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update() # pylint: disable=no-value-for-parameter\n .where(JobTickTable.c.id == tick.tick_id)\n .values(\n status=tick.status.value,\n type=tick.job_type.value,\n timestamp=utc_datetime_from_timestamp(tick.timestamp),\n tick_body=serialize_dagster_namedtuple(tick.job_tick_data),\n )\n )\n\n return tick\n\n def purge_job_ticks(self, job_origin_id, tick_status, before):\n check.str_param(job_origin_id, "job_origin_id")\n check.inst_param(tick_status, "tick_status", JobTickStatus)\n check.inst_param(before, "before", datetime)\n\n utc_before = utc_datetime_from_timestamp(before.timestamp())\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.delete() # pylint: disable=no-value-for-parameter\n .where(JobTickTable.c.status == tick_status.value)\n .where(JobTickTable.c.timestamp < utc_before)\n .where(JobTickTable.c.job_origin_id == job_origin_id)\n )\n\n def get_job_tick_stats(self, job_origin_id):\n check.str_param(job_origin_id, "job_origin_id")\n\n query = (\n db.select([JobTickTable.c.status, db.func.count()])\n .select_from(JobTickTable)\n .where(JobTickTable.c.job_origin_id == job_origin_id)\n .group_by(JobTickTable.c.status)\n )\n\n rows = self.execute(query)\n\n counts = {}\n for status, count in rows:\n counts[status] = count\n\n return JobTickStatsSnapshot(\n ticks_started=counts.get(JobTickStatus.STARTED.value, 0),\n ticks_succeeded=counts.get(JobTickStatus.SUCCESS.value, 0),\n ticks_skipped=counts.get(JobTickStatus.SKIPPED.value, 0),\n ticks_failed=counts.get(JobTickStatus.FAILURE.value, 0),\n )\n\n def wipe(self):\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(JobTickTable.delete()) # pylint: disable=no-value-for-parameter\n
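A short sketch of reading tick state back out of a SQL-backed schedule storage; the import path mirrors the run-storage package layout and is an assumption, and the base directory and job origin id are illustrative:

.. code-block:: python

    from dagster.core.storage.schedules import SqliteScheduleStorage

    storage = SqliteScheduleStorage.from_local("/tmp/dagster-home/schedules")

    # Aggregate outcomes per status, as assembled by get_job_tick_stats above.
    stats = storage.get_job_tick_stats("1234abcd")
    print(stats.ticks_started, stats.ticks_succeeded, stats.ticks_skipped, stats.ticks_failed)

    latest = storage.get_latest_job_tick("1234abcd")
    if latest:
        print(latest.tick_id, latest.status)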
\nfrom contextlib import contextmanager\n\nfrom dagster import StringSource, check\nfrom dagster.core.storage.sql import (\n check_alembic_revision,\n create_engine,\n get_alembic_config,\n handle_schema_errors,\n run_alembic_upgrade,\n stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\nfrom sqlalchemy.pool import NullPool\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\n\n[docs]class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Local SQLite backed schedule storage\n """\n\n def __init__(self, conn_string, inst_data=None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)\n\n @staticmethod\n def from_local(base_dir, inst_data=None):\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "schedules")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n ScheduleStorageSqlMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n\n return SqliteScheduleStorage(conn_string, inst_data)\n\n @contextmanager\n def connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n with handle_schema_errors(\n conn,\n get_alembic_config(__file__),\n msg="Sqlite schedule storage requires migration",\n ):\n yield conn\n finally:\n conn.close()\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n
\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.core.definitions.intermediate_storage import IntermediateStorageDefinition\nfrom dagster.core.definitions.intermediate_storage import (\n intermediate_storage as intermediate_storage_fn,\n)\nfrom dagster.core.storage.io_manager import io_manager\nfrom dagster.core.storage.type_storage import (\n TypeStoragePluginRegistry,\n construct_type_storage_plugin_registry,\n)\nfrom dagster.core.system_config.objects import EnvironmentConfig\n\nfrom .init import InitIntermediateStorageContext\nfrom .intermediate_storage import IntermediateStorageAdapter, ObjectStoreIntermediateStorage\nfrom .object_store import FilesystemObjectStore, InMemoryObjectStore, ObjectStore\n\n\ndef build_intermediate_storage_from_object_store(\n object_store, init_context, root_for_run_id=lambda _: "",\n):\n """constructs an IntermediateStorage object from an object store and an init_context\n Call from within an intermediate_storage_definition\n Args:\n object_store(ObjectStore): The object store on which to base the intermediate store.\n init_context(InitIntermediateStorageContext): the context from which to create the intermediates manager\n root_for_run_id_creator(Callable[[str], str]):\n a function that converts from your run ID to the root of your object storage paths\n """\n object_store = check.inst_param(object_store, "object_store", ObjectStore)\n root_for_run_id = check.callable_param(root_for_run_id, "root_for_run_id")\n init_context = check.inst_param(init_context, "init_context", InitIntermediateStorageContext)\n\n return ObjectStoreIntermediateStorage(\n object_store=object_store,\n run_id=init_context.pipeline_run.run_id,\n root_for_run_id=root_for_run_id,\n type_storage_plugin_registry=init_context.type_storage_plugin_registry\n if init_context.type_storage_plugin_registry\n else TypeStoragePluginRegistry(types_to_register=[]),\n )\n\n\n[docs]@intermediate_storage_fn(name="in_memory", is_persistent=False, required_resource_keys=set())\ndef mem_intermediate_storage(init_context):\n """The default in-memory intermediate storage.\n\n In-memory intermediate storage is the default on any pipeline run that does\n not configure any custom intermediate storage.\n\n Keep in mind when using this storage that intermediates will not be persisted after the pipeline\n run ends. Use a persistent intermediate storage like :py:func:`fs_intermediate_storage` to\n persist intermediates and take advantage of advanced features like pipeline re-execution.\n """\n object_store = InMemoryObjectStore()\n return build_intermediate_storage_from_object_store(\n object_store=object_store, init_context=init_context\n )\n\n\n[docs]@intermediate_storage_fn(\n name="filesystem",\n is_persistent=True,\n config_schema={"base_dir": Field(str, is_required=False)},\n required_resource_keys=set(),\n)\ndef fs_intermediate_storage(init_context):\n """The default filesystem intermediate storage.\n\n Filesystem system storage is available by default on any :py:class:`ModeDefinition` that does\n not provide custom system storages. To select it, include a fragment such as the following in\n config:\n\n .. 
code-block:: yaml\n\n intermediate_storage:\n filesystem:\n base_dir: '/path/to/dir/'\n\n You may omit the ``base_dir`` config value, in which case the filesystem storage will use\n the :py:class:`DagsterInstance`-provided default.\n """\n object_store = FilesystemObjectStore()\n override_dir = init_context.intermediate_storage_config.get("base_dir")\n if override_dir:\n root_for_run_id = lambda _: override_dir\n else:\n root_for_run_id = init_context.instance.intermediates_directory\n\n return build_intermediate_storage_from_object_store(\n object_store, init_context, root_for_run_id=root_for_run_id\n )\n\n\ndefault_intermediate_storage_defs = [mem_intermediate_storage, fs_intermediate_storage]\n"""The default 'in_memory' and 'filesystem' intermediate storage definitions.\n\nFramework authors seeking to add their own intermediate storage definitions can extend this list as follows:\n\n.. code-block:: python\n\n custom_storage_mode = ModeDefinition(\n ...,\n intermediate_storage_defs=default_intermediate_storage_defs + [custom_intermediate_storage_def]\n )\n"""\n\n\n[docs]def io_manager_from_intermediate_storage(intermediate_storage_def):\n """Define an :py:class:`IOManagerDefinition` from an existing :py:class:`IntermediateStorageDefinition`.\n\n This method is used to adapt an existing user-defined intermediate storage to a IO manager\n resource, for example:\n\n .. code-block:: python\n\n my_io_manager_def = io_manager_from_intermediate_storage(my_intermediate_storage_def)\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": my_io_manager_def})])\n def my_pipeline():\n ...\n\n\n Args:\n intermediate_storage_def (IntermediateStorageDefinition): The intermediate storage definition\n to be converted to an IO manager definition.\n\n Returns:\n IOManagerDefinition\n """\n\n check.inst_param(\n intermediate_storage_def, "intermediate_storage_def", IntermediateStorageDefinition\n )\n\n @io_manager\n def _io_manager(init_context):\n pipeline_run = init_context.pipeline_run\n instance = init_context.instance_for_backwards_compat\n pipeline_def = init_context.pipeline_def_for_backwards_compat\n # depend on InitResourceContext.instance_for_backwards_compat and pipeline_def_for_backwards_compat\n environment_config = EnvironmentConfig.build(\n pipeline_def, pipeline_run.run_config, mode=pipeline_run.mode\n )\n mode_def = pipeline_def.get_mode_definition(pipeline_run.mode)\n\n intermediate_storage_context = InitIntermediateStorageContext(\n pipeline_def=pipeline_def,\n mode_def=mode_def,\n intermediate_storage_def=intermediate_storage_def,\n pipeline_run=pipeline_run,\n instance=instance,\n environment_config=environment_config,\n type_storage_plugin_registry=construct_type_storage_plugin_registry(\n pipeline_def, intermediate_storage_def\n ),\n resources=init_context.resources,\n intermediate_storage_config=environment_config.intermediate_storage.intermediate_storage_config,\n )\n\n intermediate_storage = intermediate_storage_def.intermediate_storage_creation_fn(\n intermediate_storage_context\n )\n\n return IntermediateStorageAdapter(intermediate_storage)\n\n return _io_manager\n
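Putting the pieces above together, ``io_manager_from_intermediate_storage`` lets an existing intermediate storage definition back the newer IO-manager resource slot. A sketch using the built-in ``fs_intermediate_storage``; the solid and pipeline names are made up, and the top-level ``dagster`` re-exports are assumed:

.. code-block:: python

    from dagster import (
        ModeDefinition,
        execute_pipeline,
        fs_intermediate_storage,
        io_manager_from_intermediate_storage,
        pipeline,
        solid,
    )

    # Adapt the built-in filesystem intermediate storage into an IO manager.
    fs_io_manager_def = io_manager_from_intermediate_storage(fs_intermediate_storage)

    @solid
    def emit_one(_):
        return 1

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager_def})])
    def my_pipeline():
        emit_one()

    # Outputs are persisted by the adapted intermediate storage.
    execute_pipeline(my_pipeline)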
\nimport hashlib\n\nfrom dagster import check\nfrom dagster.config.config_type import ConfigType\nfrom dagster.core.decorator_utils import (\n split_function_parameters,\n validate_decorated_fn_positionals,\n)\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.utils import ensure_gen\nfrom dagster.utils.backcompat import experimental_arg_warning\n\n\nclass DagsterTypeLoader:\n @property\n def schema_type(self):\n check.not_implemented(\n "Must override schema_type in {klass}".format(klass=type(self).__name__)\n )\n\n @property\n def loader_version(self):\n return None\n\n def compute_loaded_input_version(self, _config_value):\n return None\n\n def construct_from_config_value(self, _context, config_value):\n """\n How to create a runtime value from config data.\n """\n return config_value\n\n def required_resource_keys(self):\n return frozenset()\n\n\nclass DagsterTypeMaterializer:\n @property\n def schema_type(self):\n check.not_implemented(\n "Must override schema_type in {klass}".format(klass=type(self).__name__)\n )\n\n def materialize_runtime_values(self, _context, _config_value, _runtime_value):\n """\n How to materialize a runtime value given configuration.\n """\n check.not_implemented("Must implement")\n\n def required_resource_keys(self):\n return frozenset()\n\n\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n if self._loader_version:\n experimental_arg_warning("loader_version", "DagsterTypeLoaderFromDecorator.__init__")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n if self._external_version_fn:\n experimental_arg_warning(\n "external_version_fn", "DagsterTypeLoaderFromDecorator.__init__"\n )\n\n @property\n def schema_type(self):\n return self._config_type\n\n @property\n def loader_version(self):\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value):\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (Union[Any, Dict]): Config value to be ingested by the external version\n loading function.\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(self, context, config_value):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version=None, external_version_fn=None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\n[docs]def dagster_type_loader(\n config_schema, 
required_resource_keys=None, loader_version=None, external_version_fn=None,\n):\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster.config.field import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func):\n fn_positionals, _ = split_function_parameters(func, EXPECTED_POSITIONALS)\n missing_positional = validate_decorated_fn_positionals(fn_positionals, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "@dagster_type_loader '{solid_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. Solid functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n solid_name=func.__name__, missing_param=missing_positional\n )\n )\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper\n\n\nclass DagsterTypeMaterializerForDecorator(DagsterTypeMaterializer):\n def __init__(self, config_type, func, required_resource_keys):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n\n @property\n def schema_type(self):\n return self._config_type\n\n def materialize_runtime_values(self, context, config_value, runtime_value):\n return ensure_gen(self._func(context, config_value, runtime_value))\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_output_materializer_for_decorator(config_type, func, required_resource_keys):\n return DagsterTypeMaterializerForDecorator(config_type, func, required_resource_keys)\n\n\n[docs]def dagster_type_materializer(config_schema, required_resource_keys=None):\n """Create an output materialization hydration config that configurably materializes a runtime\n value.\n\n The decorated function should take the execution context, the parsed config value, and the\n runtime value and the parsed config data, should materialize the runtime value, and should\n return an appropriate :py:class:`AssetMaterialization`.\n\n Args:\n config_schema (Any): The type of the config data expected by the decorated function.\n\n Examples:\n\n .. 
code-block:: python\n\n # Takes a list of dicts such as might be read in using csv.DictReader, as well as a config\n value, and writes\n @dagster_type_materializer(str)\n def materialize_df(_context, path, value):\n with open(path, 'w') as fd:\n writer = csv.DictWriter(fd, fieldnames=value[0].keys())\n writer.writeheader()\n writer.writerows(rowdicts=value)\n\n return AssetMaterialization.file(path)\n\n """\n from dagster.config.field import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n return lambda func: _create_output_materializer_for_decorator(\n config_type, func, required_resource_keys\n )\n
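The two decorators above are commonly defined as a pair: a loader that builds a runtime value from config, and a materializer that writes it back out and reports an ``AssetMaterialization``. An illustrative sketch; the config keys and file paths are placeholders:

.. code-block:: python

    import csv

    from dagster import AssetMaterialization, dagster_type_loader, dagster_type_materializer

    @dagster_type_loader({"path": str})
    def load_rows(_context, config_value):
        # Build the runtime value (a list of dicts) from the configured CSV path.
        with open(config_value["path"]) as fd:
            return list(csv.DictReader(fd))

    @dagster_type_materializer({"path": str})
    def materialize_rows(_context, config_value, rows):
        # Write the runtime value back out and report where it landed.
        with open(config_value["path"], "w") as fd:
            writer = csv.DictWriter(fd, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)
        return AssetMaterialization.file(config_value["path"])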
\nimport typing\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config.config_type import Array\nfrom dagster.config.config_type import Noneable as ConfigNoneable\nfrom dagster.core.definitions.events import TypeCheck\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.core.storage.type_storage import TypeStoragePlugin\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader, DagsterTypeMaterializer\nfrom .marshal import PickleSerializationStrategy, SerializationStrategy\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n ANY = "ANY"\n SCALAR = "SCALAR"\n LIST = "LIST"\n NOTHING = "NOTHING"\n NULLABLE = "NULLABLE"\n REGULAR = "REGULAR"\n\n\n[docs]class DagsterType:\n """Define a type in dagster. These can be used in the inputs and outputs of solids.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the solid. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_materializer <dagster.dagster_type_materializer>`\n decorator to construct these arguments.\n serialization_strategy (Optional[SerializationStrategy]): An instance of a class that\n inherits from :py:class:`~dagster.SerializationStrategy`. The default strategy for serializing\n this value when automatically persisting it between execution steps. 
You should set\n this value if the ordinary serialization machinery (e.g., pickle) will not be adequate\n for this type.\n auto_plugins (Optional[List[Type[TypeStoragePlugin]]]): If types must be serialized differently\n depending on the storage being used for intermediates, they should specify this\n argument. In these cases the serialization_strategy argument is not sufficient because\n serialization requires specialized API calls, e.g. to call an S3 API directly instead\n of using a generic file object. See ``dagster_pyspark.DataFrame`` for an example.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n """\n\n def __init__(\n self,\n type_check_fn,\n key=None,\n name=None,\n is_builtin=False,\n description=None,\n loader=None,\n materializer=None,\n serialization_strategy=None,\n auto_plugins=None,\n required_resource_keys=None,\n kind=DagsterTypeKind.REGULAR,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n\n if name is None:\n check.param_invariant(\n bool(key), "key", "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n check.param_invariant(\n bool(name), "name", "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self.description = check.opt_str_param(description, "description")\n self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n self.materializer = check.opt_inst_param(\n materializer, "materializer", DagsterTypeMaterializer\n )\n\n self.serialization_strategy = check.opt_inst_param(\n serialization_strategy,\n "serialization_strategy",\n SerializationStrategy,\n PickleSerializationStrategy(),\n )\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys",\n )\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type)\n\n check.param_invariant(\n all(\n issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins\n ),\n "auto_plugins",\n )\n\n self.auto_plugins = auto_plugins\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n "All types must have a valid display name, got None for key {}".format(key),\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n def type_check(self, context, value):\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n (\n "You have returned {retval} of type {retval_type} from the type "\n 'check function of type "{type_key}". 
Return value must be instance '\n "of TypeCheck or a bool."\n ).format(retval=repr(retval), retval_type=type(retval), type_key=self.key)\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n @staticmethod\n def from_builtin_enum(builtin_enum):\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def display_name(self):\n """Asserted in __init__ to be not None, overridden in many subclasses"""\n return self._name\n\n @property\n def unique_name(self):\n """The unique name of this type. Can be None if the type is not unique, such as container types"""\n check.invariant(\n self._name is not None,\n "unique_name requested but is None for type {}".format(self.display_name),\n )\n return self._name\n\n @property\n def has_unique_name(self):\n return self._name is not None\n\n @property\n def inner_types(self):\n return []\n\n @property\n def loader_schema_key(self):\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def materializer_schema_key(self):\n return self.materializer.schema_type.key if self.materializer else None\n\n @property\n def type_param_keys(self):\n return []\n\n @property\n def is_nothing(self):\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self):\n return False\n\n def get_inner_type_for_fan_in(self):\n check.invariant(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before calling getter.".format(\n name=self.display_name\n )\n )\n\n\ndef _validate_type_check_fn(fn, name):\n from dagster.seven import get_args\n\n args = get_args(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n 'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'.format(name=name,)\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n 'type_check_fn argument on type "{name}" must take 2 arguments, '\n "received {count}.".format(name=name, count=len(args))\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name, type_check_fn, *args, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n *args,\n **kwargs,\n )\n\n def type_check_fn(self, _context, value):\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value):\n raise NotImplementedError()\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n materializer=BuiltinSchemas.INT_OUTPUT,\n type_check_fn=self.type_check_fn,\n )\n\n def type_check_scalar_value(self, value):\n return _fail_if_not_of_type(value, int, "int")\n\n\ndef _typemismatch_error_str(value, expected_type_desc):\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(value, value_type, value_type_desc):\n\n if not isinstance(value, 
value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n materializer=BuiltinSchemas.STRING_OUTPUT,\n type_check_fn=self.type_check_fn,\n )\n\n def type_check_scalar_value(self, value):\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n materializer=BuiltinSchemas.FLOAT_OUTPUT,\n type_check_fn=self.type_check_fn,\n )\n\n def type_check_scalar_value(self, value):\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n materializer=BuiltinSchemas.BOOL_OUTPUT,\n type_check_fn=self.type_check_fn,\n )\n\n def type_check_scalar_value(self, value):\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key,\n name,\n loader=None,\n materializer=None,\n serialization_strategy=None,\n is_builtin=False,\n description=None,\n auto_plugins=None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n materializer=materializer,\n serialization_strategy=serialization_strategy,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n auto_plugins=auto_plugins,\n )\n\n def type_check_method(self, _context, _value):\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def __init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n materializer=BuiltinSchemas.ANY_OUTPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name,\n loader=None,\n materializer=None,\n serialization_strategy=None,\n description=None,\n auto_plugins=None,\n):\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n materializer=materializer,\n serialization_strategy=serialization_strategy,\n auto_plugins=auto_plugins,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n materializer=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n )\n\n def type_check_method(self, _context, value):\n if value is not None:\n return TypeCheck(\n success=False,\n description="Value must be None, got a {value_type}".format(value_type=type(value)),\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self\n\n\n[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. 
code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_mate <dagster.dagster_type_mate>`\n decorator to construct these arguments.\n serialization_strategy (Optional[SerializationStrategy]): An instance of a class that\n inherits from :py:class:`SerializationStrategy`. The default strategy for serializing\n this value when automatically persisting it between execution steps. You should set\n this value if the ordinary serialization machinery (e.g., pickle) will not be adequate\n for this type.\n auto_plugins (Optional[List[Type[TypeStoragePlugin]]]): If types must be serialized differently\n depending on the storage being used for intermediates, they should specify this\n argument. In these cases the serialization_strategy argument is not sufficient because\n serialization requires specialized API calls, e.g. to call an S3 API directly instead\n of using a generic file object. 
See ``dagster_pyspark.DataFrame`` for an example.\n\n """\n\n def __init__(self, python_type, key=None, name=None, **kwargs):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_type=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n else:\n self.python_type = check.type_param(python_type, "python_type")\n self.type_str = python_type.__name__\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key, name=name, type_check_fn=self.type_check_method, **kwargs\n )\n\n def type_check_method(self, _context, value):\n if not isinstance(value, self.python_type):\n return TypeCheck(\n success=False,\n description=(\n "Value of type {value_type} failed type check for Dagster type {dagster_type}, "\n "expected value to be of Python type {expected_type}."\n ).format(\n value_type=type(value), dagster_type=self._name, expected_type=self.type_str,\n ),\n )\n\n return TypeCheck(success=True)\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n if config_value is None:\n return None\n return self._inner_dagster_type.loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." 
+ inner_type.key\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n )\n\n @property\n def display_name(self):\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type):\n key = "List." + inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key=None, name=None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n materializer=BuiltinSchemas.STRING_OUTPUT,\n **kwargs,\n )\n\n 
def type_check_method(self, _context, value):\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: typing.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n[docs]def make_python_type_usable_as_dagster_type(python_type, dagster_type):\n """\n Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n if (\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type, dagster_type)\n is not dagster_type\n ):\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n raise DagsterInvalidDefinitionError(\n (\n "A Dagster type has already been registered for the Python type "\n "{python_type}. make_python_type_usable_as_dagster_type can only "\n "be called once on a python type as it is registering a 1:1 mapping "\n "between that python type and a dagster type."\n ).format(python_type=python_type)\n )\n\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be DagsterType, a python scalar, or a python type "\n "that has been marked usable as a dagster type via @usable_dagster_type or "\n "make_python_type_usable_as_dagster_type: got {dagster_type}{additional_msg}"\n)\n\n\ndef resolve_dagster_type(dagster_type):\n # circular dep\n from .python_dict import PythonDict, Dict\n from .python_set import PythonSet, DagsterSetApi\n from .python_tuple import PythonTuple, DagsterTupleApi\n from .transform_typing import transform_typing_type\n from dagster.config.config_type import ConfigType\n from dagster.primitive_mapping import (\n remap_python_builtin_for_runtime,\n is_supported_runtime_python_builtin,\n )\n from dagster.utils.typing_api import is_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType)),\n "Do not pass runtime type classes. Got {}".format(dagster_type),\n )\n\n # First check to see if it part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n )\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[dagster_type]\n\n if dagster_type is Dict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n if not isinstance(dagster_type, type):\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n raise DagsterInvalidDefinitionError(\n "{dagster_type} is not a valid dagster type.".format(dagster_type=dagster_type)\n )\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(solid_defs):\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n for solid_def in solid_defs:\n for dagster_type in solid_def.all_dagster_types():\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two dagster types with the same name "{type_name}". '\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return OptionalType(inner_type)\n\n\nOptional = DagsterOptionalApi()\n
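The module above supports two common construction paths: a ``DagsterType`` with an explicit ``type_check_fn``, and a ``PythonObjectDagsterType`` whose check is an ``isinstance`` test. A brief sketch with made-up names:

.. code-block:: python

    from dagster import DagsterType, PythonObjectDagsterType, TypeCheck

    def check_positive_int(_context, value):
        # Explicit type check: return a TypeCheck (or a bare bool).
        if not isinstance(value, int):
            return TypeCheck(success=False, description="not an int")
        return TypeCheck(success=value > 0, description="must be positive")

    PositiveInt = DagsterType(name="PositiveInt", type_check_fn=check_positive_int)

    class Order:
        def __init__(self, sku):
            self.sku = sku

    # isinstance-based check against the wrapped Python class.
    OrderType = PythonObjectDagsterType(python_type=Order, name="Order")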
\nfrom dagster import check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\n\n[docs]def usable_as_dagster_type(\n name=None,\n description=None,\n loader=None,\n materializer=None,\n serialization_strategy=None,\n auto_plugins=None,\n):\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_materializer <dagster.dagster_type_materializer>`\n decorator to construct these arguments.\n serialization_strategy (Optional[SerializationStrategy]): An instance of a class that\n inherits from :py:class:`SerializationStrategy`. The default strategy for serializing\n this value when automatically persisting it between execution steps. You should set\n this value if the ordinary serialization machinery (e.g., pickle) will not be adequate\n for this type.\n auto_plugins (Optional[List[TypeStoragePlugin]]): If types must be serialized differently\n depending on the storage being used for intermediates, they should specify this\n argument. In these cases the serialization_strategy argument is not sufficient because\n serialization requires specialized API calls, e.g. to call an S3 API directly instead\n of using a generic file object. See ``dagster_pyspark.DataFrame`` for an example.\n\n Examples:\n\n .. 
code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n\n def _with_args(bare_cls):\n check.type_param(bare_cls, "bare_cls")\n new_name = name if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n materializer=materializer,\n serialization_strategy=serialization_strategy,\n auto_plugins=auto_plugins,\n ),\n )\n return bare_cls\n\n # check for no args, no parens case\n if callable(name):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n return _with_args\n
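Once decorated as above, the plain Python class can be used directly wherever a Dagster type is expected, because ``resolve_dagster_type`` consults the registered mapping. A minimal sketch with invented names:

.. code-block:: python

    from dagster import InputDefinition, OutputDefinition, solid, usable_as_dagster_type

    @usable_as_dagster_type
    class Invoice:
        def __init__(self, total):
            self.total = total

    @solid(
        input_defs=[InputDefinition("invoice", Invoice)],  # the class itself is the type
        output_defs=[OutputDefinition(float)],
    )
    def invoice_total(_, invoice):
        return float(invoice.total)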
\nfrom dagster import check\nfrom dagster.config.field_utils import Permissive\nfrom dagster.core.types.dagster_type import String\n\nfrom .config_schema import DagsterTypeLoader, dagster_type_loader\nfrom .dagster_type import DagsterType, PythonObjectDagsterType, resolve_dagster_type\n\n\n@dagster_type_loader(Permissive())\ndef _dict_input(_context, value):\n return value\n\n\nPythonDict = PythonObjectDagsterType(\n dict,\n "PythonDict",\n loader=_dict_input,\n description="""Represents a python dictionary to pass between solids""",\n)\n\n\nclass TypedDictLoader(DagsterTypeLoader):\n def __init__(self, value_dagster_type):\n self._value_dagster_type = check.inst_param(\n value_dagster_type, "value_dagster_type", DagsterType\n )\n\n @property\n def schema_type(self):\n return Permissive()\n\n def construct_from_config_value(self, context, config_value):\n config_value = check.dict_param(config_value, "config_value")\n runtime_value = dict()\n for key, val in config_value.items():\n runtime_value[key] = self._value_dagster_type.loader.construct_from_config_value(\n context, val\n )\n return runtime_value\n\n\nclass _TypedPythonDict(DagsterType):\n def __init__(self, key_type, value_type):\n self.key_type = check.inst_param(key_type, "key_type", DagsterType)\n self.value_type = check.inst_param(value_type, "value_type", DagsterType)\n can_get_from_config = self.value_type.loader is not None and isinstance(\n self.key_type, type(String)\n ) # True if value_type has a DagsterTypeLoader, meaning we can load the input from config,\n # otherwise False.\n super(_TypedPythonDict, self).__init__(\n key="TypedPythonDict.{}.{}".format(key_type.key, value_type.key),\n name=None,\n loader=(TypedDictLoader(self.value_type) if can_get_from_config else None),\n type_check_fn=self.type_check_method,\n )\n\n def type_check_method(self, context, value):\n from dagster.core.definitions.events import TypeCheck\n\n if not isinstance(value, dict):\n return TypeCheck(\n success=False,\n description="Value should be a dict, got a {value_type}".format(\n value_type=type(value)\n ),\n )\n\n for key, value in value.items():\n key_check = self.key_type.type_check(context, key)\n if not key_check.success:\n return key_check\n value_check = self.value_type.type_check(context, value)\n if not value_check.success:\n return value_check\n\n return TypeCheck(success=True)\n\n @property\n def display_name(self):\n return "Dict[{key},{value}]".format(\n key=self.key_type.display_name, value=self.value_type.display_name\n )\n\n @property\n def inner_types(self):\n return [self.key_type, self.value_type]\n\n @property\n def type_param_keys(self):\n return [self.key_type.key, self.value_type.key]\n\n\ndef create_typed_runtime_dict(key_dagster_type, value_dagster_type):\n key_type = resolve_dagster_type(key_dagster_type)\n value_type = resolve_dagster_type(value_dagster_type)\n\n return _TypedPythonDict(key_type, value_type)\n\n\nclass DagsterDictApi:\n def __getitem__(self, *args):\n check.param_invariant(len(args[0]) == 2, "args", "Must be two parameters")\n return create_typed_runtime_dict(args[0][0], args[0][1])\n\n\nDict = DagsterDictApi()\n
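The ``Dict`` API defined above resolves both key and value types and type-checks every entry at runtime. A small sketch of using it as a solid input type:

.. code-block:: python

    from dagster import Dict, InputDefinition, Int, String, solid

    @solid(input_defs=[InputDefinition("counts", Dict[String, Int])])
    def total_count(_, counts):
        # counts has been checked to be a dict of str -> int.
        return sum(counts.values())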
\nfrom dagster import check\nfrom dagster.config.config_type import Array\nfrom dagster.core.types.dagster_type import DagsterTypeKind\n\nfrom .config_schema import DagsterTypeLoader\nfrom .dagster_type import DagsterType, PythonObjectDagsterType, resolve_dagster_type\n\nPythonSet = PythonObjectDagsterType(\n set, "PythonSet", description="""Represents a python dictionary to pass between solids"""\n)\n\n\nclass TypedSetLoader(DagsterTypeLoader):\n def __init__(self, item_dagster_type):\n self._item_dagster_type = check.inst_param(\n item_dagster_type, "item_dagster_type", DagsterType\n )\n\n @property\n def schema_type(self):\n return Array(self._item_dagster_type.loader.schema_type)\n\n def construct_from_config_value(self, context, config_value):\n runtime_value = set()\n for item in config_value:\n runtime_value.add(\n self._item_dagster_type.loader.construct_from_config_value(context, item)\n )\n return runtime_value\n\n\nclass _TypedPythonSet(DagsterType):\n def __init__(self, item_dagster_type):\n self.item_type = item_dagster_type\n super(_TypedPythonSet, self).__init__(\n key="TypedPythonSet.{}".format(item_dagster_type.key),\n name=None,\n loader=(TypedSetLoader(item_dagster_type) if item_dagster_type.loader else None),\n type_check_fn=self.type_check_method,\n )\n\n def type_check_method(self, context, value):\n from dagster.core.definitions.events import TypeCheck\n\n if not isinstance(value, set):\n return TypeCheck(\n success=False,\n description="Value should be a set, got a{value_type}".format(\n value_type=type(value)\n ),\n )\n\n for item in value:\n item_check = self.item_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def display_name(self):\n return "Set[{}]".format(self.item_type.display_name)\n\n @property\n def inner_types(self):\n return [self.item_type]\n\n @property\n def type_param_keys(self):\n return [self.item_type.key]\n\n\ndef create_typed_runtime_set(item_dagster_type):\n item_dagster_type = resolve_dagster_type(item_dagster_type)\n\n check.invariant(\n not item_dagster_type.kind == DagsterTypeKind.NOTHING,\n "Cannot create the runtime type Set[Nothing]. Use List type for fan-in.",\n )\n\n return _TypedPythonSet(item_dagster_type)\n\n\nclass DagsterSetApi:\n def __getitem__(self, inner_type):\n return create_typed_runtime_set(inner_type)\n\n\nSet = DagsterSetApi()\n
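``Set`` follows the same pattern: the value must be a ``set`` and each element must pass the inner type's check. A correspondingly small sketch:

.. code-block:: python

    from dagster import InputDefinition, Set, solid

    @solid(input_defs=[InputDefinition("tags", Set[str])])
    def count_tags(_, tags):
        # tags has been checked to be a set of strings.
        return len(tags)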
\nfrom dagster import check\nfrom dagster.config.config_type import Array, ConfigAnyInstance\nfrom dagster.core.types.dagster_type import DagsterTypeKind\n\nfrom .config_schema import DagsterTypeLoader\nfrom .dagster_type import DagsterType, PythonObjectDagsterType, resolve_dagster_type\n\nPythonTuple = PythonObjectDagsterType(tuple, "PythonTuple", description="Represents a python tuple")\n\n\nclass TypedTupleDagsterTypeLoader(DagsterTypeLoader):\n def __init__(self, dagster_types):\n self._dagster_types = check.list_param(dagster_types, "dagster_types", of_type=DagsterType)\n\n @property\n def schema_type(self):\n return Array(ConfigAnyInstance)\n\n def construct_from_config_value(self, context, config_value):\n return tuple(\n (\n self._dagster_types[idx].loader.construct_from_config_value(context, item)\n for idx, item in enumerate(config_value)\n )\n )\n\n\nclass _TypedPythonTuple(DagsterType):\n def __init__(self, dagster_types):\n all_have_input_configs = all((dagster_type.loader for dagster_type in dagster_types))\n self.dagster_types = dagster_types\n super(_TypedPythonTuple, self).__init__(\n key="TypedPythonTuple" + ".".join(map(lambda t: t.key, dagster_types)),\n name=None,\n loader=(TypedTupleDagsterTypeLoader(dagster_types) if all_have_input_configs else None),\n type_check_fn=self.type_check_method,\n )\n\n def type_check_method(self, context, value):\n from dagster.core.definitions.events import TypeCheck\n\n if not isinstance(value, tuple):\n return TypeCheck(\n success=False,\n description="Value should be a tuple, got a {value_type}".format(\n value_type=type(value)\n ),\n )\n\n if len(value) != len(self.dagster_types):\n return TypeCheck(\n success=False,\n description=(\n "Tuple with key {key} requires {n} entries, received {m} " "values"\n ).format(key=self.key, n=len(self.dagster_types), m=len(value)),\n )\n\n for item, dagster_type in zip(value, self.dagster_types):\n item_check = dagster_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def display_name(self):\n return "Tuple[{}]".format(\n ",".join([inner_type.display_name for inner_type in self.dagster_types])\n )\n\n @property\n def inner_types(self):\n return self.dagster_types\n\n @property\n def type_param_keys(self):\n return [dt.key for dt in self.dagster_types]\n\n\ndef create_typed_tuple(*dagster_type_args):\n dagster_types = list(map(resolve_dagster_type, dagster_type_args))\n\n check.invariant(\n not any((dagster_type.kind == DagsterTypeKind.NOTHING for dagster_type in dagster_types)),\n "Cannot create a runtime tuple containing inner type Nothing. Use List for fan-in",\n )\n\n return _TypedPythonTuple(dagster_types)\n\n\nclass DagsterTupleApi:\n def __getitem__(self, tuple_types):\n check.not_none_param(tuple_types, "tuple_types")\n if isinstance(tuple_types, tuple):\n return create_typed_tuple(*tuple_types)\n else:\n return create_typed_tuple(tuple_types)\n\n\nTuple = DagsterTupleApi()\n
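``Tuple`` additionally checks arity: the runtime tuple must have exactly as many entries as declared type parameters, each checked positionally. A short sketch:

.. code-block:: python

    from dagster import InputDefinition, Tuple, solid

    @solid(input_defs=[InputDefinition("pair", Tuple[str, int])])
    def describe_pair(_, pair):
        # pair has been checked to be a 2-tuple of (str, int).
        name, count = pair
        return "{}: {}".format(name, count)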
\n"""\nSerialization & deserialization for Dagster objects.\n\nWhy have custom serialization?\n\n* Default json serialization doesn't work well on namedtuples, which we use extensively to create\n immutable value types. Namedtuples serialize like tuples as flat lists.\n* Explicit whitelisting should help ensure we are only persisting or communicating across a\n serialization boundary the types we expect to.\n\nWhy not pickle?\n\n* This isn't meant to replace pickle in the conditions that pickle is reasonable to use\n (in memory, not human readable, etc) just handle the json case effectively.\n"""\nimport hashlib\nimport importlib\nimport sys\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom collections import namedtuple\nfrom enum import Enum\nfrom inspect import Parameter, signature\n\nimport yaml\nfrom dagster import check, seven\nfrom dagster.utils import compose\n\n_WHITELIST_MAP = {\n "types": {"tuple": {}, "enum": {}},\n "persistence": {},\n}\n\n\ndef create_snapshot_id(snapshot):\n json_rep = serialize_dagster_namedtuple(snapshot)\n m = hashlib.sha1() # so that hexdigest is 40, not 64 bytes\n m.update(json_rep.encode("utf-8"))\n return m.hexdigest()\n\n\ndef serialize_pp(value):\n return serialize_dagster_namedtuple(value, indent=2, separators=(",", ": "))\n\n\ndef register_serdes_tuple_fallbacks(fallback_map):\n for class_name, klass in fallback_map.items():\n _WHITELIST_MAP["types"]["tuple"][class_name] = klass\n\n\ndef _get_dunder_new_params_dict(klass):\n return signature(klass.__new__).parameters\n\n\ndef _get_dunder_new_params(klass):\n return list(_get_dunder_new_params_dict(klass).values())\n\n\nclass SerdesClassUsageError(Exception):\n pass\n\n\nclass Persistable(ABC):\n def to_storage_value(self):\n return default_to_storage_value(self, _WHITELIST_MAP)\n\n @classmethod\n def from_storage_dict(cls, storage_dict):\n return default_from_storage_dict(cls, storage_dict)\n\n\ndef _check_serdes_tuple_class_invariants(klass):\n dunder_new_params = _get_dunder_new_params(klass)\n\n cls_param = dunder_new_params[0]\n\n def _with_header(msg):\n return "For namedtuple {class_name}: {msg}".format(class_name=klass.__name__, msg=msg)\n\n if cls_param.name not in {"cls", "_cls"}:\n raise SerdesClassUsageError(\n _with_header(\n 'First parameter must be _cls or cls. Got "{name}".'.format(name=cls_param.name)\n )\n )\n\n value_params = dunder_new_params[1:]\n\n for index, field in enumerate(klass._fields):\n\n if index >= len(value_params):\n error_msg = (\n "Missing parameters to __new__. You have declared fields "\n "in the named tuple that are not present as parameters to the "\n "to the __new__ method. In order for "\n "both serdes serialization and pickling to work, "\n "these must match. Missing: {missing_fields}"\n ).format(missing_fields=repr(list(klass._fields[index:])))\n\n raise SerdesClassUsageError(_with_header(error_msg))\n\n value_param = value_params[index]\n if value_param.name != field:\n error_msg = (\n "Params to __new__ must match the order of field declaration in the namedtuple. "\n 'Declared field number {one_based_index} in the namedtuple is "{field_name}". 
'\n 'Parameter {one_based_index} in __new__ method is "{param_name}".'\n ).format(one_based_index=index + 1, field_name=field, param_name=value_param.name)\n raise SerdesClassUsageError(_with_header(error_msg))\n\n if len(value_params) > len(klass._fields):\n # Ensure that remaining parameters have default values\n for extra_param_index in range(len(klass._fields), len(value_params) - 1):\n if value_params[extra_param_index].default == Parameter.empty:\n error_msg = (\n 'Parameter "{param_name}" is a parameter to the __new__ '\n "method but is not a field in this namedtuple. The only "\n "reason why this should exist is that "\n "it is a field that used to exist (we refer to this as the graveyard) "\n "but no longer does. However it might exist in historical storage. This "\n "parameter existing ensures that serdes continues to work. However these "\n "must come at the end and have a default value for pickling to work."\n ).format(param_name=value_params[extra_param_index].name)\n raise SerdesClassUsageError(_with_header(error_msg))\n\n\ndef _whitelist_for_persistence(whitelist_map):\n def __whitelist_for_persistence(klass):\n check.subclass_param(klass, "klass", Persistable)\n whitelist_map["persistence"][klass.__name__] = klass\n return klass\n\n return __whitelist_for_persistence\n\n\ndef _whitelist_for_serdes(whitelist_map):\n def __whitelist_for_serdes(klass):\n if issubclass(klass, Enum):\n whitelist_map["types"]["enum"][klass.__name__] = klass\n elif issubclass(klass, tuple):\n _check_serdes_tuple_class_invariants(klass)\n whitelist_map["types"]["tuple"][klass.__name__] = klass\n else:\n check.failed("Can not whitelist class {klass} for serdes".format(klass=klass))\n\n return klass\n\n return __whitelist_for_serdes\n\n\ndef whitelist_for_serdes(klass):\n check.class_param(klass, "klass")\n return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP)(klass)\n\n\ndef whitelist_for_persistence(klass):\n check.class_param(klass, "klass")\n return compose(\n _whitelist_for_persistence(whitelist_map=_WHITELIST_MAP),\n _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP),\n )(klass)\n\n\ndef pack_value(val):\n return _pack_value(val, whitelist_map=_WHITELIST_MAP)\n\n\ndef _pack_value(val, whitelist_map):\n if isinstance(val, list):\n return [_pack_value(i, whitelist_map) for i in val]\n if isinstance(val, tuple):\n klass_name = val.__class__.__name__\n check.invariant(\n klass_name in whitelist_map["types"]["tuple"],\n "Can only serialize whitelisted namedtuples, received tuple {}".format(val),\n )\n if klass_name in whitelist_map["persistence"]:\n return val.to_storage_value()\n base_dict = {key: _pack_value(value, whitelist_map) for key, value in val._asdict().items()}\n base_dict["__class__"] = klass_name\n return base_dict\n if isinstance(val, Enum):\n klass_name = val.__class__.__name__\n check.invariant(\n klass_name in whitelist_map["types"]["enum"],\n "Can only serialize whitelisted Enums, received {}".format(klass_name),\n )\n return {"__enum__": str(val)}\n if isinstance(val, set):\n return {"__set__": [_pack_value(item, whitelist_map) for item in val]}\n if isinstance(val, frozenset):\n return {"__frozenset__": [_pack_value(item, whitelist_map) for item in val]}\n if isinstance(val, dict):\n return {key: _pack_value(value, whitelist_map) for key, value in val.items()}\n\n return val\n\n\ndef _serialize_dagster_namedtuple(nt, whitelist_map, **json_kwargs):\n return seven.json.dumps(_pack_value(nt, whitelist_map), **json_kwargs)\n\n\ndef serialize_value(val):\n return 
seven.json.dumps(_pack_value(val, whitelist_map=_WHITELIST_MAP))\n\n\ndef deserialize_value(val):\n return _unpack_value(\n seven.json.loads(check.str_param(val, "val")), whitelist_map=_WHITELIST_MAP,\n )\n\n\ndef serialize_dagster_namedtuple(nt, **json_kwargs):\n return _serialize_dagster_namedtuple(\n check.tuple_param(nt, "nt"), whitelist_map=_WHITELIST_MAP, **json_kwargs\n )\n\n\ndef unpack_value(val):\n return _unpack_value(val, whitelist_map=_WHITELIST_MAP,)\n\n\ndef _unpack_value(val, whitelist_map):\n if isinstance(val, list):\n return [_unpack_value(i, whitelist_map) for i in val]\n if isinstance(val, dict) and val.get("__class__"):\n klass_name = val.pop("__class__")\n if klass_name not in whitelist_map["types"]["tuple"]:\n check.failed(\n 'Attempted to deserialize class "{}" which is not in the serdes whitelist.'.format(\n klass_name\n )\n )\n\n klass = whitelist_map["types"]["tuple"][klass_name]\n if klass is None:\n return None\n\n unpacked_val = {key: _unpack_value(value, whitelist_map) for key, value in val.items()}\n\n if klass_name in whitelist_map["persistence"]:\n return klass.from_storage_dict(unpacked_val)\n\n # Naively implements backwards compatibility by filtering arguments that aren't present in\n # the constructor. If a property is present in the serialized object, but doesn't exist in\n # the version of the class loaded into memory, that property will be completely ignored.\n # The call to seven.get_args turns out to be pretty expensive -- we should probably turn\n # to, e.g., manually managing the deprecated keys on the serdes constructor.\n args_for_class = seven.get_args(klass)\n filtered_val = {k: v for k, v in unpacked_val.items() if k in args_for_class}\n return klass(**filtered_val)\n if isinstance(val, dict) and val.get("__enum__"):\n name, member = val["__enum__"].split(".")\n return getattr(whitelist_map["types"]["enum"][name], member)\n if isinstance(val, dict) and val.get("__set__") is not None:\n return set([_unpack_value(item, whitelist_map) for item in val["__set__"]])\n if isinstance(val, dict) and val.get("__frozenset__") is not None:\n return frozenset([_unpack_value(item, whitelist_map) for item in val["__frozenset__"]])\n if isinstance(val, dict):\n return {key: _unpack_value(value, whitelist_map) for key, value in val.items()}\n\n return val\n\n\ndef deserialize_json_to_dagster_namedtuple(json_str):\n dagster_namedtuple = _deserialize_json_to_dagster_namedtuple(\n check.str_param(json_str, "json_str"), whitelist_map=_WHITELIST_MAP\n )\n check.invariant(\n isinstance(dagster_namedtuple, tuple),\n "Output of deserialized json_str was not a namedtuple. 
Received type {}.".format(\n type(dagster_namedtuple)\n ),\n )\n return dagster_namedtuple\n\n\ndef _deserialize_json_to_dagster_namedtuple(json_str, whitelist_map):\n return _unpack_value(seven.json.loads(json_str), whitelist_map=whitelist_map)\n\n\ndef default_to_storage_value(value, whitelist_map):\n base_dict = {key: _pack_value(value, whitelist_map) for key, value in value._asdict().items()}\n base_dict["__class__"] = value.__class__.__name__\n return base_dict\n\n\ndef default_from_storage_dict(cls, storage_dict):\n return cls.__new__(cls, **storage_dict)\n\n\n[docs]@whitelist_for_serdes\nclass ConfigurableClassData(\n namedtuple("_ConfigurableClassData", "module_name class_name config_yaml")\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name, class_name, config_yaml):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n check.str_param(module_name, "module_name"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n def info_dict(self):\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": yaml.safe_load(self.config_yaml),\n }\n\n def rehydrate(self):\n from dagster.core.errors import DagsterInvalidConfigError\n from dagster.config.field import resolve_to_config_type\n from dagster.config.validate import process_config\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the "\n "configurable class {configurable_class}".format(\n module_name=self.module_name,\n configurable_class=self.module_name + "." + self.class_name,\n )\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "configurable class {configurable_class}".format(\n class_name=self.class_name,\n configurable_class=self.module_name + "." + self.class_name,\n )\n )\n\n if not issubclass(klass, ConfigurableClass):\n raise check.CheckError(\n klass,\n "class {class_name} in module {module_name}".format(\n class_name=self.class_name, module_name=self.module_name\n ),\n ConfigurableClass,\n )\n\n config_dict = yaml.safe_load(self.config_yaml)\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n "Errors whilst loading configuration for {}.".format(klass.config_type()),\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, result.value)\n\n\n[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. 
engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @abstractproperty\n def inst_data(self):\n """\n Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n[docs] @classmethod\n @abstractmethod\n def config_type(cls):\n """dagster.ConfigType: The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n\n[docs] @staticmethod\n @abstractmethod\n def from_config_value(inst_data, config_value):\n """New up an instance of the ConfigurableClass from a validated config value.\n\n Called by ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster.core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """\n
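
The helpers above compose into a simple round trip: a namedtuple class is whitelisted, packed into a dict tagged with ``__class__``, dumped to JSON, and rebuilt on deserialization. The following is a minimal sketch; the ``MySnapshot`` class is hypothetical and the ``dagster.serdes`` import path is assumed from the module shown above.

.. code-block:: python

    # Minimal round-trip sketch. MySnapshot is hypothetical; the dagster.serdes
    # import path is assumed from the module above.
    from collections import namedtuple

    from dagster.serdes import (
        deserialize_json_to_dagster_namedtuple,
        serialize_dagster_namedtuple,
        whitelist_for_serdes,
    )


    @whitelist_for_serdes
    class MySnapshot(namedtuple("_MySnapshot", "name count")):
        # __new__ params must match the namedtuple fields for the serdes invariant checks
        def __new__(cls, name, count):
            return super(MySnapshot, cls).__new__(cls, name, count)


    json_str = serialize_dagster_namedtuple(MySnapshot(name="foo", count=3))
    assert deserialize_json_to_dagster_namedtuple(json_str) == MySnapshot(name="foo", count=3)
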
\nimport contextlib\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nfrom collections import namedtuple\nfrom enum import Enum\nfrom typing import Iterator\nfrom warnings import warn\n\nimport _thread as thread\nimport yaml\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster.seven import IS_WINDOWS, multiprocessing\nfrom dagster.seven.abc import Mapping\n\nfrom .merger import merge_dicts\nfrom .yaml_utils import load_yaml_from_glob_list, load_yaml_from_globs, load_yaml_from_path\n\nif sys.version_info > (3,):\n from pathlib import Path # pylint: disable=import-error\nelse:\n from pathlib2 import Path # pylint: disable=import-error\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\n\n[docs]def file_relative_path(dunderfile, relative_path):\n """\n This function is useful when one needs to load a file that is\n relative to the position of the current file. (Such as when\n you encode a configuration file path in source file and want\n in runnable in any current working directory)\n\n It is meant to be used like the following:\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)\n\n\ndef script_relative_path(file_path):\n """\n Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string):\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict):\n check.dict_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return list(ddict.items())[0]\n\n\n@contextlib.contextmanager\ndef pushd(path):\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path):\n """"Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. 
Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path):\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n pass\n else:\n raise\n\n\nclass frozendict(dict):\n def __readonly__(self, *args, **kwargs):\n raise RuntimeError("Cannot modify ReadOnlyDict")\n\n # https://docs.python.org/3/library/pickle.html#object.__reduce__\n #\n # For a dict, the default behavior for pickle is to iteratively call __setitem__ (see 5th item\n # in __reduce__ tuple). Since we want to disable __setitem__ and still inherit dict, we\n # override this behavior by defining __reduce__. We return the 3rd item in the tuple, which is\n # passed to __setstate__, allowing us to restore the frozendict.\n\n def __reduce__(self):\n return (frozendict, (), dict(self))\n\n def __setstate__(self, state):\n self.__init__(state)\n\n __setitem__ = __readonly__\n __delitem__ = __readonly__\n pop = __readonly__ # type: ignore[assignment]\n popitem = __readonly__\n clear = __readonly__\n update = __readonly__ # type: ignore[assignment]\n setdefault = __readonly__\n del __readonly__\n\n\nclass frozenlist(list):\n def __readonly__(self, *args, **kwargs):\n raise RuntimeError("Cannot modify ReadOnlyList")\n\n # https://docs.python.org/3/library/pickle.html#object.__reduce__\n #\n # Like frozendict, implement __reduce__ and __setstate__ to handle pickling.\n # Otherwise, __setstate__ will be called to restore the frozenlist, causing\n # a RuntimeError because frozenlist is not mutable.\n\n def __reduce__(self):\n return (frozenlist, (), list(self))\n\n def __setstate__(self, state):\n self.__init__(state)\n\n __setitem__ = __readonly__ # type: ignore[assignment]\n __delitem__ = __readonly__\n append = __readonly__\n clear = __readonly__\n extend = __readonly__\n insert = __readonly__\n pop = __readonly__\n remove = __readonly__\n reverse = __readonly__\n sort = __readonly__ # type: ignore[assignment]\n\n def __hash__(self):\n return hash(tuple(self))\n\n\ndef make_readonly_value(value):\n if isinstance(value, list):\n return frozenlist(list(map(make_readonly_value, value)))\n elif isinstance(value, dict):\n return frozendict({key: make_readonly_value(value) for key, value in value.items()})\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_pipeline(path, pipeline_fn_name, env_file=None):\n from dagster.core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # 
pylint: disable=print-call\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there is still no\n # guarantee that another process will not create a file at this path. The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if os.path.exists(path):\n os.unlink(path)\n\n\ndef ensure_gen(thing_or_gen):\n if not inspect.isgenerator(thing_or_gen):\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path):\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n\n\ndef ensure_file(path):\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. 
An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(context, iterator):\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt):\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\n# hashable frozen string to string dict\nclass frozentags(frozendict):\n def __init__(self, *args, **kwargs):\n super(frozentags, self).__init__(*args, **kwargs)\n check.dict_param(self, "self", key_type=str, value_type=str)\n\n def __hash__(self):\n return hash(tuple(sorted(self.items())))\n\n def updated_with(self, new_tags):\n check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)\n updated = dict(self)\n for key, value in new_tags.items():\n updated[key] = value\n\n return frozentags(updated)\n\n\nclass EventGenerationManager:\n """ Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(self, generator, object_cls, require_object=True):\n self.generator = check.generator(generator)\n self.object_cls = check.type_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self):\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n "generator never yielded object of type {}".format(self.object_cls.__name__),\n )\n\n def get_object(self):\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return self.object\n\n def generate_teardown_events(self):\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp):\n tz = None\n if sys.version_info.major >= 3 and sys.version_info.minor >= 2:\n from datetime import timezone\n\n tz = timezone.utc\n else:\n import pytz\n\n tz = pytz.utc\n\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef is_enum_value(value):\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root():\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault():\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port():\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add, to_remove):\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules():\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid):\n if IS_WINDOWS:\n import psutil # pylint: disable=import-error\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """\n Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).\n """\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return 
functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n
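
As a quick illustration of two of the utilities above (a minimal sketch using only names defined in this module): ``compose(f, g)(x)`` evaluates ``f(g(x))``, and ``make_readonly_value`` recursively wraps containers in ``frozendict``/``frozenlist``, which raise ``RuntimeError`` on mutation.

.. code-block:: python

    # Sketch of compose and make_readonly_value from the utils module above.
    from dagster.utils import compose, make_readonly_value

    double_then_increment = compose(lambda x: x + 1, lambda x: x * 2)
    assert double_then_increment(3) == 7  # compose(f, g)(x) == f(g(x))

    frozen = make_readonly_value({"tags": ["a", "b"]})
    try:
        frozen["tags"] = []  # frozendict disables __setitem__
    except RuntimeError:
        pass
    assert frozen["tags"] == ["a", "b"]  # frozenlist still compares equal to a plain list
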
\nimport datetime\n\nimport pendulum\nfrom dagster import check\nfrom dagster.core.definitions.partition import Partition, PartitionSetDefinition\nfrom dagster.core.definitions.schedule import ScheduleExecutionContext\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.utils.schedules import schedule_execution_time_iterator\n\nDEFAULT_MONTHLY_FORMAT = "%Y-%m"\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\nDEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE = "%Y-%m-%d-%H:%M"\nDEFAULT_HOURLY_FORMAT_WITH_TIMEZONE = DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE + "%z"\n\n\ndef schedule_partition_range(\n start, end, cron_schedule, fmt, timezone, execution_time_to_partition_fn,\n):\n check.inst_param(start, "start", datetime.datetime)\n check.opt_inst_param(end, "end", datetime.datetime)\n check.str_param(cron_schedule, "cron_schedule")\n check.str_param(fmt, "fmt")\n check.opt_str_param(timezone, "timezone")\n check.callable_param(execution_time_to_partition_fn, "execution_time_to_partition_fn")\n\n if end and start > end:\n raise DagsterInvariantViolationError(\n 'Selected date range start "{start}" is after date range end "{end}'.format(\n start=start.strftime(fmt), end=end.strftime(fmt),\n )\n )\n\n def get_schedule_range_partitions():\n tz = timezone if timezone else pendulum.now().timezone.name\n _start = (\n start.in_tz(tz)\n if isinstance(start, pendulum.Pendulum)\n else pendulum.instance(start, tz=tz)\n )\n\n if not end:\n _end = pendulum.now(tz)\n elif isinstance(end, pendulum.Pendulum):\n _end = end.in_tz(tz)\n else:\n _end = pendulum.instance(end, tz=tz)\n\n end_timestamp = _end.timestamp()\n\n partitions = []\n for next_time in schedule_execution_time_iterator(_start.timestamp(), cron_schedule, tz):\n\n partition_time = execution_time_to_partition_fn(next_time)\n\n if partition_time.timestamp() > end_timestamp:\n break\n\n if partition_time.timestamp() < _start.timestamp():\n continue\n\n partitions.append(Partition(value=partition_time, name=partition_time.strftime(fmt)))\n\n return partitions[:-1]\n\n return get_schedule_range_partitions\n\n\n[docs]def date_partition_range(\n start, end=None, delta_range="days", fmt=None, inclusive=False, timezone=None,\n):\n """ Utility function that returns a partition generating function to be used in creating a\n `PartitionSet` definition.\n\n Args:\n start (datetime): Datetime capturing the start of the time range.\n end (Optional(datetime)): Datetime capturing the end of the partition. By default, the\n current time is used. The range is not inclusive of the end\n value.\n delta_range (Optional(str)): string representing the time duration of each partition.\n Must be a valid argument to pendulum.period.range ("days", "hours", "months", etc.).\n fmt (Optional(str)): Format string to represent each partition by its start time\n inclusive (Optional(bool)): By default, the partition set only contains date interval\n partitions for which the end time of the interval is less than current time. In other\n words, the partition set contains date interval partitions that are completely in the\n past. 
If inclusive is set to True, then the partition set will include all date\n interval partitions for which the start time of the interval is less than the\n current time.\n timezone (Optional(str)): Timezone in which the partition values should be expressed.\n Returns:\n Callable[[], List[Partition]]\n """\n\n check.inst_param(start, "start", datetime.datetime)\n check.opt_inst_param(end, "end", datetime.datetime)\n check.str_param(delta_range, "delta_range")\n fmt = check.opt_str_param(fmt, "fmt", default=DEFAULT_DATE_FORMAT)\n check.opt_str_param(timezone, "timezone")\n\n delta_amount = 1\n\n if end and start > end:\n raise DagsterInvariantViolationError(\n 'Selected date range start "{start}" is after date range end "{end}'.format(\n start=start.strftime(fmt), end=end.strftime(fmt),\n )\n )\n\n def get_date_range_partitions():\n tz = timezone if timezone else pendulum.now().timezone.name\n _start = (\n start.in_tz(tz)\n if isinstance(start, pendulum.Pendulum)\n else pendulum.instance(start, tz=tz)\n )\n\n if not end:\n _end = pendulum.now(tz)\n elif isinstance(end, pendulum.Pendulum):\n _end = end.in_tz(tz)\n else:\n _end = pendulum.instance(end, tz=tz)\n\n period = pendulum.period(_start, _end)\n date_names = [\n Partition(value=current, name=current.strftime(fmt))\n for current in period.range(delta_range, delta_amount)\n ]\n\n # We don't include the last element here by default since we only want\n # fully completed intervals, and the _end time is in the middle of the interval\n # represented by the last element of date_names\n if inclusive:\n return date_names\n\n return date_names[:-1]\n\n return get_date_range_partitions\n\n\n[docs]def identity_partition_selector(context, partition_set_def):\n """ Utility function for supplying a partition selector when creating a schedule from a\n partition set made of `datetime`s that assumes the schedule always executes at the\n partition time.\n\n It's important that the cron string passed into `create_schedule_definition` match\n the partition set times. For example, a schedule created from a partition set with partitions for each day at\n midnight would create its partition selector as follows:\n\n .. code-block:: python\n\n partition_set = PartitionSetDefinition(\n name='hello_world_partition_set',\n pipeline_name='hello_world_pipeline',\n partition_fn= date_partition_range(\n start=datetime.datetime(2021, 1, 1),\n delta_range="days",\n timezone="US/Central",\n )\n run_config_fn_for_partition=my_run_config_fn,\n )\n\n schedule_definition = partition_set.create_schedule_definition(\n "hello_world_daily_schedule",\n "0 0 * * *",\n partition_selector=identity_partition_selector,\n execution_timezone="US/Central",\n )\n """\n\n return create_offset_partition_selector(lambda d: d)(context, partition_set_def)\n\n\n[docs]def create_offset_partition_selector(execution_time_to_partition_fn):\n """ Utility function for supplying a partition selector when creating a schedule from a\n partition set made of `datetime`s that assumes a fixed time offset between the partition\n time and the time at which the schedule executes.\n\n It's important to keep the cron string that's supplied to\n `PartitionSetDefinition.create_schedule_definition` in sync with the offset that's\n supplied to this function. For example, a schedule created from a partition set with\n partitions for each day at midnight that fills in the partition for day N at day N+1 at\n 10:00AM would create the partition selector as follows:\n\n .. 
code-block:: python\n\n partition_set = PartitionSetDefinition(\n name='hello_world_partition_set',\n pipeline_name='hello_world_pipeline',\n partition_fn= date_partition_range(\n start=datetime.datetime(2021, 1, 1),\n delta_range="days",\n timezone="US/Central",\n )\n run_config_fn_for_partition=my_run_config_fn,\n )\n\n schedule_definition = partition_set.create_schedule_definition(\n "daily_10am_schedule",\n "0 10 * * *",\n partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1))\n execution_timezone="US/Central",\n )\n\n Args:\n execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]): A\n function that maps the execution time of the schedule to the partition time.\n """\n\n check.callable_param(execution_time_to_partition_fn, "execution_time_to_partition_fn")\n\n def offset_partition_selector(context, partition_set_def):\n check.inst_param(context, "context", ScheduleExecutionContext)\n check.inst_param(partition_set_def, "partition_set_def", PartitionSetDefinition)\n\n if not context.scheduled_execution_time:\n partitions = partition_set_def.get_partitions()\n if not partitions:\n return None\n return partitions[-1]\n\n partition_time = execution_time_to_partition_fn(context.scheduled_execution_time)\n\n for partition in reversed(partition_set_def.get_partitions()):\n if partition.value.isoformat() == partition_time.isoformat():\n return partition\n\n if partition.value < partition_time:\n break\n\n return None\n\n return offset_partition_selector\n
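
A minimal sketch of ``date_partition_range`` as documented above, assuming the ``dagster.utils.partitions`` import path; with the default non-inclusive behavior the final, still-open interval is dropped.

.. code-block:: python

    # Illustrative only; the dagster.utils.partitions import path is assumed.
    import datetime

    from dagster.utils.partitions import date_partition_range

    partition_fn = date_partition_range(
        start=datetime.datetime(2021, 1, 1),
        end=datetime.datetime(2021, 1, 4),
        delta_range="days",
    )
    # The last interval is excluded by default, so three daily partitions remain.
    assert [p.name for p in partition_fn()] == ["2021-01-01", "2021-01-02", "2021-01-03"]
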
\nimport os\nimport shutil\nimport tempfile\nimport uuid\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\n# top-level include is dangerous in terms of incurring circular deps\nfrom dagster import (\n DagsterInvariantViolationError,\n DependencyDefinition,\n Failure,\n ModeDefinition,\n PipelineDefinition,\n RepositoryDefinition,\n SolidInvocation,\n TypeCheck,\n check,\n execute_pipeline,\n lambda_solid,\n)\nfrom dagster.core.definitions.logger import LoggerDefinition\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.resource import ScopedResourcesBuilder\nfrom dagster.core.definitions.solid import NodeDefinition\nfrom dagster.core.execution.api import create_execution_plan, scoped_pipeline_context\nfrom dagster.core.execution.context_creation_pipeline import (\n SystemPipelineExecutionContext,\n construct_execution_context_data,\n create_context_creation_data,\n create_executor,\n create_log_manager,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.scheduler import Scheduler\nfrom dagster.core.scheduler.scheduler import DagsterScheduleDoesNotExist, DagsterSchedulerError\nfrom dagster.core.snap import snapshot_from_execution_plan\nfrom dagster.core.storage.file_manager import LocalFileManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.types.dagster_type import resolve_dagster_type\nfrom dagster.core.utility_solids import define_stub_solid\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.serdes import ConfigurableClass\n\n# pylint: disable=unused-import\nfrom ..temp_file import (\n get_temp_dir,\n get_temp_file_handle,\n get_temp_file_handle_with_data,\n get_temp_file_name,\n get_temp_file_name_with_data,\n get_temp_file_names,\n)\nfrom ..typing_api import is_typing_type\n\n\ndef create_test_pipeline_execution_context(logger_defs=None):\n from dagster.core.storage.intermediate_storage import build_in_mem_intermediates_storage\n\n loggers = check.opt_dict_param(\n logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition\n )\n mode_def = ModeDefinition(logger_defs=loggers)\n pipeline_def = PipelineDefinition(\n name="test_legacy_context", solid_defs=[], mode_defs=[mode_def]\n )\n run_config = {"loggers": {key: {} for key in loggers}}\n pipeline_run = PipelineRun(pipeline_name="test_legacy_context", run_config=run_config)\n instance = DagsterInstance.ephemeral()\n execution_plan = create_execution_plan(pipeline=pipeline_def, run_config=run_config)\n creation_data = create_context_creation_data(execution_plan, run_config, pipeline_run, instance)\n log_manager = create_log_manager(creation_data)\n scoped_resources_builder = ScopedResourcesBuilder()\n executor = create_executor(creation_data)\n\n return SystemPipelineExecutionContext(\n construct_execution_context_data(\n context_creation_data=creation_data,\n scoped_resources_builder=scoped_resources_builder,\n intermediate_storage=build_in_mem_intermediates_storage(pipeline_run.run_id),\n log_manager=log_manager,\n retries=executor.retries,\n raise_on_error=True,\n ),\n executor=executor,\n log_manager=log_manager,\n )\n\n\ndef _dep_key_of(solid):\n return SolidInvocation(solid.definition.name, solid.name)\n\n\ndef build_pipeline_with_input_stubs(pipeline_def, inputs):\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.dict_param(inputs, "inputs", key_type=str, value_type=dict)\n\n deps = defaultdict(dict)\n for solid_name, dep_dict in 
pipeline_def.dependencies.items():\n for input_name, dep in dep_dict.items():\n deps[solid_name][input_name] = dep\n\n stub_solid_defs = []\n\n for solid_name, input_dict in inputs.items():\n if not pipeline_def.has_solid_named(solid_name):\n raise DagsterInvariantViolationError(\n (\n "You are injecting an input value for solid {solid_name} "\n "into pipeline {pipeline_name} but that solid was not found"\n ).format(solid_name=solid_name, pipeline_name=pipeline_def.name)\n )\n\n solid = pipeline_def.solid_named(solid_name)\n for input_name, input_value in input_dict.items():\n stub_solid_def = define_stub_solid(\n "__stub_{solid_name}_{input_name}".format(\n solid_name=solid_name, input_name=input_name\n ),\n input_value,\n )\n stub_solid_defs.append(stub_solid_def)\n deps[_dep_key_of(solid)][input_name] = DependencyDefinition(stub_solid_def.name)\n\n return PipelineDefinition(\n name=pipeline_def.name + "_stubbed",\n solid_defs=pipeline_def.top_level_solid_defs + stub_solid_defs,\n mode_defs=pipeline_def.mode_definitions,\n dependencies=deps,\n )\n\n\n[docs]def execute_solids_within_pipeline(\n pipeline_def,\n solid_names,\n inputs=None,\n run_config=None,\n mode=None,\n preset=None,\n tags=None,\n instance=None,\n):\n """Execute a set of solids within an existing pipeline.\n\n Intended to support tests. Input values may be passed directly.\n\n Args:\n pipeline_def (PipelineDefinition): The pipeline within which to execute the solid.\n solid_names (FrozenSet[str]): A set of the solid names, or the aliased solids, to execute.\n inputs (Optional[Dict[str, Dict[str, Any]]]): A dict keyed on solid names, whose values are\n dicts of input names to input values, used to pass input values to the solids directly.\n You may also use the ``run_config`` to configure any inputs that are configurable.\n run_config (Optional[dict]): The environment configuration that parameterized this\n execution, as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Dict[str, Union[CompositeSolidExecutionResult, SolidExecutionResult]]: The results of\n executing the solids, keyed by solid name.\n """\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.set_param(solid_names, "solid_names", of_type=str)\n inputs = check.opt_dict_param(inputs, "inputs", key_type=str, value_type=dict)\n\n sub_pipeline = pipeline_def.get_pipeline_subset_def(solid_names)\n stubbed_pipeline = build_pipeline_with_input_stubs(sub_pipeline, inputs)\n result = execute_pipeline(\n stubbed_pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n instance=instance,\n )\n\n return {sr.solid.name: sr for sr in result.solid_result_list}\n\n\n[docs]def execute_solid_within_pipeline(\n pipeline_def,\n solid_name,\n inputs=None,\n run_config=None,\n mode=None,\n preset=None,\n tags=None,\n instance=None,\n):\n """Execute a single solid within an existing pipeline.\n\n Intended to support tests. 
Input values may be passed directly.\n\n Args:\n pipeline_def (PipelineDefinition): The pipeline within which to execute the solid.\n solid_name (str): The name of the solid, or the aliased solid, to execute.\n inputs (Optional[Dict[str, Any]]): A dict of input names to input values, used to\n pass input values to the solid directly. You may also use the ``run_config`` to\n configure any inputs that are configurable.\n run_config (Optional[dict]): The environment configuration that parameterized this\n execution, as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the\n solid.\n """\n\n return execute_solids_within_pipeline(\n pipeline_def,\n solid_names={solid_name},\n inputs={solid_name: inputs} if inputs else None,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n instance=instance,\n )[solid_name]\n\n\n@contextmanager\ndef yield_empty_pipeline_context(run_id=None, instance=None):\n pipeline = InMemoryPipeline(PipelineDefinition([]))\n pipeline_def = pipeline.get_definition()\n instance = check.opt_inst_param(\n instance, "instance", DagsterInstance, default=DagsterInstance.ephemeral()\n )\n\n execution_plan = create_execution_plan(pipeline)\n\n pipeline_run = instance.create_run(\n pipeline_name="<empty>",\n run_id=run_id,\n run_config=None,\n mode=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan, pipeline_def.get_pipeline_snapshot_id()\n ),\n parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),\n )\n with scoped_pipeline_context(execution_plan, {}, pipeline_run, instance) as context:\n yield context\n\n\n[docs]def execute_solid(\n solid_def, mode_def=None, input_values=None, tags=None, run_config=None, raise_on_error=True,\n):\n """Execute a single solid in an ephemeral pipeline.\n\n Intended to support unit tests. Input values may be passed directly, and no pipeline need be\n specified -- an ephemeral pipeline will be constructed.\n\n Args:\n solid_def (SolidDefinition): The solid to execute.\n mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this\n if, e.g., custom resources, loggers, or executors are desired.\n input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to\n pass inputs to the solid directly. 
You may also use the ``run_config`` to\n configure any inputs that are configurable.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n run_config (Optional[dict]): The environment configuration that parameterized this\n execution, as a dict.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the\n solid.\n """\n check.inst_param(solid_def, "solid_def", NodeDefinition)\n check.opt_inst_param(mode_def, "mode_def", ModeDefinition)\n input_values = check.opt_dict_param(input_values, "input_values", key_type=str)\n solid_defs = [solid_def]\n\n def create_value_solid(input_name, input_value):\n @lambda_solid(name=input_name)\n def input_solid():\n return input_value\n\n return input_solid\n\n dependencies = defaultdict(dict)\n\n for input_name, input_value in input_values.items():\n dependencies[solid_def.name][input_name] = DependencyDefinition(input_name)\n solid_defs.append(create_value_solid(input_name, input_value))\n\n result = execute_pipeline(\n PipelineDefinition(\n name="ephemeral_{}_solid_pipeline".format(solid_def.name),\n solid_defs=solid_defs,\n dependencies=dependencies,\n mode_defs=[mode_def] if mode_def else None,\n ),\n run_config=run_config,\n mode=mode_def.name if mode_def else None,\n tags=tags,\n raise_on_error=raise_on_error,\n )\n return result.result_for_handle(solid_def.name)\n\n\n[docs]def check_dagster_type(dagster_type, value):\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n (\n "Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n ).format(dagster_type=dagster_type)\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n with yield_empty_pipeline_context() as pipeline_context:\n context = pipeline_context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check\n\n\n@contextmanager\ndef copy_directory(src):\n with tempfile.TemporaryDirectory() as temp_dir:\n dst = os.path.join(temp_dir, os.path.basename(src))\n shutil.copytree(src, dst)\n yield dst\n\n\nclass FilesystemTestScheduler(Scheduler, ConfigurableClass):\n """This class is used in dagster core and dagster_graphql to test the scheduler's interactions\n with schedule storage, which are implemented in the methods defined on the base Scheduler class.\n Therefore, the following methods used to actually schedule jobs (e.g. 
create and remove cron jobs\n on a cron tab) are left unimplemented.\n """\n\n def __init__(self, artifacts_dir, inst_data=None):\n check.str_param(artifacts_dir, "artifacts_dir")\n self._artifacts_dir = artifacts_dir\n self._inst_data = inst_data\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": str}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return FilesystemTestScheduler(artifacts_dir=config_value["base_dir"], inst_data=inst_data)\n\n def debug_info(self):\n return ""\n\n def start_schedule(self, instance, external_schedule):\n pass\n\n def stop_schedule(self, instance, schedule_origin_id):\n pass\n\n def running_schedule_count(self, instance, schedule_origin_id):\n return 0\n\n def get_logs_path(self, _instance, schedule_origin_id):\n check.str_param(schedule_origin_id, "schedule_origin_id")\n return os.path.join(self._artifacts_dir, "logs", schedule_origin_id, "scheduler.log")\n\n def wipe(self, instance):\n pass\n
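
A hedged unit-test sketch for ``execute_solid`` above: a trivial, hypothetical ``add_one`` solid is run in an ephemeral pipeline with an input value passed directly; the ``dagster.utils.test`` import path is assumed from the module shown here.

.. code-block:: python

    # add_one is hypothetical; the dagster.utils.test import path is assumed.
    from dagster import InputDefinition, lambda_solid
    from dagster.utils.test import execute_solid


    @lambda_solid(input_defs=[InputDefinition("num")])
    def add_one(num):
        return num + 1


    result = execute_solid(add_one, input_values={"num": 2})
    assert result.success
    assert result.output_value() == 3
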
\nimport datetime\nimport logging\nimport sys\nfrom contextlib import contextmanager\n\nimport dateutil\nimport lazy_object_proxy\nimport pendulum\nfrom airflow.models import TaskInstance\nfrom airflow.models.baseoperator import BaseOperator\nfrom airflow.models.dag import DAG\nfrom airflow.models.dagbag import DagBag\nfrom airflow.settings import LOG_FORMAT\nfrom dagster import (\n DagsterInvariantViolationError,\n DependencyDefinition,\n InputDefinition,\n MultiDependencyDefinition,\n Nothing,\n OutputDefinition,\n PipelineDefinition,\n SolidDefinition,\n check,\n repository,\n solid,\n)\nfrom dagster.core.definitions.utils import VALID_NAME_REGEX, validate_tags\nfrom dagster.core.instance import AIRFLOW_EXECUTION_DATE_STR, IS_AIRFLOW_INGEST_PIPELINE_STR\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\n\n\nclass DagsterAirflowError(Exception):\n pass\n\n\ndef contains_duplicate_task_names(dag_bag, refresh_from_airflow_db):\n check.inst_param(dag_bag, "dag_bag", DagBag)\n check.bool_param(refresh_from_airflow_db, "refresh_from_airflow_db")\n seen_task_names = set()\n\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n dag = dag_bag.dags.get(dag_id) if not refresh_from_airflow_db else dag_bag.get_dag(dag_id)\n for task in dag.tasks:\n if task.task_id in seen_task_names:\n return True\n else:\n seen_task_names.add(task.task_id)\n return False\n\n\n[docs]def make_dagster_repo_from_airflow_dag_bag(\n dag_bag, repo_name, refresh_from_airflow_db=False, use_airflow_template_context=False\n):\n """ Construct a Dagster repository corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_repo.py`:\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_repo_from_dag_bag():\n return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, 'my_repo_name')\n\n Use RepositoryDefinition as usual, for example:\n `dagit -f path/to/make_dagster_repo.py -n make_repo_from_dag_bag`\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n repo_name (str): Name for generated RepositoryDefinition\n refresh_from_airflow_db (bool): If True, will refresh DAG if expired via DagBag.get_dag(),\n which requires access to initialized Airflow DB. If False (recommended), gets dag from\n DagBag's dags dict without depending on Airflow DB. 
(default: False)\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n\n Returns:\n RepositoryDefinition\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n check.str_param(repo_name, "repo_name")\n check.bool_param(refresh_from_airflow_db, "refresh_from_airflow_db")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n\n use_unique_id = contains_duplicate_task_names(dag_bag, refresh_from_airflow_db)\n\n pipeline_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n # Only call Airflow DB via dag_bag.get_dag(dag_id) if refresh_from_airflow_db is True\n dag = dag_bag.dags.get(dag_id) if not refresh_from_airflow_db else dag_bag.get_dag(dag_id)\n if not use_unique_id:\n pipeline_defs.append(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag, tags=None, use_airflow_template_context=use_airflow_template_context\n )\n )\n else:\n pipeline_defs.append(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags=None,\n use_airflow_template_context=use_airflow_template_context,\n unique_id=count,\n )\n )\n count += 1\n\n @repository(name=repo_name)\n def _repo():\n return pipeline_defs\n\n return _repo\n\n\n[docs]def make_dagster_repo_from_airflow_example_dags(repo_name="airflow_example_dags_repo"):\n """ Construct a Dagster repository for Airflow's example DAGs.\n\n Execution of the following Airflow example DAGs is not currently supported:\n 'example_external_task_marker_child',\n 'example_pig_operator',\n 'example_skip_dag',\n 'example_trigger_target_dag',\n 'example_xcom',\n 'test_utils',\n\n Usage:\n\n Create `make_dagster_repo.py`:\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_repo_from_airflow_example_dags()\n\n Use RepositoryDefinition as usual, for example:\n `dagit -f path/to/make_dagster_repo.py -n make_airflow_example_dags`\n\n Args:\n repo_name (str): Name for generated RepositoryDefinition\n\n Returns:\n RepositoryDefinition\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1.10.8, v1.10.9, v1.10.10 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is currently fixed in master.\n # v1.10 stable: https://github.com/apache/airflow/blob/v1-10-stable/airflow/example_dags/example_complex.py#L133\n # master (05-05-2020): https://github.com/apache/airflow/blob/master/airflow/example_dags/example_complex.py#L136\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name)\n\n\n[docs]def make_dagster_repo_from_airflow_dags_path(\n dag_path,\n repo_name,\n safe_mode=True,\n store_serialized_dags=False,\n use_airflow_template_context=False,\n):\n """ Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n ``DagBag.get_dag()`` dependency requires Airflow DB to be initialized.\n\n Usage:\n Create ``make_dagster_repo.py``:\n\n .. 
code-block:: python\n\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dags_path\n\n def make_repo_from_dir():\n return make_dagster_repo_from_airflow_dags_path(\n '/path/to/dags/', 'my_repo_name'\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagit -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n repo_name (str): Name for generated RepositoryDefinition\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n store_serialized_dags (bool): True to read Airflow DAGS from Airflow DB. False to read DAGS\n from Python files. (default: False)\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n\n Returns:\n RepositoryDefinition\n """\n check.str_param(dag_path, "dag_path")\n check.str_param(repo_name, "repo_name")\n check.bool_param(safe_mode, "safe_mode")\n check.bool_param(store_serialized_dags, "store_serialized_dags")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n\n try:\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n store_serialized_dags=store_serialized_dags,\n )\n except Exception: # pylint: disable=broad-except\n raise DagsterAirflowError("Error initializing airflow.models.dagbag object with arguments")\n\n return make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name, use_airflow_template_context)\n\n\n[docs]def make_dagster_pipeline_from_airflow_dag(\n dag, tags=None, use_airflow_template_context=False, unique_id=None\n):\n """Construct a Dagster pipeline corresponding to a given Airflow DAG.\n\n Tasks in the resulting pipeline will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Run Pipeline with 'default' preset, which sets execution_date to the\n time (in UTC) of pipeline invocation:\n\n .. code-block:: python\n\n execute_pipeline(\n pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),\n preset='default')\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineDefinition tags. This will\n override behavior from (1).\n\n .. code-block:: python\n\n execute_pipeline(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n )\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineRun tags,\n such as in the Dagit UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating pipeline name and solid\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster pipeline\n tags (Dict[str, Field]): Pipeline tags. 
Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n unique_id (int): If not None, this id will be postpended to generated solid names. Used by\n framework authors to enforce unique solid names within a repo.\n\n Returns:\n pipeline_def (PipelineDefinition): The generated Dagster pipeline\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_dict_param(tags, "tags")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n tags = validate_tags(tags)\n\n pipeline_dependencies, solid_defs = _get_pipeline_definition_args(\n dag, use_airflow_template_context, unique_id\n )\n pipeline_def = PipelineDefinition(\n name=normalized_name(dag.dag_id, None),\n solid_defs=solid_defs,\n dependencies=pipeline_dependencies,\n tags=tags,\n )\n return pipeline_def\n\n\n# Airflow DAG ids and Task ids allow a larger valid character set (alphanumeric characters,\n# dashes, dots and underscores) than Dagster's naming conventions (alphanumeric characters,\n# underscores), so Dagster will strip invalid characters and replace with '_'\ndef normalized_name(name, unique_id):\n base_name = "airflow_" + "".join(c if VALID_NAME_REGEX.match(c) else "_" for c in name)\n if not unique_id:\n return base_name\n else:\n return base_name + "_" + str(unique_id)\n\n\ndef _get_pipeline_definition_args(dag, use_airflow_template_context, unique_id=None):\n check.inst_param(dag, "dag", DAG)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n pipeline_dependencies = {}\n solid_defs = []\n seen_tasks = []\n\n # To enforce predictable iteration order\n dag_roots = sorted(dag.roots, key=lambda x: x.task_id)\n for task in dag_roots:\n _traverse_airflow_dag(\n task,\n seen_tasks,\n pipeline_dependencies,\n solid_defs,\n use_airflow_template_context,\n unique_id,\n )\n return (pipeline_dependencies, solid_defs)\n\n\ndef _traverse_airflow_dag(\n task, seen_tasks, pipeline_dependencies, solid_defs, use_airflow_template_context, unique_id\n):\n check.inst_param(task, "task", BaseOperator)\n check.list_param(seen_tasks, "seen_tasks", BaseOperator)\n check.list_param(solid_defs, "solid_defs", SolidDefinition)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n seen_tasks.append(task)\n current_solid = make_dagster_solid_from_airflow_task(\n task, use_airflow_template_context, unique_id\n )\n solid_defs.append(current_solid)\n\n if len(task.upstream_list) > 0:\n # To enforce predictable iteration order\n task_upstream_list = sorted(task.upstream_list, key=lambda x: x.task_id)\n\n pipeline_dependencies[current_solid.name] = {\n "airflow_task_ready": MultiDependencyDefinition(\n [\n DependencyDefinition(\n solid=normalized_name(task_upstream.task_id, unique_id),\n output="airflow_task_complete",\n )\n for task_upstream in task_upstream_list\n ]\n )\n }\n\n # To enforce predictable iteration order\n task_downstream_list = sorted(task.downstream_list, key=lambda x: x.task_id)\n for child_task in 
task_downstream_list:\n if child_task not in seen_tasks:\n _traverse_airflow_dag(\n child_task,\n seen_tasks,\n pipeline_dependencies,\n solid_defs,\n use_airflow_template_context,\n unique_id,\n )\n\n\n@contextmanager\ndef replace_airflow_logger_handlers():\n try:\n # Redirect airflow handlers to stdout / compute logs\n prev_airflow_handlers = logging.getLogger("airflow.task").handlers\n handler = logging.StreamHandler(sys.stdout)\n handler.setFormatter(logging.Formatter(LOG_FORMAT))\n root = logging.getLogger("airflow.task")\n root.handlers = [handler]\n yield\n finally:\n # Restore previous log handlers\n logging.getLogger("airflow.task").handlers = prev_airflow_handlers\n\n\n# If unique_id is not None, this id will be postpended to generated solid names, generally used\n# to enforce unique solid names within a repo.\ndef make_dagster_solid_from_airflow_task(task, use_airflow_template_context, unique_id=None):\n check.inst_param(task, "task", BaseOperator)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n @solid(\n name=normalized_name(task.task_id, unique_id),\n input_defs=[InputDefinition("airflow_task_ready", Nothing)],\n output_defs=[OutputDefinition(Nothing, "airflow_task_complete")],\n )\n def _solid(context): # pylint: disable=unused-argument\n if AIRFLOW_EXECUTION_DATE_STR not in context.pipeline_run.tags:\n raise DagsterInvariantViolationError(\n 'Could not find "{AIRFLOW_EXECUTION_DATE_STR}" in pipeline tags "{tags}". Please '\n 'add "{AIRFLOW_EXECUTION_DATE_STR}" to pipeline tags before executing'.format(\n AIRFLOW_EXECUTION_DATE_STR=AIRFLOW_EXECUTION_DATE_STR,\n tags=context.pipeline_run.tags,\n )\n )\n execution_date_str = context.pipeline_run.tags.get(AIRFLOW_EXECUTION_DATE_STR)\n\n check.str_param(execution_date_str, "execution_date_str")\n try:\n execution_date = dateutil.parser.parse(execution_date_str)\n except ValueError:\n raise DagsterInvariantViolationError(\n 'Could not parse execution_date "{execution_date_str}". 
Please use datetime format '\n "compatible with dateutil.parser.parse.".format(\n execution_date_str=execution_date_str,\n )\n )\n except OverflowError:\n raise DagsterInvariantViolationError(\n 'Date "{execution_date_str}" exceeds the largest valid C integer on the system.'.format(\n execution_date_str=execution_date_str,\n )\n )\n\n check.inst_param(execution_date, "execution_date", datetime.datetime)\n\n with replace_airflow_logger_handlers():\n task_instance = TaskInstance(task=task, execution_date=execution_date)\n\n ti_context = (\n dagster_get_template_context(task_instance, task, execution_date)\n if not use_airflow_template_context\n else task_instance.get_template_context()\n )\n task.render_template_fields(ti_context)\n\n task.execute(ti_context)\n\n return None\n\n return _solid\n\n\ndef dagster_get_template_context(task_instance, task, execution_date):\n """\n Modified from /airflow/models/taskinstance.py to not reference Airflow DB\n (1) Removes the following block, which queries DB, removes dagrun instances, recycles run_id\n if hasattr(task, 'dag'):\n if task.dag.params:\n params.update(task.dag.params)\n from airflow.models.dagrun import DagRun # Avoid circular import\n\n dag_run = (\n session.query(DagRun)\n .filter_by(dag_id=task.dag.dag_id, execution_date=execution_date)\n .first()\n )\n run_id = dag_run.run_id if dag_run else None\n session.expunge_all()\n session.commit()\n (2) Removes returning 'conf': conf which passes along Airflow config\n (3) Removes 'var': {'value': VariableAccessor(), 'json': VariableJsonAccessor()}, which allows\n fetching Variable from Airflow DB\n """\n from airflow import macros\n\n tables = None\n if "tables" in task.params:\n tables = task.params["tables"]\n\n params = {}\n run_id = ""\n dag_run = None\n\n ds = execution_date.strftime("%Y-%m-%d")\n ts = execution_date.isoformat()\n yesterday_ds = (execution_date - datetime.timedelta(1)).strftime("%Y-%m-%d")\n tomorrow_ds = (execution_date + datetime.timedelta(1)).strftime("%Y-%m-%d")\n\n # For manually triggered dagruns that aren't run on a schedule, next/previous\n # schedule dates don't make sense, and should be set to execution date for\n # consistency with how execution_date is set for manually triggered tasks, i.e.\n # triggered_date == execution_date.\n if dag_run and dag_run.external_trigger:\n prev_execution_date = execution_date\n next_execution_date = execution_date\n else:\n prev_execution_date = task.dag.previous_schedule(execution_date)\n next_execution_date = task.dag.following_schedule(execution_date)\n\n next_ds = None\n next_ds_nodash = None\n if next_execution_date:\n next_ds = next_execution_date.strftime("%Y-%m-%d")\n next_ds_nodash = next_ds.replace("-", "")\n next_execution_date = pendulum.instance(next_execution_date)\n\n prev_ds = None\n prev_ds_nodash = None\n if prev_execution_date:\n prev_ds = prev_execution_date.strftime("%Y-%m-%d")\n prev_ds_nodash = prev_ds.replace("-", "")\n prev_execution_date = pendulum.instance(prev_execution_date)\n\n ds_nodash = ds.replace("-", "")\n ts_nodash = execution_date.strftime("%Y%m%dT%H%M%S")\n ts_nodash_with_tz = ts.replace("-", "").replace(":", "")\n yesterday_ds_nodash = yesterday_ds.replace("-", "")\n tomorrow_ds_nodash = tomorrow_ds.replace("-", "")\n\n ti_key_str = "{dag_id}__{task_id}__{ds_nodash}".format(\n dag_id=task.dag_id, task_id=task.task_id, ds_nodash=ds_nodash\n )\n\n if task.params:\n params.update(task.params)\n\n return {\n "dag": task.dag,\n "ds": ds,\n "next_ds": next_ds,\n "next_ds_nodash": 
next_ds_nodash,\n "prev_ds": prev_ds,\n "prev_ds_nodash": prev_ds_nodash,\n "ds_nodash": ds_nodash,\n "ts": ts,\n "ts_nodash": ts_nodash,\n "ts_nodash_with_tz": ts_nodash_with_tz,\n "yesterday_ds": yesterday_ds,\n "yesterday_ds_nodash": yesterday_ds_nodash,\n "tomorrow_ds": tomorrow_ds,\n "tomorrow_ds_nodash": tomorrow_ds_nodash,\n "END_DATE": ds,\n "end_date": ds,\n "dag_run": dag_run,\n "run_id": run_id,\n "execution_date": pendulum.instance(execution_date),\n "prev_execution_date": prev_execution_date,\n "prev_execution_date_success": lazy_object_proxy.Proxy(\n lambda: task_instance.previous_execution_date_success\n ),\n "prev_start_date_success": lazy_object_proxy.Proxy(\n lambda: task_instance.previous_start_date_success\n ),\n "next_execution_date": next_execution_date,\n "latest_date": ds,\n "macros": macros,\n "params": params,\n "tables": tables,\n "task": task,\n "task_instance": task_instance,\n "ti": task_instance,\n "task_instance_key_str": ti_key_str,\n "test_mode": task_instance.test_mode,\n "inlets": task.inlets,\n "outlets": task.outlets,\n }\n
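For reference, a minimal sketch of driving the factory above from plain Python, assuming a hypothetical module ``my_dags`` that exposes an Airflow DAG named ``my_dag``; it mirrors option (2) from the docstring by pinning ``airflow_execution_date`` in the pipeline tags:

.. code-block:: python

    from dagster import execute_pipeline
    from dagster_airflow.dagster_pipeline_factory import make_dagster_pipeline_from_airflow_dag

    from my_dags import my_dag  # hypothetical module exposing an airflow.models.DAG

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=my_dag,
        # Pinning execution_date via definition tags overrides the 'default' preset behavior.
        tags={'airflow_execution_date': '2019-11-13T00:00:00+00:00'},
    )

    result = execute_pipeline(pipeline_def)
    assert result.success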
\nimport datetime\nimport re\nfrom collections import namedtuple\n\nfrom airflow import DAG\nfrom airflow.operators import BaseOperator\nfrom dagster import check, seven\nfrom dagster.core.definitions.reconstructable import ReconstructableRepository\nfrom dagster.core.execution.api import create_execution_plan\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.instance.ref import InstanceRef\nfrom dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot, snapshot_from_execution_plan\nfrom dagster_airflow.operators.util import check_storage_specified\n\nfrom .compile import coalesce_execution_steps\nfrom .operators.docker_operator import DagsterDockerOperator\nfrom .operators.python_operator import DagsterPythonOperator\n\nDEFAULT_ARGS = {\n "depends_on_past": False,\n "email": ["airflow@example.com"],\n "email_on_failure": False,\n "email_on_retry": False,\n "owner": "airflow",\n "retries": 1,\n "retry_delay": datetime.timedelta(0, 300),\n "start_date": datetime.datetime(1900, 1, 1, 0, 0),\n}\n\n# Airflow DAG names are not allowed to be longer than 250 chars\nAIRFLOW_MAX_DAG_NAME_LEN = 250\n\n\ndef _make_dag_description(pipeline_name):\n return """Editable scaffolding autogenerated by dagster-airflow from pipeline {pipeline_name}\n """.format(\n pipeline_name=pipeline_name\n )\n\n\ndef _rename_for_airflow(name):\n """Modify pipeline name for Airflow to meet constraints on DAG names:\n https://github.com/apache/airflow/blob/1.10.3/airflow/utils/helpers.py#L52-L63\n\n Here, we just substitute underscores for illegal characters to avoid imposing Airflow's\n constraints on our naming schemes.\n """\n return re.sub(r"[^\\w\\-\\.]", "_", name)[:AIRFLOW_MAX_DAG_NAME_LEN]\n\n\nclass DagsterOperatorInvocationArgs(\n namedtuple(\n "DagsterOperatorInvocationArgs",\n "recon_repo pipeline_name run_config mode step_keys instance_ref pipeline_snapshot "\n "execution_plan_snapshot parent_pipeline_snapshot",\n )\n):\n def __new__(\n cls,\n recon_repo,\n pipeline_name,\n run_config,\n mode,\n step_keys,\n instance_ref,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n ):\n return super(DagsterOperatorInvocationArgs, cls).__new__(\n cls,\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n step_keys=step_keys,\n instance_ref=instance_ref,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n )\n\n\nclass DagsterOperatorParameters(\n namedtuple(\n "_DagsterOperatorParameters",\n (\n "recon_repo pipeline_name run_config "\n "mode task_id step_keys dag instance_ref op_kwargs pipeline_snapshot "\n "execution_plan_snapshot parent_pipeline_snapshot"\n ),\n )\n):\n def __new__(\n cls,\n pipeline_name,\n task_id,\n recon_repo=None,\n run_config=None,\n mode=None,\n step_keys=None,\n dag=None,\n instance_ref=None,\n op_kwargs=None,\n pipeline_snapshot=None,\n execution_plan_snapshot=None,\n parent_pipeline_snapshot=None,\n ):\n check_storage_specified(run_config)\n\n return super(DagsterOperatorParameters, cls).__new__(\n cls,\n recon_repo=check.opt_inst_param(recon_repo, "recon_repo", ReconstructableRepository),\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n mode=check.opt_str_param(mode, "mode"),\n task_id=check.str_param(task_id, "task_id"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n 
dag=check.opt_inst_param(dag, "dag", DAG),\n instance_ref=check.opt_inst_param(instance_ref, "instance_ref", InstanceRef),\n op_kwargs=check.opt_dict_param(op_kwargs.copy(), "op_kwargs", key_type=str),\n pipeline_snapshot=check.inst_param(\n pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot\n ),\n execution_plan_snapshot=check.inst_param(\n execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n ),\n parent_pipeline_snapshot=check.opt_inst_param(\n parent_pipeline_snapshot, "parent_pipeline_snapshot", PipelineSnapshot\n ),\n )\n\n @property\n def invocation_args(self):\n return DagsterOperatorInvocationArgs(\n recon_repo=self.recon_repo,\n pipeline_name=self.pipeline_name,\n run_config=self.run_config,\n mode=self.mode,\n step_keys=self.step_keys,\n instance_ref=self.instance_ref,\n pipeline_snapshot=self.pipeline_snapshot,\n execution_plan_snapshot=self.execution_plan_snapshot,\n parent_pipeline_snapshot=self.parent_pipeline_snapshot,\n )\n\n\ndef _make_airflow_dag(\n recon_repo,\n pipeline_name,\n run_config=None,\n mode=None,\n instance=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n operator=DagsterPythonOperator,\n):\n check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)\n check.str_param(pipeline_name, "pipeline_name")\n run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n mode = check.opt_str_param(mode, "mode")\n # Default to use the (persistent) system temp directory rather than a TemporaryDirectory,\n # which would not be consistent between Airflow task invocations.\n instance = (\n check.inst_param(instance, "instance", DagsterInstance)\n if instance\n else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())\n )\n\n # Only used for Airflow; internally we continue to use pipeline.name\n dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(pipeline_name))\n\n dag_description = check.opt_str_param(\n dag_description, "dag_description", _make_dag_description(pipeline_name)\n )\n check.subclass_param(operator, "operator", BaseOperator)\n\n dag_kwargs = dict(\n {"default_args": DEFAULT_ARGS},\n **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str),\n )\n\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n\n dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)\n pipeline = recon_repo.get_definition().get_pipeline(pipeline_name)\n\n if mode is None:\n mode = pipeline.get_default_mode_name()\n\n execution_plan = create_execution_plan(pipeline, run_config, mode=mode)\n\n tasks = {}\n\n coalesced_plan = coalesce_execution_steps(execution_plan)\n\n for solid_handle, solid_steps in coalesced_plan.items():\n step_keys = [step.key for step in solid_steps]\n\n operator_parameters = DagsterOperatorParameters(\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n task_id=solid_handle,\n step_keys=step_keys,\n dag=dag,\n instance_ref=instance.get_ref(),\n op_kwargs=op_kwargs,\n pipeline_snapshot=pipeline.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()\n ),\n )\n task = operator(operator_parameters)\n\n tasks[solid_handle] = task\n\n for solid_step in solid_steps:\n for step_input in solid_step.step_inputs:\n for key in step_input.dependency_keys:\n prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()\n if solid_handle != prev_solid_handle:\n 
tasks[prev_solid_handle].set_downstream(task)\n\n return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])\n\n\n[docs]def make_airflow_dag(\n module_name,\n pipeline_name,\n run_config=None,\n mode=None,\n instance=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n):\n """Construct an Airflow DAG corresponding to a given Dagster pipeline.\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python\n callable, run by an underlying :py:class:`PythonOperator <airflow:PythonOperator>`. As a\n consequence, both dagster, any Python dependencies required by your solid logic, and the module\n containing your pipeline definition must be available in the Python environment within which\n your Airflow tasks execute. If you cannot install requirements into this environment, or you\n are looking for a containerized solution to provide better isolation, see instead\n :py:func:`make_airflow_dag_containerized`.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n module_name (str): The name of the importable module in which the pipeline definition can be\n found.\n pipeline_name (str): The name of the pipeline definition.\n run_config (Optional[dict]): The environment config, if any, with which to compile\n the pipeline to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator (a subclass of\n :py:class:`PythonOperator <airflow:airflow.operators.python_operator.PythonOperator>`).\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n\n """\n check.str_param(module_name, "module_name")\n\n recon_repo = ReconstructableRepository.for_module(module_name, pipeline_name)\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n instance=instance,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n )\n\n\n[docs]def make_airflow_dag_for_operator(\n recon_repo,\n pipeline_name,\n operator,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n):\n """Construct an Airflow DAG corresponding to a given Dagster pipeline and custom operator.\n\n `Custom operator template <https://github.com/dagster-io/dagster/blob/master/python_modules/dagster-test/dagster_test/dagster_airflow/custom_operator.py>`_\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate run by the given\n Operator :py:class:`BaseOperator <airflow.models.BaseOperator>`. 
If you\n are looking for a containerized solution to provide better isolation, see instead\n :py:func:`make_airflow_dag_containerized`.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n recon_repo (:class:`dagster.ReconstructableRepository`): reference to a Dagster RepositoryDefinition\n that can be reconstructed in another process\n pipeline_name (str): The name of the pipeline definition.\n operator (type): The operator to use. Must be a class that inherits from\n :py:class:`BaseOperator <airflow.models.BaseOperator>`\n run_config (Optional[dict]): The environment config, if any, with which to compile\n the pipeline to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n """\n check.subclass_param(operator, "operator", BaseOperator)\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=operator,\n )\n\n\ndef make_airflow_dag_for_recon_repo(\n recon_repo,\n pipeline_name,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n):\n return _make_airflow_dag(\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n )\n\n\n[docs]def make_airflow_dag_containerized(\n module_name,\n pipeline_name,\n image,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n):\n """Construct a containerized Airflow DAG corresponding to a given Dagster pipeline.\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate by calling the\n dagster-graphql API exposed by a container run using a subclass of\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`. 
As a\n consequence, both dagster, any Python dependencies required by your solid logic, and the module\n containing your pipeline definition must be available in the container spun up by this operator.\n Typically you'll want to install these requirements onto the image you're using.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n module_name (str): The name of the importable module in which the pipeline definition can be\n found.\n pipeline_name (str): The name of the pipeline definition.\n image (str): The name of the Docker image to use for execution (passed through to\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).\n run_config (Optional[dict]): The environment config, if any, with which to compile\n the pipeline to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator (a subclass of\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n """\n check.str_param(module_name, "module_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.str_param(image, "image")\n check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(dag_id, "dag_id")\n check.opt_str_param(dag_description, "dag_description")\n check.opt_dict_param(dag_kwargs, "dag_kwargs")\n check.opt_dict_param(op_kwargs, "op_kwargs")\n\n recon_repo = ReconstructableRepository.for_module(module_name, pipeline_name)\n\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n op_kwargs["image"] = image\n return _make_airflow_dag(\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=DagsterDockerOperator,\n )\n\n\ndef make_airflow_dag_containerized_for_recon_repo(\n recon_repo,\n pipeline_name,\n image,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n instance=None,\n):\n check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)\n check.str_param(pipeline_name, "pipeline_name")\n check.str_param(image, "image")\n check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(dag_id, "dag_id")\n check.opt_str_param(dag_description, "dag_description")\n check.opt_dict_param(dag_kwargs, "dag_kwargs")\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n op_kwargs["image"] = image\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n pipeline_name=pipeline_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n 
dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=DagsterDockerOperator,\n instance=instance,\n )\n
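A hedged sketch of how the factory above is typically consumed from an Airflow DAG definition file, assuming the top-level ``dagster_airflow`` export of ``make_airflow_dag``; ``my_package.pipelines`` and ``my_pipeline`` are placeholder names, and the filesystem storage block reflects the ``check_storage_specified`` requirement on ``run_config``:

.. code-block:: python

    # dags/my_pipeline_dag.py -- hypothetical file placed in the Airflow DAGs folder
    from dagster_airflow import make_airflow_dag

    dag, tasks = make_airflow_dag(
        module_name='my_package.pipelines',   # placeholder importable module
        pipeline_name='my_pipeline',          # placeholder pipeline name
        run_config={'storage': {'filesystem': {}}},
        dag_kwargs={'schedule_interval': '@daily'},
    )

Airflow discovers the module-scope ``dag`` object; the returned ``tasks`` list is mainly useful for further customization of the generated operators.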
\n# Portions of this file are copied from the Yelp MRJob project:\n#\n# https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nimport dagster\nfrom botocore.exceptions import WaiterError\nfrom dagster import check\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n\n\n\n[docs]class EmrJobRunner:\n def __init__(\n self, region, check_cluster_every=30, aws_access_key_id=None, aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. 
Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(\n "cluster {cluster_name} not found in region {region}".format(\n cluster_name=cluster_name, region=self.region\n )\n )\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. 
"j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags = cluster_config.get("Tags", {})\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. *reason* seems to only be set\n # when job is cancelled (e.g. 
'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! 
This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = "{log_key_prefix}{cluster_id}/steps/{step_id}".format(\n log_key_prefix=log_key_prefix, cluster_id=cluster_id, step_id=step_id\n )\n stdout_log = self.wait_for_log(log, log_bucket, "{prefix}/stdout.gz".format(prefix=prefix))\n stderr_log = self.wait_for_log(log, log_bucket, "{prefix}/stderr.gz".format(prefix=prefix))\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(\n "Attempting to get log: s3://{log_bucket}/{log_key}".format(\n log_bucket=log_bucket, log_key=log_key\n )\n )\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
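To make the client surface above concrete, here is a hedged sketch of submitting and polling a single shell step from inside a solid; the region, cluster name, and command are placeholders:

.. code-block:: python

    import time

    from dagster import solid
    from dagster_aws.emr import EmrJobRunner

    @solid
    def run_emr_shell_step(context):
        runner = EmrJobRunner(region='us-west-1')  # placeholder region
        cluster_id = runner.cluster_id_from_name('my cool cluster')  # placeholder cluster name

        step_def = EmrJobRunner.construct_step_dict_for_command(
            'hello world', ['echo', 'hello world']
        )
        step_id = runner.add_job_flow_steps(context.log, cluster_id, [step_def])[0]

        # is_emr_step_complete returns False while pending/running and raises EmrError on failure.
        while not runner.is_emr_step_complete(context.log, cluster_id, step_id):
            time.sleep(runner.check_cluster_every)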
\nimport os\nimport pickle\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import Field, StringSource, check, resource\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.events import log_step_event\nfrom dagster.core.execution.plan.external_step import (\n PICKLED_EVENTS_FILE_NAME,\n PICKLED_STEP_RUN_REF_FILE_NAME,\n step_context_to_step_run_ref,\n)\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n[docs]@resource(\n {\n "spark_config": get_spark_config(),\n "cluster_id": Field(\n StringSource, description="Name of the job flow (cluster) on which to execute."\n ),\n "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n "action_on_failure": Field(\n str,\n is_required=False,\n default_value="CANCEL_AND_WAIT",\n description="The EMR action to take when the cluster step fails: "\n "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html",\n ),\n "staging_bucket": Field(\n StringSource,\n is_required=True,\n description="S3 bucket to use for passing files between the plan process and EMR "\n "process.",\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="emr_staging",\n description="S3 key prefix inside the staging_bucket to use for files passed the plan "\n "process and EMR process",\n ),\n "wait_for_logs": Field(\n bool,\n is_required=False,\n default_value=False,\n description="If set, the system will wait for EMR logs to appear on S3. Note that logs "\n "are copied every 5 minutes, so enabling this will add several minutes to the job "\n "runtime.",\n ),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=True,\n description="Absolute path to the package that contains the pipeline definition(s) "\n "whose steps will execute remotely on EMR. This is a path on the local fileystem of "\n "the process executing the pipeline. The expectation is that this package will also be "\n "available on the python path of the launched process running the Spark step on EMR, "\n "either deployed on step launch via the deploy_pipeline_package option, referenced on "\n "s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap "\n "actions.",\n ),\n "deploy_local_pipeline_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description="If set, before every step run, the launcher will zip up all the code in "\n "local_pipeline_package_path, upload it to s3, and pass it to spark-submit's "\n "--py-files option. This gives the remote process access to up-to-date user code. "\n "If not set, the assumption is that some other mechanism is used for distributing code "\n "to the EMR cluster. If this option is set to True, s3_pipeline_package_path should "\n "not also be set.",\n ),\n "s3_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description="If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. 
If this option is set, "\n "deploy_local_pipeline_package should not be set to True.",\n ),\n }\n)\ndef emr_pyspark_step_launcher(context):\n return EmrPySparkStepLauncher(**context.resource_config)\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items()\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_pipeline_package_path,\n deploy_local_pipeline_package,\n s3_pipeline_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_pipeline_package or not s3_pipeline_package_path,\n "If deploy_local_pipeline_package is set to True, s3_pipeline_package_path should not "\n "also be set.",\n )\n\n self.local_pipeline_package_path = check.str_param(\n local_pipeline_package_path, "local_pipeline_package_path"\n )\n self.deploy_local_pipeline_package = check.bool_param(\n deploy_local_pipeline_package, "deploy_local_pipeline_package"\n )\n self.s3_pipeline_package_path = check.opt_str_param(\n s3_pipeline_package_path, "s3_pipeline_package_path"\n )\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """\n Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(\n "Uploading file {local_path} to {s3_uri}".format(\n local_path=local_path, s3_uri=s3_uri\n )\n )\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_pipeline_package:\n # Zip and upload package containing pipeline\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_pipeline_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context, prior_attempts_count):\n step_run_ref = step_context_to_step_run_ref(\n step_context, prior_attempts_count, self.local_pipeline_package_path\n )\n\n run_id = step_context.pipeline_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.solid.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n return self.wait_for_completion_and_log(log, run_id, step_key, emr_step_id, step_context)\n\n def wait_for_completion_and_log(self, log, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(log, s3, run_id, step_key, emr_step_id):\n log_step_event(step_context, event)\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(log, emr_step_id)\n\n def wait_for_completion(self, log, s3, run_id, step_key, emr_step_id, check_interval=15):\n """ We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(log, self.cluster_id, emr_step_id)\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in 
range(len(all_events), len(all_events_new)):\n yield all_events_new[i]\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object( # pylint: disable=no-member\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return pickle.loads(events_data)\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log."""\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n log._log( # pylint: disable=protected-access\n record.level,\n "".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n {},\n )\n log.info("Spark Driver stdout: " + stdout_log)\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc="spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"' % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, step_key, filename)\n return "s3://{bucket}/{key}".format(bucket=self.staging_bucket, key=key)\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join([self.staging_prefix, run_id, step_key, os.path.basename(filename)])\n
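A hedged sketch of wiring the step launcher above into a mode, assuming it is exported from ``dagster_aws.emr`` and using the conventional ``pyspark_step_launcher`` resource key; the bucket, cluster, and path values are placeholders, and ``spark_config`` is omitted on the assumption that its nested fields carry defaults:

.. code-block:: python

    from dagster import ModeDefinition
    from dagster_aws.emr import emr_pyspark_step_launcher

    emr_mode = ModeDefinition(
        name='emr',
        resource_defs={'pyspark_step_launcher': emr_pyspark_step_launcher},
    )

    # Matching resources block for run_config; every value below is a placeholder.
    run_config = {
        'resources': {
            'pyspark_step_launcher': {
                'config': {
                    'cluster_id': 'j-123ABC123ABC1',
                    'region_name': 'us-west-1',
                    'staging_bucket': 'my-staging-bucket',
                    'local_pipeline_package_path': '/path/to/my_package',
                    'deploy_local_pipeline_package': True,
                }
            }
        }
    }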
\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n[docs]class EmrClusterState(PyEnum):\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n[docs]class EmrStepState(PyEnum):\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
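A small sketch of how the state enums above are typically consumed, assuming they are re-exported from ``dagster_aws.emr``; raw boto3 state strings map directly onto the enum values:

.. code-block:: python

    from dagster_aws.emr import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState

    state = EmrClusterState('TERMINATED_WITH_ERRORS')
    assert state in EMR_CLUSTER_TERMINATED_STATES
    assert EmrClusterState('WAITING') not in EMR_CLUSTER_TERMINATED_STATES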
\nimport abc\nfrom contextlib import contextmanager\n\nimport psycopg2\nimport psycopg2.extensions\nfrom dagster import Field, IntSource, StringSource, check, resource\n\n\nclass RedshiftError(Exception):\n pass\n\n\nclass _BaseRedshiftResource(abc.ABC):\n def __init__(self, context): # pylint: disable=too-many-locals\n # Extract parameters from resource config\n self.conn_args = {\n k: context.resource_config.get(k)\n for k in (\n "host",\n "port",\n "user",\n "password",\n "database",\n "schema",\n "connect_timeout",\n "sslmode",\n )\n if context.resource_config.get(k) is not None\n }\n\n self.autocommit = context.resource_config.get("autocommit")\n self.log = context.log_manager\n\n @abc.abstractmethod\n def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n pass\n\n @abc.abstractmethod\n def execute_queries(\n self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n ):\n pass\n\n\nclass RedshiftResource(_BaseRedshiftResource):\n def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n """Synchronously execute a single query against Redshift. Will return a list of rows, where\n each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n Args:\n query (str): The query to execute.\n fetch_results (Optional[bool]): Whether to return the results of executing the query.\n Defaults to False, in which case the query will be executed without retrieving the\n results.\n cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n cursor_factory; defaults to None. Will be used when constructing the cursor.\n error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n callback function, invoked when an exception is encountered during query execution;\n this is intended to support executing additional queries to provide diagnostic\n information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n function is provided, exceptions during query execution will be raised directly.\n\n Returns:\n Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n fetch_results is set. Otherwise return None.\n """\n check.str_param(query, "query")\n check.bool_param(fetch_results, "fetch_results")\n check.opt_subclass_param(cursor_factory, "cursor_factory", psycopg2.extensions.cursor)\n check.opt_callable_param(error_callback, "error_callback")\n\n with self._get_conn() as conn:\n with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n try:\n self.log.info("Executing query '{query}'".format(query=query))\n cursor.execute(query)\n\n if fetch_results and cursor.rowcount > 0:\n return cursor.fetchall()\n else:\n self.log.info("Empty result from query")\n\n except Exception as e: # pylint: disable=broad-except\n # If autocommit is disabled or not set (it is disabled by default), Redshift\n # will be in the middle of a transaction at exception time, and because of\n # the failure the current transaction will not accept any further queries.\n #\n # This conn.commit() call closes the open transaction before handing off\n # control to the error callback, so that the user can issue additional\n # queries. Notably, for e.g. 
pg_last_copy_id() to work, it requires you to\n # use the same conn/cursor, so you have to do this conn.commit() to ensure\n # things are in a usable state in the error callback.\n if not self.autocommit:\n conn.commit()\n\n if error_callback is not None:\n error_callback(e, cursor, self.log)\n else:\n raise\n\n def execute_queries(\n self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n ):\n """Synchronously execute a list of queries against Redshift. Will return a list of list of\n rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n [[(1,)], [(1,)]].\n\n Args:\n queries (List[str]): The queries to execute.\n fetch_results (Optional[bool]): Whether to return the results of executing the query.\n Defaults to False, in which case the query will be executed without retrieving the\n results.\n cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n cursor_factory; defaults to None. Will be used when constructing the cursor.\n error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n callback function, invoked when an exception is encountered during query execution;\n this is intended to support executing additional queries to provide diagnostic\n information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n function is provided, exceptions during query execution will be raised directly.\n\n Returns:\n Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n tuples, when fetch_results is set. Otherwise return None.\n """\n check.list_param(queries, "queries", of_type=str)\n check.bool_param(fetch_results, "fetch_results")\n check.opt_subclass_param(cursor_factory, "cursor_factory", psycopg2.extensions.cursor)\n check.opt_callable_param(error_callback, "error_callback")\n\n results = []\n with self._get_conn() as conn:\n with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n for query in queries:\n try:\n self.log.info("Executing query '{query}'".format(query=query))\n cursor.execute(query)\n\n if fetch_results and cursor.rowcount > 0:\n results.append(cursor.fetchall())\n else:\n results.append([])\n self.log.info("Empty result from query")\n\n except Exception as e: # pylint: disable=broad-except\n # If autocommit is disabled or not set (it is disabled by default), Redshift\n # will be in the middle of a transaction at exception time, and because of\n # the failure the current transaction will not accept any further queries.\n #\n # This conn.commit() call closes the open transaction before handing off\n # control to the error callback, so that the user can issue additional\n # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n # use the same conn/cursor, so you have to do this conn.commit() to ensure\n # things are in a usable state in the error callback.\n if not self.autocommit:\n conn.commit()\n\n if error_callback is not None:\n error_callback(e, cursor, self.log)\n else:\n raise\n\n if fetch_results:\n return results\n\n @contextmanager\n def _get_conn(self):\n try:\n conn = psycopg2.connect(**self.conn_args)\n yield conn\n finally:\n conn.close()\n\n @contextmanager\n def _get_cursor(self, conn, cursor_factory=None):\n check.opt_subclass_param(cursor_factory, "cursor_factory", psycopg2.extensions.cursor)\n\n # Could be none, in which case we should respect the connection default. 
Otherwise\n # explicitly set to true/false.\n if self.autocommit is not None:\n conn.autocommit = self.autocommit\n\n with conn:\n with conn.cursor(cursor_factory=cursor_factory) as cursor:\n yield cursor\n\n # If autocommit is set, we'll commit after each and every query execution. Otherwise, we\n # want to do a final commit after we're wrapped up executing the full set of one or more\n # queries.\n if not self.autocommit:\n conn.commit()\n\n\nclass FakeRedshiftResource(_BaseRedshiftResource):\n QUERY_RESULT = [(1,)]\n\n def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n """Fake for execute_query; returns [self.QUERY_RESULT]\n\n Args:\n query (str): The query to execute.\n fetch_results (Optional[bool]): Whether to return the results of executing the query.\n Defaults to False, in which case the query will be executed without retrieving the\n results.\n cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n cursor_factory; defaults to None. Will be used when constructing the cursor.\n error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n callback function, invoked when an exception is encountered during query execution;\n this is intended to support executing additional queries to provide diagnostic\n information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n function is provided, exceptions during query execution will be raised directly.\n\n Returns:\n Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n fetch_results is set. Otherwise return None.\n """\n check.str_param(query, "query")\n check.bool_param(fetch_results, "fetch_results")\n check.opt_subclass_param(cursor_factory, "cursor_factory", psycopg2.extensions.cursor)\n check.opt_callable_param(error_callback, "error_callback")\n\n self.log.info("Executing query '{query}'".format(query=query))\n if fetch_results:\n return self.QUERY_RESULT\n\n def execute_queries(\n self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n ):\n """Fake for execute_queries; returns [self.QUERY_RESULT] * 3\n\n Args:\n queries (List[str]): The queries to execute.\n fetch_results (Optional[bool]): Whether to return the results of executing the query.\n Defaults to False, in which case the query will be executed without retrieving the\n results.\n cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n cursor_factory; defaults to None. Will be used when constructing the cursor.\n error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n callback function, invoked when an exception is encountered during query execution;\n this is intended to support executing additional queries to provide diagnostic\n information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n function is provided, exceptions during query execution will be raised directly.\n\n Returns:\n Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n tuples, when fetch_results is set. 
Otherwise return None.\n """\n check.list_param(queries, "queries", of_type=str)\n check.bool_param(fetch_results, "fetch_results")\n check.opt_subclass_param(cursor_factory, "cursor_factory", psycopg2.extensions.cursor)\n check.opt_callable_param(error_callback, "error_callback")\n\n for query in queries:\n self.log.info("Executing query '{query}'".format(query=query))\n if fetch_results:\n return [self.QUERY_RESULT] * 3\n\n\ndef define_redshift_config():\n """Redshift configuration. See the Redshift documentation for reference:\n\n https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-to-cluster.html\n """\n\n return {\n "host": Field(StringSource, description="Redshift host", is_required=True),\n "port": Field(\n IntSource, description="Redshift port", is_required=False, default_value=5439\n ),\n "user": Field(\n StringSource, description="Username for Redshift connection", is_required=False,\n ),\n "password": Field(\n StringSource, description="Password for Redshift connection", is_required=False,\n ),\n "database": Field(\n StringSource,\n description="Name of the default database to use. After login, you can use USE DATABASE"\n " to change the database.",\n is_required=False,\n ),\n "schema": Field(\n StringSource,\n description="Name of the default schema to use. After login, you can use USE SCHEMA to "\n "change the schema.",\n is_required=False,\n ),\n "autocommit": Field(\n bool,\n description="None by default, which honors the Redshift parameter AUTOCOMMIT. Set to "\n "True or False to enable or disable autocommit mode in the session, respectively.",\n is_required=False,\n ),\n "connect_timeout": Field(\n int,\n description="Connection timeout in seconds. 5 seconds by default",\n is_required=False,\n default_value=5,\n ),\n "sslmode": Field(\n str,\n description="SSL mode to use. See the Redshift documentation for more information on "\n "usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html",\n is_required=False,\n default_value="require",\n ),\n }\n\n\n[docs]@resource(\n config_schema=define_redshift_config(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n\n .. code-block:: python\n\n from dagster import ModeDefinition, execute_solid, solid\n from dagster_aws.redshift import redshift_resource\n\n @solid(required_resource_keys={'redshift'})\n def example_redshift_solid(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n result = execute_solid(\n example_redshift_solid,\n run_config={\n 'resources': {\n 'redshift': {\n 'config': {\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n }\n }\n }\n },\n mode_def=ModeDefinition(resource_defs={'redshift': redshift_resource}),\n )\n assert result.output_value() == [(1,)]\n\n """\n return RedshiftResource(context)\n\n\n[docs]@resource(\n config_schema=define_redshift_config(),\n description="Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case.",\n)\ndef fake_redshift_resource(context):\n return FakeRedshiftResource(context)\n
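The fake resource above is intended for tests. A minimal sketch (hedged: it assumes ``fake_redshift_resource`` is importable from ``dagster_aws.redshift`` alongside ``redshift_resource``, and the host value is a placeholder) of swapping it in for the real resource:

.. code-block:: python

    from dagster import ModeDefinition, execute_solid, solid
    from dagster_aws.redshift import fake_redshift_resource

    @solid(required_resource_keys={'redshift'})
    def example_redshift_solid(context):
        # The fake resource always returns [(1,)] for a single query
        return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)

    result = execute_solid(
        example_redshift_solid,
        run_config={
            'resources': {'redshift': {'config': {'host': 'fake-host', 'database': 'dev'}}}
        },
        mode_def=ModeDefinition(resource_defs={'redshift': fake_redshift_resource}),
    )
    assert result.output_value() == [(1,)]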
\nimport os\nfrom contextlib import contextmanager\n\nimport boto3\nfrom dagster import Field, StringSource, check, seven\nfrom dagster.core.storage.compute_log_manager import (\n MAX_BYTES_FILE_READ,\n ComputeIOType,\n ComputeLogFileData,\n ComputeLogManager,\n)\nfrom dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION, LocalComputeLogManager\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, ensure_file\n\n\n[docs]class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Logs solid compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster.seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data=None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = check.str_param(prefix, "prefix")\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self.local_manager = LocalComputeLogManager(local_dir)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs( # pylint: disable=protected-access\n pipeline_run, step_key\n ):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n "verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n def get_local_path(self, run_id, key, io_type):\n 
return self.local_manager.get_local_path(run_id, key, io_type)\n\n def on_watch_start(self, pipeline_run, step_key):\n self.local_manager.on_watch_start(pipeline_run, step_key)\n\n def on_watch_finish(self, pipeline_run, step_key):\n self.local_manager.on_watch_finish(pipeline_run, step_key)\n key = self.local_manager.get_key(pipeline_run, step_key)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)\n\n def is_watch_completed(self, run_id, key):\n return self.local_manager.is_watch_completed(run_id, key)\n\n def download_url(self, run_id, key, io_type):\n if not self.is_watch_completed(run_id, key):\n return self.local_manager.download_url(run_id, key, io_type)\n key = self._bucket_key(run_id, key, io_type)\n\n url = self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": key}\n )\n\n return url\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n if self._should_download(run_id, key, io_type):\n self._download_to_local(run_id, key, io_type)\n data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)\n return self._from_local_file_data(run_id, key, io_type, data)\n\n def on_subscribe(self, subscription):\n self.local_manager.on_subscribe(subscription)\n\n def _should_download(self, run_id, key, io_type):\n local_path = self.get_local_path(run_id, key, io_type)\n if os.path.exists(local_path):\n return False\n s3_objects = self._s3_session.list_objects(\n Bucket=self._s3_bucket, Prefix=self._bucket_key(run_id, key, io_type)\n )\n return len(s3_objects) > 0\n\n def _from_local_file_data(self, run_id, key, io_type, local_file_data):\n is_complete = self.is_watch_completed(run_id, key)\n path = (\n "s3://{}/{}".format(self._s3_bucket, self._bucket_key(run_id, key, io_type))\n if is_complete\n else local_file_data.path\n )\n\n return ComputeLogFileData(\n path,\n local_file_data.data,\n local_file_data.cursor,\n local_file_data.size,\n self.download_url(run_id, key, io_type),\n )\n\n def _upload_from_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_file(path)\n key = self._bucket_key(run_id, key, io_type)\n with open(path, "rb") as data:\n self._s3_session.upload_fileobj(data, self._s3_bucket, key)\n\n def _download_to_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_dir(os.path.dirname(path))\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(\n self._s3_bucket, self._bucket_key(run_id, key, io_type), fileobj\n )\n\n def _bucket_key(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n paths = [\n self._s3_prefix,\n "storage",\n run_id,\n "compute_logs",\n "{}.{}".format(key, extension),\n ]\n return "/".join(paths) # s3 path delimiter\n
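To make the storage layout concrete, here is an illustrative sketch (bucket name, prefix, run id, and step key are all made up) that reproduces the key construction in ``_bucket_key`` above, reusing the same ``IO_TYPE_EXTENSION`` mapping the class imports:

.. code-block:: python

    from dagster.core.storage.compute_log_manager import ComputeIOType
    from dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION

    # Hypothetical values, for illustration only
    s3_bucket = "mycorp-dagster-compute-logs"
    s3_prefix = "dagster-test-"
    run_id = "0f2d1f4e4a6b4b6aa9d1a1b2c3d4e5f6"
    step_key = "my_solid.compute"

    # Same composition as S3ComputeLogManager._bucket_key for stdout
    extension = IO_TYPE_EXTENSION[ComputeIOType.STDOUT]
    key = "/".join(
        [s3_prefix, "storage", run_id, "compute_logs", "{}.{}".format(step_key, extension)]
    )
    print("s3://{}/{}".format(s3_bucket, key))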
\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import Field, check, resource\nfrom dagster.core.storage.file_cache import FileCache\n\nfrom .file_manager import S3FileHandle\n\n\n[docs]class S3FileCache(FileCache):\n def __init__(self, s3_bucket, s3_key, s3_session, overwrite=False):\n super(S3FileCache, self).__init__(overwrite=overwrite)\n\n self.s3_bucket = s3_bucket\n self.s3_key = s3_key\n self.s3 = s3_session\n\n def has_file_object(self, file_key):\n check.str_param(file_key, "file_key")\n try:\n self.s3.get_object(Bucket=self.s3_bucket, Key=self.get_full_key(file_key))\n except ClientError:\n return False\n return True\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self.s3_key, file_key=file_key)\n\n def write_file_object(self, file_key, source_file_object):\n check.str_param(file_key, "file_key")\n\n self.s3.put_object(\n Body=source_file_object, Bucket=self.s3_bucket, Key=self.get_full_key(file_key)\n )\n return self.get_file_handle(file_key)\n\n def get_file_handle(self, file_key):\n check.str_param(file_key, "file_key")\n return S3FileHandle(self.s3_bucket, self.get_full_key(file_key))\n\n\n@resource(\n {\n "bucket": Field(str),\n "key": Field(str),\n "overwrite": Field(bool, is_required=False, default_value=False),\n }\n)\ndef s3_file_cache(init_context):\n return S3FileCache(\n s3_bucket=init_context.resource_config["bucket"],\n s3_key=init_context.resource_config["key"],\n overwrite=init_context.resource_config["overwrite"],\n # TODO: resource dependencies\n s3_session=boto3.resource("s3", use_ssl=True).meta.client,\n )\n
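A minimal sketch of using ``S3FileCache`` directly with a ``boto3`` client (the bucket and key names are placeholders, and the import assumes ``S3FileCache`` is exported from ``dagster_aws.s3``); in a pipeline, the ``s3_file_cache`` resource above constructs it for you:

.. code-block:: python

    import io

    import boto3
    from dagster_aws.s3 import S3FileCache

    file_cache = S3FileCache(
        s3_bucket="my-bucket", s3_key="file-cache", s3_session=boto3.client("s3"), overwrite=False
    )

    # Write the object only if it is not already cached under the base key
    if not file_cache.has_file_object("report.csv"):
        handle = file_cache.write_file_object("report.csv", io.BytesIO(b"col_a,col_b"))
    else:
        handle = file_cache.get_file_handle("report.csv")

    print(handle.s3_path)  # s3://my-bucket/file-cache/report.csv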
\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nfrom dagster import check, usable_as_dagster_type\nfrom dagster.core.storage.file_manager import (\n FileHandle,\n FileManager,\n TempfileManager,\n check_file_like_obj,\n)\n\n\n[docs]@usable_as_dagster_type\nclass S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=self.s3_key)\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n with open(self._get_local_path(file_handle), mode) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._s3_base_key, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
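A minimal sketch of round-tripping bytes through an ``S3FileManager`` constructed by hand (the bucket and base key are placeholders, and the import path is assumed; inside a pipeline the ``s3_file_manager`` resource documented below builds this object for you):

.. code-block:: python

    import boto3
    from dagster_aws.s3.file_manager import S3FileManager

    file_manager = S3FileManager(
        s3_session=boto3.client("s3"), s3_bucket="my-bucket", s3_base_key="file-manager"
    )

    handle = file_manager.write_data(b"hello")  # uploads to s3://my-bucket/file-manager/<uuid>
    assert file_manager.read_data(handle) == b"hello"
    file_manager.delete_local_temp()  # clean up locally cached downloads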
\nimport pickle\n\nfrom dagster import Field, IOManager, StringSource, check, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\n\n\nclass PickledObjectS3IOManager(IOManager):\n def __init__(\n self, s3_bucket, s3_session, s3_prefix=None,\n ):\n self.bucket = check.str_param(s3_bucket, "s3_bucket")\n self.s3_prefix = check.str_param(s3_prefix, "s3_prefix")\n self.s3 = s3_session\n self.s3.head_bucket(Bucket=self.bucket)\n\n def _get_path(self, context):\n return "/".join([self.s3_prefix, "storage", *context.get_run_scoped_output_identifier()])\n\n def _last_key(self, key):\n if "/" not in key:\n return key\n comps = key.split("/")\n return comps[-1]\n\n def _rm_object(self, key):\n check.str_param(key, "key")\n check.param_invariant(len(key) > 0, "key")\n\n def delete_for_results(store, results):\n store.s3.delete_objects(\n Bucket=store.bucket,\n Delete={"Objects": [{"Key": result["Key"]} for result in results["Contents"]]},\n )\n\n if self._has_object(key):\n results = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=key)\n delete_for_results(self, results)\n\n continuation = results["IsTruncated"]\n while continuation:\n continuation_token = results["NextContinuationToken"]\n results = self.s3.list_objects_v2(\n Bucket=self.bucket, Prefix=key, ContinuationToken=continuation_token\n )\n delete_for_results(self, results)\n continuation = results["IsTruncated"]\n\n def _has_object(self, key):\n check.str_param(key, "key")\n check.param_invariant(len(key) > 0, "key")\n\n key_count = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=key)["KeyCount"]\n return bool(key_count > 0)\n\n def _uri_for_key(self, key):\n check.str_param(key, "key")\n return "s3://" + self.bucket + "/" + "{key}".format(key=key)\n\n def load_input(self, context):\n key = self._get_path(context.upstream_output)\n context.log.debug(f"Loading S3 object from: {self._uri_for_key(key)}")\n obj = pickle.loads(self.s3.get_object(Bucket=self.bucket, Key=key)["Body"].read())\n\n return obj\n\n def handle_output(self, context, obj):\n key = self._get_path(context)\n context.log.debug(f"Writing S3 object at: {self._uri_for_key(key)}")\n\n if self._has_object(key):\n context.log.warning(f"Removing existing S3 key: {key}")\n self._rm_object(key)\n\n pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n self.s3.put_object(Bucket=self.bucket, Key=key, Body=pickled_obj)\n\n\n[docs]@io_manager(\n config_schema={\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Attach this resource definition to a :py:class:`~dagster.ModeDefinition`\n in order to make it available to your pipeline:\n\n .. code-block:: python\n\n pipeline_def = PipelineDefinition(\n mode_defs=[\n ModeDefinition(\n resource_defs={'io_manager': s3_pickle_io_manager, "s3": s3_resource, ...},\n ), ...\n ], ...\n )\n\n You may configure this storage as follows:\n\n .. 
code-block:: YAML\n\n resources:\n io_manager:\n config:\n s3_bucket: my-cool-bucket\n s3_prefix: good/prefix-for-files-\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager\n
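Putting the snippets above together, a hedged end-to-end sketch (bucket and prefix are placeholders, and the imports assume both definitions are exported from ``dagster_aws.s3``) of a pipeline whose intermediate outputs are pickled to S3:

.. code-block:: python

    from dagster import ModeDefinition, pipeline, solid
    from dagster_aws.s3 import s3_pickle_io_manager, s3_resource

    @solid
    def produce_number(_):
        return 42

    @solid
    def add_one(_, number):
        return number + 1

    @pipeline(
        mode_defs=[
            ModeDefinition(resource_defs={"io_manager": s3_pickle_io_manager, "s3": s3_resource})
        ]
    )
    def s3_backed_pipeline():
        add_one(produce_number())

    # Python equivalent of the YAML config above
    run_config = {
        "resources": {
            "io_manager": {"config": {"s3_bucket": "my-cool-bucket", "s3_prefix": "pipeline-io"}}
        }
    }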
\nfrom dagster import Field, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nS3_SESSION_CONFIG = {\n "use_unsigned_session": Field(\n bool,\n description="Specifies whether to use an unsigned S3 session",\n is_required=False,\n default_value=False,\n ),\n "region_name": Field(\n str, description="Specifies a custom region for the S3 session", is_required=False\n ),\n "endpoint_url": Field(\n StringSource,\n description="Specifies a custom endpoint for the S3 session",\n is_required=False,\n ),\n "max_attempts": Field(\n int,\n description="This provides Boto3's retry handler with a value of maximum retry attempts, "\n "where the initial call counts toward the max_attempts value that you provide",\n is_required=False,\n default_value=5,\n ),\n}\n\n\n[docs]@resource(S3_SESSION_CONFIG)\ndef s3_resource(context):\n """Resource that gives solids access to S3.\n\n The underlying S3 session is created by calling :py:func:`boto3.resource('s3') <boto3:boto3.resource>`.\n\n Attach this resource definition to a :py:class:`~dagster.ModeDefinition` in order to make it\n available to your solids.\n\n Example:\n\n .. code-block:: python\n\n from dagster import ModeDefinition, execute_solid, solid\n from dagster_aws.s3 import s3_resource\n\n @solid(required_resource_keys={'s3'})\n def example_s3_solid(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n result = execute_solid(\n example_s3_solid,\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n },\n mode_def=ModeDefinition(resource_defs={'s3': s3_resource}),\n )\n\n Note that your solids must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n """\n return construct_s3_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n endpoint_url=context.resource_config.get("endpoint_url"),\n use_unsigned_session=context.resource_config["use_unsigned_session"],\n )\n\n\n[docs]@resource(\n merge_dicts(\n S3_SESSION_CONFIG,\n {\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef s3_file_manager(context):\n """FileManager that provides abstract access to S3.\n \n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n endpoint_url=context.resource_config.get("endpoint_url"),\n use_unsigned_session=context.resource_config["use_unsigned_session"],\n ),\n s3_bucket=context.resource_config["s3_bucket"],\n s3_base_key=context.resource_config["s3_prefix"],\n )\n
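``s3_file_manager`` has no inline example above, so here is a minimal sketch of using it from a solid (bucket and prefix are placeholders; the import assumes the resource is exported from ``dagster_aws.s3``):

.. code-block:: python

    from dagster import ModeDefinition, execute_solid, solid
    from dagster_aws.s3 import s3_file_manager

    @solid(required_resource_keys={'file_manager'})
    def write_report(context):
        # Uploads the bytes under the configured bucket/prefix and returns the S3 URL
        handle = context.resources.file_manager.write_data(b'some bytes')
        return handle.s3_path

    result = execute_solid(
        write_report,
        run_config={
            'resources': {
                'file_manager': {'config': {'s3_bucket': 'my-bucket', 's3_prefix': 'reports'}}
            }
        },
        mode_def=ModeDefinition(resource_defs={'file_manager': s3_file_manager}),
    )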
\nfrom dagster import (\n AssetMaterialization,\n EventMetadataEntry,\n Field,\n FileHandle,\n InputDefinition,\n Output,\n OutputDefinition,\n StringSource,\n check,\n dagster_type_loader,\n solid,\n)\nfrom dagster.core.types.dagster_type import PythonObjectDagsterType\n\nfrom .file_manager import S3FileHandle\n\n\ndef dict_with_fields(name, fields):\n check.str_param(name, "name")\n check.dict_param(fields, "fields", key_type=str)\n field_names = set(fields.keys())\n\n @dagster_type_loader(fields)\n def _input_schema(_context, value):\n check.dict_param(value, "value")\n check.param_invariant(set(value.keys()) == field_names, "value")\n return value\n\n class _DictWithSchema(PythonObjectDagsterType):\n def __init__(self):\n super(_DictWithSchema, self).__init__(python_type=dict, name=name, loader=_input_schema)\n\n return _DictWithSchema()\n\n\nS3Coordinate = dict_with_fields(\n "S3Coordinate",\n fields={\n "bucket": Field(StringSource, description="S3 bucket name"),\n "key": Field(StringSource, description="S3 key name"),\n },\n)\n\n\ndef last_key(key):\n if "/" not in key:\n return key\n comps = key.split("/")\n return comps[-1]\n\n\n@solid(\n config_schema={\n "Bucket": Field(\n StringSource, description="The name of the bucket to upload to.", is_required=True\n ),\n "Key": Field(\n StringSource, description="The name of the key to upload to.", is_required=True\n ),\n },\n input_defs=[InputDefinition("file_handle", FileHandle, description="The file to upload.")],\n output_defs=[OutputDefinition(name="s3_file_handle", dagster_type=S3FileHandle)],\n description="""Take a file handle and upload it to s3. Returns an S3FileHandle.""",\n required_resource_keys={"s3", "file_manager"},\n)\ndef file_handle_to_s3(context, file_handle):\n bucket = context.solid_config["Bucket"]\n key = context.solid_config["Key"]\n\n with context.resources.file_manager.read(file_handle, "rb") as fileobj:\n context.resources.s3.upload_fileobj(fileobj, bucket, key)\n s3_file_handle = S3FileHandle(bucket, key)\n\n yield AssetMaterialization(\n asset_key=s3_file_handle.s3_path,\n metadata_entries=[EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))],\n )\n\n yield Output(value=s3_file_handle, output_name="s3_file_handle")\n
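A hedged sketch of wiring ``file_handle_to_s3`` into a pipeline (bucket, key, and prefix values are placeholders, and the imports assume these definitions are exported from ``dagster_aws.s3``); the upstream solid produces a ``FileHandle`` via the file manager resource:

.. code-block:: python

    from dagster import ModeDefinition, pipeline, solid
    from dagster_aws.s3 import file_handle_to_s3, s3_file_manager, s3_resource

    @solid(required_resource_keys={'file_manager'})
    def produce_file(context):
        return context.resources.file_manager.write_data(b'hello')

    @pipeline(
        mode_defs=[
            ModeDefinition(resource_defs={'s3': s3_resource, 'file_manager': s3_file_manager})
        ]
    )
    def upload_pipeline():
        file_handle_to_s3(produce_file())

    # Resource config for the file manager plus solid config for the upload target
    run_config = {
        'resources': {
            'file_manager': {'config': {'s3_bucket': 'my-bucket', 's3_prefix': 'staging'}}
        },
        'solids': {
            'file_handle_to_s3': {'config': {'Bucket': 'my-bucket', 'Key': 'final/report.txt'}}
        },
    }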
\nfrom dagster import Executor, Field, Noneable, Permissive, StringSource, check, executor\nfrom dagster.core.definitions.executor import check_cross_process_constraints\nfrom dagster.core.execution.retries import Retries, RetryMode, get_retries_config\nfrom dagster.grpc.types import ExecuteStepArgs\nfrom dagster.serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n "broker": Field(\n Noneable(StringSource),\n is_required=False,\n description=(\n "The URL of the Celery broker. Default: "\n "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n "'localhost')}//'."\n ),\n ),\n "backend": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="rpc://",\n description="The URL of the Celery results backend. Default: 'rpc://'.",\n ),\n "include": Field(\n [str], is_required=False, description="List of modules every worker should import"\n ),\n "config_source": Field(\n Noneable(Permissive()),\n is_required=False,\n description="Additional settings for the Celery app.",\n ),\n "retries": get_retries_config(),\n}\n\n\n[docs]@executor(name="celery", config_schema=CELERY_CONFIG)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute pipelines\n with variations on these settings.\n\n If you'd like to configure a celery executor in addition to the\n :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a\n :py:class:`~dagster.ModeDefinition` as follows:\n\n .. code-block:: python\n\n from dagster import ModeDefinition, default_executors, pipeline\n from dagster_celery import celery_executor\n\n @pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [celery_executor])])\n def celery_enabled_pipeline():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n celery:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. 
If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n check_cross_process_constraints(init_context)\n\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=Retries.from_config(init_context.executor_config["retries"]),\n )\n\n\ndef _submit_task(app, pipeline_context, step, queue, priority):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=pipeline_context.pipeline.get_python_origin(),\n pipeline_run_id=pipeline_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=pipeline_context.instance.get_ref(),\n retries_dict=pipeline_context.executor.retries.for_inner_plan().to_config(),\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=pipeline_context.pipeline.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority, queue=queue, routing_key="{queue}.execute_plan".format(queue=queue),\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self, retries, broker=None, backend=None, include=None, config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", Retries)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, pipeline_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n pipeline_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=Retries(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
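The YAML above can equivalently be supplied as a Python ``run_config`` dict. A hedged sketch (broker and backend URLs are placeholders, and actually executing requires a persistent DagsterInstance plus running Celery workers):

.. code-block:: python

    from dagster import execute_pipeline

    run_config = {
        'execution': {
            'celery': {
                'config': {
                    'broker': 'pyamqp://guest@localhost//',
                    'backend': 'rpc://',
                }
            }
        }
    }

    # with workers running and a configured instance (placeholder name):
    # execute_pipeline(celery_enabled_pipeline, run_config=run_config, instance=my_instance)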
\nimport json\nimport os\n\nimport docker.client\nfrom dagster import (\n DagsterInstance,\n EventMetadataEntry,\n Executor,\n Field,\n StringSource,\n check,\n executor,\n)\nfrom dagster.cli.api import ExecuteStepArgs\nfrom dagster.core.definitions.executor import check_cross_process_constraints\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.execution.retries import Retries\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.serdes import pack_value, serialize_dagster_namedtuple, unpack_value\nfrom dagster.utils import merge_dicts\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n additional_config = {\n "docker": Field(\n {\n "image": Field(\n StringSource,\n is_required=False,\n description="The docker image to be used for step execution.",\n ),\n "registry": Field(\n {\n "url": Field(StringSource),\n "username": Field(StringSource),\n "password": Field(StringSource),\n },\n is_required=False,\n description="Information for using a non local/public docker registry",\n ),\n "env_vars": Field(\n [str],\n is_required=False,\n description="The list of environment variables names to forward from the celery worker in to the docker container",\n ),\n "network": Field(\n str,\n is_required=False,\n description="Name of the network this container will be connected to at creation time",\n ),\n },\n is_required=True,\n description="The configuration for interacting with docker in the celery worker.",\n ),\n }\n\n cfg = merge_dicts(CELERY_CONFIG, additional_config)\n return cfg\n\n\n[docs]@executor(name=CELERY_DOCKER_CONFIG_KEY, config_schema=celery_docker_config())\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute pipelines\n with variations on these settings.\n\n If you'd like to configure a Celery Docker executor in addition to the\n :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a\n :py:class:`~dagster.ModeDefinition` as follows:\n\n .. 
code-block:: python\n\n from dagster import ModeDefinition, default_executors, pipeline\n from dagster_celery_docker.executor import celery_docker_executor\n\n @pipeline(mode_defs=[\n ModeDefinition(executor_defs=default_executors + [celery_docker_executor])\n ])\n def celery_enabled_pipeline():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n celery-docker:\n config:\n\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n check_cross_process_constraints(init_context)\n\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=Retries.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self, retries, docker_config, broker=None, backend=None, include=None, config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", Retries)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, pipeline_context, execution_plan):\n\n return core_celery_execution_loop(\n pipeline_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, pipeline_context, step, queue, priority):\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=pipeline_context.pipeline.get_python_origin(),\n pipeline_run_id=pipeline_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=pipeline_context.instance.get_ref(),\n retries_dict=pipeline_context.executor.retries.for_inner_plan().to_config(),\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n 
docker_config=pipeline_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_step_docker".format(queue=queue),\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self, execute_step_args_packed, docker_config,\n ):\n """Run step execution in a Docker container.\n """\n execute_step_args = unpack_value(\n check.dict_param(execute_step_args_packed, "execute_step_args_packed",)\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)\n check.inst(\n pipeline_run,\n PipelineRun,\n "Could not load run {}".format(execute_step_args.pipeline_run_id),\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n input_json = serialize_dagster_namedtuple(execute_step_args)\n\n command = "dagster api execute_step {}".format(json.dumps(input_json))\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else execute_step_args.pipeline_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the pipeline or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),\n pipeline_run,\n EngineEventData(\n [\n EventMetadataEntry.text(step_keys_str, "Step keys"),\n EventMetadataEntry.text(docker_image, "Image"),\n EventMetadataEntry.text(self.request.hostname, "Celery worker"),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_dagster_namedtuple(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=command,\n detach=False,\n auto_remove=True,\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n instance.report_engine_event(\n "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),\n pipeline_run,\n EngineEventData(\n [\n EventMetadataEntry.text(docker_image, "Job image"),\n EventMetadataEntry.text(err.stderr, "Docker stderr"),\n ],\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n serialized_events += [event for event in res.split("\\n") if event]\n\n return serialized_events\n\n return _execute_step_docker\n
\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster import (\n DagsterEvent,\n DagsterEventType,\n DagsterInstance,\n EventMetadataEntry,\n Executor,\n check,\n executor,\n)\nfrom dagster.cli.api import ExecuteStepArgs\nfrom dagster.core.definitions.executor import check_cross_process_constraints\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.events.log import DagsterEventRecord\nfrom dagster.core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster.core.execution.retries import Retries\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import pack_value, serialize_dagster_namedtuple, unpack_value\nfrom dagster.utils.error import serializable_error_info_from_exc_info\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n DagsterK8sAPIRetryLimitExceeded,\n DagsterK8sError,\n DagsterK8sPipelineStatusException,\n DagsterK8sTimeoutError,\n DagsterK8sUnrecoverableAPIError,\n)\nfrom dagster_k8s.job import (\n UserDefinedDagsterK8sConfig,\n get_k8s_job_name,\n get_user_defined_k8s_config,\n)\nfrom dagster_k8s.utils import (\n delete_job,\n filter_dagster_events_from_pod_logs,\n get_pod_names_in_job,\n retrieve_pod_logs,\n wait_for_job_success,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n[docs]@executor(name=CELERY_K8S_CONFIG_KEY, config_schema=celery_k8s_config())\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute pipelines\n with variations on these settings.\n\n If you'd like to configure a Celery Kubernetes Job executor in addition to the\n :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a\n :py:class:`~dagster.ModeDefinition` as follows:\n\n .. literalinclude:: ../dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. 
code-block:: YAML\n\n execution:\n celery-k8s:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n\n check_cross_process_constraints(init_context)\n\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = DagsterK8sJobConfig(\n dagster_home=run_launcher.dagster_home,\n instance_config_map=run_launcher.instance_config_map,\n postgres_password_secret=run_launcher.postgres_password_secret,\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"),\n image_pull_policy=exc_cfg.get("image_pull_policy"),\n image_pull_secrets=exc_cfg.get("image_pull_secrets"),\n service_account_name=exc_cfg.get("service_account_name"),\n env_config_maps=exc_cfg.get("env_config_maps"),\n env_secrets=exc_cfg.get("env_secrets"),\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or Retries.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace"),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n )\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n ):\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", Retries)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n 
self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.str_param(repo_location_name, "repo_location_name")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, pipeline_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n pipeline_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, pipeline_context, step, queue, priority):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=pipeline_context.pipeline.get_python_origin(),\n pipeline_run_id=pipeline_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=pipeline_context.instance.get_ref(),\n retries_dict=pipeline_context.executor.retries.for_inner_plan().to_config(),\n should_verify_step=True,\n )\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=pipeline_context.executor.job_config.to_dict(),\n job_namespace=pipeline_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=pipeline_context.executor.load_incluster_config,\n kubeconfig_file=pipeline_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_step_k8s_job".format(queue=queue),\n )\n\n\ndef construct_step_failure_event_and_handle(pipeline_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=pipeline_run.pipeline_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = DagsterEventRecord(\n message=str(err),\n user_message=str(err),\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod.\n """\n execute_step_args = unpack_value(\n check.dict_param(execute_step_args_packed, "execute_step_args_packed",)\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 
1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config, "user_defined_k8s_config", UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)\n\n check.inst(\n pipeline_run,\n PipelineRun,\n "Could not load run {}".format(execute_step_args.pipeline_run_id),\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n "Task for step {step_key} picked up by Celery".format(step_key=step_key),\n pipeline_run,\n EngineEventData(\n [\n EventMetadataEntry.text(celery_worker_name, "Celery worker name"),\n EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if pipeline_run.status != PipelineRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because pipeline run status is not STARTED",\n pipeline_run,\n EngineEventData([EventMetadataEntry.text(step_key, "Step key"),]),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)\n\n retries = Retries.from_config(execute_step_args.retries_dict)\n\n if retries.get_attempt_count(step_key):\n attempt_number = retries.get_attempt_count(step_key)\n job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-job-%s" % (k8s_name_key)\n pod_name = "dagster-job-%s" % (k8s_name_key)\n\n input_json = serialize_dagster_namedtuple(execute_step_args)\n args = ["dagster", "api", "execute_step", input_json]\n\n job = construct_dagster_k8s_job(\n job_config, args, job_name, user_defined_k8s_config, pod_name\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n "Executing step {} in Kubernetes job {}".format(step_key, job_name),\n pipeline_run,\n EngineEventData(\n [\n EventMetadataEntry.text(step_key, "Step key"),\n EventMetadataEntry.text(job_name, "Kubernetes Job name"),\n EventMetadataEntry.text(job_config.job_image, "Job image"),\n EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),\n EventMetadataEntry.text(\n str(job_config.image_pull_secrets), "Image pull secrets"\n ),\n EventMetadataEntry.text(\n str(job_config.service_account_name), "Service account name"\n ),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # 
execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n EventMetadataEntry.text(step_key, "Step key"),\n EventMetadataEntry.text(job_name, "Kubernetes Job name"),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [EventMetadataEntry.text(step_key, "Step key"),],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.pipeline_run_id,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n pipeline_run, step_key, err, instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sPipelineStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because pipeline run status is not STARTED",\n pipeline_run,\n EngineEventData(\n [\n EventMetadataEntry.text(step_key, "Step key"),\n EventMetadataEntry.text(job_name, "Kubernetes Job name"),\n EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. 
We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ) as err:\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [EventMetadataEntry.text(step_key, "Step key"),],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [EventMetadataEntry.text(step_key, "Step key"),],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n pipeline_run,\n EngineEventData([EventMetadataEntry.text("\\n".join(pod_names), "Pod names")]),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.rest.ApiException as e:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n pipeline_run,\n EngineEventData(\n [EventMetadataEntry.text(step_key, "Step key"),],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_pod_logs(logs)\n serialized_events = [serialize_dagster_namedtuple(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
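The Celery task above resolves per-step Kubernetes settings from the step's tags via ``get_user_defined_k8s_config(step.tags)``, and ``_submit_task_k8s_job`` routes each step to a Celery queue with a priority. A minimal sketch of supplying those values from a solid follows; the tag keys ``dagster-k8s/config`` and ``dagster-celery/queue`` are the conventional dagster-k8s/dagster-celery tag names (they are not defined in this module), and the resource requests are placeholders.

.. code-block:: python

    from dagster import solid

    @solid(
        tags={
            # Read by get_user_defined_k8s_config(step.tags) when the worker builds the step Job.
            "dagster-k8s/config": {
                "container_config": {
                    "resources": {"requests": {"cpu": "250m", "memory": "64Mi"}}
                }
            },
            # Routes this step's Celery task to a dedicated queue.
            "dagster-celery/queue": "highmem",
        }
    )
    def heavy_solid(_):
        return 1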
\nimport sys\nimport weakref\n\nimport kubernetes\nfrom dagster import DagsterInvariantViolationError, EventMetadataEntry, Field, Noneable, check\nfrom dagster.config.field import resolve_to_config_type\nfrom dagster.config.validate import process_config\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.execution.retries import Retries\nfrom dagster.core.host_representation import (\n ExternalPipeline,\n GrpcServerRepositoryLocationHandle,\n GrpcServerRepositoryLocationOrigin,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.launcher import RunLauncher\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\nfrom dagster_k8s.job import (\n DagsterK8sJobConfig,\n construct_dagster_k8s_job,\n get_job_name_from_run_id,\n get_user_defined_k8s_config,\n)\nfrom dagster_k8s.utils import delete_job\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_config\n\n\n[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches pipeline runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run coordinator Kubernetes Job, which traverses the pipeline run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You may configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n As always when using a :py:class:`~dagster.serdes.ConfigurableClass`, the values\n under the ``config`` key of this YAML block will be passed to the constructor. The full list\n of acceptable values is given below by the constructor args.\n\n Args:\n instance_config_map (str): The ``name`` of an existing Volume to mount into the pod in\n order to provide a ConfigMap for the Dagster instance. This Volume should contain a\n ``dagster.yaml`` with appropriate values for run storage, event log storage, etc.\n dagster_home (str): The location of DAGSTER_HOME in the Job container; this is where the\n ``dagster.yaml`` file will be mounted from the instance ConfigMap specified above.\n postgres_password_secret (str): The name of the Kubernetes Secret where the postgres\n password can be retrieved. Will be mounted and supplied as an environment variable to\n the Job Pod.\n load_incluster_config (Optional[bool]): Set this value if you are running the launcher\n within a k8s cluster. If ``True``, we assume the launcher is running within the target\n cluster and load config using ``kubernetes.config.load_incluster_config``. 
Otherwise,\n we will use the k8s config specified in ``kubeconfig_file`` (using\n ``kubernetes.config.load_kube_config``) or fall back to the default kubeconfig. Default:\n ``True``.\n kubeconfig_file (Optional[str]): The kubeconfig file from which to load config. Defaults to\n None (using the default kubeconfig).\n broker (Optional[str]): The URL of the Celery broker.\n backend (Optional[str]): The URL of the Celery backend.\n include (List[str]): List of includes for the Celery workers\n config_source: (Optional[dict]): Additional settings for the Celery app.\n retries: (Optional[dict]): Default retry configuration for Celery tasks.\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data=None,\n k8s_client_batch_api=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._batch_api = k8s_client_batch_api or kubernetes.client.BatchV1Api()\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = Retries.from_config(retries)\n self._instance_ref = None\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n from dagster_celery.executor import CELERY_CONFIG\n\n job_cfg = DagsterK8sJobConfig.config_type_run_launcher()\n\n run_launcher_extra_cfg = {\n "load_incluster_config": Field(bool, is_required=False, default_value=True),\n "kubeconfig_file": Field(Noneable(str), is_required=False, default_value=None),\n }\n\n res = merge_dicts(job_cfg, run_launcher_extra_cfg)\n return merge_dicts(res, CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def _instance(self):\n return self._instance_ref() if self._instance_ref else None\n\n def initialize(self, instance):\n check.inst_param(instance, "instance", DagsterInstance)\n # Store a weakref to avoid a circular reference / enable GC\n self._instance_ref = weakref.ref(instance)\n\n def launch_run(self, instance, run, external_pipeline):\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(run, "run", PipelineRun)\n check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n\n job_image = None\n 
pipeline_origin = None\n env_vars = None\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n # If the user is using user-code deployments, we grab the image from the gRPC server.\n if isinstance(\n external_pipeline.get_external_origin().external_repository_origin.repository_location_origin,\n GrpcServerRepositoryLocationOrigin,\n ):\n\n repository_location_handle = (\n external_pipeline.repository_handle.repository_location_handle\n )\n\n if not isinstance(repository_location_handle, GrpcServerRepositoryLocationHandle):\n raise DagsterInvariantViolationError(\n "Expected RepositoryLocationHandle to be of type "\n "GrpcServerRepositoryLocationHandle but found type {}".format(\n type(repository_location_handle)\n )\n )\n\n repository_name = external_pipeline.repository_handle.repository_name\n repository_origin = repository_location_handle.reload_repository_python_origin(\n repository_name\n )\n pipeline_origin = PipelinePythonOrigin(\n pipeline_name=external_pipeline.name, repository_origin=repository_origin\n )\n\n job_image = repository_origin.container_image\n env_vars = {"DAGSTER_CURRENT_IMAGE": job_image}\n\n if job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have specified a job_image {job_image_from_executor_config} in your executor configuration, "\n "but also {job_image} in your user-code deployment. You cannot specify a job_image "\n "in your executor config when using user-code deployments because the job image is "\n "pulled from the deployment. To resolve this error, remove the job_image "\n "configuration from your executor configuration (which is a part of your run configuration)"\n )\n\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. "\n "To resolve this error, specify the job_image configuration in the executor "\n "config section in your run config. \\n"\n "Note: You may also be seeing this error because you are using the configured API. 
"\n "Using configured with the celery-k8s executor is not supported at this time, "\n "and the job_image must be configured at the top-level executor config without "\n "using configured."\n )\n\n job_image = job_image_from_executor_config\n pipeline_origin = external_pipeline.get_python_origin()\n\n job_config = DagsterK8sJobConfig(\n dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy"),\n image_pull_secrets=exc_config.get("image_pull_secrets"),\n service_account_name=exc_config.get("service_account_name"),\n env_config_maps=exc_config.get("env_config_maps"),\n env_secrets=exc_config.get("env_secrets"),\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))\n\n from dagster.cli.api import ExecuteRunArgs\n\n input_json = serialize_dagster_namedtuple(\n # depends on DagsterInstance.get() returning the same instance\n # https://github.com/dagster-io/dagster/issues/2757\n ExecuteRunArgs(\n pipeline_origin=pipeline_origin, pipeline_run_id=run.run_id, instance_ref=None,\n )\n )\n\n job = construct_dagster_k8s_job(\n job_config,\n args=["dagster", "api", "execute_run", input_json],\n job_name=job_name,\n pod_name=pod_name,\n component="run_coordinator",\n user_defined_k8s_config=user_defined_k8s_config,\n env_vars=env_vars,\n )\n\n job_namespace = exc_config.get("job_namespace")\n\n self._batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run_coordinator job launched",\n run,\n EngineEventData(\n [\n EventMetadataEntry.text(job_name, "Kubernetes Job name"),\n EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),\n EventMetadataEntry.text(run.run_id, "Run ID"),\n ]\n ),\n cls=self.__class__,\n )\n return run\n\n # https://github.com/dagster-io/dagster/issues/2741\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n if not pipeline_run:\n return False\n\n if pipeline_run.status != PipelineRunStatus.STARTED:\n return False\n\n return True\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n can_terminate = self.can_terminate(run_id)\n if not can_terminate:\n self._instance.report_engine_event(\n message="Unable to terminate pipeline: can_terminate returned {}.".format(\n can_terminate\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n self._instance.report_run_canceling(run)\n\n try:\n termination_result = delete_job(job_name=job_name, namespace=job_namespace)\n if termination_result:\n self._instance.report_engine_event(\n message="Pipeline was terminated successfully.",\n pipeline_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Pipeline was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception: # pylint: disable=broad-except\n self._instance.report_engine_event(\n message="Pipeline was not terminated successfully; encountered error in delete_job",\n pipeline_run=run,\n engine_event_data=EngineEventData.engine_error(\n 
serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n run_config = pipeline_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace")\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n if not CELERY_K8S_CONFIG_KEY in executor_config:\n raise DagsterInvariantViolationError(\n "{config_key} execution configuration must be present in the run config to use the CeleryK8sRunLauncher. "\n "Note: You may also be seeing this error because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured.".format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n execution_config_schema = resolve_to_config_type(celery_k8s_config())\n execution_run_config = run_config["execution"][CELERY_K8S_CONFIG_KEY].get("config", {})\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success, "Incorrect {} execution schema provided".format(CELERY_K8S_CONFIG_KEY)\n )\n\n return res.value\n
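``_get_validated_celery_k8s_executor_config`` requires the celery-k8s executor block to be present in the run config, and ``launch_run`` reads ``job_image``, ``job_namespace``, and the image-pull/service-account settings from it. A hedged sketch of that run-config shape, with placeholder values (the executor key shown as ``celery-k8s`` is assumed to match ``CELERY_K8S_CONFIG_KEY``):

.. code-block:: python

    # Option names mirror the exc_config.get(...) lookups in launch_run.
    run_config = {
        "execution": {
            "celery-k8s": {
                "config": {
                    "job_image": "my-repo/my-pipeline-image:latest",
                    "job_namespace": "dagster",
                    "image_pull_policy": "Always",
                    "service_account_name": "dagster",
                    "env_config_maps": ["dagster-pipeline-env"],
                }
            }
        }
    }

Note that when user-code deployments are in use, ``job_image`` must be omitted: ``launch_run`` raises ``DagsterInvariantViolationError`` if the executor config supplies an image and the deployment does too.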
\nimport io\nimport os\nimport shutil\nimport stat\nimport sys\n\nfrom crontab import CronTab\nfrom dagster import DagsterInstance, check, utils\nfrom dagster.core.host_representation import ExternalSchedule\nfrom dagster.core.scheduler import DagsterSchedulerError, Scheduler\nfrom dagster.serdes import ConfigurableClass\n\n\n[docs]class SystemCronScheduler(Scheduler, ConfigurableClass):\n """Scheduler implementation that uses the local systems cron. Only works on unix systems that\n have cron.\n\n Enable this scheduler by adding it to your ``dagster.yaml`` in ``$DAGSTER_HOME``.\n """\n\n def __init__(\n self, inst_data=None,\n ):\n self._inst_data = inst_data\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SystemCronScheduler(inst_data=inst_data)\n\n def get_cron_tab(self):\n return CronTab(user=True)\n\n def debug_info(self):\n return "Running Cron Jobs:\\n{jobs}\\n".format(\n jobs="\\n".join(\n [str(job) for job in self.get_cron_tab() if "dagster-schedule:" in job.comment]\n )\n )\n\n def start_schedule(self, instance, external_schedule):\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n schedule_origin_id = external_schedule.get_external_origin_id()\n\n # If the cron job already exists, remove it. This prevents duplicate entries.\n # Then, add a new cron job to the cron tab.\n if self.running_schedule_count(instance, external_schedule.get_external_origin_id()) > 0:\n self._end_cron_job(instance, schedule_origin_id)\n\n self._start_cron_job(instance, external_schedule)\n\n # Verify that the cron job is running\n running_schedule_count = self.running_schedule_count(instance, schedule_origin_id)\n if running_schedule_count == 0:\n raise DagsterSchedulerError(\n "Attempted to write cron job for schedule "\n "{schedule_name}, but failed. "\n "The scheduler is not running {schedule_name}.".format(\n schedule_name=external_schedule.name\n )\n )\n elif running_schedule_count > 1:\n raise DagsterSchedulerError(\n "Attempted to write cron job for schedule "\n "{schedule_name}, but duplicate cron jobs were found. "\n "There are {running_schedule_count} jobs running for the schedule."\n "To resolve, run `dagster schedule up`, or edit the cron tab to "\n "remove duplicate schedules".format(\n schedule_name=external_schedule.name,\n running_schedule_count=running_schedule_count,\n )\n )\n\n def stop_schedule(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n schedule = self._get_schedule_state(instance, schedule_origin_id)\n\n self._end_cron_job(instance, schedule_origin_id)\n\n # Verify that the cron job has been removed\n running_schedule_count = self.running_schedule_count(instance, schedule_origin_id)\n if running_schedule_count > 0:\n raise DagsterSchedulerError(\n "Attempted to remove existing cron job for schedule "\n "{schedule_name}, but failed. 
"\n "There are still {running_schedule_count} jobs running for the schedule.".format(\n schedule_name=schedule.name, running_schedule_count=running_schedule_count\n )\n )\n\n def wipe(self, instance):\n # Note: This method deletes schedules from ALL repositories\n check.inst_param(instance, "instance", DagsterInstance)\n\n # Delete all script files\n script_directory = os.path.join(instance.schedules_directory(), "scripts")\n if os.path.isdir(script_directory):\n shutil.rmtree(script_directory)\n\n # Delete all logs\n logs_directory = os.path.join(instance.schedules_directory(), "logs")\n if os.path.isdir(logs_directory):\n shutil.rmtree(logs_directory)\n\n # Remove all cron jobs\n with self.get_cron_tab() as cron_tab:\n for job in cron_tab:\n if "dagster-schedule:" in job.comment:\n cron_tab.remove_all(comment=job.comment)\n\n def _get_bash_script_file_path(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n script_directory = os.path.join(instance.schedules_directory(), "scripts")\n utils.mkdir_p(script_directory)\n\n script_file_name = "{}.sh".format(schedule_origin_id)\n return os.path.join(script_directory, script_file_name)\n\n def _cron_tag_for_schedule(self, schedule_origin_id):\n return "dagster-schedule: {schedule_origin_id}".format(\n schedule_origin_id=schedule_origin_id\n )\n\n def _get_command(self, script_file, instance, schedule_origin_id):\n schedule_log_file_path = self.get_logs_path(instance, schedule_origin_id)\n command = "{script_file} > {schedule_log_file_path} 2>&1".format(\n script_file=script_file, schedule_log_file_path=schedule_log_file_path\n )\n\n return command\n\n def _start_cron_job(self, instance, external_schedule):\n schedule_origin_id = external_schedule.get_external_origin_id()\n script_file = self._write_bash_script_to_file(instance, external_schedule)\n command = self._get_command(script_file, instance, schedule_origin_id)\n\n with self.get_cron_tab() as cron_tab:\n job = cron_tab.new(\n command=command,\n comment="dagster-schedule: {schedule_origin_id}".format(\n schedule_origin_id=schedule_origin_id\n ),\n )\n job.setall(external_schedule.cron_schedule)\n\n def _end_cron_job(self, instance, schedule_origin_id):\n with self.get_cron_tab() as cron_tab:\n cron_tab.remove_all(comment=self._cron_tag_for_schedule(schedule_origin_id))\n\n script_file = self._get_bash_script_file_path(instance, schedule_origin_id)\n if os.path.isfile(script_file):\n os.remove(script_file)\n\n def running_schedule_count(self, instance, schedule_origin_id):\n matching_jobs = self.get_cron_tab().find_comment(\n self._cron_tag_for_schedule(schedule_origin_id)\n )\n\n return len(list(matching_jobs))\n\n def _get_or_create_logs_directory(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n utils.mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")\n\n def _write_bash_script_to_file(self, instance, external_schedule):\n # Get path 
to store bash script\n schedule_origin_id = external_schedule.get_external_origin_id()\n script_file = self._get_bash_script_file_path(instance, schedule_origin_id)\n\n # Get path to store schedule attempt logs\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n schedule_log_file_name = "{}_{}.result".format("${RUN_DATE}", schedule_origin_id)\n schedule_log_file_path = os.path.join(logs_directory, schedule_log_file_name)\n\n local_target = external_schedule.get_external_origin()\n\n # Environment information needed for execution\n dagster_home = os.getenv("DAGSTER_HOME")\n\n script_contents = """\n #!/bin/bash\n export DAGSTER_HOME={dagster_home}\n export LANG=en_US.UTF-8\n {env_vars}\n\n export RUN_DATE=$(date "+%Y%m%dT%H%M%S")\n\n {python_exe} -m dagster api launch_scheduled_execution --schedule_name {schedule_name} {repo_cli_args} "{result_file}"\n """.format(\n python_exe=sys.executable,\n schedule_name=external_schedule.name,\n repo_cli_args=local_target.get_repo_cli_args(),\n result_file=schedule_log_file_path,\n dagster_home=dagster_home,\n env_vars="\\n".join(\n [\n "export {key}={value}".format(key=key, value=value)\n for key, value in external_schedule.environment_vars.items()\n ]\n ),\n )\n\n with io.open(script_file, "w", encoding="utf-8") as f:\n f.write(script_contents)\n\n st = os.stat(script_file)\n os.chmod(script_file, st.st_mode | stat.S_IEXEC)\n\n return script_file\n
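``SystemCronScheduler`` tags every cron entry it writes with a comment of the form ``dagster-schedule: <schedule_origin_id>`` and locates its entries by matching on that comment. A small illustrative sketch (not part of the scheduler) that lists those entries with python-crontab, mirroring ``debug_info()``:

.. code-block:: python

    from crontab import CronTab

    def list_dagster_cron_jobs():
        # Every entry owned by SystemCronScheduler carries a
        # "dagster-schedule: <schedule_origin_id>" comment.
        return [str(job) for job in CronTab(user=True) if "dagster-schedule:" in job.comment]

    if __name__ == "__main__":
        for entry in list_dagster_cron_jobs():
            print(entry)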
\nimport dask\nimport dask.distributed\nfrom dagster import Executor, Field, Permissive, Selector, StringSource, check, seven\nfrom dagster.core.definitions.executor import check_cross_process_constraints, executor\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.execution.api import create_execution_plan, execute_plan\nfrom dagster.core.execution.context.system import SystemPipelineExecutionContext\nfrom dagster.core.execution.plan.plan import ExecutionPlan\nfrom dagster.core.execution.retries import Retries\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.utils import frozentags, iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n[docs]@executor(\n name="dask",\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource}, description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n If you'd like to configure a dask executor in addition to the\n :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a\n :py:class:`~dagster.ModeDefinition` as follows:\n\n .. 
code-block:: python\n\n from dagster import ModeDefinition, default_executors, pipeline\n from dagster_dask import dask_executor\n\n @pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [dask_executor])])\n def dask_enabled_pipeline():\n pass\n\n """\n check_cross_process_constraints(init_context)\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)\n\n\ndef query_on_dask_worker(\n dependencies, recon_pipeline, pipeline_run, run_config, step_keys, mode, instance_ref,\n): # pylint: disable=unused-argument\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n\n with DagsterInstance.from_ref(instance_ref) as instance:\n execution_plan = create_execution_plan(\n recon_pipeline.subset_for_execution_from_existing_pipeline(\n pipeline_run.solids_to_execute\n ),\n run_config=run_config,\n step_keys_to_execute=step_keys,\n mode=mode,\n )\n\n return execute_plan(execution_plan, instance, pipeline_run, run_config=run_config)\n\n\ndef get_dask_resource_requirements(tags):\n check.inst_param(tags, "tags", frozentags)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return Retries.disabled_mode()\n\n def execute(self, pipeline_context, execution_plan):\n check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(pipeline_context.executor, DaskExecutor),\n "pipeline_context",\n "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),\n )\n\n check.invariant(\n pipeline_context.instance.is_persistent,\n "Dask execution requires a persistent DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n pipeline_name = pipeline_context.pipeline_name\n\n instance = pipeline_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(pipeline_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(pipeline_name))\n elif 
cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(pipeline_name))\n else:\n raise ValueError(\n f"Must be providing one of the following ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n run_config = dict(pipeline_context.run_config, execution={"in_process": {}})\n recon_repo = pipeline_context.pipeline.get_reconstructable_repository()\n\n dask_task_name = "%s.%s" % (pipeline_name, step.key)\n\n recon_pipeline = recon_repo.get_reconstructable_pipeline(pipeline_name)\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_pipeline,\n pipeline_context.pipeline_run,\n run_config,\n [step.key],\n pipeline_context.mode_def.name,\n instance.get_ref(),\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, pipeline_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": pipeline_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
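Selecting the dask executor at runtime means supplying the ``cluster`` selector defined above under the executor's config; ``DaskExecutor.execute`` also requires a persistent ``DagsterInstance``. A hedged sketch of a local-cluster run config, with placeholder values:

.. code-block:: python

    run_config = {
        "execution": {
            "dask": {
                "config": {
                    "cluster": {
                        # Passed through to dask.distributed.LocalCluster; build_dict() also
                        # forces threads_per_worker=1 because Dagster is not thread-safe.
                        "local": {"n_workers": 4, "timeout": 30}
                    }
                }
            }
        }
    }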
\nimport base64\nimport time\n\nimport dagster\nimport requests.exceptions\nfrom dagster import check\nfrom databricks_api import DatabricksAPI\n\nfrom .types import (\n DATABRICKS_RUN_TERMINATED_STATES,\n DatabricksRunLifeCycleState,\n DatabricksRunResultState,\n)\n\n# wait at most 24 hours by default for run execution\n_DEFAULT_RUN_MAX_WAIT_TIME_SEC = 24 * 60 * 60\n\n\n\n\n\nclass DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(self, host, token, workspace_id=None):\n self.host = host\n self.workspace_id = workspace_id\n self.client = DatabricksAPI(host=host, token=token)\n\n def submit_run(self, *args, **kwargs):\n """Submit a run directly to the 'Runs Submit' API."""\n return self.client.jobs.submit_run(*args, **kwargs)["run_id"] # pylint: disable=no-member\n\n def read_file(self, dbfs_path, block_size=1024 ** 2):\n """Read a file from DBFS to a **byte string**."""\n\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n data = b""\n bytes_read = 0\n jdoc = self.client.dbfs.read(path=dbfs_path, length=block_size) # pylint: disable=no-member\n data += base64.b64decode(jdoc["data"])\n while jdoc["bytes_read"] == block_size:\n bytes_read += jdoc["bytes_read"]\n jdoc = self.client.dbfs.read( # pylint: disable=no-member\n path=dbfs_path, offset=bytes_read, length=block_size\n )\n data += base64.b64decode(jdoc["data"])\n return data\n\n def put_file(self, file_obj, dbfs_path, overwrite=False, block_size=1024 ** 2):\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n create_response = self.client.dbfs.create( # pylint: disable=no-member\n path=dbfs_path, overwrite=overwrite\n )\n handle = create_response["handle"]\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n self.client.dbfs.add_block(data=data, handle=handle) # pylint: disable=no-member\n block = file_obj.read(block_size)\n\n self.client.dbfs.close(handle=handle) # pylint: disable=no-member\n\n def get_run_state(self, databricks_run_id):\n """Get the state of a run by Databricks run ID (_not_ dagster run ID).\n\n Return a `DatabricksRunState` object. 
Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.client.jobs.get_run(databricks_run_id) # pylint: disable=no-member\n state = run["state"]\n result_state = state.get("result_state")\n if result_state:\n result_state = DatabricksRunResultState(result_state)\n return DatabricksRunState(\n life_cycle_state=DatabricksRunLifeCycleState(state["life_cycle_state"]),\n result_state=result_state,\n state_message=state["state_message"],\n )\n\n\nclass DatabricksRunState:\n """Represents the state of a Databricks job run."""\n\n def __init__(self, life_cycle_state, result_state, state_message):\n self.life_cycle_state = life_cycle_state\n self.result_state = result_state\n self.state_message = state_message\n\n def has_terminated(self):\n """Has the job terminated?"""\n return self.life_cycle_state in DATABRICKS_RUN_TERMINATED_STATES\n\n def is_successful(self):\n """Was the job successful?"""\n return self.result_state == DatabricksRunResultState.Success\n\n def __repr__(self):\n return str(self.__dict__)\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress."""\n\n def __init__(\n self, host, token, poll_interval_sec=10, max_wait_time_sec=_DEFAULT_RUN_MAX_WAIT_TIME_SEC\n ):\n """Args:\n host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net\n token (str): Databricks token\n """\n self.host = check.str_param(host, "host")\n self.token = check.str_param(token, "token")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client = DatabricksClient(host=self.host, token=self.token)\n\n @property\n def client(self):\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config, task):\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", [])\n tags.append({"key": "__dagster_version", "value": dagster.__version__})\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n python_libraries = {x["pypi"]["package"].split("==")[0] for x in libraries if "pypi" in x}\n for library in ["dagster", "dagster_databricks", 
"dagster_pyspark"]:\n if library not in python_libraries:\n libraries.append(\n {"pypi": {"package": "{}=={}".format(library, dagster.__version__)}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple tasks specified in Databricks run",\n )\n\n config = dict(\n run_name=run_config.get("run_name"),\n new_cluster=new_cluster,\n existing_cluster_id=existing_cluster_id,\n libraries=libraries,\n **task,\n )\n return self.client.submit_run(**config)\n\n def retrieve_logs_for_run_id(self, log, databricks_run_id):\n """Retrieve the stdout and stderr logs for a run."""\n api_client = self.client.client\n run = api_client.jobs.get_run(databricks_run_id) # pylint: disable=no-member\n cluster = api_client.cluster.get_cluster( # pylint: disable=no-member\n run["cluster_instance"]["cluster_id"]\n )\n log_config = cluster.get("cluster_log_conf")\n if log_config is None:\n log.warn(\n "Logs not configured for cluster {cluster} used for run {run}".format(\n cluster=cluster["cluster_id"], run=databricks_run_id\n )\n )\n return None\n if "s3" in log_config:\n logs_prefix = log_config["s3"]["destination"]\n log.warn("Retrieving S3 logs not yet implemented")\n return None\n elif "dbfs" in log_config:\n logs_prefix = log_config["dbfs"]["destination"]\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster["cluster_id"], "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster["cluster_id"], "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self, log, prefix, cluster_id, filename, waiter_delay=10, waiter_max_attempts=10\n ):\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info("Retrieving logs from {}".format(path))\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n\n def wait_for_run_to_complete(self, log, databricks_run_id):\n return wait_for_run_to_complete(\n self.client, log, databricks_run_id, self.poll_interval_sec, self.max_wait_time_sec\n )\n\n\ndef wait_for_run_to_complete(client, log, databricks_run_id, poll_interval_sec, max_wait_time_sec):\n """Wait for a Databricks run to complete."""\n check.int_param(databricks_run_id, "databricks_run_id")\n log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n start = time.time()\n while True:\n log.debug("Waiting %.1f seconds..." % poll_interval_sec)\n time.sleep(poll_interval_sec)\n run_state = client.get_run_state(databricks_run_id)\n if run_state.has_terminated():\n if run_state.is_successful():\n log.info("Run %s completed successfully" % databricks_run_id)\n return\n else:\n error_message = "Run %s failed with result state: %s. Message: %s" % (\n databricks_run_id,\n run_state.result_state,\n run_state.state_message,\n )\n log.error(error_message)\n raise DatabricksError(error_message)\n else:\n log.info("Run %s in state %s" % (databricks_run_id, run_state))\n if time.time() - start > max_wait_time_sec:\n raise DatabricksError(\n "Job run {} took more than {}s to complete; failing".format(\n databricks_run_id, max_wait_time_sec\n )\n )\n
\nimport io\nimport os.path\nimport pickle\nimport tempfile\n\nfrom dagster import Bool, Field, StringSource, check, resource\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.events import log_step_event\nfrom dagster.core.execution.plan.external_step import (\n PICKLED_EVENTS_FILE_NAME,\n PICKLED_STEP_RUN_REF_FILE_NAME,\n step_context_to_step_run_ref,\n)\nfrom dagster.serdes import deserialize_value\nfrom dagster_databricks import DatabricksJobRunner, databricks_step_main\nfrom dagster_pyspark.utils import build_pyspark_zip\n\nfrom .configs import (\n define_databricks_secrets_config,\n define_databricks_storage_config,\n define_databricks_submit_run_config,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\n\n\n[docs]@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.com",\n ),\n "databricks_token": Field(\n StringSource, is_required=True, description="Databricks access token",\n ),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=True,\n description="Absolute path to the package that contains the pipeline definition(s) "\n "whose steps will execute remotely on Databricks. This is a path on the local "\n "fileystem of the process executing the pipeline. Before every step run, the "\n "launcher will zip up the code in this path, upload it to DBFS, and unzip it "\n "into the Python path of the remote Spark process. This gives the remote process "\n "access to up-to-date user code.",\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description="If set, and if the specified cluster is configured to export logs, "\n "the system will wait after job completion for the logs to appear in the configured "\n "location. Note that logs are copied every 5 minutes, so enabling this will add "\n "several minutes to the job runtime.",\n ),\n }\n)\ndef databricks_pyspark_step_launcher(context):\n """Resource for running solids as a Databricks Job.\n\n When this resource is used, the solid will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the solid's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. 
_`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config,\n databricks_host,\n databricks_token,\n secrets_to_env_variables,\n storage,\n local_pipeline_package_path,\n staging_prefix,\n wait_for_logs,\n ):\n self.run_config = check.dict_param(run_config, "run_config")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n self.databricks_token = check.str_param(databricks_token, "databricks_token")\n self.secrets = check.list_param(secrets_to_env_variables, "secrets_to_env_variables", dict)\n self.storage = check.dict_param(storage, "storage")\n self.local_pipeline_package_path = check.str_param(\n local_pipeline_package_path, "local_pipeline_package_path"\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(host=databricks_host, token=databricks_token)\n\n def launch_step(self, step_context, prior_attempts_count):\n step_run_ref = step_context_to_step_run_ref(\n step_context, prior_attempts_count, self.local_pipeline_package_path\n )\n run_id = step_context.pipeline_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n self.databricks_runner.wait_for_run_to_complete(log, databricks_run_id)\n finally:\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n for event in self.get_step_events(run_id, step_key):\n log_step_event(step_context, event)\n yield event\n\n def get_step_events(self, run_id, step_key):\n path = self._dbfs_path(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n events_data = self.databricks_runner.client.read_file(path)\n return deserialize_value(pickle.loads(events_data))\n\n def _get_databricks_task(self, run_id, step_key):\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(self, log, step_run_ref, run_id, step_key):\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n\n 
log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name())\n )\n\n log.info("Uploading pipeline to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing pipeline\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_pipeline_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME)\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file, self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n )\n\n databricks_config = DatabricksConfig(storage=self.storage, secrets=self.secrets,)\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file, self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n )\n\n def _log_logs_from_cluster(self, log, run_id):\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return databricks_step_main.__file__\n\n def _dbfs_path(self, run_id, step_key, filename):\n path = "/".join([self.staging_prefix, run_id, step_key, os.path.basename(filename)])\n return "dbfs://{}".format(path)\n\n def _internal_dbfs_path(self, run_id, step_key, filename):\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join([self.staging_prefix, run_id, step_key, os.path.basename(filename)])\n return "/dbfs/{}".format(path)\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to solid execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(self, storage, secrets):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils, sc):\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils, sc):\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n else:\n raise Exception("No valid storage found in Databricks configuration!")\n\n def setup_s3_storage(self, s3_storage, dbutils, sc):\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.s3n.awsAccessKeyId", access_key\n )\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.s3n.awsSecretAccessKey", secret_key\n )\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage, dbutils, sc):\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils):\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print( # pylint: disable=print-call\n "Exporting {} from Databricks secret {}, scope {}".format(name, key, scope)\n )\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
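A hedged sketch of attaching ``databricks_pyspark_step_launcher`` to a pipeline mode; the solid opts in by requiring the resource key under which the launcher is registered (the key name here is arbitrary, and the top-level import path is assumed). The launcher's own config, ``run_config``, ``databricks_host``, ``databricks_token``, ``local_pipeline_package_path``, and so on, is then supplied through the resource config at execution time.

.. code-block:: python

    from dagster import ModeDefinition, pipeline, solid
    from dagster_databricks import databricks_pyspark_step_launcher

    @solid(required_resource_keys={"pyspark_step_launcher"})
    def remote_solid(_):
        return 1

    @pipeline(
        mode_defs=[
            ModeDefinition(
                name="databricks",
                resource_defs={"pyspark_step_launcher": databricks_pyspark_step_launcher},
            )
        ]
    )
    def databricks_pipeline():
        remote_solid()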
\nfrom dagster import Field, InputDefinition, Nothing, OutputDefinition, Permissive, check, solid\n\nfrom .databricks import wait_for_run_to_complete\n\n_START = "start"\n\n_DEFAULT_POLL_INTERVAL = 10\n# wait at most 24 hours by default for run execution\n_DEFAULT_RUN_MAX_WAIT_TIME_SEC = 24 * 60 * 60\n\n\n[docs]def create_databricks_job_solid(\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n """\n Creates a solid that launches a databricks job.\n\n As config, the solid accepts a blob of the form described in Databricks' job API:\n https://docs.databricks.com/dev-tools/api/latest/jobs.html.\n\n Returns:\n SolidDefinition: A solid definition.\n """\n check.str_param(name, "name")\n check.opt_str_param(description, "description")\n check.int_param(num_inputs, "num_inputs")\n check.set_param(required_resource_keys, "required_resource_keys", of_type=str)\n\n input_defs = [InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)]\n\n @solid(\n name=name,\n description=description,\n config_schema={\n "job": Field(\n Permissive(),\n description="Databricks job run configuration, in the form described in "\n "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html",\n ),\n "poll_interval_sec": Field(\n float,\n description="Check whether the job is done at this interval.",\n default_value=_DEFAULT_POLL_INTERVAL,\n ),\n "max_wait_time_sec": Field(\n float,\n description="If the job is not complete after this length of time, raise an error.",\n default_value=_DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ),\n },\n input_defs=input_defs,\n output_defs=[OutputDefinition(Nothing)],\n required_resource_keys=required_resource_keys,\n tags={"kind": "databricks"},\n )\n def databricks_solid(context):\n job_config = context.solid_config["job"]\n databricks_client = context.resources.databricks_client\n run_id = databricks_client.submit_run(**job_config)\n\n context.log.info(\n "Launched databricks job with run id {run_id}. UI: {url}. Waiting to run to completion...".format(\n run_id=run_id, url=create_ui_url(databricks_client, context.solid_config)\n )\n )\n wait_for_run_to_complete(\n databricks_client,\n context.log,\n run_id,\n context.solid_config["poll_interval_sec"],\n context.solid_config["max_wait_time_sec"],\n )\n\n return databricks_solid\n\n\ndef create_ui_url(databricks_client, solid_config):\n host = databricks_client.host\n workspace_id = databricks_client.workspace_id or "<workspace_id>"\n if "existing_cluster_id" in solid_config["job"]:\n return "https://{host}/?o={workspace_id}#/setting/clusters/{cluster_id}/sparkUi".format(\n host=host,\n workspace_id=workspace_id,\n cluster_id=solid_config["job"]["existing_cluster_id"],\n )\n else:\n return "https://{host}/?o={workspace_id}#joblist".format(\n host=host, workspace_id=workspace_id\n )\n
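A hedged wiring sketch for ``create_databricks_job_solid``. It assumes the ``databricks_client`` resource from this library accepts ``host`` and ``token`` config, and the ``job`` blob is only an illustrative fragment of the Databricks Jobs API payload; the workspace host and model of cluster are placeholders:

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline
    from dagster_databricks import create_databricks_job_solid, databricks_client

    # Assumed resource config keys; adjust to your workspace.
    my_databricks_client = databricks_client.configured(
        {"host": "<workspace>.cloud.databricks.com", "token": {"env": "DATABRICKS_TOKEN"}}
    )

    sparkpi = create_databricks_job_solid(name="sparkpi", num_inputs=0)

    @pipeline(
        mode_defs=[ModeDefinition(resource_defs={"databricks_client": my_databricks_client})]
    )
    def databricks_pipeline():
        sparkpi()

    execute_pipeline(
        databricks_pipeline,
        {
            "solids": {
                "sparkpi": {
                    "config": {
                        "job": {
                            "run_name": "sparkpi-example",
                            "new_cluster": {
                                "spark_version": "7.3.x-scala2.12",
                                "node_type_id": "i3.xlarge",
                                "num_workers": 2,
                            },
                            "spark_jar_task": {
                                "main_class_name": "org.apache.spark.examples.SparkPi"
                            },
                        },
                        "poll_interval_sec": 30.0,
                    }
                }
            }
        },
    )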
\nfrom dagster import Field, StringSource, resource\nfrom datadog import DogStatsd, initialize, statsd\n\n\nclass DataDogResource:\n # Mirroring levels from the dogstatsd library\n OK, WARNING, CRITICAL, UNKNOWN = (\n DogStatsd.OK,\n DogStatsd.WARNING,\n DogStatsd.CRITICAL,\n DogStatsd.UNKNOWN,\n )\n\n def __init__(self, api_key, app_key):\n initialize(api_key=api_key, app_key=app_key)\n\n # Pull in methods from the dogstatsd library\n for method in [\n "event",\n "gauge",\n "increment",\n "decrement",\n "histogram",\n "distribution",\n "set",\n "service_check",\n "timed",\n "timing",\n ]:\n setattr(self, method, getattr(statsd, method))\n\n\n[docs]@resource(\n {\n "api_key": Field(StringSource, description="Datadog API key"),\n "app_key": Field(StringSource, description="Datadog application key"),\n },\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n\n .. code-block:: python\n\n @solid(required_resource_keys={'datadog'})\n def datadog_solid(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={'datadog': datadog_resource})])\n def dd_pipeline():\n datadog_solid()\n\n result = execute_pipeline(\n dd_pipeline,\n {'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}},\n )\n\n """\n return DataDogResource(\n context.resource_config.get("api_key"), context.resource_config.get("app_key")\n )\n
\nfrom typing import Dict\n\nfrom dagster import (\n Array,\n AssetMaterialization,\n Bool,\n EventMetadataEntry,\n InputDefinition,\n Noneable,\n Nothing,\n Output,\n OutputDefinition,\n Permissive,\n StringSource,\n solid,\n)\nfrom dagster.config.field import Field\nfrom dagster.utils.backcompat import experimental\n\nfrom .types import DbtCliOutput\nfrom .utils import execute_cli, parse_run_results\n\nDEFAULT_DBT_EXECUTABLE = "dbt"\n\n# The following config fields correspond to flags that apply to all dbt CLI commands. For details\n# on dbt CLI flags, see\n# https://github.com/fishtown-analytics/dbt/blob/1f8e29276e910c697588c43f08bc881379fff178/core/dbt/main.py#L260-L329\nCLI_COMMON_FLAGS_CONFIG_SCHEMA = {\n "project-dir": Field(\n config=StringSource,\n is_required=False,\n description=(\n "Which directory to look in for the dbt_project.yml file. Default is the current "\n "working directory and its parents."\n ),\n ),\n "profiles-dir": Field(\n config=StringSource,\n is_required=False,\n description=(\n "Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or "\n "$HOME/.dbt"\n ),\n ),\n "profile": Field(\n config=StringSource,\n is_required=False,\n description="Which profile to load. Overrides setting in dbt_project.yml.",\n ),\n "target": Field(\n config=StringSource,\n is_required=False,\n description="Which target to load for the given profile.",\n ),\n "vars": Field(\n config=Permissive({}),\n is_required=False,\n description=(\n "Supply variables to the project. This argument overrides variables defined in your "\n "dbt_project.yml file. This argument should be a dictionary, eg. "\n "{'my_variable': 'my_value'}"\n ),\n ),\n "bypass-cache": Field(\n config=bool,\n is_required=False,\n description="If set, bypass the adapter-level cache of database state",\n default_value=False,\n ),\n}\n\n# The following config fields correspond to options that apply to all CLI solids, but should not be\n# formatted as CLI flags.\nCLI_COMMON_OPTIONS_CONFIG_SCHEMA = {\n "warn-error": Field(\n config=bool,\n is_required=False,\n description=(\n "If dbt would normally warn, instead raise an exception. Examples include --models "\n "that selects nothing, deprecations, configurations with no associated models, "\n "invalid test configurations, and missing sources/refs in tests."\n ),\n default_value=False,\n ),\n "dbt_executable": Field(\n config=StringSource,\n is_required=False,\n description="Path to the dbt executable. Default is {}".format(DEFAULT_DBT_EXECUTABLE),\n default_value=DEFAULT_DBT_EXECUTABLE,\n ),\n "ignore_handled_error": Field(\n config=bool,\n is_required=False,\n description=(\n "When True, will not raise an exception when the dbt CLI returns error code 1. 
"\n "Default is False."\n ),\n default_value=False,\n ),\n}\n\nCLI_CONFIG_SCHEMA = {**CLI_COMMON_FLAGS_CONFIG_SCHEMA, **CLI_COMMON_OPTIONS_CONFIG_SCHEMA}\n\nCLI_COMMON_FLAGS = set(CLI_COMMON_FLAGS_CONFIG_SCHEMA.keys())\n\n\ndef passthrough_flags_only(solid_config, additional_flags):\n return {\n flag: solid_config[flag]\n for flag in (CLI_COMMON_FLAGS | set(additional_flags))\n if solid_config.get(flag) is not None\n }\n\n\n[docs]@solid(\n description="A solid to invoke dbt run via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "full-refresh": Field(\n config=bool,\n description=(\n "If specified, DBT will drop incremental models and fully-recalculate the "\n "incremental table from the model definition. (--full-refresh)"\n ),\n is_required=False,\n default_value=False,\n ),\n "fail-fast": Field(\n config=bool,\n description="Stop execution upon a first failure. (--fail-fast)",\n is_required=False,\n default_value=False,\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n is_required=False,\n default_value=[],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_run(context) -> DbtCliOutput:\n """This solid executes ``dbt run`` via the dbt CLI."""\n from ..utils import generate_materializations\n\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=("run",),\n flags_dict=passthrough_flags_only(\n context.solid_config, ("threads", "models", "exclude", "full-refresh", "fail-fast")\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n )\n run_results = parse_run_results(context.solid_config["project-dir"])\n cli_output_dict = {**run_results, **cli_output}\n cli_output = DbtCliOutput.from_dict(cli_output_dict)\n\n if context.solid_config["yield_materializations"]:\n for materialization in generate_materializations(\n cli_output, asset_key_prefix=context.solid_config["asset_key_prefix"]\n ):\n yield materialization\n\n yield AssetMaterialization(\n asset_key="dbt_run_cli_output",\n description="Output from the CLI execution of `dbt run`.",\n metadata_entries=[EventMetadataEntry.json(cli_output_dict, label="CLI Output")],\n )\n\n yield Output(cli_output, output_name="dbt_output")\n\n\n[docs]@solid(\n description="A solid to invoke dbt test via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "data": Field(\n config=bool,\n description='Run data tests defined in "tests" directory.',\n is_required=False,\n default_value=False,\n ),\n "schema": Field(\n config=bool,\n description="Run constraint validations from schema.yml files.",\n is_required=False,\n default_value=False,\n ),\n "fail-fast": Field(\n config=bool,\n description="Stop execution upon a first test failure.",\n is_required=False,\n default_value=False,\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. 
Overrides settings "\n "in profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "yield_materializations": Field(\n config=Bool, is_required=False, default_value=True, description="FIXME"\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_test(context) -> DbtCliOutput:\n """This solid executes ``dbt test`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=("test",),\n flags_dict=passthrough_flags_only(\n context.solid_config, ("data", "schema", "fail-fast", "threads", "models", "exclude")\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n )\n run_results = parse_run_results(context.solid_config["project-dir"])\n cli_output = {**run_results, **cli_output}\n\n if context.solid_config["yield_materializations"]:\n yield AssetMaterialization(\n asset_key="dbt_test_cli_output",\n description="Output from the CLI execution of `dbt test`.",\n metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],\n )\n\n yield Output(DbtCliOutput.from_dict(cli_output), output_name="dbt_output")\n\n\n[docs]@solid(\n description="A solid to invoke dbt snapshot via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(dagster_type=Dict)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings in "\n "profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "yield_materializations": Field(\n config=Bool, is_required=False, default_value=True, description="FIXME"\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_snapshot(context) -> Dict:\n """This solid executes ``dbt snapshot`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=("snapshot",),\n flags_dict=passthrough_flags_only(context.solid_config, ("threads", "models", "exclude")),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n )\n\n if context.solid_config["yield_materializations"]:\n yield AssetMaterialization(\n asset_key="dbt_snapshot_cli_output",\n description="Output from the CLI execution of `dbt snapshot`.",\n metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],\n )\n\n yield Output(cli_output)\n\n\n[docs]@solid(\n description="A solid to invoke dbt run-operation via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=Dict)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "macro": Field(\n config=StringSource,\n description=(\n "Specify the macro to invoke. 
dbt will call this macro with the supplied "\n "arguments and then exit."\n ),\n ),\n "args": Field(\n config=Permissive({}),\n is_required=False,\n description=(\n "Supply arguments to the macro. This dictionary will be mapped to the keyword "\n "arguments defined in the selected macro. This argument should be a dictionary, "\n "eg. {'my_variable': 'my_value'}"\n ),\n ),\n "yield_materializations": Field(\n config=Bool, is_required=False, default_value=True, description="FIXME"\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_run_operation(context) -> Dict:\n """This solid executes ``dbt run-operation`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=("run-operation", context.solid_config["macro"]),\n flags_dict=passthrough_flags_only(context.solid_config, ("args",)),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n )\n\n if context.solid_config["yield_materializations"]:\n yield AssetMaterialization(\n asset_key="dbt_run_operation_cli_output",\n description="Output from the CLI execution of `dbt run-operation`.",\n metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],\n )\n\n yield Output(cli_output)\n\n\n[docs]@solid(\n description="A solid to invoke dbt source snapshot-freshness via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=Dict)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="Specify the sources to snapshot freshness.",\n ),\n "output": Field(\n config=StringSource,\n is_required=False,\n description=(\n "Specify the output path for the json report. By default, outputs to "\n "target/sources.json"\n ),\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. 
Overrides "\n "settings in profiles.yml."\n ),\n ),\n "yield_materializations": Field(\n config=Bool, is_required=False, default_value=True, description="FIXME"\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_snapshot_freshness(context) -> Dict:\n """This solid executes ``dbt source snapshot-freshness`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=("source", "snapshot-freshness"),\n flags_dict=passthrough_flags_only(context.solid_config, ("select", "output", "threads")),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n )\n\n if context.solid_config["yield_materializations"]:\n yield AssetMaterialization(\n asset_key="dbt_source_snapshot-freshness_cli_output",\n description="Output from the CLI execution of `dbt source snapshot-freshness`.",\n metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],\n )\n\n yield Output(cli_output)\n\n\n[docs]@solid(\n description="A solid to invoke dbt compile via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=Dict)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "parse-only": Field(config=bool, is_required=False, default_value=False,),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n "full-refresh": Field(\n config=bool,\n description=(\n "If specified, DBT will drop incremental models and fully-recalculate "\n "the incremental table from the model definition. 
(--full-refresh)"\n ),\n is_required=False,\n default_value=False,\n ),\n "yield_materializations": Field(\n config=Bool, is_required=False, default_value=True, description="FIXME"\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_compile(context) -> Dict:\n """This solid executes ``dbt compile`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=("compile",),\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "parse-only",\n "threads",\n "no-version-check",\n "models",\n "exclude",\n "selector",\n "state",\n "full-refresh",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n )\n\n if context.solid_config["yield_materializations"]:\n yield AssetMaterialization(\n asset_key="dbt_compile_cli_output",\n description="Output from the CLI execution of `dbt compile`.",\n metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],\n )\n\n yield Output(cli_output)\n
\nfrom collections import namedtuple\nfrom typing import Any, Dict, Optional\n\nfrom dagster import check, usable_as_dagster_type\n\nfrom ..types import DbtResult\n\n\n[docs]@usable_as_dagster_type\nclass DbtCliOutput(\n namedtuple(\n "_DbtCliOutput",\n "result command return_code raw_output num_pass num_warn num_error num_skip num_total",\n ),\n):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n If the executed dbt command is either ``run`` or ``test``, then the ``.num_*`` attributes will\n contain non-``None`` integer values. Otherwise, they will be ``None``.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n num_pass (Optional[int]): The number of dbt nodes (models) that passed.\n num_warn (Optional[int]): The number of dbt nodes (models) that emitted warnings.\n num_error (Optional[int]): The number of dbt nodes (models) that emitted errors.\n num_skip (Optional[int]): The number of dbt nodes (models) that were skipped.\n num_total (Optional[int]): The total number of dbt nodes (models) that were processed.\n """\n\n def __new__(\n cls,\n result: DbtResult,\n command: str,\n return_code: int,\n raw_output: str,\n num_pass: Optional[int] = None,\n num_warn: Optional[int] = None,\n num_error: Optional[int] = None,\n num_skip: Optional[int] = None,\n num_total: Optional[int] = None,\n ):\n return super().__new__(\n cls,\n result,\n check.str_param(command, "command"),\n check.int_param(return_code, "return_code"),\n check.str_param(raw_output, "raw_output"),\n check.opt_int_param(num_pass, "num_pass"),\n check.opt_int_param(num_warn, "num_warn"),\n check.opt_int_param(num_error, "num_error"),\n check.opt_int_param(num_skip, "num_skip"),\n check.opt_int_param(num_total, "num_total"),\n )\n\n[docs] @classmethod\n def from_dict(cls, d: Dict[str, Any]) -> "DbtCliOutput":\n """Constructs an instance of :class:`DbtCliOutput <dagster_dbt.DbtCliOutput>` from a\n dictionary.\n\n Args:\n d (Dict[str, Any]): A dictionary with key-values to construct a :class:`DbtCliOutput\n <dagster_dbt.DbtCliOutput>`.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput <dagster_dbt.DbtCliOutput>`.\n """\n return_code = check.int_elem(d, "return_code")\n raw_output = check.str_elem(d, "raw_output")\n num_pass = check.opt_int_elem(d, "num_pass")\n num_warn = check.opt_int_elem(d, "num_warn")\n num_error = check.opt_int_elem(d, "num_error")\n num_skip = check.opt_int_elem(d, "num_skip")\n num_total = check.opt_int_elem(d, "num_total")\n command = check.str_elem(d, "command")\n\n return cls(\n result=DbtResult.from_dict(d),\n return_code=return_code,\n raw_output=raw_output,\n num_pass=num_pass,\n num_warn=num_warn,\n num_error=num_error,\n num_skip=num_skip,\n num_total=num_total,\n command=command,\n )\n
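Because ``DbtCliOutput`` is marked ``usable_as_dagster_type``, it flows through solid outputs. A hedged sketch of reading the documented attributes off the ``dbt_output`` output of ``dbt_cli_run``; ``my_dbt_pipeline`` and ``run_config`` are assumed to exist (for example, as in the CLI sketch above):

.. code-block:: python

    from dagster import execute_pipeline

    result = execute_pipeline(my_dbt_pipeline, run_config)  # assumed pipeline + run_config
    cli_output = result.result_for_solid("dbt_cli_run").output_value("dbt_output")

    print(cli_output.command)      # the exact shell command dbt ran
    print(cli_output.return_code)  # 0 on success
    # num_* attributes are populated for `run`/`test`, otherwise None
    print(cli_output.num_pass, cli_output.num_error, cli_output.num_total)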
\nfrom abc import ABC\nfrom typing import Any, Dict, List\n\nfrom dagster import EventMetadataEntry, Failure, check\n\n\n[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""\n\n\n[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: List[int]\n\n def __init__(self, invalid_line_nos: List[int]):\n check.list_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata_entries = [\n EventMetadataEntry.json(\n {"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers"\n )\n ]\n super().__init__(description, metadata_entries)\n self.invalid_line_nos = invalid_line_nos\n\n\n[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):\n metadata_entries = [\n EventMetadataEntry.json({"logs": logs}, label="Parsed CLI Output (JSON)",),\n EventMetadataEntry.text(\n DagsterDbtCliRuntimeError.stitch_messages(logs),\n label="Parsed CLI Output (JSON) Message Attributes",\n ),\n EventMetadataEntry.text(raw_output, label="Raw CLI Output",),\n ]\n super().__init__(description, metadata_entries)\n\n @staticmethod\n def stitch_messages(logs: List[dict]) -> str:\n return "\\n".join(\n log["message"].strip("\\n")\n for log in logs\n if isinstance(log.get("message"), str) # defensive\n )\n\n\n[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(self, logs: List[Dict[str, Any]], raw_output: str):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output)\n\n\n[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(self, logs: List[Dict[str, Any]], raw_output: str):\n super().__init__("Fatal error in the dbt CLI (return code 2)", logs, raw_output)\n\n\n[docs]class DagsterDbtRpcUnexpectedPollOutputError(DagsterDbtError):\n """Represents an unexpected response when polling the dbt RPC server."""\n\n\n[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__("Expected to find file at path {}".format(path))\n
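A hedged sketch of how the hierarchy above is meant to be consumed: handled (return code 1) and fatal (return code 2) CLI errors can be told apart, while ``DagsterDbtError`` catches anything raised by the ``dagster-dbt`` layer. ``run_my_dbt_command`` is a hypothetical stand-in for code that invokes the dbt CLI:

.. code-block:: python

    def run_my_dbt_command():
        # Hypothetical helper standing in for a dbt CLI invocation that fails.
        raise DagsterDbtCliHandledRuntimeError(
            logs=[{"message": "1 model failed"}], raw_output=""
        )

    try:
        run_my_dbt_command()
    except DagsterDbtCliHandledRuntimeError:
        print("dbt exited with return code 1 (handled model/test errors)")
    except DagsterDbtCliFatalRuntimeError:
        print("dbt exited with return code 2 (fatal error)")
    except DagsterDbtError:
        print("some other dagster-dbt failure")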
\nimport json\nimport platform\nimport sys\nimport uuid\nfrom base64 import standard_b64encode as b64\nfrom typing import Any, Dict, List, Optional\n\nimport requests\nfrom dagster import Field, IntSource, RetryRequested, StringSource, check, resource\n\nfrom .utils import is_fatal_code\n\n\n[docs]class DbtRpcClient:\n """A client for a dbt RPC server.\n\n If you are need a dbt RPC server as a Dagster resource, we recommend that you use\n :func:`dbt_rpc_resource <dagster_dbt.dbt_rpc_resource>`.\n """\n\n def __init__(\n self,\n host: str = "0.0.0.0",\n port: int = 8580,\n jsonrpc_version: str = "2.0",\n logger: Optional[Any] = None,\n **_,\n ):\n """Constructor\n\n Args:\n host (str): The IP address of the host of the dbt RPC server. Default is ``"0.0.0.0"``.\n port (int): The port of the dbt RPC server. Default is ``8580``.\n jsonrpc_version (str): The JSON-RPC version to send in RPC requests.\n Default is ``"2.0"``.\n logger (Optional[Any]): A property for injecting a logger dependency.\n Default is ``None``.\n """\n check.str_param(host, "host")\n check.int_param(port, "port")\n check.str_param(jsonrpc_version, "jsonrpc_version")\n\n self._host = host\n self._port = port\n self._jsonrpc_version = jsonrpc_version\n self._logger = logger\n\n @staticmethod\n def _construct_user_agent() -> str:\n """A helper method to construct a standard User-Agent string to be used in HTTP request\n headers.\n\n Returns:\n str: The constructed User-Agent value.\n """\n client = "dagster/dbt-rpc-client"\n python_version = (\n f"Python/{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"\n )\n system_info = f"{platform.system()}/{platform.release()}"\n user_agent = " ".join([python_version, client, system_info])\n return user_agent\n\n def _construct_headers(self) -> Dict[str, str]:\n """Constructs a standard set of headers for HTTP requests.\n\n Returns:\n Dict[str, str]: The HTTP request headers.\n """\n headers = requests.utils.default_headers()\n headers["User-Agent"] = self._construct_user_agent()\n headers["Content-Type"] = "application/json"\n headers["Accept"] = "application/json"\n return headers\n\n def _post(self, data: str = None) -> requests.Response:\n """Constructs and sends a POST request to the dbt RPC server.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n headers = self._construct_headers()\n try:\n response = requests.post(self.url, headers=headers, data=data)\n response.raise_for_status()\n except requests.exceptions.HTTPError as e:\n if is_fatal_code(e):\n raise e\n else:\n raise RetryRequested(max_retries=5, seconds_to_wait=30)\n return response\n\n def _default_request(self, method: str) -> Dict[str, Any]:\n """Constructs a standard HTTP request body, to be sent to a dbt RPC server.\n\n Args:\n method (str): a dbt RPC method.\n\n Returns:\n Dict: the constructed HTTP request body.\n """\n data = {\n "jsonrpc": self.jsonrpc_version,\n "method": method,\n "id": str(uuid.uuid1()),\n "params": {},\n }\n return data\n\n def _selection(\n self, *, models: List[str] = None, select: List[str] = None, exclude: List[str] = None\n ) -> Dict[str, str]:\n params = {}\n if models is not None:\n params["models"] = " ".join(set(models))\n if select is not None:\n params["select"] = " ".join(set(select))\n if exclude is not None:\n params["exclude"] = " ".join(set(exclude))\n\n return params\n\n @property\n def host(self) -> str:\n """str: The IP address of the host of the dbt RPC server."""\n return self._host\n\n @property\n def port(self) -> 
int:\n """int: The port of the dbt RPC server."""\n return self._port\n\n @property\n def jsonrpc_version(self) -> str:\n """str: The JSON-RPC version to send in RPC requests."""\n return self._jsonrpc_version\n\n @property\n def logger(self) -> Optional[Any]:\n """Optional[Any]: A property for injecting a logger dependency."""\n return self._logger\n\n @property\n def url(self) -> str:\n """str: The URL for sending dbt RPC requests."""\n return f"http://{self.host}:{self.port}/jsonrpc"\n\n[docs] def status(self):\n """Sends a request with the method ``status`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the RPC method `status\n <https://docs.getdbt.com/reference/commands/rpc/#status>`_.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="status")\n return self._post(data=json.dumps(data))\n\n[docs] def poll(\n self, *, request_token: str, logs: bool = False, logs_start: int = 0\n ) -> requests.Response:\n """Sends a request with the method ``poll`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `poll\n <https://docs.getdbt.com/reference/commands/rpc/#poll>`_.\n\n Args:\n request_token (str): the token to poll responses for.\n logs (bool): Whether logs should be returned in the response. Defaults to ``False``.\n logs_start (int): The zero-indexed log line to fetch logs from. Defaults to ``0``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="poll")\n data["params"] = {"request_token": request_token, "logs": logs, "logs_start": logs_start}\n return self._post(data=json.dumps(data))\n\n[docs] def ps(self, *, completed: bool = False) -> requests.Response:\n """Sends a request with the method ``ps`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `ps\n <https://docs.getdbt.com/reference/commands/rpc/#ps>`_.\n\n Args:\n compelted (bool): If ``True``, then also return completed tasks. 
Defaults to ``False``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="ps")\n data["params"] = {"completed": completed}\n return self._post(data=json.dumps(data))\n\n[docs] def kill(self, *, task_id: str) -> requests.Response:\n """Sends a request with the method ``kill`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `kill\n <https://docs.getdbt.com/reference/commands/rpc/#kill>`_.\n\n Args:\n task_id (str): the ID of the task to terminate.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="kill")\n data["params"] = {"task_id": task_id}\n return self._post(data=json.dumps(data))\n\n[docs] def cli(self, *, cli: str, **kwargs) -> requests.Response:\n """Sends a request with CLI syntax to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for `running CLI commands via RPC\n <https://docs.getdbt.com/reference/commands/rpc/#running-a-task-with-cli-syntax>`_.\n\n Args:\n cli (str): a dbt command in CLI syntax.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="cli_args")\n data["params"] = {"cli": cli}\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def compile(\n self, *, models: List[str] = None, exclude: List[str] = None, **kwargs\n ) -> requests.Response:\n """Sends a request with the method ``compile`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `compiling projects via RPC\n <https://docs.getdbt.com/reference/commands/rpc/#compile-a-project>`_.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str]), optional): the models to exclude from compilation.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="compile")\n data["params"].update(self._selection(models=models, exclude=exclude))\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def run(\n self, *, models: List[str] = None, exclude: List[str] = None, **kwargs\n ) -> requests.Response:\n """Sends a request with the method ``run`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `run\n <https://docs.getdbt.com/reference/commands/rpc/#run-models>`_.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str]), optional): the models to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="run")\n data["params"].update(self._selection(models=models, exclude=exclude))\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def snapshot(\n self, *, select: List[str] = None, exclude: List[str] = None, **kwargs\n ) -> requests.Response:\n """Sends a request with the method ``snapshot`` to the dbt RPC server, and returns the\n response. 
For more details, see the dbt docs for the command `snapshot\n <https://docs.getdbt.com/reference/commands/snapshot>`_.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="snapshot")\n data["params"].update(self._selection(select=select, exclude=exclude))\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def test(\n self,\n *,\n models: List[str] = None,\n exclude: List[str] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> requests.Response:\n """Sends a request with the method ``test`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `test\n <https://docs.getdbt.com/reference/commands/rpc/#run-test>`_.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n payload = self._default_request(method="test")\n payload["params"] = {"data": data, "schema": schema}\n\n payload["params"].update(self._selection(models=models, exclude=exclude))\n\n if kwargs is not None:\n payload["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(payload))\n\n[docs] def seed(self, *, show: bool = False, **kwargs) -> requests.Response:\n """Sends a request with the method ``seed`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `seed\n <https://docs.getdbt.com/reference/commands/rpc/#run-seed>`_.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="seed")\n data["params"] = {"show": show}\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def generate_docs(\n self,\n *,\n models: List[str] = None,\n exclude: List[str] = None,\n compile: bool = False, # pylint: disable=redefined-builtin # TODO\n **kwargs,\n ) -> requests.Response:\n """Sends a request with the method ``docs.generate`` to the dbt RPC server, and returns the\n response. 
For more details, see the dbt docs for the RPC method `docs.generate\n <https://docs.getdbt.com/reference/commands/rpc/#generate-docs>`_.\n\n Args:\n models (List[str], optional): the models to include in docs generation.\n exclude (List[str], optional): the models to exclude from docs generation.\n compile (bool, optional): If ``True`` (default), then compile the project before\n generating docs.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="docs.generate")\n data["params"] = {"compile": compile}\n\n data["params"].update(self._selection(models=models, exclude=exclude))\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def run_operation(\n self, *, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> requests.Response:\n """Sends a request with the method ``run-operation`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the command `run-operation\n <https://docs.getdbt.com/reference/commands/run-operation>`_.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="run-operation")\n data["params"] = {"macro": macro}\n\n if args is not None:\n data["params"]["args"] = args\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def snapshot_freshness(\n self, *, select: Optional[List[str]] = None, **kwargs\n ) -> requests.Response:\n """Sends a request with the method ``snapshot-freshness`` to the dbt RPC server, and returns\n the response. For more details, see the dbt docs for the command `source snapshot-freshness\n <https://docs.getdbt.com/reference/commands/source#dbt-source-snapshot-freshness>`_.\n\n Args:\n select (List[str], optional): the models to include in calculating snapshot freshness.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="snapshot-freshness")\n data["params"].update(self._selection(select=select))\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._post(data=json.dumps(data))\n\n[docs] def compile_sql(self, *, sql: str, name: str) -> requests.Response:\n """Sends a request with the method ``compile_sql`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `compiling SQL via RPC\n <https://docs.getdbt.com/reference/commands/rpc#compiling-a-query>`_.\n\n Args:\n sql (str): the SQL to compile in base-64 encoding.\n name (str): a name for the compiled SQL.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="compile_sql")\n data["params"] = {"sql": b64(sql.encode("utf-8")).decode("utf-8"), "name": name}\n return self._post(data=json.dumps(data))\n\n[docs] def run_sql(self, *, sql: str, name: str) -> requests.Response:\n """Sends a request with the method ``run_sql`` to the dbt RPC server, and returns the\n response. 
For more details, see the dbt docs for `running SQL via RPC\n <https://docs.getdbt.com/reference/commands/rpc#executing-a-query>`_.\n\n Args:\n sql (str): the SQL to run in base-64 encoding.\n name (str): a name for the compiled SQL.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="run_sql")\n data["params"] = {"sql": b64(sql.encode("utf-8")).decode("utf-8"), "name": name}\n return self._post(data=json.dumps(data))\n\n\n[docs]@resource(\n description="A resource representing a dbt RPC client.",\n config_schema={\n "host": Field(StringSource),\n "port": Field(IntSource, is_required=False, default_value=8580),\n },\n)\ndef dbt_rpc_resource(context) -> DbtRpcClient:\n """This resource defines a dbt RPC client.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n Examples:\n\n .. code-block:: python\n\n custom_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "80.80.80.80","port": 8080,})\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt_rpc": custom_dbt_rpc_resource})])\n def dbt_rpc_pipeline():\n # Run solids with `required_resource_keys={"dbt_rpc", ...}`.\n\n """\n return DbtRpcClient(host=context.resource_config["host"], port=context.resource_config["port"])\n\n\nlocal_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "0.0.0.0", "port": 8580})\nlocal_dbt_rpc_resource.__doc__ = """This resource defines a dbt RPC client for an RPC server running\non 0.0.0.0:8580."""\n
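A hedged sketch of driving ``DbtRpcClient`` directly against a dbt RPC server on ``0.0.0.0:8580`` (the same address ``local_dbt_rpc_resource`` assumes); inside a pipeline you would normally reach the client through the ``dbt_rpc`` resource instead. ``my_model`` is a placeholder model name and the import path is assumed:

.. code-block:: python

    from dagster_dbt import DbtRpcClient  # assumed top-level export

    client = DbtRpcClient(host="0.0.0.0", port=8580)

    # Kick off an asynchronous `dbt run` limited to one model.
    resp = client.run(models=["my_model"])
    request_token = resp.json()["result"]["request_token"]

    # Poll the task; its state stays "running" until dbt finishes.
    poll = client.poll(request_token=request_token, logs=True)
    print(poll.json()["result"]["state"])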
\nimport json\nimport time\nfrom typing import Callable, Optional\n\nimport pandas as pd\nfrom dagster import (\n Array,\n Bool,\n DagsterInvalidDefinitionError,\n Failure,\n Field,\n InputDefinition,\n Int,\n Noneable,\n Nothing,\n Output,\n OutputDefinition,\n Permissive,\n RetryRequested,\n String,\n check,\n solid,\n)\nfrom dagster.core.execution.context.compute import SolidExecutionContext\nfrom dagster_pandas import DataFrame\n\nfrom ..errors import DagsterDbtRpcUnexpectedPollOutputError\nfrom .types import DbtRpcOutput\nfrom .utils import log_rpc, raise_for_rpc_error\n\n\ndef _poll_rpc(\n context: SolidExecutionContext, request_token: str, should_yield_materializations: bool = True\n) -> DbtRpcOutput:\n """Polls the dbt RPC server for the status of a request until the state is ``success``."""\n from ..utils import generate_materializations\n\n logs_start = 0\n interval = context.solid_config.get("interval")\n\n elapsed_time = -1\n current_state = None\n\n while True:\n # Poll for the dbt RPC request.\n context.log.debug(f"RequestToken: {request_token}")\n resp = context.resources.dbt_rpc.poll(\n request_token=request_token, logs=context.solid_config["logs"], logs_start=logs_start\n )\n raise_for_rpc_error(context, resp)\n\n resp_json = resp.json()\n resp_result = resp_json.get("result", {})\n\n # Pass dbt RPC logs into the Dagster/Dagit logger.\n if context.solid_config["logs"]:\n logs = resp_result.get("logs", [])\n if len(logs) > 0:\n log_rpc(context, logs)\n logs_start += len(logs)\n\n current_state = resp_result.get("state")\n # Stop polling if request's state is no longer "running".\n if current_state != "running":\n break\n\n elapsed_time = resp_result.get("elapsed", 0)\n # Sleep for the configured time interval before polling again.\n context.log.debug(\n f"Request {request_token} currently in state '{current_state}' (elapsed time "\n f"{elapsed_time} seconds). Sleeping for {interval}s..."\n )\n time.sleep(interval)\n\n if current_state != "success":\n raise Failure(\n description=(\n f"Request {request_token} finished with state '{current_state}' in "\n f"{elapsed_time} seconds"\n ),\n )\n\n context.log.info(\n f"Request {request_token} finished with state '{current_state}' in {elapsed_time} seconds"\n )\n context.log.debug(json.dumps(resp_result, indent=2))\n\n polled_run_results = DbtRpcOutput.from_dict(resp_result)\n\n if should_yield_materializations:\n for materialization in generate_materializations(polled_run_results):\n yield materialization\n\n yield Output(polled_run_results)\n\n\ndef unwrap_result(poll_rpc_generator) -> DbtRpcOutput:\n """A helper function that extracts the `DbtRpcOutput` value from a generator.\n\n The parameter `poll_rpc_generator` is expected to be an invocation of `_poll_rpc`.\n """\n output = None\n for x in poll_rpc_generator:\n output = x\n\n if output is None:\n raise DagsterDbtRpcUnexpectedPollOutputError(\n description="poll_rpc yielded None as its last value. Expected value of type Output containing DbtRpcOutput.",\n )\n\n if not isinstance(output, Output):\n raise DagsterDbtRpcUnexpectedPollOutputError(\n description=f"poll_rpc yielded value of type {type(output)} as its last value. Expected value of type Output containing DbtRpcOutput.",\n )\n\n if not isinstance(output.value, DbtRpcOutput):\n raise DagsterDbtRpcUnexpectedPollOutputError(\n description=f"poll_rpc yielded Output containing {type(output.value)}. 
Expected DbtRpcOutput.",\n )\n\n return output.value\n\n\n[docs]@solid(\n description="A solid to invoke dbt run over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt run`` command to a dbt RPC server and returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n resp = context.resources.dbt_rpc.run(\n models=context.solid_config["models"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n return resp.json().get("result").get("request_token")\n\n\n[docs]@solid(\n description="A solid to invoke dbt run over RPC and poll the resulting RPC process until it's complete.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "full_refresh": Field(\n config=Bool,\n description="Whether or not to perform a --full-refresh.",\n is_required=False,\n default_value=False,\n ),\n "fail_fast": Field(\n config=Bool,\n description="Whether or not to --fail-fast.",\n is_required=False,\n default_value=False,\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "task_tags": Permissive(),\n "max_retries": Field(config=Int, is_required=False, default_value=5),\n "retry_interval": Field(config=Int, is_required=False, default_value=120),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt run`` command to a dbt RPC server and returns the result of the\n executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n if context.solid_config["task_tags"]:\n results = context.resources.dbt_rpc.ps().json()\n for task in results["result"]["rows"]:\n if task["tags"] == context.solid_config["task_tags"]:\n context.log.warning(\n f"RPC task with tags {json.dumps(task['tags'])} currently running."\n )\n raise RetryRequested(\n max_retries=context.solid_config["max_retries"],\n seconds_to_wait=context.solid_config["retry_interval"],\n )\n\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " run"\n\n if context.solid_config["models"]:\n models = " ".join(set(context.solid_config["models"]))\n command += f" --models {models}"\n\n if context.solid_config["exclude"]:\n exclude = " ".join(set(context.solid_config["exclude"]))\n command += f" --exclude {exclude}"\n\n if context.solid_config["full_refresh"]:\n command += " --full-refresh"\n\n if context.solid_config["fail_fast"]:\n command += " --fail-fast"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n resp = context.resources.dbt_rpc.cli(cli=command, **context.solid_config["task_tags"])\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n[docs]@solid(\n description="A solid to invoke dbt test over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt test.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to test.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "data": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run custom data tests.",\n ),\n "schema": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run schema tests.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_test(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt test`` command to a dbt RPC server and returns the request token.\n\n This dbt RPC solid is asynchronous. 
The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n resp = context.resources.dbt_rpc.test(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n data=context.solid_config["data"],\n schema=context.solid_config["schema"],\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n return resp.json().get("result").get("request_token")\n\n\n[docs]@solid(\n description=(\n "A solid to invoke dbt test over RPC and poll the resulting RPC process until it's "\n "complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to test.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "data": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run custom data tests.",\n ),\n "schema": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run schema tests.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_test_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt test`` command to a dbt RPC server and returns the result of the\n executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n resp = context.resources.dbt_rpc.test(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n data=context.solid_config["data"],\n schema=context.solid_config["schema"],\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n[docs]@solid(\n description="A solid to invoke a dbt run operation over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run operation.",\n )\n ],\n config_schema={\n "macro": Field(\n config=String,\n is_required=True,\n description="The dbt macro to invoke as a run operation",\n ),\n "args": Field(\n config=Noneable(Permissive()),\n is_required=False,\n default_value=None,\n description="Arguments to supply to the invoked macro.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_operation(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt run-operation`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n resp = context.resources.dbt_rpc.run_operation(\n macro=context.solid_config["macro"], args=context.solid_config["args"]\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n return resp.json().get("result").get("request_token")\n\n\n[docs]@solid(\n description=(\n "A solid to invoke a dbt run operation over RPC and poll the resulting RPC process until "\n "it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "macro": Field(\n config=String,\n is_required=True,\n description="The dbt macro to invoke as a run operation",\n ),\n "args": Field(\n config=Noneable(Permissive()),\n is_required=False,\n default_value=None,\n description="Arguments to supply to the invoked macro.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_operation_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt run-operation`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n resp = context.resources.dbt_rpc.run_operation(\n macro=context.solid_config["macro"], args=context.solid_config["args"]\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n[docs]@solid(\n description="A solid to invoke a dbt snapshot over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt snapshot.",\n )\n ],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to snapshot.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to exclude from the snapshot.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt snapshot`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n resp = context.resources.dbt_rpc.snapshot(\n select=context.solid_config["select"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n return resp.json().get("result").get("request_token")\n\n\n[docs]@solid(\n description=(\n "A solid to invoke a dbt snapshot over RPC and poll the resulting RPC process until "\n "it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to snapshot.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to exclude from the snapshot.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "task_tags": Permissive(),\n "max_retries": Field(config=Int, is_required=False, default_value=5),\n "retry_interval": Field(config=Int, is_required=False, default_value=120),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt snapshot`` command to a dbt RPC server and returns the result of\n the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n if context.solid_config["task_tags"]:\n results = context.resources.dbt_rpc.ps().json()\n for task in results["result"]["rows"]:\n if task["tags"] == context.solid_config["task_tags"]:\n context.log.warning(\n f"RPC task with tags {json.dumps(task['tags'])} currently running."\n )\n raise RetryRequested(\n max_retries=context.solid_config["max_retries"],\n seconds_to_wait=context.solid_config["retry_interval"],\n )\n\n resp = context.resources.dbt_rpc.snapshot(\n select=context.solid_config["select"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n[docs]@solid(\n description="A solid to invoke dbt source snapshot-freshness over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt snapshot.",\n )\n ],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt sources to snapshot-freshness for.",\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_freshness(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt source snapshot-freshness`` command to a dbt RPC server and\n returns the request token.\n\n This dbt RPC solid is asynchronous. 
The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " source snapshot-freshness"\n\n if context.solid_config["select"]:\n select = " ".join(set(context.solid_config["select"]))\n command += f" --select {select}"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n resp = context.resources.dbt_rpc.cli(cli=command)\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n return resp.json().get("result").get("request_token")\n\n\n[docs]@solid(\n description=(\n "A solid to invoke dbt source snapshot-freshness over RPC and poll the resulting "\n "RPC process until it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt sources to snapshot-freshness for.",\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_freshness_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt source snapshot`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " source snapshot-freshness"\n\n if context.solid_config["select"]:\n select = " ".join(set(context.solid_config["select"]))\n command += f" --select {select}"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n resp = context.resources.dbt_rpc.cli(cli=command)\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n[docs]@solid(\n description="A solid to compile a SQL query in context of a dbt project over RPC.",\n input_defs=[\n InputDefinition(name="start_after", dagster_type=Nothing),\n InputDefinition(\n name="sql", description="The SQL query to be compiled.", dagster_type=String\n ),\n ],\n output_defs=[\n OutputDefinition(name="sql", description="The compiled SQL query.", dagster_type=String)\n ],\n config_schema={\n "name": Field(config=String),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_compile_sql(context: SolidExecutionContext, sql: String) -> String:\n """This solid sends the ``dbt compile`` command to a dbt RPC server and returns the request\n token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n resp = context.resources.dbt_rpc.compile_sql(sql=sql, name=context.solid_config["name"])\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n result = unwrap_result(\n _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n )\n return result.results[0].node["compiled_sql"]\n\n\n[docs]def create_dbt_rpc_run_sql_solid(\n name: str, output_def: Optional[OutputDefinition] = None, **kwargs\n) -> Callable:\n """This function is a factory which constructs a solid that will copy the results of a SQL query\n run within the context of a dbt project to a pandas ``DataFrame``.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@solid\n <dagster.solid>` decorator. 
However, note that overriding ``config_schema``, ``input_defs``, and\n ``required_resource_keys`` is not allowed and will throw a :class:`DagsterInvalidDefinitionError\n <dagster.DagsterInvalidDefinitionError>`.\n\n If you would like to configure this solid with different config fields, you could consider using\n :func:`@composite_solid <dagster.composite_solid>` to wrap this solid.\n\n Args:\n name (str): The name of this solid.\n output_def (OutputDefinition, optional): The :class:`OutputDefinition\n <dagster.OutputDefinition>` for the solid. This value should always be a representation\n of a pandas ``DataFrame``. If not specified, the solid will default to an\n :class:`OutputDefinition <dagster.OutputDefinition>` named "df" with a ``DataFrame``\n dagster type.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n check.str_param(obj=name, param_name="name")\n check.opt_inst_param(obj=output_def, param_name="output_def", ttype=OutputDefinition)\n\n if "config_schema" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding config_schema is not supported.")\n\n if "input_defs" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding input_defs is not supported.")\n\n if "required_resource_keys" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding required_resource_keys is not supported.")\n\n @solid(\n name=name,\n description=kwargs.pop(\n "description",\n "A solid to run a SQL query in context of a dbt project over RPC and return the "\n "results in a pandas DataFrame.",\n ),\n input_defs=[\n InputDefinition(name="start_after", dagster_type=Nothing),\n InputDefinition(\n name="sql", description="The SQL query to be run.", dagster_type=String\n ),\n ],\n output_defs=[\n output_def\n or OutputDefinition(\n name="df", description="The results of the SQL query.", dagster_type=DataFrame\n )\n ],\n config_schema={\n "name": Field(config=String),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation "\n "will be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n **kwargs,\n )\n def _dbt_rpc_run_sql(context: SolidExecutionContext, sql: String) -> DataFrame:\n resp = context.resources.dbt_rpc.run_sql(sql=sql, name=context.solid_config["name"])\n context.log.debug(resp.text)\n raise_for_rpc_error(context, resp)\n request_token = resp.json().get("result").get("request_token")\n result = unwrap_result(\n _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n )\n table = result.results[0].table\n return pd.DataFrame.from_records(data=table["rows"], columns=table["column_names"])\n\n return _dbt_rpc_run_sql\n
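The dbt RPC solids above all declare required_resource_keys={"dbt_rpc"}, so each one needs a dbt RPC client resource bound under that key. A minimal wiring sketch follows; the dbt_rpc_resource name and its host/port config keys are assumptions about dagster_dbt's resource API and are not shown in the source above.

# Hedged sketch: wiring dbt_rpc_test_and_wait into a pipeline.
# `dbt_rpc_resource` and its `host`/`port` config keys are assumed; only the
# solid itself is defined in the module above.
from dagster import ModeDefinition, execute_pipeline, pipeline
from dagster_dbt import dbt_rpc_resource, dbt_rpc_test_and_wait

@pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt_rpc": dbt_rpc_resource})])
def dbt_test_pipeline():
    dbt_rpc_test_and_wait()

if __name__ == "__main__":
    execute_pipeline(
        dbt_test_pipeline,
        run_config={
            "resources": {"dbt_rpc": {"config": {"host": "127.0.0.1", "port": 8580}}},
            "solids": {"dbt_rpc_test_and_wait": {"config": {"models": ["my_model+"]}}},
        },
    )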
\nfrom collections import namedtuple\nfrom typing import Any, Dict\n\nfrom dagster import check, usable_as_dagster_type\n\nfrom ..types import DbtResult\n\n\n[docs]@usable_as_dagster_type\nclass DbtRpcOutput(namedtuple("_DbtRpcOutput", "result state start end elapsed")):\n """The output from executing a dbt command via the dbt RPC server.\n\n Note that users should not construct instances of this class directly. This class is intended to be\n constructed from the JSON output of dbt commands.\n\n Attributes:\n result (DbtResult): The dbt results from the executed command.\n state (str): The state of the polled dbt process.\n start (str): An ISO string timestamp of when the dbt process started.\n end (str): An ISO string timestamp of when the dbt process ended.\n elapsed (float): The duration (in seconds) for which the dbt process was running.\n """\n\n def __new__(\n cls, result: DbtResult, state: str, start: str, end: str, elapsed: float,\n ):\n return super().__new__(\n cls,\n result,\n check.str_param(state, "state"),\n check.str_param(start, "start"),\n check.str_param(end, "end"),\n check.float_param(elapsed, "elapsed"),\n )\n\n[docs] @classmethod\n def from_dict(cls, d: Dict[str, Any]) -> "DbtRpcOutput":\n """Constructs an instance of :class:`DbtRpcOutput <dagster_dbt.DbtRpcOutput>` from a\n dictionary.\n\n Args:\n d (Dict[str, Any]): A dictionary with key-values to construct a :class:`DbtRpcOutput\n <dagster_dbt.DbtRpcOutput>`.\n\n Returns:\n DbtRpcOutput: An instance of :class:`DbtRpcOutput <dagster_dbt.DbtRpcOutput>`.\n """\n state = check.str_elem(d, "state")\n start = check.str_elem(d, "start")\n end = check.str_elem(d, "end")\n elapsed = check.float_elem(d, "elapsed")\n\n result = DbtResult.from_dict(d)\n\n return cls(result=result, state=state, start=start, end=end, elapsed=elapsed)\n
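For orientation, a small constructed example of DbtRpcOutput.from_dict, with invented values standing in for a real dbt RPC poll response; the top-level dagster_dbt import is assumed.

# Illustrative only: the dict mirrors the fields read by DbtRpcOutput.from_dict
# above and by DbtResult.from_dict in the following module; values are invented.
from dagster_dbt import DbtRpcOutput  # assumed top-level export

raw = {
    "state": "success",
    "start": "2021-01-01T00:00:00Z",
    "end": "2021-01-01T00:00:42Z",
    "elapsed": 42.0,
    "logs": [],
    "results": [],
    "generated_at": "2021-01-01T00:00:42Z",
    "elapsed_time": 42.0,
}
output = DbtRpcOutput.from_dict(raw)
assert output.state == "success"
assert output.result.elapsed_time == 42.0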
\nfrom collections import namedtuple\nfrom datetime import datetime, timedelta\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom dagster import check\nfrom dateutil import parser\n\n\n[docs]class StepTiming(namedtuple("_StepTiming", "name started_at completed_at")):\n """The timing information of an executed step for a dbt node (model).\n\n Note that users should not construct instances of this class directly. This class is intended to be\n constructed from the JSON output of dbt commands.\n\n Attributes:\n name (str): The name of the executed step.\n started_at (datetime.datetime): An ISO string timestamp of when the step started executing.\n completed_at (datetime.datetime): An ISO string timestamp of when the step completed\n execution.\n """\n\n def __new__(cls, name: str, started_at: datetime, completed_at: datetime):\n return super().__new__(\n cls,\n check.str_param(name, "name"),\n check.inst_param(started_at, "started_at", datetime),\n check.inst_param(completed_at, "completed_at", datetime),\n )\n\n @property\n def duration(self) -> timedelta:\n """datetime.timedelta: The execution duration of the step."""\n return self.completed_at - self.started_at\n\n\n[docs]class NodeResult(\n namedtuple(\n "_NodeResult",\n "node error status execution_time thread_id step_timings table fail warn skip",\n ),\n):\n """The result of executing a dbt node (model).\n\n Note that users should not construct instances of this class directly. This class is intended to be\n constructed from the JSON output of dbt commands.\n\n Attributes:\n node (Dict[str, Any]): Details about the executed dbt node (model).\n error (Optional[str]): An error message if an error occurred.\n fail (Optional[Any]): The ``fail`` field from the results of the executed dbt node.\n warn (Optional[Any]): The ``warn`` field from the results of the executed dbt node.\n skip (Optional[Any]): The ``skip`` field from the results of the executed dbt node.\n status (Optional[Union[str,int]]): The status of the executed dbt node (model).\n execution_time (float): The execution duration (in seconds) of the dbt node (model).\n thread_id (str): The dbt thread identifier that executed the dbt node (model).\n step_timings (List[StepTiming]): The timings for each step in the executed dbt node\n (model).\n table (Optional[Dict]): Details about the table/view that is created from executing a\n `run_sql <https://docs.getdbt.com/reference/commands/rpc#executing-a-query>`_\n command on an dbt RPC server.\n """\n\n def __new__(\n cls,\n node: Dict[str, Any],\n error: Optional[str] = None,\n status: Optional[Union[str, int]] = None,\n execution_time: Optional[float] = None,\n thread_id: Optional[str] = None,\n step_timings: List[StepTiming] = None,\n table: Optional[Dict[str, Any]] = None,\n fail: Optional[Any] = None,\n warn: Optional[Any] = None,\n skip: Optional[Any] = None,\n ):\n step_timings = check.list_param(step_timings, "step_timings", of_type=StepTiming)\n return super().__new__(\n cls,\n check.dict_param(node, "node", key_type=str),\n check.opt_str_param(error, "error"),\n status,\n check.opt_float_param(execution_time, "execution_time"),\n check.opt_str_param(thread_id, "thread_id"),\n step_timings,\n check.opt_dict_param(table, "table"),\n fail,\n warn,\n skip,\n )\n\n[docs] @classmethod\n def from_dict(cls, d: Dict[str, Any]) -> "NodeResult":\n """Constructs an instance of :class:`NodeResult <dagster_dbt.NodeResult>` from a dictionary.\n\n Args:\n d (Dict[str, Any]): A dictionary with key-values to construct a 
:class:`NodeResult\n <dagster_dbt.NodeResult>`.\n\n Returns:\n NodeResult: An instance of :class:`NodeResult <dagster_dbt.NodeResult>`.\n """\n node = check.dict_elem(d, "node")\n error = check.opt_str_elem(d, "error")\n execution_time = check.float_elem(d, "execution_time")\n thread_id = check.opt_str_elem(d, "thread_id")\n check.list_elem(d, "timing")\n step_timings = [\n StepTiming(\n name=d["name"],\n started_at=parser.isoparse(d["started_at"]),\n completed_at=parser.isoparse(d["completed_at"]),\n )\n for d in check.is_list(d["timing"], of_type=Dict)\n ]\n table = check.opt_dict_elem(d, "table")\n\n return cls(\n step_timings=step_timings,\n node=node,\n error=error,\n execution_time=execution_time,\n thread_id=thread_id,\n table=table,\n )\n\n\n[docs]class DbtResult(namedtuple("_DbtResult", "logs results generated_at elapsed_time")):\n """The results of executing a dbt command.\n\n Note that users should not construct instances of this class directly. This class is intended to be\n constructed from the JSON output of dbt commands.\n\n Attributes:\n logs (List[Dict[str, Any]]): JSON log output from the dbt process.\n results (List[NodeResult]]): Details about each executed dbt node (model) in the run.\n generated_at (str): An ISO string timestamp of when the run result was generated by dbt.\n elapsed_time (float): The execution duration (in seconds) of the run.\n """\n\n def __new__(\n cls,\n logs: List[Dict[str, Any]],\n results: List[NodeResult],\n generated_at: str,\n elapsed_time: Optional[float] = None,\n ):\n return super().__new__(\n cls,\n check.list_param(logs, "logs", of_type=Dict),\n results,\n check.str_param(generated_at, "generated_at"),\n check.opt_float_param(elapsed_time, "elapsed_time"),\n )\n\n[docs] @classmethod\n def from_dict(cls, d: Dict[str, Any]) -> "DbtResult":\n """Constructs an instance of :class:`DbtResult <dagster_dbt.DbtResult>` from a dictionary.\n\n Args:\n d (Dict[str, Any]): A dictionary with key-values to construct a :class:`DbtResult\n <dagster_dbt.DbtResult>`.\n\n Returns:\n DbtResult: An instance of :class:`DbtResult <dagster_dbt.DbtResult>`.\n """\n check.list_elem(d, "logs")\n logs = check.is_list(d["logs"], of_type=Dict)\n check.list_elem(d, "results")\n results = [NodeResult.from_dict(d) for d in check.is_list(d["results"], of_type=Dict)]\n generated_at = check.str_elem(d, "generated_at")\n elapsed_time = check.float_elem(d, "elapsed_time")\n\n return cls(\n logs=logs, results=results, generated_at=generated_at, elapsed_time=elapsed_time,\n )\n\n def __len__(self) -> int:\n return len(self.results)\n
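NodeResult.from_dict parses each entry of the raw "timing" list into a StepTiming, whose duration property is a plain timedelta. A small invented example, with the NodeResult import assumed:

# Illustrative only: field names follow NodeResult.from_dict above; the values
# are invented. `NodeResult` is assumed to be importable from dagster_dbt.
from dagster_dbt import NodeResult  # assumed export

node_result = NodeResult.from_dict(
    {
        "node": {"unique_id": "model.my_project.my_model"},
        "execution_time": 1.5,
        "thread_id": "Thread-1",
        "timing": [
            {
                "name": "execute",
                "started_at": "2021-01-01T00:00:00Z",
                "completed_at": "2021-01-01T00:00:01Z",
            }
        ],
    }
)
print(node_result.step_timings[0].duration)  # 0:00:01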
\nfrom dagster import resource\nfrom google.cloud import bigquery\n\nfrom .configs import bq_resource_config\n\n\n[docs]@resource(\n config_schema=bq_resource_config(), description="Dagster resource for connecting to BigQuery"\n)\ndef bigquery_resource(context):\n return bigquery.Client(**context.resource_config)\n
\nimport hashlib\n\nfrom dagster import InputDefinition, List, Nothing, OutputDefinition, check, solid\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import EncryptionConfiguration, TimePartitioning\n\nfrom .configs import (\n define_bigquery_create_dataset_config,\n define_bigquery_delete_dataset_config,\n define_bigquery_load_config,\n define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n time_partitioning = cfg.get("time_partitioning")\n\n if destination_encryption_configuration is not None:\n cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n kms_key_name=destination_encryption_configuration\n )\n\n if time_partitioning is not None:\n cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n return cfg\n\n\n[docs]def bq_solid_for_queries(sql_queries):\n """\n Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n\n m = hashlib.sha1()\n for query in sql_queries:\n m.update(query.encode("utf-8"))\n name = "bq_solid_{hash}".format(hash=m.hexdigest()[:10])\n\n @solid(\n name=name,\n input_defs=[InputDefinition(_START, Nothing)],\n output_defs=[OutputDefinition(List[DataFrame])],\n config_schema=define_bigquery_query_config(),\n required_resource_keys={"bigquery"},\n tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n )\n def _solid(context): # pylint: disable=unused-argument\n query_job_config = _preprocess_config(context.solid_config.get("query_job_config", {}))\n\n # Retrieve results as pandas DataFrames\n results = []\n for sql_query in sql_queries:\n # We need to construct a new QueryJobConfig for each query.\n # See: https://bit.ly/2VjD6sl\n cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n context.log.info(\n "executing query %s with config: %s"\n % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n )\n results.append(\n context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n )\n\n return results\n\n return _solid\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n[docs]@solid(\n input_defs=[InputDefinition("paths", List[str])],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)\n\n\n[docs]@solid(\n input_defs=[InputDefinition("df", DataFrame)],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)\n\n\n[docs]@solid(\n input_defs=[InputDefinition("path", str)],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.solid_config.get("destination")\n load_job_config = _preprocess_config(context.solid_config.get("load_job_config", {}))\n cfg = 
LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n[docs]@solid(\n input_defs=[InputDefinition(_START, Nothing)],\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This solid encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.solid_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)\n\n\n[docs]@solid(\n input_defs=[InputDefinition(_START, Nothing)],\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This solid encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n (dataset, delete_contents, not_found_ok) = [\n context.solid_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )\n
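bq_solid_for_queries builds a uniquely named solid from a hash of its queries and expects a client under the "bigquery" resource key. A minimal wiring sketch; the query is a placeholder and the dagster_gcp exports are assumed.

# Hedged sketch: composing a generated BigQuery query solid with the
# bigquery_resource defined earlier. The query is a placeholder.
from dagster import ModeDefinition, pipeline
from dagster_gcp import bigquery_resource, bq_solid_for_queries  # assumed exports

select_one = bq_solid_for_queries(["SELECT 1 AS one"])

@pipeline(mode_defs=[ModeDefinition(resource_defs={"bigquery": bigquery_resource})])
def bq_pipeline():
    select_one()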
\nimport re\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\nfrom dagster.config import ConfigScalar, ConfigScalarKind, PostProcessingError\nfrom google.cloud.bigquery.job import (\n CreateDisposition,\n Encoding,\n QueryPriority,\n SchemaUpdateOption,\n SourceFormat,\n WriteDisposition,\n)\n\n\nclass BigQueryLoadSource(PyEnum):\n DataFrame = "DATA_FRAME"\n GCS = "GCS"\n File = "FILE"\n\n\nBQCreateDisposition = Enum(\n name="BQCreateDisposition",\n enum_values=[\n EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n EnumValue(CreateDisposition.CREATE_NEVER),\n ],\n)\n\nBQPriority = Enum(\n name="BQPriority",\n enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n name="BQSchemaUpdateOption",\n enum_values=[\n EnumValue(\n SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n description="Allow adding a nullable field to the schema.",\n ),\n EnumValue(\n SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n description="Allow relaxing a required field in the original schema to nullable.",\n ),\n ],\n)\n\nBQWriteDisposition = Enum(\n name="BQWriteDisposition",\n enum_values=[\n EnumValue(WriteDisposition.WRITE_APPEND),\n EnumValue(WriteDisposition.WRITE_EMPTY),\n EnumValue(WriteDisposition.WRITE_TRUNCATE),\n ],\n)\n\nBQEncoding = Enum(\n name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n name="BQSourceFormat",\n enum_values=[\n EnumValue(SourceFormat.AVRO),\n EnumValue(SourceFormat.CSV),\n EnumValue(SourceFormat.DATASTORE_BACKUP),\n EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n EnumValue(SourceFormat.ORC),\n EnumValue(SourceFormat.PARQUET),\n ],\n)\n\n\n# Project names are permitted to have alphanumeric, dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n """Datasets must be of form "project.dataset" or "dataset"\n """\n return re.match(\n # regex matches: project.dataset -- OR -- dataset\n r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n config_value,\n )\n\n\ndef _is_valid_table(config_value):\n """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n date-partition suffix\n """\n return re.match(\n r"^"\n + RE_PROJECT # project\n + r"\\." # .\n + RE_DS_TABLE # dataset\n + r"\\." # .\n + RE_DS_TABLE # table\n + RE_PARTITION_SUFFIX # date partition suffix\n + r"$|^" # -- OR --\n + RE_DS_TABLE # dataset\n + r"\\." 
# .\n + RE_DS_TABLE # table\n + RE_PARTITION_SUFFIX # date partition suffix\n + r"$",\n config_value,\n )\n\n\nclass _Dataset(ConfigScalar):\n def __init__(self):\n super(_Dataset, self).__init__(\n key=type(self).__name__,\n given_name=type(self).__name__,\n scalar_kind=ConfigScalarKind.STRING,\n )\n\n def post_process(self, value):\n if not _is_valid_dataset(value):\n raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n return value\n\n\nclass _Table(ConfigScalar):\n def __init__(self):\n super(_Table, self).__init__(\n key=type(self).__name__,\n given_name=type(self).__name__,\n scalar_kind=ConfigScalarKind.STRING,\n )\n\n def post_process(self, value):\n if not _is_valid_table(value):\n raise PostProcessingError(\n (\n 'Tables must be of the form "project.dataset.table" or "dataset.table" '\n "with optional date-partition suffix"\n )\n )\n\n return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n\n
\nimport time\nfrom contextlib import contextmanager\n\nfrom dagster import resource\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocResource:\n """Builds a client to the dataproc API."""\n\n def __init__(self, config):\n # Use Application Default Credentials to check the\n # GOOGLE_APPLICATION_CREDENTIALS environment variable\n # for the location of the service account key file.\n credentials = GoogleCredentials.get_application_default()\n\n # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n # cache_discovery=False configuration below\n self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n self.config = config\n\n (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n )\n\n @property\n def dataproc_clusters(self):\n return (\n # Google APIs dynamically genned, so pylint pukes\n # pylint: disable=no-member\n self.dataproc.projects()\n .regions()\n .clusters()\n )\n\n @property\n def dataproc_jobs(self):\n return (\n # Google APIs dynamically genned, so pylint pukes\n # pylint: disable=no-member\n self.dataproc.projects()\n .regions()\n .jobs()\n )\n\n def create_cluster(self):\n (\n self.dataproc_clusters.create(\n projectId=self.project_id,\n region=self.region,\n body={\n "projectId": self.project_id,\n "clusterName": self.cluster_name,\n "config": self.cluster_config,\n },\n ).execute()\n )\n\n def iter_fn():\n # TODO: Add logging\n # See: https://bit.ly/2UW5JaN\n cluster = self.get_cluster()\n return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n done = DataprocResource._iter_and_sleep_until_ready(iter_fn)\n if not done:\n cluster = self.get_cluster()\n raise DataprocError(\n "Could not provision cluster -- status: %s" % str(cluster["status"])\n )\n\n def get_cluster(self):\n return self.dataproc_clusters.get(\n projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n ).execute()\n\n def delete_cluster(self):\n return self.dataproc_clusters.delete(\n projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n ).execute()\n\n def submit_job(self, job_details):\n return self.dataproc_jobs.submit(\n projectId=self.project_id, region=self.region, body=job_details\n ).execute()\n\n def get_job(self, job_id):\n return self.dataproc_jobs.get(\n projectId=self.project_id, region=self.region, jobId=job_id\n ).execute()\n\n def wait_for_job(self, job_id):\n """This method polls job status every 5 seconds\n """\n # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n def iter_fn():\n # See: https://bit.ly/2Lg2tHr\n result = self.get_job(job_id)\n\n # Handle exceptions\n if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n raise DataprocError("Job error: %s" % str(result["status"]))\n\n if result["status"]["state"] == "DONE":\n return True\n\n return False\n\n done = DataprocResource._iter_and_sleep_until_ready(iter_fn)\n if not done:\n job = self.get_job(job_id)\n raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n @staticmethod\n def _iter_and_sleep_until_ready(\n callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n ):\n """Iterates and sleeps until callable_fn 
returns true\n """\n # Wait for cluster ready state\n ready, curr_iter = False, 0\n max_iter = max_wait_time_sec / iter_time\n while not ready and curr_iter < max_iter:\n ready = callable_fn()\n time.sleep(iter_time)\n curr_iter += 1\n\n # Will return false if ran up to max_iter without success\n return ready\n\n @contextmanager\n def cluster_context_manager(self):\n """This context manager gives syntactic sugar so you can run:\n\n with context.resources.dataproc.cluster as cluster:\n # do stuff...\n """\n self.create_cluster()\n try:\n yield self\n finally:\n self.delete_cluster()\n\n\n[docs]@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocResource(context.resource_config)\n
\nfrom dagster import Bool, Field, solid\nfrom dagster.seven import json\n\nfrom .configs import define_dataproc_submit_job_config\n\n\n[docs]@solid(\n required_resource_keys={"dataproc"},\n config_schema={\n "job_config": define_dataproc_submit_job_config(),\n "job_scoped_cluster": Field(\n Bool,\n description="whether to create a cluster or use an existing cluster",\n is_required=False,\n default_value=True,\n ),\n },\n)\ndef dataproc_solid(context):\n job_config = context.solid_config["job_config"]\n\n context.log.info("submitting job with config: %s" % str(json.dumps(job_config)))\n\n if context.solid_config["job_scoped_cluster"]:\n # Cluster context manager, creates and then deletes cluster\n with context.resources.dataproc.cluster_context_manager() as cluster:\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = cluster.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info("Submitted job ID {}".format(job_id))\n cluster.wait_for_job(job_id)\n else:\n # Submit to an existing cluster\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = context.resources.dataproc.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info("Submitted job ID {}".format(job_id))\n context.resources.dataproc.wait_for_job(job_id)\n
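dataproc_solid only works with the dataproc resource bound to it. A schematic attachment follows; executing it additionally needs run_config satisfying the Dataproc config schemas referenced above, and the dagster_gcp exports are assumed.

# Hedged sketch: attaching dataproc_resource to dataproc_solid. Real runs need
# run_config matching define_dataproc_create_cluster_config and
# define_dataproc_submit_job_config.
from dagster import ModeDefinition, pipeline
from dagster_gcp import dataproc_resource, dataproc_solid  # assumed exports

@pipeline(mode_defs=[ModeDefinition(resource_defs={"dataproc": dataproc_resource})])
def dataproc_pipeline():
    dataproc_solid()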
\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nfrom dagster import check, usable_as_dagster_type\nfrom dagster.core.storage.file_manager import (\n FileHandle,\n FileManager,\n TempfileManager,\n check_file_like_obj,\n)\nfrom google.cloud import storage\n\n\n[docs]@usable_as_dagster_type\nclass GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return "gs://{bucket}/{key}".format(bucket=self.gcs_bucket, key=self.gcs_key)\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.get_bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n with open(self._get_local_path(file_handle), mode) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.get_bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._gcs_base_key, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
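GCSFileManager is normally obtained through the gcs_file_manager resource shown later, but its write/read round trip can be sketched directly from the methods above; the bucket name, prefix, and import path are placeholders/assumptions, and ambient Google Cloud credentials are assumed.

# Hedged sketch of the GCSFileManager round trip defined above.
# "my-bucket"/"uploads" are placeholders; storage.Client() uses ambient credentials.
from google.cloud import storage

from dagster_gcp.gcs.file_manager import GCSFileManager  # assumed module path

manager = GCSFileManager(client=storage.Client(), gcs_bucket="my-bucket", gcs_base_key="uploads")
handle = manager.write_data(b"hello")      # uploads the bytes, returns a GCSFileHandle
print(handle.gcs_path)                     # gs://my-bucket/uploads/<uuid>
assert manager.read_data(handle) == b"hello"
manager.delete_local_temp()                # clean up the local download cache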
\nimport pickle\n\nfrom dagster import Field, IOManager, StringSource, check, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\nfrom dagster.utils.backoff import backoff\nfrom google.api_core.exceptions import TooManyRequests\nfrom google.cloud import storage\n\nDEFAULT_LEASE_DURATION = 60 # One minute\n\n\nclass PickledObjectGCSIOManager(IOManager):\n def __init__(self, bucket, client=None, prefix="dagster"):\n self.bucket = check.str_param(bucket, "bucket")\n self.client = client or storage.Client()\n self.bucket_obj = self.client.get_bucket(bucket)\n check.invariant(self.bucket_obj.exists())\n self.prefix = check.str_param(prefix, "prefix")\n\n def _get_path(self, context):\n run_id, step_key, name = context.get_run_scoped_output_identifier()\n return "/".join([self.prefix, "storage", run_id, "files", step_key, name,])\n\n def _rm_object(self, key):\n check.str_param(key, "key")\n check.param_invariant(len(key) > 0, "key")\n\n if self.bucket_obj.blob(key).exists():\n self.bucket_obj.blob(key).delete()\n\n def _has_object(self, key):\n check.str_param(key, "key")\n check.param_invariant(len(key) > 0, "key")\n blobs = self.client.list_blobs(self.bucket, prefix=key)\n return len(list(blobs)) > 0\n\n def _uri_for_key(self, key):\n check.str_param(key, "key")\n return "gs://" + self.bucket + "/" + "{key}".format(key=key)\n\n def load_input(self, context):\n key = self._get_path(context.upstream_output)\n context.log.debug(f"Loading GCS object from: {self._uri_for_key(key)}")\n\n bytes_obj = self.bucket_obj.blob(key).download_as_bytes()\n obj = pickle.loads(bytes_obj)\n\n return obj\n\n def handle_output(self, context, obj):\n key = self._get_path(context)\n context.log.debug(f"Writing GCS object at: {self._uri_for_key(key)}")\n\n if self._has_object(key):\n context.log.warning(f"Removing existing GCS key: {key}")\n self._rm_object(key)\n\n pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n backoff(\n self.bucket_obj.blob(key).upload_from_string,\n args=[pickled_obj],\n retry_on=(TooManyRequests,),\n )\n\n\n[docs]@io_manager(\n config_schema={\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Attach this resource definition to a :py:class:`~dagster.ModeDefinition`\n in order to make it available to your pipeline:\n\n .. code-block:: python\n\n pipeline_def = PipelineDefinition(\n mode_defs=[\n ModeDefinition(\n resource_defs={'io_manager': gcs_pickle_io_manager, 'gcs': gcs_resource, ...},\n ), ...\n ], ...\n )\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n gcs_bucket: my-cool-bucket\n gcs_prefix: good/prefix-for-files-\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n init_context.resource_config["gcs_bucket"],\n client,\n init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager\n
\nfrom dagster import Field, Noneable, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\nfrom google.cloud import storage\n\nfrom .file_manager import GCSFileManager\n\nGCS_CLIENT_CONFIG = {\n "project": Field(Noneable(StringSource), is_required=False, description="Project name")\n}\n\n\n[docs]@resource(\n GCS_CLIENT_CONFIG, description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context):\n return _gcs_client_from_config(init_context.resource_config)\n\n\n[docs]@resource(\n merge_dicts(\n GCS_CLIENT_CONFIG,\n {\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n \n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n gcs_client = _gcs_client_from_config(context.resource_config)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=context.resource_config["gcs_bucket"],\n gcs_base_key=context.resource_config["gcs_prefix"],\n )\n\n\ndef _gcs_client_from_config(config):\n """\n Args:\n config: A configuration containing the fields in GCS_CLIENT_CONFIG.\n\n Returns: A GCS client.\n """\n project = config.get("project", None)\n return storage.client.Client(project=project)\n
\nfrom dagster import Field, StringSource, intermediate_storage\nfrom dagster.core.storage.system_storage import (\n build_intermediate_storage_from_object_store,\n fs_intermediate_storage,\n mem_intermediate_storage,\n)\n\nfrom .object_store import GCSObjectStore\n\n\n[docs]@intermediate_storage(\n name="gcs",\n is_persistent=True,\n config_schema={\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"gcs"},\n)\ndef gcs_intermediate_storage(init_context):\n client = init_context.resources.gcs\n gcs_bucket = init_context.intermediate_storage_config["gcs_bucket"]\n gcs_prefix = init_context.intermediate_storage_config["gcs_prefix"]\n object_store = GCSObjectStore(gcs_bucket, client=client)\n\n def root_for_run_id(r_id):\n return object_store.key_for_paths([gcs_prefix, "storage", r_id])\n\n return build_intermediate_storage_from_object_store(\n object_store, init_context, root_for_run_id=root_for_run_id\n )\n\n\ngcs_plus_default_intermediate_storage_defs = [\n mem_intermediate_storage,\n fs_intermediate_storage,\n gcs_intermediate_storage,\n]\n
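The GCS intermediate storage above is selected per mode. A schematic attachment follows; the import paths and the intermediate_storage_defs parameter are assumptions about this dagster version's API.

# Hedged sketch: making the GCS intermediate storage selectable on a mode.
# The gcs resource must also be bound, since the storage declares
# required_resource_keys={"gcs"}. Import paths are assumed.
from dagster import ModeDefinition, pipeline, solid
from dagster_gcp.gcs import gcs_plus_default_intermediate_storage_defs, gcs_resource

@solid
def emit_value(_):
    return 42

@pipeline(
    mode_defs=[
        ModeDefinition(
            resource_defs={"gcs": gcs_resource},
            intermediate_storage_defs=gcs_plus_default_intermediate_storage_defs,
        )
    ]
)
def gcs_storage_pipeline():
    emit_value()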
\nimport datetime\n\nimport great_expectations as ge\nfrom dagster import (\n EventMetadataEntry,\n ExpectationResult,\n InputDefinition,\n Noneable,\n Output,\n OutputDefinition,\n StringSource,\n check,\n resource,\n solid,\n)\nfrom dagster_pandas import DataFrame\nfrom great_expectations.core import convert_to_json_serializable\nfrom great_expectations.render.page_renderer_util import (\n render_multiple_validation_result_pages_markdown,\n)\n\n\n@resource(config_schema={"ge_root_dir": Noneable(StringSource)})\ndef ge_data_context(context):\n if context.resource_config["ge_root_dir"] is None:\n yield ge.data_context.DataContext()\n else:\n yield ge.data_context.DataContext(context_root_dir=context.resource_config["ge_root_dir"])\n\n\n[docs]def ge_validation_solid_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """\n Generates solids for interacting with GE.\n\n Args:\n name (str): the name of the solid\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to None,\n which generates an ephemeral validator.\n If you want to save data docs, use 'action_list_operator'.\n See https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the\n solid. Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. Defaults to `{"dataset": dataset}`,\n where `dataset` is the input to the generated solid.\n\n Returns:\n A solid that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n\n """\n\n check.str_param(datasource_name, "datasource_name")\n check.str_param(suite_name, "suite_name")\n check.opt_str_param(validation_operator_name, "validation_operator_name")\n\n batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n @solid(\n name=name,\n input_defs=[InputDefinition("dataset", input_dagster_type)],\n output_defs=[\n OutputDefinition(\n dagster_type=dict,\n description="""\n This solid yields an expectationResult with a structured dict of metadata from the GE suite,\n as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by expectation\n results/details.\n """,\n )\n ],\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def ge_validation_solid(context, dataset):\n data_context = context.resources.ge_data_context\n if validation_operator_name is not None:\n validation_operator = validation_operator_name\n else:\n data_context.add_validation_operator(\n "ephemeral_validation",\n {"class_name": "ActionListValidationOperator", "action_list": []},\n )\n validation_operator = "ephemeral_validation"\n suite = data_context.get_expectation_suite(suite_name)\n final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n if "datasource" in batch_kwargs:\n context.log.warning(\n "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n "parameter of the solid factory instead."\n )\n final_batch_kwargs["datasource"] = 
datasource_name\n batch = data_context.get_batch(final_batch_kwargs, suite)\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = data_context.run_validation_operator(\n validation_operator, assets_to_validate=[batch], run_id=run_id\n )\n res = convert_to_json_serializable(results.list_validation_results())[0]\n md_str = render_multiple_validation_result_pages_markdown(\n validation_operator_result=results, run_info_at_end=True,\n )\n meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")\n yield ExpectationResult(\n success=res["success"], metadata_entries=[meta_stats,],\n )\n yield Output(res)\n\n return ge_validation_solid\n
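A hedged sketch of using the factory above: the datasource and suite names are placeholders tied to a project's great_expectations.yml, and the dagster_ge import paths are assumptions.

# Hedged sketch: generating a validation solid and binding the ge_data_context
# resource. Running it also needs run_config supplying
# resources.ge_data_context.config.ge_root_dir (or None for the default context).
import pandas as pd
from dagster import ModeDefinition, pipeline, solid
from dagster_ge import ge_validation_solid_factory  # assumed export
from dagster_ge.factory import ge_data_context      # assumed module path

validate_payments = ge_validation_solid_factory(
    name="validate_payments",
    datasource_name="my_datasource",
    suite_name="payments.warning",
)

@solid
def load_payments(_):
    return pd.DataFrame({"amount": [1.0, 2.5]})

@pipeline(mode_defs=[ModeDefinition(resource_defs={"ge_data_context": ge_data_context})])
def payments_validation_pipeline():
    validate_payments(load_payments())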
\nimport time\nfrom datetime import datetime\n\nimport jwt\nimport requests\nfrom dagster import Field, IntSource, StringSource, resource\n\n\ndef to_seconds(dt):\n return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubResource:\n def __init__(self, client, app_id, app_private_rsa_key, default_installation_id):\n self.client = client\n self.app_private_rsa_key = app_private_rsa_key\n self.app_id = app_id\n self.default_installation_id = default_installation_id\n self.installation_tokens = {}\n self.app_token = {}\n\n def __set_app_token(self):\n # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n # needing to self-sign a JWT\n now = int(time.time())\n # JWT expiration time (10 minute maximum)\n expires = now + (10 * 60)\n encoded_token = jwt.encode(\n {\n # issued at time\n "iat": now,\n # JWT expiration time\n "exp": expires,\n # GitHub App's identifier\n "iss": self.app_id,\n },\n self.app_private_rsa_key,\n algorithm="RS256",\n )\n self.app_token = {\n "value": encoded_token,\n "expires": expires,\n }\n\n def __check_app_token(self):\n if ("expires" not in self.app_token) or (\n self.app_token["expires"] < (int(time.time()) + 60)\n ):\n self.__set_app_token()\n\n def get_installations(self, headers=None):\n if headers is None:\n headers = {}\n self.__check_app_token()\n headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n request = self.client.get("https://api.github.com/app/installations", headers=headers,)\n request.raise_for_status()\n return request.json()\n\n def __set_installation_token(self, installation_id, headers=None):\n if headers is None:\n headers = {}\n self.__check_app_token()\n headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n request = requests.post(\n "https://api.github.com/app/installations/{}/access_tokens".format(installation_id),\n headers=headers,\n )\n request.raise_for_status()\n auth = request.json()\n self.installation_tokens[installation_id] = {\n "value": auth["token"],\n "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n }\n\n def __check_installation_tokens(self, installation_id):\n if (installation_id not in self.installation_tokens) or (\n self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n ):\n self.__set_installation_token(installation_id)\n\n def execute(self, query, variables, headers=None, installation_id=None):\n if headers is None:\n headers = {}\n if installation_id is None:\n installation_id = self.default_installation_id\n self.__check_installation_tokens(installation_id)\n headers["Authorization"] = "token {}".format(\n self.installation_tokens[installation_id]["value"]\n )\n request = requests.post(\n "https://api.github.com/graphql",\n json={"query": query, "variables": variables},\n headers=headers,\n )\n request.raise_for_status()\n return request.json()\n\n def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n if installation_id is None:\n installation_id = self.default_installation_id\n res = self.execute(\n query="""\n query get_repo_id($repo_name: String!, $repo_owner: String!) 
{\n repository(name: $repo_name, owner: $repo_owner) {\n id\n }\n }\n """,\n variables={"repo_name": repo_name, "repo_owner": repo_owner},\n installation_id=installation_id,\n )\n\n return self.execute(\n query="""\n mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n createIssue(input: {\n repositoryId: $id,\n title: $title,\n body: $body\n }) {\n clientMutationId,\n issue {\n body\n title\n url\n }\n }\n }\n """,\n variables={"id": res["data"]["repository"]["id"], "title": title, "body": body,},\n installation_id=installation_id,\n )\n\n\n[docs]@resource(\n config_schema={\n "github_app_id": Field(\n IntSource,\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n ),\n "github_app_private_rsa_key": Field(\n StringSource,\n description="Github Application Private RSA key text, for more info see https://developer.github.com/apps/",\n ),\n "github_installation_id": Field(\n IntSource,\n is_required=False,\n description="Github Application Installation ID, for more info see https://developer.github.com/apps/",\n ),\n },\n description="This resource is for connecting to Github",\n)\ndef github_resource(context):\n return GithubResource(\n client=requests.Session(),\n app_id=context.resource_config["github_app_id"],\n app_private_rsa_key=context.resource_config["github_app_private_rsa_key"],\n default_installation_id=context.resource_config["github_installation_id"],\n )\n
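The create_issue method above can be reached from a solid through the github resource. A short sketch; the repository, owner, and issue text are placeholders, and the app credentials come from run_config matching github_resource's config schema.

# Hedged sketch: filing an issue through the github resource from a solid.
from dagster import ModeDefinition, pipeline, solid
from dagster_github import github_resource  # assumed top-level export

@solid(required_resource_keys={"github"})
def file_tracking_issue(context):
    context.resources.github.create_issue(
        repo_name="my-repo",
        repo_owner="my-org",
        title="Nightly pipeline failed",
        body="See the Dagster run logs for details.",
    )

@pipeline(mode_defs=[ModeDefinition(resource_defs={"github": github_resource})])
def github_issue_pipeline():
    file_tracking_issue()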
\nimport sys\nimport weakref\n\nimport kubernetes\nfrom dagster import (\n DagsterInvariantViolationError,\n EventMetadataEntry,\n Field,\n Noneable,\n StringSource,\n check,\n)\nfrom dagster.cli.api import ExecuteRunArgs\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.host_representation import (\n ExternalPipeline,\n GrpcServerRepositoryLocationHandle,\n GrpcServerRepositoryLocationOrigin,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.launcher import RunLauncher\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .job import (\n DagsterK8sJobConfig,\n construct_dagster_k8s_job,\n get_job_name_from_run_id,\n get_user_defined_k8s_config,\n)\nfrom .utils import delete_job\n\n\n[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each pipeline run.\n\n Encapsulates each pipeline run in a separate, isolated invocation of ``dagster-graphql``.\n\n You may configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: pipeline_run_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n As always when using a :py:class:`~dagster.serdes.ConfigurableClass`, the values\n under the ``config`` key of this YAML block will be passed to the constructor. The full list\n of acceptable values is given below by the constructor args.\n\n Args:\n service_account_name (str): The name of the Kubernetes service account under which to run\n the Job.\n job_image (Optional[str]): The ``name`` of the image to use for the Job's Dagster container.\n This image will be run with the command\n ``dagster api execute_run``.\n When using user code deployments, the image should not be specified.\n instance_config_map (str): The ``name`` of an existing Volume to mount into the pod in\n order to provide a ConfigMap for the Dagster instance. This Volume should contain a\n ``dagster.yaml`` with appropriate values for run storage, event log storage, etc.\n postgres_password_secret (str): The name of the Kubernetes Secret where the postgres\n password can be retrieved. Will be mounted and supplied as an environment variable to\n the Job Pod.\n dagster_home (str): The location of DAGSTER_HOME in the Job container; this is where the\n ``dagster.yaml`` file will be mounted from the instance ConfigMap specified above.\n load_incluster_config (Optional[bool]): Set this value if you are running the launcher\n within a k8s cluster. If ``True``, we assume the launcher is running within the target\n cluster and load config using ``kubernetes.config.load_incluster_config``. Otherwise,\n we will use the k8s config specified in ``kubeconfig_file`` (using\n ``kubernetes.config.load_kube_config``) or fall back to the default kubeconfig. Default:\n ``True``.\n kubeconfig_file (Optional[str]): The kubeconfig file from which to load config. 
Defaults to\n None (using the default kubeconfig).\n image_pull_secrets (Optional[List[Dict[str, str]]]): Optionally, a list of dicts, each of\n which corresponds to a Kubernetes ``LocalObjectReference`` (e.g.,\n ``{'name': 'myRegistryName'}``). This allows you to specify the ```imagePullSecrets`` on\n a pod basis. Typically, these will be provided through the service account, when needed,\n and you will not need to pass this argument.\n See:\n https://kubernetes.io/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod\n and https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#podspec-v1-core.\n image_pull_policy (Optional[str]): Allows the image pull policy to be overridden, e.g. to\n facilitate local testing with `kind <https://kind.sigs.k8s.io/>`_. Default:\n ``"Always"``. See: https://kubernetes.io/docs/concepts/containers/images/#updating-images.\n job_namespace (Optional[str]): The namespace into which to launch new jobs. Note that any\n other Kubernetes resources the Job requires (such as the service account) must be\n present in this namespace. Default: ``"default"``\n env_config_maps (Optional[List[str]]): A list of custom ConfigMapEnvSource names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container\n env_secrets (Optional[List[str]]): A list of custom Secret names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret,\n dagster_home,\n job_image=None,\n image_pull_policy="Always",\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data=None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n k8s_client_batch_api=None,\n k8s_client_core_api=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._batch_api = k8s_client_batch_api or kubernetes.client.BatchV1Api()\n self._core_api = k8s_client_core_api or kubernetes.client.CoreV1Api()\n\n self.job_config = None\n self._job_image = check.opt_str_param(job_image, "job_image")\n self._dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.str_param(image_pull_policy, "image_pull_policy")\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self._instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self._postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n 
)\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._instance_ref = None\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n job_cfg = DagsterK8sJobConfig.config_type()\n\n run_launcher_extra_cfg = {\n "job_namespace": Field(StringSource, is_required=False, default_value="default"),\n "load_incluster_config": Field(bool, is_required=False, default_value=True),\n "kubeconfig_file": Field(Noneable(str), is_required=False, default_value=None),\n }\n return merge_dicts(job_cfg, run_launcher_extra_cfg)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def _instance(self):\n return self._instance_ref() if self._instance_ref else None\n\n def _get_static_job_config(self):\n if self.job_config:\n return self.job_config\n else:\n self.job_config = DagsterK8sJobConfig(\n job_image=check.str_param(self._job_image, "job_image"),\n dagster_home=check.str_param(self._dagster_home, "dagster_home"),\n image_pull_policy=check.str_param(self._image_pull_policy, "image_pull_policy"),\n image_pull_secrets=check.opt_list_param(\n self._image_pull_secrets, "image_pull_secrets", of_type=dict\n ),\n service_account_name=check.str_param(\n self._service_account_name, "service_account_name"\n ),\n instance_config_map=check.str_param(\n self._instance_config_map, "instance_config_map"\n ),\n postgres_password_secret=check.str_param(\n self._postgres_password_secret, "postgres_password_secret"\n ),\n env_config_maps=check.opt_list_param(\n self._env_config_maps, "env_config_maps", of_type=str\n ),\n env_secrets=check.opt_list_param(self._env_secrets, "env_secrets", of_type=str),\n )\n return self.job_config\n\n def _get_grpc_job_config(self, job_image):\n return DagsterK8sJobConfig(\n job_image=check.str_param(job_image, "job_image"),\n dagster_home=check.str_param(self._dagster_home, "dagster_home"),\n image_pull_policy=check.str_param(self._image_pull_policy, "image_pull_policy"),\n image_pull_secrets=check.opt_list_param(\n self._image_pull_secrets, "image_pull_secrets", of_type=dict\n ),\n service_account_name=check.str_param(\n self._service_account_name, "service_account_name"\n ),\n instance_config_map=check.str_param(self._instance_config_map, "instance_config_map"),\n postgres_password_secret=check.str_param(\n self._postgres_password_secret, "postgres_password_secret"\n ),\n env_config_maps=check.opt_list_param(\n self._env_config_maps, "env_config_maps", of_type=str\n ),\n env_secrets=check.opt_list_param(self._env_secrets, "env_secrets", of_type=str),\n )\n\n def initialize(self, instance):\n check.inst_param(instance, "instance", DagsterInstance)\n # Store a weakref to avoid a circular reference / enable GC\n self._instance_ref = weakref.ref(instance)\n\n def launch_run(self, instance, run, external_pipeline):\n check.inst_param(run, "run", PipelineRun)\n check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)\n\n job_name = "dagster-run-{}".format(run.run_id)\n pod_name = job_name\n\n user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))\n\n pipeline_origin = None\n job_config = None\n if isinstance(\n external_pipeline.get_external_origin().external_repository_origin.repository_location_origin,\n GrpcServerRepositoryLocationOrigin,\n ):\n if 
self._job_image:\n raise DagsterInvariantViolationError(\n "Cannot specify job_image in run launcher config when loading pipeline "\n "from GRPC server."\n )\n\n repository_location_handle = (\n external_pipeline.repository_handle.repository_location_handle\n )\n\n if not isinstance(repository_location_handle, GrpcServerRepositoryLocationHandle):\n raise DagsterInvariantViolationError(\n "Expected RepositoryLocationHandle to be of type "\n "GrpcServerRepositoryLocationHandle but found type {}".format(\n type(repository_location_handle)\n )\n )\n\n repository_name = external_pipeline.repository_handle.repository_name\n\n repository_origin = repository_location_handle.reload_repository_python_origin(\n repository_name\n )\n\n job_image = repository_origin.container_image\n\n pipeline_origin = PipelinePythonOrigin(\n pipeline_name=external_pipeline.name, repository_origin=repository_origin\n )\n\n job_config = self._get_grpc_job_config(job_image)\n else:\n pipeline_origin = external_pipeline.get_python_origin()\n job_config = self._get_static_job_config()\n\n input_json = serialize_dagster_namedtuple(\n ExecuteRunArgs(\n pipeline_origin=pipeline_origin, pipeline_run_id=run.run_id, instance_ref=None,\n )\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=["dagster", "api", "execute_run", input_json],\n job_name=job_name,\n pod_name=pod_name,\n component="run_coordinator",\n user_defined_k8s_config=user_defined_k8s_config,\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=self.job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run_coordinator job launched",\n run,\n EngineEventData(\n [\n EventMetadataEntry.text(job_name, "Kubernetes Job name"),\n EventMetadataEntry.text(self.job_namespace, "Kubernetes Namespace"),\n EventMetadataEntry.text(run.run_id, "Run ID"),\n ]\n ),\n cls=self.__class__,\n )\n return run\n\n # https://github.com/dagster-io/dagster/issues/2741\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n if not pipeline_run:\n return False\n if pipeline_run.status != PipelineRunStatus.STARTED:\n return False\n return True\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n can_terminate = self.can_terminate(run_id)\n if not can_terminate:\n self._instance.report_engine_event(\n message="Unable to terminate pipeline; can_terminate returned {}".format(\n can_terminate\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(run_id)\n\n try:\n termination_result = delete_job(job_name=job_name, namespace=self.job_namespace)\n if termination_result:\n self._instance.report_engine_event(\n message="Pipeline was terminated successfully.",\n pipeline_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Pipeline was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception: # pylint: disable=broad-except\n self._instance.report_engine_event(\n message="Pipeline was not terminated successfully; encountered error in delete_job",\n pipeline_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n
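In production the launcher is configured through the ``run_launcher`` block of ``dagster.yaml`` as shown in the docstring above; the sketch below constructs it directly (for example in a test against a local cluster) using only the constructor arguments documented above. All values are placeholders.

.. code-block:: python

    from dagster_k8s.launcher import K8sRunLauncher

    # Illustrative only; the kubeconfig path and names are placeholders.
    launcher = K8sRunLauncher(
        service_account_name='pipeline_run_service_account',
        instance_config_map='dagster-instance',
        postgres_password_secret='dagster-postgresql-secret',
        dagster_home='/opt/dagster/dagster_home',
        job_image='my_project/dagster_image:latest',
        job_namespace='dagster',
        load_incluster_config=False,          # load config from a kubeconfig file instead
        kubeconfig_file='/path/to/kubeconfig',
        env_config_maps=['dagster-pipeline-env'],
        env_secrets=['dagster-aws-credentials'],
    )

When ``load_incluster_config`` is left at its default of ``True``, ``kubeconfig_file`` must be omitted, mirroring the invariant checked in ``__init__`` above.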
\nimport time\n\nimport kubernetes\nfrom dagster import DagsterInstance, Field, Noneable, StringSource, check\nfrom dagster.core.host_representation import ExternalSchedule\nfrom dagster.core.scheduler import DagsterSchedulerError, Scheduler\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import merge_dicts\nfrom dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_k8s_job_name\n\n\n[docs]class K8sScheduler(Scheduler, ConfigurableClass):\n """Scheduler implementation on top of Kubernetes CronJob.\n\n Enable this scheduler by adding it to your dagster.yaml, or by configuring the scheduler\n section of the Helm chart\n https://github.com/dagster-io/dagster/tree/master/helm"""\n\n def __init__(\n self,\n dagster_home,\n service_account_name,\n instance_config_map,\n postgres_password_secret,\n job_image,\n load_incluster_config=True,\n scheduler_namespace="default",\n image_pull_policy="Always",\n image_pull_secrets=None,\n kubeconfig_file=None,\n inst_data=None,\n env_config_maps=None,\n env_secrets=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api = kubernetes.client.BatchV1beta1Api()\n self._namespace = check.str_param(scheduler_namespace, "scheduler_namespace")\n self.grace_period_seconds = 5 # This should be passed in via config\n\n self.job_config = DagsterK8sJobConfig(\n job_image=check.str_param(job_image, "job_image"),\n dagster_home=check.str_param(dagster_home, "dagster_home"),\n image_pull_policy=check.str_param(image_pull_policy, "image_pull_policy"),\n image_pull_secrets=check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n ),\n service_account_name=check.str_param(service_account_name, "service_account_name"),\n instance_config_map=check.str_param(instance_config_map, "instance_config_map"),\n postgres_password_secret=check.str_param(\n postgres_password_secret, "postgres_password_secret"\n ),\n env_config_maps=check.opt_list_param(env_config_maps, "env_config_maps", of_type=str),\n env_secrets=check.opt_list_param(env_secrets, "env_secrets", of_type=str),\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n job_cfg = DagsterK8sJobConfig.config_type()\n\n scheduler_extra_cfg = {\n "scheduler_namespace": Field(StringSource, is_required=True),\n "load_incluster_config": Field(bool, is_required=False, default_value=True),\n "kubeconfig_file": Field(Noneable(StringSource), is_required=False, default_value=None),\n }\n return merge_dicts(job_cfg, scheduler_extra_cfg)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n def debug_info(self):\n return "Running K8s CronJob(s):\\n{jobs}\\n".format(\n jobs="\\n".join([str(job) for job in self.get_all_cron_jobs()])\n )\n\n def wipe(self, instance):\n # Note: This method deletes schedules from ALL repositories\n check.inst_param(instance, "instance", DagsterInstance)\n\n self._api.delete_collection_namespaced_cron_job(namespace=self._namespace)\n time.sleep(self.grace_period_seconds)\n\n # Verify that no cron jobs are running\n running_cron_job_count 
= len(self.get_all_cron_jobs())\n if running_cron_job_count != 0:\n raise DagsterSchedulerError(\n "Attempted to delete all K8s CronJobs but failed. There are "\n "still {} running schedules".format(running_cron_job_count)\n )\n\n def _job_template(self, external_schedule):\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n local_target = external_schedule.get_external_origin()\n\n job_config = self.job_config\n\n external_schedule_name = external_schedule.name\n job_name = get_k8s_job_name(external_schedule_name)\n pod_name = job_name\n\n job_template = construct_dagster_k8s_job(\n job_config=job_config,\n args=[\n "dagster",\n "api",\n "launch_scheduled_execution",\n "/tmp/launch_scheduled_execution_output", # https://bugs.python.org/issue20074 prevents using /dev/stdout\n "--schedule_name",\n external_schedule_name,\n ]\n + local_target.get_repo_cli_args().split(" "),\n job_name=job_name,\n pod_name=pod_name,\n component="scheduled_job",\n )\n return job_template\n\n def _start_cron_job(self, external_schedule):\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n job_template = self._job_template(external_schedule)\n\n cron_job_spec = kubernetes.client.V1beta1CronJobSpec(\n schedule=external_schedule.cron_schedule, job_template=job_template\n )\n\n schedule_origin_id = external_schedule.get_external_origin_id()\n cron_job = kubernetes.client.V1beta1CronJob(\n spec=cron_job_spec, metadata={"name": schedule_origin_id}\n )\n\n existing_cron_job = self.get_cron_job(schedule_origin_id=schedule_origin_id)\n if existing_cron_job:\n # patch_namespaced_cron_job will cause the containers array to be additive\n # https://blog.atomist.com/kubernetes-apply-replace-patch/\n self._api.replace_namespaced_cron_job(\n name=schedule_origin_id, body=cron_job, namespace=self._namespace\n )\n else:\n self._api.create_namespaced_cron_job(body=cron_job, namespace=self._namespace)\n\n time.sleep(self.grace_period_seconds)\n\n # Update the existing K8s CronJob if it exists; else, create it.\n def start_schedule(self, instance, external_schedule):\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n self._start_cron_job(external_schedule)\n\n # Verify that the cron job is running\n cron_job = self.get_cron_job(schedule_origin_id=external_schedule.get_external_origin_id())\n if not cron_job:\n raise DagsterSchedulerError(\n "Attempted to add K8s CronJob for schedule {schedule_name}, but failed. 
"\n "The schedule {schedule_name} is not running.".format(\n schedule_name=external_schedule.name\n )\n )\n return\n\n def refresh_schedule(self, instance, external_schedule):\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n self.start_schedule(instance, external_schedule)\n\n def running_schedule_count(self, instance, schedule_origin_id):\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n if self.get_cron_job(schedule_origin_id):\n return 1\n else:\n return 0\n\n def get_cron_job(self, schedule_origin_id):\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n cron_jobs = self._api.list_namespaced_cron_job(namespace=self._namespace)\n for item in cron_jobs.items:\n if schedule_origin_id == item.metadata.name:\n return item\n return None\n\n def get_all_cron_jobs(self):\n return self._api.list_namespaced_cron_job(namespace=self._namespace).items\n\n def _end_cron_job(self, schedule_origin_id):\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n self._api.delete_namespaced_cron_job(name=schedule_origin_id, namespace=self._namespace)\n time.sleep(self.grace_period_seconds)\n\n def stop_schedule(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n if self.get_cron_job(schedule_origin_id):\n self._end_cron_job(schedule_origin_id=schedule_origin_id)\n\n cron_job = self.get_cron_job(schedule_origin_id)\n if cron_job:\n schedule = self._get_schedule_state(instance, schedule_origin_id)\n\n raise DagsterSchedulerError(\n "Attempted to remove existing K8s CronJob for schedule "\n "{schedule_name}, but failed. Schedule is still running.".format(\n schedule_name=schedule.name\n )\n )\n\n def get_logs_path(self, instance, schedule_origin_id):\n raise NotImplementedError("To get logs, inspect the corresponding K8s CronJob")\n
\nimport pypd\nfrom dagster import Field, resource\n\n\nclass PagerDutyService:\n """Integrates with PagerDuty via the pypd library.\n\n See:\n https://v2.developer.pagerduty.com/docs/events-api-v2\n https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n https://github.com/PagerDuty/pagerduty-api-python-client\n\n for documentation and more information.\n """\n\n def __init__(self, routing_key):\n self.routing_key = routing_key\n\n def EventV2_create(\n self,\n summary,\n source,\n severity,\n event_action="trigger",\n dedup_key=None,\n timestamp=None,\n component=None,\n group=None,\n event_class=None,\n custom_details=None,\n ):\n """Events API v2 enables you to add PagerDuty's advanced event and incident management\n functionality to any system that can make an outbound HTTP connection.\n\n Arguments:\n summary {string} -- A high-level, text summary message of the event. Will be used to\n construct an alert's description.\n\n Example: "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n 'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n source {string} -- Specific human-readable unique identifier, such as a hostname, for\n the system having the problem.\n\n Examples:\n "prod05.theseus.acme-widgets.com"\n "171.26.23.22"\n "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n "9c09acd49a25"\n\n severity {string} -- How impacted the affected system is. Displayed to users in lists\n and influences the priority of any created incidents. Must be one\n of {info, warning, error, critical}\n\n Keyword Arguments:\n event_action {str} -- There are three types of events that PagerDuty recognizes, and\n are used to represent different types of activity in your\n monitored systems. (default: 'trigger')\n * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n or add a new trigger log entry to an existing alert, depending on the\n provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n when a new problem has been detected. You may send additional triggers\n when a previously detected problem has occurred again.\n\n * acknowledge: acknowledge events cause the referenced incident to enter the\n acknowledged state. While an incident is acknowledged, it won't\n generate any additional notifications, even if it receives new\n trigger events. Your monitoring tools should send PagerDuty an\n acknowledge event when they know someone is presently working on the\n problem.\n\n * resolve: resolve events cause the referenced incident to enter the resolved state.\n Once an incident is resolved, it won't generate any additional\n notifications. New trigger events with the same dedup_key as a resolved\n incident won't re-open the incident. Instead, a new incident will be\n created. Your monitoring tools should send PagerDuty a resolve event when\n the problem that caused the initial trigger event has been fixed.\n\n dedup_key {string} -- Deduplication key for correlating triggers and resolves. The\n maximum permitted length of this property is 255 characters.\n\n timestamp {string} -- Timestamp (ISO 8601). When the upstream system detected / created\n the event. 
This is useful if a system batches or holds events\n before sending them to PagerDuty.\n\n Optional - Will be auto-generated by PagerDuty if not provided.\n\n Example:\n 2015-07-17T08:42:58.315+0000\n\n component {string} -- The part or component of the affected system that is broken.\n\n Examples:\n "keepalive"\n "webping"\n "mysql"\n "wqueue"\n\n group {string} -- A cluster or grouping of sources. For example, sources\n "prod-datapipe-02" and "prod-datapipe-03" might both be part of\n "prod-datapipe"\n\n Examples:\n "prod-datapipe"\n "www"\n "web_stack"\n\n event_class {string} -- The class/type of the event.\n\n Examples:\n "High CPU"\n "Latency"\n "500 Error"\n\n custom_details {Dict[str, str]} -- Additional details about the event and affected\n system.\n\n Example:\n {"ping time": "1500ms", "load avg": 0.75 }\n """\n\n data = {\n "routing_key": self.routing_key,\n "event_action": event_action,\n "payload": {"summary": summary, "source": source, "severity": severity},\n }\n\n if dedup_key is not None:\n data["dedup_key"] = dedup_key\n\n if timestamp is not None:\n data["payload"]["timestamp"] = timestamp\n\n if component is not None:\n data["payload"]["component"] = component\n\n if group is not None:\n data["payload"]["group"] = group\n\n if event_class is not None:\n data["payload"]["class"] = event_class\n\n if custom_details is not None:\n data["payload"]["custom_details"] = custom_details\n\n return pypd.EventV2.create(data=data)\n\n\n[docs]@resource(\n {\n "routing_key": Field(\n str,\n description="""The routing key provisions access to your PagerDuty service. You\n will need to include the integration key for your new integration, as a\n routing_key in the event payload.""",\n )\n },\n description="""This resource is for posting events to PagerDuty.""",\n)\ndef pagerduty_resource(context):\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n\n .. code-block:: python\n\n @solid(required_resource_keys={'pagerduty'})\n def pagerduty_solid(context):\n context.resources.pagerduty.EventV2_create(\n summary='alert from dagster'\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={'pagerduty': pagerduty_resource})],\n )\n def pd_pipeline():\n pagerduty_solid()\n\n execute_pipeline(\n pd_pipeline,\n {\n 'resources': {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n },\n )\n """\n return PagerDutyService(context.resource_config.get("routing_key"))\n
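A minimal sketch that exercises the optional ``EventV2_create`` arguments documented above (``dedup_key``, ``custom_details``) in addition to the required ones; the routing key, solid name, and pipeline name are placeholders.

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline, solid
    from dagster_pagerduty import pagerduty_resource  # import path assumed

    @solid(required_resource_keys={'pagerduty'})
    def notify_on_failure(context):
        # dedup_key correlates repeat triggers into a single PagerDuty incident.
        context.resources.pagerduty.EventV2_create(
            summary='Nightly pipeline failed',
            source='prod05.theseus.acme-widgets.com',
            severity='error',
            event_action='trigger',
            dedup_key='nightly-pipeline',
            custom_details={'attempt': '3'},
        )

    @pipeline(mode_defs=[ModeDefinition(resource_defs={'pagerduty': pagerduty_resource})])
    def alerting_pipeline():
        notify_on_failure()

    execute_pipeline(
        alerting_pipeline,
        {'resources': {'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}}},
    )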
\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom dagster import DagsterType, EventMetadataEntry, TypeCheck, check\nfrom dagster.utils.backcompat import experimental_class_warning\nfrom pandas import DataFrame\n\n\nclass ConstraintViolationException(Exception):\n """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n """\n This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n failed typecheck or an exception.\n\n Args:\n constraint_name (str): the name of the violated constraint\n constraint_description (Optional[str]): the description of the violated constraint\n expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n offending (Optional[Union[dict,list, str, set]]): which pieces of the dataframe violated the expectation, typically list or string\n actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n """\n\n def __init__(\n self,\n constraint_name,\n constraint_description="",\n expectation=None,\n offending=None,\n actual=None,\n ):\n self.constraint_name = constraint_name\n self.constraint_description = constraint_description\n self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n super(ConstraintWithMetadataException, self).__init__(\n "Violated {} - {}, {} was/were expected, but we received {} which was/were {}".format(\n constraint_name, constraint_description, expectation, offending, actual,\n )\n )\n\n def convert_to_metadata(self):\n return EventMetadataEntry.json(\n {\n "constraint_name": self.constraint_name,\n "constraint_description": self.constraint_description,\n "expected": self.expectation,\n "offending": self.offending,\n "actual": self.actual,\n },\n "constraint-metadata",\n )\n\n def return_as_typecheck(self):\n return TypeCheck(\n success=False, description=self.args[0], metadata_entries=[self.convert_to_metadata()]\n )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n """Indicates a dataframe level constraint has been violated."""\n\n def __init__(self, constraint_name, constraint_description):\n super(DataFrameConstraintViolationException, self).__init__(\n "Violated {constraint_name} - {constraint_description}".format(\n constraint_name=constraint_name, constraint_description=constraint_description\n )\n )\n\n\nclass DataFrameWithMetadataException(ConstraintWithMetadataException):\n def __init__(self, constraint_name, constraint_description, expectation, actual):\n super(DataFrameWithMetadataException, self).__init__(\n constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n """Indicates that a column constraint has been violated."""\n\n def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n self.constraint_name = constraint_name\n self.constraint_description = constraint_description\n self.column_name = column_name\n self.offending_rows = offending_rows\n super(ColumnConstraintViolationException, 
self).__init__(self.construct_message())\n\n def construct_message(self):\n base_message = "Violated {constraint_name} ({constraint_description}) for Column Name ({column_name}) ".format(\n constraint_name=self.constraint_name,\n constraint_description=self.constraint_description,\n column_name=self.column_name,\n )\n if self.offending_rows is not None:\n base_message += "The offending (index, row values) are the following: {}".format(\n self.offending_rows\n )\n return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n super(ColumnWithMetadataException, self).__init__(\n "the column constraint " + constraint_name,\n constraint_description,\n expectation,\n offending,\n actual,\n )\n\n\nclass Constraint:\n """\n Base constraint object that all constraints inherit from.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n self.name = self.__class__.__name__\n self.markdown_description = check.str_param(markdown_description, "markdown_description")\n self.error_description = check.str_param(error_description, "error_description")\n\n\nclass ConstraintWithMetadata:\n """\n This class defines a base constraint over pandas DFs with organized metadata\n\n args:\n description (str): description of the constraint\n validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n # TODO: validation_fn returning metadata is sorta broken. 
maybe have it yield typecheck events and grab metadata?\n\n def __init__(\n self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n ):\n experimental_class_warning(self.__class__.__name__)\n if name is None:\n self.name = self.__class__.__name__\n else:\n self.name = name\n self.description = description\n # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n self.validation_fn = validation_fn\n self.resulting_exception = resulting_exception\n self.raise_or_typecheck = raise_or_typecheck\n\n def validate(self, data, *args, **kwargs):\n res = self.validation_fn(data, *args, **kwargs)\n if not res[0]:\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **res[1]\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n else:\n if res[0]:\n return TypeCheck(success=True)\n\n # TODO: composition of validations\n def as_dagster_type(self, *args, **kwargs):\n if self.raise_or_typecheck:\n raise Exception(\n "Dagster types can only be constructed from constraints that return typechecks"\n )\n return DagsterType(\n name=self.name,\n description="A Pandas DataFrame with the following validation: {}".format(\n self.description\n ),\n type_check_fn=lambda x: self.validate(x, *args),\n **kwargs,\n )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n """\n Use this class if you have multiple constraints to check over the entire dataframe\n\n args:\n description (str): description of the constraint\n validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n a list of the validation functions to run over inputted data\n Each function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n validation_fn_arr,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n def validation_fn(data, *args, **kwargs):\n\n results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n truthparam = all(item[0] for item in results)\n metadict = defaultdict(dict)\n for i, dicta in enumerate(item[1] for item in results):\n if len(dicta.keys()) > 0:\n for key in dicta:\n metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n return (truthparam, metadict)\n\n super(MultiConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.column_list = check.list_param(column_list, "strict_column_list", of_type=str)\n\n def validation_fcn(inframe):\n if list(inframe.columns) == column_list:\n return (True, {})\n else:\n if self.enforce_ordering:\n resdict = {"expectation": 
self.column_list, "actual": list(inframe.columns)}\n return (False, resdict)\n else:\n if set(inframe.columns) == set(column_list):\n return (True, {})\n else:\n extra = [x for x in inframe.columns if x not in set(column_list)]\n missing = [x for x in set(column_list) if x not in inframe.columns]\n resdict = {\n "expectation": self.column_list,\n "actual": {"extra_columns": extra, "missing_columns": missing},\n }\n return (False, resdict)\n\n basestr = "ensuring that the right columns, {} were present".format(self.column_list)\n if enforce_ordering:\n basestr += " in the right order"\n super(StrictColumnsWithMetadata, self).__init__(\n basestr,\n validation_fcn,\n DataFrameWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n\nclass DataFrameConstraint(Constraint):\n """\n Base constraint object that represent Dataframe shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(DataFrameConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe):\n raise NotImplementedError()\n\n\n[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """\n A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = "No columns outside of {cols} allowed. ".format(cols=self.strict_column_list)\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected the following ordering of columns {expected}. Received: {received}".format(\n expected=self.strict_column_list, received=columns_received\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Recevied {}.".format(\n self.strict_column_list, columns_received\n ),\n )\n\n\n[docs]class RowCountConstraint(DataFrameConstraint):\n """\n A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. 
Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = "Dataframe must have {} +- {} rows.".format(\n self.num_allowed_rows, self.error_tolerance\n )\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n ),\n )\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """\n Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if not res[1].get("actual") is None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """\n This class is useful for constructing single constraints that\n you want to apply to multiple columns of your dataframe\n The main difference from the base class in terms of construction is that now, your validation_fns should 
operate on\n individual values.\n args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n "expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """\n This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly,\n and also allows for cases like 'fail if any one of these constraints fails but still run all of them'\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. 
Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata_entries[0].entry_data.data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, **kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """\n This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. 
Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """\n validates that a particular value in a column is not null\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n """\n validates that all values in an iterable are unique\n Returns duplicated values as metadata\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=ntype)])\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}), output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """\n decorator for column validation functions to make them error on nulls\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n """\n factory for validators testing if column values are within a range\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n 
Examples:\n .. code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=ntype)])\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}), output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = "checks whether values are between {} and {}".format(\n minim, maxim\n )\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, ignore_missing_vals=False):\n """\n factory for validators testing if all values are in some set\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. 
code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=ntype)])\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}), output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = "checks whether values are within this set of values: {}".format(\n categories\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """\n factory for testing if the dtype of a val falls within some allowed set\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. 
code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=ntype)])\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}), output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = "checks whether values are this type/types: {}".format(\n datatypes\n )\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = "Confirms values are between {} and {}".format(minim, maxim)\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, **kwargs):\n if self.columns is None:\n self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """\n Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """\n A column constraint that applies a pandas dtype validation function to a columns dtypes.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. 
See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = "{fn} must evaluate to True for column dtypes".format(\n fn=self.type_fn.__name__\n )\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if not self.type_fn(received_dtypes):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description="{base_error_message}. Dtypes received: {received_dtypes}.".format(\n base_error_message=self.error_description, received_dtypes=received_dtypes\n ),\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """\n A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = "Column dtype must be in the following set {}.".format(\n self.expected_dtype_set\n )\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description="{base_error_message}. DTypes received: {received_dtypes}".format(\n base_error_message=self.error_description, received_dtypes=received_dtypes\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are not null.\n """\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n 
column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are a valid category.\n\n Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description="Expected Categories are {}".format(self.categories),\n markdown_description="Category examples are {}...".format(self.categories[:5]),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description="values > {}".format(self.min_value),\n error_description="Column must have values > {}".format(self.min_value),\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description="values < {}".format(self.max_value),\n error_description="Column must have values < {}".format(self.max_value),\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n 
if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description="{} < values < {}".format(self.min_value, self.max_value),\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
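The constraint classes above are normally driven by the dagster type machinery, but they can also be exercised directly, which is handy for unit-testing a schema. The following is a minimal sketch, assuming the classes are importable from `dagster_pandas.constraints` as the module shown here suggests; the DataFrame contents are illustrative only.

```python
# A minimal sketch of exercising two of the column constraints above directly,
# outside of a Dagster type check. The dagster_pandas.constraints import path
# is taken from the module shown here and is an assumption.
import pandas as pd

from dagster_pandas.constraints import (
    CategoricalColumnConstraint,
    ColumnConstraintViolationException,
    InRangeColumnConstraint,
)

df = pd.DataFrame({"size": ["S", "M", "XL"], "price": [10, 25, 999]})

constraints = [
    ("size", CategoricalColumnConstraint({"S", "M", "L"}, ignore_missing_vals=False)),
    ("price", InRangeColumnConstraint(min_value=0, max_value=100, ignore_missing_vals=False)),
]

for column_name, constraint in constraints:
    try:
        constraint.validate(df, column_name)
    except ColumnConstraintViolationException as exc:
        # Each violation reports the constraint name, the column, and the offending rows.
        print(exc)
```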
\nimport pandas as pd\nfrom dagster import (\n AssetMaterialization,\n DagsterInvariantViolationError,\n DagsterType,\n EventMetadataEntry,\n Field,\n StringSource,\n TypeCheck,\n check,\n dagster_type_loader,\n dagster_type_materializer,\n)\nfrom dagster.config.field_utils import Selector\nfrom dagster.utils import dict_without_keys\nfrom dagster.utils.backcompat import experimental\nfrom dagster_pandas.constraints import (\n ColumnDTypeFnConstraint,\n ColumnDTypeInSetConstraint,\n ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_materializer(\n Selector(\n {\n "csv": {\n "path": StringSource,\n "sep": Field(StringSource, is_required=False, default_value=","),\n },\n "parquet": {"path": StringSource},\n "table": {"path": StringSource},\n "pickle": {"path": StringSource},\n },\n )\n)\ndef dataframe_materializer(_context, config, pandas_df):\n check.inst_param(pandas_df, "pandas_df", pd.DataFrame)\n file_type, file_options = list(config.items())[0]\n\n if file_type == "csv":\n path = file_options["path"]\n pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))\n elif file_type == "parquet":\n pandas_df.to_parquet(file_options["path"])\n elif file_type == "table":\n pandas_df.to_csv(file_options["path"], sep="\\t", index=False)\n elif file_type == "pickle":\n pandas_df.to_pickle(file_options["path"])\n else:\n check.failed("Unsupported file_type {file_type}".format(file_type=file_type))\n\n return AssetMaterialization.file(file_options["path"])\n\n\n@dagster_type_loader(\n Selector(\n {\n "csv": {\n "path": StringSource,\n "sep": Field(StringSource, is_required=False, default_value=","),\n },\n "parquet": {"path": StringSource},\n "table": {"path": StringSource},\n "pickle": {"path": StringSource},\n },\n )\n)\ndef dataframe_loader(_context, config):\n file_type, file_options = list(config.items())[0]\n\n if file_type == "csv":\n path = file_options["path"]\n return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n elif file_type == "parquet":\n return pd.read_parquet(file_options["path"])\n elif file_type == "table":\n return pd.read_csv(file_options["path"], sep="\\t")\n elif file_type == "pickle":\n return pd.read_pickle(file_options["path"])\n else:\n raise DagsterInvariantViolationError(\n "Unsupported file_type {file_type}".format(file_type=file_type)\n )\n\n\ndef df_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(success=False)\n return TypeCheck(\n success=True,\n metadata_entries=[\n EventMetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),\n # string cast columns since they may be things like datetime\n EventMetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),\n ],\n )\n\n\nDataFrame = DagsterType(\n name="PandasDataFrame",\n description="""Two-dimensional size-mutable, potentially heterogeneous\n tabular data structure with labeled axes (rows and columns).\n See http://pandas.pydata.org/""",\n loader=dataframe_loader,\n materializer=dataframe_materializer,\n type_check_fn=df_type_check,\n)\n\n\ndef _construct_constraint_list(constraints):\n def add_bullet(constraint_list, constraint_description):\n return constraint_list + "+ {constraint_description}\\n".format(\n constraint_description=constraint_description\n )\n\n constraint_list = ""\n for constraint in constraints:\n if constraint.__class__ not in 
CONSTRAINT_BLACKLIST:\n constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n header = "**{column_name}**".format(column_name=column_name)\n for constraint in constraints:\n if isinstance(constraint, ColumnDTypeInSetConstraint):\n dtypes_tuple = tuple(constraint.expected_dtype_set)\n return header + ": `{expected_dtypes}`".format(\n expected_dtypes=dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]\n )\n elif isinstance(constraint, ColumnDTypeFnConstraint):\n return header + ": Validator `{expected_dtype_fn}`".format(\n expected_dtype_fn=constraint.type_fn.__name__\n )\n return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n title = "\\n".join([description, "### Columns", ""])\n buildme = title\n for column in columns:\n buildme += "{}\\n{}\\n".format(\n _build_column_header(column.name, column.constraints),\n _construct_constraint_list(column.constraints),\n )\n return buildme\n\n\n[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n event_metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n materializer=None,\n):\n """\n Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list of EventMetadata\n which allow you to express things like summary statistics during runtime.\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will\n default to using `dataframe_materializer`.\n """\n # We allow for the plugging in of dagster_type_loaders/materializers so that\n # Users can load and materialize their custom dataframes via configuration their own way if the default\n # configs don't suffice. This is purely optional.\n check.str_param(name, "name")\n event_metadata_fn = check.opt_callable_param(event_metadata_fn, "event_metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description="Must be a pandas.DataFrame. Got value of type. 
{type_name}".format(\n type_name=type(value).__name__\n ),\n )\n\n try:\n validate_constraints(\n value, pandas_columns=columns, dataframe_constraints=dataframe_constraints\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)\n if event_metadata_fn\n else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n materializer=materializer if loader else dataframe_materializer,\n description=description,\n )\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n materializer=None,\n):\n """\n\n Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will\n default to using `dataframe_materializer`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description="Must be a pandas.DataFrame. Got value of type. 
{type_name}".format(\n type_name=type(value).__name__\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = []\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata_entries[0].entry_data.data\n metadata.append(\n EventMetadataEntry.json(result_dict, "{}-constraint-metadata".format(key),)\n )\n constraint_clauses.append("{} failing constraints, {}".format(key, result.description))\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata_entries=sorted(metadata, key=lambda x: x.label),\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n materializer=materializer if loader else dataframe_materializer,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, event_metadata_fn):\n if not event_metadata_fn:\n return []\n\n metadata_entries = event_metadata_fn(value)\n\n if not (\n isinstance(metadata_entries, list)\n and all(isinstance(item, EventMetadataEntry) for item in metadata_entries)\n ):\n raise DagsterInvariantViolationError(\n (\n "The return value of the user-defined summary_statistics function "\n "for pandas data frame type {type_name} returned {value}. "\n "This function must return List[EventMetadataEntry]"\n ).format(type_name=type_name, value=repr(metadata_entries))\n )\n\n return metadata_entries\n
\nfrom dagster import DagsterInvariantViolationError, check\nfrom dagster_pandas.constraints import (\n CategoricalColumnConstraint,\n ColumnDTypeFnConstraint,\n ColumnDTypeInSetConstraint,\n Constraint,\n ConstraintViolationException,\n DataFrameConstraint,\n InRangeColumnConstraint,\n NonNullableColumnConstraint,\n UniqueColumnConstraint,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n is_bool_dtype,\n is_float_dtype,\n is_integer_dtype,\n is_numeric_dtype,\n is_string_dtype,\n)\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n non_nullable = check.bool_param(non_nullable, "exists")\n unique = check.bool_param(unique, "unique")\n ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n if non_nullable and ignore_missing_vals:\n raise DagsterInvariantViolationError(\n "PandasColumn cannot have a non-null constraint while also ignore missing values"\n )\n constraints = []\n if non_nullable:\n constraints.append(NonNullableColumnConstraint())\n if unique:\n constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n return constraints\n\n\n[docs]class PandasColumn:\n """\n The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n "Required column {column_name} not in dataframe with columns {dataframe_columns}".format(\n column_name=self.name, dataframe_columns=dataframe.columns\n )\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n[docs] @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """\n Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """\n Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint({"datetime64[ns]"}),\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """\n Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n[docs] @staticmethod\n def categorical_column(\n name,\n categories,\n of_types="object",\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
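`validate_constraints` and the `PandasColumn` constructors above can also be called by hand, for example in a unit test, without building a dagster type at all. A minimal sketch follows, using the `dagster_pandas.validation` and `dagster_pandas.constraints` import paths that appear in the modules above; the column set and data are illustrative.

```python
# A minimal sketch of running validate_constraints directly, e.g. in a test.
import pandas as pd

from dagster_pandas.constraints import ConstraintViolationException
from dagster_pandas.validation import PandasColumn, validate_constraints

columns = [
    PandasColumn.string_column("name", non_nullable=True),
    PandasColumn.categorical_column("size", categories={"S", "M", "L"}),
    # Optional column: validation is skipped when it is absent from the frame.
    PandasColumn.float_column("discount", min_value=0.0, max_value=1.0, is_required=False),
]

df = pd.DataFrame({"name": ["widget", "gadget"], "size": ["S", "XL"]})

try:
    validate_constraints(df, pandas_columns=columns)
except ConstraintViolationException as exc:
    print(exc)  # "XL" falls outside the declared categories
```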
\nimport threading\nfrom collections import namedtuple\n\nimport psycopg2\nimport sqlalchemy as db\nfrom dagster import check\nfrom dagster.core.events.log import EventRecord\nfrom dagster.core.storage.event_log import (\n AssetAwareSqlEventLogStorage,\n AssetKeyTable,\n SqlEventLogStorageMetadata,\n SqlEventLogStorageTable,\n)\nfrom dagster.core.storage.sql import (\n create_engine,\n get_alembic_config,\n run_alembic_upgrade,\n stamp_alembic_rev,\n)\nfrom dagster.serdes import (\n ConfigurableClass,\n ConfigurableClassData,\n deserialize_json_to_dagster_namedtuple,\n)\n\nfrom ..pynotify import await_pg_notifications\nfrom ..utils import (\n create_pg_connection,\n pg_config,\n pg_statement_timeout,\n pg_url_from_config,\n retry_pg_connection_fn,\n retry_pg_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n[docs]class PostgresEventLogStorage(AssetAwareSqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for event log storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, postgres_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self._disposed = False\n\n self._event_watcher = PostgresEventWatcher(self.postgres_url)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n self._secondary_index_cache = {}\n\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n if "event_logs" not in table_names:\n with self.connect() as conn:\n alembic_config = get_alembic_config(__file__)\n retry_pg_creation_fn(lambda: SqlEventLogStorageMetadata.create_all(conn))\n\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(alembic_config, conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresEventLogStorage(\n inst_data=inst_data, postgres_url=pg_url_from_config(config_value)\n )\n\n @staticmethod\n def create_clean_storage(conn_string):\n inst = PostgresEventLogStorage(conn_string)\n inst.wipe()\n return inst\n\n def 
store_event(self, event):\n """Store an event corresponding to a pipeline run.\n Args:\n event (EventRecord): The event to store.\n """\n check.inst_param(event, "event", EventRecord)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self.connect() as conn:\n result_proxy = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result_proxy.fetchone()\n result_proxy.close()\n conn.execute(\n """NOTIFY {channel}, %s; """.format(channel=CHANNEL_NAME),\n (res[0] + "_" + str(res[1]),),\n )\n if event.is_dagster_event and event.dagster_event.asset_key:\n self.store_asset_key(conn, event)\n\n def store_asset_key(self, conn, event):\n check.inst_param(event, "event", EventRecord)\n if not event.is_dagster_event or not event.dagster_event.asset_key:\n return\n\n conn.execute(\n db.dialects.postgresql.insert(AssetKeyTable)\n .values(asset_key=event.dagster_event.asset_key.to_string())\n .on_conflict_do_nothing(index_elements=[AssetKeyTable.c.asset_key])\n )\n\n def connect(self, run_id=None):\n return create_pg_connection(self._engine, __file__, "event log")\n\n def has_secondary_index(self, name, run_id=None):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name, run_id)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name, run_id=None):\n super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, start_cursor, callback):\n self._event_watcher.watch_run(run_id, start_cursor, callback)\n\n def end_watch(self, run_id, handler):\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def event_watcher(self):\n return self._event_watcher\n\n def __del__(self):\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self):\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n\nEventWatcherProcessStartedEvent = namedtuple("EventWatcherProcessStartedEvent", "")\nEventWatcherStart = namedtuple("EventWatcherStart", "")\nEventWatcherEvent = namedtuple("EventWatcherEvent", "payload")\nEventWatchFailed = namedtuple("EventWatchFailed", "message")\nEventWatcherEnd = namedtuple("EventWatcherEnd", "")\n\nEventWatcherThreadEvents = (\n EventWatcherProcessStartedEvent,\n EventWatcherStart,\n EventWatcherEvent,\n EventWatchFailed,\n EventWatcherEnd,\n)\nEventWatcherThreadNoopEvents = (EventWatcherProcessStartedEvent, EventWatcherStart)\nEventWatcherThreadEndEvents = (EventWatchFailed, EventWatcherEnd)\n\nPOLLING_CADENCE = 0.25\n\nTERMINATE_EVENT_LOOP = "TERMINATE_EVENT_LOOP"\n\n\ndef watcher_thread(conn_string, run_id_dict, handlers_dict, dict_lock, watcher_thread_exit):\n\n try:\n for notif in await_pg_notifications(\n conn_string,\n channels=[CHANNEL_NAME],\n timeout=POLLING_CADENCE,\n yield_on_timeout=True,\n exit_event=watcher_thread_exit,\n ):\n if notif is None:\n if watcher_thread_exit.is_set():\n break\n else:\n run_id, index_str = notif.payload.split("_")\n if run_id not in run_id_dict:\n continue\n\n index = int(index_str)\n with dict_lock:\n handlers = handlers_dict.get(run_id, [])\n\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n res = engine.execute(\n 
db.select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == index\n ),\n )\n dagster_event = deserialize_json_to_dagster_namedtuple(res.fetchone()[0])\n finally:\n engine.dispose()\n\n for (cursor, callback) in handlers:\n if index >= cursor:\n callback(dagster_event)\n except psycopg2.OperationalError:\n pass\n\n\nclass PostgresEventWatcher:\n def __init__(self, conn_string):\n self._run_id_dict = {}\n self._handlers_dict = {}\n self._dict_lock = threading.Lock()\n self._conn_string = conn_string\n self._watcher_thread_exit = None\n self._watcher_thread = None\n\n def has_run_id(self, run_id):\n with self._dict_lock:\n _has_run_id = run_id in self._run_id_dict\n return _has_run_id\n\n def watch_run(self, run_id, start_cursor, callback):\n if not self._watcher_thread:\n self._watcher_thread_exit = threading.Event()\n self._watcher_thread = threading.Thread(\n target=watcher_thread,\n args=(\n self._conn_string,\n self._run_id_dict,\n self._handlers_dict,\n self._dict_lock,\n self._watcher_thread_exit,\n ),\n name="postgres-event-watch",\n )\n self._watcher_thread.daemon = True\n self._watcher_thread.start()\n\n with self._dict_lock:\n if run_id in self._run_id_dict:\n self._handlers_dict[run_id].append((start_cursor, callback))\n else:\n # See: https://docs.python.org/2/library/multiprocessing.html#multiprocessing.managers.SyncManager\n run_id_dict = self._run_id_dict\n run_id_dict[run_id] = None\n self._run_id_dict = run_id_dict\n self._handlers_dict[run_id] = [(start_cursor, callback)]\n\n def unwatch_run(self, run_id, handler):\n with self._dict_lock:\n if run_id in self._run_id_dict:\n self._handlers_dict[run_id] = [\n (start_cursor, callback)\n for (start_cursor, callback) in self._handlers_dict[run_id]\n if callback != handler\n ]\n if not self._handlers_dict[run_id]:\n del self._handlers_dict[run_id]\n run_id_dict = self._run_id_dict\n del run_id_dict[run_id]\n self._run_id_dict = run_id_dict\n\n def close(self):\n if self._watcher_thread:\n self._watcher_thread_exit.set()\n self._watcher_thread.join()\n self._watcher_thread_exit = None\n self._watcher_thread = None\n
\nimport sqlalchemy as db\nfrom dagster import check\nfrom dagster.core.storage.runs import DaemonHeartbeatsTable, RunStorageSqlMetadata, SqlRunStorage\nfrom dagster.core.storage.sql import (\n create_engine,\n get_alembic_config,\n run_alembic_upgrade,\n stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..utils import (\n create_pg_connection,\n pg_config,\n pg_statement_timeout,\n pg_url_from_config,\n retry_pg_connection_fn,\n retry_pg_creation_fn,\n)\n\n\n[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, postgres_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if there's no previously stamped revision and the main table\n # doesn't exist (since we used to not stamp postgres storage when it was first created)\n if "runs" not in table_names:\n with self.connect() as conn:\n alembic_config = get_alembic_config(__file__)\n retry_pg_creation_fn(lambda: RunStorageSqlMetadata.create_all(conn))\n\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(alembic_config, conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold 1 open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresRunStorage(\n inst_data=inst_data, postgres_url=pg_url_from_config(config_value)\n )\n\n @staticmethod\n def create_clean_storage(postgres_url):\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url)\n\n def connect(self):\n return create_pg_connection(self._engine, __file__, "run",)\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, 
conn)\n\n def has_built_index(self, migration_name):\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name):\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat):\n with self.connect() as conn:\n\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db.dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type.value,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_dagster_namedtuple(daemon_heartbeat),\n },\n )\n )\n
\nimport sqlalchemy as db\nfrom dagster import check\nfrom dagster.core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster.core.storage.sql import (\n create_engine,\n get_alembic_config,\n run_alembic_upgrade,\n stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom ..utils import (\n create_pg_connection,\n pg_config,\n pg_statement_timeout,\n pg_url_from_config,\n retry_pg_connection_fn,\n retry_pg_creation_fn,\n)\n\n\n[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for schedule storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, postgres_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if missing_main_table:\n with self.connect() as conn:\n alembic_config = get_alembic_config(__file__)\n retry_pg_creation_fn(lambda: ScheduleStorageSqlMetadata.create_all(conn))\n\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(alembic_config, conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresScheduleStorage(\n inst_data=inst_data, postgres_url=pg_url_from_config(config_value)\n )\n\n @staticmethod\n def create_clean_storage(postgres_url):\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_pg_connection(self._engine, __file__, "schedule")\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n
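The three Postgres-backed storages above share the same configuration pattern. In a deployment they are normally enabled through the `run_storage`, `event_log_storage`, and `schedule_storage` blocks of `dagster.yaml` (module / class / config with a `postgres_url` or `postgres_db` entry), as their docstrings describe. For an integration test they can also be constructed programmatically; the sketch below assumes the top-level `dagster_postgres` re-exports and uses an illustrative connection string.

```python
# A minimal sketch of standing up all three Postgres-backed storages against
# one database, e.g. in an integration test. The connection string is
# illustrative; production deployments configure these via dagster.yaml.
from dagster_postgres import (
    PostgresEventLogStorage,
    PostgresRunStorage,
    PostgresScheduleStorage,
)

conn_string = "postgresql://test:test@localhost:5432/test"

# create_clean_storage wipes/recreates tables and returns a fresh storage,
# as defined in the classes above.
event_log_storage = PostgresEventLogStorage.create_clean_storage(conn_string)
run_storage = PostgresRunStorage.create_clean_storage(conn_string)
schedule_storage = PostgresScheduleStorage.create_clean_storage(conn_string)

# Each storage stamps/creates its tables on first construction and exposes
# upgrade() to run Alembic migrations in place.
run_storage.upgrade()
```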
\nimport prometheus_client\nfrom dagster import Field, check, resource\nfrom prometheus_client.exposition import default_handler\n\n\n[docs]class PrometheusResource:\n """Integrates with Prometheus via the prometheus_client library.\n """\n\n def __init__(self, gateway, timeout):\n self.gateway = check.str_param(gateway, "gateway")\n self.timeout = check.opt_int_param(timeout, "timeout")\n self.registry = prometheus_client.CollectorRegistry()\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler):\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method."""\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self.registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler):\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method."""\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self.registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, handler=default_handler):\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 
'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and grouping_key.\n This uses the DELETE HTTP method."""\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n\n[docs]@resource(\n {\n "gateway": Field(\n str,\n description="the url for your push gateway. Either of the form "\n "'http://pushgateway.local', or 'pushgateway.local'. "\n "Scheme defaults to 'http' if none is provided",\n ),\n "timeout": Field(\n int,\n default_value=30,\n is_required=False,\n description="is how long delete will attempt to connect before giving up. "\n "Defaults to 30s.",\n ),\n },\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )\n
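A short sketch of `prometheus_resource` in use may help: the solid registers a counter against the resource's registry and pushes it to a Pushgateway. The gateway address, job name, and metric name are illustrative, the top-level `dagster_prometheus` import is assumed, and running it requires a reachable Pushgateway.

```python
# A minimal sketch of wiring prometheus_resource into a pipeline; gateway
# address and metric names are illustrative assumptions.
import prometheus_client
from dagster import ModeDefinition, execute_pipeline, pipeline, solid
from dagster_prometheus import prometheus_resource


@solid(required_resource_keys={"prometheus"})
def report_rows_processed(context):
    counter = prometheus_client.Counter(
        "rows_processed_total",
        "Rows processed by this solid.",
        registry=context.resources.prometheus.registry,
    )
    counter.inc(42)
    context.resources.prometheus.push_to_gateway(job="docs_sketch_job")


@pipeline(mode_defs=[ModeDefinition(resource_defs={"prometheus": prometheus_resource})])
def metrics_pipeline():
    report_rows_processed()


# Requires a Pushgateway listening at the configured address.
execute_pipeline(
    metrics_pipeline,
    run_config={"resources": {"prometheus": {"config": {"gateway": "localhost:9091"}}}},
)
```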
\nfrom dagster import check, resource\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pyspark.sql import SparkSession\n\n\ndef spark_session_from_config(spark_conf=None):\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n builder = SparkSession.builder\n flat = flatten_dict(spark_conf)\n for key, value in flat:\n builder = builder.config(key, value)\n\n return builder.getOrCreate()\n\n\nclass PySparkResource:\n def __init__(self, spark_conf):\n self._spark_session = spark_session_from_config(spark_conf)\n\n @property\n def spark_session(self):\n return self._spark_session\n\n @property\n def spark_context(self):\n return self.spark_session.sparkContext\n\n\n[docs]@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context):\n """This resource provides access to a PySpark SparkSession for executing PySpark code within\n Dagster.\n\n Example:\n\n .. literalinclude:: ../../../../../examples/basic_pyspark/repo.py\n :language: python\n\n """\n return PySparkResource(init_context.resource_config["spark_conf"])\n
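A minimal usage sketch, assuming the default (empty) ``spark_conf`` is acceptable; the solid and pipeline names are illustrative.

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline, solid
    from dagster_pyspark import pyspark_resource


    @solid(required_resource_keys={"pyspark"})
    def count_rows(context):
        # The resource exposes the underlying SparkSession directly.
        spark = context.resources.pyspark.spark_session
        df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])
        context.log.info("row count: {}".format(df.count()))


    @pipeline(mode_defs=[ModeDefinition(resource_defs={"pyspark": pyspark_resource})])
    def pyspark_pipeline():
        count_rows()


    execute_pipeline(pyspark_pipeline)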
\nimport os\n\nfrom dagster import (\n Enum,\n EnumValue,\n Failure,\n Field,\n InputDefinition,\n Noneable,\n Nothing,\n OutputDefinition,\n Permissive,\n check,\n solid,\n)\n\nfrom .utils import execute, execute_script_file\n\n\ndef shell_solid_config():\n return {\n "env": Field(\n Noneable(Permissive()),\n default_value=os.environ.copy(),\n is_required=False,\n description="An optional dict of environment variables to pass to the subprocess. "\n "Defaults to using os.environ.copy().",\n ),\n "output_logging": Field(\n Enum(\n name="OutputType",\n enum_values=[\n EnumValue("STREAM", description="Stream script stdout/stderr."),\n EnumValue(\n "BUFFER",\n description="Buffer shell script stdout/stderr, then log upon completion.",\n ),\n EnumValue("NONE", description="No logging"),\n ],\n ),\n is_required=False,\n default_value="BUFFER",\n ),\n "cwd": Field(\n Noneable(str),\n default_value=None,\n is_required=False,\n description="Working directory in which to execute shell script",\n ),\n }\n\n\n[docs]@solid(\n name="shell_solid",\n description=(\n "This solid executes a shell command it receives as input.\\n\\n"\n "This solid is suitable for uses where the command to execute is generated dynamically by "\n "upstream solids. If you know the command to execute at pipeline construction time, "\n "consider `shell_command_solid` instead."\n ),\n input_defs=[InputDefinition("shell_command", str)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_solid_config(),\n)\ndef shell_solid(context, shell_command):\n """This solid executes a shell command it receives as input.\n\n This solid is suitable for uses where the command to execute is generated dynamically by\n upstream solids. If you know the command to execute at pipeline construction time, consider\n `shell_command_solid` instead.\n """\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **context.solid_config\n )\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(output=output)\n )\n\n return output\n\n\n[docs]def create_shell_command_solid(\n shell_command, name, description=None, required_resource_keys=None, tags=None,\n):\n """This function is a factory that constructs solids to execute a shell command.\n\n Note that you can only use `shell_command_solid` if you know the command you'd like to execute\n at pipeline construction time. If you'd like to construct shell commands dynamically during\n pipeline execution and pass them between solids, you should use `shell_solid` instead.\n\n Examples:\n\n .. literalinclude:: ../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_solid.py\n :language: python\n\n\n Args:\n shell_command (str): The shell command that the constructed solid will execute.\n name (str): The name of the constructed solid.\n description (Optional[str]): Human-readable description of this solid.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n check.str_param(shell_command, "shell_command")\n name = check.str_param(name, "name")\n\n @solid(\n name=name,\n description=description,\n input_defs=[InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_solid_config(),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_solid(context):\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **context.solid_config\n )\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(\n output=output\n )\n )\n\n return output\n\n return _shell_solid\n\n\n[docs]def create_shell_script_solid(\n shell_script_path, name="create_shell_script_solid", input_defs=None, **kwargs\n):\n """This function is a factory which constructs a solid that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@solid\n <dagster.solid>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@composite_solid <dagster.composite_solid>` to wrap this solid\n in the cases where you'd like to configure the shell solid with different config fields.\n\n\n Examples:\n\n .. literalinclude:: ../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_solid.py\n :language: python\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (str, optional): The name of this solid. Defaults to "create_shell_script_solid".\n input_defs (List[InputDefinition], optional): input definitions for the solid. Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n\n if "output_defs" in kwargs:\n raise TypeError("Overriding output_defs for shell solid is not supported.")\n\n if "config" in kwargs:\n raise TypeError("Overriding config for shell solid is not supported.")\n\n @solid(\n name=name,\n description=kwargs.pop("description", "A solid to invoke a shell command."),\n input_defs=input_defs or [InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_solid_config(),\n **kwargs,\n )\n def _shell_script_solid(context):\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **context.solid_config\n )\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(\n output=output\n )\n )\n\n return output\n\n return _shell_script_solid\n
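A usage sketch for the factory above; the command and the solid/pipeline names are illustrative. Because the config fields all have defaults, no run config is required.

.. code-block:: python

    from dagster import execute_pipeline, pipeline
    from dagster_shell import create_shell_command_solid

    # The command is fixed at pipeline construction time, per the factory's contract above.
    echo_solid = create_shell_command_solid("echo 'hello shell'", name="echo_solid")


    @pipeline
    def shell_pipeline():
        echo_solid()


    execute_pipeline(shell_pipeline)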
\nfrom dagster import Field, StringSource, resource\nfrom slack import WebClient\n\n\n[docs]@resource(\n {\n "token": Field(\n StringSource,\n description="""To configure access to the Slack API, you'll need an access\n token provisioned with access to your Slack workspace.\n\n Tokens are typically either user tokens or bot tokens. For programmatic posting\n to Slack from this resource, you probably want to provision and use a bot token.\n\n More in the Slack API documentation here: https://api.slack.com/docs/token-types\n """,\n )\n },\n description="This resource is for connecting to Slack",\n)\ndef slack_resource(context):\n """This resource is for connecting to Slack.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster solid:\n\n Examples:\n\n .. code-block:: python\n\n import os\n\n from dagster import solid, execute_pipeline, ModeDefinition\n from dagster_slack import slack_resource\n\n\n @solid(required_resource_keys={'slack'})\n def slack_solid(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={'slack': slack_resource})],\n )\n def slack_pipeline():\n slack_solid()\n\n execute_pipeline(\n slack_pipeline, {'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n\n """\n return WebClient(context.resource_config.get("token"))\n
\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\n\nfrom dagster import check, resource\n\nfrom .configs import define_snowflake_config\n\ntry:\n import snowflake.connector\nexcept ImportError:\n msg = (\n "Could not import snowflake.connector. This could mean you have an incompatible version "\n "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n "dagster-snowflake to fix this error."\n )\n warnings.warn(msg)\n raise\n\n\n[docs]class SnowflakeConnection:\n def __init__(self, context): # pylint: disable=too-many-locals\n # Extract parameters from resource config. Note that we can't pass None values to\n # snowflake.connector.connect() because they will override the default values set within the\n # connector; remove them from the conn_args dict.\n self.connector = context.resource_config.get("connector", None)\n\n if self.connector == "sqlalchemy":\n self.conn_args = {\n k: context.resource_config.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if context.resource_config.get(k) is not None\n }\n\n else:\n self.conn_args = {\n k: context.resource_config.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n "client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n )\n if context.resource_config.get(k) is not None\n }\n\n self.autocommit = self.conn_args.get("autocommit", False)\n self.log = context.log_manager\n\n[docs] @contextmanager\n def get_connection(self, raw_conn=True):\n if self.connector == "sqlalchemy":\n from sqlalchemy import create_engine\n from snowflake.sqlalchemy import URL # pylint: disable=no-name-in-module,import-error\n\n engine = create_engine(URL(**self.conn_args))\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self.conn_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()\n\n[docs] def execute_query(self, sql, parameters=None, fetch_results=False):\n check.str_param(sql, "sql")\n check.opt_dict_param(parameters, "parameters")\n check.bool_param(fetch_results, "fetch_results")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n cursor.execute(sql, parameters) # pylint: disable=E1101\n if fetch_results:\n return cursor.fetchall() # pylint: disable=E1101\n\n[docs] def execute_queries(self, sql_queries, parameters=None, fetch_results=False):\n check.list_param(sql_queries, "sql_queries", of_type=str)\n check.opt_dict_param(parameters, "parameters")\n check.bool_param(fetch_results, "fetch_results")\n\n results = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for sql in sql_queries:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n self.log.info("Executing query: " + sql)\n cursor.execute(sql, parameters) # pylint: disable=E1101\n if fetch_results:\n results.append(cursor.fetchall()) # pylint: 
disable=E1101\n\n return results if fetch_results else None\n\n[docs] def load_table_from_local_parquet(self, src, table):\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n "CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);".format(table=table),\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n "PUT {src} @%{table};".format(src=src, table=table),\n "COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');".format(\n table=table\n ),\n ]\n\n self.execute_queries(sql_queries)\n\n\n[docs]@resource(\n config_schema=define_snowflake_config(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context):\n """A resource for connecting to the Snowflake data warehouse.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n\n .. code-block:: python\n\n from dagster import execute_pipeline, pipeline, DependencyDefinition, ModeDefinition\n from dagster_snowflake import snowflake_resource\n\n @solid(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={'snowflake': snowflake_resource})],\n )\n def snowflake_pipeline():\n get_one()\n\n result = execute_pipeline(\n snowflake_pipeline,\n {\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n },\n )\n\n """\n return SnowflakeConnection(context)\n\n\ndef _filter_password(args):\n """Remove password from connection args for logging"""\n return {k: v for k, v in args.items() if k != "password"}\n
\n"""Spark Configuration\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description="The path to your spark installation. Defaults to $SPARK_HOME at runtime if not provided.",\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }\n
\nimport os\nimport subprocess\n\nfrom dagster import check, resource\nfrom dagster.core.log_manager import DagsterLogManager\n\nfrom .types import SparkSolidError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n def __init__(self, logger):\n self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n def run_spark_job(self, config, main_class):\n check.dict_param(config, "config")\n check.str_param(main_class, "main_class")\n\n # Extract parameters from config\n (\n master_url,\n deploy_mode,\n application_jar,\n spark_conf,\n application_arguments,\n spark_home,\n ) = [\n config.get(k)\n for k in (\n "master_url",\n "deploy_mode",\n "application_jar",\n "spark_conf",\n "application_arguments",\n "spark_home",\n )\n ]\n\n if not os.path.exists(application_jar):\n raise SparkSolidError(\n (\n "Application jar {} does not exist. A valid jar must be "\n "built before running this solid.".format(application_jar)\n )\n )\n\n spark_shell_cmd = construct_spark_shell_command(\n application_jar=application_jar,\n main_class=main_class,\n master_url=master_url,\n spark_conf=spark_conf,\n deploy_mode=deploy_mode,\n application_arguments=application_arguments,\n spark_home=spark_home,\n )\n self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n if retcode != 0:\n raise SparkSolidError("Spark job failed. Please consult your logs.")\n\n\n\n
\nfrom dagster import InputDefinition, Nothing, OutputDefinition, check, solid\n\nfrom .configs import define_spark_config\n\n\n[docs]def create_spark_solid(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @solid(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n input_defs=[InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(Nothing)],\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_solid(context): # pylint: disable=unused-argument\n context.resources.spark.run_spark_job(context.solid_config, main_class)\n\n return spark_solid\n
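A usage sketch, assuming the package's ``spark_resource`` (backed by the ``SparkResource`` shown earlier) is bound to the solid's default ``spark`` resource key; the jar path and main class are placeholders.

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline
    from dagster_spark import create_spark_solid, spark_resource

    # Hypothetical main class; the jar path is supplied via run config below.
    my_spark_solid = create_spark_solid("my_spark_job", main_class="com.example.SparkJob")


    @pipeline(mode_defs=[ModeDefinition(resource_defs={"spark": spark_resource})])
    def spark_pipeline():
        my_spark_solid()


    execute_pipeline(
        spark_pipeline,
        {
            "solids": {
                "my_spark_job": {
                    "config": {
                        "master_url": "local[*]",
                        "application_jar": "/path/to/example.jar",
                    }
                }
            }
        },
    )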
\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\n\n
\nimport itertools\nimport os\n\nfrom dagster import check\n\nfrom .types import SparkSolidError\n\n\ndef flatten_dict(d):\n def _flatten_dict(d, result, key_path=None):\n """Iterates an arbitrarily nested dictionary and yield dot-notation key:value tuples.\n\n {'foo': {'bar': 3, 'baz': 1}, {'other': {'key': 1}} =>\n [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n """\n for k, v in d.items():\n new_key_path = (key_path or []) + [k]\n if isinstance(v, dict):\n _flatten_dict(v, result, new_key_path)\n else:\n result.append((".".join(new_key_path), v))\n\n result = []\n if d is not None:\n _flatten_dict(d, result)\n return result\n\n\ndef parse_spark_config(spark_conf):\n """For each key-value pair in spark conf, we need to pass to CLI in format:\n\n --conf "key=value"\n """\n\n spark_conf_list = flatten_dict(spark_conf)\n return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n return list(\n itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n )\n\n\n[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job.\n """\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkSolidError(\n (\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n ["{}/bin/spark-submit".format(spark_home), "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n + [application_arguments]\n )\n return spark_shell_cmd\n
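A small sketch of how a nested ``spark_conf`` flows through these helpers into repeated ``--conf`` flags for ``spark-submit``.

.. code-block:: python

    from dagster_spark.utils import flatten_dict, parse_spark_config

    conf = {"spark": {"executor": {"memory": "2g"}}}

    # Nested dicts are flattened into dot-notation (key, value) tuples...
    assert flatten_dict(conf) == [("spark.executor.memory", "2g")]

    # ...and then rendered as repeated --conf arguments for the CLI.
    assert parse_spark_config(conf) == ["--conf", "spark.executor.memory=2g"]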
\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom dagster import Field, StringSource, check, resource\nfrom dagster.utils import merge_dicts, mkdir_p\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\n\ndef key_from_str(key_str):\n """Creates a paramiko SSH key from a string."""\n check.str_param(key_str, "key_str")\n\n # py2 StringIO doesn't support with\n key_file = StringIO(key_str)\n result = paramiko.RSAKey.from_private_key(key_file)\n key_file.close()\n return result\n\n\n[docs]class SSHResource:\n """\n Resource for ssh remote execution using Paramiko.\n ref: https://github.com/paramiko/paramiko\n """\n\n def __init__(\n self,\n remote_host,\n remote_port,\n username=None,\n password=None,\n key_file=None,\n key_string=None,\n timeout=10,\n keepalive_interval=30,\n compress=True,\n no_host_key_check=True,\n allow_host_key_change=False,\n logger=None,\n ):\n self.remote_host = check.str_param(remote_host, "remote_host")\n self.remote_port = check.opt_int_param(remote_port, "remote_port")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.key_file = check.opt_str_param(key_file, "key_file")\n self.timeout = check.opt_int_param(timeout, "timeout")\n self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n self.compress = check.opt_bool_param(compress, "compress")\n self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n self.allow_host_key_change = check.opt_bool_param(\n allow_host_key_change, "allow_host_key_change"\n )\n self.log = logger\n\n self.host_proxy = None\n\n # Create RSAKey object from private key string\n self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n # Auto detecting username values from system\n if not self.username:\n logger.debug(\n "username to ssh to host: %s is not specified. Using system's default provided by"\n " getpass.getuser()" % self.remote_host\n )\n self.username = getpass.getuser()\n\n user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n if os.path.isfile(user_ssh_config_filename):\n ssh_conf = paramiko.SSHConfig()\n ssh_conf.parse(open(user_ssh_config_filename))\n host_info = ssh_conf.lookup(self.remote_host)\n if host_info and host_info.get("proxycommand"):\n self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n if not (self.password or self.key_file):\n if host_info and host_info.get("identityfile"):\n self.key_file = host_info.get("identityfile")[0]\n\n def get_connection(self):\n """\n Opens a SSH connection to the remote host.\n\n :rtype: paramiko.client.SSHClient\n """\n client = paramiko.SSHClient()\n if not self.allow_host_key_change:\n self.log.warning(\n "Remote Identification Change is not verified. This won't protect against "\n "Man-In-The-Middle attacks"\n )\n client.load_system_host_keys()\n if self.no_host_key_check:\n self.log.warning(\n "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n )\n # Default is RejectPolicy\n client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n if self.password and self.password.strip():\n client.connect(\n hostname=self.remote_host,\n username=self.username,\n password=self.password,\n key_filename=self.key_file,\n pkey=self.key_obj,\n timeout=self.timeout,\n compress=self.compress,\n port=self.remote_port,\n sock=self.host_proxy,\n look_for_keys=False,\n )\n else:\n client.connect(\n hostname=self.remote_host,\n username=self.username,\n key_filename=self.key_file,\n pkey=self.key_obj,\n timeout=self.timeout,\n compress=self.compress,\n port=self.remote_port,\n sock=self.host_proxy,\n )\n\n if self.keepalive_interval:\n client.get_transport().set_keepalive(self.keepalive_interval)\n\n return client\n\n def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n check.int_param(remote_port, "remote_port")\n check.str_param(remote_host, "remote_host")\n check.opt_int_param(local_port, "local_port")\n\n if local_port is not None:\n local_bind_address = ("localhost", local_port)\n else:\n local_bind_address = ("localhost",)\n\n # Will prefer key string if specified, otherwise use the key file\n pkey = self.key_obj if self.key_obj else self.key_file\n\n if self.password and self.password.strip():\n client = SSHTunnelForwarder(\n self.remote_host,\n ssh_port=self.remote_port,\n ssh_username=self.username,\n ssh_password=self.password,\n ssh_pkey=pkey,\n ssh_proxy=self.host_proxy,\n local_bind_address=local_bind_address,\n remote_bind_address=(remote_host, remote_port),\n logger=self.log,\n )\n else:\n client = SSHTunnelForwarder(\n self.remote_host,\n ssh_port=self.remote_port,\n ssh_username=self.username,\n ssh_pkey=pkey,\n ssh_proxy=self.host_proxy,\n local_bind_address=local_bind_address,\n remote_bind_address=(remote_host, remote_port),\n host_pkey_directories=[],\n logger=self.log,\n )\n\n return client\n\n def sftp_get(self, remote_filepath, local_filepath):\n check.str_param(remote_filepath, "remote_filepath")\n check.str_param(local_filepath, "local_filepath")\n conn = self.get_connection()\n with conn.open_sftp() as sftp_client:\n local_folder = os.path.dirname(local_filepath)\n\n # Create intermediate directories if they don't exist\n mkdir_p(local_folder)\n\n self.log.info(\n "Starting to transfer from {0} to {1}".format(remote_filepath, local_filepath)\n )\n\n sftp_client.get(remote_filepath, local_filepath)\n\n conn.close()\n return local_filepath\n\n def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n check.str_param(remote_filepath, "remote_filepath")\n check.str_param(local_filepath, "local_filepath")\n conn = self.get_connection()\n with conn.open_sftp() as sftp_client:\n self.log.info(\n "Starting to transfer file from {0} to {1}".format(local_filepath, remote_filepath)\n )\n\n sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n conn.close()\n return local_filepath\n\n\n[docs]@resource(\n {\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n int,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": 
Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n int,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n int,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(bool, is_required=False, default_value=True),\n "no_host_key_check": Field(bool, is_required=False, default_value=True),\n "allow_host_key_change": Field(bool, is_required=False, default_value=False),\n }\n)\ndef ssh_resource(init_context):\n args = init_context.resource_config\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log_manager})\n return SSHResource(**args)\n
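A usage sketch for the resource above; the host, username, key path, and file paths are placeholders.

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline, solid
    from dagster_ssh import ssh_resource


    @solid(required_resource_keys={"ssh"})
    def fetch_remote_file(context):
        # Copy a remote file to a local path over SFTP (hypothetical paths).
        context.resources.ssh.sftp_get("/var/data/report.csv", "/tmp/report.csv")


    @pipeline(mode_defs=[ModeDefinition(resource_defs={"ssh": ssh_resource})])
    def ssh_pipeline():
        fetch_remote_file()


    execute_pipeline(
        ssh_pipeline,
        {
            "resources": {
                "ssh": {
                    "config": {
                        "remote_host": "example.com",
                        "username": "deploy",
                        "key_file": "/home/deploy/.ssh/id_rsa",
                    }
                }
            }
        },
    )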
\nfrom dagster import Field, StringSource, resource\nfrom twilio.rest import Client\n\n\n[docs]@resource(\n {\n "account_sid": Field(StringSource, description="Twilio Account SID"),\n "auth_token": Field(StringSource, description="Twilio Auth Token"),\n },\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context):\n return Client(context.resource_config["account_sid"], context.resource_config["auth_token"])\n
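A usage sketch; the phone numbers are placeholders, and the credentials are read from environment variables via ``StringSource``.

.. code-block:: python

    from dagster import ModeDefinition, execute_pipeline, pipeline, solid
    from dagster_twilio import twilio_resource


    @solid(required_resource_keys={"twilio"})
    def send_text(context):
        # The resource is a twilio.rest.Client; phone numbers here are placeholders.
        context.resources.twilio.messages.create(
            body="Pipeline complete", from_="+15551234567", to="+15557654321"
        )


    @pipeline(mode_defs=[ModeDefinition(resource_defs={"twilio": twilio_resource})])
    def twilio_pipeline():
        send_text()


    execute_pipeline(
        twilio_pipeline,
        {
            "resources": {
                "twilio": {
                    "config": {
                        "account_sid": {"env": "TWILIO_ACCOUNT_SID"},
                        "auth_token": {"env": "TWILIO_AUTH_TOKEN"},
                    }
                }
            }
        },
    )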
\nfrom typing import Any, Dict, Set\n\nfrom dagster import PipelineDefinition, PipelineRun, SolidDefinition, check\nfrom dagster.core.definitions.dependency import Solid\nfrom dagster.core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster.core.execution.context.system import SystemPipelineExecutionContext\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.system_config.objects import EnvironmentConfig\n\n\n[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n pipeline_context: SystemPipelineExecutionContext,\n resource_keys_to_init: Set[str],\n solid_name: str,\n solid_config: Any = None,\n ):\n self._pipeline_context = check.inst_param(\n pipeline_context, "pipeline_context", SystemPipelineExecutionContext\n )\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.solid_name = check.str_param(solid_name, "solid_name")\n self._solid_config = solid_config\n\n[docs] def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._pipeline_context.has_tag(key)\n\n[docs] def get_tag(self, key: str) -> str:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._pipeline_context.get_tag(key)\n\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._pipeline_context.run_id\n\n @property\n def run_config(self) -> Dict[str, Any]:\n """dict: The run_config for the context."""\n return self._pipeline_context.run_config\n\n @property\n def environment_config(self) -> EnvironmentConfig:\n """:class:`dagster.EnvironmentConfig`: The environment_config for the context"""\n return self._pipeline_context.environment_config\n\n @property\n def logging_tags(self) -> Dict[str, str]:\n """dict: The logging tags for the context."""\n return self._pipeline_context.logging_tags\n\n @property\n def pipeline_name(self) -> str:\n return self._pipeline_context.pipeline_name\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n """:class:`dagster.PipelineDefinition`: The pipeline definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._pipeline_context.pipeline.get_definition()\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources."""\n return self._pipeline_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @property\n def pipeline_run(self) -> PipelineRun:\n """:class:`dagster.PipelineRun`: The pipeline run for the context."""\n return self._pipeline_context.pipeline_run\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._pipeline_context.log\n\n @property\n def solid_def(self) -> SolidDefinition:\n """:class:`dagster.SolidDefinition`: The solid definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether a\n solid definition was passed to 
``dagstermill.get_context``.\n """\n return self.pipeline_def.solid_def_named(self.solid_name)\n\n @property\n def solid(self) -> Solid:\n """:class:`dagster.Solid`: The solid for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether a\n solid definition was passed to ``dagstermill.get_context``.\n """\n return self.pipeline_def.solid_named(self.solid_name)\n\n @property\n def solid_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n solid-specific config."""\n if self._solid_config:\n return self._solid_config\n\n solid_config = self.environment_config.solids.get(self.solid_name)\n return solid_config.config if solid_config else None\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n pass\n
\nfrom dagster.core.errors import DagsterError\n\n\n\n
\nimport os\nimport pickle\nimport uuid\n\nfrom dagster import (\n AssetMaterialization,\n ExpectationResult,\n Failure,\n Materialization,\n ModeDefinition,\n PipelineDefinition,\n SolidDefinition,\n TypeCheck,\n check,\n seven,\n)\nfrom dagster.core.definitions.dependency import SolidHandle\nfrom dagster.core.definitions.reconstructable import ReconstructablePipeline\nfrom dagster.core.definitions.resource import ScopedResourcesBuilder\nfrom dagster.core.execution.api import create_execution_plan, scoped_pipeline_context\nfrom dagster.core.execution.resources_init import (\n get_required_resource_keys_to_init,\n resource_initialization_event_generator,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.loggers import colored_console_logger\nfrom dagster.serdes import unpack_value\nfrom dagster.utils import EventGenerationManager\n\nfrom .context import DagstermillExecutionContext, DagstermillRuntimeExecutionContext\nfrom .errors import DagstermillError\nfrom .serialize import PICKLE_PROTOCOL, read_value, write_value\n\n\nclass DagstermillResourceEventGenerationManager(EventGenerationManager):\n """ Utility class to explicitly manage setup/teardown of resource events. Overrides the default\n `generate_teardown_events` method so that teardown is deferred until explicitly called by the\n dagstermill Manager\n """\n\n def generate_teardown_events(self):\n return iter(())\n\n def teardown(self):\n return [\n teardown_event\n for teardown_event in super(\n DagstermillResourceEventGenerationManager, self\n ).generate_teardown_events()\n ]\n\n\nclass Manager:\n def __init__(self):\n self.pipeline = None\n self.solid_def = None\n self.in_pipeline = False\n self.marshal_dir = None\n self.context = None\n self.resource_manager = None\n\n def _setup_resources(\n self,\n execution_plan,\n environment_config,\n pipeline_run,\n log_manager,\n resource_keys_to_init,\n instance,\n resource_instances_to_override,\n ):\n """\n Drop-in replacement for\n `dagster.core.execution.resources_init.resource_initialization_manager`. It uses a\n `DagstermillResourceEventGenerationManager` and explicitly calls `teardown` on it\n """\n generator = resource_initialization_event_generator(\n execution_plan,\n environment_config,\n pipeline_run,\n log_manager,\n resource_keys_to_init,\n instance,\n resource_instances_to_override,\n )\n self.resource_manager = DagstermillResourceEventGenerationManager(\n generator, ScopedResourcesBuilder\n )\n return self.resource_manager\n\n def reconstitute_pipeline_context(\n self,\n output_log_path=None,\n marshal_dir=None,\n run_config=None,\n executable_dict=None,\n pipeline_run_dict=None,\n solid_handle_kwargs=None,\n instance_ref_dict=None,\n ):\n """Reconstitutes a context for dagstermill-managed execution.\n\n You'll see this function called to reconstruct a pipeline context within the ``injected\n parameters`` cell of a dagstermill output notebook. Users should not call this function\n interactively except when debugging output notebooks.\n\n Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a\n context for interactive exploration and development. 
This call will be replaced by one to\n :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by\n dagstermill.\n """\n check.opt_str_param(output_log_path, "output_log_path")\n check.opt_str_param(marshal_dir, "marshal_dir")\n run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n check.dict_param(pipeline_run_dict, "pipeline_run_dict")\n check.dict_param(executable_dict, "executable_dict")\n check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")\n check.dict_param(instance_ref_dict, "instance_ref_dict")\n\n pipeline = ReconstructablePipeline.from_dict(executable_dict)\n pipeline_def = pipeline.get_definition()\n\n try:\n instance_ref = unpack_value(instance_ref_dict)\n instance = DagsterInstance.from_ref(instance_ref)\n except Exception as err: # pylint: disable=broad-except\n raise DagstermillError(\n "Error when attempting to resolve DagsterInstance from serialized InstanceRef"\n ) from err\n\n pipeline_run = unpack_value(pipeline_run_dict)\n\n solid_handle = SolidHandle.from_dict(solid_handle_kwargs)\n solid_def = pipeline_def.get_solid(solid_handle).definition\n\n self.marshal_dir = marshal_dir\n self.in_pipeline = True\n self.solid_def = solid_def\n self.pipeline = pipeline\n\n execution_plan = create_execution_plan(\n self.pipeline,\n run_config,\n mode=pipeline_run.mode,\n step_keys_to_execute=pipeline_run.step_keys_to_execute,\n )\n\n with scoped_pipeline_context(\n execution_plan,\n run_config,\n pipeline_run,\n instance,\n scoped_resources_builder_cm=self._setup_resources,\n # Set this flag even though we're not in test for clearer error reporting\n raise_on_error=True,\n ) as pipeline_context:\n self.context = DagstermillRuntimeExecutionContext(\n pipeline_context=pipeline_context,\n solid_config=run_config.get("solids", {}).get(solid_def.name, {}).get("config"),\n resource_keys_to_init=get_required_resource_keys_to_init(\n execution_plan, pipeline_context.intermediate_storage_def,\n ),\n solid_name=solid_def.name,\n )\n\n return self.context\n\n def get_context(self, solid_config=None, mode_def=None, run_config=None):\n """Get a dagstermill execution context for interactive exploration and development.\n\n Args:\n solid_config (Optional[Any]): If specified, this value will be made available on the\n context as its ``solid_config`` property.\n mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to\n use to construct the context. Specify this if you would like a context constructed\n with specific ``resource_defs`` or ``logger_defs``. 
By default, an ephemeral mode\n with a console logger will be constructed.\n run_config(Optional[dict]): The environment config dict with which to construct\n the context.\n\n Returns:\n :py:class:`~dagstermill.DagstermillExecutionContext`\n """\n check.opt_inst_param(mode_def, "mode_def", ModeDefinition)\n run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n # If we are running non-interactively, and there is already a context reconstituted, return\n # that context rather than overwriting it.\n if self.context is not None and isinstance(\n self.context, DagstermillRuntimeExecutionContext\n ):\n return self.context\n\n if not mode_def:\n mode_def = ModeDefinition(logger_defs={"dagstermill": colored_console_logger})\n run_config["loggers"] = {"dagstermill": {}}\n\n solid_def = SolidDefinition(\n name="this_solid",\n input_defs=[],\n compute_fn=lambda *args, **kwargs: None,\n output_defs=[],\n description="Ephemeral solid constructed by dagstermill.get_context()",\n required_resource_keys=mode_def.resource_key_set,\n )\n\n pipeline_def = PipelineDefinition(\n [solid_def], mode_defs=[mode_def], name="ephemeral_dagstermill_pipeline"\n )\n\n run_id = make_new_run_id()\n\n # construct stubbed PipelineRun for notebook exploration...\n # The actual pipeline run during pipeline execution will be serialized and reconstituted\n # in the `reconstitute_pipeline_context` call\n pipeline_run = PipelineRun(\n pipeline_name=pipeline_def.name,\n run_id=run_id,\n run_config=run_config,\n mode=mode_def.name,\n step_keys_to_execute=None,\n status=PipelineRunStatus.NOT_STARTED,\n tags=None,\n )\n\n self.in_pipeline = False\n self.solid_def = solid_def\n self.pipeline = pipeline_def\n\n execution_plan = create_execution_plan(self.pipeline, run_config, mode=mode_def.name)\n with scoped_pipeline_context(\n execution_plan,\n run_config,\n pipeline_run,\n DagsterInstance.ephemeral(),\n scoped_resources_builder_cm=self._setup_resources,\n ) as pipeline_context:\n\n self.context = DagstermillExecutionContext(\n pipeline_context=pipeline_context,\n solid_config=solid_config,\n resource_keys_to_init=get_required_resource_keys_to_init(\n execution_plan, pipeline_context.intermediate_storage_def,\n ),\n solid_name=solid_def.name,\n )\n\n return self.context\n\n def yield_result(self, value, output_name="result"):\n """Yield a result directly from notebook code.\n\n When called interactively or in development, returns its input.\n\n Args:\n value (Any): The value to yield.\n output_name (Optional[str]): The name of the result to yield (default: ``'result'``).\n """\n if not self.in_pipeline:\n return value\n\n # deferred import for perf\n import scrapbook\n\n if not self.solid_def.has_output(output_name):\n raise DagstermillError(\n f"Solid {self.solid_def.name} does not have output named {output_name}."\n f"Expected one of {[str(output_def.name) for output_def in self.solid_def.output_defs]}"\n )\n\n dagster_type = self.solid_def.output_def_named(output_name).dagster_type\n\n out_file = os.path.join(self.marshal_dir, f"output-{output_name}")\n scrapbook.glue(output_name, write_value(dagster_type, value, out_file))\n\n def yield_event(self, dagster_event):\n """Yield a dagster event directly from notebook code.\n\n When called interactively or in development, returns its input.\n\n Args:\n dagster_event (Union[:class:`dagster.Materialization`, :class:`dagster.ExpectationResult`, :class:`dagster.TypeCheck`, :class:`dagster.Failure`]):\n An event to yield back to Dagster.\n """\n check.inst_param(\n 
dagster_event,\n "dagster_event",\n (AssetMaterialization, Materialization, ExpectationResult, TypeCheck, Failure),\n )\n\n if not self.in_pipeline:\n return dagster_event\n\n # deferred import for perf\n import scrapbook\n\n event_id = "event-{event_uuid}".format(event_uuid=str(uuid.uuid4()))\n out_file_path = os.path.join(self.marshal_dir, event_id)\n with open(out_file_path, "wb") as fd:\n fd.write(pickle.dumps(dagster_event, PICKLE_PROTOCOL))\n\n scrapbook.glue(event_id, out_file_path)\n\n def teardown_resources(self):\n if self.resource_manager is not None:\n self.resource_manager.teardown()\n\n def load_parameter(self, input_name, input_value):\n input_def = self.solid_def.input_def_named(input_name)\n return read_value(input_def.dagster_type, seven.json.loads(input_value))\n\n\nMANAGER_FOR_NOTEBOOK_INSTANCE = Manager()\n
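These manager methods are normally reached through the ``dagstermill`` package's top-level functions from inside a notebook. A sketch of that interactive flow; the config key and values are illustrative.

.. code-block:: python

    # In the `parameters` cell of a notebook backed by a dagstermill solid:
    import dagstermill

    context = dagstermill.get_context(solid_config={"date": "2020-01-01"})
    context.log.info("Running for {}".format(context.solid_config["date"]))

    # Later in the notebook, hand events and results back to Dagster.
    # When run interactively, these calls simply return their inputs.
    from dagster import AssetMaterialization

    dagstermill.yield_event(AssetMaterialization(asset_key="my_dataset"))
    dagstermill.yield_result(42, output_name="result")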
\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\n\nimport nbformat\nimport papermill\nfrom dagster import (\n AssetMaterialization,\n EventMetadataEntry,\n InputDefinition,\n Output,\n OutputDefinition,\n SolidDefinition,\n check,\n seven,\n)\nfrom dagster.core.definitions.reconstructable import ReconstructablePipeline\nfrom dagster.core.execution.context.compute import SolidExecutionContext\nfrom dagster.core.execution.context.system import SystemComputeExecutionContext\nfrom dagster.core.storage.file_manager import FileHandle\nfrom dagster.serdes import pack_value\nfrom dagster.utils import mkdir_p, safe_tempfile_path\nfrom dagster.utils.error import serializable_error_info_from_exc_info\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\nfrom papermill.parameterize import _find_first_tagged_cell_index\n\nfrom .engine import DagstermillNBConvertEngine\nfrom .errors import DagstermillError\nfrom .serialize import read_value, write_value\nfrom .translator import RESERVED_INPUT_NAMES, DagsterTranslator\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n """Assigned parameters into the appropiate place in the input notebook\n\n Args:\n nb (NotebookNode): Executable notebook object\n parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n """\n check.dict_param(parameters, "parameters")\n\n # Copy the nb object to avoid polluting the input\n nb = copy.deepcopy(nb)\n\n # papermill method chooses translator based on kernel_name and language, but we just call the\n # DagsterTranslator to generate parameter content based on the kernel_name\n param_content = DagsterTranslator.codify(parameters)\n\n newcell = nbformat.v4.new_code_cell(source=param_content)\n newcell.metadata["tags"] = ["injected-parameters"]\n\n param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n if injected_cell_index >= 0:\n # Replace the injected cell with a new version\n before = nb.cells[:injected_cell_index]\n after = nb.cells[injected_cell_index + 1 :]\n check.int_value_param(param_cell_index, -1, "param_cell_index")\n # We should have blown away the parameters cell if there is an injected-parameters cell\n elif param_cell_index >= 0:\n # Replace the parameter cell with the injected-parameters cell\n before = nb.cells[:param_cell_index]\n after = nb.cells[param_cell_index + 1 :]\n else:\n # Inject to the top of the notebook, presumably first cell includes dagstermill import\n context.log.debug(\n (\n "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n "cell."\n )\n )\n before = []\n after = nb.cells\n\n nb.cells = before + [newcell] + after\n nb.metadata.papermill["parameters"] = seven.json.dumps(parameters)\n\n return nb\n\n\ndef get_papermill_parameters(compute_context, inputs, output_log_path):\n check.inst_param(compute_context, "compute_context", SystemComputeExecutionContext)\n check.param_invariant(\n isinstance(compute_context.run_config, dict),\n "compute_context",\n "SystemComputeExecutionContext must have valid run_config",\n )\n check.dict_param(inputs, "inputs", key_type=str)\n\n run_id = compute_context.run_id\n\n marshal_dir 
= "/tmp/dagstermill/{run_id}/marshal".format(run_id=run_id)\n mkdir_p(marshal_dir)\n\n if not isinstance(compute_context.pipeline, ReconstructablePipeline):\n raise DagstermillError(\n "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "\n "Use the reconstructable() function if executing from python"\n )\n\n dm_executable_dict = compute_context.pipeline.to_dict()\n\n dm_context_dict = {\n "output_log_path": output_log_path,\n "marshal_dir": marshal_dir,\n "run_config": compute_context.run_config,\n }\n\n dm_solid_handle_kwargs = compute_context.solid_handle._asdict()\n\n parameters = {}\n\n input_def_dict = compute_context.solid_def.input_dict\n for input_name, input_value in inputs.items():\n assert (\n input_name not in RESERVED_INPUT_NAMES\n ), "Dagstermill solids cannot have inputs named {input_name}".format(input_name=input_name)\n dagster_type = input_def_dict[input_name].dagster_type\n parameter_value = write_value(\n dagster_type, input_value, os.path.join(marshal_dir, "input-{}".format(input_name))\n )\n parameters[input_name] = parameter_value\n\n parameters["__dm_context"] = dm_context_dict\n parameters["__dm_executable_dict"] = dm_executable_dict\n parameters["__dm_pipeline_run_dict"] = pack_value(compute_context.pipeline_run)\n parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs\n parameters["__dm_instance_ref_dict"] = pack_value(compute_context.instance.get_ref())\n\n return parameters\n\n\ndef _dm_solid_compute(name, notebook_path, output_notebook=None, asset_key_prefix=None):\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.opt_str_param(output_notebook, "output_notebook")\n check.opt_list_param(asset_key_prefix, "asset_key_prefix")\n\n def _t_fn(compute_context, inputs):\n check.inst_param(compute_context, "compute_context", SolidExecutionContext)\n check.param_invariant(\n isinstance(compute_context.run_config, dict),\n "context",\n "SystemComputeExecutionContext must have valid run_config",\n )\n\n system_compute_context = compute_context.get_system_context()\n\n with tempfile.TemporaryDirectory() as output_notebook_dir:\n with safe_tempfile_path() as output_log_path:\n\n parameterized_notebook_path = os.path.join(\n output_notebook_dir, "{prefix}-inter.ipynb".format(prefix=str(uuid.uuid4()))\n )\n\n executed_notebook_path = os.path.join(\n output_notebook_dir, "{prefix}-out.ipynb".format(prefix=str(uuid.uuid4()))\n )\n\n # Scaffold the registration here\n nb = load_notebook_node(notebook_path)\n nb_no_parameters = replace_parameters(\n system_compute_context,\n nb,\n get_papermill_parameters(system_compute_context, inputs, output_log_path),\n )\n write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n try:\n papermill_engines.register("dagstermill", DagstermillNBConvertEngine)\n papermill.execute_notebook(\n input_path=parameterized_notebook_path,\n output_path=executed_notebook_path,\n engine_name="dagstermill",\n log_output=True,\n )\n\n except Exception: # pylint: disable=broad-except\n try:\n with open(executed_notebook_path, "rb") as fd:\n executed_notebook_file_handle = compute_context.resources.file_manager.write(\n fd, mode="wb", ext="ipynb"\n )\n executed_notebook_materialization_path = (\n executed_notebook_file_handle.path_desc\n )\n except Exception: # pylint: disable=broad-except\n compute_context.log.warning(\n "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(\n 
exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n )\n executed_notebook_materialization_path = executed_notebook_path\n\n yield AssetMaterialization(\n asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),\n description="Location of output notebook in file manager",\n metadata_entries=[\n EventMetadataEntry.fspath(\n executed_notebook_materialization_path,\n label="executed_notebook_path",\n )\n ],\n )\n raise\n\n system_compute_context.log.debug(\n "Notebook execution complete for {name} at {executed_notebook_path}.".format(\n name=name, executed_notebook_path=executed_notebook_path,\n )\n )\n\n executed_notebook_file_handle = None\n try:\n # use binary mode when when moving the file since certain file_managers such as S3\n # may try to hash the contents\n with open(executed_notebook_path, "rb") as fd:\n executed_notebook_file_handle = compute_context.resources.file_manager.write(\n fd, mode="wb", ext="ipynb"\n )\n executed_notebook_materialization_path = executed_notebook_file_handle.path_desc\n except Exception: # pylint: disable=broad-except\n compute_context.log.warning(\n "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(\n exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n )\n executed_notebook_materialization_path = executed_notebook_path\n\n yield AssetMaterialization(\n asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),\n description="Location of output notebook in file manager",\n metadata_entries=[\n EventMetadataEntry.fspath(executed_notebook_materialization_path)\n ],\n )\n\n if output_notebook is not None:\n yield Output(executed_notebook_file_handle, output_notebook)\n\n # deferred import for perf\n import scrapbook\n\n output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n for (output_name, output_def) in system_compute_context.solid_def.output_dict.items():\n data_dict = output_nb.scraps.data_dict\n if output_name in data_dict:\n value = read_value(output_def.dagster_type, data_dict[output_name])\n\n yield Output(value, output_name)\n\n for key, value in output_nb.scraps.items():\n if key.startswith("event-"):\n with open(value.data, "rb") as fd:\n yield pickle.loads(fd.read())\n\n return _t_fn\n\n\n[docs]def define_dagstermill_solid(\n name,\n notebook_path,\n input_defs=None,\n output_defs=None,\n config_schema=None,\n required_resource_keys=None,\n output_notebook=None,\n asset_key_prefix=None,\n):\n """Wrap a Jupyter notebook in a solid.\n\n Arguments:\n name (str): The name of the solid.\n notebook_path (str): Path to the backing notebook.\n input_defs (Optional[List[InputDefinition]]): The solid's inputs.\n output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should\n call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n output_notebook (Optional[str]): If set, will be used as the name of an injected output of\n type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in\n addition to the :py:class:`~dagster.AssetMaterialization` that is always created). 
This\n respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on\n the pipeline resources via the "file_manager" resource key, so, e.g.,\n if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a :\n py:class:`~dagster_aws.s3.S3FileHandle`.\n asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n asset keys for materialized notebooks.\n\n Returns:\n :py:class:`~dagster.SolidDefinition`\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n if output_notebook is not None:\n required_resource_keys.add("file_manager")\n if isinstance(asset_key_prefix, str):\n asset_key_prefix = [asset_key_prefix]\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n return SolidDefinition(\n name=name,\n input_defs=input_defs,\n compute_fn=_dm_solid_compute(\n name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix\n ),\n output_defs=output_defs\n + (\n [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]\n if output_notebook\n else []\n ),\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n description="This solid is backed by the notebook at {path}".format(path=notebook_path),\n tags={"notebook_path": notebook_path, "kind": "ipynb"},\n )\n
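A usage sketch wrapping a hypothetical notebook; the notebook itself would call ``dagstermill.yield_result`` to produce the declared output.

.. code-block:: python

    from dagster import Int, OutputDefinition, pipeline
    from dagstermill import define_dagstermill_solid

    # Hypothetical notebook path, for illustration only.
    my_notebook_solid = define_dagstermill_solid(
        name="analyze",
        notebook_path="notebooks/analyze.ipynb",
        output_defs=[OutputDefinition(Int, name="result")],
    )


    @pipeline
    def notebook_pipeline():
        my_notebook_solid()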
dagster asset [OPTIONS] COMMAND [ARGS]...\n
Eliminate asset key indexes from event logs. Warning: Cannot be undone
\nUsage:
\n\n\ndagster asset wipe --all
\ndagster asset wipe <unstructured_asset_key_name>
\ndagster asset wipe <json_string_of_structured_asset_key>
\n
dagster asset wipe [OPTIONS] [KEY]...\n
Options
\n--all
Eliminate all asset key indexes
\nArguments
\nKEY
Optional argument(s)
\ndagster debug [OPTIONS] COMMAND [ARGS]...\n
dagster instance [OPTIONS] COMMAND [ARGS]...\n
List the information about the current instance.
\ndagster instance info [OPTIONS]\n
Automatically migrate an out of date instance.
\ndagster instance migrate [OPTIONS]\n
Rebuild index over historical runs for performance.
\ndagster instance reindex [OPTIONS]\n
dagster pipeline [OPTIONS] COMMAND [ARGS]...

Backfill a partitioned pipeline.

This command targets a partitioned pipeline. The pipeline and partition set must be defined in a repository, which can be specified in a number of ways:

    dagster pipeline backfill -p <<pipeline_name>> (works if .workspace.yaml exists)
    dagster pipeline backfill -p <<pipeline_name>> -w path/to/workspace.yaml
    dagster pipeline backfill -f /path/to/file.py -a define_some_repo -p <<pipeline_name>>
    dagster pipeline backfill -m a_module.submodule -a define_some_repo -p <<pipeline_name>>

    dagster pipeline backfill [OPTIONS]

Options:
    -p, --pipeline <pipeline>  Pipeline within the repository, necessary if more than one pipeline is present.
    -l, --location <location>  RepositoryLocation within the workspace, necessary if more than one location is present.
    -r, --repository <repository>  Repository within the workspace, necessary if more than one repository is present.
    --grpc-host <grpc_host>  Host to use to connect to gRPC server, defaults to localhost
    --grpc-socket <grpc_socket>  Named socket to use to connect to gRPC server
    --grpc-port <grpc_port>  Port to use to connect to gRPC server
    -a, --attribute <attribute>  Attribute that is either 1) a repository or pipeline or 2) a function that returns a repository or pipeline
    -m, --module-name <module_name>  Specify module where repository or pipeline function lives
    --package-name <package_name>  Specify installed Python package where repository or pipeline function lives
    -f, --python-file <python_file>  Specify python file where repository or pipeline function lives
    -d, --working-directory <working_directory>  Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/--python-file
    --empty-working-directory  Indicates that the working directory should be empty and should not be set to the current directory as a default
    -w, --workspace <workspace>  Path to workspace file. Argument can be provided multiple times.
    --empty-workspace  Allow an empty workspace
    --partitions <partitions>  Comma-separated list of partition names that we want to backfill
    --partition-set <partition_set>  The name of the partition set over which we want to backfill.
    --all <all>  Specify to select all partitions to backfill.
    --from <from>  Specify a start partition for this backfill job. Example: dagster pipeline backfill log_daily_stats --from 20191101
    --to <to>  Specify an end partition for this backfill job. Example: dagster pipeline backfill log_daily_stats --to 20191201
    --tags <tags>  JSON string of tags to use for this pipeline run
    --noprompt
Execute a pipeline.

This command targets a pipeline. The pipeline can be specified in a number of ways:

    dagster pipeline execute -f /path/to/file.py -a define_some_pipeline
    dagster pipeline execute -m a_module.submodule -a define_some_pipeline
    dagster pipeline execute -f /path/to/file.py -a define_some_repo -p <<pipeline_name>>
    dagster pipeline execute -m a_module.submodule -a define_some_repo -p <<pipeline_name>>

    dagster pipeline execute [OPTIONS]

Options:
    -p, --pipeline <pipeline>  Pipeline within the repository, necessary if more than one pipeline is present.
    -r, --repository <repository>  Repository name, necessary if more than one repository is present.
    -a, --attribute <attribute>  Attribute that is either 1) a repository or pipeline or 2) a function that returns a repository or pipeline
    -m, --module-name <module_name>  Specify module where repository or pipeline function lives
    --package-name <package_name>  Specify installed Python package where repository or pipeline function lives
    -f, --python-file <python_file>  Specify python file where repository or pipeline function lives
    -d, --working-directory <working_directory>  Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/--python-file
    --empty-working-directory  Indicates that the working directory should be empty and should not be set to the current directory as a default
    -c, --config <config>  Specify one or more run config files. These can also be file patterns. If more than one run config file is captured then those files are merged. Files listed first take precedence. They will smash the values of subsequent files at the key-level granularity. If the file is a pattern then you must enclose it in double quotes.
        Example: dagster pipeline execute -f hello_world.py -p pandas_hello_world -c "pandas_hello_world/*.yaml"
        You can also specify multiple files: dagster pipeline execute -f hello_world.py -p pandas_hello_world -c pandas_hello_world/solids.yaml -e pandas_hello_world/env.yaml
    --preset <preset>  Specify a preset to use for this pipeline. Presets are defined on pipelines under preset_defs.
    --mode <mode>  The name of the mode in which to execute the pipeline.
    --tags <tags>  JSON string of tags to use for this pipeline run
    -s, --solid-selection <solid_selection>  Specify the solid subselection to execute. It can be multiple clauses separated by commas. Examples:
        - "some_solid" will execute "some_solid" itself
        - "*some_solid" will execute "some_solid" and all its ancestors (upstream dependencies)
        - "*some_solid+++" will execute "some_solid", all its ancestors, and its descendants (downstream dependencies) within 3 levels down
        - "*some_solid,other_solid_a,other_solid_b+" will execute "some_solid" and all its ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child solids
Launch a pipeline using the run launcher configured on the Dagster instance.

This command targets a pipeline. The pipeline can be specified in a number of ways:

    dagster pipeline launch -p <<pipeline_name>> (works if .workspace.yaml exists)
    dagster pipeline launch -p <<pipeline_name>> -w path/to/workspace.yaml
    dagster pipeline launch -f /path/to/file.py -a define_some_pipeline
    dagster pipeline launch -m a_module.submodule -a define_some_pipeline
    dagster pipeline launch -f /path/to/file.py -a define_some_repo -p <<pipeline_name>>
    dagster pipeline launch -m a_module.submodule -a define_some_repo -p <<pipeline_name>>

    dagster pipeline launch [OPTIONS]

Options:
    -p, --pipeline <pipeline>  Pipeline within the repository, necessary if more than one pipeline is present.
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
    -c, --config <config>  Specify one or more run config files. These can also be file patterns. If more than one run config file is captured then those files are merged. Files listed first take precedence. They will smash the values of subsequent files at the key-level granularity. If the file is a pattern then you must enclose it in double quotes.
        Example: dagster pipeline launch -f hello_world.py -p pandas_hello_world -c "pandas_hello_world/*.yaml"
        You can also specify multiple files: dagster pipeline launch -f hello_world.py -p pandas_hello_world -c pandas_hello_world/solids.yaml -e pandas_hello_world/env.yaml
    --config-json <config_json>  JSON string of run config to use for this pipeline run. Cannot be used with -c / --config.
    --preset <preset>  Specify a preset to use for this pipeline. Presets are defined on pipelines under preset_defs.
    --mode <mode>  The name of the mode in which to execute the pipeline.
    --tags <tags>  JSON string of tags to use for this pipeline run
    -s, --solid-selection <solid_selection>  Specify the solid subselection to launch. It can be multiple clauses separated by commas. Examples:
        - "some_solid" will launch "some_solid" itself
        - "*some_solid" will launch "some_solid" and all its ancestors (upstream dependencies)
        - "*some_solid+++" will launch "some_solid", all its ancestors, and its descendants (downstream dependencies) within 3 levels down
        - "*some_solid,other_solid_a,other_solid_b+" will launch "some_solid" and all its ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child solids
    --run-id <run_id>  The ID to give to the launched pipeline run
List the pipelines in a repository. Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

    dagster pipeline list [OPTIONS]

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
Display the freshness of memoized results for the given pipeline.

This command targets a pipeline. The pipeline can be specified in a number of ways:

    dagster pipeline list_versions -f /path/to/file.py -a define_some_pipeline
    dagster pipeline list_versions -m a_module.submodule -a define_some_pipeline
    dagster pipeline list_versions -f /path/to/file.py -a define_some_repo -p <<pipeline_name>>
    dagster pipeline list_versions -m a_module.submodule -a define_some_repo -p <<pipeline_name>>

    dagster pipeline list_versions [OPTIONS]

Options:
    -p, --pipeline <pipeline>  Pipeline within the repository, necessary if more than one pipeline is present.
    -r, --repository <repository>  Repository name, necessary if more than one repository is present.
    -a, --attribute <attribute>  Attribute that is either 1) a repository or pipeline or 2) a function that returns a repository or pipeline
    -m, --module-name <module_name>  Specify module where repository or pipeline function lives
    --package-name <package_name>  Specify installed Python package where repository or pipeline function lives
    -f, --python-file <python_file>  Specify python file where repository or pipeline function lives
    -d, --working-directory <working_directory>  Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/--python-file
    --empty-working-directory  Indicates that the working directory should be empty and should not be set to the current directory as a default
    -c, --config <config>  Specify one or more run config files. These can also be file patterns. If more than one run config file is captured then those files are merged. Files listed first take precedence. They will smash the values of subsequent files at the key-level granularity. If the file is a pattern then you must enclose it in double quotes.
        Example: dagster pipeline list_versions -f hello_world.py -p pandas_hello_world -c "pandas_hello_world/*.yaml"
        You can also specify multiple files: dagster pipeline list_versions -f hello_world.py -p pandas_hello_world -c pandas_hello_world/solids.yaml -e pandas_hello_world/env.yaml
    --preset <preset>  Specify a preset to use for this pipeline. Presets are defined on pipelines under preset_defs.
    --mode <mode>  The name of the mode in which to execute the pipeline.
Print a pipeline.

This command targets a pipeline. The pipeline can be specified in a number of ways:

    dagster pipeline print -p <<pipeline_name>> (works if .workspace.yaml exists)
    dagster pipeline print -p <<pipeline_name>> -w path/to/workspace.yaml
    dagster pipeline print -f /path/to/file.py -a define_some_pipeline
    dagster pipeline print -m a_module.submodule -a define_some_pipeline
    dagster pipeline print -f /path/to/file.py -a define_some_repo -p <<pipeline_name>>
    dagster pipeline print -m a_module.submodule -a define_some_repo -p <<pipeline_name>>

    dagster pipeline print [OPTIONS]

Options:
    --verbose
    -p, --pipeline <pipeline>  Pipeline within the repository, necessary if more than one pipeline is present.
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
Scaffold the config for a pipeline.

This command targets a pipeline. The pipeline can be specified in a number of ways:

    dagster pipeline scaffold_config -f /path/to/file.py -a define_some_pipeline
    dagster pipeline scaffold_config -m a_module.submodule -a define_some_pipeline
    dagster pipeline scaffold_config -f /path/to/file.py -a define_some_repo -p <<pipeline_name>>
    dagster pipeline scaffold_config -m a_module.submodule -a define_some_repo -p <<pipeline_name>>

    dagster pipeline scaffold_config [OPTIONS]

Options:
    -p, --pipeline <pipeline>  Pipeline within the repository, necessary if more than one pipeline is present.
    -r, --repository <repository>  Repository name, necessary if more than one repository is present.
    -a, --attribute <attribute>  Attribute that is either 1) a repository or pipeline or 2) a function that returns a repository or pipeline
    -m, --module-name <module_name>  Specify module where repository or pipeline function lives
    --package-name <package_name>  Specify installed Python package where repository or pipeline function lives
    -f, --python-file <python_file>  Specify python file where repository or pipeline function lives
    -d, --working-directory <working_directory>  Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/--python-file
    --empty-working-directory  Indicates that the working directory should be empty and should not be set to the current directory as a default
    --print-only-required

dagster run [OPTIONS] COMMAND [ARGS]...

List the runs in this dagster installation.

    dagster run list [OPTIONS]

Options:
    --limit <limit>  Only list a specified number of runs

Eliminate all run history and event logs. Warning: Cannot be undone.

    dagster run wipe [OPTIONS]
dagster schedule [OPTIONS] COMMAND [ARGS]...

List all schedules that correspond to a repository.

    dagster schedule list [OPTIONS]

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
    --running  Filter for running schedules
    --stopped  Filter for stopped schedules
    --name  Only display schedule names
Get logs for a schedule.

    dagster schedule logs [OPTIONS] [SCHEDULE_NAME]...

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SCHEDULE_NAME  Optional argument(s)
Preview changes that will be performed by `dagster schedule up`.

    dagster schedule preview [OPTIONS]

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
Restart a running schedule.

    dagster schedule restart [OPTIONS] [SCHEDULE_NAME]...

Options:
    --restart-all-running  Restart previously running schedules
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SCHEDULE_NAME  Optional argument(s)
Start an existing schedule.

    dagster schedule start [OPTIONS] [SCHEDULE_NAME]...

Options:
    --start-all  Start all schedules
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SCHEDULE_NAME  Optional argument(s)
Stop an existing schedule.

    dagster schedule stop [OPTIONS] [SCHEDULE_NAME]...

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SCHEDULE_NAME  Optional argument(s)
Updates the internal dagster representation of schedules to match the list of ScheduleDefinitions defined in the repository. Use dagster schedule up --preview or dagster schedule preview to preview what changes will be applied. New ScheduleDefinitions will not start running by default when up is called. Use dagster schedule start and dagster schedule stop to start and stop a schedule. If a ScheduleDefinition is deleted, the corresponding running schedule will be stopped and deleted.

    dagster schedule up [OPTIONS]

Options:
    --preview  Preview changes
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
Deletes schedule history and turns off all schedules.

    dagster schedule wipe [OPTIONS]

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)
\ndagster sensor [OPTIONS] COMMAND [ARGS]...\n
List all sensors that correspond to a repository.
\ndagster sensor list [OPTIONS]\n
Options
\n-l
,
--location
<location>
\u00b6RepositoryLocation within the workspace, necessary if more than one location is present.
\n-r
,
--repository
<repository>
\u00b6Repository within the workspace, necessary if more than one repository is present.
\n--grpc-host
<grpc_host>
\u00b6Host to use to connect to gRPC server, defaults to localhost
\n--grpc-socket
<grpc_socket>
\u00b6Named socket to use to connect to gRPC server
\n--grpc-port
<grpc_port>
\u00b6Port to use to connect to gRPC server
\n-a
,
--attribute
<attribute>
\u00b6Attribute that is either a 1) repository or pipeline or 2) a function that returns a repository or pipeline
\n-m
,
--module-name
<module_name>
\u00b6Specify module where repository or pipeline function lives
\n--package-name
<package_name>
\u00b6Specify installed Python package where repository or pipeline function lives
\n-f
,
--python-file
<python_file>
\u00b6Specify python file where repository or pipeline function lives
\n-d
,
--working-directory
<working_directory>
\u00b6Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/\u2013python-file
\n--empty-working-directory
\u00b6Indicates that the working directory should be empty and should not set to the current directory as a default
\n-w
,
--workspace
<workspace>
\u00b6Path to workspace file. Argument can be provided multiple times.
\n--empty-workspace
\u00b6Allow an empty workspace
\n--running
\u00b6Filter for running sensors
\n--stopped
\u00b6Filter for stopped sensors
\n--name
\u00b6Only display sensor sensor names
Preview an existing sensor execution.

    dagster sensor preview [OPTIONS] [SENSOR_NAME]...

Options:
    --since <since>  Set the last_completion_time value as a timestamp float for the sensor context
    --last_run_key <last_run_key>  Set the last_run_key value for the sensor context
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SENSOR_NAME  Optional argument(s)
Start an existing sensor.

    dagster sensor start [OPTIONS] [SENSOR_NAME]...

Options:
    --start-all  Start all sensors
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SENSOR_NAME  Optional argument(s)
Stop an existing sensor.

    dagster sensor stop [OPTIONS] [SENSOR_NAME]...

Options:
    -l/--location, -r/--repository, --grpc-host, --grpc-socket, --grpc-port, -a/--attribute, -m/--module-name, --package-name, -f/--python-file, -d/--working-directory, --empty-working-directory, -w/--workspace, --empty-workspace  (standard repository/workspace loading options; see dagster pipeline backfill above for descriptions)

Arguments:
    SENSOR_NAME  Optional argument(s)
Run a GraphQL query against the dagster interface to a specified repository or pipeline.

Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

Examples:

    dagster-graphql
    dagster-graphql -y path/to/workspace.yaml
    dagster-graphql -f path/to/file.py -a define_repo
    dagster-graphql -m some_module -a define_repo
    dagster-graphql -f path/to/file.py -a define_pipeline
    dagster-graphql -m some_module -a define_pipeline

    dagster-graphql [OPTIONS]

Options:
    --version  Show the version and exit.
    -t, --text <text>  GraphQL document to execute passed as a string
    -f, --file <file>  GraphQL document to execute passed as a file
    -p, --predefined <predefined>  GraphQL document to execute, from a predefined set provided by dagster-graphql: launchPipelineExecution
    -v, --variables <variables>  A JSON encoded string containing the variables for GraphQL execution.
    -r, --remote <remote>  A URL for a remote instance running dagit server to send the GraphQL request to.
    -o, --output <output>  A file path to store the GraphQL response to. This flag is useful when making pipeline execution queries, since pipeline execution causes logs to print to stdout and stderr.
    --remap-sigterm  Remap SIGTERM signal to SIGINT handler
    --empty-workspace  Allow an empty workspace
    -w, --workspace <workspace>  Path to workspace file. Argument can be provided multiple times.
    --empty-working-directory  Indicates that the working directory should be empty and should not be set to the current directory as a default
    -d, --working-directory <working_directory>  Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/--python-file
    -f, --python-file <python_file>  Specify python file where repository or pipeline function lives
    --package-name <package_name>  Specify installed Python package where repository or pipeline function lives
    -m, --module-name <module_name>  Specify module where repository or pipeline function lives
    -a, --attribute <attribute>  Attribute that is either 1) a repository or pipeline or 2) a function that returns a repository or pipeline
    --grpc-port <grpc_port>  Port to use to connect to gRPC server
    --grpc-socket <grpc_socket>  Named socket to use to connect to gRPC server
    --grpc-host <grpc_host>  Host to use to connect to gRPC server, defaults to localhost
Run dagit. Loads a repository or pipeline.

Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

Examples:

    dagit (works if .workspace.yaml exists)
    dagit -w path/to/workspace.yaml
    dagit -f path/to/file.py
    dagit -f path/to/file.py -d path/to/working_directory
    dagit -m some_module
    dagit -f path/to/file.py -a define_repo
    dagit -m some_module -a define_repo
    dagit -p 3333

Options can also be provided via environment variables prefixed with DAGIT. For example, DAGIT_PORT=3333 dagit

    dagit [OPTIONS]

Options:
    --grpc-host <grpc_host>  Host to use to connect to gRPC server, defaults to localhost
    --grpc-socket <grpc_socket>  Named socket to use to connect to gRPC server
    --grpc-port <grpc_port>  Port to use to connect to gRPC server
    -a, --attribute <attribute>  Attribute that is either 1) a repository or pipeline or 2) a function that returns a repository or pipeline
    -m, --module-name <module_name>  Specify module where repository or pipeline function lives
    --package-name <package_name>  Specify installed Python package where repository or pipeline function lives
    -f, --python-file <python_file>  Specify python file where repository or pipeline function lives
    -d, --working-directory <working_directory>  Specify working directory to use when loading the repository or pipeline. Can only be used along with -f/--python-file
    --empty-working-directory  Indicates that the working directory should be empty and should not be set to the current directory as a default
    -w, --workspace <workspace>  Path to workspace file. Argument can be provided multiple times.
    --empty-workspace  Allow an empty workspace
    -h, --host <host>  Host to run server on [default: 127.0.0.1]
    -p, --port <port>  Port to run server on, default is 3000
    -l, --path-prefix <path_prefix>  The path prefix where Dagit will be hosted (eg: /dagit) [default: ]
    --storage-fallback <storage_fallback>  Base directory for dagster storage if $DAGSTER_HOME is not set
    --db-statement-timeout <db_statement_timeout>  The timeout in milliseconds to set on database statements sent to the DagsterInstance. Not respected in all configurations. [default: 5000]
    --version  Show the version and exit.
The following types are used to describe the schema of configuration data via config. They are used in conjunction with the builtin types above.
dagster.ConfigSchema

    This is a placeholder type. Any time that it appears in documentation, it means that any of the following types are acceptable:

    - A Python scalar type that resolves to a Dagster config type (int, float, bool, or str). For example:

        @solid(config_schema=int)
        @solid(config_schema=str)

    - A built-in python collection (list, or dict). list is exactly equivalent to Array[Any] and dict is equivalent to Permissive. For example:

        @solid(config_schema=list)
        @solid(config_schema=dict)

    - A Dagster config type.

    - A bare python dictionary, which will be automatically wrapped in Shape. Values of the dictionary are resolved recursively according to the same rules. For example:

        {'some_config': str} is equivalent to Shape({'some_config': str}).
        {'some_config1': {'some_config2': str}} is equivalent to Shape({'some_config1': Shape({'some_config2': str})}).

    - A bare python list of length one, whose single element will be wrapped in an Array and is resolved recursively according to the same rules. For example:

        [str] is equivalent to Array[str].
        [[str]] is equivalent to Array[Array[str]].
        [{'some_config': str}] is equivalent to Array(Shape({'some_config': str})).

    - An instance of Field.
dagster.Field(config, default_value=<class 'dagster.config.field_utils.__FieldValueSentinel'>, is_required=None, description=None)

    Defines the schema for a configuration field.

    Fields are used in config schema instead of bare types when one wants to add a description, a default value, or to mark it as not required.

    Config fields are parsed according to their schemas in order to yield values available at pipeline execution time through the config system. Config fields can be set on solids, on loaders and materializers for custom types, and on other pluggable components of the system, such as resources, loggers, and executors.

    Parameters:
        config (Any): The schema for the config. This value can be any of:
            - A Python primitive type that resolves to a Dagster config type (int, float, bool, str, or list).
            - A Dagster config type.
            - A bare python dictionary, which will be automatically wrapped in Shape. Values of the dictionary are resolved recursively according to the same rules.
            - A bare python list of length one which itself is a config type. Becomes Array with the list element as an argument.
        default_value (Any): A default value for this field, conformant to the schema set by the dagster_type argument. If a default value is provided, is_required should be False. Note: for config types that do post processing such as Enum, this value must be the pre-processed version, i.e. use ExampleEnum.VALUE.name instead of ExampleEnum.VALUE.
        is_required (bool): Whether the presence of this field is required. Defaults to true. If is_required is True, no default value should be provided.
        description (str): A human-readable description of this config field.

    Examples:

        @solid(
            config_schema={
                'word': Field(str, description='I am a word.'),
                'repeats': Field(Int, default_value=1, is_required=False),
            }
        )
        def repeat_word(context):
            return context.solid_config['word'] * context.solid_config['repeats']
dagster.Selector(fields, description=None)

    Define a config field requiring the user to select one option.

    Selectors are used when you want to be able to present several different options in config but allow only one to be selected. For example, a single input might be read in from either a csv file or a parquet file, but not both at once.

    Note that in some other type systems this might be called an 'input union'.

    Functionally, a selector is like a Dict, except that only one key from the dict can be specified in valid config.

    Examples:

        @solid(
            config_schema=Field(
                Selector(
                    {
                        'haw': {'whom': Field(String, default_value='honua', is_required=False)},
                        'cn': {'whom': Field(String, default_value='世界', is_required=False)},
                        'en': {'whom': Field(String, default_value='world', is_required=False)},
                    }
                ),
                is_required=False,
                default_value={'en': {'whom': 'world'}},
            )
        )
        def hello_world_with_default(context):
            if 'haw' in context.solid_config:
                return 'Aloha {whom}!'.format(whom=context.solid_config['haw']['whom'])
            if 'cn' in context.solid_config:
                return '你好，{whom}!'.format(whom=context.solid_config['cn']['whom'])
            if 'en' in context.solid_config:
                return 'Hello, {whom}!'.format(whom=context.solid_config['en']['whom'])
dagster.Permissive(fields=None, description=None)

    Defines a config dict with a partially specified schema.

    A permissive dict allows partial specification of the config schema. Any fields with a specified schema will be type checked. Other fields will be allowed, but will be ignored by the type checker.

    Examples:

        @solid(config_schema=Field(Permissive({'required': Field(String)})))
        def partially_specified_config(context) -> List:
            return sorted(list(context.solid_config.items()))
dagster.Shape(fields, description=None)

    Schema for configuration data with string keys and typed values via Field.

    Unlike Permissive, unspecified fields are not allowed and will throw a DagsterInvalidConfigError.
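A minimal sketch of using Shape explicitly (a bare dict config_schema would be wrapped in Shape automatically); the solid and field names here are hypothetical:

    from dagster import Field, Shape, solid

    @solid(
        config_schema=Shape(
            {
                "host": Field(str, description="Hostname to connect to."),
                "port": Field(int, default_value=5432, is_required=False),
            }
        )
    )
    def connect(context):
        # Any key not declared above fails config validation with DagsterInvalidConfigError.
        return (context.solid_config["host"], context.solid_config["port"])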
dagster.Array(inner_type)

    Defines an array (list) configuration type that contains values of type inner_type.

    Parameters:
        inner_type (type): The type of the values that this configuration type can contain.
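A minimal sketch of a solid configured with Array; the solid name and values are hypothetical:

    from dagster import Array, solid

    @solid(config_schema=Array(str))
    def log_paths(context):
        # Run config supplies a list of strings for this solid's config.
        for path in context.solid_config:
            context.log.info(path)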
dagster.Noneable(inner_type)

    Defines a configuration type that is the union of NoneType and the type inner_type.

    Parameters:
        inner_type (type): The type of the values that this configuration type can contain.

    Examples:

        config_schema={"name": Noneable(str)}

        config={"name": "Hello"}  # Ok
        config={"name": None}     # Ok
        config={}                 # Error
dagster.Enum(name, enum_values)

    Defines an enum configuration type that allows one of a defined set of possible values.

    Examples:

        @solid(
            config_schema=Field(
                Enum(
                    'CowboyType',
                    [
                        EnumValue('good'),
                        EnumValue('bad'),
                        EnumValue('ugly'),
                    ]
                )
            )
        )
        def resolve_standoff(context):
            # ...
dagster.EnumValue(config_value, python_value=None, description=None)

    Define an entry in an Enum.
dagster.ScalarUnion(scalar_type, non_scalar_schema, _key=None)

    Defines a configuration type that accepts a scalar value OR a non-scalar value like a List, Dict, or Selector.

    This allows runtime scalars to be configured without a dictionary with the key value and instead just use the scalar value directly. However this still leaves the option to load scalars from a json or pickle file.

    Parameters:
        scalar_type (type): The scalar type of values that this configuration type can hold. For example, int, float, bool, or str.
        non_scalar_schema (ConfigSchema): The schema of a non-scalar Dagster configuration type. For example, List, Dict, or Selector.
        key (Optional[str]): The configuration type's unique key. If not set, then the key will be set to ScalarUnion.{scalar_type}-{non_scalar_schema}.

    Examples:

        solids:
          transform_word:
            inputs:
              word:
                value: foobar

    becomes, optionally,

        solids:
          transform_word:
            inputs:
              word: foobar
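A hedged sketch of declaring a ScalarUnion schema in Python, loosely mirroring the YAML above; the solid and field names are hypothetical:

    from dagster import Field, ScalarUnion, solid

    @solid(config_schema=Field(ScalarUnion(scalar_type=str, non_scalar_schema={"value": str})))
    def transform_word(context):
        cfg = context.solid_config
        # cfg is either the bare scalar or the {"value": ...} dict form, depending on run config.
        return cfg if isinstance(cfg, str) else cfg["value"]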
dagster.StringSource

    Use this type when you want to read a string config value from an environment variable. The value passed to a config field of this type may either be a string literal, or a selector describing how to look up the value from the executing process's environment variables.

    Examples:

        @solid(config_schema=StringSource)
        def secret_solid(context) -> str:
            return context.solid_config

        execute_solid(
            secret_solid,
            run_config={
                'solids': {'secret_solid': {'config': 'test_value'}}
            }
        )

        execute_solid(
            secret_solid,
            run_config={
                'solids': {'secret_solid': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE'}}}
            }
        )
dagster.IntSource

    Use this type when you want to read an integer config value from an environment variable. The value passed to a config field of this type may either be an integer literal, or a selector describing how to look up the value from the executing process's environment variables.

    Examples:

        @solid(config_schema=IntSource)
        def secret_int_solid(context) -> int:
            return context.solid_config

        execute_solid(
            secret_int_solid,
            run_config={
                'solids': {'secret_int_solid': {'config': 3}}
            }
        )

        execute_solid(
            secret_int_solid,
            run_config={
                'solids': {'secret_int_solid': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_INT'}}}
            }
        )
@dagster.configured(configurable: dagster.core.definitions.configurable.ConfigurableDefinition, config_schema: Optional[Dict[str, Any]] = None, **kwargs: Any)

    A decorator that makes it easy to create a function-configured version of an object. Several definition types can be configured using this function.

    If the config that will be supplied to the object is constant, you may alternatively invoke this and call the result with a dict of config values to be curried. Examples of both strategies below.

    Parameters:
        configurable (ConfigurableDefinition): An object that can be configured.
        config_schema (ConfigSchema): The config schema that the inputs to the decorated function must satisfy.
        **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned object.

    Returns:
        (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])

    Examples:

        dev_s3 = configured(s3_resource, name="dev_s3")({'bucket': 'dev'})

        @configured(s3_resource)
        def dev_s3(_):
            return {'bucket': 'dev'}

        @configured(s3_resource, {'bucket_prefix': str})
        def dev_s3(config):
            return {'bucket': config['bucket_prefix'] + 'dev'}
dagster.experimental.DynamicOutputDefinition(dagster_type=None, name=None, description=None, is_required=None, io_manager_key=None, metadata=None)

    (Experimental) Variant of OutputDefinition for an output that will dynamically alter the graph at runtime. Each copy of DynamicOutput corresponding to this definition that is yielded from the solid will create a copy of the downstream graph.

dagster.experimental.DynamicOutput

    (Experimental) Variant of Output used to support mapping. Each DynamicOutput produced by a solid will result in the downstream dag being cloned to run on that individual value. Each DynamicOutput must have a unique mapping_key to distinguish it.
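For context, a minimal sketch of a solid that fans out with dynamic outputs; the solid name and values are hypothetical:

    from dagster import solid
    from dagster.experimental import DynamicOutput, DynamicOutputDefinition

    @solid(output_defs=[DynamicOutputDefinition(str)])
    def fan_out_paths(_):
        # Each DynamicOutput clones the downstream graph for its value;
        # mapping_key must be unique per yielded output.
        for idx, path in enumerate(["a.csv", "b.csv", "c.csv"]):
            yield DynamicOutput(value=path, mapping_key="file_{}".format(idx))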
Core Dagster error classes.

All errors thrown by the Dagster framework inherit from DagsterError. Users should not subclass this base class for their own exceptions.

There is another exception base class, DagsterUserCodeExecutionError, which is used by the framework in concert with the user_code_error_boundary().

Dagster uses this construct to wrap the user code that it calls into. User code can perform arbitrary computations and may itself throw exceptions. The error boundary catches these user code-generated exceptions, and then reraises them wrapped in a subclass of DagsterUserCodeExecutionError.

The wrapped exceptions include additional context for the original exceptions, injected by the Dagster runtime.
dagster.DagsterError

    Base class for all errors thrown by the Dagster framework.

    Users should not subclass this base class for their own exceptions.

dagster.DagsterConfigMappingFunctionError(*args, **kwargs)

    Indicates that an unexpected error occurred while executing the body of a config mapping function defined in a CompositeSolidDefinition during config parsing.

dagster.DagsterEventLogInvalidForRun(run_id)

    Raised when the event logs for a historical run are malformed or invalid.

dagster.DagsterExecutionStepExecutionError(*args, **kwargs)

    Indicates an error occurred while executing the body of an execution step.

dagster.DagsterExecutionStepNotFoundError(*args, **kwargs)

    Thrown when the user specifies execution step keys that do not exist.

dagster.DagsterInvalidConfigError(preamble, errors, config_value, *args, **kwargs)

    Thrown when provided config is invalid (does not type check against the relevant config schema).

dagster.DagsterInvalidConfigDefinitionError(original_root, current_value, stack, reason=None, **kwargs)

    Indicates that you have attempted to construct a config with an invalid value. Acceptable values include:

    - A bare python dictionary, which will be automatically wrapped in Shape. Values of the dictionary are resolved recursively according to the same rules.
    - A bare python list of length one, which becomes Array with the list element as an argument.
    - An instance of Field.

dagster.DagsterInvalidDefinitionError

    Indicates that the rules for a definition have been violated by the user.

dagster.DagsterInvariantViolationError

    Indicates the user has violated a well-defined invariant that can only be enforced at runtime.

dagster.DagsterResourceFunctionError(*args, **kwargs)

    Indicates an error occurred while executing the body of the resource_fn in a ResourceDefinition during resource initialization.

dagster.DagsterRunNotFoundError(*args, **kwargs)

    Thrown when a run cannot be found in run storage.

dagster.DagsterStepOutputNotFoundError(*args, **kwargs)

    Indicates that previous step outputs required for an execution step to proceed are not available.

dagster.DagsterSubprocessError(*args, **kwargs)

    An exception has occurred in one or more of the child processes dagster manages. This error forwards the message and stack trace for all of the collected errors.

dagster.DagsterTypeCheckDidNotPass(description=None, metadata_entries=None, dagster_type=None)

    Indicates that a type check failed.

    This is raised when raise_on_error is True in calls to the synchronous pipeline and solid execution APIs (execute_pipeline(), execute_solid(), etc.), that is, typically in test, and a DagsterType's type check fails by returning either False or an instance of TypeCheck whose success member is False.
dagster.
DagsterTypeCheckError
(*args, **kwargs)[source]\u00b6Indicates an error in the solid type system at runtime. E.g. a solid receives an\nunexpected input, or produces an output that does not match the type of the output definition.
\ndagster.
DagsterUnknownResourceError
(resource_name, *args, **kwargs)[source]\u00b6Indicates that an unknown resource was accessed in the body of an execution step. This often happens when a resource is accessed in the compute function of a solid without first supplying the solid with the correct required_resource_keys argument.
\ndagster.
DagsterUnmetExecutorRequirementsError
[source]\u00b6Indicates the resolved executor is incompatible with the state of other systems\nsuch as the DagsterInstance
or system storage configuration.
dagster.
DagsterUserCodeExecutionError
(*args, **kwargs)[source]\u00b6This is the base class for any exception that is meant to wrap an\nException
thrown by user code. It wraps the exception raised by that user code.\nThe original_exc_info
argument to the constructor is meant to be a tuple of the type\nreturned by sys.exc_info
at the call site of the constructor.
Users should not subclass this base class for their own exceptions and should instead throw\nfreely from user code. User exceptions will be automatically wrapped and rethrown.
\ndagster.
execute_pipeline
(pipeline: Union[dagster.core.definitions.pipeline.PipelineDefinition, dagster.core.definitions.pipeline_base.IPipeline], run_config: Optional[dict] = None, mode: Optional[str] = None, preset: Optional[str] = None, tags: Optional[Dict[str, Any]] = None, solid_selection: Optional[List[str]] = None, instance: Optional[dagster.core.instance.DagsterInstance] = None, raise_on_error: bool = True) → dagster.core.execution.results.PipelineExecutionResult[source]\u00b6Execute a pipeline synchronously.
\nUsers will typically call this API when testing pipeline execution, or running standalone\nscripts.
\npipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.
run_config (Optional[dict]) \u2013 The environment configuration that parametrizes this run,\nas a dict.
mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode
\nand preset
.
preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode
and preset
.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None
,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.
raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True
, since this is the most useful behavior in test.
solid_selection (Optional[List[str]]) \u2013
A list of solid selection queries (including single solid names) to execute. For example:
- ['some_solid']: select "some_solid" itself.
- ['*some_solid']: select "some_solid" and all its ancestors (upstream dependencies).
- ['*some_solid+++']: select "some_solid", all its ancestors, and its descendants (downstream dependencies) within 3 levels down.
- ['*some_solid', 'other_solid_a', 'other_solid_b+']: select "some_solid" and all its ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child solids.
\nThe result of pipeline execution.
\nFor the asynchronous version, see execute_pipeline_iterator()
.
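As a rough sketch of typical usage (the solid and pipeline names below are invented for illustration), execute_pipeline() is called directly on a pipeline definition and returns a PipelineExecutionResult:

from dagster import execute_pipeline, pipeline, solid

@solid
def return_one(_):
    return 1

@solid
def add_one(_, num):
    return num + 1

@pipeline
def my_pipeline():
    add_one(return_one())

# Synchronous execution; raises on failure because raise_on_error defaults to True.
result = execute_pipeline(my_pipeline)
assert result.success
assert result.result_for_solid("add_one").output_value() == 2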
dagster.
execute_pipeline_iterator
(pipeline: Union[dagster.core.definitions.pipeline.PipelineDefinition, dagster.core.definitions.pipeline_base.IPipeline], run_config: Optional[dict] = None, mode: Optional[str] = None, preset: Optional[str] = None, tags: Optional[Dict[str, Any]] = None, solid_selection: Optional[List[str]] = None, instance: Optional[dagster.core.instance.DagsterInstance] = None) → Iterator[dagster.core.events.DagsterEvent][source]\u00b6Execute a pipeline iteratively.
\nRather than package up the result of running a pipeline into a single object, like\nexecute_pipeline()
, this function yields the stream of events resulting from pipeline\nexecution.
This is intended to allow the caller to handle these events on a streaming basis in whatever\nway is appropriate.
\npipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.
run_config (Optional[dict]) \u2013 The environment configuration that parametrizes this run,\nas a dict.
mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode
\nand preset
.
preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode
and preset
.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
solid_selection (Optional[List[str]]) \u2013
A list of solid selection queries (including single solid names) to execute. For example:
- ['some_solid']: select "some_solid" itself.
- ['*some_solid']: select "some_solid" and all its ancestors (upstream dependencies).
- ['*some_solid+++']: select "some_solid", all its ancestors, and its descendants (downstream dependencies) within 3 levels down.
- ['*some_solid', 'other_solid_a', 'other_solid_b+']: select "some_solid" and all its ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child solids.
\ninstance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None
,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.
The stream of events resulting from pipeline execution.
\nIterator[DagsterEvent]
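A minimal sketch of streaming consumption, reusing the hypothetical my_pipeline from the sketch above:

from dagster import DagsterEventType, execute_pipeline_iterator

# Handle events as they are emitted rather than waiting for a result object.
for event in execute_pipeline_iterator(my_pipeline):
    if event.event_type == DagsterEventType.STEP_FAILURE:
        print("step failed:", event.step_key)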
\ndagster.
reexecute_pipeline
(pipeline: Union[dagster.core.definitions.pipeline_base.IPipeline, dagster.core.definitions.pipeline.PipelineDefinition], parent_run_id: str, run_config: Optional[dict] = None, step_selection: Optional[List[str]] = None, mode: Optional[str] = None, preset: Optional[str] = None, tags: Optional[Dict[str, Any]] = None, instance: dagster.core.instance.DagsterInstance = None, raise_on_error: bool = True) → dagster.core.execution.results.PipelineExecutionResult[source]\u00b6Reexecute an existing pipeline run.
\nUsers will typically call this API when testing pipeline reexecution, or running standalone\nscripts.
\npipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.
parent_run_id (str) \u2013 The id of the previous run to reexecute. The run must exist in the\ninstance.
run_config (Optional[dict]) \u2013 The environment configuration that parametrizes this run,\nas a dict.
step_selection (Optional[List[str]]) \u2013
A list of step selection queries (including single step keys) to execute. For example:
- ['some_solid']: select the execution step "some_solid" itself.
- ['*some_solid']: select the step "some_solid" and all its ancestors (upstream dependencies).
- ['*some_solid+++']: select the step "some_solid", all its ancestors, and its descendants (downstream dependencies) within 3 levels down.
- ['*some_solid', 'other_solid_a', 'other_solid_b+']: select "some_solid" and all its ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child execution steps.
\nmode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode
\nand preset
.
preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode
and preset
.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None
,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.
raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True
, since this is the most useful behavior in test.
The result of pipeline execution.
\nFor the asynchronous version, see reexecute_pipeline_iterator()
.
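A minimal sketch, again using the hypothetical my_pipeline, that executes a run against an ephemeral instance and then reexecutes it in full (restricting reexecution with step_selection additionally requires a persistent intermediate storage so the parent run's outputs can be loaded):

from dagster import DagsterInstance, execute_pipeline, reexecute_pipeline

instance = DagsterInstance.ephemeral()

parent_result = execute_pipeline(my_pipeline, instance=instance)

# Reexecute the whole pipeline; the parent run must exist in the same instance.
result = reexecute_pipeline(
    my_pipeline,
    parent_run_id=parent_result.run_id,
    instance=instance,
)
assert result.success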
dagster.
reexecute_pipeline_iterator
(pipeline: Union[dagster.core.definitions.pipeline_base.IPipeline, dagster.core.definitions.pipeline.PipelineDefinition], parent_run_id: str, run_config: Optional[dict] = None, step_selection: Optional[List[str]] = None, mode: Optional[str] = None, preset: Optional[str] = None, tags: Optional[Dict[str, Any]] = None, instance: dagster.core.instance.DagsterInstance = None) → Iterator[dagster.core.events.DagsterEvent][source]\u00b6Reexecute a pipeline iteratively.
\nRather than package up the result of running a pipeline into a single object, like\nreexecute_pipeline()
, this function yields the stream of events resulting from pipeline\nreexecution.
This is intended to allow the caller to handle these events on a streaming basis in whatever\nway is appropriate.
\npipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.
parent_run_id (str) \u2013 The id of the previous run to reexecute. The run must exist in the\ninstance.
run_config (Optional[dict]) \u2013 The environment configuration that parametrizes this run,\nas a dict.
step_selection (Optional[List[str]]) \u2013
A list of step selection queries (including single step keys) to execute. For example:
- ['some_solid']: select the execution step "some_solid" itself.
- ['*some_solid']: select the step "some_solid" and all its ancestors (upstream dependencies).
- ['*some_solid+++']: select the step "some_solid", all its ancestors, and its descendants (downstream dependencies) within 3 levels down.
- ['*some_solid', 'other_solid_a', 'other_solid_b+']: select "some_solid" and all its ancestors, "other_solid_a" itself, and "other_solid_b" and its direct child execution steps.
\nmode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode
\nand preset
.
preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode
and preset
.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None
,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.
The stream of events resulting from pipeline reexecution.
\nIterator[DagsterEvent]
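The iterator variant follows the same shape but yields events; a sketch under the same assumptions as above:

from dagster import DagsterInstance, execute_pipeline, reexecute_pipeline_iterator

instance = DagsterInstance.ephemeral()
parent_result = execute_pipeline(my_pipeline, instance=instance)

# Print the event type of each event yielded by the reexecution.
for event in reexecute_pipeline_iterator(
    my_pipeline, parent_run_id=parent_result.run_id, instance=instance
):
    print(event.event_type_value)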
\ndagster.
execute_solid
(solid_def, mode_def=None, input_values=None, tags=None, run_config=None, raise_on_error=True)[source]\u00b6Execute a single solid in an ephemeral pipeline.
\nIntended to support unit tests. Input values may be passed directly, and no pipeline need be\nspecified \u2013 an ephemeral pipeline will be constructed.
\nsolid_def (SolidDefinition) \u2013 The solid to execute.
mode_def (Optional[ModeDefinition]) \u2013 The mode within which to execute the solid. Use this\nif, e.g., custom resources, loggers, or executors are desired.
input_values (Optional[Dict[str, Any]]) \u2013 A dict of input names to input values, used to\npass inputs to the solid directly. You may also use the run_config
to\nconfigure any inputs that are configurable.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
run_config (Optional[dict]) \u2013 The environment configuration that parameterized this\nexecution, as a dict.
raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True
, since this is the most useful behavior in test.
The result of executing the\nsolid.
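A minimal test-style sketch (the solid shown is invented for the example):

from dagster import execute_solid, solid

@solid
def add(_, x, y):
    return x + y

# Executes "add" in an ephemeral single-solid pipeline, passing inputs directly.
result = execute_solid(add, input_values={"x": 1, "y": 2})
assert result.success
assert result.output_value() == 3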
\ndagster.
execute_solid_within_pipeline
(pipeline_def, solid_name, inputs=None, run_config=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6Execute a single solid within an existing pipeline.
\nIntended to support tests. Input values may be passed directly.
\npipeline_def (PipelineDefinition) \u2013 The pipeline within which to execute the solid.
solid_name (str) \u2013 The name of the solid, or the aliased solid, to execute.
inputs (Optional[Dict[str, Any]]) \u2013 A dict of input names to input values, used to\npass input values to the solid directly. You may also use the run_config
to\nconfigure any inputs that are configurable.
run_config (Optional[dict]) \u2013 The environment configuration that parameterized this\nexecution, as a dict.
mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode
\nand preset
.
preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode
and preset
.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None
,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.
The result of executing the\nsolid.
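A sketch reusing the hypothetical my_pipeline and add_one solid from the execute_pipeline() example above:

from dagster import execute_solid_within_pipeline

# Execute only "add_one" from my_pipeline, supplying its input directly.
result = execute_solid_within_pipeline(my_pipeline, "add_one", inputs={"num": 5})
assert result.success
assert result.output_value() == 6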
\ndagster.
execute_solids_within_pipeline
(pipeline_def, solid_names, inputs=None, run_config=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6Execute a set of solids within an existing pipeline.
\nIntended to support tests. Input values may be passed directly.
\npipeline_def (PipelineDefinition) \u2013 The pipeline within which to execute the solid.
solid_names (FrozenSet[str]) \u2013 A set of the solid names, or the aliased solids, to execute.
inputs (Optional[Dict[str, Dict[str, Any]]]) \u2013 A dict keyed on solid names, whose values are\ndicts of input names to input values, used to pass input values to the solids directly.\nYou may also use the run_config
to configure any inputs that are configurable.
run_config (Optional[dict]) \u2013 The environment configuration that parameterized this\nexecution, as a dict.
mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode
\nand preset
.
preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode
and preset
.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.
instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None
,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.
The results of\nexecuting the solids, keyed by solid name.
\nDict[str, Union[CompositeSolidExecutionResult, SolidExecutionResult]]
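A sketch under the same assumptions; the returned dict is keyed by solid name:

from dagster import execute_solids_within_pipeline

# Execute the selected subset; dependencies between the selected solids are honored.
results = execute_solids_within_pipeline(
    my_pipeline, solid_names={"return_one", "add_one"}
)
assert results["return_one"].output_value() == 1
assert results["add_one"].output_value() == 2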
\ndagster.core.execution.context.compute.
SolidExecutionContext
(system_compute_execution_context: dagster.core.execution.context.system.SystemComputeExecutionContext)[source]\u00b6The context
object available to solid compute logic.
instance
\u00b6The current Instance
\npdb
\u00b6Allows pdb debugging from within the solid.
\nExample:
\n@solid\ndef debug_solid(context):\n context.pdb.set_trace()\n
pipeline_run
\u00b6The current PipelineRun
\nsolid_config
\u00b6The parsed config specific to this solid.
\ndagster.core.execution.context.compute.
AbstractComputeExecutionContext
[source]\u00b6Base class for solid context implemented by SolidExecutionContext and DagstermillExecutionContext
\n\n\nhas_tag
(key) → bool[source]\u00b6Implement this method to check if a logging tag is set.
\nlog
\u00b6The log manager available in the execution context.
\npipeline_def
\u00b6The pipeline being executed.
\npipeline_run
\u00b6The PipelineRun object corresponding to the execution.
\nresources
\u00b6Resources available in the execution context.
\nrun_id
\u00b6The run id for the context.
\nsolid
\u00b6The solid corresponding to the execution step being executed.
\nsolid_config
\u00b6The parsed config specific to this solid.
\nsolid_def
\u00b6The solid definition corresponding to the execution step being executed.
\ndagster.
reconstructable
[source]\u00b6Create a ReconstructablePipeline from a function that returns a PipelineDefinition, or a\nfunction decorated with @pipeline
When your pipeline must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline\non the other side of the process boundary.
\nThis function implements a very conservative strategy for reconstructing pipelines, so that\nits behavior is easy to predict, but as a consequence it is not able to reconstruct certain\nkinds of pipelines, such as those defined by lambdas, in nested scopes (e.g., dynamically\nwithin a method call), or in interactive environments such as the Python REPL or Jupyter\nnotebooks.
\nIf you need to reconstruct pipelines constructed in these ways, you should use\nbuild_reconstructable_pipeline()
instead, which allows you to specify your own\nstrategy for reconstructing a pipeline.
Examples:
\nfrom dagster import PipelineDefinition, pipeline, reconstructable\n\n@pipeline\ndef foo_pipeline():\n ...\n\nreconstructable_foo_pipeline = reconstructable(foo_pipeline)\n\n\ndef make_bar_pipeline():\n return PipelineDefinition(...)\n\nreconstructable_bar_pipeline = reconstructable(bar_pipeline)\n
dagster.
PipelineExecutionResult
(pipeline_def, run_id, event_list, reconstruct_context, resource_instances_to_override=None)[source]\u00b6The result of executing a pipeline.
\nReturned by execute_pipeline()
. Users should not instantiate this class.
dagster.
SolidExecutionResult
(solid, step_events_by_kind, reconstruct_context, resource_instances_to_override=None)[source]\u00b6Execution result for a leaf solid in a pipeline.
\nUsers should not instantiate this class.
\ncompute_input_event_dict
\u00b6All events of type STEP_INPUT
, keyed by input name.
Dict[str, DagsterEvent]
\ncompute_output_events_dict
\u00b6All events of type STEP_OUTPUT
, keyed by output name
Dict[str, List[DagsterEvent]]
\ncompute_step_events
\u00b6All events generated by execution of the solid compute function.
\nList[DagsterEvent]
\ncompute_step_failure_event
\u00b6The STEP_FAILURE
event, throws if it did not fail.
expectation_events_during_compute
\u00b6All events of type STEP_EXPECTATION_RESULT
.
List[DagsterEvent]
\nexpectation_results_during_compute
\u00b6All expectation results yielded by the solid
\nList[ExpectationResult]
\nfailure_data
\u00b6Any data corresponding to this step\u2019s failure, if it\nfailed.
\nUnion[None, StepFailureData]
\nget_output_event_for_compute
(output_name='result')[source]\u00b6The STEP_OUTPUT
event for the given output name.
Throws if not present.
\noutput_name (Optional[str]) \u2013 The name of the output. (default: \u2018result\u2019)
\nThe corresponding event.
\nget_output_events_for_compute
(output_name='result')[source]\u00b6The STEP_OUTPUT
events for the given output name.
Throws if not present.
\noutput_name (Optional[str]) \u2013 The name of the output. (default: \u2018result\u2019)
\nThe corresponding events.
\nList[DagsterEvent]
\nget_step_success_event
()[source]\u00b6DagsterEvent: The STEP_SUCCESS
event, throws if not present.
input_events_during_compute
\u00b6All events of type STEP_INPUT
.
List[DagsterEvent]
\nmaterialization_events_during_compute
\u00b6All events of type STEP_MATERIALIZATION
.
List[DagsterEvent]
\nmaterializations_during_compute
\u00b6All materializations yielded by the solid.
\nList[Materialization]
\noutput_events_during_compute
\u00b6All events of type STEP_OUTPUT
.
List[DagsterEvent]
\noutput_value
(output_name='result')[source]\u00b6Get a computed output value.
\nNote that calling this method will reconstruct the pipeline context (including, e.g.,\nresources) to retrieve materialized output values.
\noutput_name (str) \u2013 The output name for which to retrieve the value. (default: \u2018result\u2019)
\nNone
if execution did not succeed, the output value in the normal case, and a dict of mapping keys to values in the mapped case.
\noutput_values
\u00b6The computed output values.
\nReturns None if execution did not succeed, the output values in the normal case, or a dictionary from mapping key to corresponding value in the mapped case.
Note that accessing this property will reconstruct the pipeline context (including, e.g.,\nresources) to retrieve materialized output values.
\n\ndagster.
CompositeSolidExecutionResult
(solid, event_list, step_events_by_kind, reconstruct_context, handle=None, resource_instances_to_override=None)[source]\u00b6Execution result for a composite solid in a pipeline.
\nUsers should not instantiate this class.
\ndagster.
DagsterEvent
[source]\u00b6Events yielded by solid and pipeline execution.
\nUsers should not instantiate this class.
\n\n\n\n\n\n\nsolid_handle
\u00b6SolidHandle
\nevent_specific_data
\u00b6Type must correspond to event_type_value.
\nAny
\nstep_key
DEPRECATED
\nOptional[str]
\nevent_type
\u00b6The type of this event.
\ndagster.
DagsterEventType
[source]\u00b6The types of events that may be yielded by solid and pipeline execution.
\nASSET_STORE_OPERATION
= 'ASSET_STORE_OPERATION'\u00b6ENGINE_EVENT
= 'ENGINE_EVENT'\u00b6HANDLED_OUTPUT
= 'HANDLED_OUTPUT'\u00b6HOOK_COMPLETED
= 'HOOK_COMPLETED'\u00b6HOOK_ERRORED
= 'HOOK_ERRORED'\u00b6HOOK_SKIPPED
= 'HOOK_SKIPPED'\u00b6LOADED_INPUT
= 'LOADED_INPUT'\u00b6OBJECT_STORE_OPERATION
= 'OBJECT_STORE_OPERATION'\u00b6PIPELINE_CANCELED
= 'PIPELINE_CANCELED'\u00b6PIPELINE_CANCELING
= 'PIPELINE_CANCELING'\u00b6PIPELINE_DEQUEUED
= 'PIPELINE_DEQUEUED'\u00b6PIPELINE_ENQUEUED
= 'PIPELINE_ENQUEUED'\u00b6PIPELINE_FAILURE
= 'PIPELINE_FAILURE'\u00b6PIPELINE_INIT_FAILURE
= 'PIPELINE_INIT_FAILURE'\u00b6PIPELINE_START
= 'PIPELINE_START'\u00b6PIPELINE_STARTING
= 'PIPELINE_STARTING'\u00b6PIPELINE_SUCCESS
= 'PIPELINE_SUCCESS'\u00b6STEP_EXPECTATION_RESULT
= 'STEP_EXPECTATION_RESULT'\u00b6STEP_FAILURE
= 'STEP_FAILURE'\u00b6STEP_INPUT
= 'STEP_INPUT'\u00b6STEP_MATERIALIZATION
= 'STEP_MATERIALIZATION'\u00b6STEP_OUTPUT
= 'STEP_OUTPUT'\u00b6STEP_RESTARTED
= 'STEP_RESTARTED'\u00b6STEP_SKIPPED
= 'STEP_SKIPPED'\u00b6STEP_START
= 'STEP_START'\u00b6STEP_SUCCESS
= 'STEP_SUCCESS'\u00b6STEP_UP_FOR_RETRY
= 'STEP_UP_FOR_RETRY'\u00b6\n\nThe
\nrun_config
used byexecute_pipeline()
and\nexecute_pipeline_iterator()
has the following schema:\n\n{\n # configuration for execution, required if executors require config\n execution: {\n # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n __executor_name__: {\n # executor-specific config, if required or permitted\n config: {\n ...\n }\n }\n },\n\n # configuration for loggers, required if loggers require config\n loggers: {\n # the name of an available logger\n __logger_name__: {\n # logger-specific config, if required or permitted\n config: {\n ...\n }\n },\n ...\n },\n\n # configuration for resources, required if resources require config\n resources: {\n # the name of a resource\n __resource_name__: {\n # resource-specific config, if required or permitted\n config: {\n ...\n }\n },\n ...\n },\n\n # configuration for solids, required if solids require config\n solids: {\n\n # these keys align with the names of the solids, or their alias in this pipeline\n __solid_name__: {\n\n # pass any data that was defined via config_field\n config: ...,\n\n # configurably specify input values, keyed by input name\n inputs: {\n __input_name__: {\n # if an dagster_type_loader is specified, that schema must be satisfied here;\n # scalar, built-in types will generally allow their values to be specified directly:\n value: ...\n }\n },\n\n # configurably materialize output values\n outputs: {\n __output_name__: {\n # if an dagster_type_materializer is specified, that schema must be satisfied\n # here; pickleable types will generally allow output as follows:\n pickle: {\n path: String\n }\n }\n }\n }\n },\n\n # optionally use an available system storage for intermediates etc.\n intermediate_storage: {\n # the name of one, and only one available system storage, typically 'filesystem' or\n # 'in_memory'\n __storage_name__: {\n config: {\n ...\n }\n }\n }\n}\n
dagster.
io_manager_from_intermediate_storage
(intermediate_storage_def)[source]\u00b6Define an IOManagerDefinition
from an existing IntermediateStorageDefinition
.
This method is used to adapt an existing user-defined intermediate storage to an IO manager resource, for example:
\nmy_io_manager_def = io_manager_from_intermediate_storage(my_intermediate_storage_def)\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": my_io_manager_def})])\ndef my_pipeline():\n ...\n
intermediate_storage_def (IntermediateStorageDefinition) \u2013 The intermediate storage definition\nto be converted to an IO manager definition.
\nIOManagerDefinition
\ndagster.
mem_intermediate_storage
IntermediateStorageDefinition[source]\u00b6The default in-memory intermediate storage.
\nIn-memory intermediate storage is the default on any pipeline run that does\nnot configure any custom intermediate storage.
\nKeep in mind when using this storage that intermediates will not be persisted after the pipeline\nrun ends. Use a persistent intermediate storage like fs_intermediate_storage()
to\npersist intermediates and take advantage of advanced features like pipeline re-execution.
dagster.
fs_intermediate_storage
IntermediateStorageDefinition[source]\u00b6The default filesystem intermediate storage.
\nFilesystem system storage is available by default on any ModeDefinition
that does\nnot provide custom system storages. To select it, include a fragment such as the following in\nconfig:
intermediate_storage:\n filesystem:\n base_dir: '/path/to/dir/'\n
You may omit the base_dir
config value, in which case the filesystem storage will use\nthe DagsterInstance
-provided default.
dagster.
default_intermediate_storage_defs
List[IntermediateStorageDefinition]\u00b6The default intermediate storages available on any ModeDefinition
that does not provide\ncustom intermediate storages. These are currently [mem_intermediate_storage
,\nfs_intermediate_storage
].
dagster.
in_process_executor
ExecutorDefinition[source]\u00b6The default in-process executor.
\nIn most Dagster environments, this will be the default executor. It is available by default on\nany ModeDefinition
that does not provide custom executors. To select it explicitly,\ninclude the following top-level fragment in config:
execution:\n in_process:\n
Execution priority can be configured using the dagster/priority
tag via solid metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.
dagster.
multiprocess_executor
ExecutorDefinition[source]\u00b6The default multiprocess executor.
\nThis simple multiprocess executor is available by default on any ModeDefinition
\nthat does not provide custom executors. To select the multiprocess executor, include a fragment\nsuch as the following in your config:
execution:\n multiprocess:\n config:\n max_concurrent: 4\n
The max_concurrent
arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent
to be 0, this is the return value of\nmultiprocessing.cpu_count()
.
Execution priority can be configured using the dagster/priority
tag via solid metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.
dagster.
default_executors
List[ExecutorDefinition]\u00b6The default executors available on any ModeDefinition
that does not provide custom\nexecutors. These are currently [in_process_executor
,\nmultiprocess_executor
].
dagster.
SystemComputeExecutionContext
(execution_context_data: dagster.core.execution.context.system.SystemExecutionContextData, log_manager: dagster.core.log_manager.DagsterLogManager, step: dagster.core.execution.plan.step.ExecutionStep)[source]\u00b6dagster.
TypeCheckContext
(execution_context_data: dagster.core.execution.context.system.SystemExecutionContextData, log_manager: dagster.core.log_manager.DagsterLogManager, dagster_type: dagster.core.types.dagster_type.DagsterType)[source]\u00b6The context
object available to a type check function on a DagsterType.
log
\u00b6Centralized log dispatch from user code.
\nresources
\u00b6An object whose attributes contain the resources available to this solid.
\nAny
\ndagster.
HookContext
(execution_context_data: dagster.core.execution.context.system.SystemExecutionContextData, log_manager: dagster.core.log_manager.DagsterLogManager, hook_def: dagster.core.definitions.hook.HookDefinition, step: dagster.core.execution.plan.step.ExecutionStep)[source]\u00b6The context
object available to a hook function on a DagsterEvent.
log
\u00b6Centralized log dispatch from user code.
\nhook_def
\u00b6The hook that the context object belongs to.
\nHookDefinition
\nstep
\u00b6The compute step associated with the hook.
\nExecutionStep
\nsolid
\u00b6The solid instance associated with the hook.
\nSolid
\nresources
\u00b6Resources available in the hook context.
\nNamedTuple
\nsolid_config
\u00b6The parsed config specific to this solid.
\nAny
\n@
dagster.
success_hook
(name=None, required_resource_keys=None)[source]\u00b6Create a hook on step success events with the specified parameters from the decorated function.
\nExamples
\n@success_hook(required_resource_keys={'slack'})\ndef slack_on_success(context):\n message = 'solid {} succeeded'.format(context.solid.name)\n context.resources.slack.send_message(message)\n\n@success_hook\ndef do_something_on_success(context):\n do_something()\n
@
dagster.
failure_hook
(name=None, required_resource_keys=None)[source]\u00b6Create a hook on step failure events with the specified parameters from the decorated function.
\nExamples
\n@failure_hook(required_resource_keys={'slack'})\ndef slack_on_failure(context):\n message = 'solid {} failed'.format(context.solid.name)\n context.resources.slack.send_message(message)\n\n@failure_hook\ndef do_something_on_failure(context):\n do_something()\n
Please note that internal APIs are likely to be in much greater flux pre-1.0 than user-facing APIs,\nparticularly if not exported in the top level dagster
module.
If you find yourself consulting these docs because you are writing custom components and plug-ins,\nplease get in touch with the core team on our Slack.\nWe\u2019re curious what you\u2019re up to, happy to help, excited for new community contributions, and eager\nto make the system as easy to work with as possible \u2013 including for teams who are looking to\ncustomize it.
\ndagster.
DagsterLogManager
[source]\u00b6Centralized dispatch for logging from user code.
\nHandles the construction of uniform structured log messages and passes them through to the\nunderlying loggers.
\nAn instance of the log manager is made available to solids as context.log
. Users should not\ninitialize instances of the log manager directly. To configure custom loggers, set the\nlogger_defs
on a ModeDefinition
for a pipeline.
The log manager supports standard convenience methods like those exposed by the Python standard\nlibrary logging
module (i.e., within the body of a solid,\ncontext.log.{debug, info, warning, warn, error, critical, fatal}
).
The underlying integer API can also be called directly using, e.g.\ncontext.log.log(5, msg)
, and the log manager will delegate to the log
method\ndefined on each of the loggers it manages.
User-defined custom log levels are not supported, and calls to, e.g.,\ncontext.log.trace
or context.log.notice
will result in hard exceptions at runtime.
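For illustration (the solid below is invented for the example), logging from within a solid body looks like:

from dagster import solid

@solid
def log_things(context):
    # Convenience methods mirror the stdlib logging module.
    context.log.info("starting work")
    # Extra key-value pairs for a single message can be passed as kwargs.
    context.log.debug("loaded rows", row_count=42)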
debug
(msg, **kwargs)[source]\u00b6Log at the logging.DEBUG
level.
The message will be automatically adorned with contextual information about the name\nof the pipeline, the name of the solid, etc., so it is generally unnecessary to include\nthis type of information in the log message.
\nYou can optionally add additional key-value pairs to an individual log message using the kwargs to this method.
\nmsg (str) \u2013 The message to log.
**kwargs (Optional[Any]) \u2013 Any additional key-value pairs for only this log message.
fatal
(msg, **kwargs)\u00b6Alias for critical()
log
(level, msg, **kwargs)[source]\u00b6Invoke the underlying loggers for a given integer log level.
\n\nAdd new tags in "new_tags" to the set of tags attached to this log manager instance, and return a new DagsterLogManager with the merged set of tags, bound to the same run ID and loggers.
\n@
dagster.
executor
(name=None, config_schema=None)[source]\u00b6Define an executor.
\nThe decorated function should accept an InitExecutorContext
and return an instance\nof Executor
.
name (Optional[str]) \u2013 The name of the executor.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.executor_config.
dagster.
ExecutorDefinition
(name, config_schema=None, executor_creation_fn=None, description=None)[source]\u00b6name (Optional[str]) \u2013 The name of the executor.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.executor_config.
executor_creation_fn (Optional[Callable]) \u2013 Should accept an InitExecutorContext
\nand return an instance of Executor
required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the\nexecutor.
dagster.
InitExecutorContext
[source]\u00b6Executor-specific initialization context.
\npipeline
\u00b6The pipeline to be executed.
\nIPipeline
\nmode_def
\u00b6The mode in which the pipeline is to be executed.
\nexecutor_def
\u00b6The definition of the executor currently being\nconstructed.
\npipeline_run
\u00b6Configuration for this pipeline run.
\nenvironment_config
\u00b6The parsed environment configuration for this\npipeline run.
\nEnvironmentConfig
\nintermediate_storage_def
\u00b6The intermediate storage definition.
\nOptional[IntermediateStorageDefinition]
\ninstance
\u00b6The current instance.
\ndagster.
Executor
[source]\u00b6execute
(pipeline_context, execution_plan)[source]\u00b6For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.
\npipeline_context (SystemPipelineExecutionContext) \u2013 The pipeline execution context.
execution_plan (ExecutionPlan) \u2013 The plan to execute.
A stream of dagster events.
\nretries
\u00b6The Retries state / policy for this instance of the Executor. Executors should allow this to be\ncontrolled via configuration if possible.
\nReturns: Retries
\ndagster.core.storage.file_manager.
FileManager
[source]\u00b6Base class for all file managers in dagster.
\nThe file manager is an interface that can be implemented by resources to provide abstract\naccess to a file system such as local disk, S3, or other cloud storage.
\nFor examples of usage, see the documentation of the concrete file manager implementations.
\nIn 0.10.x, this abstraction will be deprecated in favor of the IOManager
.
copy_handle_to_local_temp
(file_handle: dagster.core.storage.file_manager.FileHandle) → str[source]\u00b6Copy a file represented by a file handle to a temp file.
\nIn an implementation built around an object store such as S3, this method would be expected\nto download the file from S3 to local filesystem in a location assigned by the standard\nlibrary\u2019s tempfile
module.
Temp files returned by this method are not guaranteed to be reusable across solid\nboundaries. For files that must be available across solid boundaries, use the\nread()
,\nread_data()
,\nwrite()
, and\nwrite_data()
methods.
file_handle (FileHandle) \u2013 The handle to the file to make available as a local temp file.
\nPath to the local temp file.
\ndelete_local_temp
()[source]\u00b6Delete all local temporary files created by previous calls to\ncopy_handle_to_local_temp()
.
Should typically only be called by framework implementors.
\nread
(file_handle: dagster.core.storage.file_manager.FileHandle, mode: str = 'rb') → Union[TextIO, BinaryIO][source]\u00b6Return a file-like stream for the file handle.
\nThis may incur an expensive network call for file managers backed by object stores\nsuch as S3.
\nfile_handle (FileHandle) \u2013 The file handle to make available as a stream.
mode (str) \u2013 The mode in which to open the file. Default: "rb"
.
A file-like stream.
\nUnion[TextIO, BinaryIO]
\nread_data
(file_handle: dagster.core.storage.file_manager.FileHandle) → bytes[source]\u00b6Return the bytes for a given file handle. This may incur an expensive network\ncall for file managers backed by object stores such as s3.
\nfile_handle (FileHandle) \u2013 The file handle for which to return bytes.
\nBytes for a given file handle.
\nwrite
(file_obj: Union[TextIO, BinaryIO], mode: str = 'wb', ext: str = None) → dagster.core.storage.file_manager.FileHandle[source]\u00b6Write the bytes contained within the given file object into the file manager.
\nfile_obj (Union[TextIO, StringIO]) \u2013 A file-like object.
mode (Optional[str]) \u2013 The mode in which to write the file into the file manager.\nDefault: "wb"
.
ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None
.
A handle to the newly created file.
\ndagster.
local_file_manager
ResourceDefinition[source]\u00b6FileManager that provides abstract access to a local filesystem.
\nImplements the FileManager
API.
Examples:
\nimport tempfile\n\nfrom dagster import ModeDefinition, local_file_manager, pipeline, solid\n\n\n@solid(required_resource_keys={"file_manager"})\ndef write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n@solid(required_resource_keys={"file_manager"})\ndef read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_2) == b"bar"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "foo"\n fd.close()\n\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])\ndef files_pipeline():\n read_files(write_files())\n
@
dagster.
intermediate_storage
(required_resource_keys=None, name=None, is_persistent=True, config_schema=None)[source]\u00b6Creates an intermediate storage definition
\nThe decorated function will be passed as the intermediate_storage_creation_fn
to a\nIntermediateStorageDefinition
.
name (str) \u2013 The name of the intermediate storage.
is_persistent (bool) \u2013 Whether the storage is persistent in a way that can cross process/node\nboundaries. Re-execution with, for example, the multiprocess executor, or with\ndagster-airflow, requires a persistent storage mode.
required_resource_keys (Optional[Set[str]]) \u2013 The resources that this storage needs at runtime to function.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.intermediate_storage_config.
dagster.
IntermediateStorageDefinition
(name, is_persistent, required_resource_keys, config_schema=None, intermediate_storage_creation_fn=None, description=None)[source]\u00b6Defines intermediate data storage behaviors.
\nname (str) \u2013 Name of the storage mode.
is_persistent (bool) \u2013 Whether the storage is persistent in a way that can cross process/node\nboundaries. Re-execution with, for example, the multiprocess executor, or with\ndagster-airflow, requires a persistent storage mode.
required_resource_keys (Optional[Set[str]]) \u2013 The resources that this storage needs at runtime to function.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the storage\u2019s configuration schema.\nConfiguration data passed in this schema will be made available to the\nintermediate_storage_creation_fn
under init_context.intermediate_storage_config
.
intermediate_storage_creation_fn \u2013 (Callable[[InitIntermediateStorageContext], IntermediateStorage])\nCalled to construct the storage. This function should consume the init context and emit\na IntermediateStorage
.
dagster.
InitIntermediateStorageContext
[source]\u00b6Intermediate storage-specific initialization context.
\npipeline_def
\u00b6The definition of the pipeline in context.
\nmode_def
\u00b6The definition of the mode in context.
\nintermediate_storage_def
\u00b6The definition of the intermediate storage to be\nconstructed.
\npipeline_run
\u00b6The pipeline run in context.
\ninstance
\u00b6The instance.
\nenvironment_config
\u00b6The environment config.
\nEnvironmentConfig
\ntype_storage_plugin_registry
\u00b6Registry containing custom type\nstorage plugins.
\nTypeStoragePluginRegistry
\nresources
\u00b6Resources available in context.
\nAny
\nintermediate_storage_config
\u00b6The intermediate storage-specific configuration data\nprovided by the environment config. The schema for this data is defined by the\nconfig_schema
argument to IntermediateStorageDefinition
.
Dict[str, Any]
\ndagster.
DagsterInstance
(instance_type, local_artifact_storage, run_storage, event_storage, compute_log_manager, schedule_storage=None, scheduler=None, run_coordinator=None, run_launcher=None, settings=None, skip_validation_checks=False, ref=None)[source]\u00b6Core abstraction for managing Dagster\u2019s access to storage and other resources.
\nUse DagsterInstance.get() to grab the current DagsterInstance which will load based on\nthe values in the dagster.yaml
file in $DAGSTER_HOME
if set, otherwise falling back\nto an ephemeral in-memory set of components.
Configuration of this class should be done by setting values in $DAGSTER_HOME/dagster.yaml
.\nFor example, to use Postgres for run and event log storage, you can write a dagster.yaml
\nsuch as the following:
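As a sketch only (the exact module paths and config keys shown here are assumptions that should be checked against the dagster_postgres library you have installed):

run_storage:
  module: dagster_postgres.run_storage
  class: PostgresRunStorage
  config:
    postgres_url: "postgresql://user:password@localhost:5432/dagster"

event_log_storage:
  module: dagster_postgres.event_log
  class: PostgresEventLogStorage
  config:
    postgres_url: "postgresql://user:password@localhost:5432/dagster"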
instance_type (InstanceType) \u2013 Indicates whether the instance is ephemeral or persistent.\nUsers should not attempt to set this value directly or in their dagster.yaml
files.
local_artifact_storage (LocalArtifactStorage) \u2013 The local artifact storage is used to\nconfigure storage for any artifacts that require a local disk, such as schedules, or\nwhen using the filesystem system storage to manage files and intermediates. By default,\nthis will be a dagster.core.storage.root.LocalArtifactStorage
. Configurable\nin dagster.yaml
using the ConfigurableClass
\nmachinery.
run_storage (RunStorage) \u2013 The run storage is used to store metadata about ongoing and past\npipeline runs. By default, this will be a\ndagster.core.storage.runs.SqliteRunStorage
. Configurable in dagster.yaml
\nusing the ConfigurableClass
machinery.
event_storage (EventLogStorage) \u2013 Used to store the structured event logs generated by\npipeline runs. By default, this will be a\ndagster.core.storage.event_log.SqliteEventLogStorage
. Configurable in\ndagster.yaml
using the ConfigurableClass
machinery.
compute_log_manager (ComputeLogManager) \u2013 The compute log manager handles stdout and stderr\nlogging for solid compute functions. By default, this will be a\ndagster.core.storage.local_compute_log_manager.LocalComputeLogManager
.\nConfigurable in dagster.yaml
using the\nConfigurableClass
machinery.
run_coordinator (RunCoordinator) \u2013 A runs coordinator may be used to manage the execution\nof pipeline runs.
run_launcher (Optional[RunLauncher]) \u2013 Optionally, a run launcher may be used to enable\na Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\naddition to running them locally.
settings (Optional[Dict]) \u2013 Specifies certain per-instance settings,\nsuch as feature flags. These are set in the dagster.yaml
under a set of whitelisted\nkeys.
ref (Optional[InstanceRef]) \u2013 Used by internal machinery to pass instances across process\nboundaries.
add_daemon_heartbeat
(daemon_heartbeat)[source]\u00b6Called on a regular interval by the daemon
\nget_addresses_for_step_output_versions
(step_output_versions)[source]\u00b6For each given step output, finds whether an output exists with the given\nversion, and returns its address if it does.
\n\nlaunch_run
(run_id, external_pipeline)[source]\u00b6Launch a pipeline run.
\nThis method is typically called using instance.submit_run rather than being invoked\ndirectly. This method delegates to the RunLauncher
, if any, configured on the instance,\nand will call its implementation of RunLauncher.launch_run()
to begin the execution of\nthe specified run. Runs should be created in the instance (e.g., by calling\nDagsterInstance.create_run()
) before this method is called, and should be in the\nPipelineRunStatus.NOT_STARTED
state.
run_id (str) \u2013 The id of the run to launch.
\nreport_engine_event
(message, pipeline_run, engine_event_data=None, cls=None, step_key=None)[source]\u00b6Report an EngineEvent that occurred outside of a pipeline execution context.
\nsubmit_run
(run_id, external_pipeline)[source]\u00b6Submit a pipeline run to the coordinator.
\nThis method delegates to the RunCoordinator
, configured on the instance, and will\ncall its implementation of RunCoordinator.submit_run()
to send the run to the\ncoordinator for execution. Runs should be created in the instance (e.g., by calling\nDagsterInstance.create_run()
) before this method is called, and\nshould be in the PipelineRunStatus.NOT_STARTED
state. They also must have a non-null\nExternalPipelineOrigin.
run_id (str) \u2013 The id of the run.
\ndagster.core.instance.
InstanceRef
[source]\u00b6Serializable representation of a DagsterInstance
.
Users should not instantiate this class directly.
\ndagster.serdes.
ConfigurableClass
[source]\u00b6Abstract mixin for classes that can be loaded from config.
\nThis supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\nof conditional imports / optional extras_requires in dagster core and b) a magic directory or\nfile in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\nrun storage, pluggable with a config chunk like:
\nrun_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n
This same pattern should eventually be viable for other system components, e.g. engines.
\nThe ConfigurableClass
mixin provides the necessary hooks for classes to be instantiated from\nan instance of ConfigurableClassData
.
Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\ntype such as:
\n{'module': str, 'class': str, 'config': Field(Permissive())}\n
config_type
()[source]\u00b6dagster.ConfigType: The config type against which to validate a config yaml fragment\nserialized in an instance of ConfigurableClassData
.
from_config_value
(inst_data, config_value)[source]\u00b6New up an instance of the ConfigurableClass from a validated config value.
\nCalled by ConfigurableClassData.rehydrate.
\nconfig_value (dict) \u2013 The validated config value to use. Typically this should be the\nvalue
attribute of a\nEvaluateValueResult
.
A common pattern is for the implementation to align the config_value with the signature\nof the ConfigurableClass\u2019s constructor:
\n@staticmethod\ndef from_config_value(inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n
inst_data
\u00b6Subclass must be able to return the inst_data as a property if it has been constructed\nthrough the from_config_value code path.
\ndagster.serdes.
ConfigurableClassData
[source]\u00b6Serializable tuple describing where to find a class and the config fragment that should\nbe used to instantiate it.
\nUsers should not instantiate this class directly.
\nClasses intended to be serialized in this way should implement the\ndagster.serdes.ConfigurableClass
mixin.
dagster.core.storage.root.
LocalArtifactStorage
(base_dir, inst_data=None)[source]\u00b6config_type
()[source]\u00b6dagster.ConfigType: The config type against which to validate a config yaml fragment\nserialized in an instance of ConfigurableClassData
.
from_config_value
(inst_data, config_value)[source]\u00b6New up an instance of the ConfigurableClass from a validated config value.
\nCalled by ConfigurableClassData.rehydrate.
\nconfig_value (dict) \u2013 The validated config value to use. Typically this should be the\nvalue
attribute of a\nEvaluateValueResult
.
A common pattern is for the implementation to align the config_value with the signature\nof the ConfigurableClass\u2019s constructor:
\n@staticmethod\ndef from_config_value(inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n
inst_data
\u00b6Subclass must be able to return the inst_data as a property if it has been constructed\nthrough the from_config_value code path.
\ndagster.
PipelineRun
[source]\u00b6Serializable internal representation of a pipeline run, as stored in a\nRunStorage
.
dagster.core.storage.runs.
RunStorage
[source]\u00b6Abstract base class for storing pipeline run history.
\nNote that run storages using SQL databases as backing stores should implement\nSqlRunStorage
.
Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagit
and dagster-graphql
load, based on the values in the\ndagster.yaml
file in $DAGSTER_HOME
. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.
dagster.core.storage.runs.
SqlRunStorage
[source]\u00b6Base class for SQL based run storages
\ndagster.core.storage.runs.
SqliteRunStorage
(conn_string, inst_data=None)[source]\u00b6SQLite-backed run storage.
\nUsers should not directly instantiate this class; it is instantiated by internal machinery when\ndagit
and dagster-graphql
load, based on the values in the dagster.yaml
file in\n$DAGSTER_HOME
. Configuration of this class should be done by setting values in that file.
This is the default run storage when none is specified in the dagster.yaml
.
To explicitly specify SQLite for run storage, you can add a block such as the following to your\ndagster.yaml
:
run_storage:\n module: dagster.core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n
The base_dir
param tells the run storage where on disk to store the database.
See also: dagster_postgres.PostgresRunStorage
.
dagster.core.storage.event_log.
EventLogStorage
[source]\u00b6Abstract base class for storing structured event logs from pipeline runs.
\nNote that event log storages using SQL databases as backing stores should implement\nSqlEventLogStorage
.
Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagit
and dagster-graphql
load, based on the values in the\ndagster.yaml
file in $DAGSTER_HOME
. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.
dagster.core.storage.event_log.
SqlEventLogStorage
[source]\u00b6Base class for SQL backed event log storages.
\ndagster.core.storage.event_log.
SqliteEventLogStorage
(base_dir, inst_data=None)[source]\u00b6SQLite-backed event log storage.
\nUsers should not directly instantiate this class; it is instantiated by internal machinery when\ndagit
and dagster-graphql
load, based on the values in the dagster.yaml
file in\n$DAGSTER_HOME
. Configuration of this class should be done by setting values in that file.
This is the default event log storage when none is specified in the dagster.yaml
.
To explicitly specify SQLite for event log storage, you can add a block such as the following\nto your dagster.yaml
:
event_log_storage:\n module: dagster.core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n
The base_dir
param tells the event log storage where on disk to store the databases. To\nimprove concurrent performance, event logs are stored in a separate SQLite database for each\nrun.
See also: dagster_postgres.PostgresEventLogStorage
.
dagster.core.storage.compute_log_manager.
ComputeLogManager
[source]\u00b6Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\nsteps of pipeline solids.
\ndagster.core.storage.local_compute_log_manager.
LocalComputeLogManager
(base_dir, inst_data=None)[source]\u00b6Stores copies of stdout & stderr for each compute step locally on disk.
\nSee also: dagster_aws.S3ComputeLogManager
.
dagster.core.storage.memoizable_io_manager.
MemoizableIOManager
[source]\u00b6Base class for IO manager enabled to work with memoized execution. Users should implement\nthe load_input
and handle_output
methods described in the IOManager
API, and the\nhas_output
method, which returns a boolean representing whether a data object can be found.
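A rough sketch of a custom implementation, assuming the output context exposes the step_key, name, and version attributes used by versioned, memoized execution (the class name and directory layout here are invented for illustration):

import os
import pickle

from dagster.core.storage.memoizable_io_manager import MemoizableIOManager


class PickledVersionedIOManager(MemoizableIOManager):
    def __init__(self, base_dir):
        self._base_dir = base_dir

    def _path(self, step_key, name, version):
        # One file per (step, output, version) triple.
        return os.path.join(self._base_dir, step_key, name, str(version))

    def handle_output(self, context, obj):
        path = self._path(context.step_key, context.name, context.version)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    def load_input(self, context):
        upstream = context.upstream_output
        path = self._path(upstream.step_key, upstream.name, upstream.version)
        with open(path, "rb") as f:
            return pickle.load(f)

    def has_output(self, context):
        # Memoized execution can skip a step whose versioned output already exists.
        return os.path.exists(self._path(context.step_key, context.name, context.version))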
dagster.core.storage.memoizable_io_manager.
versioned_filesystem_io_manager
()[source]\u00b6Filesystem IO manager that utilizes versioning of stored objects.
\nIt allows users to specify a base directory in which all step outputs will be stored. It serializes and deserializes output values (assets) using pickling and automatically constructs the filepaths for the assets from the provided directory and the version of each step output.
\nSee also: dagster.IOManager
.
dagster.core.launcher.
DefaultRunLauncher
(inst_data=None)[source]\u00b6Launches runs against running GRPC servers.
\nSee also: dagster_k8s.K8sRunLauncher
.
dagster.core.run_coordinator.
DefaultRunCoordinator
(inst_data=None)[source]\u00b6Immediately send runs to the run launcher.
\ndagster.core.run_coordinator.
QueuedRunCoordinator
(max_concurrent_runs=None, tag_concurrency_limits=None, dequeue_interval_seconds=None, inst_data=None)[source]\u00b6Sends runs to the dequeuer process via the run storage. Requires the external process to be\nalive for runs to be launched.
\ndagster.core.scheduler.
Scheduler
[source]\u00b6Abstract base class for a scheduler. This component is responsible for interfacing with an external system such as cron to ensure scheduled, repeated execution according to the defined schedules.
\ndagster.core.scheduler.
DagsterDaemonScheduler
(max_catchup_runs=None, inst_data=None)[source]\u00b6Default scheduler implementation that submits runs from the dagster-daemon\nlong-lived process.
\ndagster_cron.cron_scheduler.
SystemCronScheduler
(inst_data=None)[source]\u00b6Scheduler implementation that uses the local system's cron. Only works on Unix systems that have cron.
\nEnable this scheduler by adding it to your dagster.yaml
in $DAGSTER_HOME
.
dagster.core.storage.schedules.
ScheduleStorage
[source]\u00b6Abstract class for managing persistence of scheduler artifacts.
\ndagster.core.storage.schedules.
SqlScheduleStorage
[source]\u00b6Base class for SQL backed schedule storage
\ndagster.core.storage.schedules.
SqliteScheduleStorage
(conn_string, inst_data=None)[source]\u00b6Local SQLite backed schedule storage
\nSee also: dagster_postgres.PostgresScheduleStorage
.
dagster.core.errors.
user_code_error_boundary
(error_cls, msg_fn, control_flow_exceptions=None, **kwargs)[source]\u00b6Wraps the execution of user-space code in an error boundary. This places a uniform policy around any user code invoked by the framework. This ensures that all user errors are wrapped in an exception derived from DagsterUserCodeExecutionError, and that the original stack trace of the user error is preserved, so that it can be reported without confusing framework code in the stack trace, if a tool author wishes to do so.
\nExamples:
\nwith user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n "Error occurred during step execution"\n):\n call_user_provided_function()\n
IO managers are user-provided objects that store solid outputs and load them as inputs to downstream\nsolids.
\n@
dagster.
io_manager
(config_schema=None, description=None, output_config_schema=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6Define an IO manager.
\nIOManagers are used to store solid outputs and load them as inputs to downstream solids.
\nThe decorated function should accept an InitResourceContext
and return an\nIOManager
.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource config. Configuration\ndata available in init_context.resource_config.
description (Optional[str]) \u2013 A human-readable description of the resource.
output_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-output config.
input_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-input config.
required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the IO manager.
version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.
Examples:
\nclass MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n@io_manager\ndef my_io_manager(init_context):\n return MyIOManager()\n\n@solid(output_defs=[OutputDefinition(io_manager_key="my_io_manager_key")])\ndef my_solid(_):\n return do_stuff()\n\n@pipeline(\n mode_defs=[ModeDefinition(resource_defs={"my_io_manager_key": my_io_manager})]\n)\ndef my_pipeline():\n my_solid()\n\nexecute_pipeline(my_pipeline)\n
dagster.
IOManager
[source]\u00b6Base class for user-provided IO managers.
\nIOManagers are used to store solid outputs and load them as inputs to downstream solids.
\nExtend this class to handle how objects are loaded and stored. Users should implement\nhandle_output
to store an object and load_input
to retrieve an object.
handle_output
(context, obj)[source]\u00b6User-defined method that stores an output of a solid.
\ncontext (OutputContext) \u2013 The context of the step output that produces this object.
obj (Any) \u2013 The object, returned by the solid, to be stored.
load_input
(context)[source]\u00b6User-defined method that loads an input to a solid.
\ncontext (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.
\nThe data object.
\nAny
\ndagster.
IOManagerDefinition
(resource_fn=None, config_schema=None, description=None, required_resource_keys=None, version=None, input_config_schema=None, output_config_schema=None)[source]\u00b6Definition of an IO manager resource.
\nIOManagers are used to store solid outputs and load them as inputs to downstream solids.
\nAn IOManagerDefinition is a ResourceDefinition
whose resource_fn returns an\nIOManager
.
The easiest way to create an IOManagerDefinition is with the @io_manager
\ndecorator.
input_config_schema
\u00b6The schema for per-input configuration for inputs that are managed by this\ninput manager
\noutput_config_schema
\u00b6The schema for per-output configuration for outputs that are managed by this\nmanager
\ndagster.
InputContext
[source]\u00b6The context
object available to the load_input method of RootInputManager
.
solid_def
\u00b6The definition of the solid that\u2019s loading the input.
\nOptional[SolidDefinition]
\nconfig
\u00b6The config attached to the input that we\u2019re loading.
\nOptional[Any]
\nmetadata
\u00b6A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.
\nOptional[Dict[str, Any]]
\nupstream_output
\u00b6Info about the output that produced the object\nwe\u2019re loading.
\nOptional[OutputContext]
\ndagster_type
\u00b6The type of this input.
\nOptional[DagsterType]
\nlog
\u00b6The log manager to use for this input.
\nOptional[DagsterLogManager]
\nresource_config
\u00b6The config associated with the resource that\ninitializes the RootInputManager.
\nOptional[Dict[str, Any]]
\nresources
\u00b6The resources required by the resource that initializes the\ninput manager. If using the @root_input_manager()
decorator, these resources\ncorrespond to those requested with the required_resource_keys parameter.
ScopedResources
\ndagster.
OutputContext
[source]\u00b6The context object that is available to the handle_output method of an IOManager
.
metadata
\u00b6A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.
\nOptional[Dict[str, Any]]
\nmapping_key
\u00b6The key that identifies a unique mapped output. None for regular outputs.
\nOptional[str]
\nconfig
\u00b6The configuration for the output.
\nOptional[Any]
\nsolid_def
\u00b6The definition of the solid that produced the output.
\nOptional[SolidDefinition]
\ndagster_type
\u00b6The type of this output.
\nOptional[DagsterType]
\nlog
\u00b6The log manager to use for this output.
\nOptional[DagsterLogManager]
\nresources
\u00b6The resources required by the output manager, specified by the\nrequired_resource_keys parameter.
\nOptional[ScopedResources]
\nget_run_scoped_output_identifier
() → List[str][source]\u00b6Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.
\nThe unique identifier collection consists of
\nrun_id
: the id of the run which generates the output. Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id
will be the id\nof its parent run.
step_key
: the key for a compute step.
name
: the name of the output. (default: \u2018result\u2019).
A list of identifiers, i.e. run id, step key, and output name
\nList[str, ..]
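As an illustrative sketch (not part of the library), a custom IOManager might use get_run_scoped_output_identifier() to key its storage; the class name and base directory below are arbitrary:

import os
import pickle

from dagster import IOManager, io_manager


class RunScopedPickleIOManager(IOManager):
    """Hypothetical IO manager that keys pickled outputs by run id, step key, and output name."""

    def __init__(self, base_dir="/tmp/dagster_outputs"):
        self.base_dir = base_dir

    def _path(self, output_context):
        # ["<run_id>", "<step_key>", "<output_name>"] -> base_dir/<run_id>/<step_key>/<output_name>
        return os.path.join(self.base_dir, *output_context.get_run_scoped_output_identifier())

    def handle_output(self, context, obj):
        path = self._path(context)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    def load_input(self, context):
        # Read from wherever the upstream output was written.
        with open(self._path(context.upstream_output), "rb") as f:
            return pickle.load(f)


@io_manager
def run_scoped_pickle_io_manager(_):
    return RunScopedPickleIOManager()

Because the identifier handles re-execution memoization as described above, loads in a re-executed run resolve to the parent run\u2019s outputs for steps that were skipped.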
\ndagster.
mem_io_manager
= <dagster.core.storage.io_manager.IOManagerDefinition object>[source]\u00b6Built-in IO manager that stores and retrieves values in memory.
\ndagster.
fs_io_manager
= <dagster.core.storage.io_manager.IOManagerDefinition object>[source]\u00b6Built-in filesystem IO manager that stores and retrieves values using pickling.
\nIt allows users to specify a base directory where all the step outputs will be stored. It\nserializes and deserializes output values using pickling and automatically constructs\nthe filepaths for the assets.
\nExample usage:
\n1. Specify a pipeline-level IO manager using the reserved resource key "io_manager"
,\nwhich will set the given IO manager on all solids across a pipeline.
@solid\ndef solid_a(context, df):\n return df\n\n@solid\ndef solid_b(context, df):\n return df[:5]\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])\ndef pipe():\n solid_b(solid_a())\n
2. Specify IO manager on OutputDefinition
, which allows the user to set\ndifferent IO managers on different step outputs.
@solid(output_defs=[OutputDefinition(io_manager_key="my_io_manager")])\ndef solid_a(context, df):\n return df\n\n@solid\ndef solid_b(context, df):\n return df[:5]\n\n@pipeline(\n mode_defs=[ModeDefinition(resource_defs={"my_io_manager": fs_io_manager})]\n)\ndef pipe():\n solid_b(solid_a())\n
dagster.
custom_path_fs_io_manager
= <dagster.core.storage.io_manager.IOManagerDefinition object>[source]\u00b6Built-in IO manager that allows users to customize the output file path per output definition.
\nIt also allows users to specify a base directory where all the step outputs will be stored. It\nserializes and deserializes output values (assets) using pickling and stores the pickled objects\nin the user-provided file paths.
\nExample usage:
\n@solid(\n output_defs=[\n OutputDefinition(\n io_manager_key="io_manager", metadata={"path": "path/to/sample_output"}\n )\n ]\n)\ndef sample_data(context, df):\n return df[:5]\n\n@pipeline(\n mode_defs=[\n ModeDefinition(resource_defs={"io_manager": custom_path_fs_io_manager}),\n ],\n)\ndef pipe():\n sample_data()\n
Root input managers are user-provided objects that specify how to load inputs that aren\u2019t connected\nto upstream outputs.
\n@
dagster.
root_input_manager
(config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6Define a root input manager.
\nRoot input managers load solid inputs that aren\u2019t connected to upstream outputs.
\nThe decorated function should accept a InputContext
and resource config, and return\na loaded object that will be passed into one of the inputs of a solid.
The decorator produces an RootInputManagerDefinition
.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource-level config.
description (Optional[str]) \u2013 A human-readable description of the resource.
input_config_schema (Optional[ConfigSchema]) \u2013 A schema for the input-level config. Each\ninput that uses this input manager can be configured separately using this config.
required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the input\nmanager.
version (Optional[str]) \u2013 (Experimental) the version of the input manager definition.
Examples:
\n@root_input_manager\ndef csv_loader(_):\n return read_csv("some/path")\n\n@solid(input_defs=[InputDefinition("input1", root_manager_key="csv_loader_key")])\ndef my_solid(_, input1):\n do_stuff(input1)\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"csv_loader_key": csv_loader})])\ndef my_pipeline():\n my_solid()\n\n@root_input_manager(config_schema={"base_dir": str})\ndef csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n@root_input_manager(input_config_schema={"path": str})\ndef csv_loader(context):\n return read_csv(context.config["path"])\n
dagster.
RootInputManager
[source]\u00b6RootInputManagers are used to load inputs to solids at the root of a pipeline.
\nThe easiest way to define an RootInputManager is with the\n@root_input_manager
decorator.
load_input
(context)[source]\u00b6The user-defined read method that loads data given its metadata.
\ncontext (InputContext) \u2013 The context of the step output that produces this asset.
\nThe data object.
\nAny
\ndagster.
RootInputManagerDefinition
(resource_fn=None, config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6Definition of a root input manager resource.
\nRoot input managers load solid inputs that aren\u2019t connected to upstream outputs.
\nAn RootInputManagerDefinition is a ResourceDefinition
whose resource_fn returns an\nRootInputManager
.
The easiest way to create an RootInputManagerDefinition is with the\n@root_input_manager
decorator.
input_config_schema
\u00b6The schema for per-input configuration for inputs that are managed by this\ninput manager
\ndagster_airflow.
make_airflow_dag
(module_name, pipeline_name, run_config=None, mode=None, instance=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None)[source]\u00b6Construct an Airflow DAG corresponding to a given Dagster pipeline.
\nTasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python\ncallable, run by an underlying PythonOperator
. As a\nconsequence, dagster itself, any Python dependencies required by your solid logic, and the module\ncontaining your pipeline definition must all be available in the Python environment within which\nyour Airflow tasks execute. If you cannot install requirements into this environment, or you\nare looking for a containerized solution to provide better isolation, see instead\nmake_airflow_dag_containerized()
.
This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.
\nmodule_name (str) \u2013 The name of the importable module in which the pipeline definition can be\nfound.
pipeline_name (str) \u2013 The name of the pipeline definition.
run_config (Optional[dict]) \u2013 The environment config, if any, with which to compile\nthe pipeline to an execution plan, as a Python dict.
mode (Optional[str]) \u2013 The mode in which to execute the pipeline.
instance (Optional[DagsterInstance]) \u2013 The Dagster instance to use to execute the pipeline.
dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG
).
dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG
)
dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG
constructor, including default_args
.
op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator (a subclass of\nPythonOperator
).
The generated Airflow DAG, and a\nlist of its constituent tasks.
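A minimal sketch of how this might look inside a DAG definition file (the module and pipeline names here are hypothetical, and default_args is abbreviated):

from datetime import datetime

from dagster_airflow import make_airflow_dag

# Airflow discovers `dag` at module scope; `tasks` mirrors the pipeline's execution plan.
dag, tasks = make_airflow_dag(
    module_name="my_package.my_pipelines",  # hypothetical module containing the pipeline
    pipeline_name="my_pipeline",            # hypothetical pipeline name
    dag_kwargs={"default_args": {"owner": "airflow", "start_date": datetime(2021, 1, 1)}},
)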
\ndagster_airflow.
make_airflow_dag_for_operator
(recon_repo, pipeline_name, operator, run_config=None, mode=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None)[source]\u00b6Construct an Airflow DAG corresponding to a given Dagster pipeline and custom operator.
\nTasks in the resulting DAG will execute the Dagster logic they encapsulate, run by the given\nOperator, which must be a subclass of BaseOperator
. If you\nare looking for a containerized solution to provide better isolation, see instead\nmake_airflow_dag_containerized()
.
This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.
\nrecon_repo (dagster.ReconstructableRepository
) \u2013 reference to a Dagster RepositoryDefinition\nthat can be reconstructed in another process
pipeline_name (str) \u2013 The name of the pipeline definition.
operator (type) \u2013 The operator to use. Must be a class that inherits from\nBaseOperator
run_config (Optional[dict]) \u2013 The environment config, if any, with which to compile\nthe pipeline to an execution plan, as a Python dict.
mode (Optional[str]) \u2013 The mode in which to execute the pipeline.
instance (Optional[DagsterInstance]) \u2013 The Dagster instance to use to execute the pipeline.
dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG
).
dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG
)
dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG
constructor, including default_args
.
op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator.
The generated Airflow DAG, and a\nlist of its constituent tasks.
\ndagster_airflow.
make_airflow_dag_containerized
(module_name, pipeline_name, image, run_config=None, mode=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None)[source]\u00b6Construct a containerized Airflow DAG corresponding to a given Dagster pipeline.
\nTasks in the resulting DAG will execute the Dagster logic they encapsulate by calling the\ndagster-graphql API exposed by a container run using a subclass of\nDockerOperator
. As a\nconsequence, dagster itself, any Python dependencies required by your solid logic, and the module\ncontaining your pipeline definition must all be available in the container spun up by this operator.\nTypically you\u2019ll want to install these requirements onto the image you\u2019re using.
This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.
\nmodule_name (str) \u2013 The name of the importable module in which the pipeline definition can be\nfound.
pipeline_name (str) \u2013 The name of the pipeline definition.
image (str) \u2013 The name of the Docker image to use for execution (passed through to\nDockerOperator
).
run_config (Optional[dict]) \u2013 The environment config, if any, with which to compile\nthe pipeline to an execution plan, as a Python dict.
mode (Optional[str]) \u2013 The mode in which to execute the pipeline.
dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG
).
dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG
)
dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG
constructor, including default_args
.
op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator (a subclass of\nDockerOperator
).
The generated Airflow DAG, and a\nlist of its constituent tasks.
\ndagster_airflow.
make_dagster_pipeline_from_airflow_dag
(dag, tags=None, use_airflow_template_context=False, unique_id=None)[source]\u00b6Construct a Dagster pipeline corresponding to a given Airflow DAG.
\nTasks in the resulting pipeline will execute the execute()
method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.
To set Airflow\u2019s execution_date
for use with Airflow Operator\u2019s execute()
methods,\neither:
1. Execute the pipeline with the \u2018default\u2019 preset, which sets execution_date to the\ntime (in UTC) of pipeline invocation:
\nexecute_pipeline(\n pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),\n preset='default')\n
2. Add {'airflow_execution_date': utc_date_string}
to the PipelineDefinition tags. This will\noverride behavior from (1).
\n\n\n\nexecute_pipeline(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n)\n
3. Add {'airflow_execution_date': utc_date_string} to the PipelineRun tags, such as in the Dagit UI. This will override behavior from (1) and (2).
\nWe apply normalized_name() to the dag id and task ids when generating pipeline name and solid\nnames to ensure that names conform to Dagster\u2019s naming conventions.
\ndag (DAG) \u2013 The Airflow DAG to compile into a Dagster pipeline
tags (Dict[str, Field]) \u2013 Pipeline tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.
use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)
unique_id (int) \u2013 If not None, this id will be postpended to generated solid names. Used by\nframework authors to enforce unique solid names within a repo.
The generated Dagster pipeline
\npipeline_def (PipelineDefinition)
\ndagster_airflow.
make_dagster_repo_from_airflow_dags_path
(dag_path, repo_name, safe_mode=True, store_serialized_dags=False, use_airflow_template_context=False)[source]\u00b6Construct a Dagster repository corresponding to Airflow DAGs in dag_path.
\nDagBag.get_dag()
dependency requires Airflow DB to be initialized.
Create make_dagster_repo.py
:
from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dags_path\n\ndef make_repo_from_dir():\n return make_dagster_repo_from_airflow_dags_path(\n '/path/to/dags/', 'my_repo_name'\n )\n
Use RepositoryDefinition as usual, for example:\ndagit -f path/to/make_dagster_repo.py -n make_repo_from_dir
dag_path (str) \u2013 Path to directory or file that contains Airflow Dags
repo_name (str) \u2013 Name for generated RepositoryDefinition
include_examples (bool) \u2013 True to include Airflow\u2019s example DAGs. (default: False)
safe_mode (bool) \u2013 True to use Airflow\u2019s default heuristic to find files that contain DAGs\n(ie find files that contain both b\u2019DAG\u2019 and b\u2019airflow\u2019) (default: True)
store_serialized_dags (bool) \u2013 True to read Airflow DAGS from Airflow DB. False to read DAGS\nfrom Python files. (default: False)
use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)
RepositoryDefinition
\ndagster_airflow.
make_dagster_repo_from_airflow_dag_bag
(dag_bag, repo_name, refresh_from_airflow_db=False, use_airflow_template_context=False)[source]\u00b6Construct a Dagster repository corresponding to Airflow DAGs in DagBag.
\nfrom dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag\nfrom airflow_home import my_dag_bag
\ndef make_repo_from_dag_bag():\n    return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, 'my_repo_name')
\ndagit -f path/to/make_dagster_repo.py -n make_repo_from_dag_bag
\ndag_bag (DagBag) \u2013 The Airflow DagBag from which to construct the repository
repo_name (str) \u2013 Name for generated RepositoryDefinition
refresh_from_airflow_db (bool) \u2013 If True, will refresh DAG if expired via DagBag.get_dag(),\nwhich requires access to initialized Airflow DB. If False (recommended), gets dag from\nDagBag\u2019s dags dict without depending on Airflow DB. (default: False)
use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)
RepositoryDefinition
\ndagster_airflow.
make_dagster_repo_from_airflow_example_dags
(repo_name='airflow_example_dags_repo')[source]\u00b6Construct a Dagster repository for Airflow\u2019s example DAGs.
\nExecution of the following Airflow example DAGs is not currently supported:\n\u2018example_external_task_marker_child\u2019,\n\u2018example_pig_operator\u2019,\n\u2018example_skip_dag\u2019,\n\u2018example_trigger_target_dag\u2019,\n\u2018example_xcom\u2019,\n\u2018test_utils\u2019
\nUsage:
Create make_dagster_repo.py:

from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_example_dags

def make_airflow_example_dags():
    return make_dagster_repo_from_airflow_example_dags()

Use RepositoryDefinition as usual, for example:

dagit -f path/to/make_dagster_repo.py -n make_airflow_example_dags
repo_name (str) \u2013 Name for generated RepositoryDefinition
\nRepositoryDefinition
\ndagster_aws.s3.
S3ComputeLogManager
(bucket, local_dir=None, inst_data=None, prefix='dagster', use_ssl=True, verify=True, verify_cert_path=None, endpoint_url=None)[source]\u00b6Logs solid compute function stdout and stderr to S3.
\nUsers should not instantiate this class directly. Instead, use a YAML block in dagster.yaml
\nsuch as the following:
compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n
bucket (str) \u2013 The name of the s3 bucket to which to log.
local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster.seven.get_system_temp_directory()
.
prefix (Optional[str]) \u2013 Prefix for the log file keys.
use_ssl (Optional[bool]) \u2013 Whether or not to use SSL. Default True.
verify (Optional[bool]) \u2013 Whether or not to verify SSL certificates. Default True.
verify_cert_path (Optional[str]) \u2013 A filename of the CA cert bundle to use. Only used if\nverify is set to False.
endpoint_url (Optional[str]) \u2013 Override for the S3 endpoint url.
inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.
dagster_aws.s3.
S3FileCache
(s3_bucket, s3_key, s3_session, overwrite=False)[source]\u00b6dagster_aws.s3.
S3FileHandle
(s3_bucket: str, s3_key: str)[source]\u00b6A reference to a file on S3.
\ndagster_aws.s3.
s3_file_manager
ResourceDefinition[source]\u00b6FileManager that provides abstract access to S3.
\nImplements the FileManager
API.
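A rough usage sketch follows (assuming the resource is bound under the key "file_manager" and accepts an s3_bucket config field; write_data/read_data are methods of the FileManager API referenced above):

from dagster import ModeDefinition, execute_solid, solid
from dagster_aws.s3 import s3_file_manager

@solid(required_resource_keys={"file_manager"})
def roundtrip_bytes(context):
    # Write raw bytes to S3, get back a file handle, then read the bytes back.
    handle = context.resources.file_manager.write_data(b"hello")
    return context.resources.file_manager.read_data(handle)

result = execute_solid(
    roundtrip_bytes,
    run_config={
        "resources": {"file_manager": {"config": {"s3_bucket": "my-bucket"}}}  # assumed config key
    },
    mode_def=ModeDefinition(resource_defs={"file_manager": s3_file_manager}),
)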
dagster_aws.s3.
s3_resource
ResourceDefinition[source]\u00b6Resource that gives solids access to S3.
\nThe underlying S3 session is created by calling boto3.resource('s3')
.
Attach this resource definition to a ModeDefinition
in order to make it\navailable to your solids.
Example
\nfrom dagster import ModeDefinition, execute_solid, solid\nfrom dagster_aws.s3 import s3_resource\n\n@solid(required_resource_keys={'s3'})\ndef example_s3_solid(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\nresult = execute_solid(\n example_s3_solid,\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n },\n mode_def=ModeDefinition(resource_defs={'s3': s3_resource}),\n)\n
Note that your solids must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.
\nYou may configure this resource as follows:
\nresources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n
dagster_aws.s3.
S3Coordinate
DagsterType\u00b6A dagster.DagsterType
intended to make it easier to pass information about files on S3\nfrom solid to solid. Objects of this type should be dicts with 'bucket'
and 'key'
keys,\nand may be hydrated from config in the intuitive way, e.g., for an input with the name\ns3_file
:
inputs:\n s3_file:\n value:\n bucket: my-bucket\n key: my-key\n
dagster_aws.s3.
s3_pickle_io_manager
IOManagerDefinition[source]\u00b6Persistent IO manager using S3 for storage.
\nSerializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.
\nAttach this resource definition to a ModeDefinition
\nin order to make it available to your pipeline:
pipeline_def = PipelineDefinition(\n mode_defs=[\n ModeDefinition(\n resource_defs={'io_manager': s3_pickle_io_manager, "s3": s3_resource, ...},\n ), ...\n ], ...\n)\n
You may configure this storage as follows:
\nresources:\n io_manager:\n config:\n s3_bucket: my-cool-bucket\n s3_prefix: good/prefix-for-files-\n
dagster_aws.redshift.
redshift_resource
ResourceDefinition[source]\u00b6This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.
\nExample
\nfrom dagster import ModeDefinition, execute_solid, solid\nfrom dagster_aws.redshift import redshift_resource\n\n@solid(required_resource_keys={'redshift'})\ndef example_redshift_solid(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\nresult = execute_solid(\n example_redshift_solid,\n run_config={\n 'resources': {\n 'redshift': {\n 'config': {\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n }\n }\n }\n },\n mode_def=ModeDefinition(resource_defs={'redshift': redshift_resource}),\n)\nassert result.output_value() == [(1,)]\n
dagster_aws.emr.
emr_pyspark_step_launcher
ResourceDefinition[source]\u00b6spark_config:
cluster_id: Name of the job flow (cluster) on which to execute.
region_name: The AWS region that the cluster is in.
action_on_failure: The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html
staging_bucket: S3 bucket to use for passing files between the plan process and EMR process.
staging_prefix: S3 key prefix inside the staging_bucket to use for files passed between the plan process and EMR process
wait_for_logs: If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.
local_pipeline_package_path: Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the Python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on S3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.
deploy_local_pipeline_package: If set, before every step run, the launcher will zip up all the code in local_pipeline_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_pipeline_package_path should not also be set.
s3_pipeline_package_path: If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.
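A sketch of wiring this step launcher into a mode (the resource keys and the pairing with dagster_pyspark's pyspark_resource follow the common pattern for pyspark step launchers; cluster configuration for the launcher itself is omitted and would be supplied via run config):

from dagster import ModeDefinition, pipeline, solid
from dagster_aws.emr import emr_pyspark_step_launcher
from dagster_pyspark import pyspark_resource

@solid(required_resource_keys={"pyspark", "pyspark_step_launcher"})
def count_rows(context):
    # Runs on the EMR cluster's Spark instance via the step launcher.
    return context.resources.pyspark.spark_session.range(100).count()

@pipeline(
    mode_defs=[
        ModeDefinition(
            "emr",
            resource_defs={
                "pyspark_step_launcher": emr_pyspark_step_launcher,
                "pyspark": pyspark_resource,
            },
        )
    ]
)
def emr_pipeline():
    count_rows()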
dagster_aws.cloudwatch.
cloudwatch_logger
LoggerDefinition\u00b6Core class for defining loggers.
\nLoggers are pipeline-scoped logging handlers, which will be automatically invoked whenever\nsolids in a pipeline log messages.
\nlogger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log
are called from within solid compute logic.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config.
description (Optional[str]) \u2013 A human-readable description of this logger.
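A sketch of attaching this logger to a pipeline; the config fields shown (log group/stream names and region) are assumptions and should be checked against the logger's config schema:

from dagster import ModeDefinition, execute_pipeline, pipeline, solid
from dagster_aws.cloudwatch import cloudwatch_logger

@solid
def hello_logger(context):
    context.log.info("Hello, CloudWatch!")

@pipeline(mode_defs=[ModeDefinition(logger_defs={"cloudwatch": cloudwatch_logger})])
def cloudwatch_pipeline():
    hello_logger()

execute_pipeline(
    cloudwatch_pipeline,
    run_config={
        "loggers": {
            "cloudwatch": {
                "config": {
                    # Assumed config fields -- verify against the logger's config schema.
                    "log_group_name": "my-log-group",
                    "log_stream_name": "my-log-stream",
                    "aws_region": "us-west-1",
                }
            }
        }
    },
)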
Utilities for using Azure Storage Accounts with Dagster. This is mostly aimed at Azure Data Lake\nStorage Gen 2 (ADLS2) but also contains some utilities for Azure Blob Storage.
\nNOTE: This package is incompatible with dagster-snowflake
! This is due to a version mismatch\nbetween the underlying azure-storage-blob
package; dagster-snowflake
has a transitive\ndependency on an old version, via snowflake-connector-python
.
To get a local rabbitmq broker started and available via the default\npyamqp://guest@localhost:5672
, in the dagster/python_modules/libraries/dagster-celery/
\ndirectory run:
docker-compose up\n
To run a celery worker:
\ncelery -A dagster_celery.app worker -l info\n
To start multiple workers in the background, run:
\ncelery multi start w2 -A dagster_celery.app -l info\n
To execute a pipeline using the celery-backed executor, you\u2019ll need to add the celery executor to\na mode definition on the pipeline:
\nfrom dagster import default_executors\nfrom dagster_celery import celery_executor\n\n@pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [celery_executor])])\ndef my_pipeline():\n pass\n
Then you can use config like the following to execute the pipeline:
\nexecution:\n celery:\n
We advise using [Flower](https://celery.readthedocs.io/en/latest/userguide/monitoring.html#flower-real-time-celery-web-monitor):
\ncelery -A dagster_celery.app flower\n
By default this will use amqp://guest:**@localhost:5672//
as the Celery broker URL and\nrpc://
as the results backend. In production, you will want to change these values. Pending the\nintroduction of a dagster_celery CLI, that would entail writing a Python module my_module
as\nfollows:
from celery import Celery\n\nfrom dagster_celery.tasks import create_task\n\napp = Celery('dagster', broker_url='some://custom@value', ...)\n\nexecute_plan = create_task(app)\n\nif __name__ == '__main__':\n app.worker_main()\n
You can then run the celery worker using:
\ncelery -A my_module worker --loglevel=info\n
This customization mechanism is used to implement dagster_celery_k8s and dagster_celery_docker, which delegate the execution of steps to ephemeral kubernetes pods and docker containers, respectively.
\nCelery is a rich and full-featured system. We\u2019ve found the following resources helpful:
\nDeni Bertovi\u0107\u2019s [Celery best practices](https://denibertovic.com/posts/celery-best-practices/)
Pawel Zadrozny\u2019s [series of articles](https://pawelzny.com/python/celery/2017/08/14/celery-4-tasks-best-practices/) on Celery best practices
Balthazar Rouberol\u2019s [Celery best practices](https://blog.balthazar-rouberol.com/celery-best-practices)
dagster_celery.
celery_executor
ExecutorDefinition[source]\u00b6Celery-based executor.
\nThe Celery executor exposes config settings for the underlying Celery app under\nthe config_source
key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery
constructor as its config_source
argument.\n(See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)
The executor also exposes the broker
, backend, and include
arguments to the\ncelery.Celery
constructor.
In the most common case, you may want to modify the broker
and backend
(e.g., to use\nRedis instead of RabbitMQ). We expect that config_source
will be less frequently\nmodified, but that when solid executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute pipelines\nwith variations on these settings.
If you\u2019d like to configure a celery executor in addition to the\ndefault_executors
, you should add it to the executor_defs
defined on a\nModeDefinition
as follows:
from dagster import ModeDefinition, default_executors, pipeline\nfrom dagster_celery import celery_executor\n\n@pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [celery_executor])])\ndef celery_enabled_pipeline():\n pass\n
Then you can configure the executor as follows:
\nexecution:\n celery:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n
Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.
\nThe dagster-celery
CLI lets you start, monitor, and terminate workers.
Start a dagster celery worker.
\ndagster-celery worker start [OPTIONS] [ADDITIONAL_ARGS]...\n
Options
\n-n
,
--name
<name>
\u00b6The name of the worker. Defaults to a unique name prefixed with \u201cdagster-\u201d and ending with the hostname.
\n-y
,
--config-yaml
<config_yaml>
\u00b6Specify the path to a config YAML file with options for the worker. This is the same config block that you provide to dagster_celery.celery_executor when configuring a pipeline for execution with Celery, with, e.g., the URL of the broker to use.
\n-q
,
--queue
<queue>
\u00b6Names of the queues on which this worker should listen for tasks. Provide multiple -q arguments to specify multiple queues. Note that each celery worker may listen on no more than four queues.
\n-d
,
--background
\u00b6Set this flag to run the worker in the background.
\n-i
,
--includes
<includes>
\u00b6Python modules the worker should import. Provide multiple -i arguments to specify multiple modules.
\n-l
,
--loglevel
<loglevel>
\u00b6Log level for the worker.
\n-A
,
--app
<app>
\u00b6Arguments
\nADDITIONAL_ARGS
\u00b6Optional argument(s)
\nList running dagster-celery workers. Note that we use the broker to contact the workers.
\ndagster-celery worker list [OPTIONS]\n
Options
\n-y
,
--config-yaml
<config_yaml>
\u00b6Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a pipeline for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to find your workers (since the CLI won\u2019t know how to reach the broker).
\nShut down dagster-celery workers. Note that we use the broker to send signals to the workers to terminate \u2013 if the broker is not running, this command is a no-op. Provide the argument NAME to terminate a specific worker by name.
\ndagster-celery worker terminate [OPTIONS] [NAME]\n
Options
\n-a
,
--all
\u00b6Set this flag to terminate all running workers.
\n-y
,
--config-yaml
<config_yaml>
\u00b6Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a pipeline for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to terminate your workers (since the CLI won\u2019t know how to reach the broker).
\nArguments
\nNAME
\u00b6Optional argument
\ndagster_celery_docker.
celery_docker_executor
ExecutorDefinition[source]\u00b6Celery-based executor which launches tasks in docker containers.
\nThe Celery executor exposes config settings for the underlying Celery app under\nthe config_source
key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery
constructor as its config_source
argument.\n(See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)
The executor also exposes the broker
, backend, and include
arguments to the\ncelery.Celery
constructor.
In the most common case, you may want to modify the broker
and backend
(e.g., to use\nRedis instead of RabbitMQ). We expect that config_source
will be less frequently\nmodified, but that when solid executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute pipelines\nwith variations on these settings.
If you\u2019d like to configure a Celery Docker executor in addition to the\ndefault_executors
, you should add it to the executor_defs
defined on a\nModeDefinition
as follows:
from dagster import ModeDefinition, default_executors, pipeline\nfrom dagster_celery_docker.executor import celery_docker_executor\n\n@pipeline(mode_defs=[\n ModeDefinition(executor_defs=default_executors + [celery_docker_executor])\n])\ndef celery_enabled_pipeline():\n pass\n
Then you can configure the executor as follows:
\nexecution:\n celery-docker:\n config:\n\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n
Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.
\nIn deployments where the celery_docker_executor is used, all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_docker.app argument.
\ndagster_celery_k8s.
CeleryK8sRunLauncher
(instance_config_map, dagster_home, postgres_password_secret, load_incluster_config=True, kubeconfig_file=None, broker=None, backend=None, include=None, config_source=None, retries=None, inst_data=None, k8s_client_batch_api=None)[source]\u00b6In contrast to the K8sRunLauncher
, which launches pipeline runs as single K8s\nJobs, this run launcher is intended for use in concert with\ndagster_celery_k8s.celery_k8s_job_executor()
.
With this run launcher, execution is delegated to:
1. A run coordinator Kubernetes Job, which traverses the pipeline run execution plan and
submits steps to Celery queues for execution;
2. The step executions which are submitted to Celery queues are picked up by Celery workers,
and each step execution spawns a step execution Kubernetes Job. See the implementation
defined in dagster_celery_k8s.executor.create_k8s_job_task().
You may configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml
like the following:
run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n
As always when using a ConfigurableClass
, the values\nunder the config
key of this YAML block will be passed to the constructor. The full list\nof acceptable values is given below by the constructor args.
instance_config_map (str) \u2013 The name
of an existing Volume to mount into the pod in\norder to provide a ConfigMap for the Dagster instance. This Volume should contain a\ndagster.yaml
with appropriate values for run storage, event log storage, etc.
dagster_home (str) \u2013 The location of DAGSTER_HOME in the Job container; this is where the\ndagster.yaml
file will be mounted from the instance ConfigMap specified above.
postgres_password_secret (str) \u2013 The name of the Kubernetes Secret where the postgres\npassword can be retrieved. Will be mounted and supplied as an environment variable to\nthe Job Pod.
load_incluster_config (Optional[bool]) \u2013 Set this value if you are running the launcher\nwithin a k8s cluster. If True
, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config
. Otherwise,\nwe will use the k8s config specified in kubeconfig_file
(using\nkubernetes.config.load_kube_config
) or fall back to the default kubeconfig. Default:\nTrue
.
kubeconfig_file (Optional[str]) \u2013 The kubeconfig file from which to load config. Defaults to\nNone (using the default kubeconfig).
broker (Optional[str]) \u2013 The URL of the Celery broker.
backend (Optional[str]) \u2013 The URL of the Celery backend.
include (List[str]) \u2013 List of includes for the Celery workers
config_source (Optional[dict]) \u2013 Additional settings for the Celery app.
retries (Optional[dict]) \u2013 Default retry configuration for Celery tasks.
dagster_celery_k8s.
celery_k8s_job_executor
ExecutorDefinition[source]\u00b6Celery-based executor which launches tasks as Kubernetes Jobs.
\nThe Celery executor exposes config settings for the underlying Celery app under\nthe config_source
key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery
constructor as its config_source
argument.\n(See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)
The executor also exposes the broker
, backend, and include
arguments to the\ncelery.Celery
constructor.
In the most common case, you may want to modify the broker
and backend
(e.g., to use\nRedis instead of RabbitMQ). We expect that config_source
will be less frequently\nmodified, but that when solid executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute pipelines\nwith variations on these settings.
If you\u2019d like to configure a Celery Kubernetes Job executor in addition to the\ndefault_executors
, you should add it to the executor_defs
defined on a\nModeDefinition
as follows:
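For example, mirroring the pattern shown for the other Celery executors above (a sketch, assuming celery_k8s_job_executor is importable from the dagster_celery_k8s package named here):

from dagster import ModeDefinition, default_executors, pipeline
from dagster_celery_k8s import celery_k8s_job_executor

@pipeline(mode_defs=[
    ModeDefinition(executor_defs=default_executors + [celery_k8s_job_executor])
])
def celery_enabled_pipeline():
    pass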
Then you can configure the executor as follows:
\nexecution:\n celery-k8s:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n
Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.
\nIn deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_k8s.app argument.
\nSee also the Dask deployment guide.
\ndagster_dask.
dask_executor
ExecutorDefinition[source]\u00b6Dask-based executor.
\nThe \u2018cluster\u2019 can be one of the following:\n(\u2018existing\u2019, \u2018local\u2019, \u2018yarn\u2019, \u2018ssh\u2019, \u2018pbs\u2019, \u2018moab\u2019, \u2018sge\u2019, \u2018lsf\u2019, \u2018slurm\u2019, \u2018oar\u2019, \u2018kube\u2019).
\nIf the Dask executor is used without providing executor-specific config, a local Dask cluster\nwill be created (as when calling dask.distributed.Client()
\nwith dask.distributed.LocalCluster()
).
The Dask executor optionally takes the following config:
\ncluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n
If you\u2019d like to configure a dask executor in addition to the\ndefault_executors
, you should add it to the executor_defs
defined on a\nModeDefinition
as follows:
from dagster import ModeDefinition, default_executors, pipeline\nfrom dagster_dask import dask_executor\n\n@pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [dask_executor])])\ndef dask_enabled_pipeline():\n pass\n
The dagster_databricks
package provides two main pieces of functionality:
A resource, databricks_pyspark_step_launcher
, which will execute a solid within a Databricks\ncontext on a cluster, such that the pyspark
resource uses the cluster\u2019s Spark instance.
A function, create_databricks_job_solid
, which creates a solid that submits an external\nconfigurable job to Databricks using the \u2018Run Now\u2019 API.
Note that, for the databricks_pyspark_step_launcher
, either S3 or Azure Data Lake Storage config\nmust be specified for solids to succeed, and the credentials for this storage must be\nstored as a Databricks Secret and referenced in the resource config so that the Databricks cluster can\naccess storage.
dagster_databricks.
create_databricks_job_solid
(name='databricks_job', num_inputs=1, description=None, required_resource_keys=frozenset({'databricks_client'}))[source]\u00b6Creates a solid that launches a databricks job.
\nAs config, the solid accepts a blob of the form described in Databricks\u2019 job API:\nhttps://docs.databricks.com/dev-tools/api/latest/jobs.html.
\nA solid definition.
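A rough usage sketch (the databricks_client resource binding and the num_inputs=0 setting are assumptions about this factory's typical wiring; the job settings blob is supplied via solid config at runtime):

from dagster import ModeDefinition, pipeline
from dagster_databricks import create_databricks_job_solid, databricks_client

# The constructed solid reads its Databricks job settings blob from solid config.
run_sparkpi = create_databricks_job_solid(name="run_sparkpi", num_inputs=0)

@pipeline(
    mode_defs=[ModeDefinition(resource_defs={"databricks_client": databricks_client})]
)
def databricks_pipeline():
    run_sparkpi()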
\ndagster_databricks.
databricks_pyspark_step_launcher
ResourceDefinition[source]\u00b6Resource for running solids as a Databricks Job.
\nWhen this resource is used, the solid will be executed in Databricks using the \u2018Run Submit\u2019\nAPI. Pipeline code will be zipped up and copied to a directory in DBFS along with the solid\u2019s\nexecution context.
\nUse the \u2018run_config\u2019 configuration to specify the details of the Databricks cluster used, and\nthe \u2018storage\u2019 key to configure persistent storage on that cluster. Storage is accessed by\nsetting the credentials in the Spark context, as documented here for S3 and here for ADLS.
\nThis library provides an integration with Datadog, to support publishing metrics to Datadog from\nwithin Dagster solids.
\nWe use the Python datadogpy library. To use it, you\u2019ll\nfirst need to create a DataDog account and get both API and Application keys.
\nThe integration uses DogStatsD, so you\u2019ll need\nto ensure the datadog agent is running on the host you\u2019re sending metrics from.
\ndagster_datadog.
datadog_resource
ResourceDefinition[source]\u00b6This resource is a thin wrapper over the\ndogstatsd library.
\nAs such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.
\nExamples
\n@solid(required_resource_keys={'datadog'})\ndef datadog_solid(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={'datadog': datadog_resource})])\ndef dd_pipeline():\n datadog_solid()\n\nresult = execute_pipeline(\n dd_pipeline,\n {'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}},\n)\n
This library provides a Dagster integration with dbt (data build tool), created by Fishtown Analytics.
\ndagster_dbt.
dbt_cli_compile
(*args, **kwargs)[source]\u00b6This solid executes dbt compile
via the dbt CLI.
dagster_dbt.
dbt_cli_run
(*args, **kwargs)[source]\u00b6This solid executes dbt run
via the dbt CLI.
dagster_dbt.
dbt_cli_run_operation
(*args, **kwargs)[source]\u00b6This solid executes dbt run-operation
via the dbt CLI.
dagster_dbt.
dbt_cli_snapshot
(*args, **kwargs)[source]\u00b6This solid executes dbt snapshot
via the dbt CLI.
dagster_dbt.
dbt_cli_snapshot_freshness
(*args, **kwargs)[source]\u00b6This solid executes dbt source snapshot-freshness
via the dbt CLI.
dagster_dbt.
dbt_cli_test
(*args, **kwargs)[source]\u00b6This solid executes dbt test
via the dbt CLI.
dagster_dbt.
DbtCliOutput
[source]\u00b6The results of executing a dbt command, along with additional metadata about the dbt CLI\nprocess that was run.
\nNote that users should not construct instances of this class directly. This class is intended\nto be constructed from the JSON output of dbt commands.
\nIf the executed dbt command is either run
or test
, then the .num_*
attributes will\ncontain non-None
integer values. Otherwise, they will be None
.
num_warn
\u00b6The number of dbt nodes (models) that emitted warnings.
\nOptional[int]
\nnum_error
\u00b6The number of dbt nodes (models) that emitted errors.
\nOptional[int]
\nnum_skip
\u00b6The number of dbt nodes (models) that were skipped.
\nOptional[int]
\nnum_total
\u00b6The total number of dbt nodes (models) that were processed.
\nOptional[int]
\nfrom_dict
(d: Dict[str, Any]) → dagster_dbt.cli.types.DbtCliOutput[source]\u00b6Constructs an instance of DbtCliOutput
from a\ndictionary.
d (Dict[str, Any]) \u2013 A dictionary with key-values to construct a DbtCliOutput
.
An instance of DbtCliOutput
.
dagster_dbt.
create_dbt_rpc_run_sql_solid
(name: str, output_def: Optional[dagster.core.definitions.output.OutputDefinition] = None, **kwargs) → Callable[source]\u00b6This function is a factory which constructs a solid that will copy the results of a SQL query\nrun within the context of a dbt project to a pandas DataFrame
.
Any kwargs passed to this function will be passed along to the underlying @solid
decorator. However, note that overriding config_schema
, input_defs
, and\nrequired_resource_keys
is not allowed and will throw a DagsterInvalidDefinitionError
.
If you would like to configure this solid with different config fields, you could consider using\n@composite_solid
to wrap this solid.
name (str) \u2013 The name of this solid.
output_def (OutputDefinition, optional) \u2013 The OutputDefinition
for the solid. This value should always be a representation\nof a pandas DataFrame
. If not specified, the solid will default to an\nOutputDefinition
named \u201cdf\u201d with a DataFrame
\ndagster type.
Returns the constructed solid definition.
\ndagster_dbt.
dbt_rpc_compile_sql
(*args, **kwargs)[source]\u00b6This solid sends the dbt compile
command to a dbt RPC server and returns the request\ntoken.
This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.
\ndagster_dbt.
dbt_rpc_run
(*args, **kwargs)[source]\u00b6This solid sends the dbt run
command to a dbt RPC server and returns the request token.
This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.
\ndagster_dbt.
dbt_rpc_run_and_wait
(*args, **kwargs)[source]\u00b6This solid sends the dbt run
command to a dbt RPC server and returns the result of the\nexecuted dbt process.
This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.
\ndagster_dbt.
dbt_rpc_run_operation
(*args, **kwargs)[source]\u00b6This solid sends the dbt run-operation
command to a dbt RPC server and returns the\nrequest token.
This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.
\ndagster_dbt.
dbt_rpc_run_operation_and_wait
(*args, **kwargs)[source]\u00b6This solid sends the dbt run-operation
command to a dbt RPC server and returns the\nresult of the executed dbt process.
This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.
\ndagster_dbt.
dbt_rpc_snapshot
(*args, **kwargs)[source]\u00b6This solid sends the dbt snapshot
command to a dbt RPC server and returns the\nrequest token.
This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.
\ndagster_dbt.
dbt_rpc_snapshot_and_wait
(*args, **kwargs)[source]\u00b6This solid sends the dbt snapshot
command to a dbt RPC server and returns the result of\nthe executed dbt process.
This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.
\ndagster_dbt.
dbt_rpc_snapshot_freshness
(*args, **kwargs)[source]\u00b6This solid sends the dbt source snapshot-freshness
command to a dbt RPC server and\nreturns the request token.
This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.
\ndagster_dbt.
dbt_rpc_snapshot_freshness_and_wait
(*args, **kwargs)[source]\u00b6This solid sends the dbt source snapshot
command to a dbt RPC server and returns the\nresult of the executed dbt process.
This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.
\ndagster_dbt.
dbt_rpc_test
(*args, **kwargs)[source]\u00b6This solid sends the dbt test
command to a dbt RPC server and returns the request token.
This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.
\ndagster_dbt.
dbt_rpc_test_and_wait
(*args, **kwargs)[source]\u00b6This solid sends the dbt test
command to a dbt RPC server and returns the result of the\nexecuted dbt process.
This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.
\ndagster_dbt.
dbt_rpc_resource
ResourceDefinition[source]\u00b6This resource defines a dbt RPC client.
\nTo configure this resource, we recommend using the configured method.
\nExamples:
\ncustom_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "80.80.80.80","port": 8080,})\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt_rpc": custom_dbt_rpc_resource})])\ndef dbt_rpc_pipeline():\n # Run solids with `required_resource_keys={"dbt_rpc", ...}`.\n
dagster_dbt.
local_dbt_rpc_resource
ResourceDefinition\u00b6This resource defines a dbt RPC client for an RPC server running\non 0.0.0.0:8580.
\ndagster_dbt.
DbtRpcClient
(host: str = '0.0.0.0', port: int = 8580, jsonrpc_version: str = '2.0', logger: Optional[Any] = None, **_)[source]\u00b6A client for a dbt RPC server.
\nIf you need a dbt RPC server as a Dagster resource, we recommend that you use\ndbt_rpc_resource
.
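A minimal sketch of driving the client directly (assumes a dbt RPC server is already listening on the given host/port, and that responses carry the request token under result.request_token in the JSON body):

from dagster_dbt import DbtRpcClient

client = DbtRpcClient(host="127.0.0.1", port=8580)

# Kick off `dbt run` for a single model; the RPC server replies with a request token.
response = client.run(models=["my_model"])  # "my_model" is a placeholder model name
request_token = response.json()["result"]["request_token"]  # assumed response shape

# Poll the server for the status and logs of the running dbt process.
status = client.poll(request_token=request_token, logs=True, logs_start=0)
print(status.json())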
cli
(*, cli: str, **kwargs) → requests.models.Response[source]\u00b6Sends a request with CLI syntax to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for running CLI commands via RPC.
\ncli (str) \u2013 a dbt command in CLI syntax.
\nthe HTTP response from the dbt RPC server.
\nResponse
\ncompile
(*, models: List[str] = None, exclude: List[str] = None, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method compile
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for compiling projects via RPC.
compile_sql
(*, sql: str, name: str) → requests.models.Response[source]\u00b6Sends a request with the method compile_sql
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for compiling SQL via RPC.
generate_docs
(*, models: List[str] = None, exclude: List[str] = None, compile: bool = False, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method docs.generate
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the RPC method docs.generate.
the HTTP response from the dbt RPC server.
\nResponse
\njsonrpc_version
\u00b6The JSON-RPC version to send in RPC requests.
\nkill
(*, task_id: str) → requests.models.Response[source]\u00b6Sends a request with the method kill
to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method kill.
task_id (str) \u2013 the ID of the task to terminate.
\nthe HTTP response from the dbt RPC server.
\nResponse
\nlogger
\u00b6A property for injecting a logger dependency.
\nOptional[Any]
\npoll
(*, request_token: str, logs: bool = False, logs_start: int = 0) → requests.models.Response[source]\u00b6Sends a request with the method poll
to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method poll.
the HTTP response from the dbt RPC server.
\nResponse
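The asynchronous and synchronous dbt RPC solids above are built on this request/poll pattern. As an illustration only, here is a minimal hand-rolled sketch of the same flow, assuming a dbt RPC server reachable on 0.0.0.0:8580 and assuming the server's JSON-RPC response carries the request token under result.request_token:

from dagster_dbt import DbtRpcClient

client = DbtRpcClient(host="0.0.0.0", port=8580)

# Send a dbt command in CLI syntax; this returns immediately with a request token.
response = client.cli(cli="run --models my_model")
request_token = response.json()["result"]["request_token"]  # assumed response shape

# Poll the server until the dbt process reaches a terminal state.
poll_response = client.poll(request_token=request_token, logs=True)
print(poll_response.json())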
\nps
(*, completed: bool = False) → requests.models.Response[source]\u00b6Sends a request with the method ps
to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method ps.
completed (bool) \u2013 If True
, then also return completed tasks. Defaults to False
.
the HTTP response from the dbt RPC server.
\nResponse
\nrun
(*, models: List[str] = None, exclude: List[str] = None, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method run
to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method run.
run_operation
(*, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method run-operation
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the command run-operation.
run_sql
(*, sql: str, name: str) → requests.models.Response[source]\u00b6Sends a request with the method run_sql
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for running SQL via RPC.
seed
(*, show: bool = False, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method seed
to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method seed.
show (bool, optional) \u2013 If True
, then show a sample of the seeded data in the\nresponse. Defaults to False
.
the HTTP response from the dbt RPC server.
\nResponse
\nsnapshot
(*, select: List[str] = None, exclude: List[str] = None, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method snapshot
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the command snapshot.
snapshot_freshness
(*, select: Optional[List[str]] = None, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method snapshot-freshness
to the dbt RPC server, and returns\nthe response. For more details, see the dbt docs for the command source snapshot-freshness.
select (List[str], optional) \u2013 the models to include in calculating snapshot freshness.
\nthe HTTP response from the dbt RPC server.
\nResponse
\nstatus
()[source]\u00b6Sends a request with the method status
to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the RPC method status.
the HTTP response from the dbt RPC server.
\nResponse
\ntest
(*, models: List[str] = None, exclude: List[str] = None, data: bool = True, schema: bool = True, **kwargs) → requests.models.Response[source]\u00b6Sends a request with the method test
to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method test.
the HTTP response from the dbt RPC server.
\nResponse
\ndagster_dbt.
DbtRpcOutput
[source]\u00b6The output from executing a dbt command via the dbt RPC server.
\nNote that users should not construct instances of this class directly. This class is intended to be\nconstructed from the JSON output of dbt commands.
\n\n\n\n\n\n\n\n\nelapsed
\u00b6The duration (in seconds) for which the dbt process was running.
\nfrom_dict
(d: Dict[str, Any]) → dagster_dbt.rpc.types.DbtRpcOutput[source]\u00b6Constructs an instance of DbtRpcOutput
from a\ndictionary.
d (Dict[str, Any]) \u2013 A dictionary with key-values to construct a DbtRpcOutput
.
An instance of DbtRpcOutput
.
dagster_dbt.
DbtResult
[source]\u00b6The results of executing a dbt command.
\nNote that users should not construct instances of this class directly. This class is intended to be\nconstructed from the JSON output of dbt commands.
\n\n\nresults
\u00b6Details about each executed dbt node (model) in the run.
\nList[NodeResult]]
\ngenerated_at
\u00b6An ISO string timestamp of when the run result was generated by dbt.
\ndagster_dbt.
NodeResult
[source]\u00b6The result of executing a dbt node (model).
\nNote that users should not construct instances of this class directly. This class is intended to be\nconstructed from the JSON output of dbt commands.
\n\n\n\n\nfail
\u00b6The fail
field from the results of the executed dbt node.
Optional[Any]
\nwarn
\u00b6The warn
field from the results of the executed dbt node.
Optional[Any]
\nskip
\u00b6The skip
field from the results of the executed dbt node.
Optional[Any]
\nstatus
\u00b6The status of the executed dbt node (model).
\n\nexecution_time
\u00b6The execution duration (in seconds) of the dbt node (model).
\nthread_id
\u00b6The dbt thread identifier that executed the dbt node (model).
\nstep_timings
\u00b6The timings for each step in the executed dbt node\n(model).
\nList[StepTiming]
\ntable
\u00b6Details about the table/view that is created from executing a\nrun_sql\ncommand on a dbt RPC server.
\nOptional[Dict]
\nfrom_dict
(d: Dict[str, Any]) → dagster_dbt.types.NodeResult[source]\u00b6Constructs an instance of NodeResult
from a dictionary.
d (Dict[str, Any]) \u2013 A dictionary with key-values to construct a NodeResult
.
An instance of NodeResult
.
dagster_dbt.
StepTiming
[source]\u00b6The timing information of an executed step for a dbt node (model).
\nNote that users should not construct instances of this class directly. This class is intended to be\nconstructed from the JSON output of dbt commands.
\n\n\nstarted_at
\u00b6An ISO string timestamp of when the step started executing.
\ncompleted_at
\u00b6An ISO string timestamp of when the step completed\nexecution.
\nduration
\u00b6The execution duration of the step.
\ndagster_dbt.
DagsterDbtError
(description=None, metadata_entries=None)[source]\u00b6The base exception of the dagster-dbt
library.
dagster_dbt.
DagsterDbtCliRuntimeError
(description: str, logs: List[Dict[str, Any]], raw_output: str)[source]\u00b6Represents an error while executing a dbt CLI command.
\ndagster_dbt.
DagsterDbtCliFatalRuntimeError
(logs: List[Dict[str, Any]], raw_output: str)[source]\u00b6Represents a fatal error in the dbt CLI (return code 2).
\ndagster_dbt.
DagsterDbtCliHandledRuntimeError
(logs: List[Dict[str, Any]], raw_output: str)[source]\u00b6Represents a model error reported by the dbt CLI at runtime (return code 1).
\ndagster_dbt.
DagsterDbtCliOutputsNotFoundError
(path: str)[source]\u00b6Represents a problem in finding the target/run_results.json
artifact when executing a dbt\nCLI command.
For more details on target/run_results.json
, see\nhttps://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.
dagster_gcp.
bq_create_dataset
(*args, **kwargs)[source]\u00b6BigQuery Create Dataset.
\nThis solid encapsulates creating a BigQuery dataset.
\nExpects a BQ client to be provisioned in resources as context.resources.bigquery.
\ndagster_gcp.
bq_delete_dataset
(*args, **kwargs)[source]\u00b6BigQuery Delete Dataset.
\nThis solid encapsulates deleting a BigQuery dataset.
\nExpects a BQ client to be provisioned in resources as context.resources.bigquery.
\ndagster_gcp.
GCSFileHandle
(gcs_bucket: str, gcs_key: str)[source]\u00b6A reference to a file on GCS.
\n\n\n\n\n\n\n\n\ndagster_gcp.
gcs_file_manager
ResourceDefinition[source]\u00b6FileManager that provides abstract access to GCS.
\nImplements the FileManager
API.
dagster_gcp.gcs.
gcs_pickle_io_manager
IOManagerDefinition[source]\u00b6Persistent IO manager using GCS for storage.
\nSerializes objects via pickling. Suitable for objects storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.
\nAttach this resource definition to a ModeDefinition
\nin order to make it available to your pipeline:
pipeline_def = PipelineDefinition(\n mode_defs=[\n ModeDefinition(\n resource_defs={'io_manager': gcs_pickle_io_manager, 'gcs': gcs_resource, ...},\n ), ...\n ], ...\n)\n
You may configure this storage as follows:
\nresources:\n io_manager:\n config:\n gcs_bucket: my-cool-bucket\n gcs_prefix: good/prefix-for-files-\n
dagster_ge.
ge_validation_solid_factory
(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster.core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6\n\nGenerates solids for interacting with GE.
\n
name (str) \u2013 the name of the solid
datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml
suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml
validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to None,\nwhich generates an ephemeral validator.\nIf you want to save data docs, use \u2018action_list_operator\u2019.\nSee https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html
input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the\nsolid. Defaults to dagster_pandas.DataFrame.
batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset},\nwhere dataset is the input to the generated solid.
A solid that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)
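For example, a minimal sketch of constructing such a solid (the datasource and suite names below are hypothetical placeholders for values from your own great_expectations.yml):

from dagster_ge import ge_validation_solid_factory

validate_trips = ge_validation_solid_factory(
    name="validate_trips",
    datasource_name="my_datasource",
    suite_name="my_expectation_suite",
)

The generated solid reads its batch through the ge_data_context mentioned above, so that data context must be provided as a resource on the pipeline's mode.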
\nThis library provides an integration with GitHub Apps, to support performing various automation\noperations within your GitHub repositories, using the tighter permission scopes that GitHub Apps\nallow compared to a personal access token.
\nPresently, it provides a thin wrapper on the github v4 graphql API.
\nTo use this integration, you\u2019ll first need to create a GitHub App for it.
\nCreate App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/, You will end up with a private key and App ID, which will be used when configuring the\ndagster-github
resource. Note you will need to grant your app the relevent permissions\nfor the API requests you want to make, for example to post issues it will need read/write access\nfor the issues repository permission, more info on GitHub application permissions can be found\nhere
Install App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/#step-7-install-the-app-on-your-account
Find your installation_id: You can pull this from the GitHub app administration page,\nhttps://github.com/apps/<app-name>/installations/<installation_id>
. Note that if your app is\ninstalled more than once, you can also programmatically retrieve these IDs.
Sharing your App ID and Installation ID is fine, but make sure that the Private Key for your app is\nstored securely.
\nNow, you can create issues in GitHub from Dagster with the GitHub resource:
\nimport os\n\nfrom dagster import solid, execute_pipeline, ModeDefinition\nfrom dagster_github import github_resource\n\n\n@solid(resource_defs={'github'})\ndef github_solid(context):\n context.resources.github.create_issue(\n repo_name='dagster',\n repo_owner='dagster-io',\n title='Dagster\\'s first github issue',\n body='this open source thing seems like a pretty good idea',\n )\n\n@pipeline(\n mode_defs=[ModeDefinition(resource_defs={'github': github_resource})],\n)\ndef github_pipeline():\n github_solid()\n\nexecute_pipeline(\n github_pipeline, {'resources': {'github': {'config': {\n "github_app_id": os.getenv('GITHUB_APP_ID'),\n "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n }}}}\n)\n
Run the above code, and you\u2019ll see the issue appear in GitHub:\n
\nBy provisioning github_resource
as a Dagster pipeline resource, you can post to GitHub from\nwithin any solid execution.
import os

from dagster import ModeDefinition, execute_pipeline, pipeline, solid
from dagster_github import github_resource


@solid(required_resource_keys={'github'})
def github_solid(context):
    context.resources.github.execute(
        query="""
        query get_repo_id($repo_name: String!, $repo_owner: String!) {
          repository(name: $repo_name, owner: $repo_owner) {
            id
          }
        }
        """,
        variables={"repo_name": "dagster", "repo_owner": "dagster-io"},
    )

@pipeline(
    mode_defs=[ModeDefinition(resource_defs={'github': github_resource})],
)
def github_pipeline():
    github_solid()

execute_pipeline(
    github_pipeline, {'resources': {'github': {'config': {
        "github_app_id": os.getenv('GITHUB_APP_ID'),
        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),
        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),
    }}}}
)
See also the Kubernetes deployment guide.
\nThis library contains utilities for running Dagster with Kubernetes. This includes a Python API\nallowing Dagit to launch runs as Kubernetes Jobs, as well as a Helm chart you can use as the basis\nfor a Dagster deployment on a Kubernetes cluster.
\ndagster_k8s.
K8sRunLauncher
(service_account_name, instance_config_map, postgres_password_secret, dagster_home, job_image=None, image_pull_policy='Always', image_pull_secrets=None, load_incluster_config=True, kubeconfig_file=None, inst_data=None, job_namespace='default', env_config_maps=None, env_secrets=None, k8s_client_batch_api=None, k8s_client_core_api=None)[source]\u00b6RunLauncher that starts a Kubernetes Job for each pipeline run.
\nEncapsulates each pipeline run in a separate, isolated invocation of dagster-graphql
.
You may configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml
like the following:
run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: pipeline_run_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n
As always when using a ConfigurableClass
, the values\nunder the config
key of this YAML block will be passed to the constructor. The full list\nof acceptable values is given below by the constructor args.
service_account_name (str) \u2013 The name of the Kubernetes service account under which to run\nthe Job.
job_image (Optional[str]) \u2013 The name
of the image to use for the Job\u2019s Dagster container.\nThis image will be run with the command\ndagster api execute_run
.\nWhen using user code deployments, the image should not be specified.
instance_config_map (str) \u2013 The name
of an existing Volume to mount into the pod in\norder to provide a ConfigMap for the Dagster instance. This Volume should contain a\ndagster.yaml
with appropriate values for run storage, event log storage, etc.
postgres_password_secret (str) \u2013 The name of the Kubernetes Secret where the postgres\npassword can be retrieved. Will be mounted and supplied as an environment variable to\nthe Job Pod.
dagster_home (str) \u2013 The location of DAGSTER_HOME in the Job container; this is where the\ndagster.yaml
file will be mounted from the instance ConfigMap specified above.
load_incluster_config (Optional[bool]) \u2013 Set this value if you are running the launcher\nwithin a k8s cluster. If True
, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config
. Otherwise,\nwe will use the k8s config specified in kubeconfig_file
(using\nkubernetes.config.load_kube_config
) or fall back to the default kubeconfig. Default:\nTrue
.
kubeconfig_file (Optional[str]) \u2013 The kubeconfig file from which to load config. Defaults to\nNone (using the default kubeconfig).
image_pull_secrets (Optional[List[Dict[str, str]]]) \u2013 Optionally, a list of dicts, each of\nwhich corresponds to a Kubernetes LocalObjectReference
(e.g.,\n{'name': 'myRegistryName'}
). This allows you to specify the imagePullSecrets
on\na pod basis. Typically, these will be provided through the service account, when needed,\nand you will not need to pass this argument.\nSee:\nhttps://kubernetes.io/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod\nand https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#podspec-v1-core.
image_pull_policy (Optional[str]) \u2013 Allows the image pull policy to be overridden, e.g. to\nfacilitate local testing with kind. Default:\n"Always"
. See: https://kubernetes.io/docs/concepts/containers/images/#updating-images.
job_namespace (Optional[str]) \u2013 The namespace into which to launch new jobs. Note that any\nother Kubernetes resources the Job requires (such as the service account) must be\npresent in this namespace. Default: "default"
env_config_maps (Optional[List[str]]) \u2013 A list of custom ConfigMapEnvSource names from which to\ndraw environment variables (using envFrom
) for the Job. Default: []
. See:\nhttps://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container
env_secrets (Optional[List[str]]) \u2013 A list of custom Secret names from which to\ndraw environment variables (using envFrom
) for the Job. Default: []
. See:\nhttps://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables
dagster_k8s.
K8sScheduler
(dagster_home, service_account_name, instance_config_map, postgres_password_secret, job_image, load_incluster_config=True, scheduler_namespace='default', image_pull_policy='Always', image_pull_secrets=None, kubeconfig_file=None, inst_data=None, env_config_maps=None, env_secrets=None)[source]\u00b6Scheduler implementation on top of Kubernetes CronJob.
\nEnable this scheduler by adding it to your dagster.yaml, or by configuring the scheduler\nsection of the Helm chart\nhttps://github.com/dagster-io/dagster/tree/master/helm
\nThe K8sRunLauncher
allows Dagit instances to be configured to launch new runs by starting per-run\nKubernetes Jobs. To configure the K8sRunLauncher
, your dagster.yaml
should include a section\nlike:
run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n image_pull_secrets:\n service_account_name: dagster\n job_image: "my-company.com/image:latest"\n dagster_home: "/opt/dagster/dagster_home"\n postgres_password_secret: "dagster-postgresql-secret"\n image_pull_policy: "IfNotPresent"\n job_namespace: "dagster"\n instance_config_map: "dagster-instance"\n env_config_maps:\n - "dagster-k8s-job-runner-env"\n env_secrets:\n - "dagster-k8s-some-secret"\n
For local dev (e.g., on kind or minikube):
\nhelm install \\\n --set dagit.image.repository="dagster.io/buildkite-test-image" \\\n --set dagit.image.tag="py37-latest" \\\n --set job_runner.image.repository="dagster.io/buildkite-test-image" \\\n --set job_runner.image.tag="py37-latest" \\\n --set imagePullPolicy="IfNotPresent" \\\n dagster \\\n helm/dagster/\n
Upon installation, the Helm chart will provide instructions for port forwarding Dagit and Flower (if\nconfigured).
\nTo run the unit tests:
\npytest -m "not integration"\n
To run the integration tests, you must have Docker,\nkind,\nand helm installed.
\nOn macOS:
\nbrew install kind\nbrew install helm\n
Docker must be running.
\nYou may experience slow first test runs thanks to image pulls (run pytest -svv --fulltrace
for\nvisibility). Building images and loading them to the kind cluster is slow, and there is\nno visibility into the progress of the load.
NOTE: This process is quite slow, as it requires bootstrapping a local kind
cluster with Docker images and the dagster-k8s
Helm chart. For faster development, you can either:
Keep a warm kind cluster
Use a remote K8s cluster, e.g. via AWS EKS or GCP GKE
Instructions are below.
\nYou may find that the kind cluster creation, image loading, and kind cluster creation loop\nis too slow for effective local dev.
\nYou may bypass cluster creation and image loading in the following way. First add the --no-cleanup
\nflag to your pytest invocation:
pytest --no-cleanup -s -vvv -m "not integration"\n
The tests will run as before, but the kind cluster will be left running after the tests are completed.
\nFor subsequent test runs, you can run:
\npytest --kind-cluster="cluster-d9971c84d44d47f382a2928c8c161faa" --existing-helm-namespace="dagster-test-95590a" -s -vvv -m "not integration"\n
This will bypass cluster creation, image loading, and Helm chart installation, for much faster tests.
\nThe kind cluster name and Helm namespace for this command can be found in the logs, or retrieved via the respective CLIs, using kind get clusters
and kubectl get namespaces
. Note that for kubectl
and helm
to work correctly with a kind cluster, you should override your kubeconfig file location with:
kind get kubeconfig --name kind-test > /tmp/kubeconfig\nexport KUBECONFIG=/tmp/kubeconfig\n
The test fixtures provided by dagster-k8s
automate the process described below, but sometimes it's useful to manually configure a kind cluster and load images onto it.
First, ensure you have a Docker image appropriate for your Python version. Run, from the root of\nthe repo:
\n./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6 \\\n dagster.io.priv/buildkite-test-image:py37-latest\n
In the above invocation, the Python major/minor version should be appropriate for your desired tests.
\nThen run the following commands to create the cluster and load the image. Note that there is no\nfeedback from the loading process.
\nkind create cluster --name kind-test\nkind load docker-image --name kind-test dagster.io/dagster-docker-buildkite:py37-latest\n
If you are deploying the Helm chart with an in-cluster Postgres (rather than an external database),\nand/or with dagster-celery workers (and a RabbitMQ), you\u2019ll also want to have images present for\nrabbitmq and postgresql:
\ndocker pull docker.io/bitnami/rabbitmq\ndocker pull docker.io/bitnami/postgresql\n\nkind load docker-image --name kind-test docker.io/bitnami/rabbitmq:latest\nkind load docker-image --name kind-test docker.io/bitnami/postgresql:latest\n
Then you can run pytest as follows:
\npytest --kind-cluster=kind-test\n
If you already have a development K8s cluster available, you can run tests against that cluster instead of running locally in kind
.
For this to work, first build and deploy the test image to a registry available to your cluster. For example, with ECR:
\n./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6\ndocker tag dagster-docker-buildkite:latest $AWS_ACCOUNT_ID.dkr.ecr.us-west-1.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n\naws ecr get-login --no-include-email --region us-west-1 | sh\ndocker push $AWS_ACCOUNT_ID.dkr.ecr.us-west-1.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n
Then, you can run tests on EKS with:
\nexport DAGSTER_DOCKER_IMAGE_TAG="2020-04-21T21-04-06"\nexport DAGSTER_DOCKER_REPOSITORY="$AWS_ACCOUNT_ID.dkr.ecr.us-west-1.amazonaws.com"\nexport DAGSTER_DOCKER_IMAGE="dagster-k8s-tests"\n\n# First run with --no-cleanup to leave Helm chart in place\npytest --cluster-provider="kubeconfig" --no-cleanup -s -vvv\n\n# Subsequent runs against existing Helm chart\npytest --cluster-provider="kubeconfig" --existing-helm-namespace="dagster-test-<some id>" -s -vvv\n
To test / validate Helm charts, you can run:
\nhelm install dagster --dry-run --debug helm/dagster\nhelm lint\n
To enable GCR access from Minikube:
\nkubectl create secret docker-registry element-dev-key \\\n --docker-server=https://gcr.io \\\n --docker-username=oauth2accesstoken \\\n --docker-password="$(gcloud auth print-access-token)" \\\n --docker-email=my@email.com\n
Both the Postgres and the RabbitMQ Helm charts will store credentials using Persistent Volume\nClaims, which will outlive test invocations and calls to helm uninstall
. These must be deleted if\nyou want to change credentials. To view your pvcs, run:
kubectl get pvc\n
The Redis Helm chart installs with a randomly-generated password by default; turn this off:
\nhelm install dagredis stable/redis --set usePassword=false\n
Then, to connect to your database from outside the cluster execute the following commands:
\nkubectl port-forward --namespace default svc/dagredis-master 6379:6379\nredis-cli -h 127.0.0.1 -p 6379\n
This library provides an integration with PagerDuty, to support creating alerts from your Dagster\ncode.
\nPresently, it provides a thin wrapper on the Events V2 API.
\nYou can install this library with:
\npip install dagster_pagerduty\n
To use this integration, you\u2019ll first need to create a PagerDuty integration. There are instructions\nhere for\ncreating a new PagerDuty service & integration.
\nAs noted in the PagerDuty documentation, you\u2019ll find an integration key (also referred to as a\n\u201crouting key\u201d) on the Integrations tab for your new service. This key is used to authorize events\ncreated from the PagerDuty events API.
\nOnce your service/integration is created, you can provision a PagerDuty resource and issue PagerDuty\nalerts from within your solids.
\ndagster_pagerduty.
pagerduty_resource
ResourceDefinition[source]\u00b6A resource for posting events (alerts) to PagerDuty.
\nExample:
\n@solid(required_resource_keys={'pagerduty'})\ndef pagerduty_solid(context):\n context.resources.pagerduty.EventV2_create(\n summary='alert from dagster'\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n@pipeline(\n mode_defs=[ModeDefinition(resource_defs={'pagerduty': pagerduty_resource})],\n)\ndef pd_pipeline():\n pagerduty_solid()\n\nexecute_pipeline(\n pd_pipeline,\n {\n 'resources': {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n },\n)\n
The dagster_pandas library provides utilities for using pandas with Dagster and for implementing\nvalidation on pandas DataFrames. A good place to start with dagster_pandas is the validation\nguide.
\ndagster_pandas.
create_dagster_pandas_dataframe_type
(name, description=None, columns=None, event_metadata_fn=None, dataframe_constraints=None, loader=None, materializer=None)[source]\u00b6Constructs a custom pandas dataframe dagster type.
\nname (str) \u2013 Name of the dagster pandas type.
description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.
columns (Optional[List[PandasColumn]]) \u2013 A list of PandasColumn
objects\nwhich express dataframe column schemas and constraints.
event_metadata_fn (Optional[func]) \u2013 A callable which takes your dataframe and returns a list of EventMetadata\nwhich allow you to express things like summary statistics during runtime.
dataframe_constraints (Optional[List[DataFrameConstraint]]) \u2013 A list of objects that inherit from\nDataFrameConstraint
. This allows you to express dataframe-level constraints.
loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader
. If None, we will default\nto using dataframe_loader.
materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer
. If None, we will\ndefault to using dataframe_materializer.
dagster_pandas.
RowCountConstraint
(num_allowed_rows, error_tolerance=0)[source]\u00b6A dataframe constraint that validates the expected count of rows.
\n\ndagster_pandas.
StrictColumnsConstraint
(strict_column_list, enforce_ordering=False)[source]\u00b6A dataframe constraint that validates column existence and ordering.
\n\ndagster_pandas.
PandasColumn
(name, constraints=None, is_required=None)[source]\u00b6The main API for expressing column level schemas and constraints for your custom dataframe\ntypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
constraints (Optional[List[Constraint]]) \u2013 List of constraint objects that indicate the\nvalidation rules for the pandas column.
boolean_column
(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
categorical_column
(name, categories, of_types='object', non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
categories (List[Any]) \u2013 The valid set of buckets that all values in the column must match.
of_types (Optional[Union[str, Set[str]]]) \u2013 The expected dtype[s] that your categories and values must\nabide by.
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in\nthe column ought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the\nconstraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
datetime_column
(name, min_datetime=Timestamp('1677-09-21 00:12:43.145225'), max_datetime=Timestamp('2262-04-11 23:47:16.854775807'), non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses datetime constraints on \u2018datetime64[ns]\u2019 dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
min_datetime (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column.\nDefaults to pandas.Timestamp.min.
max_datetime (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column.\nDefaults to pandas.Timestamp.max.
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
exists
(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses existence constraints.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
float_column
(name, min_value=-inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)
max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
integer_column
(name, min_value=-inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)
max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
numeric_column
(name, min_value=-inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses numeric constraints on numeric dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)
max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
string_column
(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6Simple constructor for PandasColumns that expresses constraints on string dtypes.
\nname (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.
non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.
unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.
ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.
dagster_pandas.
DataFrame
= <dagster.core.types.dagster_type.DagsterType object>\u00b6Define a type in dagster. These can be used in the inputs and outputs of solids.
\ntype_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the solid. If it passes, return either\nTrue
or a TypeCheck
with success
set to True
. If it fails,\nreturn either False
or a TypeCheck
with success
set to False
.\nThe first argument must be named context
(or, if unused, _
, _context
, or context_
).\nUse required_resource_keys
for access to resources.
key (Optional[str]) \u2013
The unique key to identify types programmatically.\nThe key property always has a value. If you do not pass the key argument\nto the init function, it instead receives the value of name
. If\nneither key
nor name
is provided, a CheckError
is thrown.
In the case of a generic type such as List
or Optional
, this is\ngenerated programmatically based on the type parameters.
For most use cases, name should be set and the key argument should\nnot be specified.
\nname (Optional[str]) \u2013 A unique name given by a user. If key
is None
, key
\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.
description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.
loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader
and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader
decorator to construct\nthese arguments.
materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer
and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer
\ndecorator to construct these arguments.
serialization_strategy (Optional[SerializationStrategy]) \u2013 An instance of a class that\ninherits from SerializationStrategy
. The default strategy for serializing\nthis value when automatically persisting it between execution steps. You should set\nthis value if the ordinary serialization machinery (e.g., pickle) will not be adequate\nfor this type.
auto_plugins (Optional[List[Type[TypeStoragePlugin]]]) \u2013 If types must be serialized differently\ndepending on the storage being used for intermediates, they should specify this\nargument. In these cases the serialization_strategy argument is not sufficient because\nserialization requires specialized API calls, e.g. to call an S3 API directly instead\nof using a generic file object. See dagster_pyspark.DataFrame
for an example.
required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn
.
is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String
, Int
) to visually distinguish\nthem from user-defined types. Meant for internal use.
kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.
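To make the constructor arguments above concrete, here is a minimal sketch of a user-defined type with a custom type_check_fn (the type and its check are illustrative only, not part of the library):

from dagster import DagsterType

def _is_non_empty_list(_context, value):
    # Pass the type check only for non-empty Python lists.
    return isinstance(value, list) and len(value) > 0

NonEmptyList = DagsterType(
    name="NonEmptyList",
    description="A Python list with at least one element.",
    type_check_fn=_is_non_empty_list,
)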
This library provides an integration with Papertrail for logging.
\nYou can easily set up your Dagster pipeline to log to Papertrail. You\u2019ll need an active Papertrail\naccount, and have your papertrail URL and port handy.
\ndagster_papertrail.
papertrail_logger
LoggerDefinition\u00b6Core class for defining loggers.
\nLoggers are pipeline-scoped logging handlers, which will be automatically invoked whenever\nsolids in a pipeline log messages.
\nlogger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log
are called from within solid compute logic.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config.
description (Optional[str]) \u2013 A human-readable description of this logger.
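A minimal sketch of attaching this logger to a pipeline follows; the config keys shown (log_level, name, papertrail_address, papertrail_port) are assumptions about the logger's config schema and should be checked against your installed version:

from dagster import ModeDefinition, execute_pipeline, pipeline, solid
from dagster_papertrail import papertrail_logger

@solid
def hello_logs(context):
    context.log.info("Hello, Papertrail!")

@pipeline(mode_defs=[ModeDefinition(logger_defs={"papertrail": papertrail_logger})])
def hello_pipeline():
    hello_logs()

execute_pipeline(
    hello_pipeline,
    {
        "loggers": {
            "papertrail": {
                "config": {  # assumed config keys
                    "log_level": "INFO",
                    "name": "hello_pipeline",
                    "papertrail_address": "logs.papertrailapp.com",
                    "papertrail_port": 12345,
                }
            }
        }
    },
)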
dagster_postgres.
PostgresEventLogStorage
(postgres_url, inst_data=None)[source]\u00b6Postgres-backed event log storage.
\nUsers should not directly instantiate this class; it is instantiated by internal machinery when\ndagit
and dagster-graphql
load, based on the values in the dagster.yaml
file in\n$DAGSTER_HOME
. Configuration of this class should be done by setting values in that file.
To use Postgres for event log storage, you can add a block such as the following to your\ndagster.yaml
:
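A sketch of such a block (the postgres_db keys shown follow the standard Dagster Postgres connection config and are assumptions to adapt to your environment):

event_log_storage:
  module: dagster_postgres
  class: PostgresEventLogStorage
  config:
    postgres_db:
      username: { env: DAGSTER_PG_USERNAME }
      password: { env: DAGSTER_PG_PASSWORD }
      hostname: { env: DAGSTER_PG_HOST }
      db_name: { env: DAGSTER_PG_DB }
      port: 5432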
Note that the fields in this config are StringSource
and\nIntSource
and can be configured from environment variables.
dagster_postgres.
PostgresRunStorage
(postgres_url, inst_data=None)[source]\u00b6Postgres-backed run storage.
\nUsers should not directly instantiate this class; it is instantiated by internal machinery when\ndagit
and dagster-graphql
load, based on the values in the dagster.yaml
file in\n$DAGSTER_HOME
. Configuration of this class should be done by setting values in that file.
To use Postgres for run storage, you can add a block such as the following to your\ndagster.yaml
:
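A sketch of such a block, using the same assumed postgres_db keys as above:

run_storage:
  module: dagster_postgres
  class: PostgresRunStorage
  config:
    postgres_db:
      username: { env: DAGSTER_PG_USERNAME }
      password: { env: DAGSTER_PG_PASSWORD }
      hostname: { env: DAGSTER_PG_HOST }
      db_name: { env: DAGSTER_PG_DB }
      port: 5432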
Note that the fields in this config are StringSource
and\nIntSource
and can be configured from environment variables.
dagster_postgres.
PostgresScheduleStorage
(postgres_url, inst_data=None)[source]\u00b6Postgres-backed schedule storage.
\nUsers should not directly instantiate this class; it is instantiated by internal machinery when\ndagit
and dagster-graphql
load, based on the values in the dagster.yaml
file in\n$DAGSTER_HOME
. Configuration of this class should be done by setting values in that file.
To use Postgres for schedule storage, you can add a block such as the following to your\ndagster.yaml
:
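A sketch of such a block, again using the assumed postgres_db keys from above:

schedule_storage:
  module: dagster_postgres
  class: PostgresScheduleStorage
  config:
    postgres_db:
      username: { env: DAGSTER_PG_USERNAME }
      password: { env: DAGSTER_PG_PASSWORD }
      hostname: { env: DAGSTER_PG_HOST }
      db_name: { env: DAGSTER_PG_DB }
      port: 5432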
Note that the fields in this config are StringSource
and\nIntSource
and can be configured from environment variables.
The Dagster shell library provides solid factories for executing inline shell scripts or script files.
\ndagster_shell.
create_shell_command_solid
(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6This function is a factory that constructs solids to execute a shell command.
\nNote that you can only use shell_command_solid if you know the command you\u2019d like to execute\nat pipeline construction time. If you\u2019d like to construct shell commands dynamically during\npipeline execution and pass them between solids, you should use shell_solid instead.
\nExamples:
\nshell_command (str) \u2013 The shell command that the constructed solid will execute.
name (str) \u2013 The name of the constructed solid.
description (Optional[str]) \u2013 Human-readable description of this solid.
required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this solid.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.
Failure \u2013 Raised when the shell command returns a non-zero exit code.
\nReturns the constructed solid definition.
\ndagster_shell.
create_shell_script_solid
(shell_script_path, name='create_shell_script_solid', input_defs=None, **kwargs)[source]\u00b6This function is a factory which constructs a solid that will execute a shell command read\nfrom a script file.
\nAny kwargs passed to this function will be passed along to the underlying @solid
decorator. However, note that overriding config
or output_defs
is not\nsupported.
You might consider using @composite_solid
to wrap this solid\nin the cases where you\u2019d like to configure the shell solid with different config fields.
Examples: a usage sketch follows the argument list below.
\nshell_script_path (str) \u2013 The script file to execute.
name (str, optional) \u2013 The name of this solid. Defaults to \u201ccreate_shell_script_solid\u201d.
input_defs (List[InputDefinition], optional) \u2013 input definitions for the solid. Defaults to\na single Nothing input.
Failure \u2013 Raised when the shell command returns a non-zero exit code.
\nReturns the constructed solid definition.
\ndagster_shell.
shell_solid
(*args, **kwargs)[source]\u00b6This solid executes a shell command it receives as input.
\nThis solid is suitable for uses where the command to execute is generated dynamically by\nupstream solids. If you know the command to execute at pipeline construction time, consider\nshell_command_solid instead.
\n\n
This library provides an integration with Slack, to support posting messages in your company\u2019s Slack workspace.
\nPresently, it provides a thin wrapper on the Slack client API chat.postMessage.
\nTo use this integration, you\u2019ll first need to create a Slack App for it.
\nCreate App: Go to https://api.slack.com/apps and click \u201cCreate New App\u201d:
\n\n
Install App: After creating an app, on the left-hand side of the app configuration, click \u201cBot Users\u201d, and then create a bot user. Then, click \u201cInstall App\u201d on the left hand side, and finally \u201cInstall App to Workspace\u201d.
Bot Token: Once finished, this will create a new bot token for your bot/workspace:
\n\n
Copy this bot token and put it somewhere safe; see Safely Storing Credentials for more on this topic.
\ndagster_slack.
slack_resource
ResourceDefinition[source]\u00b6This resource is for connecting to Slack.
\nBy configuring this Slack resource, you can post messages to Slack from any Dagster solid:
\nExamples:
\nimport os\n\nfrom dagster import solid, execute_pipeline, ModeDefinition\nfrom dagster_slack import slack_resource\n\n\n@solid(required_resource_keys={'slack'})\ndef slack_solid(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@pipeline(\n mode_defs=[ModeDefinition(resource_defs={'slack': slack_resource})],\n)\ndef slack_pipeline():\n slack_solid()\n\nexecute_pipeline(\n slack_pipeline, {'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n)\n
This library provides an integration with the Snowflake data\nwarehouse.
\nPresently, it provides a snowflake_resource
, which is a Dagster resource for configuring\nSnowflake connections and issuing queries.
To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.
\ndagster_snowflake.
snowflake_resource
ResourceDefinition[source]\u00b6A resource for connecting to the Snowflake data warehouse.
\nA simple example of loading data into Snowflake and subsequently querying that data is shown below:
\nExamples:
\nfrom dagster import execute_pipeline, pipeline, DependencyDefinition, ModeDefinition\nfrom dagster_snowflake import snowflake_resource\n\n@solid(required_resource_keys={'snowflake'})\ndef get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n@pipeline(\n mode_defs=[ModeDefinition(resource_defs={'snowflake': snowflake_resource})],\n)\ndef snowflake_pipeline():\n get_one()\n\nresult = execute_pipeline(\n snowflake_pipeline,\n {\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n },\n)\n
dagster_spark.
define_spark_config
()[source]\u00b6Spark configuration.
\nhttps://spark.apache.org/docs/latest/submitting-applications.html
\ndagster_spark.
create_spark_solid
(name, main_class, description=None, required_resource_keys=frozenset({'spark'}))[source]\u00b6
This library provides an integration with SSH and SFTP.
\ndagster_ssh.
SSHResource
(remote_host, remote_port, username=None, password=None, key_file=None, key_string=None, timeout=10, keepalive_interval=30, compress=True, no_host_key_check=True, allow_host_key_change=False, logger=None)[source]\u00b6Resource for ssh remote execution using Paramiko.\nref: https://github.com/paramiko/paramiko
\nThis library provides an integration with Twilio.
\n\n\ndagstermill.
define_dagstermill_solid
(name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook=None, asset_key_prefix=None)[source]\u00b6Wrap a Jupyter notebook in a solid.
\nname (str) \u2013 The name of the solid.
notebook_path (str) \u2013 Path to the backing notebook.
input_defs (Optional[List[InputDefinition]]) \u2013 The solid\u2019s inputs.
output_defs (Optional[List[OutputDefinition]]) \u2013 The solid\u2019s outputs. Your notebook should\ncall yield_result()
to yield each of these outputs.
required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.
output_notebook (Optional[str]) \u2013 If set, will be used as the name of an injected output of\ntype FileHandle
that will point to the executed notebook (in\naddition to the AssetMaterialization
that is always created). This\nrespects the FileManager
configured on\nthe pipeline resources via the \u201cfile_manager\u201d resource key, so, e.g.,\nif s3_file_manager
is configured, the output will be a dagster_aws.s3.S3FileHandle.
asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.
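A minimal sketch of wrapping a notebook (the notebook path is a hypothetical placeholder):

from dagster import pipeline
from dagstermill import define_dagstermill_solid

my_notebook_solid = define_dagstermill_solid(
    name="my_notebook_solid",
    notebook_path="notebooks/my_notebook.ipynb",
)

@pipeline
def notebook_pipeline():
    my_notebook_solid()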
dagstermill.
get_context
(solid_config=None, mode_def=None, run_config=None)\u00b6Get a dagstermill execution context for interactive exploration and development.
\nsolid_config (Optional[Any]) \u2013 If specified, this value will be made available on the\ncontext as its solid_config
property.
mode_def (Optional[dagster.ModeDefinition
]) \u2013 If specified, defines the mode to\nuse to construct the context. Specify this if you would like a context constructed\nwith specific resource_defs
or logger_defs
. By default, an ephemeral mode\nwith a console logger will be constructed.
run_config (Optional[dict]) \u2013 The environment config dict with which to construct\nthe context.
dagstermill.
yield_event
(dagster_event)\u00b6Yield a dagster event directly from notebook code.
\nWhen called interactively or in development, returns its input.
\ndagster_event (Union[dagster.Materialization
, dagster.ExpectationResult
, dagster.TypeCheck
, dagster.Failure
]) \u2013 An event to yield back to Dagster.
dagstermill.
yield_result
(value, output_name='result')\u00b6Yield a result directly from notebook code.
\nWhen called interactively or in development, returns its input.
\nvalue (Any) \u2013 The value to yield.
output_name (Optional[str]) \u2013 The name of the result to yield (default: 'result'
).
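For example, a cell in the wrapped notebook might end with the following (a sketch; it assumes the wrapping solid declares an output named "result"):

import dagstermill

# ... notebook computation ...
computed_value = 42
dagstermill.yield_result(computed_value, output_name="result")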
dagstermill.
DagstermillExecutionContext
(pipeline_context: dagster.core.execution.context.system.SystemPipelineExecutionContext, resource_keys_to_init: Set[str], solid_name: str, solid_config: Any = None)[source]\u00b6Dagstermill-specific execution context.
\nDo not initialize directly: use dagstermill.get_context()
.
environment_config
\u00b6The environment_config for the context
\ndagster.EnvironmentConfig
get_tag
(key: str) → str[source]\u00b6Get a logging tag defined on the context.
\nkey (str) \u2013 The key to get.
\nstr
\nhas_tag
(key: str) → bool[source]\u00b6Check if a logging tag is defined on the context.
\nkey (str) \u2013 The key to check.
\nbool
\nlog
\u00b6The log manager for the context.
\nCall, e.g., log.info()
to log messages through the Dagster machinery.
The logging tags for the context.
\npipeline_def
\u00b6The pipeline definition for the context.
\nThis will be a dagstermill-specific shim.
\npipeline_run
\u00b6The pipeline run for the context.
\nresources
\u00b6A dynamically-created type whose properties allow access to\nresources.
\ncollections.namedtuple
\nsolid
\u00b6The solid for the context.
\nIn interactive contexts, this may be a dagstermill-specific shim, depending whether a\nsolid definition was passed to dagstermill.get_context
.
dagster.Solid
solid_config
\u00b6A dynamically-created type whose properties allow access to\nsolid-specific config.
\ncollections.namedtuple
\nsolid_def
\u00b6The solid definition for the context.
\nIn interactive contexts, this may be a dagstermill-specific shim, depending whether a\nsolid definition was passed to dagstermill.get_context
.
@
dagster.
logger
(config_schema=None, description=None)[source]\u00b6Define a logger.
\nThe decorated function should accept an InitLoggerContext
and return an instance of\nlogging.Logger
. This function will become the logger_fn
of an underlying\nLoggerDefinition
.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config.
description (Optional[str]) \u2013 A human-readable description of the logger.
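A minimal sketch of a custom console logger defined with this decorator (the config keys and formatting are illustrative choices, not part of the library):

import logging

from dagster import logger

@logger(config_schema={"log_level": str, "name": str}, description="A simple console logger.")
def console_logger(init_context):
    level = init_context.logger_config["log_level"]
    name = init_context.logger_config["name"]
    logger_ = logging.getLogger(name)
    logger_.setLevel(level)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s - %(message)s"))
    logger_.addHandler(handler)
    return logger_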
dagster.
LoggerDefinition
(logger_fn, config_schema=None, description=None)[source]\u00b6Core class for defining loggers.
\nLoggers are pipeline-scoped logging handlers, which will be automatically invoked whenever\nsolids in a pipeline log messages.
\nlogger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log
are called from within solid compute logic.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config.
description (Optional[str]) \u2013 A human-readable description of this logger.
dagster.
InitLoggerContext
[source]\u00b6Logger-specific initialization context.
\nAn instance of this class is made available as the first argument to the logger_fn
decorated\nby @logger
or set on a LoggerDefinition
.
Users should not instantiate this class.
\nlogger_config
\u00b6The configuration data provided by the environment config. The\nschema for this data is defined by config_schema
on the LoggerDefinition
Any
\npipeline_def
\u00b6The pipeline definition currently being executed.
\nlogger_def
\u00b6The logger definition for the logger being constructed.
\ndagster.
ModeDefinition
[source]\u00b6Define a mode in which a pipeline can operate.
\nA mode provides pipelines with a set of resource implementations, loggers, system storages,\nand executors.
\nname (Optional[str]) \u2013 The name of the mode. Must be unique within the\nPipelineDefinition
to which the mode is attached. (default: \u201cdefault\u201d).
resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 A dictionary of string resource\nkeys to their implementations. Individual solids may require resources to be present by\nthese keys.
logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger\nidentifiers to their implementations.
executor_defs (Optional[List[ExecutorDefinition]]) \u2013 The set of executors available when\nexecuting in this mode. By default, this will be the \u2018in_process\u2019 and \u2018multiprocess\u2019\nexecutors (default_executors
).
description (Optional[str]) \u2013 A human-readable description of the mode.
intermediate_storage_defs (Optional[List[IntermediateStorageDefinition]]) \u2013 The set of intermediate storage\noptions available when executing in this mode. By default, this will be the \u2018in_memory\u2019\nand \u2018filesystem\u2019 system storages.
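As a minimal sketch of a ModeDefinition (the resource key "database" and the hardcoded value are assumptions for illustration), a test mode might stub out a resource like this:

from dagster import ModeDefinition, ResourceDefinition

test_mode = ModeDefinition(
    name="test",
    resource_defs={
        # Swap in a hardcoded stand-in for the real database resource.
        "database": ResourceDefinition.hardcoded_resource({"host": "localhost", "port": 5432}),
    },
    description="Mode for local testing with a stubbed database.",
)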
@
dagster.
resource
(config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6Define a resource.
\nThe decorated function should accept an InitResourceContext
and return an instance of\nthe resource. This function will become the resource_fn
of an underlying\nResourceDefinition
.
If the decorated function yields once rather than returning (in the manner of functions\ndecorable with @contextlib.contextmanager
) then\nthe body of the function after the yield will be run after execution resolves, allowing users\nto write their own teardown/cleanup logic.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.resource_config.
description (Optional[str]) \u2013 A human-readable description of the resource.
version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.
required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this resource.
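As a minimal sketch of the yield form described above (the config key "conn_string" and the FakeConnection class are assumptions for illustration):

from dagster import resource

class FakeConnection:
    """Stand-in connection object, used only for illustration."""

    def __init__(self, conn_string):
        self.conn_string = conn_string

    def close(self):
        pass

@resource(config_schema={"conn_string": str})
def connection_resource(init_context):
    conn = FakeConnection(init_context.resource_config["conn_string"])
    try:
        # Yielding (rather than returning) lets the teardown below run after
        # execution resolves.
        yield conn
    finally:
        conn.close()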
dagster.
ResourceDefinition
(resource_fn=None, config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6Core class for defining resources.
\nResources are scoped ways to make external resources (like database connections) available to\nsolids during pipeline execution and to clean up after execution resolves.
\nIf resource_fn yields once rather than returning (in the manner of functions decorable with\n@contextlib.contextmanager
) then the body of the\nfunction after the yield will be run after execution resolves, allowing users to write their\nown teardown/cleanup logic.
Depending on your executor, resources may be instantiated and cleaned up more than once in a\npipeline execution.
\nresource_fn (Callable[[InitResourceContext], Any]) \u2013 User-provided function to instantiate\nthe resource, which will be made available to solid executions keyed on the\ncontext.resources
object.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.resource_config.
description (Optional[str]) \u2013 A human-readable description of the resource.
required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this\nresource. A DagsterInvariantViolationError will be raised during initialization if\ndependencies are cyclic.
version (Optional[str]) \u2013 (Experimental) The version of the resource\u2019s definition fn. Two\nwrapped resource functions should only have the same version if they produce the same\nresource definition when provided with the same inputs.
hardcoded_resource
(value, description=None)[source]\u00b6A helper function that creates a ResourceDefinition
with a hardcoded object.
value (Any) \u2013 A hardcoded object which helps mock the resource.
description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.
A hardcoded resource.
\nmock_resource
(description=None)[source]\u00b6A helper function that creates a ResourceDefinition
which wraps a mock.MagicMock
.
description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.
\nA resource that helps you mock existing resources.
\ndagster.
InitResourceContext
[source]\u00b6Resource-specific initialization context.
\nresource_config
\u00b6The configuration data provided by the environment config. The schema\nfor this data is defined by the config_field
argument to\nResourceDefinition
.
Any
\nresource_def
\u00b6The definition of the resource currently being\nconstructed.
\npipeline_run
\u00b6The pipeline run in context.
\nlog_manager
\u00b6The log manager for this run of the pipeline
\nresources
\u00b6The resources that are available to the resource that we are\ninitializing.
\nScopedResources
\ndagster.
Partition
[source]\u00b6Partition is the representation of a logical slice across an axis of a pipeline\u2019s work
\nvalue (Any) \u2013 The object for this partition
name (str) \u2013 Name for this partition
dagster.
PartitionSetDefinition
[source]\u00b6Defines a partition set, representing the set of slices making up an axis of a pipeline
\nname (str) \u2013 Name for this partition set
pipeline_name (str) \u2013 The name of the pipeline definition
partition_fn (Callable[[], List[Partition]]) \u2013 User-provided function to define the set of\nvalid partition objects.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with this partition. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this partition. (default: \u2018default\u2019)
run_config_fn_for_partition (Callable[[Partition], [Dict]]) \u2013 A\nfunction that takes a Partition
and returns the run\nconfiguration that parameterizes the execution for this partition, as a dict
tags_fn_for_partition (Callable[[Partition], Optional[dict[str, str]]]) \u2013 A function that\ntakes a Partition
and returns a list of key value pairs that will\nbe added to the generated run for this partition.
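As a minimal sketch of a PartitionSetDefinition (the pipeline name "process_date_pipeline", the solid name "process_date", and the config shape are assumptions for illustration):

from dagster import Partition, PartitionSetDefinition

def get_date_partitions():
    # A static set of three partitions; real partition_fns often compute these.
    return [Partition(value) for value in ["2021-01-01", "2021-01-02", "2021-01-03"]]

def run_config_for_date_partition(partition):
    return {"solids": {"process_date": {"config": {"date": partition.value}}}}

date_partition_set = PartitionSetDefinition(
    name="date_partition_set",
    pipeline_name="process_date_pipeline",
    partition_fn=get_date_partitions,
    run_config_fn_for_partition=run_config_for_date_partition,
)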
dagster.
date_partition_range
(start, end=None, delta_range='days', fmt=None, inclusive=False, timezone=None)[source]\u00b6Utility function that returns a partition generating function to be used in creating a\nPartitionSet definition.
\nstart (datetime) \u2013 Datetime capturing the start of the time range.
end (Optional(datetime)) \u2013 Datetime capturing the end of the partition. By default, the\ncurrent time is used. The range is not inclusive of the end\nvalue.
delta_range (Optional(str)) \u2013 string representing the time duration of each partition.\nMust be a valid argument to pendulum.period.range (\u201cdays\u201d, \u201chours\u201d, \u201cmonths\u201d, etc.).
fmt (Optional(str)) \u2013 Format string to represent each partition by its start time
inclusive (Optional(bool)) \u2013 By default, the partition set only contains date interval\npartitions for which the end time of the interval is less than current time. In other\nwords, the partition set contains date interval partitions that are completely in the\npast. If inclusive is set to True, then the partition set will include all date\ninterval partitions for which the start time of the interval is less than the\ncurrent time.
timezone (Optional(str)) \u2013 Timezone in which the partition values should be expressed.
Callable[[], List[Partition]]
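As a minimal sketch of using the returned partition generator with a PartitionSetDefinition (the pipeline and solid names are assumptions for illustration):

import datetime

from dagster import PartitionSetDefinition, date_partition_range

daily_partition_set = PartitionSetDefinition(
    name="daily_partition_set",
    pipeline_name="process_date_pipeline",
    partition_fn=date_partition_range(
        start=datetime.datetime(2021, 1, 1),
        delta_range="days",
        fmt="%Y-%m-%d",
    ),
    # Each generated partition is named by its formatted start time.
    run_config_fn_for_partition=lambda partition: {
        "solids": {"process_date": {"config": {"date": partition.name}}}
    },
)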
\ndagster.
identity_partition_selector
(context, partition_set_def)[source]\u00b6Utility function for supplying a partition selector when creating a schedule from a\npartition set made of `datetime`s that assumes the schedule always executes at the\npartition time.
\nIt\u2019s important that the cron string passed into create_schedule_definition match\nthe partition set times. For example, a schedule created from a partition set with partitions for each day at\nmidnight would create its partition selector as follows:
\npartition_set = PartitionSetDefinition(\n    name='hello_world_partition_set',\n    pipeline_name='hello_world_pipeline',\n    partition_fn=date_partition_range(\n        start=datetime.datetime(2021, 1, 1),\n        delta_range="days",\n        timezone="US/Central",\n    ),\n    run_config_fn_for_partition=my_run_config_fn,\n)\n\nschedule_definition = partition_set.create_schedule_definition(\n    "hello_world_daily_schedule",\n    "0 0 * * *",\n    partition_selector=identity_partition_selector,\n    execution_timezone="US/Central",\n)\n
dagster.
create_offset_partition_selector
(execution_time_to_partition_fn)[source]\u00b6Utility function for supplying a partition selector when creating a schedule from a\npartition set made of `datetime`s that assumes a fixed time offset between the partition\ntime and the time at which the schedule executes.
\nIt\u2019s important to keep the cron string that\u2019s supplied to\nPartitionSetDefinition.create_schedule_definition in sync with the offset that\u2019s\nsupplied to this function. For example, a schedule created from a partition set with\npartitions for each day at midnight that fills in the partition for day N at day N+1 at\n10:00AM would create the partition selector as follows:
\npartition_set = PartitionSetDefinition(\n    name='hello_world_partition_set',\n    pipeline_name='hello_world_pipeline',\n    partition_fn=date_partition_range(\n        start=datetime.datetime(2021, 1, 1),\n        delta_range="days",\n        timezone="US/Central",\n    ),\n    run_config_fn_for_partition=my_run_config_fn,\n)\n\nschedule_definition = partition_set.create_schedule_definition(\n    "daily_10am_schedule",\n    "0 10 * * *",\n    partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1)),\n    execution_timezone="US/Central",\n)\n
execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]) \u2013 A function that\nmaps the execution time of the schedule to the partition time.
@
dagster.
pipeline
(name=None, description=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, input_defs=None, output_defs=None, config_schema=None, config_fn=None)[source]\u00b6Create a pipeline with the specified parameters from the decorated composition function.
\nUsing this decorator allows you to build up the dependency graph of the pipeline by writing a\nfunction that invokes solids and passes the output to other solids.
\nname (Optional[str]) \u2013 The name of the pipeline. Must be unique within any\nRepositoryDefinition
containing the pipeline.
description (Optional[str]) \u2013 A human-readable description of the pipeline.
mode_defs (Optional[List[ModeDefinition]]) \u2013 The set of modes in which this pipeline can\noperate. Modes are used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline. Modes can be used, e.g., to vary\navailable resource and logging implementations between local test and production runs.
preset_defs (Optional[List[PresetDefinition]]) \u2013 A set of preset collections of configuration\noptions that may be used to execute a pipeline. A preset consists of an environment\ndict, an optional subset of solids to execute, and a mode selection. Presets can be used\nto ship common combinations of options to pipeline end users in Python code, and can\nbe selected by tools like Dagit.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution run of the pipeline.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.
hook_defs (Optional[Set[HookDefinition]]) \u2013 A set of hook definitions applied to the\npipeline. When a hook is applied to a pipeline, it will be attached to all solid\ninstances within the pipeline.
Example
\n@solid(output_defs=[OutputDefinition(int, "two"), OutputDefinition(int, "four")])\ndef emit_two_four(_) -> int:\n yield Output(2, "two")\n yield Output(4, "four")\n\n\n@lambda_solid\ndef add_one(num: int) -> int:\n return num + 1\n\n\n@lambda_solid\ndef mult_two(num: int) -> int:\n return num * 2\n\n\n@pipeline\ndef math_pipeline():\n two, four = emit_two_four()\n add_one(two)\n mult_two(four)\n
dagster.
PipelineDefinition
(solid_defs, name=None, description=None, dependencies=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, input_mappings=None, output_mappings=None, config_mapping=None, positional_inputs=None, _parent_pipeline_def=None)[source]\u00b6Defines a Dagster pipeline.
\nA pipeline is made up of
\nSolids, each of which is a single functional unit of data computation.
Dependencies, which determine how the values produced by solids as their outputs flow from\none solid to another. This tells Dagster how to arrange solids, and potentially multiple\naliased instances of solids, into a directed, acyclic graph (DAG) of compute.
Modes, which can be used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline, and to switch between them.
Presets, which can be used to ship common combinations of pipeline config options in Python\ncode, and to switch between them.
solid_defs (List[SolidDefinition]) \u2013 The set of solids used in this pipeline.
name (Optional[str]) \u2013 The name of the pipeline. Must be unique within any\nRepositoryDefinition
containing the pipeline.
description (Optional[str]) \u2013 A human-readable description of the pipeline.
dependencies (Optional[Dict[Union[str, SolidInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each solid\u2019s inputs on the outputs of\nother solids in the pipeline. Keys of the top level dict are either the string names of\nsolids in the pipeline or, in the case of aliased solids,\nSolidInvocations
. Values of the top level dict are\nthemselves dicts, which map input names belonging to the solid or aliased solid to\nDependencyDefinitions
.
mode_defs (Optional[List[ModeDefinition]]) \u2013 The set of modes in which this pipeline can\noperate. Modes are used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline. Modes can be used, e.g., to vary available\nresource and logging implementations between local test and production runs.
preset_defs (Optional[List[PresetDefinition]]) \u2013 A set of preset collections of configuration\noptions that may be used to execute a pipeline. A preset consists of an environment\ndict, an optional subset of solids to execute, and a mode selection. Presets can be used\nto ship common combinations of options to pipeline end users in Python code, and can\nbe selected by tools like Dagit.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution run of the pipeline.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.
hook_defs (Optional[Set[HookDefinition]]) \u2013 A set of hook definitions applied to the\npipeline. When a hook is applied to a pipeline, it will be attached to all solid\ninstances within the pipeline.
_parent_pipeline_def (INTERNAL ONLY) \u2013 Used for tracking pipelines created using solid subsets.
Examples
\n@lambda_solid\ndef return_one():\n return 1\n\n\n@solid(input_defs=[InputDefinition('num')], required_resource_keys={'op'})\ndef apply_op(context, num):\n return context.resources.op(num)\n\n@resource(config_schema=Int)\ndef adder_resource(init_context):\n return lambda x: x + init_context.resource_config\n\n\nadd_mode = ModeDefinition(\n name='add_mode',\n resource_defs={'op': adder_resource},\n description='Mode that adds things',\n)\n\n\nadd_three_preset = PresetDefinition(\n name='add_three_preset',\n run_config={'resources': {'op': {'config': 3}}},\n mode='add_mode',\n)\n\n\npipeline_def = PipelineDefinition(\n name='basic',\n solid_defs=[return_one, apply_op],\n dependencies={'apply_op': {'num': DependencyDefinition('return_one')}},\n mode_defs=[add_mode],\n preset_defs=[add_three_preset],\n)\n
dagster.
DependencyDefinition
[source]\u00b6Represents an edge in the DAG of solid instances forming a pipeline.
\nThis object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a pipeline whose keys represent the dependent solid and dependent\ninput, so this object only contains information about the dependee.
\nConcretely, if the input named \u2018input\u2019 of solid_b depends on the output named \u2018result\u2019 of\nsolid_a, this structure will look as follows:
\ndependency_structure = {\n 'solid_b': {\n 'input': DependencyDefinition('solid_a', 'result')\n }\n}\n
In general, users should prefer not to construct this class directly or use the\nPipelineDefinition
API that requires instances of this class. Instead, use the\n@pipeline
API:
@pipeline\ndef pipeline():\n solid_b(solid_a())\n
solid (str) \u2013 The name of the solid that is depended on, that is, from which the value\npassed between the two solids originates.
output (Optional[str]) \u2013 The name of the output that is depended on. (default: \u201cresult\u201d)
description (Optional[str]) \u2013 Human-readable description of this dependency.
dagster.
MultiDependencyDefinition
[source]\u00b6Represents a fan-in edge in the DAG of solid instances forming a pipeline.
\nThis object is used only when an input of type List[T]
is assembled by fanning-in multiple\nupstream outputs of type T
.
This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a pipeline whose keys represent the dependent solid and dependent\ninput, so this object only contains information about the dependee.
\nConcretely, if the input named \u2018input\u2019 of solid_c depends on the outputs named \u2018result\u2019 of\nsolid_a and solid_b, this structure will look as follows:
\ndependency_structure = {\n 'solid_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('solid_a', 'result'),\n DependencyDefinition('solid_b', 'result')\n ]\n )\n }\n}\n
In general, users should prefer not to construct this class directly or use the\nPipelineDefinition
API that requires instances of this class. Instead, use the\n@pipeline
API:
@pipeline\ndef pipeline():\n solid_c(solid_a(), solid_b())\n
solid (str) \u2013 The name of the solid that is depended on, that is, from which the value\npassed between the two solids originates.
output (Optional[str]) \u2013 The name of the output that is depended on. (default: \u201cresult\u201d)
description (Optional[str]) \u2013 Human-readable description of this dependency.
dagster.
SolidInvocation
[source]\u00b6Identifies an instance of a solid in a pipeline dependency structure.
\nname (str) \u2013 Name of the solid of which this is an instance.
alias (Optional[str]) \u2013 Name specific to this instance of the solid. Necessary when there are\nmultiple instances of the same solid.
tags (Optional[Dict[str, Any]]) \u2013 Optional tags values to extend or override those\nset on the solid definition.
hook_defs (Optional[Set[HookDefinition]]) \u2013 A set of hook definitions applied to the\nsolid instance.
Examples
\npipeline = PipelineDefinition(\n    solid_defs=[solid_1, solid_2],\n    dependencies={\n        SolidInvocation('solid_1', alias='other_name') : {\n            'input_name' : DependencyDefinition('solid_1'),\n        },\n        'solid_2' : {\n            'input_name': DependencyDefinition('other_name'),\n        },\n    }\n)\n
In general, users should prefer not to construct this class directly or use the\nPipelineDefinition
API that requires instances of this class. Instead, use the\n@pipeline
API:
@pipeline\ndef pipeline():\n other_name = solid_1.alias('other_name')\n solid_2(other_name(solid_1))\n
dagster.
PresetDefinition
[source]\u00b6Defines a preset configuration in which a pipeline can execute.
\nPresets can be used in Dagit to load predefined configurations into the tool.
\nPresets may also be used from the Python API (in a script, or in test) as follows:
\nexecute_pipeline(pipeline_def, preset='example_preset')\n
Presets may also be used with the command line tools:
\n$ dagster pipeline execute example_pipeline --preset example_preset\n
name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.
run_config (Optional[dict]) \u2013 A dict representing the config to set with the preset.\nThis is equivalent to the run_config
argument to execute_pipeline()
.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default: \u2018default\u2019)
tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.
from_files
(name, config_files=None, solid_selection=None, mode=None, tags=None)[source]\u00b6Static constructor for presets from YAML files.
\nname (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.
config_files (Optional[List[str]]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the environment config for this preset.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)
tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.
A PresetDefinition constructed from the provided YAML files.
\nDagsterInvariantViolationError \u2013 When one of the YAML files is invalid and has a parse\n error.
\nfrom_pkg_resources
(name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None)[source]\u00b6Load a preset from a package resource, using pkg_resources.resource_string()
.
Example:
\nPresetDefinition.from_pkg_resources(\n name='local',\n mode='local',\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n)\n
name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.
pkg_resource_defs (Optional[List[(str, str)]]) \u2013 List of pkg_resource modules/files to\nload as environment config for this preset.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with this partition. e.g.\n['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)
tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.
A PresetDefinition constructed from the provided YAML strings
\nDagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.
\nfrom_yaml_strings
(name, yaml_strings=None, solid_selection=None, mode=None, tags=None)[source]\u00b6Static constructor for presets from YAML strings.
\nname (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.
yaml_strings (Optional[List[str]]) \u2013 List of yaml strings to parse as the environment\nconfig for this preset.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)
tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.
A PresetDefinition constructed from the provided YAML strings
\nDagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.
\ndagster.
repository
RepositoryDefinition[source]\u00b6Create a repository from the decorated function.
\nThe decorated function should take no arguments and its return value should be one of:
\nList[Union[PipelineDefinition, PartitionSetDefinition, ScheduleDefinition]]
. Use this form when you have no need to lazy load pipelines or other definitions. This is the\ntypical use case.
\nA dict of the form:
{\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n}\n
This form is intended to allow definitions to be created lazily when accessed by name,\nwhich can be helpful for performance when there are many definitions in a repository, or\nwhen constructing the definitions is costly.
\nRepositoryData
. Return this object if you need fine-grained control over the construction and indexing of definitions within the repository, e.g., to\ncreate definitions dynamically from .yaml files in a directory.
\nExample:
\n######################################################################\n# A simple repository using the first form of the decorated function\n######################################################################\n\n@solid(config_schema={n: Field(Int)})\ndef return_n(context):\n return context.solid_config['n']\n\n@pipeline(name='simple_pipeline')\ndef simple_pipeline():\n return_n()\n\nsimple_partition_set = PartitionSetDefinition(\n name='simple_partition_set',\n pipeline_name='simple_pipeline',\n partition_fn=lambda: range(10),\n run_config_fn_for_partition=(\n lambda partition: {\n 'solids': {'return_n': {'config': {'n': partition}}}\n }\n ),\n)\n\nsimple_schedule = simple_partition_set.create_schedule_definition(\n schedule_name='simple_daily_10_pm_schedule',\n cron_schedule='0 22 * * *',\n)\n\n@repository\ndef simple_repository():\n return [simple_pipeline, simple_partition_set, simple_schedule]\n\n\n######################################################################\n# A lazy-loaded repository\n######################################################################\n\ndef make_expensive_pipeline():\n @pipeline(name='expensive_pipeline')\n def expensive_pipeline():\n for i in range(10000):\n return_n.alias('return_n_{i}'.format(i=i))()\n\n return expensive_pipeline\n\nexpensive_partition_set = PartitionSetDefinition(\n name='expensive_partition_set',\n pipeline_name='expensive_pipeline',\n partition_fn=lambda: range(10),\n run_config_fn_for_partition=(\n lambda partition: {\n 'solids': {\n 'return_n_{i}'.format(i=i): {'config': {'n': partition}}\n for i in range(10000)\n }\n }\n ),\n)\n\ndef make_expensive_schedule():\n expensive_partition_set.create_schedule_definition(\n schedule_name='expensive_schedule',\n cron_schedule='0 22 * * *',\n)\n\n@repository\ndef lazy_loaded_repository():\n return {\n 'pipelines': {'expensive_pipeline': make_expensive_pipeline},\n 'partition_sets': {\n 'expensive_partition_set': expensive_partition_set\n },\n 'schedules': {'expensive_schedule: make_expensive_schedule}\n }\n\n\n######################################################################\n# A complex repository that lazily construct pipelines from a directory\n# of files in a bespoke YAML format\n######################################################################\n\nclass ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_pipeline(self, pipeline_name):\n return self._construct_pipeline_def_from_yaml_file(\n self._yaml_file_for_pipeline_name(pipeline_name)\n )\n\n ...\n\n@repository\ndef complex_repository():\n return ComplexRepositoryData('some_directory')\n
dagster.
RepositoryDefinition
(name, repository_data, description=None)[source]\u00b6Define a repository that contains a collection of definitions.
\nUsers should typically not create objects of this class directly. Instead, use the\n@repository()
decorator.
get_all_pipelines
()[source]\u00b6Return all pipelines in the repository as a list.
\nNote that this will construct any pipeline in the lazily evaluated pipeline_dict
that\nhas not yet been constructed.
All pipelines in the repository.
\nList[PipelineDefinition]
\nget_all_solid_defs
()[source]\u00b6Get all the solid definitions in a repository.
\nAll solid definitions in the repository.
\nList[SolidDefinition]
\nget_pipeline
(name)[source]\u00b6Get a pipeline by name.
\nIf this pipeline is present in the lazily evaluated pipeline_dict
passed to the\nconstructor, but has not yet been constructed, only this pipeline is constructed, and will\nbe cached for future calls.
name (str) \u2013 Name of the pipeline to retrieve.
\nThe pipeline definition corresponding to the given name.
\nhas_pipeline
(name)[source]\u00b6Check if a pipeline with a given name is present in the repository.
\nname (str) \u2013 The name of the pipeline.
\nbool
\npipeline_names
\u00b6Names of all pipelines in the repository
\nList[str]
\ndagster.
RunRequest
[source]\u00b6Represents all the information required to launch a single run. Must be returned by a\nSensorDefinition or ScheduleDefinition\u2019s evaluation function for a run to be launched.
\nrun_key
\u00b6A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.
\nstr | None
\nrun_config
\u00b6The environment config that parameterizes the run execution to\nbe launched, as a dict.
\nOptional[Dict]
\n@
dagster.
schedule
(cron_schedule, pipeline_name, name=None, tags=None, tags_fn=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, execution_timezone=None)[source]\u00b6Create a schedule.
\nThe decorated function will be called as the run_config_fn
of the underlying\nScheduleDefinition
and should take a\nScheduleExecutionContext
as its only argument, returning the environment\ndict for the scheduled execution.
cron_schedule (str) \u2013 A valid cron string specifying when the schedule will run, e.g.,\n'45 23 * * 6'
for a schedule that runs at 11:45 PM every Saturday.
pipeline_name (str) \u2013 The name of the pipeline to execute when the schedule runs.
name (Optional[str]) \u2013 The name of the schedule to create.
tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.
tags_fn (Optional[Callable[[ScheduleExecutionContext], Optional[Dict[str, str]]]]) \u2013 A function\nthat generates tags to attach to the schedule\u2019s runs. Takes a\nScheduleExecutionContext
and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags
and tags_fn
.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)
should_execute (Optional[Callable[[ScheduleExecutionContext], bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleExecutionContext
and returns a boolean (True
if the\nschedule should execute). Defaults to a function that always returns True
.
environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.
execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run. Only works\nwith DagsterDaemonScheduler, and must be set when using that scheduler.
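As a minimal sketch of the @schedule decorator (the pipeline name, solid name, and config keys are assumptions for illustration), where the decorated function acts as the run_config_fn:

from dagster import schedule

@schedule(
    cron_schedule="45 23 * * 6",
    pipeline_name="my_pipeline",
    execution_timezone="US/Central",
)
def my_saturday_schedule(context):
    # Return the run config for each scheduled execution.
    return {"solids": {"process": {"config": {"greeting": "hello"}}}}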
@
dagster.
monthly_schedule
(pipeline_name, start_date, name=None, execution_day_of_month=1, execution_time=datetime.time(0, 0), tags_fn_for_date=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, end_date=None, execution_timezone=None)[source]\u00b6Create a schedule that runs monthly.
\nThe decorated function will be called as the run_config_fn
of the underlying\nScheduleDefinition
and should take a\nScheduleExecutionContext
as its only argument, returning the environment\ndict for the scheduled execution.
pipeline_name (str) \u2013 The name of the pipeline to execute when the schedule runs.
start_date (datetime.datetime) \u2013 The date from which to run the schedule.
name (Optional[str]) \u2013 The name of the schedule to create.
execution_day_of_month (int) \u2013 The day of the month on which to run the schedule (must be\nbetween 1 and 31).
execution_time (datetime.time) \u2013 The time at which to execute the schedule.
tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule\u2019s runs. Takes the date of the\nschedule run and returns a dictionary of tags (string key-value pairs).
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)
should_execute (Optional[Callable[ScheduleExecutionContext, bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleExecutionContext
and returns a boolean (True
if the\nschedule should execute). Defaults to a function that always returns True
.
environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.
end_date (Optional[datetime.datetime]) \u2013 The last time to run the schedule to, defaults to\ncurrent time.
execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run. Only works\nwith DagsterDaemonScheduler, and must be set when using that scheduler.
@
dagster.
weekly_schedule
(pipeline_name, start_date, name=None, execution_day_of_week=0, execution_time=datetime.time(0, 0), tags_fn_for_date=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, end_date=None, execution_timezone=None)[source]\u00b6Create a schedule that runs weekly.
\nThe decorated function will be called as the run_config_fn
of the underlying\nScheduleDefinition
and should take a\nScheduleExecutionContext
as its only argument, returning the environment\ndict for the scheduled execution.
pipeline_name (str) \u2013 The name of the pipeline to execute when the schedule runs.
start_date (datetime.datetime) \u2013 The date from which to run the schedule.
name (Optional[str]) \u2013 The name of the schedule to create.
execution_day_of_week (int) \u2013 The day of the week on which to run the schedule. Must be\nbetween 0 (Sunday) and 6 (Saturday).
execution_time (datetime.time) \u2013 The time at which to execute the schedule.
tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule\u2019s runs. Takes the date of the\nschedule run and returns a dictionary of tags (string key-value pairs).
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)
should_execute (Optional[Callable[ScheduleExecutionContext, bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleExecutionContext
and returns a boolean (True
if the\nschedule should execute). Defaults to a function that always returns True
.
environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.
end_date (Optional[datetime.datetime]) \u2013 The last time to run the schedule to, defaults to\ncurrent time.
execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run. Only works\nwith DagsterDaemonScheduler, and must be set when using that scheduler.
@
dagster.
hourly_schedule
(pipeline_name, start_date, name=None, execution_time=datetime.time(0, 0), tags_fn_for_date=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, end_date=None, execution_timezone=None)[source]\u00b6Create a schedule that runs hourly.
\nThe decorated function will be called as the run_config_fn
of the underlying\nScheduleDefinition
and should take a\nScheduleExecutionContext
as its only argument, returning the environment\ndict for the scheduled execution.
pipeline_name (str) \u2013 The name of the pipeline to execute when the schedule runs.
start_date (datetime.datetime) \u2013 The date from which to run the schedule.
name (Optional[str]) \u2013 The name of the schedule to create. By default, this will be the name\nof the decorated function.
execution_time (datetime.time) \u2013 The time at which to execute the schedule. Only the minutes\ncomponent will be respected \u2013 the hour should be 0, and will be ignored if it is not 0.
tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule\u2019s runs. Takes the date of the\nschedule run and returns a dictionary of tags (string key-value pairs).
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)
should_execute (Optional[Callable[ScheduleExecutionContext, bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleExecutionContext
and returns a boolean (True
if the\nschedule should execute). Defaults to a function that always returns True
.
environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.
end_date (Optional[datetime.datetime]) \u2013 The last time to run the schedule to, defaults to\ncurrent time.
execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run. Only works\nwith DagsterDaemonScheduler, and must be set when using that scheduler.
@
dagster.
daily_schedule
(pipeline_name, start_date, name=None, execution_time=datetime.time(0, 0), tags_fn_for_date=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, end_date=None, execution_timezone=None)[source]\u00b6Create a schedule that runs daily.
\nThe decorated function will be called as the run_config_fn
of the underlying\nScheduleDefinition
and should take a\nScheduleExecutionContext
as its only argument, returning the environment\ndict for the scheduled execution.
pipeline_name (str) \u2013 The name of the pipeline to execute when the schedule runs.
start_date (datetime.datetime) \u2013 The date from which to run the schedule.
name (Optional[str]) \u2013 The name of the schedule to create.
execution_time (datetime.time) \u2013 The time at which to execute the schedule.
tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule\u2019s runs. Takes the date of the\nschedule run and returns a dictionary of tags (string key-value pairs).
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)
should_execute (Optional[Callable[ScheduleExecutionContext, bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleExecutionContext
and returns a boolean (True
if the\nschedule should execute). Defaults to a function that always returns True
.
environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.
end_date (Optional[datetime.datetime]) \u2013 The last time to run the schedule to, defaults to\ncurrent time.
execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run. Only works\nwith DagsterDaemonScheduler, and must be set when using that scheduler.
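As a minimal sketch of @daily_schedule (the pipeline name, solid name, and config shape are assumptions for illustration), where the decorated function receives the partition date:

import datetime

from dagster import daily_schedule

@daily_schedule(
    pipeline_name="process_date_pipeline",
    start_date=datetime.datetime(2021, 1, 1),
    execution_time=datetime.time(hour=6, minute=30),
)
def my_daily_schedule(date):
    # Parameterize the run by the partition date for that day.
    return {"solids": {"process_date": {"config": {"date": date.strftime("%Y-%m-%d")}}}}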
dagster.
ScheduleDefinition
(name, cron_schedule, pipeline_name, run_config=None, run_config_fn=None, tags=None, tags_fn=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, execution_timezone=None, execution_fn=None)[source]\u00b6Define a schedule that targets a pipeline
\nname (str) \u2013 The name of the schedule to create.
cron_schedule (str) \u2013 A valid cron string specifying when the schedule will run, e.g.,\n\u201845 23 * * 6\u2019 for a schedule that runs at 11:45 PM every Saturday.
pipeline_name (str) \u2013 The name of the pipeline to execute when the schedule runs.
execution_fn (Callable[ScheduleExecutionContext]) \u2013
The core evaluation function for the\nschedule, which is run at an interval to determine whether a run should be launched or\nnot. Takes a ScheduleExecutionContext
.
This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.
\nrun_config (Optional[Dict]) \u2013 The environment config that parameterizes this execution,\nas a dict.
run_config_fn (Callable[[ScheduleExecutionContext], [Dict]]) \u2013 A function that takes a\nScheduleExecutionContext object and returns the environment configuration that\nparameterizes this execution, as a dict. You may set only one of run_config
\nand run_config_fn
.
tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.
tags_fn (Optional[Callable[[ScheduleExecutionContext], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule\u2019s runs. Takes a\nScheduleExecutionContext
and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags
and tags_fn
.
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this schedule. (default: \u2018default\u2019)
should_execute (Optional[Callable[[ScheduleExecutionContext], bool]]) \u2013 A function that runs\nat schedule execution time to determine whether a schedule should execute or skip. Takes\na ScheduleExecutionContext
and returns a boolean (True
if the\nschedule should execute). Defaults to a function that always returns True
.
environment_vars (Optional[dict[str, str]]) \u2013 The environment variables to set for the\nschedule
execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run. Only works\nwith DagsterDaemonScheduler, and must be set when using that scheduler.
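As a minimal sketch of constructing a ScheduleDefinition directly with a static run_config (the pipeline name, solid name, and config are assumptions for illustration):

from dagster import ScheduleDefinition

nightly_schedule = ScheduleDefinition(
    name="nightly_schedule",
    cron_schedule="0 0 * * *",
    pipeline_name="my_pipeline",
    run_config={"solids": {"process": {"config": {"greeting": "hello"}}}},
    execution_timezone="US/Central",
)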
dagster.
ScheduleExecutionContext
(instance, scheduled_execution_time)[source]\u00b6Schedule-specific execution context.
\nAn instance of this class is made available as the first argument to various ScheduleDefinition\nfunctions. It is passed as the first argument to run_config_fn
, tags_fn
,\nand should_execute
.
instance
\u00b6The instance configured to run the schedule
\nscheduled_execution_time
\u00b6The time in which the execution was scheduled to happen. May differ slightly\nfrom both the actual execution time and the time at which the run config is computed.\nNot available in all schedulers - currently only set in deployments using\nDagsterDaemonScheduler.
\ndatetime
\n@
dagster.
sensor
(pipeline_name, name=None, solid_selection=None, mode=None)[source]\u00b6Creates a sensor where the decorated function is used as the sensor\u2019s evaluation function. The\ndecorated function may:
\nReturn a RunRequest object.
Yield multiple RunRequest objects.
Return or yield a SkipReason object, providing a descriptive message of why no runs were\nrequested.
Return or yield nothing (skipping without providing a reason)
Takes a SensorExecutionContext
.
name (str) \u2013 The name of this sensor
solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute for runs for this sensor e.g.\n['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing runs for this sensor.\n(default: \u2018default\u2019)
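As a minimal sketch of a sensor evaluation function (the pipeline name, solid name, and directory are assumptions for illustration), yielding RunRequest objects or a SkipReason as described above:

import os

from dagster import RunRequest, SkipReason, sensor

@sensor(pipeline_name="log_file_pipeline")
def new_file_sensor(context):
    directory = "/tmp/incoming"
    filenames = os.listdir(directory) if os.path.isdir(directory) else []
    if not filenames:
        yield SkipReason("No files found in {}.".format(directory))
        return
    for filename in filenames:
        yield RunRequest(
            # Using the filename as run_key keeps repeat evaluations idempotent.
            run_key=filename,
            run_config={"solids": {"process_file": {"config": {"path": filename}}}},
        )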
dagster.
SensorDefinition
(name, pipeline_name, evaluation_fn, solid_selection=None, mode=None)[source]\u00b6Define a sensor that initiates a set of job runs
\nname (str) \u2013 The name of the sensor to create.
pipeline_name (str) \u2013 The name of the pipeline to execute when the sensor fires.
evaluation_fn (Callable[[SensorExecutionContext]]) \u2013
The core evaluation function for the\nsensor, which is run at an interval to determine whether a run should be launched or\nnot. Takes a SensorExecutionContext
.
This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.
\nsolid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the sensor runs. e.g. ['*some_solid+', 'other_solid']
mode (Optional[str]) \u2013 The mode to apply when executing this sensor. (default: \u2018default\u2019)
dagster.
SensorExecutionContext
(instance, last_completion_time, last_run_key)[source]\u00b6Sensor execution context.
\nAn instance of this class is made available as the first argument to the evaluation function\non SensorDefinition.
\ninstance
\u00b6The instance configured to run the sensor
\nlast_completion_time
\u00b6The last time that the sensor was evaluated (UTC).
\nThe foundational unit of composition in Dagster.
\n@
dagster.
solid
(name=None, description=None, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, tags=None, version=None)[source]\u00b6Create a solid with the specified parameters from the decorated function.
\nThis shortcut simplifies the core SolidDefinition
API by exploding arguments into\nkwargs of the decorated compute function and omitting additional parameters when they are not\nneeded.
Input and output definitions will be inferred from the type signature of the decorated function\nif not explicitly provided.
\nThe decorated function will be used as the solid\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn
in the core API; it may:
Return a value. This value will be wrapped in an Output
and yielded by the compute function.
Return an Output
. This output will be yielded by the compute function.
Yield Output
or other event objects. Same as default compute behavior.
Note that the first two options are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput
and yield it.
name (Optional[str]) \u2013 Name of solid. Must be unique within any PipelineDefinition
\nusing the solid.
description (Optional[str]) \u2013 Human-readable description of this solid.
input_defs (Optional[List[InputDefinition]]) \u2013 List of input definitions. Inferred from typehints if not provided.
output_defs (Optional[List[OutputDefinition]]) \u2013 List of output definitions. Inferred from typehints if not provided.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable as context.solid_config.
required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this solid.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.
version (Optional[str]) \u2013 (Experimental) The version of the solid\u2019s compute_fn. Two solids should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.
Examples
\n@solid\ndef hello_world(_context):\n    print('hello')\n\n@solid\ndef hello_world(_context):\n    return {'foo': 'bar'}\n\n@solid\ndef hello_world(_context):\n    return Output(value={'foo': 'bar'})\n\n@solid\ndef hello_world(_context):\n    yield Output(value={'foo': 'bar'})\n\n@solid\ndef hello_world(_context, foo):\n    return foo\n\n@solid(\n    input_defs=[InputDefinition("foo", str)],\n    output_defs=[OutputDefinition(str)]\n)\ndef hello_world(_context, foo):\n    # explicitly type and name inputs and outputs\n    return foo\n\n@solid\ndef hello_world(_context, foo: str) -> str:\n    # same as above inferred from signature\n    return foo\n\n@solid\ndef hello_world(context, foo):\n    context.log.info('log something')\n    return foo\n\n@solid(\n    config_schema={'str_value' : Field(str)}\n)\ndef hello_world(context, foo):\n    # context.solid_config is a dictionary with 'str_value' key\n    return foo + context.solid_config['str_value']\n
dagster.
SolidDefinition
(name, input_defs, compute_fn, output_defs, config_schema=None, description=None, tags=None, required_resource_keys=None, positional_inputs=None, version=None)[source]\u00b6The definition of a Solid that performs a user-defined computation.
\nFor more details on what a solid is, refer to the\nSolid Guide .
\nEnd users should prefer the @solid
and @lambda_solid
\ndecorators. SolidDefinition is generally intended to be used by framework authors.
name (str) \u2013 Name of the solid. Must be unique within any PipelineDefinition
\nusing the solid.
input_defs (List[InputDefinition]) \u2013 Inputs of the solid.
compute_fn (Callable) \u2013
The core of the solid, the function that does the actual\ncomputation. The signature of this function is determined by input_defs
, with\nan additional injected first argument, context
, a collection of information provided\nby the system.
This function must return a generator, which must yield one Output
for each\nof the solid\u2019s output_defs
, and additionally may yield other types of Dagster\nevents, including Materialization
and ExpectationResult
.
output_defs (List[OutputDefinition]) \u2013 Outputs of the solid.
config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.solid_config.
description (Optional[str]) \u2013 Human-readable description of the solid.
tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.
required_resource_keys (Optional[Set[str]]) \u2013 Set of resources handles required by this\nsolid.
positional_inputs (Optional[List[str]]) \u2013 The positional order of the input names if it\ndiffers from the order of the input definitions.
version (Optional[str]) \u2013 (Experimental) The version of the solid\u2019s compute_fn. Two solids should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.
Examples
\ndef _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\nSolidDefinition(\n name="add_one",\n input_defs=[InputDefinition("num", Int)],\n output_defs=[OutputDefinition(Int)], # default name ("result")\n compute_fn=_add_one,\n)\n
@
dagster.
lambda_solid
(name=None, description=None, input_defs=None, output_def=None)[source]\u00b6Create a simple solid from the decorated function.
\nThis shortcut allows the creation of simple solids that do not require\nconfiguration and whose implementations do not require a\ncontext
.
Lambda solids take any number of inputs and produce a single output.
\nInputs can be defined using InputDefinition
and passed to the input_defs
argument\nof this decorator, or inferred from the type signature of the decorated function.
The single output can be defined using OutputDefinition
and passed as the\noutput_def
argument of this decorator, or its type can be inferred from the type signature\nof the decorated function.
The body of the decorated function should return a single value, which will be yielded as the\nsolid\u2019s output.
\nname (str) \u2013 Name of solid.
description (str) \u2013 Solid description.
input_defs (List[InputDefinition]) \u2013 List of input_defs.
output_def (OutputDefinition) \u2013 The output of the solid. Defaults to\nOutputDefinition()
.
Examples:
\n@lambda_solid\ndef hello_world():\n    return 'hello'\n\n@lambda_solid(\n    input_defs=[InputDefinition('foo', str)],\n    output_def=OutputDefinition(str)\n)\ndef hello_world(foo):\n    # explicitly type and name inputs and outputs\n    return foo\n\n@lambda_solid\ndef hello_world(foo: str) -> str:\n    # same as above inferred from signature\n    return foo\n
dagster.
InputDefinition
(name, dagster_type=None, description=None, default_value=<class 'dagster.core.definitions.input._NoValueSentinel'>, root_manager_key=None, metadata=None)[source]\u00b6Defines an argument to a solid\u2019s compute function.
\nInputs may flow from previous solids\u2019 outputs, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.
\nname (str) \u2013 Name of the input.
dagster_type (Optional[Any]) \u2013 The type of this input. Users should provide one of the\nbuilt-in types, a dagster type explicitly constructed with\nas_dagster_type()
, @usable_as_dagster_type
, or\nPythonObjectDagsterType()
, or a Python type. Defaults to Any
.
description (Optional[str]) \u2013 Human-readable description of the input.
default_value (Optional[Any]) \u2013 The default value to use if no input is provided.
root_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nRootInputManager
used for loading this input when it is not connected to an\nupstream output.
metadata (Optional[Dict[str, Any]]) \u2013 (Experimental) A dict of metadata for the input.
mapping_to
(solid_name, input_name, fan_in_index=None)[source]\u00b6Create an input mapping to an input of a child solid.
\nIn a CompositeSolidDefinition, you can use this helper function to construct\nan InputMapping
to the input of a child solid.
Examples
\ninput_mapping = InputDefinition('composite_input', Int).mapping_to(\n 'child_solid', 'int_input'\n)\n
dagster.
OutputDefinition
(dagster_type=None, name=None, description=None, is_required=None, io_manager_key=None, metadata=None)[source]\u00b6Defines an output from a solid\u2019s compute function.
\nSolids can have multiple outputs, in which case outputs cannot be anonymous.
\nMany solids have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.
\nOutput definitions may be typed using the Dagster type system.
\ndagster_type (Optional[Any]) \u2013 The type of this output. Users should provide one of the\nbuilt-in types, a dagster type explicitly constructed with\nas_dagster_type()
, @usable_as_dagster_type
, or\nPythonObjectDagsterType()
, or a Python type. Defaults to Any
.
name (Optional[str]) \u2013 Name of the output. (default: \u201cresult\u201d)
description (Optional[str]) \u2013 Human-readable description of the output.
is_required (Optional[bool]) \u2013 Whether the presence of this field is required. (default: True)
io_manager_key (Optional[str]) \u2013 The resource key of the output manager used for this output.\n(default: \u201cio_manager\u201d).
metadata (Optional[Dict[str, Any]]) \u2013 (Experimental) A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.
mapping_from
(solid_name, output_name=None)[source]\u00b6Create an output mapping from an output of a child solid.
\nIn a CompositeSolidDefinition, you can use this helper function to construct\nan OutputMapping
from the output of a child solid.
Examples
\noutput_mapping = OutputDefinition(Int).mapping_from('child_solid')\n
@dagster.composite_solid(name: Union[str, None, Callable[[...], Any]] = None, input_defs: Optional[List[dagster.core.definitions.input.InputDefinition]] = None, output_defs: Optional[List[dagster.core.definitions.output.OutputDefinition]] = None, description: Optional[str] = None, config_schema: Optional[Dict[str, Any]] = None, config_fn: Optional[Callable[[dict], dict]] = None) → dagster.core.definitions.decorators.composite_solid._CompositeSolid[source]
Create a composite solid with the specified parameters from the decorated composition function.

Using this decorator allows you to build up the dependency graph of the composite by writing a function that invokes solids and passes the output to other solids. This is similar to the use of the @pipeline decorator, with the additional ability to remap inputs, outputs, and config across the composite boundary.

Parameters:
name (Optional[str]) – Name for the new composite solid. Must be unique within any PipelineDefinition using the solid.
description (Optional[str]) – Human-readable description of the new composite solid.
input_defs (Optional[List[InputDefinition]]) – Input definitions for the composite solid. If not provided explicitly, these will be inferred from typehints. Uses of these inputs in the body of the decorated composition function will be used to infer the appropriate set of InputMappings passed to the underlying CompositeSolidDefinition.
output_defs (Optional[List[OutputDefinition]]) – Output definitions for the composite solid. If not provided explicitly, these will be inferred from typehints. Uses of these outputs in the body of the decorated composition function, as well as the return value of the decorated function, will be used to infer the appropriate set of OutputMappings for the underlying CompositeSolidDefinition. To map multiple outputs, return a dictionary from the composition function.
config_schema (Optional[ConfigSchema]) – The schema for the config. Must be combined with the config_fn argument in order to transform this config into the config for the contained solids.
config_fn (Callable[[dict], dict]) – By specifying a config mapping function, you can override the configuration for the child solids contained within this composite solid. Config mappings require the configuration field to be specified as config_schema, which will be exposed as the configuration field for the composite solid, as well as a configuration mapping function, config_fn, which maps the config provided to the composite solid to the config that will be provided to the child solids. See the config mapping sketch after the examples below.

Examples:

@lambda_solid
def add_one(num: int) -> int:
    return num + 1

@composite_solid
def add_two(num: int) -> int:
    adder_1 = add_one.alias('adder_1')
    adder_2 = add_one.alias('adder_2')

    return adder_2(adder_1(num))
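The example above does not exercise config mapping. The following hedged sketch (the solid and config field names are hypothetical) shows config_schema and config_fn used together to translate composite-level config into config for a child solid:

from dagster import composite_solid, solid

@solid(config_schema={"factor": int})
def multiply(context, num: int) -> int:
    return num * context.solid_config["factor"]

def _scale_config_fn(cfg):
    # Map the composite's config onto the child solid's config, keyed by solid name.
    return {"multiply": {"config": {"factor": cfg["factor"]}}}

@composite_solid(config_schema={"factor": int}, config_fn=_scale_config_fn)
def scaled(num: int) -> int:
    return multiply(num)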
dagster.CompositeSolidDefinition(name, solid_defs, input_mappings=None, output_mappings=None, config_mapping=None, dependencies=None, description=None, tags=None, positional_inputs=None)[source]
The core unit of composition and abstraction: composite solids allow you to define a solid from a graph of solids.

In the same way you would refactor a block of code into a function to deduplicate, organize, or manage complexity, you can refactor solids in a pipeline into a composite solid.

Parameters:
name (str) – The name of this composite solid. Must be unique within any PipelineDefinition using the solid.
solid_defs (List[Union[SolidDefinition, CompositeSolidDefinition]]) – The set of solid definitions used in this composite solid. Composites may be arbitrarily nested.
input_mappings (Optional[List[InputMapping]]) – Define the inputs to the composite solid, and how they map to the inputs of its constituent solids.
output_mappings (Optional[List[OutputMapping]]) – Define the outputs of the composite solid, and how they map from the outputs of its constituent solids.
config_mapping (Optional[ConfigMapping]) – By specifying a config mapping, you can override the configuration for the child solids contained within this composite solid. Config mappings require both a configuration field to be specified, which is exposed as the configuration for the composite solid, and a configuration mapping function, which is called to map the configuration of the composite solid into the configuration that is applied to any child solids.
dependencies (Optional[Dict[Union[str, SolidInvocation], Dict[str, DependencyDefinition]]]) – A structure that declares where each solid gets its inputs. The keys of the top-level dict are either string names of solids or SolidInvocations. The values are dicts that map input names to DependencyDefinitions.
description (Optional[str]) – Human-readable description of this composite solid.
tags (Optional[Dict[str, Any]]) – Arbitrary metadata for the solid. Frameworks may expect and require certain metadata to be attached to a solid. Users should generally not set metadata directly. Values that are not strings will be json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.
positional_inputs (Optional[List[str]]) – The positional order of the inputs if it differs from the order of the input mappings.

Examples:

@lambda_solid
def add_one(num: int) -> int:
    return num + 1

add_two = CompositeSolidDefinition(
    'add_two',
    solid_defs=[add_one],
    dependencies={
        SolidInvocation('add_one', 'adder_1'): {},
        SolidInvocation('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},
    },
    input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],
    output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],
)
dagster.InputMapping[source]
Defines an input mapping for a composite solid.

Parameters:
definition (InputDefinition) – Defines the input to the composite solid.
solid_name (str) – The name of the child solid onto which to map the input.
input_name (str) – The name of the input to the child solid onto which to map the input.
dagster.OutputMapping[source]
Defines an output mapping for a composite solid.

Parameters:
definition (OutputDefinition) – Defines the output of the composite solid.
solid_name (str) – The name of the child solid from which to map the output.
output_name (str) – The name of the child solid's output from which to map the output.
dagster.ConfigMapping[source]
Defines a config mapping for a composite solid.

By specifying a config mapping function, you can override the configuration for the child solids contained within a composite solid.

Config mappings require the configuration schema to be specified as config_schema, which will be exposed as the configuration schema for the composite solid, as well as a configuration mapping function, config_fn, which maps the config provided to the composite solid to the config that will be provided to the child solids.

Parameters:
config_fn (Callable[[dict], dict]) – The function that will be called to map the composite config to a config appropriate for the child solids.
config_schema (ConfigSchema) – The schema of the composite config.
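For example, a hedged sketch (the child solid name and config field are hypothetical) of a ConfigMapping suitable for passing to CompositeSolidDefinition via config_mapping:

from dagster import ConfigMapping

def _config_fn(cfg):
    # Forward the composite-level "n" value to a child solid named "add_n".
    return {"add_n": {"config": {"n": cfg["n"]}}}

config_mapping = ConfigMapping(config_fn=_config_fn, config_schema={"n": int})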
The objects that can be yielded by the body of solids' compute functions to communicate with the Dagster framework.

(Note that Failure and RetryRequested are intended to be raised from solids rather than yielded.)
dagster.Output[source]
Event corresponding to one of a solid's outputs.

Solid compute functions must explicitly yield events of this type when they have more than one output, or when they also yield events of other types, or when defining a solid using the SolidDefinition API directly.

Outputs are values produced by solids that will be consumed by downstream solids in a pipeline. They are type-checked at solid boundaries when their corresponding OutputDefinition or the downstream InputDefinition is typed.

Parameters:
value (Any) – The value returned by the compute function.
output_name (Optional[str]) – Name of the corresponding output definition. (default: "result")
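For example, a sketch (names illustrative) of a solid with two named outputs that yields an explicit Output event for each:

from dagster import Output, OutputDefinition, solid

@solid(
    output_defs=[
        OutputDefinition(int, name="total"),
        OutputDefinition(int, name="count"),
    ]
)
def summarize(_, xs):
    # With multiple outputs, each value must be yielded as an explicit Output event.
    yield Output(sum(xs), output_name="total")
    yield Output(len(xs), output_name="count")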
dagster.AssetMaterialization[source]
Event indicating that a solid has materialized an asset.

Solid compute functions may yield events of this type whenever they wish to indicate to the Dagster framework (and the end user) that they have produced a materialized value as a side effect of computation. Unlike outputs, asset materializations cannot be passed to other solids, and their persistence is controlled by solid logic, rather than by the Dagster framework.

Solid authors should use these events to organize metadata about the side effects of their computations, enabling tooling like the Assets dashboard in Dagit.

Parameters:
asset_key (str|List[str]|AssetKey) – A key to identify the materialized asset across pipeline runs.
description (Optional[str]) – A longer human-readable description of the materialized value.
metadata_entries (Optional[List[EventMetadataEntry]]) – Arbitrary metadata about the materialized value.
partition (Optional[str]) – The name of the partition that was materialized.
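As a hedged sketch (the path and asset key are hypothetical, and a pandas-like dataframe with a to_csv method is assumed), a solid that yields an AssetMaterialization alongside its Output:

from dagster import AssetMaterialization, EventMetadataEntry, Output, solid

@solid
def write_report(_, df):
    path = "/tmp/report.csv"  # hypothetical destination
    df.to_csv(path)  # assumes a pandas-like dataframe
    yield AssetMaterialization(
        asset_key="report_csv",
        description="Report written as a side effect of computation.",
        metadata_entries=[EventMetadataEntry.fspath(path)],
    )
    yield Output(path)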
dagster.ExpectationResult[source]
Event corresponding to a data quality test.

Solid compute functions may yield events of this type whenever they wish to indicate to the Dagster framework (and the end user) that a data quality test has produced a (positive or negative) result.

Parameters:
success (bool) – Whether the expectation passed or not.
label (Optional[str]) – Short display name for expectation. Defaults to "result".
description (Optional[str]) – A longer human-readable description of the expectation.
metadata_entries (Optional[List[EventMetadataEntry]]) – Arbitrary metadata about the expectation.
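For example, a sketch (names illustrative) of a solid that reports a data quality check before passing its input along:

from dagster import EventMetadataEntry, ExpectationResult, Output, solid

@solid
def check_not_empty(_, records):
    yield ExpectationResult(
        success=len(records) > 0,
        label="has_records",
        description="The upstream solid should produce at least one record.",
        metadata_entries=[EventMetadataEntry.int(len(records), "record_count")],
    )
    yield Output(records)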
dagster.TypeCheck[source]
Event corresponding to a successful typecheck.

Events of this type should be returned by user-defined type checks when they need to encapsulate additional metadata about a type check's success or failure (i.e., when using as_dagster_type(), @usable_as_dagster_type, or the underlying PythonObjectDagsterType() API).

Solid compute functions should generally avoid yielding events of this type to avoid confusion.

Parameters:
success (bool) – True if the type check succeeded, False otherwise.
description (Optional[str]) – A human-readable description of the type check.
metadata_entries (Optional[List[EventMetadataEntry]]) – Arbitrary metadata about the type check.
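For example, a hedged sketch (the type and its rule are hypothetical) of a type check function that returns a TypeCheck carrying metadata rather than a bare bool:

from dagster import DagsterType, EventMetadataEntry, TypeCheck

def positive_int_check(_context, value):
    return TypeCheck(
        success=isinstance(value, int) and value > 0,
        description="Checks that the value is a positive integer.",
        metadata_entries=[EventMetadataEntry.text(repr(value), "observed_value")],
    )

PositiveInt = DagsterType(name="PositiveInt", type_check_fn=positive_int_check)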
dagster.Failure(description=None, metadata_entries=None)[source]
Event indicating solid failure.

Raise events of this type from within solid compute functions or custom type checks in order to indicate an unrecoverable failure in user code to the Dagster machinery and return structured metadata about the failure.

Parameters:
description (Optional[str]) – A human-readable description of the failure.
metadata_entries (Optional[List[EventMetadataEntry]]) – Arbitrary metadata about the failure.
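For example, a sketch (names illustrative) of raising Failure with structured metadata from a solid:

from dagster import EventMetadataEntry, Failure, solid

@solid
def ensure_records(_, records):
    if not records:
        raise Failure(
            description="Upstream produced no records.",
            metadata_entries=[EventMetadataEntry.int(0, "record_count")],
        )
    return records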
Dagster uses lists of metadata entries to communicate arbitrary user-specified metadata about structured events.

dagster.EventMetadataEntry[source]
The standard structure for describing metadata for Dagster events.

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed in Dagit and other tooling.

Parameters:
label (str) – Short display label for this metadata entry.
description (Optional[str]) – A human-readable description of this metadata entry.
entry_data (Union[TextMetadataEntryData, UrlMetadataEntryData, PathMetadataEntryData, JsonMetadataEntryData, MarkdownMetadataEntryData, FloatMetadataEntryData, IntMetadataEntryData]) – Typed metadata entry data. The different types allow for customized display in tools like Dagit.
float(value, label, description=None)[source]
Static constructor for a metadata entry containing a float as FloatMetadataEntryData. For example:

@solid
def emit_metadata_solid(context, df):
    yield AssetMaterialization(
        asset_key="my_dataset",
        metadata_entries=[EventMetadataEntry.float(calculate_bytes(df), "size (bytes)")],
    )

fspath(path, label=None, description=None)[source]
Static constructor for a metadata entry containing a filesystem path as PathMetadataEntryData. For example:

@solid
def emit_metadata_solid(context):
    yield AssetMaterialization(
        asset_key="my_dataset",
        metadata_entries=[EventMetadataEntry.fspath("path/to/file")],
    )

int(value, label, description=None)[source]
Static constructor for a metadata entry containing an int as IntMetadataEntryData. For example:

@solid
def emit_metadata_solid(context, df):
    yield AssetMaterialization(
        asset_key="my_dataset",
        metadata_entries=[EventMetadataEntry.int(len(df), "number of rows")],
    )

json(data, label, description=None)[source]
Static constructor for a metadata entry containing JSON data as JsonMetadataEntryData. For example:

@solid
def emit_metadata_solid(context):
    yield ExpectationResult(
        success=not missing_things,
        label="is_present",
        metadata_entries=[
            EventMetadataEntry.json(
                label="metadata", data={"missing_columns": missing_things},
            )
        ],
    )

md(md_str, label, description=None)[source]
Static constructor for a metadata entry containing markdown data as MarkdownMetadataEntryData. For example:

@solid
def emit_metadata_solid(context, md_str):
    yield AssetMaterialization(
        asset_key="info",
        metadata_entries=[EventMetadataEntry.md(md_str=md_str)],
    )

path(path, label, description=None)[source]
Static constructor for a metadata entry containing a path as PathMetadataEntryData. For example:

@solid
def emit_metadata_solid(context):
    yield AssetMaterialization(
        asset_key="my_dataset",
        metadata_entries=[EventMetadataEntry.path("path/to/file", label="filepath")],
    )

text(text, label, description=None)[source]
Static constructor for a metadata entry containing text as TextMetadataEntryData. For example:

@solid
def emit_metadata_solid(context, df):
    yield AssetMaterialization(
        asset_key="my_dataset",
        metadata_entries=[
            EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")
        ],
    )

url(url, label, description=None)[source]
Static constructor for a metadata entry containing a URL as UrlMetadataEntryData. For example:

@solid
def emit_metadata_solid(context):
    yield AssetMaterialization(
        asset_key="my_dashboard",
        metadata_entries=[
            EventMetadataEntry.url(
                "http://mycoolsite.com/my_dashboard", label="dashboard_url"
            ),
        ],
    )
dagster.JsonMetadataEntryData[source]
Container class for JSON metadata entry data.

Parameters:
data (Optional[Dict[str, Any]]) – The JSON data.

dagster.MarkdownMetadataEntryData[source]
Container class for markdown metadata entry data.

Parameters:
md_str (Optional[str]) – The markdown as a string.

dagster.PathMetadataEntryData[source]
Container class for path metadata entry data.

Parameters:
path (Optional[str]) – The path as a string.

dagster.TextMetadataEntryData[source]
Container class for text metadata entry data.

Parameters:
text (Optional[str]) – The text data.

dagster.UrlMetadataEntryData[source]
Container class for URL metadata entry data.

Parameters:
url (Optional[str]) – The URL as a string.
Dagster uses AssetKey to build an index on Materialization events. Assets materialized with an AssetKey are highlighted in Dagit on the Assets dashboard.

dagster.AssetKey[source]
Object representing the structure of an asset key. Takes in a sanitized string, list of strings, or tuple of strings.

Example usage:

@solid
def emit_metadata_solid(context, df):
    yield AssetMaterialization(
        asset_key=AssetKey('flat_asset_key'),
        metadata_entries=[
            EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")
        ],
    )

@solid
def structured_asset_key_solid(context, df):
    yield AssetMaterialization(
        asset_key=AssetKey(['parent', 'child', 'grandchild']),
        metadata_entries=[
            EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")
        ],
    )

@solid
def structured_asset_key_solid_2(context, df):
    yield AssetMaterialization(
        asset_key=AssetKey(('parent', 'child', 'grandchild')),
        metadata_entries=[
            EventMetadataEntry.text("Text-based metadata for this event", "text_metadata")
        ],
    )

Parameters:
path (str|str[]|str()) – String, list of strings, or tuple of strings. A list of strings represents the hierarchical structure of the asset_key.
Dagster includes facilities for typing the input and output values of solids ("runtime" types).

dagster.Any
Use this type for any input, output, or config field whose type is unconstrained.

All values are considered to be instances of Any.

Examples:

@solid
def identity(_, x: Any) -> Any:
    return x

# Untyped inputs and outputs are implicitly typed Any
@solid
def identity_imp(_, x):
    return x

# Explicitly typed
@solid(
    input_defs=[InputDefinition('x', dagster_type=Any)],
    output_defs=[OutputDefinition(dagster_type=Any)]
)
def identity(_, x):
    return x

@solid(config_schema=Field(Any))
def any_config(context):
    return context.solid_config
dagster.Bool
Use this type for any boolean input, output, or config field. At runtime, this will perform an isinstance(value, bool) check. You may also use the ordinary bool type as an alias.

Examples:

@solid
def boolean(_, x: Bool) -> String:
    return 'true' if x else 'false'

@solid
def empty_string(_, x: String) -> bool:
    return len(x) == 0

# Explicit
@solid(
    input_defs=[InputDefinition('x', dagster_type=Bool)],
    output_defs=[OutputDefinition(dagster_type=String)]
)
def boolean(_, x):
    return 'true' if x else 'false'

@solid(
    input_defs=[InputDefinition('x', dagster_type=String)],
    output_defs=[OutputDefinition(dagster_type=bool)]
)
def empty_string(_, x):
    return len(x) == 0

@solid(config_schema=Field(Bool))
def bool_config(context):
    return 'true' if context.solid_config else 'false'
dagster.Int
Use this type for any integer input or output. At runtime, this will perform an isinstance(value, int) check. You may also use the ordinary int type as an alias.

Examples:

@solid
def add_3(_, x: Int) -> int:
    return x + 3

# Explicit
@solid(
    input_defs=[InputDefinition('x', dagster_type=Int)],
    output_defs=[OutputDefinition(dagster_type=Int)]
)
def add_3(_, x):
    return x + 3
dagster.Float
Use this type for any float input, output, or config value. At runtime, this will perform an isinstance(value, float) check. You may also use the ordinary float type as an alias.

Examples:

@solid
def div_2(_, x: Float) -> float:
    return x / 2

# Explicit
@solid(
    input_defs=[InputDefinition('x', dagster_type=Float)],
    output_defs=[OutputDefinition(dagster_type=float)]
)
def div_2(_, x):
    return x / 2

@solid(config_schema=Field(Float))
def div_y(context, x: Float) -> float:
    return x / context.solid_config
dagster.String
Use this type for any string input, output, or config value. At runtime, this will perform an isinstance(value, str) check. You may also use the ordinary str type as an alias.

Examples:

@solid
def concat(_, x: String, y: str) -> str:
    return x + y

# Explicit
@solid(
    input_defs=[
        InputDefinition('x', dagster_type=String),
        InputDefinition('y', dagster_type=str)
    ],
    output_defs=[OutputDefinition(dagster_type=str)]
)
def concat(_, x, y):
    return x + y

@solid(config_schema=Field(String))
def hello(context) -> str:
    return 'Hello, {friend}!'.format(friend=context.solid_config)
dagster.Nothing
Use this type only for inputs and outputs, in order to establish an execution dependency without communicating a value. Inputs of this type will not be passed to the solid compute function, so it is necessary to use the explicit InputDefinition API to define them rather than the Python 3 type hint syntax.

All values are considered to be instances of Nothing.

Examples:

@solid
def wait(_) -> Nothing:
    time.sleep(1)
    return

@solid(
    input_defs=[InputDefinition('ready', dagster_type=Nothing)]
)
def done(_) -> str:
    return 'done'

@pipeline
def nothing_pipeline():
    done(wait())

# Any value will pass the type check for Nothing
@solid
def wait_int(_) -> Int:
    time.sleep(1)
    return 1

@pipeline
def nothing_int_pipeline():
    done(wait_int())
dagster.Optional
Use this type only for inputs and outputs, if the value can also be None.

Examples:

@solid
def nullable_concat(_, x: String, y: Optional[String]) -> String:
    return x + (y or '')

# Explicit
@solid(
    input_defs=[
        InputDefinition('x', dagster_type=String),
        InputDefinition('y', dagster_type=Optional[String])
    ],
    output_defs=[OutputDefinition(dagster_type=String)]
)
def nullable_concat(_, x, y):
    return x + (y or '')
dagster.List
Use this type for inputs or outputs.

Lists are also the appropriate input types when fanning in multiple outputs using a MultiDependencyDefinition or the equivalent composition function syntax.

Examples:

@solid
def concat_list(_, xs: List[String]) -> String:
    return ''.join(xs)

# Explicit
@solid(
    input_defs=[InputDefinition('xs', dagster_type=List[String])],
    output_defs=[OutputDefinition(dagster_type=String)]
)
def concat_list(_, xs) -> String:
    return ''.join(xs)

# Fanning in multiple outputs
@solid
def emit_1(_) -> int:
    return 1

@solid
def emit_2(_) -> int:
    return 2

@solid
def emit_3(_) -> int:
    return 3

@solid
def sum_solid(_, xs: List[int]) -> int:
    return sum(xs)

@pipeline
def sum_pipeline():
    sum_solid([emit_1(), emit_2(), emit_3()])
dagster.Dict
Use this type for inputs or outputs that are dicts.

For inputs and outputs, you may optionally specify the key and value types using the square brackets syntax for Python typing.

Examples:

@solid
def repeat(_, spec: Dict) -> str:
    return spec['word'] * spec['times']

# Explicit
@solid(
    input_defs=[InputDefinition('spec', dagster_type=Dict)],
    output_defs=[OutputDefinition(String)]
)
def repeat(_, spec):
    return spec['word'] * spec['times']
dagster.Set
Use this type for inputs or outputs that are sets. Alias for typing.Set.

You may optionally specify the inner type using the square brackets syntax for Python typing.

Examples:

@solid
def set_solid(_, set_input: Set[String]) -> List[String]:
    return sorted([x for x in set_input])

# Explicit
@solid(
    input_defs=[InputDefinition('set_input', dagster_type=Set[String])],
    output_defs=[OutputDefinition(List[String])],
)
def set_solid(_, set_input):
    return sorted([x for x in set_input])
dagster.Tuple
Use this type for inputs or outputs that are tuples. Alias for typing.Tuple.

You may optionally specify the inner types using the square brackets syntax for Python typing.

Config values should be passed as a list (in YAML or the Python config dict).

Examples:

@solid
def tuple_solid(_, tuple_input: Tuple[String, Int, Float]) -> List:
    return [x for x in tuple_input]

# Explicit
@solid(
    input_defs=[InputDefinition('tuple_input', dagster_type=Tuple[String, Int, Float])],
    output_defs=[OutputDefinition(List)],
)
def tuple_solid(_, tuple_input):
    return [x for x in tuple_input]
dagster.FileHandle[source]
A reference to a file as manipulated by a FileManager.

Subclasses may handle files that are resident on the local file system, in an object store, or in any arbitrary place where a file can be stored.

This exists to handle the very common case where you wish to write a computation that reads, transforms, and writes files, but where you also want the same code to work in local development as well as on a cluster where the files will be stored in a globally available object store such as S3.

path_desc
A representation of the file path for display purposes only.
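As a hedged sketch (this class is illustrative and not part of dagster), a subclass for files addressed by URL only needs to provide path_desc:

from dagster import FileHandle

class HttpFileHandle(FileHandle):
    """Hypothetical handle for a file served over HTTP."""

    def __init__(self, url):
        self._url = url

    @property
    def path_desc(self):
        # Display-only representation of where the file lives.
        return self._url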
\ndagster.
DagsterType
(type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, materializer=None, serialization_strategy=None, auto_plugins=None, required_resource_keys=None, kind=<DagsterTypeKind.REGULAR: 'REGULAR'>)[source]\u00b6Define a type in dagster. These can be used in the inputs and outputs of solids.
\ntype_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the solid. If it passes, return either\nTrue
or a TypeCheck
with success
set to True
. If it fails,\nreturn either False
or a TypeCheck
with success
set to False
.\nThe first argument must be named context
(or, if unused, _
, _context
, or context_
).\nUse required_resource_keys
for access to resources.
key (Optional[str]) \u2013
The unique key to identify types programatically.\nThe key property always has a value. If you omit key to the argument\nto the init function, it instead receives the value of name
. If\nneither key
nor name
is provided, a CheckError
is thrown.
In the case of a generic type such as List
or Optional
, this is\ngenerated programatically based on the type parameters.
For most use cases, name should be set and the key argument should\nnot be specified.
\nname (Optional[str]) \u2013 A unique name given by a user. If key
is None
, key
\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.
description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.
loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader
and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader
decorator to construct\nthese arguments.
materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer
and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer
\ndecorator to construct these arguments.
serialization_strategy (Optional[SerializationStrategy]) \u2013 An instance of a class that\ninherits from SerializationStrategy
. The default strategy for serializing\nthis value when automatically persisting it between execution steps. You should set\nthis value if the ordinary serialization machinery (e.g., pickle) will not be adequate\nfor this type.
auto_plugins (Optional[List[Type[TypeStoragePlugin]]]) \u2013 If types must be serialized differently\ndepending on the storage being used for intermediates, they should specify this\nargument. In these cases the serialization_strategy argument is not sufficient because\nserialization requires specialized API calls, e.g. to call an S3 API directly instead\nof using a generic file object. See dagster_pyspark.DataFrame
for an example.
required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn
.
is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String
, Int
) to visually distinguish\nthem from user-defined types. Meant for internal use.
kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.
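For example, a hedged sketch (the type and solid are hypothetical) of defining a DagsterType with a bool-returning type_check_fn and using it on a solid input:

from dagster import DagsterType, InputDefinition, solid

NonEmptyList = DagsterType(
    name="NonEmptyList",
    type_check_fn=lambda _context, value: isinstance(value, list) and len(value) > 0,
    description="A list with at least one element.",
)

@solid(input_defs=[InputDefinition("items", NonEmptyList)])
def first_item(_, items):
    return items[0]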
dagster.PythonObjectDagsterType(python_type, key=None, name=None, **kwargs)[source]
Define a type in dagster whose typecheck is an isinstance check.

Specifically, the type can either be a single python type (e.g. int), or a tuple of types (e.g. (int, float)) which is treated as a union.

Examples:

ntype = PythonObjectDagsterType(python_type=int)
assert ntype.name == 'int'
assert_success(ntype, 1)
assert_failure(ntype, 'a')

ntype = PythonObjectDagsterType(python_type=(int, float))
assert ntype.name == 'Union[int, float]'
assert_success(ntype, 1)
assert_success(ntype, 1.5)
assert_failure(ntype, 'a')

Parameters:
python_type (Union[Type, Tuple[Type, ...]]) – The dagster typecheck function calls isinstance on this type.
name (Optional[str]) – Name of the type. Defaults to the name of python_type.
key (Optional[str]) – Key of the type. Defaults to name.
description (Optional[str]) – A markdown-formatted string, displayed in tooling.
loader (Optional[DagsterTypeLoader]) – An instance of a class that inherits from DagsterTypeLoader and can map config data to a value of this type. Specify this argument if you will need to shim values of this type using the config machinery. As a rule, you should use the @dagster_type_loader decorator to construct these arguments.
materializer (Optional[DagsterTypeMaterializer]) – An instance of a class that inherits from DagsterTypeMaterializer and can persist values of this type. As a rule, you should use the @dagster_type_materializer decorator to construct these arguments.
serialization_strategy (Optional[SerializationStrategy]) – An instance of a class that inherits from SerializationStrategy. The default strategy for serializing this value when automatically persisting it between execution steps. You should set this value if the ordinary serialization machinery (e.g., pickle) will not be adequate for this type.
auto_plugins (Optional[List[Type[TypeStoragePlugin]]]) – If types must be serialized differently depending on the storage being used for intermediates, they should specify this argument. In these cases the serialization_strategy argument is not sufficient because serialization requires specialized API calls, e.g. to call an S3 API directly instead of using a generic file object. See dagster_pyspark.DataFrame for an example.
dagster.dagster_type_loader(config_schema, required_resource_keys=None, loader_version=None, external_version_fn=None)[source]
Create a dagster type loader that maps config data to a runtime value.

The decorated function should take the execution context and parsed config value and return the appropriate runtime value.

Parameters:
config_schema (ConfigSchema) – The schema for the config that's passed to the decorated function.
loader_version (str) – (Experimental) The version of the decorated compute function. Two loading functions should have the same version if and only if they deterministically produce the same outputs when provided the same inputs.
external_version_fn (Callable) – (Experimental) A function that takes in the same parameters as the loader function (config_value) and returns a representation of the version of the external asset (str). Two external assets with identical versions are treated as identical to one another.

Examples:

@dagster_type_loader(Permissive())
def load_dict(_context, value):
    return value
dagster.dagster_type_materializer(config_schema, required_resource_keys=None)[source]
Create an output materialization hydration config that configurably materializes a runtime value.

The decorated function should take the execution context, the parsed config value, and the runtime value; it should materialize the runtime value and return an appropriate AssetMaterialization.

Parameters:
config_schema (Any) – The type of the config data expected by the decorated function.

Examples:

# Takes a list of dicts such as might be read in using csv.DictReader, as well as a config
# value (a path), and writes the rows out to that path.
@dagster_type_materializer(str)
def materialize_df(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return AssetMaterialization.file(path)
dagster.usable_as_dagster_type(name=None, description=None, loader=None, materializer=None, serialization_strategy=None, auto_plugins=None)[source]
Decorate a Python class to make it usable as a Dagster Type.

This is intended to make it straightforward to annotate existing business logic classes to make them dagster types whose typecheck is an isinstance check against that python class.

Parameters:
python_type (cls) – The python type to make usable as a dagster type.
name (Optional[str]) – Name of the new Dagster type. If None, the name (__name__) of the python_type will be used.
description (Optional[str]) – A user-readable description of the type.
loader (Optional[DagsterTypeLoader]) – An instance of a class that inherits from DagsterTypeLoader and can map config data to a value of this type. Specify this argument if you will need to shim values of this type using the config machinery. As a rule, you should use the @dagster_type_loader decorator to construct these arguments.
materializer (Optional[DagsterTypeMaterializer]) – An instance of a class that inherits from DagsterTypeMaterializer and can persist values of this type. As a rule, you should use the @dagster_type_materializer decorator to construct these arguments.
serialization_strategy (Optional[SerializationStrategy]) – An instance of a class that inherits from SerializationStrategy. The default strategy for serializing this value when automatically persisting it between execution steps. You should set this value if the ordinary serialization machinery (e.g., pickle) will not be adequate for this type.
auto_plugins (Optional[List[TypeStoragePlugin]]) – If types must be serialized differently depending on the storage being used for intermediates, they should specify this argument. In these cases the serialization_strategy argument is not sufficient because serialization requires specialized API calls, e.g. to call an S3 API directly instead of using a generic file object. See dagster_pyspark.DataFrame for an example.

Examples:

# dagster_aws.s3.file_manager.S3FileHandle
@usable_as_dagster_type
class S3FileHandle(FileHandle):
    def __init__(self, s3_bucket, s3_key):
        self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')
        self._s3_key = check.str_param(s3_key, 's3_key')

    @property
    def s3_bucket(self):
        return self._s3_bucket

    @property
    def s3_key(self):
        return self._s3_key

    @property
    def path_desc(self):
        return self.s3_path

    @property
    def s3_path(self):
        return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)
dagster.make_python_type_usable_as_dagster_type(python_type, dagster_type)[source]
Take any existing python type and map it to a dagster type (generally created with DagsterType). This can only be called once on a given python type.
dagster.check_dagster_type(dagster_type, value)[source]
Test a custom Dagster type.

Parameters:
dagster_type (Any) – The Dagster type to test. Should be one of the built-in types, a dagster type explicitly constructed with as_dagster_type(), @usable_as_dagster_type, or PythonObjectDagsterType(), or a Python type.
value (Any) – The runtime value to test.

Returns: The result of the type check.

Examples:

assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success
dagster.file_relative_path(dunderfile, relative_path)[source]
This function is useful when one needs to load a file that is relative to the position of the current file (such as when you encode a configuration file path in a source file and want it to be runnable from any current working directory).

It is meant to be used like the following:

file_relative_path(__file__, 'path/relative/to/file')
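For instance, a small sketch (the file name is hypothetical) of loading a config file that sits next to the calling module:

from dagster import file_relative_path

# Resolves relative to this source file, regardless of the current working directory.
config_path = file_relative_path(__file__, "run_config.yaml")
with open(config_path) as fd:
    print(fd.read())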