diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 809191f..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,13 +0,0 @@ -[bumpversion] -current_version = 0.9.14 -commit = False -parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)((?P<release>.*))? -serialize = - {major}.{minor}.{patch}{release} - {major}.{minor}.{patch} -files = disdat/VERSION - -[bumpversion:part:release] -optional_value = production -values = - production diff --git a/MANIFEST.in b/MANIFEST.in index 6db666f..b255d80 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,8 @@ -include disdat/infrastructure/dockerizer/Makefile -include disdat/infrastructure/dockerizer/config.mk.template -include disdat/infrastructure/dockerizer/config/* include LICENSE.txt +include NOTICE.txt include setup.py include disdat/VERSION include disdat/resources/* recursive-include disdat/config * -recursive-include disdat/infrastructure/dockerizer/context.template * global-exclude *~ *.pyc \ No newline at end of file diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 0000000..6b43cfe --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,12 @@ +Disdat + +The core Disdat API, including the core Bundle data type, unified remote/local link management, +partial localization, and myriad other features (in support of instrumenting 3rd-party systems, +e.g., Kubeflow Pipelines, Luigi, and Step Functions), was developed by Ken Yocum. +Copyright 2018 - 2022 Kenneth Yocum. All Rights Reserved. + +Contributions to the original dockerizer as well as initial AWS Batch support were made by Theodore Wong. +Copyright 2015-2017 Theodore Wong, All Rights Reserved. + +The Initial Developer of some parts of this framework is Human Longevity, Inc. +Copyright 2015 - 2017 Human Longevity, Inc. All Rights Reserved. \ No newline at end of file diff --git a/build-dist.sh b/build-dist.sh index f7a9be1..36c4f9b 100755 --- a/build-dist.sh +++ b/build-dist.sh @@ -2,26 +2,12 @@ echo "Building Disdat package for local installation or PyPi . . ." -# Bump version up -- Can use release or patch or major or minor -# bumpversion --dry-run --verbose release disdat/VERSION - -# Now bump version for real -# and git commit -am "" +# Use git to tag the release with the semver you wish # git tag -# Remove the prior tar ball from the context.template -rm -rf disdat/infrastructure/dockerizer/context.template/disdat-*.tar.gz -rm -rf dist/disdat-*.tar.gz - # Create a new sdist python setup.py sdist -# Copy over to the context.template. -cp dist/disdat-*.tar.gz disdat/infrastructure/dockerizer/context.template/. - -# Create a new sdist that will have that tar.gz in the template -python setup.py sdist - # publish to test pypi if false; then echo "Uploading to PYPI test and real" diff --git a/disdat/VERSION b/disdat/VERSION deleted file mode 100644 index 6d44d22..0000000 --- a/disdat/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.9.14 diff --git a/disdat/add.py b/disdat/add.py index 1ddaff3..513ff93 100644 --- a/disdat/add.py +++ b/disdat/add.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/disdat/api.py b/disdat/api.py index c48e84c..4dd994c 100644 --- a/disdat/api.py +++ b/disdat/api.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016, ... Human Longevity, Inc.
-# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,39 +12,29 @@ # limitations under the License. # - """ A Disdat API for creating, publishing, and finding bundles. -These calls are not thread safe. If they operate on a context, they -require the user to specify the context. This is unlike the CLI that maintains state on disk -that keeps track of your current context between calls. The API won't change the CLI's context and vice versa. +These calls are not thread safe. If they operate on a context, they require the user to specify the context. +This is unlike the CLI, which maintains state on disk to keep track of your current context between calls. +The API won't change the CLI's context and vice versa. -Author: Kenneth Yocum """ -from __future__ import print_function import os -import json import shutil import getpass -import warnings import errno import hashlib import collections import urllib -import disdat.apply # <--- imports api, which imports apply, etc. -import disdat.run import disdat.fs import disdat.common as common -from disdat.pipe_base import PipeBase from disdat.data_context import DataContext from disdat.utility.aws_s3 import s3_path_exists -from disdat.hyperframe import HyperFrameRecord, LineageRecord -from disdat.run import Backend, run_entry -from disdat.dockerize import dockerize_entry +from disdat.hyperframe import HyperFrameRecord, LineageRecord, parse_return_val from disdat import logger as _logger PROC_ID_TRUNCATE_HASH = 10 # 10 ls hex digits @@ -93,7 +81,7 @@ def __init__(self, There are three ways to create bundles: 1.) Create a bundle with a single call. Must include a data field! - b = api.Bundle('examples', name='propensity_model',owner='kyocum',data='/Users/kyocum/model.tgz') + b = api.Bundle('examples', name='propensity_model',owner='fred',data='/Users/fred/model.tgz') 2.) Create a bundle using a context manager. The initial call requires only a context. with api.Bundle('examples') as b: @@ -218,7 +206,13 @@ def abandon(self): """ self._check_open() _logger.debug(f"Disdat api abandon bundle obj [{id(self)}] process[{os.getpid()}] uuid[{self.uuid}]") - PipeBase.rm_bundle_dir(self._local_dir, self.uuid) + try: + shutil.rmtree(self._local_dir, ignore_errors=True) + os.rmdir(self._local_dir) + # TODO: if people create s3 files, s3 file targets, inside of an s3 context, + # TODO: then we will have to clean those up as well. + except IOError as why: + _logger.error("Removal of bundle directory {} failed with error {}. Continuing removal...".format(self._local_dir, why)) def _check_open(self): assert not self._closed, "Bundle must be open (not closed) for editing." @@ -554,7 +548,7 @@ def extract_human_name(code_ref): return code_ref.split('.')[-1] try: - presentation, frames = PipeBase.parse_return_val(self.uuid, self._data, self.data_context) + presentation, frames = parse_return_val(self.uuid, self._data, self.data_context) self.add_frames(frames) self.pb.presentation = presentation assert self.uuid != '', "Disdat API Error: Cannot close a bundle without a UUID."
@@ -1335,217 +1329,6 @@ def pull(local_context, bundle_name=None, uuid=None, localize=False): fs.pull(human_name=bundle_name, uuid=uuid, localize=localize, data_context=data_context) -def apply(local_context, transform, output_bundle='-', - input_tags=None, output_tags=None, force=False, - force_all=False, params=None, - output_bundle_uuid=None, central_scheduler=False, workers=1, - incremental_push=False, incremental_pull=False): - """ Execute a Disdat pipeline natively on the local machine. Note that `api.run` will execute - a Disdat pipeline that has been dockerized (either locally or remotely on AWS Batch or AWS Sagemaker) - - Args: - local_context (str): The name of the local context in which the pipeline will run in the container - transform (type[disdat.pipe.PipeTask]): A reference to the Disdat Pipe class - output_bundle (str): The name of the output bundle. Defaults to `_` - input_tags: optional tags dictionary for selecting input bundle - output_tags: optional tags dictionary to tag output bundle - force (bool): Force re-running this transform, default False - force_all (bool): Force re-running ALL transforms, default False - params: optional parameters dictionary - output_bundle_uuid: Force UUID of output bundle - central_scheduler (bool): Use a central scheduler, default False, i.e., use local scheduler - workers (int): Number of workers, default 1. - incremental_push (bool): commit and push task bundles as they complete - incremental_pull (bool): localize bundles from remote as they are required by downstream tasks - - Returns: - result (int): 0 success, >0 if issue - - """ - - # check for deprecated str input for transform - if isinstance(transform, str): - msg = ('PipeTask classes should be passed as references, not strings, ' - 'support for string inputs will be removed in future versions') - warnings.warn(msg, DeprecationWarning) - transform = common.load_class(transform) - - data_context = _get_context(local_context) - - if input_tags is None: - input_tags = {} - - if output_tags is None: - output_tags = {} - - if params is None: - params = {} - - # IF apply raises, let it go up. - # If API, caller can catch. - # If CLI, python will exit 1 - result = disdat.apply.apply(output_bundle, params, transform, - input_tags, output_tags, force, force_all, - output_bundle_uuid=output_bundle_uuid, - central_scheduler=central_scheduler, - workers=workers, - data_context=data_context, - incremental_push=incremental_push, - incremental_pull=incremental_pull) - - # If no raise, but luigi says not successful - # If API (here), then raise for caller to catch. - # For CLI, we exit with 1 - common.apply_handle_result(result, raise_not_exit=True) - - return result - - -def run(setup_dir, - local_context, - pipe_cls, - pipeline_args=None, - output_bundle='-', - remote_context=None, - remote_s3_url=None, - backend=Backend.default(), - input_tags=None, - output_tags=None, - force=False, - force_all=False, - pull=None, - push=None, - no_push_int=False, - vcpus=2, - memory=4000, - workers=1, - no_submit=False, - aws_session_token_duration=42300, - job_role_arn=None): - """ Execute a pipeline in a container. Run locally, on AWS Batch, or AWS Sagemaker - - Simplest execution is with a setup directory (that contains your setup.py), the local context in which to - execute, and the pipeline to run. By default this call runs the container locally, reading and writing data only - to the local context. 
- - By default this call will assume the remote_context and remote_s3_url of the local context on this system. - Note that the user must provide both the remote_context and remote_s3_url to override the remote context bound - to the local context (if any). - - Args: - setup_dir (str): The directory that contains the setup.py holding the requirements for any pipelines - local_context (str): The name of the local context in which the pipeline will run in the container - pipe_cls (str): The pkg.module.class of the root of the pipeline DAG - pipeline_args (dict): Dictionary of the parameters of the root task - output_bundle (str): The human name of output bundle - remote_context (str): The remote context to pull / push bundles during execution. Default is `local_context` - remote_s3_url (str): The remote's S3 path - backend : Backend.Local | Backend.AWSBatch. Default Backend.local - input_tags (dict): str:str dictionary of tags required of the input bundle - output_tags (dict): str:str dictionary of tags placed on all output bundles (including intermediates) - force (bool): Re-run the last pipe task no matter prior outputs - force_all (bool): Re-run the entire pipeline no matter prior outputs - pull (bool): Pull before execution. Default if Backend.Local then False, else True - push (bool): Push output bundles to remote. Default if Backend.Local then False, else True - no_push_int (bool): Do not push intermediate task bundles after execution. Default False - vcpus (int): Number of virtual CPUs (if backend=`AWSBatch`). Default 2. - memory (int): Number of MB (if backend='AWSBatch'). Default 2000. - workers (int): Number of Luigi workers. Default 1. - no_submit (bool): If True, just create the AWS Batch Job definition, but do not submit the job - aws_session_token_duration (int): Seconds lifetime of temporary token (backend='AWSBatch'). Default 42300 - job_role_arn (str): AWS ARN for job execution in a batch container (backend='AWSBatch') - - Returns: - json (str): - - """ - - pipeline_arg_list = [] - if pipeline_args is not None: - for k,v in pipeline_args.items(): - pipeline_arg_list.append(k) - pipeline_arg_list.append(json.dumps(v)) - - # Set up context as 'remote_name/local_name' - if remote_context is None: - assert remote_s3_url is None, "disdat.api.run: user must specify both remote_s3_url and remote_context" - context = local_context - else: - assert remote_s3_url is not None, "disdat.api.run: user must specify both remote_s3_url and remote_context" - context = "{}/{}".format(remote_context, local_context) - - retval = run_entry(output_bundle=output_bundle, - pipeline_root=setup_dir, - pipeline_args=pipeline_arg_list, - pipe_cls=pipe_cls, - backend=backend, - input_tags=input_tags, - output_tags=output_tags, - force=force, - force_all=force_all, - context=context, - remote=remote_s3_url, - pull=pull, - push=push, - no_push_int=no_push_int, - vcpus=vcpus, - memory=memory, - workers=workers, - no_submit=no_submit, - job_role_arn=job_role_arn, - aws_session_token_duration=aws_session_token_duration) - - return retval - - -def dockerize(setup_dir, - config_dir=None, - build=True, - push=False, - sagemaker=False): - """ Create a docker container image using a setup.py and pkg.module.class description of the pipeline. - - Note: - Users set the os_type and os_version in the disdat.cfg file. 
- os_type: The base operating system type for the Docker image - os_version: The base operating system version for the Docker image - - Args: - setup_dir (str): The directory that contains the setup.py holding the requirements for any pipelines - config_dir (str): The directory containing the configuration of .deb packages - build (bool): If False, just copy files into the Docker build context without building image. - push (bool): Push the container to the repository - sagemaker (bool): Create a Docker image executable as a SageMaker container (instead of a local / AWSBatch container). - - Returns: - (int): 0 if success, 1 on failure - - """ - - retval = dockerize_entry(pipeline_root=setup_dir, - config_dir=config_dir, - os_type=None, - os_version=None, - build=build, - push=push, - sagemaker=sagemaker - ) - - return retval - - -def dockerize_get_id(setup_dir): - """ Retrieve the docker container image identifier - - Args: - setup_dir (str): The directory that contains the setup.py holding the requirements for any pipelines - - Returns: - (str): The full docker container image hash - """ - return dockerize_entry(pipeline_root=setup_dir, get_id=True) - - def helper_get_files_in_dir(dir): """ Return all files under this directory. If no scheme, assume local files. diff --git a/disdat/apply.py b/disdat/apply.py deleted file mode 100644 index dfd4a41..0000000 --- a/disdat/apply.py +++ /dev/null @@ -1,347 +0,0 @@ -# -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -apply - -API for executing a pipe - -pipes apply output_bundle pipes_cls - -author: Kenneth Yocum -""" -from __future__ import print_function - -import sys -import os -import argparse -import multiprocessing - -import luigi.task_register -from luigi import build -from luigi.execution_summary import LuigiStatusCode, _partition_tasks - -import disdat.common as common # config, especially logging, before luigi ever loads -import disdat.fs as fs -from disdat import logger as _logger - - -def apply(output_bundle, pipe_params, pipe_cls, input_tags, output_tags, force, force_all, - output_bundle_uuid=None, central_scheduler=False, workers=1, data_context=None, - incremental_push=False, incremental_pull=False): - """ - Given an input bundle, run the pipesline on the bundle. - Note, we first make a copy of all tasks that are parameterized identically to the tasks we will run. - This is so we can figure out what we will need to re-run. - This is why we make a single uuid for the output bundle of apply (for the driver). - - Args: - output_bundle: The new bundle to be created - pipe_params (dict): mapping of parameter name to Luigi Task parameter value - pipe_cls (type[disdat.pipe.PipeTask]): reference to the task class - force: force recomputation of the last user task. 
- force_all: force recomputation of dependencies - input_tags (dict): Tags used to find the input bundle - output_tags (dict): Tags that need to be placed on the output bundle - force_all (bool): whether to re-run this pipe - output_bundle_uuid (str): Optionally specify exactly the UUID of the output bundle IFF we actually need to produce it - central_scheduler: Use a centralized Luigi scheduler (default False, i.e., --local-scheduler is used) - workers: The number of luigi workers to use for this workflow (default 1) - data_context: Actual context object or None and read current context. - incremental_push (bool): Whether this job should push tasks as they complete to the remote (if configured) - incremental_pull (bool): Whether this job should localize bundles as needed from the remote (if configured) - - Returns: - bool: True if there were no failed tasks and no failed schedulings (missing external dependencies) - """ - - _logger.debug("pipe_cls {}".format(pipe_cls)) - _logger.debug("pipe params: {}".format(pipe_params)) - _logger.debug("force: {}".format(force)) - _logger.debug("force_all: {}".format(force_all)) - _logger.debug("input tags: {}".format(input_tags)) - _logger.debug("output tags: {}".format(output_tags)) - _logger.debug("sys.path {}".format(sys.path)) - _logger.debug("central_scheduler {}".format(central_scheduler)) - _logger.debug("workers {}".format(workers)) - _logger.debug("incremental_push {}".format(incremental_push)) - _logger.debug("incremental_pull {}".format(incremental_pull)) - - if incremental_push: - _logger.warn("incremental_push {}".format(incremental_push)) - - if incremental_pull: - _logger.warn("incremental_pull {}".format(incremental_pull)) - - pfs = fs.DisdatFS() - - if data_context is None: - if not pfs.in_context(): - _logger.warning('Not in a data context') - return None - data_context = pfs.curr_context - - # Increment the reference count for this process - apply.reference_count += 1 - - # If we are using Fork, we cannot have internal apply's with workers > 1 - # otherwise luigi loops forever with "There are no more tasks to run" and " is currently run by worker" - # This happens with Vanilla luigi in fork mode. In <=P37, MP fork for OS X is the default - # in >=P38, Spawn is the default. - if apply.reference_count > 1: - if multiprocessing.get_start_method() == 'fork': - workers = 1 - - def cleanup_cached_state(): - """ - After running, decrement our reference count (which tells how many simultaneous apply methods are - running nested in this process. Once the last one completes, blow away the luigi instance cache and git hash. - Needed if we're run twice (from scratch) in the same process. Otherwise, on the next run, we could find - the same class instances, with the old cached_output_bundle fields set. - """ - apply.reference_count -= 1 - if not apply.reference_count: - fs.DisdatFS().clear_pipe_version() - luigi.task_register.Register.clear_instance_cache() - - # Only pass data_context name, not reference to the pipe class - # data contexts may have open sql connections and other state - # that is not ForingPickler safe. - data_context_name = data_context.get_local_name() - - # Re-execute logic -- make copy of task DAG - # Creates a cache of {pipe:path_cache_entry} in the pipesFS object. - # This "task_path_cache" is used throughout execution to find output bundles. 
- dag = create_users_task(pipe_cls, pipe_params, output_bundle, - output_bundle_uuid, force_all, output_tags, - data_context_name, incremental_push, incremental_pull) - - # Get version information for pipeline - pipeline_path = os.path.dirname(sys.modules[dag.__module__].__file__) - fs.DisdatFS().get_pipe_version(pipeline_path) - - # If the user just wants to re-run this task, use mark_force - if force: - dag.mark_force() - - # Will be LuigiRunResult - status = build([dag], local_scheduler=not central_scheduler, workers=workers, detailed_summary=True) - success = False - if status.status == LuigiStatusCode.SUCCESS: - success = True - task_sets = _partition_tasks(status.worker) - did_work = len(task_sets['completed']) > 0 - - cleanup_cached_state() - - return {'success': success, 'did_work': did_work} - - -# Add a reference count to apply, so we can determine when to clean up the path_cache -apply.reference_count = 0 - -def create_users_task(pipe_cls, - pipe_params, - root_bundle_name, - forced_output_bundle_uuid, - force, - output_tags, - data_context_name, - incremental_push, - incremental_pull): - """ - Create the users task - - Every apply or run is logically a single execution and produces a set of outputs. Each - task produces a single bundle represented as a hyperframe internally. - - Args: - pipe_cls (disdat.Pipe): The user's Pipe class - pipe_params: parameters for this pipeline - root_bundle_name (str): user set output bundle name for last task - forced_output_bundle_uuid (str): user set output bundle uuid for last task, else None - force (bool): force re-run of entire pipeline - output_tags (dict): str,str tag dict - data_context_name (str): name in which pipe will run - incremental_push (bool): push bundle when pipe finishes - incremental_pull (bool): pull non-localized bundles before execution - - Returns: - `disdat.Pipe` - """ - - # Force root task to take an explicit bundle name? - if root_bundle_name == '-': - root_bundle_name = None - - task_params = {'is_root_task': True, - 'root_output_bundle_name': root_bundle_name, - 'forced_output_bundle_uuid': forced_output_bundle_uuid, - 'force': force, - 'output_tags': output_tags, - 'data_context_name': data_context_name, - 'incremental_push': incremental_push, - 'incremental_pull': incremental_pull - } - - # Get user pipeline parameters for this Pipe / Luigi Task - if pipe_params: - task_params.update(pipe_params) - - # Instantiate and return the class directly with the parameters - # Instance caching is taken care of automatically by luigi - return pipe_cls(**task_params) - - -def different_code_versions(code_version, lineage_obj): - """ - Given the current version, see if it is different than found_version - Note, if either version is dirty, we are forced to say they are different - - Typically we get the code_version from the pipe and the lineage object from the - bundle. We then see if the current code == the information in lineage object. 
- - Args: - current_version (CodeVersion) : - lineage_obj (LineageObject): - - Returns: - - """ - - conf = common.DisdatConfig.instance() - - if conf.ignore_code_version: - return False - - # If there were uncommitted changes, then we have to re-run, mark as different - if code_version.dirty: - return True - - if code_version.semver != lineage_obj.pb.code_semver: - return True - - if code_version.hash != lineage_obj.pb.code_hash: - return True - - ## Currently ignoring tstamp, branch, url - ## CodeVersion = collections.namedtuple('CodeVersion', 'semver hash tstamp branch url dirty') - - return False - - -def new_output_bundle(pipe, data_context, force_uuid=None): - """ - This proposes a new output bundle - 1.) Create a new UUID - 2.) Create the directory in the context - 3.) Add this to the path cache - - Note: We don't add to context's db yet. The job or pipe hasn't run yet. So it - hasn't made all of its outputs. If it fails, by definition it won't right out the - hframe to the context's directory. On rebuild / restart we will delete the directory. - However, the path_cache will hold on to this directory in memory. - - Args: - pipe (`disdat.pipe.PipeTask`): The task generating this output - data_context (`disdat.data_context.DataContext`): Place output in this context - force_uuid (str): Override uuid chosen by Disdat Bundle API - - Returns: - None - """ - import disdat.api as api # 3.7 allows us to put this import at the top, but not 3.6.8 - pce = PathCache.get_path_cache(pipe) - - if pce is None: - _logger.debug("new_output_bundle: Adding a new (unseen) task to the path cache.") - else: - _logger.debug("new_output_bundle: Found a task in our dag already in the path cache: reusing!") - return - - b = api.Bundle(data_context).open(force_uuid=force_uuid) - - PathCache.put_path_cache(pipe, b, b.uuid, b.local_dir, True) - - -def cli_apply(args): - """ - Parse and prepare strings from argparse arguments into suitable Python objects - to call the api's version of apply. Note, args.pipe_cls is already a cls object. - Most of the work here is to deser each input parameter value according to its - Luigi definition. - - Parameters: - disdat_config: - args: - - Returns: - None - """ - - if not fs.DisdatFS().in_context(): - print("Apply unavailable -- Disdat not in a valid context.") - return - - # Create a dictionary of str->str arguments to str->python objects deser'd by Luigi Parameters - deser_user_params = common.parse_params(args.pipe_cls, args.params) - - input_tags = common.parse_args_tags(args.input_tag) - - output_tags = common.parse_args_tags(args.output_tag) - - # NOTE: sysexit=False is required for us to pass a data_context object through luigi tasks. - # Else we build up arguments as strings to run_with_retcodes(). And it crashes because the data_context is - # not a string. - result = apply(args.output_bundle, deser_user_params, args.pipe_cls, input_tags, output_tags, - args.force, args.force_all, - central_scheduler=args.central_scheduler, - workers=args.workers, - incremental_push=args.incremental_push, - incremental_pull=args.incremental_pull) - - # If we didn't successfully run any task, sys.exit with non-zero code - common.apply_handle_result(result) - - -def add_arg_parser(subparsers): - """Initialize a command line set of subparsers with file system commands. - - Args: - subparsers: A collection of subparsers as defined by `argsparse`. 
- """ - - apply_p = subparsers.add_parser('apply', - description="Apply a transform to an input bundle to produce an output bundle.") - apply_p.add_argument('-cs', '--central-scheduler', action='store_true', default=False, - help="Use a central Luigi scheduler (defaults to local scheduler)") - apply_p.add_argument('-w', '--workers', type=int, default=1, help="Number of Luigi workers on this node") - apply_p.add_argument('-it', '--input-tag', nargs=1, type=str, action='append', - help="Input bundle tags: '-it authoritative:True -it version:0.7.1'") - apply_p.add_argument('-ot', '--output-tag', nargs=1, type=str, action='append', - help="Output bundle tags: '-ot authoritative:True -ot version:0.7.1'") - apply_p.add_argument('-o', '--output-bundle', type=str, default='-', - help="Name output bundle: '-o my.output.bundle'. Default name is '_'") - apply_p.add_argument('-f', '--force', action='store_true', help="Force re-computation of only this task.") - apply_p.add_argument('--force-all', action='store_true', help="Force re-computation of ALL upstream tasks.") - apply_p.add_argument('--incremental-push', action='store_true', - help="Commit and push each task's bundle as it is produced to the remote.") - apply_p.add_argument('--incremental-pull', action='store_true', - help="Localize bundles as they are needed by downstream tasks from the remote.") - apply_p.add_argument('pipe_cls', type=common.load_class, help="User-defined transform, e.g., 'module.PipeClass'") - apply_p.add_argument('params', type=str, nargs=argparse.REMAINDER, - help="Optional set of parameters for this pipe '--parameter value'") - apply_p.set_defaults(func=lambda args: cli_apply(args)) diff --git a/disdat/common.py b/disdat/common.py index 621a243..bb0f8e3 100644 --- a/disdat/common.py +++ b/disdat/common.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,22 +11,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" -Configuration -""" import logging import os import sys import shutil -import importlib -import subprocess import uuid +import importlib -import luigi from six.moves import urllib from six.moves import configparser -import six from disdat import resource import disdat.config @@ -38,7 +30,6 @@ SYSTEM_CONFIG_DIR = '~/.config/disdat' PACKAGE_CONFIG_DIR = 'disdat' LOGGING_FILE = 'logging.conf' -LUIGI_FILE = 'luigi.cfg' CFG_FILE = 'disdat.cfg' META_DIR = '.disdat' DISDAT_CONTEXT_DIR = 'context' # ~/.disdat/context/ @@ -50,22 +41,6 @@ BUNDLE_TAG_TRANSIENT = '__transient' BUNDLE_TAG_PUSH_META = '__push_meta' -LOCAL_EXECUTION = 'LOCAL_EXECUTION' # Docker endpoint env variable if we're running a container locally - - -class ApplyError(Exception): - def __init__(self, message, apply_result): - super(ApplyError, self).__init__(message) - self.apply_result = apply_result - @property - def result(self): - return self.apply_result - - -class ExtDepError(Exception): - def __init__(self, message): - super(ExtDepError, self).__init__(message) - class CatNoBundleError(Exception): def __init__(self, message): @@ -82,32 +57,6 @@ def error(msg, *args, **kwargs): sys.exit(1) -def apply_handle_result(apply_result, raise_not_exit=False): - """ Execute an appropriate sys.exit() call based on the dictionary - returned by apply. 
- - Args: - apply_result(dict): Has keys 'success' and 'did_work' that give Boolean values. - raise_not_exit (bool): Raise ApplyException instead of performing sys.exit - - Returns: - None - - """ - - if apply_result['success']: - if raise_not_exit: - pass - else: - sys.exit(None) # None yields exit value of 0 - else: - error_str = "Disdat Apply ran, but one or more tasks failed or missing dependencies." - if raise_not_exit: - raise ApplyError(error_str, apply_result) - else: - sys.exit(error_str) - - class SingletonType(type): def __call__(self, *args, **kwargs): try: @@ -117,10 +66,6 @@ def __call__(self, *args, **kwargs): return self.__instance -class MySingleton(object): - __metaclass__ = SingletonType - - class DisdatConfig(object): """ Configure Disdat. Configure logging. @@ -134,7 +79,7 @@ def __init__(self, meta_dir_root=None, config_dir=None): Args: meta_dir_root (str): Optional place to store disdat contexts. Default `~/` - config_dir (str): Optional directory from which to get disdat.cfg and luigi.cfg. Default SYSTEM_CONFIG_DIR + config_dir (str): Optional directory from which to get disdat.cfg and (optionally) luigi.cfg. Default SYSTEM_CONFIG_DIR """ # Find configuration directory @@ -149,16 +94,14 @@ def __init__(self, meta_dir_root=None, config_dir=None): 'Call "dsdt init" to initialize Disdat.' ) - # Extract individual configuration files disdat_cfg = os.path.join(config_dir, CFG_FILE) - luigi_cfg = os.path.join(config_dir, LUIGI_FILE) if meta_dir_root: self.meta_dir_root = meta_dir_root else: self.meta_dir_root = '~/' self.logging_config = None - self.parser = self._read_configuration_file(disdat_cfg, luigi_cfg) + self.parser = self._read_configuration_file(disdat_cfg) @staticmethod def instance(meta_dir_root=None, config_dir=None): @@ -167,7 +110,7 @@ def instance(meta_dir_root=None, config_dir=None): Args: meta_dir_root (str): Optional place to store disdat contexts. Default `~/` - config_dir (str): Optional directory from which to get disdat.cfg and luigi.cfg. Default SYSTEM_CONFIG_DIR + config_dir (str): Optional directory from which to get disdat.cfg and (optional) luigi.cfg. Default SYSTEM_CONFIG_DIR """ if DisdatConfig._instance is None: DisdatConfig._instance = DisdatConfig(meta_dir_root=meta_dir_root, config_dir=config_dir) @@ -179,9 +122,9 @@ def _fix_relative_path(config_file, to_fix_path): return os.path.join(os.path.dirname(config_file), to_fix_path) return to_fix_path - def _read_configuration_file(self, disdat_config_file, luigi_config_file): + def _read_configuration_file(self, disdat_config_file): """ - Check for environment varialbe 'DISDAT_CONFIG_PATH' -- should point to disdat.cfg + Check for environment variable 'DISDAT_CONFIG_PATH' -- should point to disdat.cfg Paths in the config might be relative. If so, add the prefix to them. Next, see if there is a disdat.cfg in cwd. Then configure disdat and (re)configure logging. 
""" @@ -192,13 +135,10 @@ def _read_configuration_file(self, disdat_config_file, luigi_config_file): self.meta_dir_root = DisdatConfig._fix_relative_path(disdat_config_file, self.meta_dir_root) self.ignore_code_version = config.getboolean('core', 'ignore_code_version') - # Set up luigi configuration - luigi.configuration.get_config().read(luigi_config_file) - # Tell everything to push warnings through the logging infrastructure logging.captureWarnings(True) - # unfortunately that's not enough -- kill all luigi (and disdat) warnings + # unfortunately that's not enough -- kill all warnings import warnings warnings.filterwarnings("ignore") @@ -238,63 +178,6 @@ def init(): dst = directory shutil.copytree(src, dst) - # Make sure paths are absolute in luigi config - luigi_dir = os.path.join(directory, LUIGI_FILE) - config = configparser.ConfigParser() - config.read(luigi_dir) - with open(luigi_dir, 'w') as handle: - config.write(handle) - -# -# subprocess wrapper -# - - -def do_subprocess(cmd, cli): - """ Standardize error processing - - Args: - cmd (str): command to execute - cli (bool): whether called from CLI (True) or API (False) - - Returns: - (int): 0 if success, >0 if failure - - """ - output = 'No captured output from running CMD [{}]'.format(cmd) - try: - if not cli: - output = subprocess.check_output(cmd) - else: - subprocess.check_call(cmd) - except subprocess.CalledProcessError as cpe: - if not cli: - print (output) - return cpe.returncode - raise - - return 0 - - -def do_subprocess_with_output(cmd): - """ Standardize error processing - - Args: - cmd (str): command to execute - - Returns: - (str): output of command - - """ - try: - output = subprocess.check_output(cmd) - return output - except subprocess.CalledProcessError as cpe: - raise - -# -# One place to update all the uuid's made by Disdat -# def create_uuid(): """ @@ -307,149 +190,6 @@ def create_uuid(): return str(uuid.uuid4()) -# -# Make Docker images names from pipeline class names -# - - -def make_project_image_name(setup_file_path): - """ - Create a container name from the name field in the setup.py file. - This uses some setuptools magic. When you install disdat, we install - an entrypoint. It becomes available to anyone using setup tools. - This extracts the information from the setup.py. - - see disdat/infrastructure/dockerizer/setup_tools_commands.py - - Args: - setup_file_path (str): The FQP to the setup.py file used to dockerize. - - Returns: - (str): image name string - - """ - - python_command = [ - 'python', - setup_file_path, - '-q', - 'dsdt_distname' - ] - - retval = do_subprocess_with_output(python_command).strip() - - # If P3, this may be a byte array. If P2, if not unicode, convert ... 
- retval = six.ensure_str(retval) - - return retval - - -def make_sagemaker_project_image_name(setup_file_path): - """ Create the string for the image for this pipeline if it uses sagemaker's - calling convention - - Args: - setup_file_path (str): The FQP to the setup.py file used to dockerize - - Returns: - str: The name of the image + '-sagemaker' - """ - - return make_project_image_name(setup_file_path) + "-sagemaker" - - -def make_project_repository_name(docker_repository_prefix, setup_file_path): - return '/'.join(([docker_repository_prefix.strip('/')] if docker_repository_prefix is not None else []) + [make_project_image_name(setup_file_path)]) - - -def make_sagemaker_project_repository_name(docker_repository_prefix, setup_file_path): - return '/'.join(([docker_repository_prefix.strip('/')] if docker_repository_prefix is not None else []) + [make_sagemaker_project_image_name(setup_file_path)]) - - -# -# Make run commands -# - -def get_run_command_parameters(pfs): - remote = pfs.curr_context.remote_ctxt_url - if remote is not None: - remote = remote.replace('/{}'.format(DISDAT_CONTEXT_DIR), '') - local_ctxt = "{}/{}".format(pfs.curr_context.remote_ctxt, pfs.curr_context.local_ctxt) - else: - local_ctxt = "{}".format(pfs.curr_context.local_ctxt) - return remote, local_ctxt - - -def make_run_command( - output_bundle, - output_bundle_uuid, - pipe_cls, - remote, - context, - input_tags, - output_tags, - force, - force_all, - no_pull, - no_push, - no_push_int, - workers, - pipeline_params -): - """ Create a list of args. Note that for execution via run, we always set - --output-bundle, even though it is optional. The CLI and API will place a '-' - if the user does not specify it, which means use the default output bundle name. Here - we make sure to pass it through. - - Args: - output_bundle: - output_bundle_uuid: - pipe_cls: - remote: - context: - input_tags: - output_tags: - force: - force_all: - no_pull: - no_push: - no_push_int: - workers: - pipeline_params: - - Returns: - - """ - args = [ - '--output-bundle-uuid ', output_bundle_uuid, - '--output-bundle', output_bundle, - '--branch', context, - '--workers', str(workers) - ] - if remote: - args.extend(['--remote', remote]) - if no_pull: - args += ['--no-pull'] - if no_push: - args += ['--no-push'] - if no_push_int: - args += ['--no-push-intermediates'] - if force: - args += ['--force'] - if force_all: - args += ['--force-all'] - if len(input_tags) > 0: - for next_tag in input_tags: - args += ['--input-tag', next_tag] - if len(output_tags) > 0: - for next_tag in output_tags: - args += ['--output-tag', next_tag] - - args += [str(pipe_cls)] # The one required argument to the entrypoint - - return [x.strip() for x in args + pipeline_params] - - def parse_args_tags(args_tag, to='dict'): """ parse argument string of tags 'tag:value tag:value' into a dictionary. 
@@ -475,64 +215,6 @@ def parse_args_tags(args_tag, to='dict'): return tag_thing -def parse_params(cls, params): - """ - Create a dictionary of str->str arguments to str->python objects deser'd by Luigi Parameters - - Input is the string "--arg value --arg2 value2" - - Convert to dict {'arg':str,'arg2':str2} - - then - - Convert to dict {'arg':luigi.Parameter.value,'arg2':luigi.Parameter.value2} - - Args: - cls (type[disdat.pipe.PipeTask]): - params: from argparse - - Returns: - dict {'arg':value,'arg2':value2} - """ - - params_str_dict = {k.lstrip('--'): v for k, v in zip(params[::2], params[1::2])} - - return convert_str_params(cls, params_str_dict) - - -def convert_str_params(cls, params_str): - """ - This is similar to Luigi.Task.from_str_params(cls, params_str) - But we don't create the class here, and we outer loop through our params (not the classes - params). We just want to convert each of the params that are in the class and in this dictionary - into the deserialized form. - - NOTE: This is somewhat dangerous and could break if Luigi changes around - this code. The alternative is to use Luigi.load_task() but then we have to ensure - all the input parameters are "strings" and we have to then put special code - inside of apply to know when to create a class normally, or create it from the CLI. - - Parameters: - params_str (dict): dict of str->str. param name -> value . - """ - kwargs = {} - - cls_params = {n: p for n, p in cls.get_params()} # get_params() returns [ (name, param), ... ] - - for param_name, param_str in params_str.items(): - if param_name in cls_params: - param = cls_params[param_name] - if isinstance(param_str, list): - kwargs[param_name] = param._parse_list(param_str) - else: - kwargs[param_name] = param.parse(param_str) - else: - _logger.error("Parameter {} is not defined in class {}.".format(param_name, cls.__name__)) - raise ValueError("Parameter {} is not defined in class {}.".format(param_name, cls.__name__)) - - return kwargs - - def get_local_file_path(url): """ Get a local file path from a file:// URL. @@ -568,15 +250,6 @@ def slicezip(a, b): return result -def setup_exists(fqp_setup): - """ Check if file exists - """ - if not os.path.exists(fqp_setup): - print ("No setup.py found at {}.".format(fqp_setup)) - return False - return True - - def load_class(class_path): """ Given a fully-qualified [pkg.mod.sub.classname] class name, diff --git a/disdat/config/disdat/disdat.cfg b/disdat/config/disdat/disdat.cfg index bbe3259..d6cdd60 100644 --- a/disdat/config/disdat/disdat.cfg +++ b/disdat/config/disdat/disdat.cfg @@ -1,66 +1,3 @@ [core] # Out of the box, ignore code version. -ignore_code_version=True - -[docker] -# A Docker registry to which to push pipeline images. For example: -# registry = docker.io -# If using AWS ECR, you can specify "*ECR*", and disdat will determine the -# registry for you: -# registry = *ECR* - -# An optional Docker repository prefix to use before the (generated) -# pipeline image name when pushing images to a registry. Do *not* include -# the registry in the repository prefix. For example: -# repository_prefix = username/projectname - -# If specified, log into ECR before pushing an image; not necessary if the -# registry is "*ECR*" -# ecr_login = - -[dockerize] -os_type = python -os_version = 3.7.8-slim - -# Optional other value for os_version -# os_version = 2.7.15-slim - -# Optional pip file -# dot_pip_file = ~/.pip/pip.conf - -# Optional odbc file bake-in. 
- -# Note, you might expose a password in the docker cache / registry if they are not secured. -# AWS ECS encrypts images, but the machine on which you build the container may contain a layer in its cache. -# You have been warned. - -# Also, note that your architecture might install your client DB libraries in different places than the -# Docker container (python-slim). You will need to create a config// directory and place -# either a deb.txt or deb packages for those client libraries to be installed. - -#dot_odbc_ini_file = ~/.odbc.ini - -[run] -# For AWS Batch: A job queue -aws_batch_queue = disdat-batch-queue - -# For AWS SageMaker: default instance type is smallest -# other valid types include: -# 'ml.m4.xlarge' | 'ml.m4.2xlarge' | 'ml.m4.4xlarge' | 'ml.m4.10xlarge' | 'ml.m4.16xlarge' | 'ml.m5.large' -#| 'ml.m5.xlarge' | 'ml.m5.2xlarge' | 'ml.m5.4xlarge' | 'ml.m5.12xlarge' | 'ml.m5.24xlarge' | 'ml.c4.xlarge' -#| 'ml.c4.2xlarge' | 'ml.c4.4xlarge' | 'ml.c4.8xlarge' | 'ml.p2.xlarge' | 'ml.p2.8xlarge' | 'ml.p2.16xlarge' -#| 'ml.p3.2xlarge' | 'ml.p3.8xlarge' | 'ml.p3.16xlarge' | 'ml.c5.xlarge' | 'ml.c5.2xlarge' | 'ml.c5.4xlarge' -#| 'ml.c5.9xlarge' | 'ml.c5.18xlarge' -aws_sagemaker_instance_type = ml.m4.xlarge -aws_sagemaker_instance_count = 1 -# Note if you have a lot of inputs in your s3 input uri, they have to fit here. Since -# disdat uses other s3 paths for inputs, you can keep this small unless you're doing something special. -aws_sagemaker_volume_sizeGB = 128 -# Max run time for training job -- 5 minutes default -aws_sagemaker_max_runtime_sec = 300 -# An input prefix, all objects that share this prefix show up in the container -aws_sagemaker_s3_input_uri = s3://somepath -# Disdat doesn't place models in the s3 destination bucket. But SageMaker still needs one. -aws_sagemaker_s3_output_uri = s3://somepath -# Role for SageMaker containers to assume to access S3, Cloud logs, etc. -aws_sagemaker_role_arn = somearn +ignore_code_version=True \ No newline at end of file diff --git a/disdat/config/disdat/luigi.cfg b/disdat/config/disdat/luigi.cfg deleted file mode 100644 index d2e8243..0000000 --- a/disdat/config/disdat/luigi.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[core] -log_level=INFO diff --git a/disdat/constants.py b/disdat/constants.py index b8a89fd..c9ff519 100644 --- a/disdat/constants.py +++ b/disdat/constants.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,10 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" -DisDat constants - -""" _MANAGED_OBJECTS = "objects" # directory in the context for objects diff --git a/disdat/data_context.py b/disdat/data_context.py index 0187fb2..25656ad 100644 --- a/disdat/data_context.py +++ b/disdat/data_context.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,9 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -""" -A DisDat context -""" + import os import json import glob @@ -24,8 +20,6 @@ from sqlalchemy import create_engine import pandas as pd import numpy as np -import luigi -from luigi.contrib.s3 import S3Target import urllib import boto3 @@ -35,7 +29,6 @@ import disdat.common as common import disdat.utility.aws_s3 as aws_s3 from disdat.common import DisdatConfig -from disdat.db_link import DBLink from disdat import logger as _logger @@ -660,6 +653,30 @@ def make_managed_path(self, uuid=None): return local_dir, _provided_uuid, remote_dir + @staticmethod + def rm_bundle_dir(output_path, uuid): + """ + We created a directory (managed path) to hold the bundle and any files. The files have been + copied in. Removing the directory removes any created files. + + ASSUMES: That we haven't actually updated the local DB with information on this bundle. + + Args: + output_path (str): + uuid (str): + db_targets (list(DBTarget)): + Returns: + None + """ + try: + shutil.rmtree(output_path, ignore_errors=True) + os.rmdir(output_path) + # TODO: if people create s3 files, s3 file targets, inside of an s3 context, + # TODO: then we will have to clean those up as well. + except IOError as why: + _logger.error("Removal of hyperframe directory {} failed with error {}. Continuing removal...".format( + uuid, why)) + def rm_hframe(self, hfr_uuid): """ Given a hfr_uuid, remove the hyperframe from the context. @@ -800,7 +817,6 @@ def write_hframe_remote(self, hfr, dry_run=False): return [(src, dst) for src, dst in zip(to_copy_files, dst_files)] - def atomic_update_hframe(self, hfr): """ Given an HFR that has new meta information, such as tags, update the version on disk atomically, @@ -974,10 +990,10 @@ def convert_serieslike2frame(self, hfid, name, series_like): copied_in_series_like = [] for src in series_like: - if isinstance(src, S3Target): - src = src.path - elif isinstance(src, luigi.LocalTarget): - src = urllib.parse.urljoin('file:', src.path) + #if isinstance(src, S3Target): + # src = src.path + #elif isinstance(src, luigi.LocalTarget): + # src = urllib.parse.urljoin('file:', src.path) if urllib.parse.urlparse(src).scheme == 's3': if remote_managed_path is not None: @@ -1087,8 +1103,8 @@ def copy_in_files(self, src_files, dst_dir, localize=True, dry_run=False): src_files = [src_files] for src_path in src_files: - if isinstance(src_path, luigi.LocalTarget) or isinstance(src_path, S3Target): - src_path = src_path.path + #if isinstance(src_path, luigi.LocalTarget) or isinstance(src_path, S3Target): + # src_path = src_path.path src_urlparse = urllib.parse.urlparse(src_path) @@ -1179,21 +1195,16 @@ def actualize_link_urls(self, fr, strip_file_scheme=False): """ file_set = [] - if not (fr.is_local_fs_link_frame() or fr.is_s3_link_frame() or fr.is_db_link_frame()): + if not (fr.is_local_fs_link_frame() or fr.is_s3_link_frame()): _logger.error("actualize_link_urls called on non-link frame.") raise ValueError("actualize_link_urls called on non-link frame.") urls = fr.get_link_urls() - if fr.is_db_link_frame(): - """ No-Op with db links """ - return urls - else: - """ Must be s3 or local file links. All the files in the link must be present """ - assert urllib.parse.urlparse(urls[0]).scheme == common.BUNDLE_URI_SCHEME.replace('://', '') - local_dir = self.get_object_dir() - local_file_set = [os.path.join(local_dir, fr.hframe_uuid, f.replace(common.BUNDLE_URI_SCHEME, '')) for f in - urls] + """ Must be s3 or local file links. 
All the files in the link must be present """ + assert urllib.parse.urlparse(urls[0]).scheme == common.BUNDLE_URI_SCHEME.replace('://', '') + local_dir = self.get_object_dir() + local_file_set = [os.path.join(local_dir, fr.hframe_uuid, f.replace(common.BUNDLE_URI_SCHEME, '')) for f in urls] # Check to see which files are present and which must stay remote # This can now happen with individual link localize and delocalize. @@ -1234,7 +1245,7 @@ def convert_hfr2df(self, hfr): frames = hfr.get_frames(self) columns = [] for fr in frames: - if fr.is_local_fs_link_frame() or fr.is_s3_link_frame() or fr.is_db_link_frame(): + if fr.is_local_fs_link_frame() or fr.is_s3_link_frame(): src_paths = self.actualize_link_urls(fr, strip_file_scheme=True) columns.append(pd.Series(data=src_paths, name=fr.pb.name)) else: @@ -1260,7 +1271,7 @@ def convert_hfr2scalar(self, hfr): assert len(frames) == 1 fr = frames[0] - if fr.is_local_fs_link_frame() or fr.is_s3_link_frame() or fr.is_db_link_frame(): + if fr.is_local_fs_link_frame() or fr.is_s3_link_frame(): src_paths = self.actualize_link_urls(fr, strip_file_scheme=True) nda = np.array(src_paths) else: @@ -1283,8 +1294,7 @@ def convert_hfr2json(self, hfr): assert len(frames) == 1 fr = frames[0] - assert not (fr.is_local_fs_link_frame() or fr.is_s3_link_frame() or fr.is_db_link_frame()), \ - "hfr2json, failed since this is a link frame. " + assert not (fr.is_local_fs_link_frame() or fr.is_s3_link_frame()), "hfr2json, failed since frame has links." nda = fr.to_ndarray() @@ -1303,7 +1313,7 @@ def convert_hfr2ndarray(self, hfr): assert len(frames) == 1 fr = frames[0] - if fr.is_local_fs_link_frame() or fr.is_s3_link_frame() or fr.is_db_link_frame(): + if fr.is_local_fs_link_frame() or fr.is_s3_link_frame(): src_paths = self.actualize_link_urls(fr, strip_file_scheme=True) return np.array(src_paths) else: @@ -1324,7 +1334,7 @@ def convert_hfr2row(self, hfr): frames = hfr.get_frames(self) row = [] for fr in frames: - if fr.is_local_fs_link_frame() or fr.is_s3_link_frame() or fr.is_db_link_frame(): + if fr.is_local_fs_link_frame() or fr.is_s3_link_frame(): src_paths = self.actualize_link_urls(fr, strip_file_scheme=True) if len(src_paths) == 1: row.append((fr.pb.name, src_paths[0])) diff --git a/disdat/db_link.py b/disdat/db_link.py deleted file mode 100644 index fe03cfd..0000000 --- a/disdat/db_link.py +++ /dev/null @@ -1,290 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import print_function - - -class DBLink(object): - """ - - A database table target. Often we need a task to create a table in a database. - This object simply records of the db, the schema, the table name, - that were used to make a table. - - Versioning and naming tables, though, is challenging. - - The pattern Disdat currently supports is to: - - 1.) Upstream tasks create data you want in or as a table and output parquet file. - This is your "versioned" database table. - - 2.) A subsequent downstream task uses upstream task as input. 
The sole purpose of this task is to create / insert - data into a database table. The users determines the naming scheme. - - A.) if you want versions, you have the upstream bundle. - B.) if you want to "commit" run the downstream task, else, for testing run just the upstream - C.) The lastest is whatever the user named it. No more versions in the DB. - - The DBTarget, then is a marker output bundle. If it exists, we assume the user has already made the table -- - they synchronously waited to get confirmation of the transaction. And since each bundle knows its inputs, - we have the data used to create the table. - - We assume that the user has configured their system to reach their DB - through the use of a Data Source Name or DSN, and that a file, like - ~/.odbc.ini, defines the attributes of the DSN, including the database, - login, password, servername, port, and driver. - - Optional features: - 0.) The user can ask for a "physical table name" from the DBTarget. This will prepend "DISDAT_" and append "_UUID". - This is the bundle UUID -- the user must ensure their names do not collide with multiple tables in a bundle. - 1.) User over-rides rm(): Called when the bundle is removed from a context. Argument tells you whether bundle - was committed. - 1.) User over-rides commit(): Disdat calls this when the bundle is committed. For example the user may write - code to create a view from the latest table. - - """ - - disdat_prefix = "DISDAT" - - def __init__(self, pipe_task, dsn, table_name, schema_name, servername='unknown', - database='unknown', uuid=None, port=-1): - """ - User creates a db_target within a Disdat Pipe when they want to work on a database table. - - The virt_name is the virtual table name (schema.'DISDAT_') - The phys_name is the physical table name (schema.'DISDAT__') - - Note that at the moment we are using the bundle uuid to append to the end of the virt_name. - - Args: - pipe_task (`disdat.pipe.PipeTask`): The pipe that is requesting to create a table. May be None - dsn (unicode): The DSN of the database. Assumes access via odbc. If None, will create a - DBTarget but will not be 'connected.' - table_name (unicode): The name of the table the user wants to create. - schema_name (unicode): Pass in schema name. Currently do not auto-generate schema - servername (unicode): If no dsn, use servername. Default is 'unknown' - database (unicode): If no dsn, use database. Default is 'unknown' - uuid (unicode): Optional, only used if pipe argument is None. - """ - - self.table_name = table_name - self.servername = servername - self.database = database - self.port = port - self.user_name = 'unknown' - self.dsn = dsn - self.pipe_task = pipe_task - - self.phys_name_url = None - self.virt_name = None - self.phys_name = None - self.schema = schema_name - self.committed = False - - self.uuid = uuid - if self.uuid is None: - pce = self.pipe_task.pfs.get_path_cache(self.pipe_task) - assert(pce is not None) - self.uuid = pce.uuid - self.sql_name_uuid = self.uuid.replace('-', '') - - self.init() - - # If this is a user, they must add the pipe argument, and we - # keep track of the different db_targets they create. - if self.pipe_task is not None: - self.pipe_task.add_db_target(self) - - def init(self): - """ - Create schema name, ensure it exists. - Create physical table name. - - Sometimes we create a DBTarget outside of running a particular pipe or task. This happens on - commit, for example. In that case, we assume a bundle uuid as an argument. 
- - Returns: - None - """ - - """ - Here the name is prefixed by - We don't prefix with the context because bundles are independent of context - """ - assert self.schema - self.phys_name = "{}.{}_{}_{}".format(self.schema, - self.disdat_prefix, - self.table_name, - self.sql_name_uuid) - - """ The fully qualified physical name is the text representation of the link stored in the bundle """ - self.phys_name_url = self.url() - - self.virt_name = DBLink.phys_to_virt(self.phys_name) - - def commit(self): - """ - - Commit a database table link by creating (maybe replacing) a view for the table, using the physical name. - - Returns: - None - - """ - pass - - def rm(self, commit_tag=False): - """ - User-supplied code for removing a database table link. - - Args: - commit_tag (bool): Indicate whether the bundle has been committed. - - Returns: - bool: Whether the remove was successful. - - """ - return True - - def url(self): - """ - The phys_name_url contains the servername, database, schema, and table name. - It is the string that may be used in lieu of a db_target. - - Returns: - (unicode): ..@ - - """ - return "db://{}.{}@{}".format(self.database, self.phys_name, self.servername) - - @property - def pn(self): - """ - Retrieve the physical table name. Use this name if you need - to generate a unique table each time you run the same task. - - phys = {}.{}_{}_{}.format(user schema, disdat_prefix, name, uuid) - - - Returns: - name (str) - """ - - return self.phys_name - - @property - def vn(self): - """ - Return virtual table name (schema.disdat_prefix_name) - - Use this name if you want the name to reflect that Disdat writes this table. - - Returns: - (unicode): table_name - - """ - return self.virt_name - - @property - def tn(self): - """ - Return table name (without disdat_prefix or UUID of the pn). - - Returns: - (unicode): table_name - - """ - return self.table_name - - @staticmethod - def phys_to_virt(phys_name): - """ - Convert physical to virtual name, i.e., strip uuid - - phys = {}.{}_{}_{}.format(user schema, disdat_prefix, name, uuid) - - virt = {}.{}_{} schema, disdat_prefix, name - - Returns: - (str): schema.name - - """ - schema, name = phys_name.split('.') - - name = '_'.join(name.split('_')[:-1]) - - return "{}.{}".format(schema, name) - - @staticmethod - def schema_from_phys(phys_name): - """ - Return schema from phys name - - phys = {}.{}_{}_{}.format(user schema, disdat_prefix, name, uuid) - - Returns: - (str): schema.name - - """ - schema, name = phys_name.split('.') - - return str(schema) - - @staticmethod - def schema_from_url(url): - """ - - Args: - url: - - Returns: - - """ - return url.replace('db://', '').split('@')[0].split('.')[1] - - @staticmethod - def table_from_url(url): - """ - Returns table name - - Args: - url: - - Returns: - - """ - return url.replace('db://', '').split('@')[0].split('.')[2] - - @staticmethod - def servername_from_url(url): - """ - Extract servername from URL - - Returns: - (unicode): servername - - """ - servername = url.replace('db://', '').split('@')[-1] - return str(servername) - - @staticmethod - def database_from_url(url): - """ - Extract database from URL - - Returns: - (unicode): database - - """ - database = url.replace('db://', '').split('.')[0] - return str(database) diff --git a/disdat/dockerize.py b/disdat/dockerize.py deleted file mode 100644 index 89fd632..0000000 --- a/disdat/dockerize.py +++ /dev/null @@ -1,337 +0,0 @@ -# -# Copyright 2015, 2016, 2017 Human Longevity, Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import print_function - -# Built-in imports -import inspect -import logging -import os -import tempfile -import shutil - -# Third-party imports -import disdat.common -import disdat.resources -import disdat.utility.aws_s3 as aws -from disdat.infrastructure import dockerizer -from disdat.fs import determine_pipe_version - -import docker - -_MODULE_NAME = inspect.getmodulename(__file__) - -_DOCKERIZER_ROOT = os.path.dirname(inspect.getsourcefile(dockerizer)) - -_logger = logging.getLogger(__name__) - - -def _copy_in_dot_file(disdat_config, docker_context, dot_file_name, option_name, cli): - """ - Copy in a dot file to a file with name "dot_file_name" into the docker context. - The user might have put the src path in the disdat config file under the "option_name" - - Args: - disdat_config: A disdat config object - docker_context: The place we are storing the docker context for build - dot_file_name: The name of the file in the docker context - option_name: the name of the option in disdat.cfg - cli (bool): whether we are called from cli - - Returns: - None or if subprocess has output - - """ - retval = 0 - - dot_file_path = os.path.join(docker_context, dot_file_name) - if disdat_config.parser.has_option(_MODULE_NAME, option_name): - dot_file = os.path.expanduser(disdat_config.parser.get(_MODULE_NAME, option_name)) - shutil.copy(dot_file, dot_file_path) - print("Copying dot file {} into {}".format(dot_file, docker_context)) - else: - touch_command = [ - 'touch', - dot_file_path - ] - retval = disdat.common.do_subprocess(touch_command, cli) - - return retval - - -def latest_container_id(pipeline_root, cli): - """ Return the unique container image hash from the latest container made - from this setup.py - - Args: - pipeline_root (str): The path to the setup.py file that defines the container - cli (bool): if called from cli - - Returns: - (str): The full docker container image hash - - """ - setup_file = os.path.join(pipeline_root, 'setup.py') - pipeline_image_name = disdat.common.make_project_image_name(setup_file) - docker_client = docker.from_env() - - try: - img_obj = docker_client.images.get(pipeline_image_name) - except (docker.errors.APIError, docker.errors.ImageNotFound) as er: - if cli: - print("Disdat unable to find image with project name {}".format(pipeline_image_name)) - raise # exit with code > 0 - return None - - id = img_obj.id.replace('sha256:','') - - if cli: - print("{}".format(id)) - - return id - - -def dockerize(pipeline_root, - config_dir=None, - os_type=None, - os_version=None, - build=True, - push=False, - sagemaker=False, - cli=False - ): - """ Create a Docker image for running a pipeline. - - Args: - pipeline_root: Root of the Python source tree containing the - setuptools-style setup.py file. - config_dir (str): Configuration of image (.deb, requires.txt, etc.) 
- os_type (str): OS type string - os_version (str): Version of OS - build (bool): Build the image (default True) - push (bool): Push to registry listed in Disdat config file - sagemaker (bool): Build a container for 'train' or 'serve' in SageMaker - cli (bool): Whether dockerize was called from the CLI (True) or an API (False -- default) - - Returns: - (int): 0 equals success, >0 for error - - """ - - disdat_config = disdat.common.DisdatConfig.instance() - - # Get configuration parameters - image_os_type = os_type if os_type is not None else disdat_config.parser.get(_MODULE_NAME, 'os_type') - image_os_version = os_version if os_version is not None else disdat_config.parser.get(_MODULE_NAME, 'os_version') - docker_context = None - - if docker_context is None: - docker_context = tempfile.mkdtemp(suffix=_MODULE_NAME) - docker_makefile = os.path.join(_DOCKERIZER_ROOT, 'Makefile') - _logger.debug('Using Docker context {}'.format(docker_context)) - - # Populate the Docker context with the template containing dockerfiles, - # entrypoints, etc. - rsync_command = [ - 'rsync', - '-aq', # Archive mode, no non-error messages - '--exclude', '*.pyc', # Don't copy any compiled Python files - os.path.join(_DOCKERIZER_ROOT, 'context.template', ''), - docker_context, - ] - - retval = disdat.common.do_subprocess(rsync_command, cli) - if retval: return retval - - # PIP: Overwrite pip.conf in the context.template in your repo if they have set the option, - # else just create empty file. - # At this time, the Dockerfile always sets the PIP_CONFIG_FILE ENV var to this file. - retval = _copy_in_dot_file(disdat_config, docker_context, "pip.conf", "dot_pip_file", cli) - if retval: return retval - - # ODBC: Overwrite the .odbc.ini in the context.template in your repo if the user set the option, - # else just create empty file. - # At this time, the Dockerfile always sets the ODBCINI var to this file. 
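
Editor's note: the pip.conf / odbc.ini handling in `_copy_in_dot_file` above is a copy-or-touch pattern: copy the user-configured dot file into the build context if the config option is set, otherwise leave an empty placeholder so the path the Dockerfile points at always exists. A hedged, standalone sketch of the same pattern using a plain configparser object rather than the Disdat config (section, option, and path names below are hypothetical):

```python
import os
import shutil
import configparser
from pathlib import Path

def copy_in_dot_file(cfg_path, docker_context, dot_file_name, section, option):
    """Copy a user-configured dot file into the build context, else create an empty one."""
    parser = configparser.ConfigParser()
    parser.read(cfg_path)                                # silently tolerates a missing file
    dest = os.path.join(docker_context, dot_file_name)
    if parser.has_option(section, option):
        src = os.path.expanduser(parser.get(section, option))
        shutil.copy(src, dest)                           # user-supplied pip.conf / odbc.ini
    else:
        Path(dest).touch()                               # empty placeholder; the image ENV var still points here
    return dest

# e.g. copy_in_dot_file("~/.config/disdat/disdat.cfg", "/tmp/ctx", "pip.conf",
#                       "dockerize", "dot_pip_file")     # hypothetical values
```
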
- retval = _copy_in_dot_file(disdat_config, docker_context, "odbc.ini", "dot_odbc_ini_file", cli) - if retval: return retval - - setup_file = os.path.join(pipeline_root, 'setup.py') - - if not disdat.common.setup_exists(setup_file): - return 1 - - pipeline_image_name = disdat.common.make_project_image_name(setup_file) - - DEFAULT_DISDAT_HOME = os.path.join('/', *os.path.dirname(disdat.dockerize.__file__).split('/')[:-1]) - DISDAT_HOME = os.getenv('DISDAT_HOME', DEFAULT_DISDAT_HOME) - - if build: - pipe_version = determine_pipe_version(pipeline_root) - build_command = [ - 'make', # XXX really need to check that this is GNU make - '-f', docker_makefile, - 'PIPELINE_IMAGE_NAME={}'.format(pipeline_image_name), - 'DISDAT_DOCKER_CONTEXT={}'.format(docker_context), - 'DISDAT_ROOT={}'.format(os.path.join(DISDAT_HOME)), # XXX YUCK - 'PIPELINE_ROOT={}'.format(pipeline_root), - 'OS_TYPE={}'.format(image_os_type), - 'OS_VERSION={}'.format(image_os_version), - 'GIT_HASH={}'.format(pipe_version.hash), - 'GIT_BRANCH={}'.format(pipe_version.branch), - 'GIT_FETCH_URL={}'.format(pipe_version.url), - 'GIT_TIMESTAMP={}'.format(pipe_version.tstamp), - 'GIT_DIRTY={}'.format(pipe_version.dirty), - ] - - _logger.debug("pipeline root = {} build command = {}".format(pipeline_root, build_command)) - - if config_dir is not None: - build_command.append('CONFIG_ROOT={}'.format(config_dir)) - if sagemaker: - build_command.append('SAGEMAKER_TRAIN_IMAGE_NAME={}'.format(disdat.common.make_sagemaker_project_image_name(setup_file))) - build_command.append('sagemaker') - retval = disdat.common.do_subprocess(build_command, cli) - if retval: return retval - - if push: - docker_client = docker.from_env() - - repository_name_prefix = None - - if disdat_config.parser.has_option('docker', 'repository_prefix'): - repository_name_prefix = disdat_config.parser.get('docker', 'repository_prefix') - if sagemaker: - repository_name = disdat.common.make_sagemaker_project_repository_name(repository_name_prefix, setup_file) - pipeline_image_name = disdat.common.make_sagemaker_project_image_name(setup_file) - else: - repository_name = disdat.common.make_project_repository_name(repository_name_prefix, setup_file) - - # Figure out the fully-qualified repository name, i.e., the name - # including the registry. 
- if disdat_config.parser.has_option('docker','registry'): - registry_name = disdat_config.parser.get('docker', 'registry').strip('/') - if registry_name == '*ECR*': - policy_resource_name = None - if disdat_config.parser.has_option('docker', 'ecr_policy'): - policy_resource_name = disdat_config.parser.get('docker', 'ecr_policy') - fq_repository_name = aws.ecr_create_fq_respository_name( - repository_name, - policy_resource_package=disdat.resources, - policy_resource_name=policy_resource_name - ) - else: - fq_repository_name = '{}/{}'.format(registry_name, repository_name) - else: - if cli: - raise RuntimeError("No registry present for push to succeed") - else: - return 1 - - auth_config = None - if disdat_config.parser.has_option('docker', 'ecr_login') or registry_name == '*ECR*': - auth_config = aws.ecr_get_auth_config() - docker_client.api.tag(pipeline_image_name, fq_repository_name) - for line in docker_client.images.push(fq_repository_name, auth_config=auth_config, stream=True): - if b'error' in line: - if cli: - raise RuntimeError(line) - else: - return 1 - else: - if cli: print(line) - - return 0 - - -def add_arg_parser(parsers): - dockerize_p = parsers.add_parser('dockerize', description="Dockerizer a particular transform.") - dockerize_p.add_argument( - '--config-dir', - type=str, - default=None, - help="A directory containing configuration files for the operating system within the Docker image", - ) - dockerize_p.add_argument( - '--os-type', - type=str, - default=None, - help='The base operating system type for the Docker image', - ) - dockerize_p.add_argument( - '--os-version', - type=str, - default=None, - help='The base operating system version for the Docker image', - ) - dockerize_p.add_argument( - '--push', - action='store_true', - help="Push the image to a remote Docker registry (default is to not push; must set 'docker_registry' in Disdat config)", - ) - dockerize_p.add_argument( - '--get-id', - action='store_true', - help="Do not build, only return latest container image ID", - ) - dockerize_p.add_argument( - '--sagemaker', - action='store_true', - default=False, - help="Create a Docker image executable as a SageMaker container.", - ) - dockerize_p.add_argument( - '--no-build', - action='store_false', - help='Do not build an image (only copy files into the Docker build context)', - dest='build', - ) - dockerize_p.add_argument( - "pipeline_root", - type=str, - help="Root of the Python source tree containing the user-defined transform; must have a setuptools-style setup.py file" - ) - dockerize_p.set_defaults(func=lambda args: dockerize_entry(cli=True, **vars(args))) - return parsers - - -def dockerize_entry(cli=False, **kwargs): - """Run the dockerize command with parameters from the command line or the api. 
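
Editor's note: the push branch above reduces to three steps: build a fully-qualified repository name (registry plus repository, or an ECR-created repository), tag the local image with that name, and stream the push while watching for errors. A hedged sketch of that flow with the docker SDK; registry and image names are placeholders, and `decode=True` is used here so the stream yields dicts rather than raw JSON bytes as in the original:

```python
import docker

def tag_and_push(image_name, registry, repository, auth_config=None):
    """Tag <image_name> as <registry>/<repository> and push it, surfacing push errors."""
    fq_repository = "{}/{}".format(registry.strip("/"), repository)
    client = docker.from_env()
    client.api.tag(image_name, fq_repository)            # local tag under the fully-qualified name
    for chunk in client.images.push(fq_repository, auth_config=auth_config,
                                    stream=True, decode=True):
        if "error" in chunk:                             # registry rejected a layer or auth failed
            raise RuntimeError(chunk["error"])
    return fq_repository

# e.g. tag_and_push("mypipeline", "123456789012.dkr.ecr.us-west-2.amazonaws.com",
#                   "disdat-mypipeline")                 # hypothetical names
```
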
- - Parameters: - cli (bool): Whether this was called from the command line or the api, default False - **kwargs - - Returns: - (int): 0 for success, 1 for failure - """ - - if 'get_id' in kwargs and kwargs['get_id']: - return latest_container_id(kwargs['pipeline_root'], cli) - else: - return dockerize(kwargs['pipeline_root'], - config_dir=kwargs['config_dir'], - os_type=kwargs['os_type'], - os_version=kwargs['os_version'], - build=kwargs['build'], - push=kwargs['push'], - sagemaker=kwargs['sagemaker'], - cli=cli - ) - - -if __name__ == "__main__": - import api - - api.dockerize('/Users/kyocum/Code/anomaly-detection-service/anomaly', 'pipeline.pipeline.Train', push=True) \ No newline at end of file diff --git a/disdat/entrypoints/__init__.py b/disdat/entrypoints/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/disdat/entrypoints/cli_ep.py b/disdat/entrypoints/cli_ep.py old mode 100755 new mode 100644 index 5059c0b..a17e985 --- a/disdat/entrypoints/cli_ep.py +++ b/disdat/entrypoints/cli_ep.py @@ -1,7 +1,5 @@ #! /usr/bin/env python # -# Copyright 2015, 2016 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,7 +16,9 @@ """ Disdat -Distributed data (dsdt) command line utility for working with data science pipelines. +Command line utility +This comes with disdat (disdat-core). It looks for other installed packages, such as disdat-luigi, +that may define additional commands. """ @@ -26,8 +26,9 @@ import logging import sys import os +import importlib.util -from disdat import apply, dockerize, run, fs, add, lineage +from disdat import fs, add, lineage from disdat.common import DisdatConfig from disdat import log, __version__ @@ -35,7 +36,34 @@ DISDAT_PATH = os.environ.get("PATH", None) DISDAT_PYTHONPATH = os.environ.get("PYTHONPATH", None) +DISDAT_CLI_EXTRAS = ["disdatluigi"] +EXTENSION_MODULE = "cli_extension" +EXTENSION_METHOD = "add_arg_parser" +def resolve_cli_extras(subparsers): + """ + For each additional package that might extend the CLI check to see if the module + is loaded, create a reference to it, and call the "add_arg_parser" function. + We expect two things from the high-level package: + 1.) a top-level module "cli_extension" + 2.) method called "add_arg_parser" + + Returns: + None + """ + for module in DISDAT_CLI_EXTRAS: + spec = importlib.util.find_spec(module) + if spec is None: + pass + #print(f"Dynamic CLI extension: {module} is not installed") + else: + #print(f"Dynamic CLI extension: {module} found, attempting to extend CLI . . . 
") + module_handle = importlib.import_module(module+f".{EXTENSION_MODULE}") + try: + add_cli_arg_parser = getattr(module_handle, EXTENSION_METHOD) + add_cli_arg_parser(subparsers) + except AttributeError as ae: + print(f"Disdat CLI unable to add commands from loaded extension [{module}], error {ae}") def main(): """ @@ -65,14 +93,14 @@ def main(): ls_p = subparsers.add_parser('init') ls_p.set_defaults(func=lambda args: DisdatConfig.init()) - # Add additional subparsers - dockerize.add_arg_parser(subparsers) - run.add_arg_parser(subparsers) - apply.add_arg_parser(subparsers) + # Add disdat core subparsers fs.add_arg_parser(subparsers) add.add_arg_parser(subparsers) lineage.add_arg_parser(subparsers) + # Add additional parsers if we are imported + resolve_cli_extras(subparsers) + args = parser.parse_args(args) log_level = logging.INFO diff --git a/disdat/entrypoints/docker_ep.py b/disdat/entrypoints/docker_ep.py deleted file mode 100755 index fa6f012..0000000 --- a/disdat/entrypoints/docker_ep.py +++ /dev/null @@ -1,386 +0,0 @@ -#!/usr/bin/env python -""" -Entry point for pipelines run within Docker images. - -@author: twong / kyocum -@copyright: Human Longevity, Inc. 2017 -@license: Apache 2.0 -""" -from __future__ import print_function - -import logging -import os -import sys - -import argparse -import subprocess -import disdat.common -import disdat.fs -import disdat.api -from disdat import log - -import boto3 -from botocore.exceptions import ClientError - -_HELP = """ Run a Disdat pipeline. This script wraps up several of the -steps required to run a pipeline, including: creating a working context, -running a pipeline class to generate an output bundle, and pushing an -output bundle to a Disdat remote. -""" - -_logger = logging.getLogger(__name__) - - -def _context_and_remote(context_name, remote=None): - """Create a new Disdat context and bind remote if not None. - - Check environment for 'LOCAL_EXECUTION', which should exist and be True if we are running - a container in an existing .disdat environment (e.g., on someone's laptop). - - If so, do not take any actions that would change the state of the users CLI. That is, do not - switch contexts. - - Args: - context_name (str): A fully-qualified context name. remote-context/local-context or local-context - remote (str): S3 remote name. - """ - - retval = disdat.api.context(context_name) - - if retval == 1: # branch exists - _logger.warning("Entrypoint found existing local context {} ".format(context_name)) - _logger.warning("Entrypoint not switching and ignoring directive to change to remote context {}".format(remote)) - elif retval == 0: # just made a new branch - if remote is not None: - _logger.info("Entrypoint made a new context {}, attaching remote {}".format(context_name, remote)) - _remote(context_name, remote) - else: - _logger.error("Entrypoint got non standard retval {} from api.context({}) command.".format(retval, context_name)) - return False - - if disdat.common.LOCAL_EXECUTION not in os.environ: - disdat.api.switch(context_name) - else: - _logger.info("Container running locally (not in a cloud provider, aka AWS). Not switching contexts") - - return True - - -def _remote(context_arg, remote_url): - """ Add remote to our context. 
- - Args: - context_arg: / or to use in this container - remote_url: The remote to add to this local context - - Returns: - None - """ - _logger.debug("Adding remote at URL {} for branch '{}'".format(remote_url, context_arg)) - - contexts = context_arg.split('/') - - if len(contexts) > 1: - remote_context = contexts[0] - local_context = contexts[1] - else: - local_context = contexts[0] - remote_context = local_context - - if remote_url is None: - _logger.error("Got an invalid URL {}".format(remote_url)) - return False - - try: - disdat.api.remote(local_context, remote_context, remote_url) - except Exception: - return False - return True - - -def retrieve_secret(secret_name): - """ Placeholder for ability to retrieve secrets needed by image - - Returns: - - """ - - raise NotImplementedError - - # Modify these to get them from the current environment - endpoint_url = "https://secretsmanager.us-west-2.amazonaws.com" - region_name = "us-west-2" - - session = boto3.session.Session() - client = session.client( - service_name='secretsmanager', - region_name=region_name, - endpoint_url=endpoint_url - ) - - try: - get_secret_value_response = client.get_secret_value( - SecretId=secret_name - ) - except ClientError as e: - if e.response['Error']['Code'] == 'ResourceNotFoundException': - print("The requested secret " + secret_name + " was not found") - elif e.response['Error']['Code'] == 'InvalidRequestException': - print(("The request was invalid due to:", e)) - elif e.response['Error']['Code'] == 'InvalidParameterException': - print(("The request had invalid params:", e)) - else: - # Decrypted secret using the associated KMS CMK - # Depending on whether the secret was a string or binary, one of these fields will be populated - if 'SecretString' in get_secret_value_response: - secret = get_secret_value_response['SecretString'] - else: - binary_secret_data = get_secret_value_response['SecretBinary'] - - print ("Found the secret string as ") - print(secret) - - -def add_argument_help_string(help_string, default=None): - if default is None: - return '{}'.format(help_string) - else: - return "{} (default '{}')".format(help_string, default) - - -def _commit_and_push(b): - """ commit and push bundle b if not transient """ - if disdat.common.BUNDLE_TAG_TRANSIENT not in b.tags: - b.commit() - b.push() - - -def run_disdat_container(args): - """ Execute Disdat inside of container - - Args: - args: input arguments - - Returns: - None - - """ - print("Entrypoint running with args: {}".format(args)) - - if args.remote is not None: - response = boto3.client('sts').get_caller_identity() - _logger.info("boto3 caller identity {}".format(response)) - incremental_pull = True # running with a remote - else: - incremental_pull = False # running without a remote - - print ("Entrypoint running with incremental_pull=={}".format(incremental_pull)) - - # Check to make sure that we have initialized the Disdat environment - if not os.path.exists(os.path.join(os.environ['HOME'], '.config', 'disdat')): - _logger.warning("Disdat environment possibly uninitialized?") - - # Create context, add remote, and switch to it - if not _context_and_remote(args.branch, args.remote): - _logger.error("Failed to branch to \'{}\' and optionally bind to \'{}\'".format(args.branch, - args.remote)) - sys.exit(os.EX_IOERR) - - # Pull the remote branch into the local branch or download individual items - try: - if not args.no_pull: - disdat.api.pull(args.branch, localize=not incremental_pull) - except Exception as e: - _logger.error("Failed to pull 
and localize all bundles from context {} due to {}".format(args.branch, e)) - sys.exit(os.EX_IOERR) - - # If specified, decode the ordinary 'key:value' strings into a dictionary of tags. - input_tags = disdat.common.parse_args_tags(args.input_tag) - output_tags = disdat.common.parse_args_tags(args.output_tag) - - # Convert string of pipeline args into dictionary for api.apply - deser_user_params = disdat.common.parse_params(args.pipe_cls, args.pipeline_args) - - # If the user wants final and intermediate, then inc push. - if not args.no_push and not args.no_push_intermediates: - incremental_push = True - else: - incremental_push = False - - try: - result = disdat.api.apply(args.branch, - args.pipe_cls, - output_bundle=args.output_bundle, - input_tags=input_tags, - output_tags=output_tags, - params=deser_user_params, - output_bundle_uuid=args.output_bundle_uuid, - force=args.force, - force_all=args.force_all, - workers=args.workers, - incremental_push=incremental_push, - incremental_pull=incremental_pull) - - if not incremental_push: - if not args.no_push: - if not args.no_push_intermediates: - to_push = disdat.api.search(args.branch, is_committed=False, find_intermediates=True) - for b in to_push: - _commit_and_push(b) - b = disdat.api.get(None, uuid=args.output_bundle_uuid) - if b is not None: - _logger.info("Pipeline ran. Committing and pushing output bundle UUID {}.".format(args.output_bundle_uuid)) - _commit_and_push(b) - else: - _logger.info("Pipeline ran but did not finish final task.") - else: - _logger.info("Pipeline ran but user specified not to push any bundles to remote context.") - else: - _logger.info("Pipeline ran using incremental pushing.") - - except RuntimeError as re: - _logger.error('Failed to run pipeline: RuntimeError {}'.format(re)) - sys.exit(os.EX_IOERR) - - except disdat.common.ApplyError as ae: - _logger.error('Failed to run pipeline: ApplyException {}'.format(ae)) - sys.exit(os.EX_IOERR) - - if args.dump_output: - print(disdat.api.cat(args.branch, args.output_bundle)) - - sys.exit(os.EX_OK) - - -def argparse_and_run(input_args): - - # To simplify configuring and building pipeline images, we can keep - # various default parameter values in the Docker image makefile, - # and pass them on as Docker ENV variables. At the moment, we set - # the default params below to handle most cases. This is an example - # of how you might do this in the future if needed. 
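
Editor's note: condensed, the post-run push logic in `run_disdat_container` above is "commit and push every non-transient uncommitted intermediate bundle, then the final output bundle." A sketch using the same `disdat.api` calls that appear in the removed code; the context name and UUID below are placeholders:

```python
import disdat.api as api
import disdat.common as common

def push_results(context_name, output_bundle_uuid):
    """Commit/push uncommitted intermediates, then the final output bundle."""
    def commit_and_push(b):
        if common.BUNDLE_TAG_TRANSIENT not in b.tags:     # transient bundles stay local
            b.commit()
            b.push()

    for b in api.search(context_name, is_committed=False, find_intermediates=True):
        commit_and_push(b)

    final = api.get(None, uuid=output_bundle_uuid)
    if final is not None:
        commit_and_push(final)

# e.g. push_results("examples", "0f1e2d3c-0000-0000-0000-000000000000")  # hypothetical values
```
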
- # some_default = os.environ[ENVVAR] if ENVVAR in os.environ else None - - parser = argparse.ArgumentParser( - description=_HELP, - ) - - parser.add_argument( - '--dump-output', - help='Dump the output to standard output', - action='store_true', - ) - parser.add_argument( - '--debug-level', - default=logging.INFO, - help='The debug logging level (default {})'.format(logging.getLevelName(logging.WARNING)) - ) - - disdat_parser = parser.add_argument_group('remote repository arguments') - disdat_parser.add_argument( - '--remote', - type=str, - default=None, - help='The s3 bucket from/to which to pull/push data', - ) - disdat_parser.add_argument( - '--no-pull', - action='store_true', - help='Do not pull (synchronize) remote repository with local repo - may cause entire pipeline to re-run.', - ) - disdat_parser.add_argument( - '--no-push', - action='store_true', - help='Do not push output bundles (including intermediates) to the remote repository (default is to push)', - ) - disdat_parser.add_argument( - '--no-push-intermediates', - action='store_true', - help='Do not push the intermediate bundles to the remote repository (default is to push)', - ) - - pipeline_parser = parser.add_argument_group('pipe arguments') - - pipeline_parser.add_argument( - '--branch', - type=str, - required=True, - help='The fully-qualified Disdat branch to use when running', - ) - - pipeline_parser.add_argument( - '--workers', - type=int, - default=2, - help="The number of Luigi workers to spawn. Default is 2." - ) - - pipeline_parser.add_argument( - '-it', '--input-tag', - nargs=1, type=str, action='append', - help="Input bundle tags: '-it authoritative:True -it version:0.7.1'") - - pipeline_parser.add_argument( - '-ot', '--output-tag', - nargs=1, type=str, action='append', - help="Output bundle tags: '-ot authoritative:True -ot version:0.7.1'") - - pipeline_parser.add_argument( - '--output-bundle-uuid', - default=None, - type=str, - help='UUID for the output bundle (default is for apply to generate a UUID)', - ) - - pipeline_parser.add_argument( - '-o', - '--output-bundle', - type=str, - default='-', - help="Name output bundle: '-o my.output.bundle'. Default name is '_'" - ) - - pipeline_parser.add_argument( - '--force', - action='store_true', - help='Force recomputation of the last task.', - ) - - pipeline_parser.add_argument( - '--force-all', - action='store_true', - help='Force recomputation of all upstream tasks.', - ) - - pipeline_parser.add_argument( - 'pipe_cls', - default=None, - type=disdat.common.load_class, - help=add_argument_help_string("Name of the pipeline class to run, e.g., 'package.module.ClassName'"), - ) - - pipeline_parser.add_argument( - "pipeline_args", - nargs=argparse.REMAINDER, - type=str, - help="Optional set of parameters for this pipe '--parameter value'" - ) - - args = parser.parse_args(input_args) - - log_level = logging.INFO - - log.enable(level=log_level) # TODO: Add configurable verbosity - - run_disdat_container(args) - - -def main(): - input_args = sys.argv[1:] - argparse_and_run(input_args) - - -if __name__ == '__main__': - main() - - diff --git a/disdat/entrypoints/sagemaker_ep.py b/disdat/entrypoints/sagemaker_ep.py deleted file mode 100755 index 6be6405..0000000 --- a/disdat/entrypoints/sagemaker_ep.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -""" -AWS SageMaker entrypoint wrapper for Disdatified pipelines. - -@author: twong / kyocum -@copyright: Human Longevity, Inc. 
2017 -@license: Apache 2.0 -""" - -import argparse -import json -import logging -import os -import sys -import entrypoint - -from multiprocessing import Process - -_HELP = """ AWS SageMaker Disdat pipeline wrapper. This script will call the main entrypoint -to execute the pipeline, parsing arguments from hyperparameter.json. -""" - -_logger = logging.getLogger(__name__) - -# This is a hard-coded path that will be present in the container according to AWS SageMaker -# https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html -_HYPERPARAMETERS = "/opt/ml/input/config/hyperparameters.json" - - -def add_argument_help_string(help_string, default=None): - if default is None: - return '{}'.format(help_string) - else: - return "{} (default '{}')".format(help_string, default) - - -def train(): - - with open(_HYPERPARAMETERS) as hp: - args = json.load(hp) - - arglist = json.loads(args['arglist']) - - _logger.info("Disdat SageMaker Train calling entrypoint with json loads arglist {}".format(arglist)) - - p = Process(target=entrypoint.main, args=[arglist,]) - p.start() - p.join() - return p.exitcode == 0 - - -if __name__ == '__main__': - """ SageMaker invokes the container with 'train' or 'serve'. - Train jobs support arbitrary 'hyperparameter' params inside a json blob. - We read the json and interpret them as arguments to the Disdat entrypoint. - - Note: - 1.) We ignore the input S3 path (inputs come as bundles) - 2.) We use the output S3 path to store the output bundle context, remote, name and UUID - """ - - parser = argparse.ArgumentParser( - description=_HELP, - ) - - parser.add_argument( - 'purpose', - type=str, - help="'train' or 'serve'", - ) - - args = parser.parse_args() - - logging.basicConfig(level=logging.INFO) - _logger.setLevel(logging.INFO) - - if args.purpose == 'train': - if not train(): - _logger.error("Disdat SageMaker train entrypoint failed.") - sys.exit(os.EX_IOERR) - elif args.purpose == 'serve': - _logger.warn("Disdat does not yet support SageMaker serve.") - sys.exit(os.EX_UNAVAILABLE) - else: - _logger.error("Disdat SageMaker invoked entrypoint with {}, not 'train' or 'serve'".format(args.purpose)) - sys.exit(os.EX_USAGE) - - sys.exit(os.EX_OK) - diff --git a/disdat/exceptions.py b/disdat/exceptions.py index 81d972f..7586298 100644 --- a/disdat/exceptions.py +++ b/disdat/exceptions.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,10 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" -Exceptions particular to pipes execution -""" - class BundleError(Exception): pass \ No newline at end of file diff --git a/disdat/fs.py b/disdat/fs.py index e169157..c3a2be3 100644 --- a/disdat/fs.py +++ b/disdat/fs.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +18,6 @@ from a variety of backend resources. """ -from __future__ import print_function import os import json diff --git a/disdat/hyperframe.py b/disdat/hyperframe.py index 9cf0e84..68a9268 100644 --- a/disdat/hyperframe.py +++ b/disdat/hyperframe.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016 Human Longevity, Inc. 
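
Editor's note on the removed SageMaker shim above: SageMaker invokes the container with `train` or `serve` and drops hyperparameters as JSON at a fixed path; the wrapper re-parses the JSON-encoded `arglist` entry and hands it to the ordinary entrypoint in a child process, reporting success from the child's exit code. A dependency-free sketch, where `entrypoint_main` stands in for the removed entrypoint's `main`:

```python
import json
from multiprocessing import Process

HYPERPARAMETERS = "/opt/ml/input/config/hyperparameters.json"  # fixed SageMaker location

def run_train(entrypoint_main):
    """Read SageMaker hyperparameters and forward the embedded arglist to the entrypoint."""
    with open(HYPERPARAMETERS) as hp:
        hyperparams = json.load(hp)
    arglist = json.loads(hyperparams["arglist"])          # arglist is itself a JSON string

    p = Process(target=entrypoint_main, args=[arglist])
    p.start()
    p.join()
    return p.exitcode == 0
```
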
-# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -28,20 +26,18 @@ Link -- contains link literals and pointer to LinkAuth. No table, no PB on disk LinkAuth -- contains auth creds. Has a Table storing PB, and PB on disk -**question design decision. Might have lineage table. Might not store frame pb in db. - Each Python object is called Record Each PB object is called HyperFrame contains UUIDs of Frames. Supports downloading HyperFrames without downloading all contained data. -It may be stored in an HFrame table as a byte blob and re-inflated without worry that it will be - excessively large. + +Note: as of this time LinkAuths are not used """ -from __future__ import print_function import sys from collections import namedtuple, defaultdict +from collections.abc import Sequence import hashlib import time import os @@ -50,7 +46,6 @@ import numpy as np import pandas as pd -import luigi import six import enum from sqlalchemy import Table, Column, String, MetaData, BLOB, Text, Enum, UniqueConstraint, DateTime @@ -58,7 +53,6 @@ from sqlalchemy.exc import IntegrityError import disdat.common as common -from disdat.db_link import DBLink from disdat import hyperframe_pb2 from disdat.utility.aws_s3 import s3_path_exists from disdat import logger as _logger @@ -646,6 +640,95 @@ def strip_file_prefix(series): series[i] = series[i][7:] +def parse_return_val(hfid, val, data_context): + """ + Interpret the return values and create an HFrame to wrap them. + This means setting the correct presentation bit in the HFrame so that + we call downstream tasks with parameters as the author intended. + + POLICY / NOTE: An non-HF output is a Presentable. + NOTE: For now, a task output is *always* presentable. + NOTE: No other code should set presentation in a HyperFrame. 
+ + The mirror to this function (that unpacks a presentable is disdat.fs.present_hfr() + + Args: + hfid (str): UUID + val (object): A scalar, dict, tuple, list, dataframe + data_context (DataContext): The data context into which to place this value + + Returns: + (presentation, frames[]) + + """ + + possible_scalar_types = ( + int, + float, + str, + bool, + np.bool_, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, + six.binary_type, + six.text_type, + np.unicode_, + np.string_ + ) + + frames = [] + + if val is None: + """ None's stored as json.dumps([None]) or '[null]' """ + presentation = hyperframe_pb2.JSON + frames.append(data_context.convert_scalar2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) + + elif isinstance(val, HyperFrameRecord): + presentation = hyperframe_pb2.HF + frames.append(FrameRecord.make_hframe_frame(hfid, common.DEFAULT_FRAME_NAME + ':0', [val])) + + elif isinstance(val, np.ndarray) or isinstance(val, list): + presentation = hyperframe_pb2.TENSOR + if isinstance(val, list): + val = np.array(val) + frames.append(data_context.convert_serieslike2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) + + elif isinstance(val, tuple): + presentation = hyperframe_pb2.ROW + val = np.array(val) + frames.append(data_context.convert_serieslike2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) + + elif isinstance(val, dict): + presentation = hyperframe_pb2.ROW + for k, v in val.items(): + if not isinstance(v, (list, tuple, pd.core.series.Series, np.ndarray, Sequence)): + # assuming this is a scalar + assert isinstance(v, possible_scalar_types), 'Disdat requires dictionary values to be one of {} not {}'.format(possible_scalar_types, type(v)) + frames.append(data_context.convert_scalar2frame(hfid, k, v)) + else: + assert isinstance(v, (list, tuple, pd.core.series.Series, np.ndarray, Sequence)) + frames.append(data_context.convert_serieslike2frame(hfid, k, v)) + + elif isinstance(val, pd.DataFrame): + presentation = hyperframe_pb2.DF + frames.extend(data_context.convert_df2frames(hfid, val)) + + else: + presentation = hyperframe_pb2.SCALAR + frames.append(data_context.convert_scalar2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) + + return presentation, frames + + class PBObject(object): """ Most objects mirror PB objects. @@ -926,7 +1009,6 @@ def __init__(self, owner=None, human_name=None, processing_name=None, uuid=None, def is_presentable(self): """ Whether or not this HyperFrame is a presentable. - All HF's made by individual Luigi Tasks are presentable. Returns: (bool) @@ -1688,21 +1770,16 @@ def is_link_series(series_like): (bool): Whether the series | ndarray appears to be a link column """ - # Welcome to duck typing. Get the first element of - # the series and check to see if it is some kind of recognizable + # Get the first element of the series and check to see if it is some kind of recognizable # file element. If we get a TypeError (does not implement # __getitem__) or an attribute error (not a string) then we # definitely do not have a link series. 
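
Editor's note: to make the type dispatch in `parse_return_val` above easier to scan, here is a dependency-free paraphrase that maps a task's return value to the presentation it would receive. It returns enum names as strings so it runs without `hyperframe_pb2`, and it omits the HyperFrameRecord branch (which maps to HF) since that needs disdat itself:

```python
import numpy as np
import pandas as pd

def presentation_for(val):
    """Paraphrase of parse_return_val's dispatch; returns the presentation name."""
    if val is None:
        return "JSON"        # None is stored as json.dumps([None])
    if isinstance(val, (np.ndarray, list)):
        return "TENSOR"
    if isinstance(val, tuple):
        return "ROW"
    if isinstance(val, dict):
        return "ROW"         # one frame per key; scalar vs. series-like values handled per key
    if isinstance(val, pd.DataFrame):
        return "DF"
    return "SCALAR"          # ints, floats, strings, link paths, ...

assert presentation_for({"a": [1, 2], "b": 3.0}) == "ROW"
assert presentation_for(np.arange(4)) == "TENSOR"
```
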
try: tester = series_like[0] - if isinstance(tester, luigi.Target): - return True - elif isinstance(tester, DBLink): - return True - elif (tester.startswith('file:///') or - tester.startswith('s3://') or - tester.startswith('db://') - ): + + if (tester.startswith('file:///') or + tester.startswith('s3://') or + tester.startswith('db://')): return True else: return False @@ -1748,19 +1825,6 @@ def is_s3_link_frame(self): link_pb = self.pb.links[0] return link_pb.WhichOneof('link') == 's3' - def is_db_link_frame(self): - """ - Whether this frame contains db links - - Returns: - (bool): - """ - if not self.is_link_frame(): - return False - assert(len(self.pb.links) > 0) - link_pb = self.pb.links[0] - return link_pb.WhichOneof('link') == 'database' - def is_hfr_frame(self): """ Whether this frame contains hyperframes or not @@ -2024,7 +2088,7 @@ def make_hframe_frame(hfid, name, hframes): @staticmethod def make_link_frame(hfid, name, file_paths, local_managed_path, remote_managed_path): - """ Create link frame from file paths (file, s3, or db) or luigi.Target objects. + """ Create link frame from file paths (file, s3, or db) Assumes file_paths are 'file:///' or 's3://' or 'db://' Assumes that the files are already copied into the bundle directory. @@ -2036,7 +2100,7 @@ def make_link_frame(hfid, name, file_paths, local_managed_path, remote_managed_p Args: hfid: hyperframe id name: column name - file_paths (:list:str): array of paths or luigi.Target objects + file_paths (:list:str): array of paths local_managed_path (str): The current local directory structure remote_managed_path (str): The current remote directory structure @@ -2044,14 +2108,11 @@ def make_link_frame(hfid, name, file_paths, local_managed_path, remote_managed_p (FrameRecord) """ - if isinstance(file_paths[0], luigi.LocalTarget): - file_paths = ['file://{}'.format(lt.path) if lt.path.startswith('/') else lt.path for lt in file_paths] - if file_paths[0].startswith('file:///'): link_type = FileLinkRecord elif file_paths[0].startswith('s3://'): link_type = S3LinkRecord - elif file_paths[0].startswith('db://') or isinstance(file_paths[0], DBLink): + elif file_paths[0].startswith('db://'): _logger.error("Found database reference[{}], DBLinks deprecated in 0.9.3 ".format(file_paths[0])) raise Exception("hyperframe:make_link_frame: error trying to use a database reference.") else: diff --git a/disdat/infrastructure/Dockerfiles/hyperframe_def/hyperframe_pb2.py b/disdat/infrastructure/Dockerfiles/hyperframe_def/hyperframe_pb2.py deleted file mode 100644 index bfd774d..0000000 --- a/disdat/infrastructure/Dockerfiles/hyperframe_def/hyperframe_pb2.py +++ /dev/null @@ -1,1140 +0,0 @@ -# Generated by the protocol buffer compiler. DO NOT EDIT! 
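
Editor's note on the link-detection changes above: with luigi.Target and DBLink support removed, `is_link_series` is pure duck typing on the first element's string prefix, and `make_link_frame` now rejects `db://` paths outright. A small sketch of that check, catching the same TypeError/AttributeError cases the original comment describes (an empty input would still raise, as in the original):

```python
def looks_like_link_column(series_like):
    """Duck-typed check mirroring is_link_series: inspect the first element's prefix."""
    try:
        first = series_like[0]
        return first.startswith(("file:///", "s3://", "db://"))
    except (TypeError, AttributeError):
        return False          # not indexable, or first element is not a string

assert looks_like_link_column(["s3://bucket/key.csv"])
assert not looks_like_link_column([3.14, 2.72])
```
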
-# source: hyperframe.proto - -import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) -from google.protobuf.internal import enum_type_wrapper -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -from google.protobuf import descriptor_pb2 -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='hyperframe.proto', - package='bundle', - syntax='proto3', - serialized_pb=_b('\n\x10hyperframe.proto\x12\x06\x62undle\"#\n\x0bStringTuple\x12\t\n\x01k\x18\x01 \x01(\t\x12\t\n\x01v\x18\x02 \x01(\t\"\xfa\x01\n\nHyperFrame\x12\r\n\x05owner\x18\x01 \x01(\t\x12\x12\n\nhuman_name\x18\x02 \x01(\t\x12\x17\n\x0fprocessing_name\x18\x03 \x01(\t\x12\x0c\n\x04uuid\x18\x04 \x01(\t\x12#\n\x06\x66rames\x18\x05 \x03(\x0b\x32\x13.bundle.StringTuple\x12 \n\x07lineage\x18\x06 \x01(\x0b\x32\x0f.bundle.Lineage\x12!\n\x04tags\x18\x07 \x03(\x0b\x32\x13.bundle.StringTuple\x12*\n\x0cpresentation\x18\x08 \x01(\x0e\x32\x14.bundle.Presentation\x12\x0c\n\x04hash\x18\t \x01(\t\"\xe3\x01\n\x05\x46rame\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04uuid\x18\x02 \x01(\t\x12\x1a\n\x04type\x18\x03 \x01(\x0e\x32\x0c.bundle.Type\x12\r\n\x05shape\x18\x04 \x03(\r\x12$\n\tbyteorder\x18\x05 \x01(\x0e\x32\x11.bundle.ByteOrder\x12#\n\x07hframes\x18\x06 \x03(\x0b\x32\x12.bundle.HyperFrame\x12\x1b\n\x05links\x18\x07 \x03(\x0b\x32\x0c.bundle.Link\x12\x0f\n\x07strings\x18\x08 \x03(\t\x12\x0c\n\x04\x64\x61ta\x18\t \x01(\x0c\x12\x0c\n\x04hash\x18\n \x01(\t\"\x98\x03\n\x07Lineage\x12\x18\n\x10hframe_proc_name\x18\x01 \x01(\t\x12\x13\n\x0bhframe_uuid\x18\x02 \x01(\t\x12\x11\n\tcode_repo\x18\x03 \x01(\t\x12\x11\n\tcode_name\x18\x04 \x01(\t\x12\x13\n\x0b\x63ode_semver\x18\x05 \x01(\t\x12\x11\n\tcode_hash\x18\x06 \x01(\t\x12\x13\n\x0b\x63ode_branch\x18\x07 \x01(\t\x12\x14\n\x0c\x64\x61ta_context\x18\x08 \x01(\t\x12\x13\n\x0b\x64\x61ta_branch\x18\t \x01(\t\x12\x15\n\rcreation_date\x18\n \x01(\x01\x12.\n\ndepends_on\x18\x0b \x03(\x0b\x32\x1a.bundle.Lineage.Dependency\x12\x12\n\nstart_time\x18\x0c \x01(\x01\x12\x11\n\tstop_time\x18\r \x01(\x01\x12\x13\n\x0b\x63ode_method\x18\x0e \x01(\t\x1aM\n\nDependency\x12\x18\n\x10hframe_proc_name\x18\x01 \x01(\t\x12\x13\n\x0bhframe_uuid\x18\x02 \x01(\t\x12\x10\n\x08\x61rg_name\x18\x03 \x01(\t\"\x8d\x01\n\x08LinkAuth\x12\x0f\n\x07profile\x18\x01 \x01(\t\x12\x0c\n\x04uuid\x18\x02 \x01(\t\x12%\n\x07s3_auth\x18\x03 \x01(\x0b\x32\x12.bundle.S3LinkAuthH\x00\x12%\n\x07\x64\x62_auth\x18\x04 \x01(\x0b\x32\x12.bundle.DBLinkAuthH\x00\x12\x0c\n\x04hash\x18\x05 \x01(\tB\x06\n\x04\x61uth\"a\n\nS3LinkAuth\x12\x19\n\x11\x61ws_access_key_id\x18\x01 \x01(\t\x12\x1d\n\x15\x61ws_secret_access_key\x18\x02 \x01(\t\x12\x19\n\x11\x61ws_session_token\x18\x03 \x01(\t\"\x90\x01\n\nDBLinkAuth\x12\x0e\n\x06\x64river\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x10\n\x08\x64\x61tabase\x18\x03 \x01(\t\x12\x12\n\nservername\x18\x04 \x01(\t\x12\x0b\n\x03uid\x18\x05 \x01(\t\x12\x0b\n\x03pwd\x18\x06 \x01(\t\x12\x0c\n\x04port\x18\x07 \x01(\t\x12\x0f\n\x07sslmode\x18\x08 \x01(\t\"\xc1\x01\n\x04Link\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\x12\n\nframe_uuid\x18\x02 \x01(\t\x12\x15\n\rlinkauth_uuid\x18\x03 \x01(\t\x12\x0c\n\x04hash\x18\x04 \x01(\t\x12\"\n\x05local\x18\x05 \x01(\x0b\x32\x11.bundle.LocalLinkH\x00\x12\x1c\n\x02s3\x18\x06 
\x01(\x0b\x32\x0e.bundle.S3LinkH\x00\x12(\n\x08\x64\x61tabase\x18\x07 \x01(\x0b\x32\x14.bundle.DatabaseLinkH\x00\x42\x06\n\x04link\"\x19\n\tLocalLink\x12\x0c\n\x04path\x18\x01 \x01(\t\"\x15\n\x06S3Link\x12\x0b\n\x03url\x18\x01 \x01(\t\"\x8c\x01\n\x0c\x44\x61tabaseLink\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x10\n\x08\x64\x61tabase\x18\x02 \x01(\t\x12\x12\n\nservername\x18\x03 \x01(\t\x12\x0e\n\x06schema\x18\x04 \x01(\t\x12\r\n\x05table\x18\x05 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x06 \x03(\t\x12\x0b\n\x03\x64sn\x18\x07 \x01(\t\x12\x0c\n\x04port\x18\x08 \x01(\x05*V\n\x0cPresentation\x12\x06\n\x02HF\x10\x00\x12\x06\n\x02\x44\x46\x10\x01\x12\n\n\x06SCALAR\x10\x03\x12\n\n\x06TENSOR\x10\x04\x12\x07\n\x03ROW\x10\x05\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x06\x12\x08\n\x04JSON\x10\x07*(\n\tByteOrder\x12\x07\n\x03\x42IG\x10\x00\x12\n\n\x06LITTLE\x10\x01\x12\x06\n\x02NA\x10\x02*\xe8\x01\n\x04Type\x12\x08\n\x04NONE\x10\x00\x12\x08\n\x04LINK\x10\x01\x12\x0b\n\x07\x46LOAT16\x10\x02\x12\x0b\n\x07\x46LOAT32\x10\x03\x12\x0b\n\x07\x46LOAT64\x10\x04\x12\t\n\x05UINT8\x10\x05\x12\n\n\x06UINT16\x10\x06\x12\n\n\x06UINT32\x10\x07\x12\n\n\x06UINT64\x10\x08\x12\x08\n\x04INT8\x10\t\x12\t\n\x05INT16\x10\n\x12\t\n\x05INT32\x10\x0b\x12\t\n\x05INT64\x10\x0c\x12\n\n\x06STRING\x10\r\x12\x08\n\x04\x42OOL\x10\x0e\x12\r\n\tCOMPLEX64\x10\x0f\x12\x0e\n\nCOMPLEX128\x10\x10\x12\n\n\x06HFRAME\x10\x11\x12\n\n\x06OBJECT\x10\x12\x62\x06proto3') -) -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -_PRESENTATION = _descriptor.EnumDescriptor( - name='Presentation', - full_name='bundle.Presentation', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='HF', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DF', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SCALAR', index=2, number=3, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TENSOR', index=3, number=4, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='ROW', index=4, number=5, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DEFAULT', index=5, number=6, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='JSON', index=6, number=7, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=1738, - serialized_end=1824, -) -_sym_db.RegisterEnumDescriptor(_PRESENTATION) - -Presentation = enum_type_wrapper.EnumTypeWrapper(_PRESENTATION) -_BYTEORDER = _descriptor.EnumDescriptor( - name='ByteOrder', - full_name='bundle.ByteOrder', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='BIG', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='LITTLE', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='NA', index=2, number=2, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=1826, - serialized_end=1866, -) -_sym_db.RegisterEnumDescriptor(_BYTEORDER) - -ByteOrder = enum_type_wrapper.EnumTypeWrapper(_BYTEORDER) -_TYPE = _descriptor.EnumDescriptor( - name='Type', - full_name='bundle.Type', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='NONE', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='LINK', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FLOAT16', index=2, 
number=2, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FLOAT32', index=3, number=3, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FLOAT64', index=4, number=4, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='UINT8', index=5, number=5, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='UINT16', index=6, number=6, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='UINT32', index=7, number=7, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='UINT64', index=8, number=8, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='INT8', index=9, number=9, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='INT16', index=10, number=10, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='INT32', index=11, number=11, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='INT64', index=12, number=12, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='STRING', index=13, number=13, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='BOOL', index=14, number=14, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='COMPLEX64', index=15, number=15, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='COMPLEX128', index=16, number=16, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='HFRAME', index=17, number=17, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='OBJECT', index=18, number=18, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=1869, - serialized_end=2101, -) -_sym_db.RegisterEnumDescriptor(_TYPE) - -Type = enum_type_wrapper.EnumTypeWrapper(_TYPE) -HF = 0 -DF = 1 -SCALAR = 3 -TENSOR = 4 -ROW = 5 -DEFAULT = 6 -JSON = 7 -BIG = 0 -LITTLE = 1 -NA = 2 -NONE = 0 -LINK = 1 -FLOAT16 = 2 -FLOAT32 = 3 -FLOAT64 = 4 -UINT8 = 5 -UINT16 = 6 -UINT32 = 7 -UINT64 = 8 -INT8 = 9 -INT16 = 10 -INT32 = 11 -INT64 = 12 -STRING = 13 -BOOL = 14 -COMPLEX64 = 15 -COMPLEX128 = 16 -HFRAME = 17 -OBJECT = 18 - - - -_STRINGTUPLE = _descriptor.Descriptor( - name='StringTuple', - full_name='bundle.StringTuple', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='k', full_name='bundle.StringTuple.k', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='v', full_name='bundle.StringTuple.v', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=28, - serialized_end=63, -) - - -_HYPERFRAME = _descriptor.Descriptor( - name='HyperFrame', - full_name='bundle.HyperFrame', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='owner', full_name='bundle.HyperFrame.owner', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, 
default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='human_name', full_name='bundle.HyperFrame.human_name', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='processing_name', full_name='bundle.HyperFrame.processing_name', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='uuid', full_name='bundle.HyperFrame.uuid', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='frames', full_name='bundle.HyperFrame.frames', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='lineage', full_name='bundle.HyperFrame.lineage', index=5, - number=6, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='tags', full_name='bundle.HyperFrame.tags', index=6, - number=7, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='presentation', full_name='bundle.HyperFrame.presentation', index=7, - number=8, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hash', full_name='bundle.HyperFrame.hash', index=8, - number=9, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=66, - serialized_end=316, -) - - -_FRAME = _descriptor.Descriptor( - name='Frame', - full_name='bundle.Frame', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='bundle.Frame.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='uuid', full_name='bundle.Frame.uuid', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - 
message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='type', full_name='bundle.Frame.type', index=2, - number=3, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='shape', full_name='bundle.Frame.shape', index=3, - number=4, type=13, cpp_type=3, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='byteorder', full_name='bundle.Frame.byteorder', index=4, - number=5, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hframes', full_name='bundle.Frame.hframes', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='links', full_name='bundle.Frame.links', index=6, - number=7, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='strings', full_name='bundle.Frame.strings', index=7, - number=8, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data', full_name='bundle.Frame.data', index=8, - number=9, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hash', full_name='bundle.Frame.hash', index=9, - number=10, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=319, - serialized_end=546, -) - - -_LINEAGE_DEPENDENCY = _descriptor.Descriptor( - name='Dependency', - full_name='bundle.Lineage.Dependency', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='hframe_proc_name', full_name='bundle.Lineage.Dependency.hframe_proc_name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hframe_uuid', full_name='bundle.Lineage.Dependency.hframe_uuid', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='arg_name', full_name='bundle.Lineage.Dependency.arg_name', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=880, - serialized_end=957, -) - -_LINEAGE = _descriptor.Descriptor( - name='Lineage', - full_name='bundle.Lineage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='hframe_proc_name', full_name='bundle.Lineage.hframe_proc_name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hframe_uuid', full_name='bundle.Lineage.hframe_uuid', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='code_repo', full_name='bundle.Lineage.code_repo', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='code_name', full_name='bundle.Lineage.code_name', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='code_semver', full_name='bundle.Lineage.code_semver', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='code_hash', full_name='bundle.Lineage.code_hash', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='code_branch', full_name='bundle.Lineage.code_branch', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data_context', full_name='bundle.Lineage.data_context', index=7, - number=8, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data_branch', full_name='bundle.Lineage.data_branch', index=8, - number=9, type=9, cpp_type=9, label=1, - has_default_value=False, 
default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='creation_date', full_name='bundle.Lineage.creation_date', index=9, - number=10, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='depends_on', full_name='bundle.Lineage.depends_on', index=10, - number=11, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='start_time', full_name='bundle.Lineage.start_time', index=11, - number=12, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='stop_time', full_name='bundle.Lineage.stop_time', index=12, - number=13, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='code_method', full_name='bundle.Lineage.code_method', index=13, - number=14, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[_LINEAGE_DEPENDENCY, ], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=549, - serialized_end=957, -) - - -_LINKAUTH = _descriptor.Descriptor( - name='LinkAuth', - full_name='bundle.LinkAuth', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='profile', full_name='bundle.LinkAuth.profile', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='uuid', full_name='bundle.LinkAuth.uuid', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='s3_auth', full_name='bundle.LinkAuth.s3_auth', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='db_auth', full_name='bundle.LinkAuth.db_auth', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hash', full_name='bundle.LinkAuth.hash', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, 
default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name='auth', full_name='bundle.LinkAuth.auth', - index=0, containing_type=None, fields=[]), - ], - serialized_start=960, - serialized_end=1101, -) - - -_S3LINKAUTH = _descriptor.Descriptor( - name='S3LinkAuth', - full_name='bundle.S3LinkAuth', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='aws_access_key_id', full_name='bundle.S3LinkAuth.aws_access_key_id', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='aws_secret_access_key', full_name='bundle.S3LinkAuth.aws_secret_access_key', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='aws_session_token', full_name='bundle.S3LinkAuth.aws_session_token', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1103, - serialized_end=1200, -) - - -_DBLINKAUTH = _descriptor.Descriptor( - name='DBLinkAuth', - full_name='bundle.DBLinkAuth', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='driver', full_name='bundle.DBLinkAuth.driver', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='description', full_name='bundle.DBLinkAuth.description', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='database', full_name='bundle.DBLinkAuth.database', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='servername', full_name='bundle.DBLinkAuth.servername', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='uid', full_name='bundle.DBLinkAuth.uid', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, 
default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='pwd', full_name='bundle.DBLinkAuth.pwd', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='port', full_name='bundle.DBLinkAuth.port', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='sslmode', full_name='bundle.DBLinkAuth.sslmode', index=7, - number=8, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1203, - serialized_end=1347, -) - - -_LINK = _descriptor.Descriptor( - name='Link', - full_name='bundle.Link', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='uuid', full_name='bundle.Link.uuid', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='frame_uuid', full_name='bundle.Link.frame_uuid', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='linkauth_uuid', full_name='bundle.Link.linkauth_uuid', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hash', full_name='bundle.Link.hash', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='local', full_name='bundle.Link.local', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='s3', full_name='bundle.Link.s3', index=5, - number=6, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='database', full_name='bundle.Link.database', index=6, - number=7, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, 
enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name='link', full_name='bundle.Link.link', - index=0, containing_type=None, fields=[]), - ], - serialized_start=1350, - serialized_end=1543, -) - - -_LOCALLINK = _descriptor.Descriptor( - name='LocalLink', - full_name='bundle.LocalLink', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='path', full_name='bundle.LocalLink.path', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1545, - serialized_end=1570, -) - - -_S3LINK = _descriptor.Descriptor( - name='S3Link', - full_name='bundle.S3Link', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='url', full_name='bundle.S3Link.url', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1572, - serialized_end=1593, -) - - -_DATABASELINK = _descriptor.Descriptor( - name='DatabaseLink', - full_name='bundle.DatabaseLink', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='url', full_name='bundle.DatabaseLink.url', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='database', full_name='bundle.DatabaseLink.database', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='servername', full_name='bundle.DatabaseLink.servername', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='schema', full_name='bundle.DatabaseLink.schema', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='table', full_name='bundle.DatabaseLink.table', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='columns', full_name='bundle.DatabaseLink.columns', index=5, - number=6, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dsn', full_name='bundle.DatabaseLink.dsn', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='port', full_name='bundle.DatabaseLink.port', index=7, - number=8, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1596, - serialized_end=1736, -) - -_HYPERFRAME.fields_by_name['frames'].message_type = _STRINGTUPLE -_HYPERFRAME.fields_by_name['lineage'].message_type = _LINEAGE -_HYPERFRAME.fields_by_name['tags'].message_type = _STRINGTUPLE -_HYPERFRAME.fields_by_name['presentation'].enum_type = _PRESENTATION -_FRAME.fields_by_name['type'].enum_type = _TYPE -_FRAME.fields_by_name['byteorder'].enum_type = _BYTEORDER -_FRAME.fields_by_name['hframes'].message_type = _HYPERFRAME -_FRAME.fields_by_name['links'].message_type = _LINK -_LINEAGE_DEPENDENCY.containing_type = _LINEAGE -_LINEAGE.fields_by_name['depends_on'].message_type = _LINEAGE_DEPENDENCY -_LINKAUTH.fields_by_name['s3_auth'].message_type = _S3LINKAUTH -_LINKAUTH.fields_by_name['db_auth'].message_type = _DBLINKAUTH -_LINKAUTH.oneofs_by_name['auth'].fields.append( - _LINKAUTH.fields_by_name['s3_auth']) -_LINKAUTH.fields_by_name['s3_auth'].containing_oneof = _LINKAUTH.oneofs_by_name['auth'] -_LINKAUTH.oneofs_by_name['auth'].fields.append( - _LINKAUTH.fields_by_name['db_auth']) -_LINKAUTH.fields_by_name['db_auth'].containing_oneof = _LINKAUTH.oneofs_by_name['auth'] -_LINK.fields_by_name['local'].message_type = _LOCALLINK -_LINK.fields_by_name['s3'].message_type = _S3LINK -_LINK.fields_by_name['database'].message_type = _DATABASELINK -_LINK.oneofs_by_name['link'].fields.append( - _LINK.fields_by_name['local']) -_LINK.fields_by_name['local'].containing_oneof = _LINK.oneofs_by_name['link'] -_LINK.oneofs_by_name['link'].fields.append( - _LINK.fields_by_name['s3']) -_LINK.fields_by_name['s3'].containing_oneof = _LINK.oneofs_by_name['link'] -_LINK.oneofs_by_name['link'].fields.append( - _LINK.fields_by_name['database']) -_LINK.fields_by_name['database'].containing_oneof = _LINK.oneofs_by_name['link'] -DESCRIPTOR.message_types_by_name['StringTuple'] = _STRINGTUPLE -DESCRIPTOR.message_types_by_name['HyperFrame'] = _HYPERFRAME -DESCRIPTOR.message_types_by_name['Frame'] = _FRAME -DESCRIPTOR.message_types_by_name['Lineage'] = _LINEAGE -DESCRIPTOR.message_types_by_name['LinkAuth'] = _LINKAUTH -DESCRIPTOR.message_types_by_name['S3LinkAuth'] = _S3LINKAUTH -DESCRIPTOR.message_types_by_name['DBLinkAuth'] = _DBLINKAUTH -DESCRIPTOR.message_types_by_name['Link'] = _LINK -DESCRIPTOR.message_types_by_name['LocalLink'] = _LOCALLINK -DESCRIPTOR.message_types_by_name['S3Link'] = _S3LINK -DESCRIPTOR.message_types_by_name['DatabaseLink'] 
= _DATABASELINK -DESCRIPTOR.enum_types_by_name['Presentation'] = _PRESENTATION -DESCRIPTOR.enum_types_by_name['ByteOrder'] = _BYTEORDER -DESCRIPTOR.enum_types_by_name['Type'] = _TYPE - -StringTuple = _reflection.GeneratedProtocolMessageType('StringTuple', (_message.Message,), dict( - DESCRIPTOR = _STRINGTUPLE, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.StringTuple) - )) -_sym_db.RegisterMessage(StringTuple) - -HyperFrame = _reflection.GeneratedProtocolMessageType('HyperFrame', (_message.Message,), dict( - DESCRIPTOR = _HYPERFRAME, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.HyperFrame) - )) -_sym_db.RegisterMessage(HyperFrame) - -Frame = _reflection.GeneratedProtocolMessageType('Frame', (_message.Message,), dict( - DESCRIPTOR = _FRAME, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.Frame) - )) -_sym_db.RegisterMessage(Frame) - -Lineage = _reflection.GeneratedProtocolMessageType('Lineage', (_message.Message,), dict( - - Dependency = _reflection.GeneratedProtocolMessageType('Dependency', (_message.Message,), dict( - DESCRIPTOR = _LINEAGE_DEPENDENCY, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.Lineage.Dependency) - )) - , - DESCRIPTOR = _LINEAGE, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.Lineage) - )) -_sym_db.RegisterMessage(Lineage) -_sym_db.RegisterMessage(Lineage.Dependency) - -LinkAuth = _reflection.GeneratedProtocolMessageType('LinkAuth', (_message.Message,), dict( - DESCRIPTOR = _LINKAUTH, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.LinkAuth) - )) -_sym_db.RegisterMessage(LinkAuth) - -S3LinkAuth = _reflection.GeneratedProtocolMessageType('S3LinkAuth', (_message.Message,), dict( - DESCRIPTOR = _S3LINKAUTH, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.S3LinkAuth) - )) -_sym_db.RegisterMessage(S3LinkAuth) - -DBLinkAuth = _reflection.GeneratedProtocolMessageType('DBLinkAuth', (_message.Message,), dict( - DESCRIPTOR = _DBLINKAUTH, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.DBLinkAuth) - )) -_sym_db.RegisterMessage(DBLinkAuth) - -Link = _reflection.GeneratedProtocolMessageType('Link', (_message.Message,), dict( - DESCRIPTOR = _LINK, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.Link) - )) -_sym_db.RegisterMessage(Link) - -LocalLink = _reflection.GeneratedProtocolMessageType('LocalLink', (_message.Message,), dict( - DESCRIPTOR = _LOCALLINK, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.LocalLink) - )) -_sym_db.RegisterMessage(LocalLink) - -S3Link = _reflection.GeneratedProtocolMessageType('S3Link', (_message.Message,), dict( - DESCRIPTOR = _S3LINK, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.S3Link) - )) -_sym_db.RegisterMessage(S3Link) - -DatabaseLink = _reflection.GeneratedProtocolMessageType('DatabaseLink', (_message.Message,), dict( - DESCRIPTOR = _DATABASELINK, - __module__ = 'hyperframe_pb2' - # @@protoc_insertion_point(class_scope:bundle.DatabaseLink) - )) -_sym_db.RegisterMessage(DatabaseLink) - - -# @@protoc_insertion_point(module_scope) diff --git a/disdat/infrastructure/README.md b/disdat/infrastructure/README.md deleted file mode 100644 index 1303bc8..0000000 --- a/disdat/infrastructure/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Infrastructure - -aws: Scripts for working with AWS resources such 
as S3 and EC2 - -kickstart: Scripts for creating linux and python environments. Used for -Docker images. - -pythia: Database for storing bundles, lineage, and schemas. - -kubernetes: Facilities for starting, stopping, resizing kubernetes -clusters. - -Dockerfiles: Set of scripts to create Docker images. diff --git a/disdat/infrastructure/__init__.py b/disdat/infrastructure/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/disdat/infrastructure/dockerizer/Makefile b/disdat/infrastructure/dockerizer/Makefile deleted file mode 100644 index 576029e..0000000 --- a/disdat/infrastructure/dockerizer/Makefile +++ /dev/null @@ -1,213 +0,0 @@ -# If the user provides a configuration file with build variables, use it -ifneq ($(CONFIG_MK), ) -include $(CONFIG_MK) -endif - -# The desired name for the pipeline Docker image -ifeq ($(PIPELINE_IMAGE_NAME), ) -$(error PIPELINE_IMAGE_NAME variable not set) -endif - -# The root for the Docker context in which we will build the image -DISDAT_DOCKER_CONTEXT := $(shell echo $(DISDAT_DOCKER_CONTEXT) | sed 's/\/*$$//') -ifeq ($(DISDAT_DOCKER_CONTEXT), ) -$(error DISDAT_DOCKER_CONTEXT variable not set or invalid) -endif - -# The root of the Disdat source distribution -DISDAT_ROOT := $(shell echo $(DISDAT_ROOT) | sed 's/\/*$$//') -ifeq ($(DISDAT_ROOT), ) -$(error DISDAT_ROOT variable not set or invalid) -endif - -# The current Disdat source distribution path -DISDAT_SDIST := $(shell basename `find $(DISDAT_ROOT)/disdat/infrastructure/dockerizer/context.template/*.tar.gz -type f`) - -# The root of an extra user configuration files (O/S package dependencies, -# etc.) -CONFIG_ROOT ?= $(DISDAT_ROOT)/disdat/infrastructure/dockerizer/config -CONFIG_ROOT := $(shell echo $(CONFIG_ROOT) | sed 's/\/*$$//') -ifeq ($(CONFIG_ROOT), ) -$(error CONFIG_ROOT variable not set or invalid) -endif - -# The root of the Docker context template (Dockerfiles, kickstart scripts, -# etc.) -DISDAT_DOCKER_CONTEXT_TEMPLATE ?= $(DISDAT_ROOT)/disdat/infrastructure/dockerizer/context.template -DISDAT_DOCKER_CONTEXT_TEMPLATE := $(shell echo $(DISDAT_DOCKER_CONTEXT_TEMPLATE) | sed 's/\/*$$//') -ifeq ($(DISDAT_DOCKER_CONTEXT_TEMPLATE), ) -$(error DISDAT_DOCKER_CONTEXT_TEMPLATE variable not set or invalid) -endif - -# The source root of the pipeline to run -PIPELINE_ROOT := $(shell echo $(PIPELINE_ROOT) | sed 's/\/*$$//') -ifeq ($(PIPELINE_ROOT), ) -$(error PIPELINE_ROOT variable not set or invalid) -endif - -# Declare the base operating system to use -ifeq ($(OS_TYPE), ) -$(error OS_TYPE variable not set) -endif -ifeq ($(OS_VERSION), ) -$(error OS_VERSION variable not set) -endif -OS_NAME := $(OS_TYPE)-$(OS_VERSION) - -# The Docker registry to which to push images (default is no registry) -DOCKER_REGISTRY ?= -# Flag to set if the Docker registry requires an AWS ECR login -DOCKER_REGISTRY_IS_ECR ?= - -# -# Define build arguments to pass through when "running" the Docker -# dockerfiles. The IMAGE_ prefix flags all such arguments. -# - -# Kickstart script installation -IMAGE_KICKSTART_ROOT ?= /opt/kickstart - -# Directory for the Disdat Python virtual environment. By default, we -# install whatever packages show up when using pip. If the user wants -# Miniconda, they should set IMAGE_CONDA_VERSION in a custom CONFIG_MK -# file. 
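Opting into the Miniconda toolchain only required overriding these two variables in the user-supplied CONFIG_MK file; the fragment below is a sketch with illustrative values (the version number is an example), and leaving IMAGE_CONDA_VERSION at its NO_CONDA default keeps the plain pip/virtualenv path.

    # Example CONFIG_MK overrides (illustrative values only)
    IMAGE_CONDA_VERSION = 4.3.21
    IMAGE_VIRTUAL_ENV   = /opt/conda-virtualenv
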
-IMAGE_VIRTUAL_ENV ?= /opt/python-virtualenv -IMAGE_CONDA_VERSION ?= NO_CONDA - -# Temporary build directory for holding various things copied in from the -# build context -IMAGE_BUILD_ROOT ?= /opt/build - -# Permanent directory for holding the pipeline package source. Even though -# we install the pipeline package in Python virtual environment, we keep -# the source after the build in case it contains user data that is not -# otherwise installed by setuptools. -IMAGE_PIPELINE_ROOT ?= /opt/pipeline - -# -# Internal Docker image layers -# - -# Layer 00: Base operating system environment -IMAGE_00_LAYER := disdat-$(OS_NAME) -# Layer 01: Disdat and its Python package dependencies -IMAGE_01_LAYER := disdat-$(OS_NAME)-python - -.PHONY: build build-00 build-01 build-02 context push - -all: build - -# -# Build the Docker images layers: -# -# Layer 00: Base operating system environment -# Layer 01: Disdat and its Python package dependencies -# Layer 02: The user operating system dependencies, Python requirements, -# and scripts/executables -# - -build: $(DISDAT_DOCKER_CONTEXT) build-00 build-01 build-02 - @echo "----- Built Docker image for the $(PIPELINE_IMAGE_NAME) pipeline on $(OS_NAME)" - -sagemaker: build build-03 - @echo "----- Built SageMaker Docker image for the $(PIPELINE_IMAGE_NAME) pipeline on $(OS_NAME)" - -build-00: $(DISDAT_DOCKER_CONTEXT) - @echo "---------- Building base operating system environment" - docker build \ - --build-arg KICKSTART_ROOT=$(IMAGE_KICKSTART_ROOT) \ - --build-arg CONDA_VERSION=$(IMAGE_CONDA_VERSION) \ - --build-arg VIRTUAL_ENV=$(IMAGE_VIRTUAL_ENV) \ - --file $(DISDAT_DOCKER_CONTEXT)/Dockerfiles/00-disdat-$(OS_NAME).dockerfile \ - --tag $(IMAGE_00_LAYER) \ - $(DISDAT_DOCKER_CONTEXT) - -build-01: $(DISDAT_DOCKER_CONTEXT) $(DISDAT_DOCKER_CONTEXT)/disdat - @echo "---------- Building Disdat and its Python package dependencies" - docker build \ - --build-arg IMAGE_LAYER=$(IMAGE_00_LAYER) \ - --build-arg BUILD_ROOT=$(IMAGE_BUILD_ROOT) \ - --build-arg DISDAT_SDIST=$(DISDAT_SDIST) \ - --file $(DISDAT_DOCKER_CONTEXT)/Dockerfiles/01-disdat-python.dockerfile \ - --tag $(IMAGE_01_LAYER) \ - $(DISDAT_DOCKER_CONTEXT) - -build-02: $(DISDAT_DOCKER_CONTEXT) $(DISDAT_DOCKER_CONTEXT)/config $(DISDAT_DOCKER_CONTEXT)/pipeline - @echo "---------- Installing the user environment" - docker build \ - --build-arg OS_NAME=$(OS_NAME) \ - --build-arg IMAGE_LAYER=$(IMAGE_01_LAYER) \ - --build-arg PIPELINE_ROOT=$(IMAGE_PIPELINE_ROOT) \ - --build-arg GIT_HASH=$(GIT_HASH) \ - --build-arg GIT_BRANCH=$(GIT_BRANCH) \ - --build-arg GIT_FETCH_URL=$(GIT_FETCH_URL) \ - --build-arg GIT_TIMESTAMP=$(GIT_TIMESTAMP) \ - --build-arg GIT_DIRTY=$(GIT_DIRTY) \ - --file $(DISDAT_DOCKER_CONTEXT)/Dockerfiles/02-user.dockerfile \ - --tag $(PIPELINE_IMAGE_NAME) \ - $(DISDAT_DOCKER_CONTEXT) - -build-03: $(DISDAT_DOCKER_CONTEXT) - @echo "---------- Using SageMaker entrypoint" - docker build \ - --build-arg IMAGE_LAYER=$(PIPELINE_IMAGE_NAME) \ - --file $(DISDAT_DOCKER_CONTEXT)/Dockerfiles/03-sagemaker.dockerfile \ - --tag $(SAGEMAKER_TRAIN_IMAGE_NAME) \ - $(DISDAT_DOCKER_CONTEXT) - -# -# Set up the Docker context in which we build the image -# - -context: $(DISDAT_DOCKER_CONTEXT) - -# Copy the dockerizer support files into the Docker context. For this copy, -# we DON'T want the directory name to be included in the copy. -$(DISDAT_DOCKER_CONTEXT): $(shell find $(DISDAT_DOCKER_CONTEXT_TEMPLATE) \! -name '*.pyc' -type f) - @echo "----- Creating temporary Docker context in $@" - @if [ ! 
-d $@ ]; then mkdir $@; fi - rsync -av --delete --force $(DISDAT_DOCKER_CONTEXT_TEMPLATE)/ $@ - @touch $@ - -# Copy the user configuration files into the Docker context -$(DISDAT_DOCKER_CONTEXT)/config: $(DISDAT_DOCKER_CONTEXT) $(shell find $(CONFIG_ROOT) -type f) - @echo "----- Copying configuration files from $(CONFIG_ROOT)" - rsync -av --delete --force --quiet $(CONFIG_ROOT)/ $@ - @touch $@ - -# Copy Disdat into the Docker context -$(DISDAT_DOCKER_CONTEXT)/disdat: $(DISDAT_DOCKER_CONTEXT) $(shell find $(DISDAT_ROOT)/disdat -type f) - @echo "----- Copying Disdat files from $(DISDAT_ROOT)" - rsync -av --delete --force --quiet \ - $(DISDAT_ROOT)/disdat/infrastructure/dockerizer $(DISDAT_ROOT)/disdat/infrastructure $@ - @touch $@ - -# Copy the user pipeline package into the Docker context. -# We assume the user has a setup.py that can create an sdist. -$(DISDAT_DOCKER_CONTEXT)/pipeline: $(DISDAT_DOCKER_CONTEXT) $(shell find $(PIPELINE_ROOT) -not -path '*/\.*' -type f) - @echo "----- Copying pipeline $(PIPELINE_ROOT)" - cd $(PIPELINE_ROOT); \ - mkdir $@; \ - mkdir disdat-temp.egg-base; \ - python setup.py egg_info --egg-base disdat-temp.egg-base; \ - cp `ls disdat-temp.egg-base/*.egg-info/requires.txt` $@/user_package_egg_requires.txt; \ - rm -rf disdat-temp.egg-base; \ - python setup.py sdist --dist-dir $@; \ - cd - - @touch $@ - - -# -# Push the pipeline image to a registry -# - -push: -ifeq ($(DOCKER_REGISTRY), ) - @echo "ERROR: DOCKER_REGISTRY variable not set" - @exit 1 -endif -ifneq ($(DOCKER_REGISTRY_IS_ECR), ) - @$(shell aws ecr get-login --no-include-email) -endif - docker tag $(PIPELINE_IMAGE_NAME) $(DOCKER_REGISTRY)/$(PIPELINE_IMAGE_NAME) - docker push $(DOCKER_REGISTRY)/$(PIPELINE_IMAGE_NAME) diff --git a/disdat/infrastructure/dockerizer/__init__.py b/disdat/infrastructure/dockerizer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/disdat/infrastructure/dockerizer/config.mk.template b/disdat/infrastructure/dockerizer/config.mk.template deleted file mode 100644 index b21faea..0000000 --- a/disdat/infrastructure/dockerizer/config.mk.template +++ /dev/null @@ -1,32 +0,0 @@ -# REQUIRED: The root for the Docker context in which we will build the image -DOCKER_CONTEXT = # For example: $(HOME)/context - -# REQUIRED: The Disdat source root -# DISDAT_ROOT = # For example: $(HOME)/disdat - -# REQUIRED: A pipeline name to identify the built Docker image. May contain -# any combination of characters valid for a Docker image name. -PIPELINE_NAME = # For example: flagstat - -# REQUIRED: The default pipeline class to run within the image. Must be a -# fully-qualified Python module name. -PIPELINE_CLASS = # For example: examples.flagstat.Flagstat - -# REQUIRED: The user pipeline package source tree root. The tree must -# include a setuptools setup file at $PIPELINE_ROOT/setup.py file. -PIPELINE_ROOT = # For example: ../../examples - -# REQUIRED: The base operating system type and version -OS_TYPE = # For example: ubuntu -OS_VERSION = # For example: 16.04 - -# A remote AWS S3 URL to which to push output bundles. This must be an -# S3 folder. -# IMAGE_DISDAT_REMOTE = s3://bucket/key - -# A Docker registry. Only necessary if pushing images. -# DOCKER_REGISTRY = registry-host:5000/path - -# A flag to indicate if the registry is an AWS Elastic Container Repository. -# Set to 1 (really, any non-empty string) if using ECR. 
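Put together, a filled-in config.mk for this template might have looked like the sketch below; every value simply echoes the template's own "For example" hints and is illustrative, not a shipped default.

    DOCKER_CONTEXT = $(HOME)/context
    PIPELINE_NAME = flagstat
    PIPELINE_CLASS = examples.flagstat.Flagstat
    PIPELINE_ROOT = ../../examples
    OS_TYPE = ubuntu
    OS_VERSION = 16.04
    # Optional: remote bundle storage and image registry settings
    # IMAGE_DISDAT_REMOTE = s3://bucket/key
    # DOCKER_REGISTRY = registry-host:5000/path
    # DOCKER_REGISTRY_IS_ECR = 1
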
-# DOCKER_REGISTRY_IS_ECR = diff --git a/disdat/infrastructure/dockerizer/config/.gitignore b/disdat/infrastructure/dockerizer/config/.gitignore deleted file mode 100644 index ff82780..0000000 --- a/disdat/infrastructure/dockerizer/config/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore most everything except .gitignore in the default config -# directory -* -!.gitignore diff --git a/disdat/infrastructure/dockerizer/context.template/.dockerignore b/disdat/infrastructure/dockerizer/context.template/.dockerignore deleted file mode 100644 index 4d8ee30..0000000 --- a/disdat/infrastructure/dockerizer/context.template/.dockerignore +++ /dev/null @@ -1,3 +0,0 @@ -# Modify the context to exclude files and directories that match patterns in -# this file. -/Dockerfiles/ diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-2.7.14-slim.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-2.7.14-slim.dockerfile deleted file mode 100644 index e23470a..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-2.7.14-slim.dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -# -# Kickstart the Python 2 system environment -# - -FROM python:2.7.14-slim - -LABEL \ - author="Theodore Wong" - -# Kickstart shell scripts root -ARG KICKSTART_ROOT -ENV KICKSTART_ROOT $KICKSTART_ROOT - -RUN apt-get update -RUN apt-get upgrade -y - -# Install git and a minimal Python 2.x toolchain. Disdat uses git to detect -# changed sources when deciding whether or not to rerun a pipeline. -# disdat uses pyodbc which requires gcc ,hence 'build-essential' -# sometimes people need to install .deb files, hence gdebi -RUN apt-get install -y git build-essential unixodbc-dev -RUN easy_install virtualenv - -# Install the kickstart scripts used by later layers -COPY kickstart $KICKSTART_ROOT - -# Declare Miniconda configurable arguments. We only need to save the Python -# virtual environment path for later stages. -ARG VIRTUAL_ENV -ENV VIRTUAL_ENV $VIRTUAL_ENV - diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-2.7.15-slim.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-2.7.15-slim.dockerfile deleted file mode 100644 index dc0da43..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-2.7.15-slim.dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -# -# Kickstart the Python 2 system environment -# - -FROM python:2.7.15-slim - -LABEL \ - author="Theodore Wong" - -# Kickstart shell scripts root -ARG KICKSTART_ROOT -ENV KICKSTART_ROOT $KICKSTART_ROOT - -RUN apt-get update -RUN apt-get upgrade -y - -# Install git and a minimal Python 2.x toolchain. Disdat uses git to detect -# changed sources when deciding whether or not to rerun a pipeline. -# disdat uses pyodbc which requires gcc ,hence 'build-essential' -# sometimes people need to install .deb files, hence gdebi -RUN apt-get install -y git build-essential unixodbc-dev -RUN easy_install virtualenv - -# Install the kickstart scripts used by later layers -COPY kickstart $KICKSTART_ROOT - -# Declare Miniconda configurable arguments. We only need to save the Python -# virtual environment path for later stages. 
-ARG VIRTUAL_ENV -ENV VIRTUAL_ENV $VIRTUAL_ENV - diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-3.6.8-slim.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-3.6.8-slim.dockerfile deleted file mode 100644 index d4ef7e3..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-3.6.8-slim.dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -# -# Kickstart the Python 2 system environment -# - -FROM python:3.6.8-slim - -LABEL \ - author="Theodore Wong" - -# Kickstart shell scripts root -ARG KICKSTART_ROOT -ENV KICKSTART_ROOT $KICKSTART_ROOT - -RUN apt-get update -RUN apt-get upgrade -y - -# Install git and a minimal Python 2.x toolchain. Disdat uses git to detect -# changed sources when deciding whether or not to rerun a pipeline. -# disdat uses pyodbc which requires gcc ,hence 'build-essential' -# sometimes people need to install .deb files, hence gdebi -RUN apt-get install -y git build-essential unixodbc-dev -RUN pip install virtualenv==16.7.9 - -# Install the kickstart scripts used by later layers -COPY kickstart $KICKSTART_ROOT - -# Declare Miniconda configurable arguments. We only need to save the Python -# virtual environment path for later stages. -ARG VIRTUAL_ENV -ENV VIRTUAL_ENV $VIRTUAL_ENV - diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-3.7.8-slim.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-3.7.8-slim.dockerfile deleted file mode 100644 index bef9562..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-python-3.7.8-slim.dockerfile +++ /dev/null @@ -1,39 +0,0 @@ -# -# Kickstart the Python 2 system environment -# - -FROM python:3.7.8-slim - -LABEL \ - author="Theodore Wong" - -# Kickstart shell scripts root -ARG KICKSTART_ROOT -ENV KICKSTART_ROOT $KICKSTART_ROOT - -RUN apt-get update -RUN apt-get upgrade -y - -# Install git and a minimal Python 2.x toolchain. Disdat uses git to detect -# changed sources when deciding whether or not to rerun a pipeline. -# disdat uses pyodbc which requires gcc ,hence 'build-essential' -# sometimes people need to install .deb files, hence gdebi -RUN apt-get install -y git build-essential unixodbc-dev -RUN pip install virtualenv==16.7.9 - -# Since Ubuntu 20.x the default security level -# has been raised to 2. This causes this problem: -# https://askubuntu.com/questions/1231844/ssl-sslerror-ssl-dh-key-too-small-dh-key-too-small-ssl-c1108 -# NOTE: This should be a change at the offending server, not the client. - -RUN mv /etc/ssl/openssl.cnf /etc/ssl/openssl.back.cnf; \ - sed -e 's/DEFAULT@SECLEVEL=2/DEFAULT@SECLEVEL=1/g' /etc/ssl/openssl.back.cnf > /etc/ssl/openssl.cnf - -# Install the kickstart scripts used by later layers -COPY kickstart $KICKSTART_ROOT - -# Declare Miniconda configurable arguments. We only need to save the Python -# virtual environment path for later stages. 
-ARG VIRTUAL_ENV -ENV VIRTUAL_ENV $VIRTUAL_ENV - diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-ubuntu-14.04.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-ubuntu-14.04.dockerfile deleted file mode 100644 index 2bf4b0b..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-ubuntu-14.04.dockerfile +++ /dev/null @@ -1,42 +0,0 @@ -# -# Kickstart the Ubuntu 14.04 operating system environment -# - -FROM ubuntu:14.04.5 - -LABEL \ - author="Theodore Wong" - -# Kickstart shell scripts root -ARG KICKSTART_ROOT -ENV KICKSTART_ROOT $KICKSTART_ROOT - -# Installation in Docker images is noninteractive -ENV DEBIAN_FRONTEND noninteractive - -# Install apt-utils to stop subsequent errors of the form: -# debconf: delaying package configuration, since apt-utils is not installed -# Also install gdebi to make installing arbitrary .deb files with -# dependencies less painful -RUN apt-get update -y && apt-get install -y --no-install-recommends apt-utils gdebi software-properties-common -RUN apt-get upgrade -y - -# Install git and a minimal Python 2.x toolchain. Disdat uses git to detect -# changed sources when deciding whether or not to rerun a pipeline. -RUN apt-get install -y git python python-pip python-virtualenv - -# Install conda O/S requirements (python-dev). Also install -# requests[security] O/S requirements -RUN apt-get install -y libffi-dev libssl-dev python-dev - -# Install the kickstart scripts used by later layers -COPY kickstart $KICKSTART_ROOT - -# Declare Miniconda configurable arguments. We only need to save the Python -# virtual environment path for later stages. -ARG CONDA_VERSION -ARG VIRTUAL_ENV -ENV VIRTUAL_ENV $VIRTUAL_ENV - -# Install Miniconda -RUN $KICKSTART_ROOT/bin/kickstart-conda.sh -c $CONDA_VERSION $VIRTUAL_ENV diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-ubuntu-16.04.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-ubuntu-16.04.dockerfile deleted file mode 100644 index 6241a3b..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/00-disdat-ubuntu-16.04.dockerfile +++ /dev/null @@ -1,40 +0,0 @@ -# -# Kickstart the Ubuntu 16.04 operating system environment -# - -FROM ubuntu:16.04 - -LABEL \ - author="Theodore Wong" - -# Kickstart shell scripts root -ARG KICKSTART_ROOT -ENV KICKSTART_ROOT $KICKSTART_ROOT - -# Installation in Docker images is noninteractive -ENV DEBIAN_FRONTEND noninteractive - -# Install apt-utils to stop subsequent errors of the form: -# debconf: delaying package configuration, since apt-utils is not installed -# Also install gdebi to make installing arbitrary .deb files with -# dependencies less painful -RUN apt-get update -y && apt-get install -y --no-install-recommends apt-utils gdebi software-properties-common -RUN apt-get upgrade -y - -# Install git and a minimal Python 2.x toolchain. Disdat uses git to detect -# changed sources when deciding whether or not to rerun a pipeline. -RUN apt-get install -y git python python-pip python-virtualenv - -# Install the kickstart scripts used by later layers -COPY kickstart $KICKSTART_ROOT - -# Declare Miniconda configurable arguments. We only need to save the Python -# virtual environment path for later stages. -ARG CONDA_VERSION -ARG VIRTUAL_ENV -ENV VIRTUAL_ENV $VIRTUAL_ENV - -# Install Miniconda if selected. 
-RUN if [ x$CONDA_VERSION != xNO_CONDA ]; then \ - $KICKSTART_ROOT/bin/kickstart-conda.sh -c $CONDA_VERSION $VIRTUAL_ENV; \ -fi diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/01-disdat-python.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/01-disdat-python.dockerfile deleted file mode 100644 index 5dd7bcc..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/01-disdat-python.dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# -# Install Disdat in the Python virtual environment. -# - -ARG IMAGE_LAYER -FROM $IMAGE_LAYER - -LABEL \ - author="Theodore Wong" - -# Temporary build directory -ARG BUILD_ROOT -ENV BUILD_ROOT $BUILD_ROOT - -# Name of curent Disdat Sdist -ARG DISDAT_SDIST - -# Copy the Disdat source to the temporary build root -COPY disdat $BUILD_ROOT/disdat - -# Create our virtual env -RUN virtualenv $VIRTUAL_ENV - -# ...and install Disdat -RUN ["/bin/bash", "-c", "source $VIRTUAL_ENV/bin/activate; pip install $BUILD_ROOT/disdat/dockerizer/context.template/$DISDAT_SDIST; deactivate"] - -# Add the virtual environment Python to the head of the PATH; running -# `python` will then get you the installed virtual environment and the -# `dsdt` command-line executable. -ENV PATH $VIRTUAL_ENV/bin:$PATH - -# Initialize the Disdat environment -RUN dsdt init - -# Local environment may have its own pip index, support pip.conf files, if not set, copies empty file. -COPY pip.conf /opt/pip.conf -ENV PIP_CONFIG_FILE /opt/pip.conf - -# Local environmment may have its own odbc.ini file, if not set, copies empty file. -COPY odbc.ini /opt/odbc.ini -ENV ODBCINI /opt/odbc.ini \ No newline at end of file diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/02-user.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/02-user.dockerfile deleted file mode 100644 index 60febc8..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/02-user.dockerfile +++ /dev/null @@ -1,92 +0,0 @@ -# -# Kickstart the user-defined execution environment. This includes operating -# system dependencies and Python requirements. -# - -ARG IMAGE_LAYER -FROM $IMAGE_LAYER - -LABEL \ - author="Theodore Wong" - -# Copy the user configuration files -COPY config $BUILD_ROOT/config - -# Install the user operating system dependencies -# TODO: We probably ought to replace this with a script that checks the -# underlying O/S type and then selects the correct O/S package list. 
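The RUN steps below key everything off files under config/$OS_NAME and config/python-sdist in the Docker context. A hypothetical layout for an ubuntu-16.04 build is sketched here; the file names are the ones these steps probe for, while the sdist name is made up.

    config/
      ubuntu-16.04/
        repository.txt     # extra apt repositories, one per line
        deb.txt            # apt package names to install
        r.txt              # optional R packages (enables the R install step)
        r-debian.pub       # apt key added when r.txt is present
      python-sdist/
        mylib-1.0.tar.gz   # extra Python sdists installed into the virtualenv
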
-ARG OS_NAME -RUN echo $OS_NAME -RUN if [ -f $BUILD_ROOT/config/$OS_NAME/repository.txt ]; then \ - for repo in $(cat $BUILD_ROOT/config/$OS_NAME/repository.txt); do \ - add-apt-repository -y $repo; \ - done; \ - apt-get update -y; \ -fi -RUN if [ -f $BUILD_ROOT/config/$OS_NAME/deb.txt ]; then \ - apt-get install -y $(cat $BUILD_ROOT/config/$OS_NAME/deb.txt); \ -fi -RUN files=$(echo $BUILD_ROOT/config/$OS_NAME/*.deb); if [ "$files" != $BUILD_ROOT/config/$OS_NAME/'*.deb' ]; then \ - for i in $files; do echo "Installing $i..."; dpkg -i $i; apt-get install -y -f; done; \ -fi - -# Install R and packages -RUN if [ -f $BUILD_ROOT/config/$OS_NAME/r.txt ]; then \ - apt-get update -y \ - && apt-get install -y --no-install-recommends --no-install-suggests apt-transport-https ca-certificates software-properties-common gnupg2 gnupg1 \ - && apt-key add $BUILD_ROOT/config/$OS_NAME/r-debian.pub \ - && add-apt-repository "deb https://cloud.r-project.org/bin/linux/debian buster-cran35/" \ - && apt-get update -y \ - && apt-get upgrade -y \ - && lsb_release -a \ - && apt-get install -y --no-install-recommends --allow-unauthenticated \ - r-base \ - r-base-dev \ - libssl-dev \ - libcurl4-openssl-dev; \ - for pkg in $(cat $BUILD_ROOT/config/$OS_NAME/r.txt); do \ - R -e "install.packages('$pkg', repos='https://cloud.r-project.org/')"; \ - done; \ -fi - -# Install user Python sdist package dependencies -# NOTE: Since PIP 19.0 fails with --no-cache-dir, removed '-n' flag on kickstart-python.py script -RUN files=$(echo $BUILD_ROOT/config/python-sdist/*.tar.gz); if [ "$files" != $BUILD_ROOT/config/python-sdist/'*.tar.gz' ]; then \ - $KICKSTART_ROOT/bin/kickstart-python.sh $VIRTUAL_ENV $i; \ - for i in $files; do \ - $KICKSTART_ROOT/bin/install-python-package-from-source-tree.sh $VIRTUAL_ENV $i; \ - done; \ -fi - -# NOTE: 01-disdat-python.dockerfile has set the PATH using ENV, so $VIRTUAL_ENV is already activated. - -# Install the user's package. Split into two parts: package dependencies and user code. -# First, the Makefile creates egginfo, to create a requires.txt file, and copies into context -# We only issue the COPY for that file. This will invalidate the docker layer cache if the requires.txt changes. -# Second, we install the sdist *.tar.gz. B/c sdists change on each create, that's installed every time. -ARG PIPELINE_ROOT -COPY pipeline/user_package_egg_requires.txt $PIPELINE_ROOT/ -RUN pip install `sed -n '1,/\[.*\]/p' $PIPELINE_ROOT/user_package_egg_requires.txt | grep -v '\[' | awk 'NF'` - -COPY pipeline $PIPELINE_ROOT -RUN pip install --no-cache-dir $PIPELINE_ROOT/*.tar.gz - -# Set the git status env variables for the container. This represents the most recent git hash for the pipeline. -ARG GIT_HASH -ENV PIPELINE_GIT_HASH=${GIT_HASH} -ARG GIT_BRANCH -ENV PIPELINE_GIT_BRANCH=${GIT_BRANCH} -ARG GIT_FETCH_URL -ENV PIPELINE_GIT_FETCH_URL=${GIT_FETCH_URL} -ARG GIT_TIMESTAMP -ENV PIPELINE_GIT_TIMESTAMP=${GIT_TIMESTAMP} -ARG GIT_DIRTY -ENV PIPELINE_GIT_DIRTY=${GIT_DIRTY} - -# Clean up the temporary build directory -RUN rm -rf $BUILD_ROOT - -# Set up the default entrypoint from the disdat package. -# The disdat cli run command will use this command to run the container. 
-# If the user wishes to run their own entrypoint, they may pass it to `dsdt run` -CMD [ "dsdt_docker", "--help" ] \ No newline at end of file diff --git a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/03-sagemaker.dockerfile b/disdat/infrastructure/dockerizer/context.template/Dockerfiles/03-sagemaker.dockerfile deleted file mode 100644 index 0074ec6..0000000 --- a/disdat/infrastructure/dockerizer/context.template/Dockerfiles/03-sagemaker.dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -# -# Create a SageMaker Training Container -# Inherit from 02-user.dockerfile, only replace the entrypoint. -# - -ARG IMAGE_LAYER -FROM $IMAGE_LAYER - -LABEL \ - author="Ken Yocum" - -COPY bin/sagemaker.py /opt/bin/sagemaker.py -ENTRYPOINT [ "/opt/bin/sagemaker.py" ] diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/find_packages_from_setup.py b/disdat/infrastructure/dockerizer/context.template/kickstart/bin/find_packages_from_setup.py deleted file mode 100755 index ffec2fa..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/find_packages_from_setup.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python - -import argparse -import importlib as imp -import mock -import os -import setuptools - - -def find_packages(setup_py): - packages = [] - # All this horrid hackery to recover the install_requires parameter from - # a setup() call in setup.py. - # - # https://stackoverflow.com/questions/24236266/how-to-extract-dependencies-information-from-a-setup-py - try: - # Patch setuptools to intercept the setup() call - with mock.patch.object(setuptools, 'setup') as setup_mock: - # Get an open file handle and a description of the - # setup file. - setup_file, setup_filename, setup_description = imp.find_module('setup', [os.path.dirname(setup_py)]) - # Load setup.py as the module setup. We have to - # intercept calls to find_packages as well since - # find_packages will run a 'find'-like operation from - # the current working directory - which is Bad if the - # CWD is the root directory... - with mock.patch.object(setuptools, 'find_packages'): - imp.load_module('setup', setup_file, setup_filename, setup_description) - # Grab the call args to setup - _, setup_kwargs = setup_mock.call_args - # ...and recover the install_requires parameter. Fun, eh? - # Don't forget to remove trailing version specifiers that - # lack version numbers. - packages = ['{}'.format(p.rstrip('<>=')) for p in setup_kwargs['install_requires']] - finally: - # As warned in the docs, we have to close the setup file - # ourselves. 
- if setup_file is not None: - setup_file.close() - return packages - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Select Python packages from setup.py' - ) - parser.add_argument( - 'setup_py', - type=str, - help='The setup.py file', - ) - args = parser.parse_args() - - if not os.path.exists(args.setup_py): - raise RuntimeError('Failed to find file {}'.format(args.setup_py)) - - print ('\n'.join(find_packages(args.setup_py))) \ No newline at end of file diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/install-python-package-from-source-tree.sh b/disdat/infrastructure/dockerizer/context.template/kickstart/bin/install-python-package-from-source-tree.sh deleted file mode 100755 index 7685a54..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/install-python-package-from-source-tree.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -# -# Kickstart Disdat - -SH_FILE=$(basename $0) -SH_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -source $SH_DIR/../etc/common.rc - -if [ x$CONF_DIR == 'x' ]; then - CONF_DIR=$SH_DIR/../etc -fi -source $CONF_DIR/kickstart.conf - -USAGE="Usage: $SH_FILE [-h] [-d] VIRTUAL_ENV PACKAGE_ROOT" - -function usage() { -cat << EOF -Install a Python package from its source tree into a Python virtual -environment. To simplify matters, you must have previously kickstarted the -target environment, otherwise this script would have to provide all manner -of pass-through arguments. - -$USAGE - -Positional arguments: - VIRTUAL_ENV : The destination virtual environment path - PACKAGE_ROOT : The root of the package source - -Options: - -h : Get help - -d : Set debug mode (echoes commands) -EOF -} - -while getopts "hd" opt; do - case "$opt" in - h) usage - exit 0 - ;; - d) set -x - ;; - *) - echo $USAGE - exit 1 - esac -done - -shift $((OPTIND-1)) - -virtual_env=${1} -if [ x$virtual_env == 'x' ]; then - error "Could not find VIRTUAL_ENV argument" - echo $USAGE - exit 1 -fi - -package_root=${2} -if [ x$package_root == 'x' ]; then - error "Could not find PACKAGE_ROOT argument" - echo $USAGE - exit 1 -fi - -echo "Using Python virtual environment $virtual_env" -echo "Using package root $package_root" - -# Whatever happens now, make sure we deactivate the virtual environment -# and remove the temp file. 
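Inside the image build this helper is invoked by 02-user.dockerfile once per staged sdist; a representative call, using the dockerizer's default paths and a made-up package name, would be:

    /opt/kickstart/bin/install-python-package-from-source-tree.sh \
        /opt/python-virtualenv \
        /opt/build/config/python-sdist/mylib-1.0.tar.gz
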
-old_cwd=$PWD -if [ -f $virtual_env/bin/activate ]; then - if [ -x $virtual_env/bin/conda ]; then - source $virtual_env/bin/activate $virtual_env - function atexit_deactivate { - source deactivate - cd $old_cwd - } - trap atexit_deactivate EXIT - use_conda=yes - else - source $virtual_env/bin/activate - function atexit_deactivate { - deactivate - cd $old_cwd - } - trap atexit_deactivate EXIT - fi -else - error "$virtual_env is not a valid Python virtual environment" - exit 1 -fi - -echo "Using Python interpreter $(which python)" -if [ -f $package_root/setup.py ]; then - echo "cd $package_root; python setup.py install" - cd $package_root - run python setup.py install -else - echo "pip install $package_root" - pip install --no-cache-dir $package_root -fi - -success "Successfully installed $(basename $package_root) in $virtual_env" -if [ x$use_conda != 'x' ]; then - success "'source $virtual_env/bin/activate $virtual_env' to start using it" -else - success "'source $virtual_env/bin/activate' to start using it" -fi diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/kickstart-conda.sh b/disdat/infrastructure/dockerizer/context.template/kickstart/bin/kickstart-conda.sh deleted file mode 100755 index 57c1c50..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/kickstart-conda.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/bash -# -# Kickstart a basic Miniconda virtual environment. You can the use this -# basic environment to kickstart a more built-up Python virtual environment. - -SH_FILE=$(basename $0) -SH_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -source $SH_DIR/../etc/common.rc - -if [ x$CONF_DIR == 'x' ]; then - CONF_DIR=$SH_DIR/../etc -fi -source $CONF_DIR/kickstart.conf - -USAGE="Usage: $SH_FILE [-h] [-d] [-v] [-c CONDA_VERSION] [-m CONDA_ROOT] VIRTUAL_ENV" - -conda_root=$CONDA_ROOT_DEFAULT -conda_version=$CONDA_VERSION_DEFAULT - -function usage() { -cat << EOF -Install Miniconda with Intel MKL and a corresponding Python 2.7 interpreter. - -$USAGE - -Positional arguments: - VIRTUAL_ENV : The destination virtual environment path - -Options: - -h : Get help - -d : Set debug mode (echoes commands) - -c CONDA_VERSION : Specify the Miniconda version (default $CONDA_VERSION_DEFAULT); - set to '' to use the most recent version - -m CONDA_ROOT : Specify the Miniconda installation root - (default $CONDA_ROOT_DEFAULT) - -v : Verbose : Warn if we are reusing an existing virtual environment -EOF -} - -while getopts "hdc:m:v" opt; do - case "$opt" in - h) usage - exit 0 - ;; - d) set -x - ;; - c) conda_version=$OPTARG - ;; - m) conda_root=$OPTARG - ;; - v) verbose=yes - ;; - *) - echo $USAGE - exit 1 - esac -done - -shift $((OPTIND-1)) - -virtual_env=${1} -if [ x$virtual_env == 'x' ]; then - error "Could not find VIRTUAL_ENV argument" - echo $USAGE - exit 1 -fi - -echo "Using Miniconda installation root $conda_root" -echo "Using Python virtual environment $virtual_env" - -if [ ! -d $conda_root ]; then - # Need to create a virtual environment to hold the conda installer. - if [ x$(which virtualenv) == 'x' ]; then - error "Failed to find virtualenv command" - exit 1 - fi - - # Miniconda makes a mess out of the PATH. Thanks Continuum. 
- old_path=$PATH - virtualenv $conda_root - source $conda_root/bin/activate - - # Whatever happens now, make sure we deactivate the virtual environment - function atexit_deactivate { - deactivate - export PATH=$old_path - } - trap atexit_deactivate EXIT - - # Install security packages to prevent pip warnings about SSLContexts. - pip install --upgrade ndg-httpsclient pyasn1 pyOpenSSL - - # auxlib is required by conda. mock is required to support kickstarting - # from setup.py instead of requirements.txt - pip install auxlib mock - if [ x$conda_version != x ]; then - pip install conda==$conda_version conda-env - else - pip install conda conda-env - fi - - if [ x$(which conda) == 'x' ]; then - error "Failed to find conda command" - exit 1 - fi -fi - -if [ ! -d $virtual_env ]; then - if ! $conda_root/bin/conda create -p $virtual_env --yes python=2; then - error "Failed to create a Miniconda environment in $virtual_env" - exit 1 - fi - - # More conda PATH nonsense - our deactivate script cleans up the PATH - # completely, leaving no trace of conda cruft. - mv $virtual_env/bin/deactivate $virtual_env/bin/deactivate-original - # You'd better believe it - we need a lot of escaping to make this - # work - conda_root_escaped=$(echo $conda_root | sed 's/\//\\\\\\\\\\\\\\\//g') - sed "s/%%CONDA_ROOT%%/$conda_root_escaped/" $CONF_DIR/deactivate.skel > $virtual_env/bin/deactivate - chmod +x $virtual_env/bin/deactivate - # Flag this virtual environment so we remember that we have a custom - # deactivate script - touch $virtual_env/$CONDA_FLAG_FILE - success "Successfully installed the Miniconda environment in $virtual_env" - success "'source $virtual_env/bin/activate $virtual_env' to start using it" -elif [ -f $virtual_env/$CONDA_FLAG_FILE ]; then - if [ x$verbose != 'x' ]; then - warning "Already installed a Miniconda environment in $virtual_env" - else - echo "Reusing an existing Miniconda environment in $virtual_env" - fi - echo "'source $virtual_env/bin/activate $virtual_env' to start using it" -else - error "Failed to create a Miniconda environment: $virtual_env exists and is in use" - exit 1 -fi diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/kickstart-python.sh b/disdat/infrastructure/dockerizer/context.template/kickstart/bin/kickstart-python.sh deleted file mode 100755 index 80bb8ff..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/kickstart-python.sh +++ /dev/null @@ -1,281 +0,0 @@ -#!/bin/bash -# -# Kickstart a Python virtual environment. Tries to deal with the differences -# between installing pip-managed and conda-managed environments. - -SH_FILE=$(basename $0) -SH_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -source $SH_DIR/../etc/common.rc - -if [ x$CONF_DIR == 'x' ]; then - CONF_DIR=$SH_DIR/../etc -fi -source $CONF_DIR/kickstart.conf - -USAGE="Usage: $SH_FILE [-h] [-d] [-a] [-f] [-n] [-v] [-c CONDA_VERSION] [-m CONDA_ROOT] " - -conda_root=$CONDA_ROOT_DEFAULT -conda_version=$CONDA_VERSION_DEFAULT -use_conda='' - -force_pip='' - -no_cache='' - -function usage() { -cat << EOF -Install dependencies in a Python virtual environment. 
- -$USAGE - -Positional arguments: - VIRTUAL_ENV : The destination virtual environment path - REQUIREMENTS_FILE : An optional pip-style requirements file - -Options: - -h : Get help - -d : Set debug mode (echoes commands) - -a : Use Miniconda with Intel MKL instead of pip install - -b : Build binaries for non-pure-Python dependencies from source (i.e., - do not use manylinux wheels) - -c CONDA_VERSION : Specify the Miniconda version (default $CONDA_VERSION_DEFAULT); - set to '' to use the most recent version - -m CONDA_ROOT : Specify the Miniconda installation root - (default $CONDA_ROOT_DEFAULT) - -f : Force pip install even if we detect Miniconda with Intel MKL - -n : No cache : Disable pip package/wheel caches, and flush Miniconda - cache after kickstart - -v : Verbose : Warn if we are reusing an existing virtual environment - -WARNING: If your requirements file contains a relative pathname (for -example, when supplying an 'editable' package such as '-e .', the installer -will do the wrong thing if your working directory is not the directory that -contains the requirements file. -EOF -} - -while getopts "hdabc:fm:nv" opt; do - case "$opt" in - h) usage - exit 0 - ;; - d) set -x - debug=yes - ;; - a) use_conda=yes - ;; - b) no_manylinux=yes - ;; - c) conda_version=$OPTARG - ;; - f) force_pip=yes - ;; - m) conda_root=$OPTARG - use_conda=yes - ;; - n) no_cache='--no-cache-dir' - ;; - v) verbose=yes - ;; - *) - echo $USAGE - exit 1 - esac -done - -shift $((OPTIND-1)) - -virtual_env=${1} -if [ x$virtual_env == 'x' ]; then - error "Could not find VIRTUAL_ENV argument" - echo $USAGE - exit 1 -fi - -has_requirements=yes -requirements=${2} -if [ x$requirements == 'x' ]; then - has_requirements='' -elif [ ! -e $requirements ]; then - error "Could not find requirements file $requirements" - exit 1 -elif [ ! -f $requirements ]; then - error "Got invalid requirements file $requirements: Not a regular file" - exit 1 -fi - -# If the 'requirements' file is actually a source distribution, extract the -# setup.py file. -if [ x$has_requirements != 'x' ]; then - package_name_maybe=$(basename $requirements .tar.gz) - setup_py_dir= - if [ $(basename $requirements) == ${package_name_maybe}.tar.gz ]; then - # Sorry - no cleanup if the script dies... - setup_py_dir=$(mktemp -d) - if ! tar xvzf $requirements -C $setup_py_dir $package_name_maybe/setup.py; then - error "Got invalid Python sdist $requirements" - exit 1 - fi - requirements=$setup_py_dir/$package_name_maybe/setup.py - fi -fi - -if [ x$use_conda != 'x' -a x$force_pip != 'x' ]; then - error "Got conflicting requests: Use conda or force pip?" - exit 1 -fi - -if [ x$no_cache != 'x' -a x$verbose != 'x' ]; then - warning "pip/conda caches disabled" -fi - -if [ \( x$use_conda != 'x' -o -x $virtual_env/bin/conda \) -a x$force_pip == 'x' ]; then - if [ x$conda_version != x ]; then - conda_version_flag="-c $conda_version" - else - conda_version_flag= - fi - if ! $SH_DIR/kickstart-conda.sh $conda_version_flag -m $conda_root $virtual_env; then - error "Failed to install basic Miniconda virtual envionment" - exit 1 - fi - - source $virtual_env/bin/activate $virtual_env - # Whatever happens now, make sure we deactivate the virtual environment - # and remove the temp file. 
- non_conda_requirements=$(mktemp) - function atexit_deactivate { - if [ x$debug == 'x' ]; then - rm -f $non_conda_requirements - fi - source deactivate - } - trap atexit_deactivate EXIT - - install_pip - # mock is required by select_conda_packages.py - run pip install --quiet mock - - if [ x$no_manylinux != 'x' ]; then - # Prevent pip from installing generic 'manylinux' wheels. - run cp -p $SH_DIR/../etc/_manylinux.py $virtual_env/lib/python2.7/site-packages - fi - - # Install whatever Continuum provides - conda_packages=$($SH_DIR/select_conda_packages.py $requirements 2> $non_conda_requirements) - if ! conda install --yes $conda_packages mkl mkl-service; then - error "Failed to install Python requirements" - exit 1 - fi - # Install whatever Continuum doesn't provide. If the input was a - # setup.py file, we have to use the recycled output from - # select_conda_packages.py. - if [ x$has_requirements != 'x' ]; then - if [ $(basename $requirements) == "setup.py" ]; then - if [ x$(wc -w $non_conda_requirements | awk '{print $1}') != 'x0' ]; then - if ! pip --disable-pip-version-check $no_cache install -r $non_conda_requirements; then - error "Failed to install pip-only Python requirements" - exit 1 - fi - fi - else - if ! pip --disable-pip-version-check $no_cache install -r $requirements; then - error "Failed to install pip-only Python requirements" - exit 1 - fi - fi - fi - - if [ x$no_cache != 'x' ]; then - if ! conda clean -a -y; then - warning "Failed to flush Miniconda cache" - fi - fi - - use_conda=yes -else - echo "Using Python virtual environment $virtual_env" - - if [ x$(which virtualenv) == 'x' ]; then - error "Failed to find virtualenv command" - exit 1 - fi - - VIRTUALENV_VERSION=16.6.0 - if [ $(virtualenv --version) != $VIRTUALENV_VERSION ]; then - warning "Expected virtualenv $VIRTUALENV_VERSION, got $(virtualenv --version)" - fi - - if [ -d $virtual_env -a x$force_pip == 'x' ]; then - if [ x$verbose != 'x' ]; then - warning "Found an existing directory $(virtual_env); not running virtualenv" - fi - else - virtualenv $virtual_env - fi - source $virtual_env/bin/activate - - # If we succeeded, the activation script will set VIRTUAL_ENV - if [ x$VIRTUAL_ENV == 'x' ]; then - error "Failed to activate Python virtual environment in $virtual_env" - exit 1 - fi - - # Whatever happens now, make sure we deactivate the virtual environment - old_cwd=$PWD - requirements_from_setup=$(mktemp) - function atexit_deactivate { - if [ x$debug == 'x' ]; then - rm -f $requirements_from_setup - fi - cd $old_cwd - deactivate - } - trap atexit_deactivate EXIT - - install_pip - # mock is required by find_packages_from_setup.py - run pip install --quiet mock - - if [ x$no_manylinux != 'x' ]; then - # Prevent pip from installing generic 'manylinux' wheels. - run cp -p $SH_DIR/../etc/_manylinux.py $virtual_env/lib/python2.7/site-packages - fi - - # The authors of SciPy feel that dependency checking is (a) for the - # weak, or (b) for anyone that actually wants to get work done instead - # of dorking around for hours with arcane 'pip install' misfires, so - # it must be installed outside of 'pip install -r requirements.txt'. - if [ x$has_requirements != 'x' ]; then - if [ $(basename $requirements) == "setup.py" ]; then - $SH_DIR/find_packages_from_setup.py $requirements > $requirements_from_setup - requirements=$requirements_from_setup - fi - numpy=$(grep ^numpy $requirements) - if [ -n "$numpy" ] && ! 
pip --disable-pip-version-check $no_cache install $numpy; then - error "Failed to install $numpy" - exit 1 - fi - scipy=$(grep ^scipy $requirements) - if [ -n "$scipy" ] && ! pip --disable-pip-version-check $no_cache install $scipy; then - error "Failed to install $scipy" - exit 1 - fi - if ! pip --disable-pip-version-check $no_cache install -r $requirements; then - error "Failed to install Python requirements" - exit 1 - fi - fi -fi - -success "Successfully installed the Python virtual environment" -if [ x$use_conda != 'x' ]; then - success "'source $virtual_env/bin/activate $virtual_env' to start using it" -else - success "'source $virtual_env/bin/activate' to start using it" -fi - -if [ x$setup_py_dir != x ]; then - rm -r $setup_py_dir -fi diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/select_conda_packages.py b/disdat/infrastructure/dockerizer/context.template/kickstart/bin/select_conda_packages.py deleted file mode 100755 index d5003e9..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/bin/select_conda_packages.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python - -# This Python script must be DEAD SIMPLE - it can *only* use standard -# Python modules. We relax this rule if you want to install from a setup.py -# file. - -import argparse -import os -import re -import subprocess -import sys -import tempfile - -from find_packages_from_setup import find_packages - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Select conda-provided Python packages from a pip-style requirements file' - ) - parser.add_argument( - 'requirements', - type=str, - help='The requirements file', - ) - parser.add_argument( - '--no-non-conda', - action='store_false', - help='Do not output non-conda-provided packages (default is to output non-conda-provided packages on stderr)', - dest='non_conda', - ) - args = parser.parse_args() - - if not os.path.exists(args.requirements): - raise RuntimeError('Failed to find file {}'.format(args.requirements)) - - with tempfile.NamedTemporaryFile() as sanitized_file: - # Create a sanitized requirements file that does not contain https - # or -e pseudo-package references. Because Anaconda hates pip. - packages = [] - if os.path.basename(args.requirements) == 'setup.py': - packages = find_packages(args.requirements) - else: - with open(args.requirements, 'r') as requirements_file: - packages = [p.rstrip() for p in requirements_file if not (p.startswith('git+') or p.startswith('http') or p.startswith('-e'))] - sanitized_file.write('\n'.join(packages)) - sanitized_file.seek(0) - # Use the conda installer to figure out which packages Continuum does - # and does not host. - non_conda_packages_p = subprocess.Popen( - ['conda', 'install', '--dry-run', '--file', sanitized_file.name], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True - ) - # The installer lists the non-conda packages in stderr in the following - # format: - # - package_name [version constraint operators (<>=)] version_num - # - # Oh yeah, and Python package names are case-insensitive. - non_conda_packages_raw = non_conda_packages_p.communicate()[1] - non_conda_packages_p.wait() - non_conda_packages = [] - if non_conda_packages_p.returncode != 0: - non_conda_packages = [p.lstrip(' -').split()[0].lower() for p in non_conda_packages_raw.split('\n') if re.match(r'^\s*-', p)] - # Drop the version constraint, and filter out packages not provided by - # Continuum. 
- sanitized_file.seek(0) - if len(non_conda_packages) > 0: - non_conda_packages_re = r'^({})[<>=]*'.format('|'.join(non_conda_packages)) - conda_packages = [p.rstrip() for p in sanitized_file if not re.match(non_conda_packages_re, p.lower())] - else: - conda_packages = [p.rstrip() for p in sanitized_file] - # Print the filtered requirements in a format usable with the conda - # installer. - print '\n'.join(conda_packages) - if args.non_conda: - print >> sys.stderr, '\n'.join(non_conda_packages) diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/_manylinux.py b/disdat/infrastructure/dockerizer/context.template/kickstart/etc/_manylinux.py deleted file mode 100644 index 642b837..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/_manylinux.py +++ /dev/null @@ -1 +0,0 @@ -manylinux1_compatible = False diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/common.rc b/disdat/infrastructure/dockerizer/context.template/kickstart/etc/common.rc deleted file mode 100644 index 42e7bc2..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/common.rc +++ /dev/null @@ -1,90 +0,0 @@ -# Look at all the pretty colors! - -GREEN='\033[0;32m' -RED='\033[0;31m' -REGULAR='\033[0m' -YELLOW='\033[0;33m' - -function success() { - echo -e $GREEN"$@"$REGULAR -} - -function warning() { - (>&2 echo -e $YELLOW"WARNING: $@"$REGULAR) -} - -function error() { - (>&2 echo -e $RED"ERROR: $@"$REGULAR) -} - -# Trap execution errors without cluttering up the script -# http://stackoverflow.com/questions/5195607/checking-bash-exit-status-of-several-commands-efficiently -function run() { - "$@" - local status=$? - if [ $status -ne 0 ]; then - error "Failed to run command $@" - exit $status - fi - return $status -} - -function install_pip() { - # Upgrade pip past 7.1.2 to work with our private package cache - if ! pip --disable-pip-version-check $no_cache --quiet install --upgrade pip; then - error "Failed to upgrade pip" - exit 1 - fi -} - -function install_twine() { - # Upgrade pip past 7.1.2 to work with our private package cache - if ! pip --disable-pip-version-check $no_cache --quiet install twine; then - error "Failed to install twine" - exit 1 - fi -} - -function check_python_virtual_env() -{ - echo "Using Python virtual environment $python_virtual_env" - - if [ ! -d $python_virtual_env ]; then - error "Python virtual environment directory '$python_virtual_env' does not exist" - exit 1 - fi -} - -function activate_python_virtual_env() -{ - # Activate the virtual environment. Anaconda and pip use different - # activation scripts with slightly different syntax. 
- - anaconda_flag='' - source $python_virtual_env/bin/activate &> /dev/null - # If we succeeded, the activation script will set VIRTUAL_ENV - if [ x$VIRTUAL_ENV == 'x' ]; then - # OK, maybe this is an Anaconda environment - source $python_virtual_env/bin/activate $python_virtual_env - # If we succeeded, the activation script will set CONDA_ENV_PATH - if [ x$CONDA_ENV_PATH != 'x' ]; then - anaconda_flag='yes' - else - error "Failed to activate Python virtual environment in $python_virtual_env" - exit 1 - fi - fi - - # Whatever happens now, make sure we deactivate the virtual environment - - old_cwd=$PWD - function atexit_deactivate { - if [ x$anaconda_flag != 'x' ]; then - source $python_virtual_env/bin/deactivate - else - deactivate - fi - cd $old_cwd - } - trap atexit_deactivate EXIT -} diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/deactivate.skel b/disdat/infrastructure/dockerizer/context.template/kickstart/etc/deactivate.skel deleted file mode 100755 index 9ca5b44..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/deactivate.skel +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -CONDA_ROOT=%%CONDA_ROOT%% - -# Ensure that this script is sourced, not executed -# Note that if the script was executed, we're running inside bash! -# Also note that errors are ignored as `activate foo` doesn't generate a bad -# value for $0 which would cause errors. -if [[ -n $BASH_VERSION ]] && [[ "$(basename "$0" 2> /dev/null)" == "deactivate" ]]; then - >&2 echo "Error: deactivate must be sourced. Run 'source deactivate' -instead of 'deactivate'. -" - exit 1 -fi - -# Determine the directory containing this script -if [[ -n $BASH_VERSION ]]; then - _SCRIPT_LOCATION=${BASH_SOURCE[0]} -elif [[ -n $ZSH_VERSION ]]; then - _SCRIPT_LOCATION=${funcstack[1]} -else - echo "Only bash and zsh are supported" - return 1 -fi -_THIS_DIR=$(dirname "$_SCRIPT_LOCATION") - -# Source the original deactivate script -source $_THIS_DIR/deactivate-original - -if [ $? == 0 ]; then - # Strip off the Miniconda bin directory from the front of the PATH - echo cleaning up PATH - _NEW_PATH=$(echo $PATH | sed "s/$CONDA_ROOT\\/bin://") - export PATH=$_NEW_PATH -fi diff --git a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/kickstart.conf b/disdat/infrastructure/dockerizer/context.template/kickstart/etc/kickstart.conf deleted file mode 100644 index eac5371..0000000 --- a/disdat/infrastructure/dockerizer/context.template/kickstart/etc/kickstart.conf +++ /dev/null @@ -1,23 +0,0 @@ -# kickstart configuration file - -# (Use environment variables if available) - -# -# Miniconda configuration -# - -# Flag file to mark kickstarted conda environments -CONDA_FLAG_FILE=installed-by-kickstart-conda - -# Installation root -if [ -n "$CONDA_ROOT" ]; then - CONDA_ROOT_DEFAULT=$CONDA_ROOT -else - CONDA_ROOT_DEFAULT=$HOME/conda -fi - -if [ -n "$CONDA_VERSION" ]; then - CONDA_VERSION_DEFAULT=$CONDA_VERSION -else - CONDA_VERSION_DEFAULT=4.0.9 -fi diff --git a/disdat/infrastructure/dockerizer/setup_tools_commands.py b/disdat/infrastructure/dockerizer/setup_tools_commands.py deleted file mode 100644 index 3470564..0000000 --- a/disdat/infrastructure/dockerizer/setup_tools_commands.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright 2015, 2016, 2017, 2018, 2019 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from distutils.core import Command - - -class DistributionName(Command): - - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - print(self.distribution.metadata.name) diff --git a/disdat/lineage.py b/disdat/lineage.py index 8921b80..0012c20 100644 --- a/disdat/lineage.py +++ b/disdat/lineage.py @@ -14,8 +14,6 @@ # limitations under the License. # -from __future__ import print_function - import disdat.api as api import disdat.fs from disdat import logger as _logger diff --git a/disdat/log.py b/disdat/log.py index fdcd68f..f8b7c94 100644 --- a/disdat/log.py +++ b/disdat/log.py @@ -1,8 +1,25 @@ +# +# Copyright Human Longevity, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + """ Log === Package wide logging and logging utilities """ + import sys import logging import contextlib diff --git a/disdat/pipe.py b/disdat/pipe.py deleted file mode 100644 index 46e7f01..0000000 --- a/disdat/pipe.py +++ /dev/null @@ -1,1127 +0,0 @@ -# -# Copyright 2015, 2016 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -""" -Pipe - -A "task" in disdat. - -This inherits from Luigi's Abstract Task Class - -The idea is that parameters are actually the parameters to the run function -requires is the tasks that have to run before this task runs - -output() is basically a function that says, given the parameters, what is the output -of this task. - -inputs() isn't used as much, but it says, here is a list of inputs I expect to be -available before I run. 
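For orientation, the contract this docstring describes is plain Luigi: the parameters are the arguments to run(), requires() names the upstream tasks, and output() is a pure function of the parameters. A minimal sketch in stock Luigi (the task name and output path below are hypothetical, not part of Disdat):

``
    import luigi

    class Square(luigi.Task):
        x = luigi.IntParameter()       # parameters are effectively the arguments to run()

        def requires(self):
            return []                  # upstream tasks that must complete before this one

        def output(self):
            # Given only the parameters, declare where this task's result lives.
            return luigi.LocalTarget('/tmp/square_{}.txt'.format(self.x))

        def run(self):
            with self.output().open('w') as f:
                f.write(str(self.x * self.x))
``

PipeTask (below) layers bundle management on top of exactly this contract.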
- -author: Kenneth Yocum -""" - -import os -import sys -import time -import hashlib -from types import GeneratorType - -import luigi -from luigi import worker - -from disdat.pipe_base import PipeBase, YIELD_PIPETASK_ARG_NAME, MISSING_EXT_DEP_UUID -from disdat.common import BUNDLE_TAG_TRANSIENT, BUNDLE_TAG_PUSH_META, ExtDepError -from disdat.hyperframe import HyperFrameRecord -from disdat import logger as _logger -from disdat.fs import DisdatFS -import disdat.api as api -from disdat.apply import different_code_versions - - -class PipeTask(luigi.Task, PipeBase): - """ - user_arg_name: - calling_task: - driver_output_bundle: - force: - output_tags: - incremental_push: - incremental_pull: - """ - user_arg_name = luigi.Parameter(default=None, significant=False) # what the outputs are referred to by downstreams - is_root_task = luigi.BoolParameter(default=None, significant=False) - - root_output_bundle_name = luigi.Parameter(default=None, significant=False) - forced_output_bundle_uuid = luigi.Parameter(default=None, significant=False) - - force = luigi.BoolParameter(default=False, significant=False) - output_tags = luigi.DictParameter(default={}, significant=False) - - # Each pipeline executes wrt a data context. - data_context_name = luigi.Parameter(default=None, significant=False) - - # Each pipeline can be configured to commit and push intermediate values to the remote - incremental_push = luigi.BoolParameter(default=False, significant=False) - - # Each pipeline can be configured to pull intermediate values on demand from the remote - incremental_pull = luigi.BoolParameter(default=False, significant=False) - - def __init__(self, *args, **kwargs): - """ - This has the same signature as luigi.Task. - Go through this class and get the set of params we define - - Args: - *args: - **kwargs: - """ - - super(PipeTask, self).__init__(*args, **kwargs) - - # Luigi may copy the task by first serializing an existing task using task.to_str_params() - # This forces string parameters set to None to 'None' - if self.root_output_bundle_name == 'None': - self.root_output_bundle_name = None - if self.forced_output_bundle_uuid == 'None': - self.forced_output_bundle_uuid = None - - self._cached_processing_id = None - self._cached_output_bundle = None # refers to either a new bundle or existing bundle: b.uuid and b.local_dir - self._is_new_output = None - - # Instance variables to track various user wishes - self.user_set_human_name = None # self.set_bundle_name() - self.user_tags = {} # self.add_tags() - self.add_deps = {} # self.add(_external)_dependency() - self.yield_deps = {} # self.yield_dependency() - self._input_tags = {} # self.get_tags() of upstream tasks - self._input_bundle_uuids = {} # self.get_bundle_uuid() of upstream tasks - self._mark_force = False # self.mark_force() - - def input_bundles(self): - """ - Given this pipe, return the set of bundles that this task used as input. - Return a list of tuples that contain (processing_name, uuid, arg_name) - - NOTE: Calls task.deps which calls task._requires which calls task.requires() - - Args: - self (disdat.PipeTask): The pipe task in question - - Returns: - (dict(str:`disdat.api.Bundle`)): {arg_name: bundle, ...} - """ - - input_bundles = {} - for task in self.deps(): - b = task.output_bundle - assert b is not None - input_bundles[task.user_arg_name] = b - return input_bundles - - @property - def output_bundle(self): - """ Resolve this task to an existing bundle or get a new bundle. 
- - If this is a new bundle (a bundle does not yet exist for this processing name) - then we will create an output directory and cache a pointer to this bundle in this object. - Otherwise cache the closed bundle with the result. - - However, in the case of yielded tasks, Luigi re-creates the task by serializing the parameters and - deserializing them during task creation, often creating a new object. Now we have another task - that represents this same data (and processing name), but when the original asks for its output bundle it - doesn't have the *same* uuid that the new task has. Yikes! - - So we need a remote db to store the processingname -> bundle uuid binding. It's the data context (and - the sqlite db). But we only push to the db on bundle write (it's either closed / on disk / in db, or nothing). - So storing an additional bundle state would require some changes to the db. - - In lieu of that, we adopt the following slightly more expensive policy: - * if cache empty: cache returned bundle from resolve. - * if cached and closed: return bundle - * if cached and open: call resolve. [Did an identical task write this logical bundle (by processing name)?] - * if new_bundle is closed: set cached to closed bundle - * elif new_bundle is open: abondon bundle. - * if closed: return cached closed bundle. There should be no situation where resolve returns a different bundle. - - It is possible that two tasks do the same work and write to two different bundles with the same processing name. - This won't be incorrect, just inefficient. Future fix would be to synchronize on the DB vis-a-vis above. - """ - if self._cached_output_bundle is None: - self._cached_output_bundle = self._resolve_bundle(self.pfs.get_context(self.data_context_name)) - if not self._cached_output_bundle.closed: - self._cached_output_bundle.open(force_uuid=self.forced_output_bundle_uuid) - else: - if not self._cached_output_bundle.closed: - possible_output = self._resolve_bundle(self.pfs.get_context(self.data_context_name)) - # <------- RACE -------> between not closed and check, i.e., not thread safe. - if possible_output.closed: - if possible_output.uuid == self._cached_output_bundle.uuid: - # Note, we cannot "clean up" the bundle if we GC one. See Bundle.abandon() - del self._cached_output_bundle - else: - self._cached_output_bundle.abandon() - self._cached_output_bundle = possible_output - - return self._cached_output_bundle - - @property - def pipe_output(self): - """ Return the data in the bundle presented as the Python type with which it was stored """ - return self.output_bundle.data - - @property - def is_new_output(self): - if self._is_new_output is None: - self._is_new_output = not self.output_bundle.closed - return self._is_new_output - - def processing_id(self): - """ - Given a pipe instance, return the "processing_name" -- a unique string based on the class name and - the parameters. This re-uses Luigi code for getting the unique string. - - processing_id = task_class_name + hash(task params + [ dep0_task_name + hash(dep0_params), - dep1_task_name + hash(dep1_params),...]) - - In typical Luigi workflows, a task's parameters uniquely determine the requires function outputs. - - Thus there is no need to include the data inputs in the uniqueness of the processing name. - - But, some facilities on top of Disdat might build dynamic classes. 
In these cases, upper layer code - might define a pipeline in python like: - ``` - @punch.task - def adder(a,b): - return a+b - - def pipeline(X): - u = adder(a=1, b=2) - v = adder(a=u, b=X+1) - w = adder(a=v, b=X+2) - ``` - - In this case, there is a new task for each `adder`. Each`adder`'s requirements are defined in a scope outside - of the dynamic classes requires method. - - NOTE: The PipeTask has a 'driver_output_bundle'. This the name of the pipeline output bundle given by the user. - Because this is a Luigi parameter, it is included in the Luigi self.task_id string and hash. So we do not - have to append this separately. - - """ - - if self._cached_processing_id is not None: - return self._cached_processing_id - - deps = self.requires() - - assert(isinstance(deps, dict)) - - input_task_processing_ids = [(deps[k]).processing_id() for k in sorted(deps)] - - as_one_str = '.'.join(input_task_processing_ids) - - param_hash = hashlib.md5(as_one_str.encode('utf-8')).hexdigest() - - processing_id = self.task_id + '_' + param_hash[:luigi.task.TASK_ID_TRUNCATE_HASH] - - self._cached_processing_id = processing_id - - return processing_id - - def human_id(self): - """ - This is the "human readable" name; a "less unique" id than the unique id. - - The pipeline_id is well-defined for the output task -- it is the output bundle name. For intermediate outputs - the pipeline_id defaults to the pipe_id(). Else, it may be set by the task author. - - Note: Should we provide an identify for which version of this pipe is running at which stage in the pipeline? - Short answer, no. Imagine if we name with the pipeline bundle output name, branch index, and level index. In - this case if anyone re-uses this output, the human_name for the bundle won't be meaningful. For the pipeline - owner, it may also not be helpful. The system may also place different outputs at different times under those - indices. Too complicated. - - Returns: - (str) - - """ - - if self.root_output_bundle_name is not None: - return self.root_output_bundle_name - elif self.user_set_human_name is not None: - return self.user_set_human_name - else: - default_human_name = type(self).__name__ - return "{}".format(default_human_name) - - def get_hframe_uuid(self): - """ Return the unique ID for this tasks current output hyperframe - - Returns: - hframe_uuid (str): The unique identifier for this task's hyperframe - """ - return self.output_bundle.uuid - - def upstream_hframes(self): - """ Convert upstream tasks to hyperframes, return list of hyperframes - - Returns: - (:list:`hyperframe.HyperFrameRecord`): list of upstream hyperframes - """ - - tasks = self.deps() - hfrs = [] - for t in tasks: - hfid = t.get_hframe_uuid() - hfrs.append(self.pfs.get_hframe_by_uuid(hfid, data_context=self.pfs.get_context(self.data_context_name))) - - return hfrs - - def requires(self): - """ - Return Tasks on which this task depends. - - Build them intelligently, however. - 1.) The input_df so far stays the same for all upstream pipes. - 2.) However, when we resolve the location of the outputs, we need to do so correctly. 
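As a worked illustration of the hashing scheme processing_id() describes above: the digest is an MD5 over the joined upstream processing ids, truncated and appended to the task's own Luigi task_id. The standalone helper and example ids below are hypothetical; the real method uses luigi.task.TASK_ID_TRUNCATE_HASH for the truncation length.

``
    import hashlib

    def processing_id(task_id, upstream_processing_ids, truncate=10):
        # Join the upstream processing ids (sorted by their argument names before this
        # call), hash the result, and append a truncated digest to the Luigi task_id.
        as_one_str = '.'.join(upstream_processing_ids)
        param_hash = hashlib.md5(as_one_str.encode('utf-8')).hexdigest()
        return '{}_{}'.format(task_id, param_hash[:truncate])

    # e.g. processing_id('Adder_a_1_b_2_9ad6f8e2', ['Adder_a_1_b_0_1c0ad5e7'])
    #      -> 'Adder_a_1_b_2_9ad6f8e2_' followed by 10 hex digits
``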
- - Args: - - Returns: - (dict): arg_name: task_class - """ - - kwargs = self.prepare_pipe_kwargs() - - self.add_deps.clear() - self.pipe_requires(**kwargs) - rslt = self.add_deps - - if len(self.add_deps) == 0: - return {} - - tasks = {} - - for user_arg_name, cls_and_params in rslt.items(): - pipe_class, params = cls_and_params[0], cls_and_params[1] - assert isinstance(pipe_class, luigi.task_register.Register) - self._update_dependency_pipe_params(params, user_arg_name) - tasks[user_arg_name] = pipe_class(**params) - - return tasks - - def _update_dependency_pipe_params(self, params, user_arg_name): - """ - Given a parameter dictionary, update with PipeTask parameters - Never called on the user's root task. It's important to delete - root_output_bundle_name and forced_output_bundle_uuid b/c if someone - yields a PipeTask, Luigi doesn't take the instance, they make a new one - and the parameter serializer replaces None with 'None' for string params. - - Args: - params (dict): a dictionary of str: arg for a new PipeTask class - user_arg_name (str): The user's name of the resulting task output - - Returns: - (dict) - """ - # we propagate the same inputs and the same output dir for every upstream task! - params.update({ - 'user_arg_name': user_arg_name, - 'is_root_task': False, - 'force': self.force, - 'output_tags': dict({}), # do not pass output_tags up beyond root task - 'data_context_name': self.data_context_name, # all operations wrt this context - 'incremental_push': self.incremental_push, # propagate the choice to push incremental data. - 'incremental_pull': self.incremental_pull # propagate the choice to incrementally pull data. - }) - - return params - - def output(self): - """ - This is the *only* output function for all pipes. It declares the creation of the - one HyperFrameRecord pb and that's it. Remember, has to be idempotent. - - Note: By checking self.output_bundle we are implicitly also looking up our output bundle. - - Return: - (list:str): - - """ - if self.output_bundle is None: - assert(self.uuid == MISSING_EXT_DEP_UUID) # only reason we should be here. - output_path = DisdatFS().get_context(self.data_context_name).get_object_dir() - output_uuid = "this_file_should_not_exist" # look for a file that should not exist - else: - output_path = self.output_bundle.local_dir - output_uuid = self.output_bundle.uuid - - return luigi.LocalTarget(os.path.join(output_path, HyperFrameRecord.make_filename(output_uuid))) - - def run(self): - """ - Call users run function. - 1.) prepare the arguments - 2.) run and gather user result - 3.) interpret and wrap in a HyperFrame - - Returns: - None - """ - kwargs = self.prepare_pipe_kwargs(for_run=True) - bundle = self.output_bundle - assert(bundle is not None) - - """ NOTE: If a user changes a task param in run(), and that param parameterizes a dependency in requires(), - then running requires() post run() will give different tasks. To be safe we record the inputs before run() - """ - add_dep_bundle_inputs = self.input_bundles() - - try: - start = time.time() # P3 datetime.now().timestamp() - user_rtn_val = self.pipe_run(**kwargs) - if isinstance(user_rtn_val, GeneratorType): - try: - while True: # while the user's task is yielding - task_list = next(user_rtn_val) - if not isinstance(task_list, list): task_list = [task_list] - # have we been here before? If so, don't yield. 
- if all(task.complete() for task in task_list): - pass - else: - yield task_list # yield the task or list of tasks - except StopIteration as si: - user_rtn_val = si.value # we always have a return value, it is in si.value - stop = time.time() # P3 datetime.now().timestamp() - except Exception as error: - """ If user's pipe fails for any reason, remove bundle dir and raise """ - try: - _logger.error("User pipe_run encountered exception: {}".format(error)) - bundle.abandon() - except OSError as ose: - _logger.error("User pipe_run encountered error, and error on remove bundle: {}".format(ose)) - raise - - try: - # Add any output tags to the user tag dict - if self.output_tags: - self.user_tags.update(self.output_tags) - - # If this is the root_task, identify it as so in the tag dict - if self.is_root_task: - self.user_tags.update({'root_task': 'True'}) - - """ if we have a pce, we have a new bundle that we need to add info to and close """ - bundle.add_data(user_rtn_val) - - bundle.add_timing(start, stop) - - bundle.add_dependencies(add_dep_bundle_inputs.values(), add_dep_bundle_inputs.keys()) - - yield_output_bundles = [t.output_bundle for t in self.yield_deps.values()] - yield_output_names = self.yield_deps.keys() - bundle.add_dependencies(yield_output_bundles, yield_output_names) - - bundle.name = self.human_id() - - bundle.processing_name = self.processing_id() - - bundle.add_params(self._get_subcls_params()) - - bundle.add_tags(self.user_tags) - - bundle.add_code_ref('{}.{}'.format(self.__class__.__module__, self.__class__.__name__)) - - pipeline_path = os.path.dirname(sys.modules[self.__class__.__module__].__file__) - cv = DisdatFS.get_pipe_version(pipeline_path) - bundle.add_git_info(cv.url, cv.hash, cv.branch) - - bundle.close() # Write out the bundle - - """ Incrementally push the completed bundle """ - if self.incremental_push and (BUNDLE_TAG_TRANSIENT not in bundle.tags): - self.pfs.commit(None, None, uuid=bundle.uuid, data_context=self.pfs.get_context(self.data_context_name)) - self.pfs.push(uuid=bundle.uuid, data_context=self.pfs.get_context(self.data_context_name)) - - except Exception as error: - """ If we fail for any reason, remove bundle dir and raise """ - bundle.abandon() - raise - - return None - - def _get_subcls_params(self): - """ Given the child class, extract user defined Luigi parameters - - This function uses vars(cls) and filters by Luigi Parameter - types. Luigi get_params() gives us all parameters in the full class hierarchy. - It would give us the parameters in this class as well. And then we'd have to do set difference. - See luigi.Task.get_params() - - Args: - self: The instance of the subclass. To get the normalized values for the Luigi Parameters - Returns: - dict: {:'string value',...} - """ - cls = self.__class__ - params = {} - for param in vars(cls): - attribute = getattr(cls, param) - if isinstance(attribute, luigi.Parameter): - params["{}".format(param)] = attribute.serialize(getattr(self, param)) - return params - - @classmethod - def _put_subcls_params(cls, ser_params): - """ Given the child class, create the Luigi parameter dictionary - - Assume that ser_params dictionary keys are the attribute names in the Disdat task class. - - Args: - self: The instance of the subclass. 
To get the normalized values for the Luigi Parameters - ser_params (dict): Dictionary : - Returns: - deser_params (dict): {: Luigi.Parameter,...} - """ - deser_params = {} - for param, ser_value in ser_params.items(): - try: - attribute = getattr(cls, param) - assert isinstance(attribute, luigi.Parameter) - deser_params[param] = attribute.parse(ser_value) - except Exception as e: - _logger.warning("Bundle parameter ({}:{}) can't be deserialized by class({}): {}".format(param, - ser_value, - cls.__name__, - e)) - raise e - return deser_params - - def prepare_pipe_kwargs(self, for_run=False): - """ Each upstream task produces a bundle. Prepare that bundle as input - to the user's pipe_run function. - - Args: - for_run (bool): prepare args for run -- at that point all upstream tasks have completed. - - Returns: - (dict): A dictionary with the arguments. - - """ - kwargs = dict() - - # Place upstream task outputs into the kwargs. Thus the user does not call - # self.inputs(). If they did, they would get a list of output targets for the bundle - if for_run: - - # Reset the stored tags, in case this instance is run multiple times. - self._input_tags = {} - self._input_bundle_uuids = {} - - upstream_tasks = [(t.user_arg_name, t.output_bundle) for t in self.deps()] - - for user_arg_name, b in [u for u in upstream_tasks if u[1] is not None]: - assert b.is_presentable - - # Download data that is not local (the linked files not present). Default when running in container. - if self.incremental_pull: - b.pull(localize=True) - - if user_arg_name in kwargs: - _logger.warning('Task human name {} reused when naming task dependencies'.format(user_arg_name)) - - self._input_tags[user_arg_name] = b.tags - self._input_bundle_uuids[user_arg_name] = b.uuid - kwargs[user_arg_name] = b.data - - return kwargs - - """ - Pipes Interface -- A pipe implements these calls - """ - - def pipe_requires(self, **kwargs): - """ - This is the place to put your pipeline dependencies. Place - the upstream pipes in an array and a dict for their params - - Args: - **kwargs: - - Returns: - - """ - return None - - def pipe_run(self, **kwargs): - """ - There is only one default argument "input_df" in kwargs. - The other keys in kwargs will be identical to your Luigi parameters specified in this class. - - The input_df has the data context identifiers, e.g., sampleName, sessionId, subjectId - The input_df has the data in either jsonData or fileData. - A sharded task will receive a subset of all possible inputs. - - Args: - **kwargs: - - Returns: - - """ - raise NotImplementedError() - - def yield_dependency(self, pipe_class, params): - """ - Disdat Pipe API Function - - Use this function to dynamically yield dependencies in the pipe_run function. Access these dependency - results by retaining a reference to the PipeTask and calling PipeTask.pipe_output. - `` - def pipe_run(self, some_arg=None): - pipe = self.yield_dependency(IsFish, params=None) - yield pipe - if pipe.pipe_output is True: - print("It's a fish.") - `` - - Args: - pipe_class (object): Class name of upstream task if looking for external bundle by processing_id. - params (dict): Dictionary of parameters if looking for external bundle by processing_id. - - Returns: - `disdat.PipeTask` - """ - if not isinstance(params, dict): - error = "yield_dependency: params argument must be a dictionary" - raise Exception(error) - - assert isinstance(pipe_class, luigi.task_register.Register) - - # Note: we instantiate a new class each time to get the Luigi Task.task_id. 
we don't need the - # PipeTask.processing_name because we only need tasks that are unique to this parent task (the one yielding). - # But we can't just use yielded task order to name them if the task yields in different orders. - self._update_dependency_pipe_params(params, YIELD_PIPETASK_ARG_NAME) - to_yield = pipe_class(**params) - id = to_yield.task_id - - if id not in self.yield_deps: - self.yield_deps[id] = to_yield - else: - if self.yield_deps[id] is not to_yield: del to_yield - - return self.yield_deps[id] - - def add_dependency(self, param_name, pipe_class, params): - """ - Disdat Pipe API Function - - Add a task and its parameters to our requirements - - Args: - param_name (str): The parameter name this bundle assumes when passed to Pipe.run - pipe_class (object): Class name of upstream task if looking for external bundle by processing_id. - params (dict): Dictionary of parameters if looking for external bundle by processing_id. - - Returns: - None - - """ - if not isinstance(params, dict): - error = "add_dependency third argument must be a dictionary of parameters" - raise Exception(error) - - assert (param_name not in self.add_deps) - self.add_deps[param_name] = (pipe_class, params) - - return - - def add_external_dependency(self, param_name, pipe_class, params, human_name=None, uuid=None): - """ - Disdat Pipe API Function - - Add an external task and its parameters to our requirements. What this means is that - there is no run function and, in that case, Luigi will ignore the results of task.deps() (which calls - flatten(self.requires())). And what that means is that this requirement can only be satisfied - by the bundle actually existing. - - Create ersatz ExternalDepTask parameterized by uuid and processing_name - Note: it is possible to use class/params when searching by class, params, but this makes all external - dependencies look the same in the code. Win. - - NOTE: if you add an external dependency by name, it is possible that someone adds a bundle during - execution and that your requires function is no longer deterministic. You must add caching to your - requires function to handle this scenario. - - Example with class variable bundle_uuid: - `` - if self.bundle_uuid is None: - bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result') - self.bundle_uuid = bundle.uuid - else: - bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid) - `` - - TODO: Consider pushing caching into this layer. - - Args: - param_name (str): The parameter name this bundle assumes when passed to Pipe.run - pipe_class (object): Class name of upstream task if looking for external bundle by processing_id. - params (dict): Dictionary of parameters if looking for external bundle by processing_id. - human_name (str): Resolve dependency by human_name, return the latest bundle with that humman_name. Trumps task_class and params. - uuid (str): Resolve dependency by explicit UUID, trumps task_class, params and human_name. 
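A hedged sketch of how add_dependency() was typically used from pipe_requires(), per the description above (Featurize, Train, and the date parameter are illustrative names; disdat.pipe is the module removed by this diff):

``
    import luigi
    from disdat.pipe import PipeTask   # removed by this change

    class Featurize(PipeTask):
        date = luigi.Parameter(default='2020-01-01')

        def pipe_run(self, **kwargs):
            return [1, 2, 3]

    class Train(PipeTask):
        def pipe_requires(self, **kwargs):
            # Featurize's output is handed to pipe_run under the keyword 'features'.
            self.add_dependency('features', Featurize, {'date': '2020-01-01'})

        def pipe_run(self, features=None, **kwargs):
            return {'n_rows': len(features)}
``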
- - Returns: - `api.Bundle` or None - - """ - import disdat.api as api - - if pipe_class is not None and not isinstance(params, dict): - error = "add_external_dependency requires parameter dictionary" - raise Exception(error) - - assert (param_name not in self.add_deps) - - try: - if uuid is not None: - hfr = self.pfs.get_hframe_by_uuid(uuid, data_context=self.pfs.get_context(self.data_context_name)) - elif human_name is not None: - hfr = self.pfs.get_latest_hframe(human_name, data_context=self.pfs.get_context(self.data_context_name)) - else: - # we propagate the same inputs and the same output dir for every upstream task! - params.update({ - 'user_arg_name': param_name, - 'data_context_name': self.data_context_name - }) - p = pipe_class(**params) - hfr = self.pfs.get_hframe_by_proc(p.processing_id(), data_context=self.pfs.get_context(self.data_context_name)) - - if hfr is None: - error_str = "Disdat can't resolve external bundle from class[{}] params[{}] name[{}] uuid[{}]".format(pipe_class, - params, - human_name, - uuid) - raise ExtDepError(error_str) - - bundle = api.Bundle(self.data_context_name).fill_from_hfr(hfr) - - except ExtDepError as error: # Swallow and allow Luigi to determine task is not available. - _logger.error(error_str) - bundle = None - - except Exception as error: - _logger.error(error) - bundle = None - - finally: - if bundle is None: - self.add_deps[param_name] = (luigi.task.externalize(ExternalDepTask), {'uuid': MISSING_EXT_DEP_UUID, - 'processing_name': 'None'}) - else: - # When a task requires an external dep, this can be called multiple times. And then - # we return to the pipe.run which creates the class. Note that calling task.deps() will cause - # the requires() to be called. But calling deps() doesn't mean that task.output() will be called - # And that means is that tasks that have been required might not have their task.cached_output_bundle - # set by calling resolve_bundle. Now, in this case, because we are calling luigi.task.externalize, - # we are actually creating *copies* of the class object and so luigi object caching isn't going to work. - # This means that resolve_bundle must be called when using the cached_output_bundle field. - self.add_deps[param_name] = (luigi.task.externalize(ExternalDepTask), {'uuid': bundle.uuid, - 'processing_name': bundle.processing_name}) - - return bundle - - def create_output_file(self, filename): - """ - Disdat Pipe API Function - - Pass in the name of your file, and get back a Luigi target object to which you can write. - - Args: - filename (str, dict, list): A basename, dictionary of basenames, or list of basenames. - - Returns: - (`luigi.LocalTarget`): Singleton, list, or dictionary of Luigi Target objects. - """ - output_dir = self.output_bundle.local_dir - - return self.filename_to_luigi_targets(output_dir, filename) - - def create_remote_output_file(self, filename): - """ - Disdat Pipe API Function - - Pass in the name of your file, and get back an object to which you can write on S3. - - NOTE: Managed S3 paths are created only if a) remote is set (otherwise where would we put them?) - and b) incremental_push flag is True (if we don't push bundle metadata, then the locations may be lost). - - Args: - filename (str, dict, list): A basename, dictionary of basenames, or list of basenames. - - Returns: - (`luigi.contrib.s3.S3Target`): Singleton, list, or dictionary of Luigi Target objects. 
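For example, a task would write a managed output file from pipe_run() roughly like this (a hedged sketch; the class and file names are illustrative):

``
    from disdat.pipe import PipeTask   # removed by this change

    class Report(PipeTask):
        def pipe_run(self, **kwargs):
            # A Luigi target rooted in this task's managed bundle directory.
            target = self.create_output_file('report.txt')
            with target.open('w') as f:
                f.write('done')
            return {'report': [target.path]}
``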
- - """ - output_dir = self.get_remote_output_dir() - return self.filename_to_luigi_targets(output_dir, filename) - - def create_output_dir(self, dirname): - """ - Disdat Pipe API Function - - Given basename directory name, return a fully qualified path whose prefix is the - local output directory for this bundle in the current context. This call creates the - output directory as well. - - Args: - dirname (str): The name of the output directory, i.e., "models" - - Returns: - output_dir (str): Fully qualified path of a directory whose prefix is the bundle's local output directory. - - """ - prefix_dir = self.get_output_dir() - fqp = os.path.join(prefix_dir, dirname) - try: - os.makedirs(fqp) - except IOError as why: - _logger.error("Creating directory in bundle directory failed:".format(why)) - - return fqp - - def create_remote_output_dir(self, dirname): - """ - Disdat Pipe API Function - - Given basename directory name, return a fully qualified path whose prefix is the - remote output directory for this bundle in the current context. - - NOTE: The current context must have a remote attached. - - Args: - dirname (str): The name of the output directory, i.e., "models" - - Returns: - output_dir (str): Fully qualified path of a directory whose prefix is the bundle's remote output directory. - - """ - prefix_dir = self.get_remote_output_dir() - fqp = os.path.join(prefix_dir, dirname) - return fqp - - def get_output_dir(self): - """ - Disdat Pipe API Function - - Retrieve the output directory for this task's bundle. You may place - files directly into this directory. - - Returns: - output_dir (str): The bundle's output directory - - """ - return self.output_bundle.local_dir - - def get_remote_output_dir(self): - """ - Disdat Pipe API Function - - Retrieve the output directory for this task's bundle. You may place - files directly into this directory. - - Returns: - output_dir (str): The bundle's output directory on S3 - - """ - uuid = self.output_bundle.uuid - - data_context = self.pfs.get_context(self.data_context_name) - if data_context.remote_ctxt_url and self.incremental_push: - output_dir = os.path.join(data_context.get_remote_object_dir(), uuid) - else: - raise Exception('Managed S3 path creation needs a) remote context and b) incremental push to be set') - return output_dir - - def set_bundle_name(self, human_name): - """ - Disdat Pipe API Function - - Set the human name for this bundle. If not called, then intermediate outputs - will have human names identical to their process names. - - Args: - human_name (str): The human name of this pipe's output bundle. - - Returns: - None - - """ - self.user_set_human_name = human_name - - def add_tags(self, tags): - """ - Disdat Pipe API Function - - Adds tags to bundle. - - Args: - tags (dict (str, str)): key value pairs (string, string) - - Returns: - None - """ - assert (isinstance(tags, dict)) - self.user_tags.update(tags) - - def get_tags(self, user_arg_name): - """ - Disdat Pipe API Function - - Retrieve the tag dictionary from an upstream task. - - Args: - user_arg_name (str): keyword arg name of input bundle data for which to return tags - - Returns: - tags (dict (str, str)): key value pairs (string, string) - """ - assert user_arg_name in self._input_tags - return self._input_tags[user_arg_name] - - def get_bundle_uuid(self, user_arg_name): - """ - Disdat Pipe API Function - - Retrieve the UUID from an upstream task. 
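Taken together, the naming and tagging hooks above were typically called as below (a hedged sketch reusing the hypothetical Train task from the earlier example; the bundle name and tag values are illustrative):

``
    from disdat.pipe import PipeTask   # removed by this change

    class Score(PipeTask):
        def pipe_requires(self, **kwargs):
            self.add_dependency('model', Train, {})

        def pipe_run(self, model=None, **kwargs):
            self.set_bundle_name('scored_output')     # human name for this output bundle
            self.add_tags({'stage': 'score'})         # tags written to this output bundle
            upstream_tags = self.get_tags('model')    # tags read from the upstream 'model' input
            return {'n_upstream_tags': len(upstream_tags)}
``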
- - Args: - user_arg_name (str): keyword arg name of input bundle data for which to return tags - - Returns: - uuid (str) - """ - assert user_arg_name in self._input_bundle_uuids - return self._input_bundle_uuids[user_arg_name] - - def mark_force(self): - """ - Disdat Pipe API Function - - Mark pipe to force recompution of this particular task. This means that Disdat/Luigi will - always re-run this particular pipe / task. - - We mark the pipe with a particular flag so that apply.resolve_bundle() - - Returns: - None - """ - self._mark_force = True - - def mark_transient(self, push_meta=True): - """ - Disdat Pipe API Function - - Mark output bundle as transient. This means that during execution Disdat will not - write (push) this bundle back to the remote. That only happens in two cases: - 1.) Started the pipeline with incremental_push=True - 2.) Running the pipeline in a container with no_push or no_push_intermediates False - - We mark the bundle with a tag. Incremental push investigates the tag before pushing. - And the entrypoint investigates the tag if we are not pushing incrementally. - Otherwise, normal push commands from the CLI or api will work, i.e., manual pushes continue to work. - - Args: - push_meta (bool): Push the meta-data but not the data. Else, push nothing. - - Returns: - None - """ - if push_meta: - self.add_tags({BUNDLE_TAG_TRANSIENT: 'True', BUNDLE_TAG_PUSH_META: 'True'}) - else: - self.add_tags({BUNDLE_TAG_TRANSIENT: 'True'}) - - def _resolve_bundle(self, data_context): - """ - Instead of resolving before we run, this resolve can be issued - from within the pipe.output() function. - - Note: Only returns None with an unfound external dependency - - Note: If returning a new bundle, we do *not* open it in this function. Bundle.open() - creates the output directory. Only do that if we know we're going to actually use the - bundle in pipe.output(). - - Args: - self: the pipe to investigate - data_context: the data context object from which we should resolve bundles. - - Returns: - Bundle: bundle to use. If open, new bundle, if closed, re-using - """ - verbose = False - - if verbose: - print("resolve_bundle: looking up bundle {}".format(self.processing_id())) - - if (self._mark_force or self.force) and not isinstance(self, ExternalDepTask): - # Forcing recomputation through a manual annotation or --force directive, unless external - _logger.debug("resolve_bundle: pipe.mark_force forcing a new output bundle.") - if verbose: print("resolve_bundle: pipe.mark_force forcing a new output bundle.\n") - return api.Bundle(data_context) - - if isinstance(self, ExternalDepTask): - # NOTE: Even if add_external_dependency() fails to find the bundle we still succeed here. - # Thus it can look like we reuse a bundle, when in fact we don't. 
We error either - # within the user's requires, add_external_dependency(), or when Luigi can't find the task (current approach) - assert worker._is_external(self) - b = api.get(data_context.get_local_name(), None, uuid=str(self.uuid)) - if b is not None: - if verbose: print("resolve_bundle: found ExternalDepTask re-using bundle with UUID[{}].\n".format(self.uuid)) - else: - if verbose: print("resolve_bundle: No ExternalDepTask found with UUID[{}].\n".format(self.uuid)) - return b - - bndls = api.search(data_context.get_local_name(), processing_name=self.processing_id()) - - if bndls is None or len(bndls) <= 0: - if verbose: print("resolve_bundle: No bundle with proc_name {}, getting new output bundle.\n".format(self.processing_id())) - return api.Bundle(data_context) - - bndl = bndls[0] # our best guess is the most recent bundle with the same processing_id() - - # 2.) Bundle exists - lineage object tells us input bundles. - lng = bndl.get_lineage() - if lng is None: - if verbose: print("resolve_bundle: No lineage present, getting new output bundle.\n") - return api.Bundle(data_context) - - # 3.) Lineage record exists -- if new code, re-run - pipeline_path = os.path.dirname(sys.modules[self.__module__].__file__) - current_version = DisdatFS().get_pipe_version(pipeline_path) - - if different_code_versions(current_version, lng): - if verbose: print("resolve_bundle: New code version, getting new output bundle.\n") - return api.Bundle(data_context) - - # 3.5.) Have we changed the output human bundle name? If so, re-run task. - # Note: we need to go through all the bundle versions with that processing_id. - # because, at the moment, we make new bundles when we change name. When in some sense - # it's just a tag set that should include other names and the data should be the same. - - current_human_name = self.human_id() - found = False - for bndl in bndls: - if current_human_name == bndl.get_human_name(): - found = True - break - if not found: - if verbose: print("resolve_bundle: New human name {} (prior {}), getting new output bundle.\n".format( - current_human_name, bndl.get_human_name())) - return api.Bundle(data_context) - - # 4.) Check the inputs -- assumes we have processed upstream tasks already - for task in self.deps(): - """ Are we re-running an upstream input? - Look through its *current* list of possible upstream tasks, not the ones it had - on its prior run. If the UUID has changed relative to lineage, then we need to re-run. - - In general, the only reason we should re-run an upstream is b/c of a code change. And that change - did not change the tasks parameters. So it looks the same, but it is actually different. OR someone - deletes and re-runs (maybe sql query result changes though parameters are the same). - - But if an output exists and we want to ignore code version and ignore data changes then - while we do this, we should re-use our bundle independent of whether an upstream needs to re-run - or whether one of our inputs is out of date. - - So one option is to ignore upstreams that need to be re-run. Re-use blindly. Like Luigi. - - Another option is that anytime we don't have an input bundle, we attempt to read it not just - locally, but remotely as well. - """ - LUIGI_RERUN = False - - if LUIGI_RERUN: - # Ignore whether upstreams had to be re-run b/c they didn't have bundles. - # Ignore whether this has to be re-run because existing inputs are newer - continue - - if task.output_bundle is None: - # this can happen with bundles created by other pipelines. 
- # still surface the warning, but no longer raise exception - _logger.info( - "Resolve bundles: this pipe's dep {} has no input bundle. Likely an externally produced bundle".format( - task.processing_id())) - else: - if task.is_new_output: - if verbose: print("Resolve_bundle: upstream task is being re-run, so rerun with new output bundle.\n") - return api.Bundle(data_context) - - # Upstream Task - # Resolve to a bundle, UUID and a processing name - # If it is an ordinary task in a workflow, we resolve via the processing name - if worker._is_external(task) and isinstance(task, ExternalDepTask): - upstream_dep_uuid = task.uuid - upstream_dep_processing_name = task.processing_name - else: - found = api.search(data_context.get_local_name(), processing_name=task.processing_id()) - assert len(found) > 0 - local_bundle = found[0] # the most recent with this processing_name - upstream_dep_uuid = local_bundle.pb.uuid - upstream_dep_processing_name = local_bundle.pb.processing_name - assert(upstream_dep_processing_name == task.processing_id()) - - """ Now we need to check if we should re-run this task because an upstream input exists and has been updated - Go through each of the inputs used for this current task. - POLICY - 1.) if the date is more recent, it is "new" data. - 2.) if it is older, we should require force (but currently do not and re-run). - XXX TODO: Add date to the depends_on pb data structure to enforce 2 XXX - """ - for tup in lng.pb.depends_on: - if tup.hframe_proc_name == upstream_dep_processing_name and tup.hframe_uuid != upstream_dep_uuid: - if verbose: print("Resolve_bundle: prior input bundle {} uuid {} has new uuid {}\n".format( - task.processing_id(), - tup.hframe_uuid, - upstream_dep_uuid)) - return api.Bundle(data_context) - - # 5.) Woot! Reuse the found bundle. - if verbose: print("resolve_bundle: reusing bundle\n") - return bndl - - -class ExternalDepTask(PipeTask): - """ This task is only here as a shell. - If the user specifies an external dependency, we look it up in add_external_dependency. - We look it up, b/c we want to hand them the bundle in requires. - If they look up the bundle via UUID or human name, there is no reason for them to - pass in the class. Especially for human name, where they cannot know it. - And, if it exists, there is no reason to look it up again in apply.resolve_bundle(). - Thus we create an ExternalDepTask() parameterized by the UUID and apply.resolve_bundle() - The default output() function will create this tasks processing_id() which will be a hash - of this task's params, which will include a unique UUID. And so should be unique. - """ - uuid = luigi.Parameter(default=None) - processing_name = luigi.Parameter(default=None) - - def input_bundles(self): - """ External bundles output are in lineage. - Note: this is only called in apply for now. And this task can never - be called by apply directly. - """ - assert False, "An ExternalDepTask should never be run directly." diff --git a/disdat/pipe_base.py b/disdat/pipe_base.py deleted file mode 100644 index 22ac45e..0000000 --- a/disdat/pipe_base.py +++ /dev/null @@ -1,249 +0,0 @@ -""" -pipe_base.py - -Unify DriverTask and PipeTask with one abstract base class. - -""" - -# Using print as a function makes it easier to switch between printing -# during development and using logging.{debug, info, ...} in production. 
-from __future__ import print_function - -from abc import ABCMeta, abstractmethod -import os -import shutil -import collections - -import luigi -from luigi.contrib.s3 import S3Target -import six -from six.moves import urllib -import numpy as np -import pandas as pd - -import disdat.common as common -from disdat.fs import DisdatFS -from disdat.data_context import DataContext -from disdat.hyperframe import HyperFrameRecord, FrameRecord -import disdat.hyperframe_pb2 as hyperframe_pb2 -from disdat import logger as _logger - - -CodeVersion = collections.namedtuple('CodeVersion', 'semver hash tstamp branch url dirty') - -MISSING_EXT_DEP_UUID = 'UnresolvedExternalDep' -YIELD_PIPETASK_ARG_NAME = "YieldArgName" - -class PipeBase(object): - __metaclass__ = ABCMeta - - @property - def pfs(self): - return DisdatFS() - - @abstractmethod - def output_bundle(self): - """ - Given this pipe, return the set of bundles created by this pipe. - Mirrors Luigi task.outputs() - - Returns: - (processing_name, uuid) - """ - pass - - @abstractmethod - def input_bundles(self): - """ - Given this pipe, return the set of bundles created by the input pipes. - Mirrors Luigi task.inputs() - - :param pipe_task: A PipeTask or a DriverTask (both implement PipeBase) - Returns: - [(processing_name, uuid), ... ] - """ - pass - - @abstractmethod - def processing_id(self): - """ - Given a pipe instance, return a unique string based on the class name and - the parameters. - - Bundle Tag: Used to fill in bundle.processing_name - """ - pass - - @abstractmethod - def human_id(self): - """ - This is a "less unique" id than the unique id. It is supposed to be the "human readable" name of the stage - this pipe occupies in the pipesline. - - Bundle Tag: Used to fill in bundle.bundle_name - """ - pass - - @staticmethod - def _interpret_scheme(full_path): - scheme = urllib.parse.urlparse(full_path).scheme - - if scheme == '' or scheme == 'file': - ''' LOCAL FILE ''' - return luigi.LocalTarget(full_path) - elif scheme == 's3': - ''' S3 FILE ''' - return S3Target(full_path) - - assert False - - @staticmethod - def filename_to_luigi_targets(output_dir, output_value): - """ - Create Luigi file objects from a file name, dictionary of file names, or list of file names. - - Return the same object type as output_value, but with Luigi.Targets instead. - - Args: - output_dir (str): Managed output path. - output_value (str, dict, list): A basename, dictionary of basenames, or list of basenames. - - Return: - (`luigi.LocalTarget`, `luigi.contrib.s3.S3Target`): Singleton, list, or dictionary of Luigi Target objects. - """ - - if isinstance(output_value, list) or isinstance(output_value, tuple): - luigi_outputs = [] - for i in output_value: - full_path = os.path.join(output_dir, i) - luigi_outputs.append(PipeBase._interpret_scheme(full_path)) - if len(luigi_outputs) == 1: - luigi_outputs = luigi_outputs[0] - elif isinstance(output_value, dict): - luigi_outputs = {} - for k, v in output_value.items(): - full_path = os.path.join(output_dir, v) - luigi_outputs[k] = PipeBase._interpret_scheme(full_path) - else: - full_path = os.path.join(output_dir, output_value) - luigi_outputs = PipeBase._interpret_scheme(full_path) - - return luigi_outputs - - @staticmethod - def rm_bundle_dir(output_path, uuid): - """ - We created a directory (managed path) to hold the bundle and any files. The files have been - copied in. Removing the directory removes any created files. If the user has told us about - any DBTargets, also call rm() on those. 
- - TODO: Integrate with data_context bundle remove. That deals with information already - stored in the local DB. - - ASSUMES: That we haven't actually updated the local DB with information on this bundle. - - Args: - output_path (str): - uuid (str): - db_targets (list(DBTarget)): - - Returns: - None - """ - try: - shutil.rmtree(output_path, ignore_errors=True) - os.rmdir(output_path) - # TODO: if people create s3 files, s3 file targets, inside of an s3 context, - # TODO: then we will have to clean those up as well. - except IOError as why: - _logger.error("Removal of hyperframe directory {} failed with error {}. Continuing removal...".format( - uuid, why)) - - @staticmethod - def parse_return_val(hfid, val, data_context): - """ - Interpret the return values and create an HFrame to wrap them. - This means setting the correct presentation bit in the HFrame so that - we call downstream tasks with parameters as the author intended. - - POLICY / NOTE: An non-HF output is a Presentable. - NOTE: For now, a task output is *always* presentable. - NOTE: No other code should set presentation in a HyperFrame. - - The mirror to this function (that unpacks a presentable is disdat.fs.present_hfr() - - Args: - hfid (str): UUID - val (object): A scalar, dict, tuple, list, dataframe - data_context (DataContext): The data context into which to place this value - - Returns: - (presentation, frames[]) - - """ - - possible_scalar_types = ( - int, - float, - str, - bool, - np.bool_, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - np.float16, - np.float32, - np.float64, - six.binary_type, - six.text_type, - np.unicode_, - np.string_ - ) - - frames = [] - - if val is None: - """ None's stored as json.dumps([None]) or '[null]' """ - presentation = hyperframe_pb2.JSON - frames.append(data_context.convert_scalar2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) - - elif isinstance(val, HyperFrameRecord): - presentation = hyperframe_pb2.HF - frames.append(FrameRecord.make_hframe_frame(hfid, common.DEFAULT_FRAME_NAME + ':0', [val])) - - elif isinstance(val, np.ndarray) or isinstance(val, list): - presentation = hyperframe_pb2.TENSOR - if isinstance(val, list): - val = np.array(val) - frames.append(data_context.convert_serieslike2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) - - elif isinstance(val, tuple): - presentation = hyperframe_pb2.ROW - val = np.array(val) - frames.append(data_context.convert_serieslike2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) - - elif isinstance(val, dict): - presentation = hyperframe_pb2.ROW - for k, v in val.items(): - if not isinstance(v, (list, tuple, pd.core.series.Series, np.ndarray, collections.Sequence)): - # assuming this is a scalar - assert isinstance(v, possible_scalar_types), 'Disdat requires dictionary values to be one of {} not {}'.format(possible_scalar_types, type(v)) - frames.append(data_context.convert_scalar2frame(hfid, k, v)) - else: - assert isinstance(v, (list, tuple, pd.core.series.Series, np.ndarray, collections.Sequence)) - frames.append(data_context.convert_serieslike2frame(hfid, k, v)) - - elif isinstance(val, pd.DataFrame): - presentation = hyperframe_pb2.DF - frames.extend(data_context.convert_df2frames(hfid, val)) - - else: - presentation = hyperframe_pb2.SCALAR - frames.append(data_context.convert_scalar2frame(hfid, common.DEFAULT_FRAME_NAME + ':0', val)) - - return presentation, frames diff --git a/disdat/resource.py b/disdat/resource.py index f475457..c569485 100644 --- 
a/disdat/resource.py +++ b/disdat/resource.py @@ -1,3 +1,19 @@ +# +# Copyright Human Longevity, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + """ Resource ======== @@ -9,6 +25,7 @@ This allows for better refactoring. """ + import pkgutil import sys import os diff --git a/disdat/resources/__init__.py b/disdat/resources/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/disdat/resources/example-ecr-policy.json b/disdat/resources/example-ecr-policy.json deleted file mode 100644 index 043e58c..0000000 --- a/disdat/resources/example-ecr-policy.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "Version": "2008-10-17", - "Statement": [ - { - "Sid": "IMAccessRole", - "Effect": "Allow", - "Principal": { - "AWS": [ - "arn:aws:iam:: - "arn:aws:iam:: - ] - }, - "Action": [ - "ecr:GetDownloadUrlForLayer", - "ecr:BatchGetImage", - "ecr:BatchCheckLayerAvailability" - ] - } - ] -} \ No newline at end of file diff --git a/disdat/run.py b/disdat/run.py deleted file mode 100644 index cd2def3..0000000 --- a/disdat/run.py +++ /dev/null @@ -1,675 +0,0 @@ -# -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Run dockerized version of this pipe - -Run differs from apply. Apply will run the transform locally / natively. -Run executes the most recently built container. By default it will -first run it locally. Containers can be built for different backends. -Backends do things in different ways. - -1.) Entrypoint arguments. They might customize endpoints based on the arguments. E.g, -d or - -2.) Entrypoint long vs. short-lived. They might run a web-server in the container, e.g., to serve a model. 
- - -author: Kenneth Yocum -""" -from __future__ import print_function - -import argparse -import inspect -import os -import tempfile -import time -import json -from sys import platform - -import docker -import six -import boto3 as b3 -from enum import Enum - -import disdat.fs as fs -import disdat.common as common -import disdat.utility.aws_s3 as aws -from disdat.common import DisdatConfig -from disdat import logger as _logger - - -_MODULE_NAME = inspect.getmodulename(__file__) - - -ENTRYPOINT_BIN = 'dsdt_docker' - - -class Backend(Enum): - Local = 0 - AWSBatch = 1 - LocalSageMaker = 2 - SageMaker = 3 - - @staticmethod - def default(): - return Backend.Local.name - - @staticmethod - def options(): - return [i.name for i in list(Backend)] - - -def _run_local(cli, pipeline_setup_file, arglist, backend): - """ - Run container locally or run sagemaker container locally - Args: - cli (bool): Whether we were called from the CLI or API - pipeline_setup_file (str): The FQ path to the setup.py used to dockerize the pipeline. - arglist: - backend: - - Returns: - output (str): Returns None if there is a failure - - """ - - on_macos = False - if platform == "darwin": - on_macos = True - - client = docker.from_env() - - environment = {} - if 'AWS_PROFILE' in os.environ: - environment['AWS_PROFILE'] = os.environ['AWS_PROFILE'] - - environment[common.LOCAL_EXECUTION] = 'True' - - # Todo: Local runs do not yet set resource limits, but when they do, we'll have to set this - #environment['DISDAT_CPU_COUNT'] = vcpus - - volumes = {} - aws_config_dir = os.getenv('AWS_CONFIG_DIR', os.path.join(os.environ['HOME'], '.aws')) - if aws_config_dir is not None and os.path.exists(aws_config_dir): - volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'} - - local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir() - volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'} - - try: - if backend == Backend.LocalSageMaker: - pipeline_image_name = common.make_sagemaker_project_image_name(pipeline_setup_file) - tempdir = tempfile.mkdtemp() - with open(os.path.join(tempdir, 'hyperparameters.json'), 'w') as of: - json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of) - args = ['train'] # rewrite to just 'train' - # On mac OS, tempdir returns /var, but is actually /private/var - # Add /private since it that dir is shared (and not /var) with Docker. 
- if on_macos: - localdir = os.path.join('/private', tempdir[1:]) - else: - localdir = tempdir - volumes[localdir] = {'bind': '/opt/ml/input/config/', 'mode': 'rw'} - _logger.info("VOLUMES: {}".format(volumes)) - else: - # Add the actual command to the arglist (for non-sagemaker runs) - arglist = [ENTRYPOINT_BIN] + arglist - pipeline_image_name = common.make_project_image_name(pipeline_setup_file) - - _logger.debug('Running image {} with arguments {}'.format(pipeline_image_name, arglist)) - - stdout = client.containers.run(pipeline_image_name, arglist, detach=False, - environment=environment, init=True, stderr=True, volumes=volumes) - stdout = six.ensure_str(stdout) - if cli: print(stdout) - return stdout - except docker.errors.ContainerError as ce: - _logger.error("Internal error running image {}".format(pipeline_image_name)) - _logger.error("Error: {}".format(six.ensure_str(ce.stderr))) - return six.ensure_str(ce) - except docker.errors.ImageNotFound: - _logger.error("Unable to find the docker image {}".format(pipeline_image_name)) - return None - - -def get_fq_docker_repo_name(is_sagemaker, pipeline_setup_file): - """ - Produce the fully qualified docker repo name. - - Args: - is_sagemaker (bool): for sagemaker image - pipeline_setup_file (str): the path to the setup.py file used to dockerize this pipeline - - Returns: - (str): The fully qualified docker image repository name - """ - disdat_config = DisdatConfig.instance() - - repository_prefix = None - if disdat_config.parser.has_option('docker', 'repository_prefix'): - repository_prefix = disdat_config.parser.get('docker', 'repository_prefix') - if is_sagemaker: - repository_name = common.make_sagemaker_project_repository_name(repository_prefix, pipeline_setup_file) - else: - repository_name = common.make_project_repository_name(repository_prefix, pipeline_setup_file) - - # Figure out the fully-qualified repository name, i.e., the name - # including the registry. - registry_name = disdat_config.parser.get('docker', 'registry').strip('/') - if registry_name == '*ECR*': - fq_repository_name = aws.ecr_get_fq_repository_name(repository_name) - else: - fq_repository_name = '{}/{}'.format(registry_name, repository_name) - - return fq_repository_name - - -def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name, - aws_session_token_duration, vcpus, memory, - no_submit, job_role_arn): - """ - Run job on AWS Batch. Sends to queue configured in disdat.cfg. - This assumes that you have already created a cluster that will run the jobs - that have been assigned to that queue. - - Args: - arglist: - fq_repository_name (str): The fully qualified docker repository name - job_name: - pipeline_image_name: - aws_session_token_duration: - vcpus: - memory: - no_submit (bool): default False - job_role_arn (str): Can be None - - Returns: - - """ - - def check_role_arn(job_dict, jra): - """ Check to see if the job desc dictionary contains the same job_role_arn (jra) - """ - - if jra is None: - if 'jobRoleArn' not in job_dict['containerProperties']: - return True - else: - if 'jobRoleArn' in job_dict['containerProperties']: - if job_dict['containerProperties']['jobRoleArn'] == jra: - return True - return False - - disdat_config = DisdatConfig.instance() - - # Get the parameter values required to kick off an AWS Batch job. - # Every batch job must: - # 1. Have a name - # 2. Have a job definition that declares which ECR-hosted Docker - # image to use. - # 3. Have a queue that feeds jobs into a compute cluster. - # 4. 
The command to execute inside the Docker image; the command - # args are more-or-less the same as the ones used to execute - # locally using 'dsdt run' - - # Create a Job Definition and upload it. - # We create per-user job definitions so multiple users do not clobber each other. - # In addition, we never re-use a job definition, since the user may update - # the vcpu or memory requirements and those are stuck in the job definition - - job_definition_name = aws.batch_get_job_definition_name(pipeline_image_name) - - if disdat_config.parser.has_option(_MODULE_NAME, 'aws_batch_job_definition'): - job_definition_name = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_job_definition') - - # TODO: Look through all of history to find one that matches? - # TODO: Delete old jobs here or let user do it? - job_definition_obj = aws.batch_get_latest_job_definition(job_definition_name) - - if (job_definition_obj is not None and - job_definition_obj['containerProperties']['image'] == fq_repository_name and - job_definition_obj['containerProperties']['vcpus'] == vcpus and - job_definition_obj['containerProperties']['memory'] == memory and - check_role_arn(job_definition_obj, job_role_arn)): - - job_definition_fqn = aws.batch_extract_job_definition_fqn(job_definition_obj) - - _logger.info("Re-using prior AWS Batch run job definition : {}".format(job_definition_obj)) - - else: - """ Whether None or doesn't match, make a new one """ - - job_definition_obj = aws.batch_register_job_definition(job_definition_name, fq_repository_name, - vcpus=vcpus, memory=memory, job_role_arn=job_role_arn) - - job_definition_fqn = aws.batch_get_job_definition(job_definition_name) - - _logger.info("New AWS Batch run job definition {}".format(job_definition_fqn)) - - if no_submit: - # Return the job description object - return job_definition_obj - - job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue') - - container_overrides = {'command': arglist} - - # Through the magic of boto3_session_cache, the client in our script - # here can get at AWS profiles and temporary AWS tokens created in - # part from MFA tokens generated through the user's shells; we don't - # have to write special code of our own to deal with authenticating - # with AWS. - client = b3.client('batch', region_name=aws.profile_get_region()) - # A bigger problem might be that the IAM role executing the job on - # a batch EC2 instance might not have access to the S3 remote. To - # get around this, allow the user to create some temporary AWS - # credentials. 
- - if aws_session_token_duration > 0 and job_role_arn is None: - sts_client = b3.client('sts') - try: - token = sts_client.get_session_token(DurationSeconds=aws_session_token_duration) - credentials = token['Credentials'] - container_overrides['environment'] = [ - {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']}, - {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']}, - {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']} - ] - except Exception as e: - _logger.debug("Unable to generate an STS token, instead trying the user's default credentials...") - credentials = b3.session.Session().get_credentials() - container_overrides['environment'] = [ - {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials.access_key}, - {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials.secret_key}, - {'name': 'AWS_SESSION_TOKEN', 'value': credentials.token} - ] - - container_overrides['environment'].append({'name': 'DISDAT_CPU_COUNT', 'value': str(vcpus)}) - - job = client.submit_job(jobName=job_name, jobDefinition=job_definition_fqn, jobQueue=job_queue, - containerOverrides=container_overrides) - - status = job['ResponseMetadata']['HTTPStatusCode'] - if status == 200: - _logger.info('Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(job['jobName'], job['jobId'], - job_definition_fqn, job_queue)) - return job - else: - _logger.error('Job submission failed: HTTP Status {}'.format(status)) - return None - - -def _sagemaker_hyperparameters_from_arglist(arglist): - """ - Return a dictionary of str:str that emulates - what SageMaker will return when using the boto3 interface. - - Args: - arglist (list (str)): List of string arguments to entrypoint.py - - Returns: - (dict (str:str)): Dictionary of string to string - """ - - return {'arglist': json.dumps(arglist)} - - -def _run_aws_sagemaker(arglist, fq_repository_name, job_name): - """ - Runs a training job on AWS SageMaker. This uses the default machine type - in the disdat.cfg file.
- - Args: - arglist: - fq_repository_name (str): fully qualified repository name - job_name: instance job name - - Returns: - TrainingJobArn (str) - """ - - disdat_config = DisdatConfig.instance() - - job_name = job_name.replace('_', '-') # b/c SageMaker complains it must be ^[a-zA-Z0-9](-*[a-zA-Z0-9])* - - hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist) - - algorithm_specification = {'TrainingImage': fq_repository_name, - 'TrainingInputMode': 'File'} - - role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn') - - input_channel_config = [ - { - 'ChannelName': 'disdat_sagemaker_input_blackhole', - 'DataSource': { - 'S3DataSource': { - 'S3DataType': 'S3Prefix', - 'S3Uri': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_input_uri'), - 'S3DataDistributionType': 'FullyReplicated' - } - }, - 'ContentType': 'application/javascript', - 'CompressionType': 'None', # | 'Gzip', - 'RecordWrapperType': 'None' # | 'RecordIO' - }, - ] - - output_data_config = {'S3OutputPath': os.path.join( - disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_output_uri'), job_name)} - - resource_config = { - 'InstanceType': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_type'), - 'InstanceCount': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_count')), - 'VolumeSizeInGB': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_volume_sizeGB')) - # 'VolumeKmsKeyId': 'string' - } - - vpc_config = None #'SecurityGroupIds': [], 'Subnets': []} - - stopping_condition = {'MaxRuntimeInSeconds': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_max_runtime_sec'))} - - tags = [{'Key': 'user', 'Value': 'disdat'}, - {'Key': 'job', 'Value': job_name}] - - if False: - print("Disdat SageMaker configs") - print("job name: {}".format(job_name)) - print("hparams: {}".format(hyperparameter_dict)) - print("algorithm: {}".format(algorithm_specification)) - print("Role ARN: {}".format(role_arn)) - print("Input data conf: {}".format(input_channel_config)) - print("Output data conf: {}".format(output_data_config)) - print("Resource conf: {}".format(resource_config)) - print("VPC conf: {}".format(vpc_config)) - print("Stopping condition seconds: {}".format(stopping_condition)) - print("Tags: {}".format(tags)) - - client = b3.client('sagemaker', region_name=aws.profile_get_region()) - - response = client.create_training_job( - TrainingJobName=job_name, - HyperParameters= hyperparameter_dict, - AlgorithmSpecification=algorithm_specification, - RoleArn=role_arn, - InputDataConfig=input_channel_config, - OutputDataConfig=output_data_config, - ResourceConfig=resource_config, - StoppingCondition=stopping_condition, - Tags=tags - ) - - _logger.info("Disdat SageMaker create_training_job response {}".format(response)) - return response['TrainingJobArn'] - - -def _run( - output_bundle = '-', - pipeline_root = '', - pipeline_args = '', - pipe_cls = None, - backend = None, - input_tags = {}, - output_tags = {}, - force = False, - force_all = False, - context = None, - remote = None, - no_pull = False, - no_push = False, - no_push_int = False, - vcpus = 1, - memory = 2000, - workers = 1, - no_submit = False, - job_role_arn = None, - aws_session_token_duration = 0, - cli=False): - """Run the dockerized version of a pipeline. - - Note these are named parameters so we avoid bugs related to argument order. 
- - Args: - output_bundle (str): The human name of the output bundle - pipeline_root (str): The path to the setup.py used to create the container - pipeline_args: Optional arguments to pass to the pipeline class - pipe_cls: Name of the pipeline class to run - backend: The batch execution back-end to use (default - `Backend.Local`) - input_tags (list(str)): Find bundle with these tags ['key:value',...] - output_tags (list(str)): Push result bundle with these tags ['key:value',...] - force (bool): If `True` force recomputation of the last task (default `False`) - force_all (bool): If `True` force recomputation of all upstream tasks (default `False`) - context (str): / context string - remote (str): The remote S3 URL. - no_pull (bool): Do not pull before executing (start in empty local context) - no_push (bool): Do not push any new bundles to remote (useful for testing locally) - no_push_int (bool): Do not push new intermediate bundles to remote - vcpus (int): Number of AWS vCPUs the container requests - memory (int): Amount of memory the container requests in MB - workers (int): Number of Luigi workers used to run tasks in the DAG - no_submit (bool): Produce the AWS job config (for AWS Batch), but do not submit the job - job_role_arn (str): The AWS role under which the job should execute - aws_session_token_duration (int): The number of seconds our temporary credentials should last. - cli (bool): True if called from the CLI; False if called via the API (which buffers output) - - Returns: - job_result (json): A json blob that contains information about the run job; an empty dict on error. If backend - is SageMaker, return TrainingJobArn. If backend is AWSBatch, return the Batch job description. If local, return stdout. - """ - def assert_or_log(cli, msg): - if cli: - _logger.error(msg) - else: - assert False, msg - - pfs = fs.DisdatFS() - pipeline_setup_file = os.path.join(pipeline_root, 'setup.py') - - if not common.setup_exists(pipeline_setup_file): - return assert_or_log(cli, "Disdat run: Unable to find setup.py file [{}].".format(pipeline_setup_file)) - - # When run in a container, we create the uuid externally to look for a specific result - output_bundle_uuid = pfs.disdat_uuid() - - # If the user did not specify a context, use the configuration of the current context - if context is None: - if not pfs.in_context(): - return assert_or_log(cli, "Disdat run: Not running in a local context.
Switch or specify.") - remote, context = common.get_run_command_parameters(pfs) - - if remote is None and (not no_push or not no_pull): # if pulling or pushing, need a remote - return assert_or_log(cli, "Pushing or pulling bundles with 'run' requires a remote.") - - arglist = common.make_run_command(output_bundle, output_bundle_uuid, pipe_cls, remote, context, - input_tags, output_tags, force, force_all, no_pull, no_push, - no_push_int, workers, pipeline_args) - - if backend == Backend.AWSBatch or backend == Backend.SageMaker: - - pipeline_image_name = common.make_project_image_name(pipeline_setup_file) - - job_name = '{}-{}'.format(pipeline_image_name, int(time.time())) - - fq_repository_name = get_fq_docker_repo_name(False, pipeline_setup_file) - - if backend == Backend.AWSBatch: - - # Add the actual command to the arglist - arglist = [ENTRYPOINT_BIN] + arglist - - retval = _run_aws_batch(arglist, - fq_repository_name, - job_name, - pipeline_image_name, - aws_session_token_duration, - vcpus, - memory, - no_submit, - job_role_arn) - else: - - fq_repository_name = get_fq_docker_repo_name(True, pipeline_root) - - retval = _run_aws_sagemaker(arglist, - fq_repository_name, - job_name) - - elif backend == Backend.Local or backend == Backend.LocalSageMaker: - retval = _run_local(cli, pipeline_setup_file, arglist, backend) - - else: - raise ValueError('Got unrecognized job backend \'{}\': Expected {}'.format(backend, Backend.options())) - - return retval - - -def add_arg_parser(parsers): - run_p = parsers.add_parser('run', description="Run a containerized version of transform.") - run_p.add_argument( - '--backend', - default=Backend.default(), - type=str, - choices=Backend.options(), - help='An optional batch execution back-end to use', - ) - run_p.add_argument( - "-f", - "--force", - action='store_true', - help="Force recomputation of the last task." - ) - run_p.add_argument( - '--force-all', - action='store_true', - help="Force re-computation of ALL upstream tasks.") - run_p.add_argument( - "--no-submit", - action='store_true', - help="For AWS Batch: Do not submit job -- only create the Batch job description." - ) - run_p.add_argument( - '--no-push-intermediates', - action='store_true', - help='Do not push the intermediate bundles to the remote repository (default is to push)', - dest='no_push_int' - ) - run_p.add_argument( - '--pull', - action='store_true', - default=None, - help="Synchronize local repo and remote before execution. Default False if 'local' backend, else default is True." - ) - run_p.add_argument( - '--push', - action='store_true', - default=None, - help="Push bundles to remote context. Default False if 'local' backend, else default is True." - ) - run_p.add_argument( - '--use-aws-session-token', - default=43200, # 12 hours of default time -- for long pipelines! - type=int, - help='For AWS Batch: Use temporary AWS session token, valid for AWS_SESSION_TOKEN_DURATION seconds. Default 43200. Set to zero to not use a token.', - dest='aws_session_token_duration', - ) - run_p.add_argument('--workers', - type=int, - default=2, - help="The number of Luigi workers to spawn. 
Default is 2.") - run_p.add_argument('--vcpus', - type=int, - default=2, - help="The vCPU count for an AWS Batch container.") - run_p.add_argument('--memory', - type=int, - default=4000, - help="The memory (MiB) required by this AWS Batch container.") - run_p.add_argument('--job-role-arn', - type=str, - default=None, - help="For AWS Batch: Use this ARN to indicate the role under which batch containers run.") - run_p.add_argument('-c', '--context', - type=str, - default=None, - help="'/', else use current context.") - run_p.add_argument('-r', '--remote', - type=str, - default=None, - help="Remote URL, i.e, 's3:///dsdt/', else use remote on current context") - run_p.add_argument('-it', '--input-tag', nargs=1, type=str, action='append', - help="Input bundle tags: '-it authoritative:True -it version:0.7.1'", - dest='input_tags') - run_p.add_argument('-ot', '--output-tag', nargs=1, type=str, action='append', - help="Output bundle tags: '-ot authoritative:True -ot version:0.7.1'", - dest='output_tags') - run_p.add_argument('-o', '--output-bundle', type=str, default='-', - help="Name output bundle: '-o my.output.bundle'. Default name is '_'") - run_p.add_argument("pipeline_root", type=str, help="Root of the Python source tree containing the user-defined transform; must have a setuptools-style setup.py file") - run_p.add_argument("pipe_cls", type=str, help="User-defined transform, e.g., module.PipeClass") - run_p.add_argument("pipeline_args", type=str, nargs=argparse.REMAINDER, - help="Optional set of parameters for this pipe '--parameter value'") - run_p.set_defaults(func=lambda args: run_entry(cli=True, **vars(args))) - return parsers - - -def run_entry(cli=False, **kwargs): - """ - Handles parameter defaults for calling _run() - - From the CLI, the parseargs object has all the arguments - From the API, the arguments are explicitly set - - Note: pipeline_args is an array of args: ['name',json.dumps(value),'name2',json.dumps(value2)] - - Args: - kwargs (dict): - - Returns: - - """ - if kwargs['backend'] is not None: - kwargs['backend'] = Backend[kwargs['backend']] - else: - kwargs['backend'] = Backend[Backend.default()] - - # CLI and API only set push or pull. Translate to no-push no-pull in original run code. - # If backend == 'Local' then don't pull or push by default - # TODO: change run() semantics to push pull, not no_push, no_pull. - if kwargs['push'] is not None: - kwargs['no_push'] = not kwargs['push'] - else: - kwargs['no_push'] = True if kwargs['backend'] == Backend.Local else False - - if kwargs['pull'] is not None: - kwargs['no_pull'] = not kwargs['pull'] - else: - kwargs['no_pull'] = True if kwargs['backend'] == Backend.Local else False - - # Ensure kwargs only contains the arguments we want when calling _run - remove_keys = [] - for k in kwargs.keys(): - if k not in _run.__code__.co_varnames: - remove_keys.append(k) - - for k in remove_keys: - kwargs.pop(k) - - kwargs['input_tags'] = common.parse_args_tags(kwargs['input_tags'], to='list') - kwargs['output_tags'] = common.parse_args_tags(kwargs['output_tags'], to='list') - kwargs['cli'] = cli - - return _run(**kwargs) diff --git a/disdat/utility/__init__.py b/disdat/utility/__init__.py index c5de4e9..e69de29 100644 --- a/disdat/utility/__init__.py +++ b/disdat/utility/__init__.py @@ -1,7 +0,0 @@ -''' -Collection of utility pipes for supporting more complex pipelines. - -@author: twong / kyocum -@copyright: Human Longevity, Inc. 
2017 -@license: Apache 2.0 -''' diff --git a/disdat/utility/aws_s3.py b/disdat/utility/aws_s3.py index 939636c..1b4fcf4 100644 --- a/disdat/utility/aws_s3.py +++ b/disdat/utility/aws_s3.py @@ -1,6 +1,4 @@ # -# Copyright 2015, 2016, 2017 Human Longevity, Inc. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,19 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" -Utilities for accessing AWS using boto3 - -@author: twong / kyocum -@copyright: Human Longevity, Inc. 2017 -@license: Apache 2.0 -""" -import base64 import os -import pkg_resources -from getpass import getuser -import six from multiprocessing import get_context from botocore.exceptions import ClientError @@ -87,185 +74,6 @@ def disdat_cpu_count(): return int(env_cpu_count) -def batch_get_job_definition_name(pipeline_image_name): - """Get the most recent active AWS Batch job definition for a dockerized - pipeline. - - Note: The Python getpass docs do not specify the exception thrown when getting the user name fails. - - """ - - try: - return '{}-{}-job-definition'.format(getuser(), pipeline_image_name) - except Exception as e: - return '{}-{}-job-definition'.format('DEFAULT', pipeline_image_name) - - -def batch_get_latest_job_definition(job_definition_name): - """Get the most recent active revision number for a AWS Batch job - definition - - Args: - job_definition_name: The name of the job definition - remote_pipeline_image_name: - vcpus: - memory: - - Return: - The latest job definition dictionary or `None` if the job definition does not exist - """ - region = profile_get_region() - client = b3.client('batch', region_name=region) - response = client.describe_job_definitions(jobDefinitionName=job_definition_name, status='ACTIVE') - if response['ResponseMetadata']['HTTPStatusCode'] != 200: - raise RuntimeError( - 'Failed to get job definition revisions for {}: HTTP Status {}'.format(job_definition_name, response['ResponseMetadata']['HTTPStatusCode']) - ) - job_definitions = response['jobDefinitions'] - revision = 0 - job_def = None - for j in job_definitions: - if j['jobDefinitionName'] != job_definition_name: - continue - if j['revision'] > revision: - revision = j['revision'] - job_def = j - - return job_def - - -def batch_extract_job_definition_fqn(job_def): - revision = job_def['revision'] - name = job_def['jobDefinitionName'] - return '{}:{}'.format(name, revision) - - -def batch_get_job_definition(job_definition_name): - """Get the most recent active revision number for a AWS Batch job - definition - - Args: - job_definition_name: The name of the job definition - - Return: - The fully-qualified job definition name with revision number, or - `None` if the job definition does not exist - """ - job_def = batch_get_latest_job_definition(job_definition_name) - - if job_def is None: - return None - else: - return batch_extract_job_definition_fqn(job_def) - - -def batch_register_job_definition(job_definition_name, remote_pipeline_image_name, - vcpus=1, memory=2000, job_role_arn=None): - """Register a new AWS Batch job definition. 
- - Args: - job_definition_name: The name of the job definition - remote_pipeline_image_name: The ECR Docker image to load to run jobs - using this definition - vcpus: The number of vCPUs to use to run jobs using this definition - memory: The amount of memory in MiB to use to run jobs using this - definition - job_role_arn (str): Can be None - """ - - container_properties = { - 'image': remote_pipeline_image_name, - 'vcpus': vcpus, - 'memory': memory, - } - - if job_role_arn is not None: - container_properties['jobRoleArn'] = job_role_arn - - region = profile_get_region() - client = b3.client('batch', region_name=region) - response = client.register_job_definition( - jobDefinitionName=job_definition_name, - type='container', - containerProperties=container_properties - ) - if response['ResponseMetadata']['HTTPStatusCode'] != 200: - raise RuntimeError('Failed to create job definition {}: HTTP Status {}'.format(job_definition_name, response['ResponseMetadata']['HTTPStatusCode'])) - - return response - - -def ecr_create_fq_respository_name(repository_name, policy_resource_package=None, policy_resource_name=None): - ecr_client = b3.client('ecr', region_name=profile_get_region()) - # Create or fetch the repository in AWS (to store the image) - try: - response = ecr_client.create_repository( - repositoryName=repository_name - ) - repository_metadata = response['repository'] - # Set the policy on the repository - if policy_resource_package is not None and policy_resource_name is not None: - policy = pkg_resources.resource_string(policy_resource_package.__name__, policy_resource_name) - _ = ecr_client.set_repository_policy( - registryId=repository_metadata['registryId'], - repositoryName=repository_name, - policyText=policy, - force=True - ) - except ClientError as e: - if e.response['Error']['Code'] == 'RepositoryAlreadyExistsException': - response = ecr_client.describe_repositories( - repositoryNames=[repository_name] - ) - repository_metadata = response['repositories'][0] - elif e.response['Error']['Code'] == 'AccessDeniedException': - _logger.warn("Error [AccessDeniedException] when creating repo {}, trying to continue...".format(repository_name)) - else: - raise e - return repository_metadata['repositoryUri'] - - -def ecr_get_fq_repository_name(repository_name): - return ecr_create_fq_respository_name(repository_name) - - -def ecr_get_auth_config(): - ecr_client = b3.client('ecr', region_name=profile_get_region()) - # Authorize docker to push to ECR - response = ecr_client.get_authorization_token() - if response['ResponseMetadata']['HTTPStatusCode'] != 200: - raise RuntimeError('Failed to get AWS ECR authorization token: HTTP Status {}'.format(response['ResponseMetadata']['HTTPStatusCode'])) - token = response['authorizationData'][0]['authorizationToken'] - - token_bytes = six.b(token) - - token_decoded_bytes = base64.b64decode(token_bytes) - - token_decoded_str = token_decoded_bytes.decode('utf8') - - username, password = token_decoded_str.split(':') - - return {'username': username, 'password': password} - - -def profile_get_region(): - """Gets the AWS region for the current AWS profile. If AWS_DEFAULT_REGION is set in env will just default to use - that. 
- """ - - # ENV variables take precedence over the region in the ~/.aws/ folder - if 'AWS_DEFAULT_REGION' in os.environ: - region = os.environ['AWS_DEFAULT_REGION'] - else: - session = b3.session.Session() - profile = session.profile_name - region = session.region_name - if 'AWS_PROFILE' in os.environ: - assert os.environ['AWS_PROFILE'] == profile, "Boto session profile != env AWS_PROFILE" - - return region - - def s3_path_exists(s3_url): """ Given an entire path, does the key exist? diff --git a/disdat/utility/bundle_helpers.py b/disdat/utility/bundle_helpers.py new file mode 100644 index 0000000..bc5ea6c --- /dev/null +++ b/disdat/utility/bundle_helpers.py @@ -0,0 +1,51 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import disdat.common as common + +def different_code_versions(code_version, lineage_obj): + """ + Given the current code version, see if it differs from the version recorded in the lineage object. + Note: if the current code version is dirty, we are forced to say they are different. + + Typically we get the code_version from the pipe and the lineage object from the + bundle. We then check whether the current code matches the information in the lineage object. + + Args: + code_version (CodeVersion): the current code version (from the pipe) + lineage_obj (LineageObject): the lineage record stored in the bundle + + Returns: + (bool): True if the code versions differ (so the task should re-run), False otherwise + + """ + + conf = common.DisdatConfig.instance() + + if conf.ignore_code_version: + return False + + # If there were uncommitted changes, then we have to re-run, mark as different + if code_version.dirty: + return True + + if code_version.semver != lineage_obj.pb.code_semver: + return True + + if code_version.hash != lineage_obj.pb.code_hash: + return True + + ## Currently ignoring tstamp, branch, url + ## CodeVersion = collections.namedtuple('CodeVersion', 'semver hash tstamp branch url dirty') + + return False diff --git a/disdat/utility/which.py b/disdat/utility/which.py deleted file mode 100644 index d9021ad..0000000 --- a/disdat/utility/which.py +++ /dev/null @@ -1,23 +0,0 @@ -''' -Created on Sep 12, 2017 - -@author: twong -''' - -import os - - -def which(cmd_name): - '''Get the full path to an external command executable.
- - :param cmd_name: The command name - :return: The full path to the command executable, or `None` if the - executable is not on the O/S path - :rtype: str - ''' - paths = os.environ['PATH'].split(os.pathsep) - for p in paths: - cmd_fq_name = os.path.join(p, cmd_name) - if os.path.exists(cmd_fq_name) and os.access(cmd_fq_name, os.X_OK): - return cmd_fq_name - return None diff --git a/disdat/infrastructure/.gitignore b/infrastructure/.gitignore similarity index 100% rename from disdat/infrastructure/.gitignore rename to infrastructure/.gitignore diff --git a/disdat/infrastructure/Dockerfiles/Makefile b/infrastructure/Dockerfiles/Makefile similarity index 100% rename from disdat/infrastructure/Dockerfiles/Makefile rename to infrastructure/Dockerfiles/Makefile diff --git a/disdat/infrastructure/Dockerfiles/hyperframe_def/Dockerfile b/infrastructure/Dockerfiles/hyperframe_def/Dockerfile old mode 100755 new mode 100644 similarity index 100% rename from disdat/infrastructure/Dockerfiles/hyperframe_def/Dockerfile rename to infrastructure/Dockerfiles/hyperframe_def/Dockerfile diff --git a/disdat/infrastructure/Dockerfiles/hyperframe_def/Makefile b/infrastructure/Dockerfiles/hyperframe_def/Makefile old mode 100755 new mode 100644 similarity index 100% rename from disdat/infrastructure/Dockerfiles/hyperframe_def/Makefile rename to infrastructure/Dockerfiles/hyperframe_def/Makefile diff --git a/disdat/infrastructure/Dockerfiles/hyperframe_def/README.md b/infrastructure/Dockerfiles/hyperframe_def/README.md old mode 100755 new mode 100644 similarity index 100% rename from disdat/infrastructure/Dockerfiles/hyperframe_def/README.md rename to infrastructure/Dockerfiles/hyperframe_def/README.md diff --git a/disdat/infrastructure/Dockerfiles/hyperframe_def/bundle.proto b/infrastructure/Dockerfiles/hyperframe_def/bundle.proto old mode 100755 new mode 100644 similarity index 100% rename from disdat/infrastructure/Dockerfiles/hyperframe_def/bundle.proto rename to infrastructure/Dockerfiles/hyperframe_def/bundle.proto diff --git a/disdat/infrastructure/Dockerfiles/hyperframe_def/hyperframe.proto b/infrastructure/Dockerfiles/hyperframe_def/hyperframe.proto similarity index 100% rename from disdat/infrastructure/Dockerfiles/hyperframe_def/hyperframe.proto rename to infrastructure/Dockerfiles/hyperframe_def/hyperframe.proto diff --git a/setup.py b/setup.py index ff27fba..b8c6fa9 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ setup_requires=['setuptools_scm'], name='disdat', - description='DisDat: versioned data science', + description='Disdat: data versioning', author='Ken Yocum', author_email='kyocum@gmail.com', url='https://github.com/kyocum/disdat', @@ -47,7 +47,6 @@ # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.6', 'Operating System :: OS Independent', 'Natural Language :: English', @@ -55,8 +54,7 @@ # You can just specify the packages manually here if your project is # simple. Or you can use find_packages(). 
- packages=find_packages(exclude=['tests*', - 'infrastructure.tests*']), + packages=find_packages(exclude=['tests*']), # Include non-python files found in each package in the install, if in your MANIFEST.in include_package_data=True, @@ -70,19 +68,11 @@ 'disdat': [ 'config/disdat/*', 'VERSION', - ], - 'infrastructure': [ - 'Dockerfiles/hyperframe_def/*' - 'dockerizer/Makefile', - 'dockerizer/Dockerfiles/*', - 'dockerizer/kickstart/bin/*', - 'dockerizer/kickstart/etc/*', - ], + ] }, exclude_package_data={ 'disdat': [ - 'dockerizer/kickstart/bin/*.pyc', ] }, @@ -93,10 +83,8 @@ # If <= means higher versions broke something. install_requires=[ - 'luigi>=3.0,<=3.1', 'boto3>=1.14.49,<2.0', 'termcolor>=1.1.0,<2.0', - 'docker>=4.1.0,<4.4.0', 'pandas>=0.25.3,<=1.2.0', 'numpy>=1.18.1,<=1.21.1', 'sqlalchemy>=1.3.13,<1.4', @@ -128,10 +116,6 @@ entry_points={ 'console_scripts': [ 'dsdt = disdat.entrypoints.cli_ep:main', - 'dsdt_docker = disdat.entrypoints.docker_ep:main' - ], - 'distutils.commands': [ - "dsdt_distname = disdat.infrastructure.dockerizer.setup_tools_commands:DistributionName", ] }, ) diff --git a/tests/bundles/test_arg_capture.py b/tests/bundles/test_arg_capture.py index 61fb5a2..4821afe 100644 --- a/tests/bundles/test_arg_capture.py +++ b/tests/bundles/test_arg_capture.py @@ -14,23 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from tests.functional.common import run_test, TEST_CONTEXT - -from disdat.pipe import PipeTask -import disdat.api as api - -import datetime import pytest -import luigi import json +import disdat.api as api + +from tests.functional.common import run_test, TEST_CONTEXT -test_luigi_args_data = {'str_arg': 'some string', - 'int_arg': 10, - 'list_arg': [1,3,5], - 'list_str_arg': ['farg','barg'], - 'dict_float_arg': {'farg': 0.01, 'barg': 3.14}, - 'date_arg': datetime.date(2020,4,1)} test_json_args_data = {'str_arg': 'some string', 'int_arg': 10, @@ -41,34 +31,6 @@ serialized_json_args = {k: json.dumps(v) for k, v in test_json_args_data.items()} -class ArgTask(PipeTask): - str_arg = luigi.Parameter(default=None) - int_arg = luigi.IntParameter(default=None) - list_arg = luigi.ListParameter(default=None) - list_str_arg = luigi.ListParameter(default=None) - dict_float_arg = luigi.DictParameter(default=None) - date_arg = luigi.DateParameter(default=None) - - def pipe_run(self): - return True - - -def test_luigi_args(run_test): - """ Create a task, store args, retrieve from bundle api. - Pass in python objects as the values for Luigi parameters. - Stored as serialized json objects. Bundle presents the parameters - as the serialized objects (Disdat isn't aware they were Luigi serialized). - """ - - api.apply(TEST_CONTEXT, ArgTask, output_bundle='output', params=test_luigi_args_data) - b = api.get(TEST_CONTEXT, 'output') - found_p = {} - for k, p in b.params.items(): - attribute = getattr(ArgTask, k) - found_p[k] = attribute.parse(p) - assert(found_p == test_luigi_args_data) - - def test_args_bundle(): """ Create bundle, store args. 
""" diff --git a/tests/bundles/test_file_bundle_api.py b/tests/bundles/test_file_bundle_api.py index de2d960..e4dad05 100644 --- a/tests/bundles/test_file_bundle_api.py +++ b/tests/bundles/test_file_bundle_api.py @@ -29,14 +29,6 @@ TEST_BUCKET = 'test-bucket' TEST_BUCKET_URL = "s3://{}".format(TEST_BUCKET) - -# Setup moto s3 resources - -# Make sure bucket is empty -#objects = s3_client.list_objects(Bucket=TEST_BUCKET) -#assert 'Contents' not in objects, 'Bucket should be empty' - - def md5_file(fname): hash_md5 = hashlib.md5() with open(fname, "rb") as f: @@ -92,7 +84,7 @@ def test_copy_in_s3_file(run_test): The file should be copied into the local context """ - s3_resource = boto3.resource('s3') + s3_resource = boto3.resource('s3', region_name="us-east-1") s3_resource.create_bucket(Bucket=TEST_BUCKET) # Copy a local file to moto s3 bucket @@ -119,7 +111,7 @@ def test_copy_in_s3_file_with_remote(run_test): The file should be copied into the remote context """ - s3_resource = boto3.resource('s3') + s3_resource = boto3.resource('s3', region_name="us-east-1") s3_resource.create_bucket(Bucket=TEST_BUCKET) api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) @@ -142,7 +134,7 @@ def test_copy_in_s3_file_with_remote(run_test): @moto.mock_s3 def test_zero_copy_s3_file(run_test): """ Test managed path in local file """ - s3_resource = boto3.resource('s3') + s3_resource = boto3.resource('s3', region_name="us-east-1") s3_resource.create_bucket(Bucket=TEST_BUCKET) api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) diff --git a/tests/bundles/test_hframe.py b/tests/bundles/test_hframe.py index 75ecd2b..5c9a5d9 100644 --- a/tests/bundles/test_hframe.py +++ b/tests/bundles/test_hframe.py @@ -13,6 +13,7 @@ import tempfile import uuid import numpy as np +import pytest def _make_linkauth_records(): @@ -365,5 +366,7 @@ def test_link_rw_db(): assert (s3_hash == s3_hash2) +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/functional/common_tasks.py b/tests/functional/common_tasks.py deleted file mode 100644 index 2a437b9..0000000 --- a/tests/functional/common_tasks.py +++ /dev/null @@ -1,28 +0,0 @@ -from disdat.pipe import PipeTask -import luigi - -""" These are simple tasks used for test_api_run """ - -COMMON_DEFAULT_ARGS=[10, 100, 1000] - - -class B(PipeTask): - """ B required by A """ - int_array = luigi.ListParameter(default=None) - - def pipe_run(self): - print ("B saving type [{}]".format(type(self.int_array))) - return self.int_array - - -class A(PipeTask): - """ A is the root task""" - int_array = luigi.ListParameter(default=COMMON_DEFAULT_ARGS) - - def pipe_requires(self): - self.add_dependency('b', B, {'int_array': self.int_array}) - - def pipe_run(self, b=None): - print ("Saving the sum of B {}".format(b)) - print ("A got type [{}]".format(type(b))) - return sum(list(b)) diff --git a/tests/functional/test_add.py b/tests/functional/test_add.py index 77770d7..0fa7e04 100644 --- a/tests/functional/test_add.py +++ b/tests/functional/test_add.py @@ -213,111 +213,6 @@ def test_add_directory(tmpdir): api.delete_context(TEST_CONTEXT) -@moto.mock_s3 -def deprecated_add_with_treat_as_bundle(tmpdir): - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - - local_paths = [] - 
s3_paths = [] - - # Create and upload test.csv file - key = 'test.csv' - test_csv_path = os.path.join(str(tmpdir), key) - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) - df.to_csv(test_csv_path) - - s3_resource.meta.client.upload_file(test_csv_path, TEST_BUCKET, key) - s3_path = "s3://{}/{}".format(TEST_BUCKET, key) - - local_paths.append(test_csv_path) - s3_paths.append(s3_path) - - # Create and uploadt test.txt file - key = 'text.txt' - test_txt_path = os.path.join(str(tmpdir), key) - with open(test_txt_path, 'w') as f: - f.write('Test') - - s3_resource.meta.client.upload_file(test_txt_path, TEST_BUCKET, key) - s3_path = "s3://{}/{}".format(TEST_BUCKET, key) - - local_paths.append(test_txt_path) - s3_paths.append(s3_path) - - bool_values = [True, False] - string_values = ['a', 'b'] - float_values = [1.3, 3.5] - int_values = [4, 5] - - # Build bundle dataframe - bundle_df = pd.DataFrame({ - 'local_paths': local_paths, - 's3_paths': s3_paths, - 'bools': bool_values, - 'strings': string_values, - 'floats': float_values, - 'ints': int_values - }) - - bundle_df_path = os.path.join(str(tmpdir), 'bundle.csv') - bundle_df.to_csv(bundle_df_path) - - # These are now deprecated - # Add bundle dataframe - - api.add(TEST_CONTEXT, 'test_add_bundle', bundle_df_path, treat_file_as_bundle=True) - - # Assert that data in bundle is a dataframe - b = api.get(TEST_CONTEXT, 'test_add_bundle') - assert(isinstance(b.data, pd.DataFrame)) - - # Add bundle dataframe with tags - tag = {'test': 'tag'} - api.add(TEST_CONTEXT, 'test_add_bundle', bundle_df_path, treat_file_as_bundle=True, tags=tag) - - # Assert that data in bundle is a dataframe - b = api.get(TEST_CONTEXT, 'test_add_bundle') - assert(isinstance(b.data, pd.DataFrame)) - assert b.tags == tag, 'Tags do not match' - - api.delete_context(TEST_CONTEXT) - - -def deprecated_data_as_bundle_not_csv(tmpdir): - - # Create Context - api.context(TEST_CONTEXT) - - # Create test .txt file - test_txt_path = os.path.join(str(tmpdir), 'test.txt') - with open(test_txt_path, 'w') as f: - f.write('this should not create a bundle') - - # Assert the txt file exists - assert os.path.exists(test_txt_path) - - # Try to add file to the bundle - with pytest.raises(AssertionError) as ex: - api.add(TEST_CONTEXT, 'bad_path', test_txt_path, treat_file_as_bundle=True) - - # Assert Exited with error code of 1 - assert ex.type == AssertionError - - # Make sure bundle does not exist - assert api.get(TEST_CONTEXT, 'test_file_as_bundle_txt_file') is None, 'Bundle should not exist' - - api.delete_context(TEST_CONTEXT) - - if __name__ == '__main__': import tempfile test_single_file(tempfile.gettempdir()) \ No newline at end of file diff --git a/tests/functional/test_api_exit.py b/tests/functional/test_api_exit.py deleted file mode 100644 index f919a29..0000000 --- a/tests/functional/test_api_exit.py +++ /dev/null @@ -1,93 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from disdat.pipe import PipeTask -from disdat.common import ApplyError -import disdat.api as api -import luigi - -from tests.functional.common import TEST_CONTEXT - -TEST_NAME = 'test_bundle' - - -def test(): - """ Purpose of this test is to have one task that produces a bundle. - And another task that requires it. - - 1.) Create external dep -- also creates PreMaker_auf_datamaker - dsdt apply - - test_external_bundle.DataMaker --int_array '[1000,2000,3000]' - - 2.) Remove Premaker_auf_datamaker - dsdt rm PreMaker_auf_datamaker - - 3.) Try to run Root -- it should find DataMaker but not re-create it or PreMaker_auf_datamaker - - """ - - api.context(TEST_CONTEXT) - - result = None - try: - result = api.apply(TEST_CONTEXT, Root, output_bundle='test_api_exit', params={}, force=True, workers=2) - except ApplyError as e: - print ("Got ApplyError exception {} result {} ".format(e, e.result)) - assert(e.result['did_work']) - assert(not e.result['success']) - finally: - print("API apply returned {}".format(result)) - - -class FailBate(PipeTask): - """ - Generate a small data set of possible basketball scores - """ - unique = luigi.Parameter() - - def pipe_requires(self): - self.set_bundle_name("GenData") - - def pipe_run(self): - - if self.unique == 1: - print("Task about to fail . . . ") - _ = 100 / 0 - elif self.unique == 0: - pass - - return - - -class Root(PipeTask): - """ - Average scores of an upstream task - """ - - def pipe_requires(self): - """ Depend on GenData """ - self.add_dependency('task_succeeds', FailBate, {'unique': 0}) - self.add_dependency('task_fails', FailBate, {'unique': 1}) - pass - - def pipe_run(self, **kwargs): - """ Compute average and return as a dictionary """ - return True - - -if __name__ == "__main__": - import multiprocessing as mp - mp.set_start_method('fork') - test() diff --git a/tests/functional/test_api_run.py b/tests/functional/test_api_run.py deleted file mode 100644 index d9d223c..0000000 --- a/tests/functional/test_api_run.py +++ /dev/null @@ -1,141 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os - -import boto3 -import moto -import docker -import pytest - - -from tests.functional.common import run_test, TEST_CONTEXT -from tests.functional.common_tasks import COMMON_DEFAULT_ARGS -import disdat.api as api - - -TEST_NAME = 'test_bundle' -TEST_BUCKET = 'test-bucket' -TEST_BUCKET_URL = "s3://{}".format(TEST_BUCKET) -PWD = os.path.dirname(__file__) -SETUP_DIR = os.path.join(PWD,'..') -PIPELINE_CLS = 'functional.common_tasks.A' - - -@pytest.fixture(scope="module") -def build_container_setup_only(): - """ Create a docker image locally. At the moment we are only - testing whether the basic python setup.py builds correctly. - - TODO: Need to test using the configure directory with - a.) Using a MANIFEST file - b.) Installing custom python packages - c.) Installing rpms - d.) 
Installing R packages - """ - - retval = api.dockerize(SETUP_DIR) - id = api.dockerize_get_id(SETUP_DIR) - yield id - docker_client = docker.from_env() - docker_client.images.remove(id, force=True) - - -def test_run_local_container(run_test, build_container_setup_only): - """ Run the local container. - Test if it runs, test if it re-runs all steps, test if it re-runs last step. - """ - - retval = api.run(SETUP_DIR, - TEST_CONTEXT, - PIPELINE_CLS - ) - - b_b = api.get(TEST_CONTEXT, 'B') - assert b_b is not None - - b_a = api.get(TEST_CONTEXT, 'A') - assert b_a is not None - assert b_a.data == sum(COMMON_DEFAULT_ARGS) - - # Re-run with force all - - retval = api.run(SETUP_DIR, - TEST_CONTEXT, - PIPELINE_CLS, - {'int_array': [1, 2, 3]}, - force_all=True - ) - - b_b_f = api.get(TEST_CONTEXT, 'B') - assert b_b_f is not None - assert b_b.uuid != b_b_f.uuid - - b_a_f = api.get(TEST_CONTEXT, 'A') - assert b_a_f is not None - assert b_a.uuid != b_a_f.uuid - assert b_a_f.data == sum([1, 2, 3]) - - # Re-run with force last one - - retval = api.run(SETUP_DIR, - TEST_CONTEXT, - PIPELINE_CLS, - {'int_array': [1, 2, 3]}, - force=True - ) - - b_b_f2 = api.get(TEST_CONTEXT, 'B') - assert b_b_f2 is not None - assert b_b_f.uuid == b_b_f2.uuid - - b_a_f2 = api.get(TEST_CONTEXT, 'A') - assert b_a_f2 is not None - assert b_a_f.uuid != b_a_f2.uuid - - -#@moto.mock_s3 -def manual_test_run_aws_batch(run_test, build_container_setup_only): - """ Incomplete test. The container code itself needs to have - its S3 access mocked out. Here we are testing manually - """ - - # Setup moto s3 resources - #s3_resource = boto3.resource('s3', region_name='us-east-1') - #s3_resource.create_bucket(Bucket=TEST_BUCKET) - - # Add a remote. Pull and Push! - manual_s3_url = 's3://' - api.remote(TEST_CONTEXT, TEST_CONTEXT, manual_s3_url) - - retval = api.run(SETUP_DIR, - TEST_CONTEXT, - PIPELINE_CLS, - remote_context=TEST_CONTEXT, - remote_s3_url=manual_s3_url, - pull=True, - push=True - ) - - # Blow away everything and pull - api.rm(TEST_CONTEXT, bundle_name='.*', rm_all=True) - api.pull(TEST_CONTEXT) - b = api.get(TEST_CONTEXT, 'A') - assert b.data == sum(COMMON_DEFAULT_ARGS) - - -if __name__ == '__main__': - pytest.main([__file__]) diff --git a/tests/functional/test_context.py b/tests/functional/test_context.py index 73130d8..e162cc7 100644 --- a/tests/functional/test_context.py +++ b/tests/functional/test_context.py @@ -14,18 +14,9 @@ # limitations under the License. # import pytest -from disdat.pipe import PipeTask import disdat.api as api -class ContextTest(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('context_test') - - def pipe_run(self, pipeline_input=None): - return 2 - - def test_create_context(): context_name = '__test__' assert context_name not in api.ls_contexts(), 'Context exists' @@ -43,7 +34,7 @@ def test_independent_context(): api.context(context_1_name) api.context(context_2_name) - api.apply(context_1_name, ContextTest) + _ = api.Bundle(context_1_name, name='context_test', data=2) assert len(api.search(context_1_name)) == 1, 'Only one bundle should be in context one' assert len(api.search(context_2_name)) == 0, 'Context two should be empty' diff --git a/tests/functional/test_external_bundle.py b/tests/functional/test_external_bundle.py deleted file mode 100644 index 61f88fe..0000000 --- a/tests/functional/test_external_bundle.py +++ /dev/null @@ -1,102 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. 
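
Note on the test_context.py hunk just above: it swaps the old PipeTask-based pipeline run for a direct api.Bundle call. In isolation, that pattern looks roughly like the sketch below; the context name 'examples' is purely illustrative, and only calls already used by the updated tests (api.context, api.Bundle, api.search, api.get, api.delete_context) appear.

    import disdat.api as api

    context_name = 'examples'          # illustrative context name
    api.context(context_name)          # create (or attach to) a local context

    # Writing a bundle no longer requires a PipeTask/apply round trip:
    api.Bundle(context_name, name='context_test', data=2)

    # The bundle is immediately visible to search/get in that context only.
    assert len(api.search(context_name)) == 1
    assert api.get(context_name, 'context_test').data == 2

    api.delete_context(context_name)   # clean up
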
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import luigi -import pandas as pd -import numpy as np -from disdat.pipe import PipeTask -import disdat.api as api -import pytest - -from tests.functional.common import run_test, TEST_CONTEXT # autouse fixture to setup / tear down context - - -def test(run_test): - """ Purpose of this test is to have one task that produces a bundle. - And another task that requires it. - - 1.) Run DataMaker which runs PreMaker - 2.) Assert that those ran, and remove PreMaker - 3.) run Root which needs DataMaker (external dep) and PreMaker - 4.) assert that premaker re-ran and Root ran successfully (getting external dependency) - - """ - - api.context(TEST_CONTEXT) - - api.apply(TEST_CONTEXT, DataMaker, params={'int_array': [1000, 2000, 3000]}) - - b = api.get(TEST_CONTEXT, 'PreMaker') - assert(b is not None) - pm_uuid = b.uuid - b.rm() - - api.apply(TEST_CONTEXT, Root) - - b = api.get(TEST_CONTEXT, 'PreMaker') - assert(b is not None) - assert(b.uuid != pm_uuid) - - b = api.get(TEST_CONTEXT, 'Root') - assert(b is not None) - - api.delete_context(TEST_CONTEXT) - - -class DataMaker(PipeTask): - """ Run this by itself. - Then B requires DataMaker as external, and A. """ - - int_array = luigi.ListParameter(default=[1, 2, 3, 5, 8]) - - def pipe_requires(self): - self.set_bundle_name("DataMaker") - self.add_dependency('premaker', PreMaker, params={}) - return - - def pipe_run(self, premaker=None): - - return np.array(self.int_array) - - -class PreMaker(PipeTask): - - printme = luigi.Parameter(default="snarky") - - def pipe_requires(self): - return - - def pipe_run(self): - - print("Task premaker says {}".format(self.printme)) - - return pd.DataFrame({'fark': np.random.randint(100, size=10), 'bark': np.random.randint(10, size=10)}) - - -class Root(PipeTask): - - def pipe_requires(self): - self.add_dependency('premaker', PreMaker, params={}) - self.add_external_dependency('datamaker', DataMaker, {'int_array': [1000, 2000, 3000]}) - - def pipe_run(self, premaker=None, datamaker=None): - print ("Root received a datamaker {}".format(datamaker)) - return - - -if __name__ == '__main__': - pytest.main([__file__]) - #test() diff --git a/tests/functional/test_external_dep.py b/tests/functional/test_external_dep.py deleted file mode 100644 index 0bc3f46..0000000 --- a/tests/functional/test_external_dep.py +++ /dev/null @@ -1,202 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import luigi -import pytest - -from disdat.pipe import PipeTask -import disdat.api as api -from disdat.common import ApplyError -from tests.functional.common import run_test, TEST_CONTEXT # autouse fixture to setup / tear down context - -EXT_BUNDLE_NAME='ext_bundle_human_name' -BUNDLE_CONTENTS=list(range(9)) -EXT_TASK_PARAM_VAL='this is a test value' - - -class ExternalPipeline(PipeTask): - test_param = luigi.Parameter() - - """ External Pipeline """ - def pipe_requires(self): - self.set_bundle_name('external_pipeline') - - def pipe_run(self): - print ("ExternalPipeline called with parameter [{}]".format(self.test_param)) - return BUNDLE_CONTENTS - - -class PipelineA(PipeTask): - test_param = luigi.Parameter(default=EXT_TASK_PARAM_VAL) - throw_assert = luigi.BoolParameter(default=True) - - def pipe_requires(self): - self.set_bundle_name('pipeline_a') - b = self.add_external_dependency('ext_input', ExternalPipeline, {'test_param': self.test_param}) - if self.throw_assert: - assert b is not None - assert list(b.data) == BUNDLE_CONTENTS - - def pipe_run(self, ext_input=None): - assert list(ext_input) == BUNDLE_CONTENTS - return True - - -class PipelineB(PipeTask): - ext_uuid = luigi.Parameter() - - def pipe_requires(self): - self.set_bundle_name('pipeline_b') - b = self.add_external_dependency('ext_input', - ExternalPipeline, - {}, - uuid=self.ext_uuid) - assert b is not None - assert list(b.data) == BUNDLE_CONTENTS - - def pipe_run(self, ext_input=None): - assert list(ext_input) == BUNDLE_CONTENTS - return True - - -class PipelineC(PipeTask): - ext_name = luigi.Parameter() - - def pipe_requires(self): - self.set_bundle_name('pipeline_b') - b = self.add_external_dependency('ext_input', - ExternalPipeline, - {}, - human_name=self.ext_name) - assert b is not None - assert list(b.data) == BUNDLE_CONTENTS - - def pipe_run(self, ext_input=None): - assert list(ext_input) == BUNDLE_CONTENTS - return True - - -def create_bundle_from_pipeline(): - """ Run the internal pipeline, create a bundle, return the uuid - """ - - api.apply(TEST_CONTEXT, - ExternalPipeline, - params={'test_param': EXT_TASK_PARAM_VAL}, - output_bundle=EXT_BUNDLE_NAME) - b = api.get(TEST_CONTEXT, EXT_BUNDLE_NAME) - return b.uuid - - -def test_ord_external_dependency(run_test): - - uuid = create_bundle_from_pipeline() - - api.apply(TEST_CONTEXT, PipelineA) - - result = api.apply(TEST_CONTEXT, PipelineA) - assert result['success'] is True - assert result['did_work'] is False - - -def test_uuid_external_dependency(run_test): - - uuid = create_bundle_from_pipeline() - - api.apply(TEST_CONTEXT, PipelineB, params={'ext_uuid': uuid}) - - result = api.apply(TEST_CONTEXT, PipelineB, params={'ext_uuid': uuid}) - assert result['success'] is True - assert result['did_work'] is False - - -def test_name_external_dependency(run_test): - - uuid = create_bundle_from_pipeline() - - api.apply(TEST_CONTEXT, PipelineC, params={'ext_name': EXT_BUNDLE_NAME}) - - result = api.apply(TEST_CONTEXT, PipelineC, params={'ext_name': EXT_BUNDLE_NAME}) - assert result['success'] is True - assert result['did_work'] is False - - -def test_ord_external_dependency_fail(run_test): - """ Test ability to handle a failed lookup. - Note: Disdat/Luigi swallows exceptions in tasks. Here our tasks - assert that they get back a bundle on their lookup. If we catch it, then the - test succeeds. 
- - Args: - run_test: - - Returns: - - """ - - uuid = create_bundle_from_pipeline() - - try: - result = api.apply(TEST_CONTEXT, PipelineA, params={'test_param': 'never run before'}) - except ApplyError as ae: - print("ERROR: {}".format(ae)) - return - - -def test_uuid_external_dependency_fail(run_test): - """ Test ability to handle a failed lookup. - Note: Disdat/Luigi swallows exceptions in tasks. Here our tasks - assert that they get back a bundle on their lookup. If we catch it, then the - test succeeds. - - Args: - run_test: - - Returns: - - """ - - uuid = create_bundle_from_pipeline() - try: - result = api.apply(TEST_CONTEXT, PipelineB, params={'ext_uuid': 'not a valid uuid'}) - except ApplyError as ae: - print("ERROR: {}".format(ae)) - return - - -def test_name_external_dependency_fail(run_test): - """ Test ability to handle a failed lookup. - Note: Disdat/Luigi swallows exceptions in tasks. Here our tasks - assert that they get back a bundle on their lookup. If we catch it, then the - test succeeds. - - Args: - run_test: - - Returns: - - """ - - uuid = create_bundle_from_pipeline() - try: - result = api.apply(TEST_CONTEXT, PipelineC, params={'ext_name': 'not a bundle name'}) - except ApplyError as ae: - print("ERROR: {}".format(ae)) - return - - -if __name__ == '__main__': - pytest.main([__file__+"::test_ord_external_dependency_fail"]) diff --git a/tests/functional/test_force_one_and_all.py b/tests/functional/test_force_one_and_all.py deleted file mode 100644 index 3d28b2c..0000000 --- a/tests/functional/test_force_one_and_all.py +++ /dev/null @@ -1,87 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import luigi -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import run_test, TEST_CONTEXT - -TEST_NAME = 'test_bundle' - - -def test(run_test): - """ This tests if apply force=True and force_all=True re-run everything. - We have two tasks. One depends on the other. - force_all should re-run both, force should re-run only the last. - """ - - # first run there should be no bundles - #assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, A, params={}) - first_B_uuid = api.get(TEST_CONTEXT, 'B').uuid - first_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - - # second, force re-run last task - api.apply(TEST_CONTEXT, A, force=True, params={}) - one_B_uuid = api.get(TEST_CONTEXT, 'B').uuid - one_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - assert(first_B_uuid == one_B_uuid) - assert(first_A_uuid != one_A_uuid) - - # second, force all to re-run. - api.apply(TEST_CONTEXT, A, force_all=True, params={}) - all_B_uuid = api.get(TEST_CONTEXT, 'B').uuid - all_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - assert(all_B_uuid != one_B_uuid) - assert(all_A_uuid != one_A_uuid) - - # third, make sure a force_all doesn't crash if there is an external bundle. 
- api.apply(TEST_CONTEXT, A, force_all=True, params={'set_ext_dep': True}) - final_B_uuid = api.get(TEST_CONTEXT, 'B').uuid - final_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - assert(final_B_uuid == all_B_uuid) - assert(final_A_uuid != all_A_uuid) - - -class B(PipeTask): - - def pipe_requires(self): - self.set_bundle_name("B") - return - - def pipe_run(self): - print ("Task B finished.") - - return True - - -class A(PipeTask): - set_ext_dep = luigi.BoolParameter(default=False) - - def pipe_requires(self): - self.set_bundle_name("A") - if self.set_ext_dep: - self.add_external_dependency('B', B, params={}) - else: - self.add_dependency('B', B, {}) - - def pipe_run(self, B=None): - print ("Task A finished.") - return - - -if __name__ == '__main__': - test() diff --git a/tests/functional/test_inc_pull.py b/tests/functional/test_inc_pull.py deleted file mode 100644 index 41e487b..0000000 --- a/tests/functional/test_inc_pull.py +++ /dev/null @@ -1,151 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -""" -Test Incremental Pull - -Use API to create a bundle with some files -push to remote context (test assumes that you have AWS credentials for an account. - -author: Kenneth Yocum -""" -import boto3 -import luigi -import moto -import pytest - -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import TEST_CONTEXT - - -TEST_REMOTE = '__test_remote_context__' -TEST_BUCKET = 'test-bucket' -TEST_BUCKET_URL = "s3://{}".format(TEST_BUCKET) - - -class AIP(PipeTask): - def pipe_requires(self): - self.set_bundle_name('a') - - def pipe_run(self): - - target = self.create_output_file('a.txt') - with target.open('w') as output: - output.write('Hi!') - return {'file': [target]} - - -class BIP(PipeTask): - - n = luigi.IntParameter() - - def pipe_requires(self): - self.set_bundle_name('b') - self.add_dependency('a', AIP, params={}) - - def pipe_run(self, a): - target = self.create_output_file('b.txt') - - a_path = a['file'][0] - with open(a_path) as f: - print(f.read()) - - with target.open('w') as output: - output.write(str(self.n)) - return {'file': [target]} - - -class CIP(PipeTask): - - n = luigi.IntParameter(default=2) - - def pipe_requires(self): - self.set_bundle_name('c') - self.add_dependency('b', BIP, params={'n': self.n}) - - def pipe_run(self, b=None): - target = self.create_output_file('c.txt') - with target.open('w') as output: - output.write(str(self.n + 5)) - return {'file': [target]} - - -@moto.mock_s3 -def test_add_with_treat_as_bundle(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - - # Bind remote context - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) 
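
The removed incremental-pull test below exercises the same remote round trip that the retained test_remote.py still covers. A condensed sketch of that flow, with illustrative context and bucket names and only the API calls these tests themselves use, might look like:

    import disdat.api as api

    ctx = '__test_context__'                 # illustrative local context name
    remote_ctx = '__test_remote_context__'   # illustrative remote context name
    s3_url = 's3://test-bucket'              # illustrative bucket URL

    api.context(ctx)
    api.remote(ctx, remote_ctx, s3_url)      # bind the local context to a remote

    api.Bundle(ctx, name='remote_test', data='Hello')
    api.commit(ctx, 'remote_test')           # the removed tests commit before pushing
    api.push(ctx, 'remote_test')             # copy the bundle to the S3 remote

    # Simulate a fresh checkout: drop the local context and pull it back down.
    api.delete_context(ctx)
    api.context(ctx)
    api.remote(ctx, remote_ctx, s3_url)
    api.pull(ctx)

    assert api.get(ctx, 'remote_test').data == 'Hello'
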
- - # Run test pipeline - api.apply(TEST_CONTEXT, CIP) - - # Push bundles to remote - for bundle_name in ['a', 'b', 'c']: - assert api.get(TEST_CONTEXT, bundle_name) is not None, 'Bundle should exist' - - api.commit(TEST_CONTEXT, bundle_name) - api.push(TEST_CONTEXT, bundle_name) - - # Blow away context and recreate - api.delete_context(TEST_CONTEXT) - assert TEST_CONTEXT not in api.ls_contexts() - - api.context(context_name=TEST_CONTEXT) - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - - assert api.search(TEST_CONTEXT) == [], 'Context should be empty' - - # Pull bundles from remote - api.pull(TEST_CONTEXT) - - # Make sure all bundle meta data comes down but data remains in S3 - for bundle_name in ['a', 'b', 'c']: - bundle = api.get(TEST_CONTEXT, bundle_name) - assert bundle is not None, 'Bundle should exist' - - data_path = bundle.data['file'][0] - assert data_path.startswith('s3://'), 'Data should be in S3' - - # Rerun pipeline - api.apply(TEST_CONTEXT, BIP, params={'n': 100}, incremental_pull=True) - - # Make sure all bundles exist. Bundles a and b should have local paths - for bundle_name in ['a', 'b', 'c']: - bundle = api.get(TEST_CONTEXT, bundle_name) - assert bundle is not None, 'Bundle should exist' - - data_path = bundle.data['file'][0] - if bundle_name in ['a', 'b']: - assert not data_path.startswith('s3://'), 'Data should be local' - else: - assert data_path.startswith('s3://'), 'Data should be in S3' - - api.delete_context(TEST_CONTEXT) - - -if __name__ == '__main__': - pytest.main([__file__]) diff --git a/tests/functional/test_inc_push.py b/tests/functional/test_inc_push.py deleted file mode 100644 index ab6b06f..0000000 --- a/tests/functional/test_inc_push.py +++ /dev/null @@ -1,115 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -""" -Test Incremental Push - -Use API to create a bundle with some files -push to remote context - -author: Kenneth Yocum -""" -import boto3 -import luigi -import moto - -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import TEST_CONTEXT - -TEST_REMOTE = '__test_remote_context__' -TEST_BUCKET = 'test-bucket' -TEST_BUCKET_URL = "s3://{}".format(TEST_BUCKET) - - -class APush(PipeTask): - def pipe_requires(self): - self.set_bundle_name('a') - - def pipe_run(self): - - target = self.create_output_file('a.txt') - with target.open('w') as output: - output.write('Hi!') - return {'file': [target]} - - -class BPush(PipeTask): - - n = luigi.IntParameter() - - def pipe_requires(self): - self.set_bundle_name('b') - self.add_dependency('a', APush, params={}) - - def pipe_run(self, a): - target = self.create_output_file('b.txt') - - a_path = a['file'][0] - with open(a_path) as f: - print(f.read()) - - with target.open('w') as output: - output.write(str(self.n)) - return {'file': [target]} - - -class CPush(PipeTask): - - n = luigi.IntParameter(default=2) - - def pipe_requires(self): - self.set_bundle_name('c') - self.add_dependency('b', BPush, params={'n': self.n}) - - def pipe_run(self, b=None): - # Barf! - raise Exception - - -@moto.mock_s3 -def test_add_with_treat_as_bundle(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - - # Bind remote context - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - - # Try to run the pipeline - should fail - try: - # Run test pipeline - api.apply(TEST_CONTEXT, CPush, incremental_push=True) - except Exception as e: - pass - - # Get objects from remote - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - keys = [o['Key'] for o in objects['Contents']] - keys = [key.split('/')[-1] for key in keys] - - # Make sure files exist in S3 - for output_file in ['a.txt', 'b.txt']: - assert output_file in keys, 'Pipeline should have pushed file' - - api.delete_context(TEST_CONTEXT) diff --git a/tests/functional/test_managed.py b/tests/functional/test_managed.py deleted file mode 100644 index 3bbff83..0000000 --- a/tests/functional/test_managed.py +++ /dev/null @@ -1,342 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -""" -Test Incremental Push - -Use API to create a bundle with some files -push to remote context - -author: Sayantan Satpati -""" -import boto3 -import moto -import pandas as pd -import pytest -import os - -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import run_test, TEST_CONTEXT - -TEST_REMOTE = '__test_remote_context__' -TEST_BUCKET = 'test-bucket' -TEST_BUCKET_OTHER = 'test-bucket-other' -TEST_BUCKET_URL = "s3://{}".format(TEST_BUCKET) - -# ================================================== # -# ================ Disdat Tasks ==================== # -# ================================================== # - - -class NonManagedLocal(PipeTask): - def pipe_requires(self): - self.set_bundle_name('b1') - - def pipe_run(self): - # Task output was created on some local path - d = {'col1': [1, 2], 'col2': [3, 4]} - df = pd.DataFrame(data=d) - local_file = '/tmp/test.parquet' - df.to_parquet(local_file) - return {'file': [local_file]} - - -class NonManagedS3(PipeTask): - def pipe_requires(self): - self.set_bundle_name('b2') - - def pipe_run(self): - # Task output was created on some S3 path - d = {'col1': [1, 2], 'col2': [3, 4]} - df = pd.DataFrame(data=d) - s3_file = 's3://{}/test.parquet'.format(TEST_BUCKET_OTHER) - df.to_parquet(s3_file) - - return {'file': [s3_file]} - - -class ManagedLocal(PipeTask): - def pipe_requires(self): - self.set_bundle_name('b3') - - def pipe_run(self): - target = self.create_output_file('test.parquet') - - # Write dataframe to S3 Managed Path - d = {'col1': [1, 2], 'col2': [3, 4]} - df = pd.DataFrame(data=d) - with target.temporary_path() as temp_output_path: - df.to_parquet(temp_output_path) - - return {'file': [target.path]} - - -class ManagedS3(PipeTask): - def pipe_requires(self): - self.set_bundle_name('b4') - - def pipe_run(self): - target = self.create_remote_output_file('test.parquet') - - # Write dataframe to S3 Managed Path - d = {'col1': [1, 2], 'col2': [3, 4]} - df = pd.DataFrame(data=d) - with target.temporary_path() as temp_output_path: - df.to_parquet(temp_output_path) - - return {'file': [target.path]} - - -# ================================================== # -# ================== Tests ========================= # -# ================================================== # - - -def test_managed_local(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - api.apply(TEST_CONTEXT, ManagedLocal) - assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' - print(api.cat(TEST_CONTEXT, 'b3')) - - assert os.path.exists(api.search(TEST_CONTEXT, human_name='b3')[0].data['file'][0]), \ - 'Local file should be present in bundle' - - -def test_non_managed_local(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - api.apply(TEST_CONTEXT, NonManagedLocal) - assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' - print(api.cat(TEST_CONTEXT, 'b1')) - - assert os.path.exists(api.search(TEST_CONTEXT, human_name='b1')[0].data['file'][0]), \ - 'Local file should be present in bundle' - - -@moto.mock_s3 -def test_remote_push_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - 
s3_resource.create_bucket(Bucket=TEST_BUCKET) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - - # Bind remote context - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - - # Apply - api.apply(TEST_CONTEXT, ManagedS3, incremental_push=True) - - assert not os.path.exists(api.search(TEST_CONTEXT, human_name='b4')[0].data['file'][0]), \ - 'Managed S3 file should not be copied to local' - - # Get objects from remote - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - keys = [o['Key'] for o in objects['Contents']] - keys = [key.split('/')[-1] for key in keys] - - # Make sure files exist in S3 - for output_file in ['test.parquet']: - assert output_file in keys, 'Pipeline should have pushed file' - - -@moto.mock_s3 -def test_remote_push_non_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - s3_resource.create_bucket(Bucket=TEST_BUCKET_OTHER) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - objects = s3_client.list_objects(Bucket=TEST_BUCKET_OTHER) - assert 'Contents' not in objects, 'Bucket should be empty' - - # Bind remote context - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - - # Apply - api.apply(TEST_CONTEXT, NonManagedS3, incremental_push=True) - print(api.cat(TEST_CONTEXT, 'b2')) - - # Local context should not contain file if a remote exists. - b = api.search(TEST_CONTEXT, human_name='b2')[0] - assert not os.path.exists(b.data['file'][0]), 'Non Managed S3 file w/ remote should be copied to remote' - b.pull(localize=True) - assert os.path.exists(b.data['file'][0]), 'Non Managed S3 file after pull should be copied to local' - - # Get objects from remote - objects = s3_client.list_objects(Bucket=TEST_BUCKET_OTHER) - keys = [o['Key'] for o in objects['Contents']] - keys = [key.split('/')[-1] for key in keys] - - # Make sure files exist in S3 - for output_file in ['test.parquet']: - assert output_file in keys, 'Pipeline should have pushed file' - - -@moto.mock_s3 -def test_remote_no_push_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - - # Bind remote context - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - - with pytest.raises(Exception) as e: - api.apply(TEST_CONTEXT, ManagedS3) - - -@moto.mock_s3 -def test_remote_no_push_non_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - s3_resource.create_bucket(Bucket=TEST_BUCKET_OTHER) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - objects = s3_client.list_objects(Bucket=TEST_BUCKET_OTHER) - assert 'Contents' not in objects, 'Bucket should be 
empty' - - # Bind remote context - api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - - # Apply - api.apply(TEST_CONTEXT, NonManagedS3) - print(api.cat(TEST_CONTEXT, 'b2')) - - # Local context should not contain file if a remote exists. - b = api.search(TEST_CONTEXT, human_name='b2')[0] - assert not os.path.exists(b.data['file'][0]), 'Non Managed S3 file w/ remote should be copied to remote' - assert b.data['file'][0].startswith("s3://") - - -def test_no_remote_push_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - with pytest.raises(Exception) as e: - api.apply(TEST_CONTEXT, ManagedS3, incremental_push=True) - - -@moto.mock_s3 -def test_no_remote_push_non_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - s3_resource.create_bucket(Bucket=TEST_BUCKET_OTHER) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - objects = s3_client.list_objects(Bucket=TEST_BUCKET_OTHER) - assert 'Contents' not in objects, 'Bucket should be empty' - - api.apply(TEST_CONTEXT, NonManagedS3, incremental_push=True) - print(api.cat(TEST_CONTEXT, 'b2')) - assert len(api.search(TEST_CONTEXT)) == 1, 'One bundle should be present' - - assert os.path.exists(api.search(TEST_CONTEXT, human_name='b2')[0].data['file'][0]), \ - 'Non Managed S3 file should be copied to local' - - -def test_no_remote_no_push_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - with pytest.raises(Exception) as e: - api.apply(TEST_CONTEXT, ManagedS3) - - -@moto.mock_s3 -def test_no_remote_no_push_non_managed_s3(): - api.delete_context(TEST_CONTEXT) - api.context(context_name=TEST_CONTEXT) - - # Setup moto s3 resources - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3', region_name='us-east-1') - s3_resource.create_bucket(Bucket=TEST_BUCKET) - s3_resource.create_bucket(Bucket=TEST_BUCKET_OTHER) - - # Make sure bucket is empty - objects = s3_client.list_objects(Bucket=TEST_BUCKET) - assert 'Contents' not in objects, 'Bucket should be empty' - objects = s3_client.list_objects(Bucket=TEST_BUCKET_OTHER) - assert 'Contents' not in objects, 'Bucket should be empty' - - # Apply - api.apply(TEST_CONTEXT, NonManagedS3) - print(api.cat(TEST_CONTEXT, 'b2')) - assert len(api.search(TEST_CONTEXT)) == 1, 'One bundle should be present' - - assert os.path.exists(api.search(TEST_CONTEXT, human_name='b2')[0].data['file'][0]), \ - 'Non Managed S3 file should be copied to local' - - -if __name__ == '__main__': - #test_remote_push_managed_s3() - pytest.main([__file__]) - - - - - - - - - diff --git a/tests/functional/test_mark_force.py b/tests/functional/test_mark_force.py deleted file mode 100644 index 500dc92..0000000 --- a/tests/functional/test_mark_force.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import luigi -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import run_test, TEST_CONTEXT - -TEST_NAME = 'test_bundle' - - -def test(run_test): - """ This tests if mark_force works for tasks. - We have two tasks. One depends on the other. The upstream is marked - mark_force and should always run. - """ - - def run_and_get(name, do_ext=False): - api.apply(TEST_CONTEXT, A_2, params={'set_ext_dep': do_ext}) - b = api.get(TEST_CONTEXT, 'B') - print ("Run {}: b.creation_date {} b.uuid {}".format(name, b.creation_date, b.uuid)) - return b - - b = run_and_get("One") - first_uuid = b.uuid - - b = run_and_get("Two") - assert(first_uuid != b.uuid) - second_uuid = b.uuid - - b = run_and_get("Three", do_ext=True) - assert(second_uuid == b.uuid) - - -class B_2(PipeTask): - - def pipe_requires(self): - self.set_bundle_name("B") - self.mark_force() - return - - def pipe_run(self): - print ("Task B finished.") - - return True - - -class A_2(PipeTask): - set_ext_dep = luigi.BoolParameter(default=False) - - def pipe_requires(self): - if self.set_ext_dep: - self.add_external_dependency('B', B_2, params={}) - else: - self.add_dependency('B', B_2, {}) - - def pipe_run(self, B=None): - print ("Task A finished.") - return - - -if __name__ == '__main__': - test() diff --git a/tests/functional/test_output_types.py b/tests/functional/test_output_types.py index 7553191..faba60b 100644 --- a/tests/functional/test_output_types.py +++ b/tests/functional/test_output_types.py @@ -19,7 +19,6 @@ import pytest import six -from disdat.pipe import PipeTask import disdat.api as api from tests.functional.common import TEST_CONTEXT @@ -38,19 +37,10 @@ def setup(): api.context(context_name=TEST_CONTEXT) -# Test Return Types -class IntTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('int_task') - - def pipe_run(self, pipeline_input=None): - return 1 - - def test_int_task(): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, IntTask) + _ = api.Bundle(TEST_CONTEXT, name='int_task', data=1) data = api.get(TEST_CONTEXT, 'int_task').data assert data == 1, 'Data did not match output' @@ -58,18 +48,10 @@ def test_int_task(): assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' -class StringTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('string_task') - - def pipe_run(self, pipeline_input=None): - return 'output' - - def test_string_task(): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, StringTask) + _ = api.Bundle(TEST_CONTEXT, name='string_task', data='output') data = api.get(TEST_CONTEXT, 'string_task').data assert data == 'output', 'Data did not match output' @@ -77,18 +59,10 @@ def test_string_task(): assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' -class FloatTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('float_task') - - def pipe_run(self, pipeline_input=None): - return 2.5 - - def test_float_task(): assert 
len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, FloatTask) + _ = api.Bundle(TEST_CONTEXT, name='float_task', data=2.5) data = api.get(TEST_CONTEXT, 'float_task').data assert data == 2.5, 'Data did not match output' @@ -96,18 +70,10 @@ def test_float_task(): assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' -class ListTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('list_task') - - def pipe_run(self, pipeline_input=None): - return [1, 2, 3] - - def test_list_task(): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, ListTask) + _ = api.Bundle(TEST_CONTEXT, name='list_task', data=[1, 2, 3]) data = api.get(TEST_CONTEXT, 'list_task').data assert np.array_equal(data, [1, 2, 3]), 'Data did not match output' @@ -115,46 +81,29 @@ def test_list_task(): assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' -class DataFrameTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('df_task') - - def pipe_run(self, pipeline_input=None): - df = pd.DataFrame() - df['a'] = [1, 2, 3] - return df - - def test_df_task(): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, DataFrameTask) - data = api.get(TEST_CONTEXT, 'df_task').data - df = pd.DataFrame() df['a'] = [1, 2, 3] + _ = api.Bundle(TEST_CONTEXT, name='df_task', data=df) + data = api.get(TEST_CONTEXT, 'df_task').data + assert df.equals(data), 'Data did not match output' assert type(data) == pd.DataFrame, 'Data is not df' assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' -class FileTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('file_task') - - def pipe_run(self, pipeline_input=None): - target = self.create_output_file('test.txt') - with target.open('w') as of: - of.write('5') - - return target - - def test_file_task(): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, FileTask) + with api.Bundle(TEST_CONTEXT, name='file_task') as b: + f1 = b.get_file("test.txt") + with open(f1, mode='w') as f: + f.write('5') + b.add_data(f1) + output_path = api.get(TEST_CONTEXT, 'file_task').data with open(output_path) as f: @@ -165,27 +114,18 @@ def test_file_task(): assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' -class DictTask(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('dict_task') - - def pipe_run(self, pipeline_input=None): - return { - 'hello': ['world'] - } - - def test_dict_task(): setup() assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - api.apply(TEST_CONTEXT, DictTask) - data = api.get(TEST_CONTEXT, 'dict_task').data + d = {'hello': ['world']} + _ = api.Bundle(TEST_CONTEXT, name='dict_task', data=d) + d = api.get(TEST_CONTEXT, 'dict_task').data - assert data == { + assert d == { 'hello': ['world'] }, 'Data did not match output' - assert type(data) == dict, 'Data is not dict' + assert type(d) == dict, 'Data is not dict' assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' diff --git a/tests/functional/test_pipeline.py b/tests/functional/test_pipeline.py deleted file mode 100644 index b8cea39..0000000 --- a/tests/functional/test_pipeline.py +++ /dev/null @@ -1,118 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. 
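
For file outputs, the reworked test_file_task above drops the PipeTask target in favor of the Bundle context manager. Stripped of test scaffolding, that usage is roughly the following (the 'examples' context name is illustrative; the calls mirror the updated test):

    import disdat.api as api

    ctx = 'examples'                      # illustrative context name
    api.context(ctx)

    # Open a bundle, ask it for a managed file path, write to it, then
    # register the file as the bundle's data before the bundle closes.
    with api.Bundle(ctx, name='file_task') as b:
        path = b.get_file('test.txt')
        with open(path, 'w') as f:
            f.write('5')
        b.add_data(path)

    # Reading it back returns the managed path to the file's contents.
    output_path = api.get(ctx, 'file_task').data
    with open(output_path) as f:
        assert f.read() == '5'
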
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import luigi - -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import run_test, TEST_CONTEXT - - -class A(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('a') - - def pipe_run(self, pipeline_input=None): - return 2 - - -class B(PipeTask): - - n = luigi.IntParameter() - - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('b') - - def pipe_run(self, pipeline_input=None): - return 2 * self.n - - -class C(PipeTask): - - n = luigi.IntParameter(default=2) - - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('c') - self.add_dependency('a', A, params={}) - self.add_dependency('b', B, params={'n': self.n}) - - def pipe_run(self, pipeline_input=None, a=None, b=None): - return a + b - - -def test_single_task(): - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - api.apply(TEST_CONTEXT, A) - data = api.get(TEST_CONTEXT, 'a').data - - assert data == 2, 'Data did not match output' - assert type(data) == int, 'Data is not path' - assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' - - api.apply(TEST_CONTEXT, A) - assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present' - - -def test_dependant_tasks(): - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - api.apply(TEST_CONTEXT, C) - data = api.get(TEST_CONTEXT, 'c').data - - assert data == 6, 'Data did not match output' - assert type(data) == int, 'Data is not path' - assert len(api.search(TEST_CONTEXT)) == 3, 'Three bundles should be present' - - -def test_task_with_parameter(): - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - api.apply(TEST_CONTEXT, B, params={'n': 10}) - data = api.get(TEST_CONTEXT, 'b').data - - assert data == 20, 'Data did not match output' - assert type(data) == int, 'Data is not path' - assert len(api.search(TEST_CONTEXT)) == 1, 'One bundle should be present' - - api.apply(TEST_CONTEXT, B, params={'n': 20}) - data = api.get(TEST_CONTEXT, 'b').data - - assert data == 40, 'Data did not match output' - assert type(data) == int, 'Data is not path' - assert len(api.search(TEST_CONTEXT)) == 2, 'Two bundles should be present' - - -def test_child_task_with_parameter(): - assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' - - api.apply(TEST_CONTEXT, C, params={'n': 10}) - data = api.get(TEST_CONTEXT, 'c').data - - assert data == 22, 'Data did not match output' - assert type(data) == int, 'Data is not path' - assert len(api.search(TEST_CONTEXT)) == 3, 'Three bundles should be present' - - api.apply(TEST_CONTEXT, C, params={'n': 20}) - data = api.get(TEST_CONTEXT, 'c').data - - assert data == 42, 'Data did not match output' - assert type(data) == int, 'Data is not path' - assert len(api.search(TEST_CONTEXT)) == 5, 'Five bundles should be present' - - -if __name__ == '__main__': - api.apply(TEST_CONTEXT, A, workers=1) diff --git 
a/tests/functional/test_remote.py b/tests/functional/test_remote.py index b3ae2eb..b77eb3d 100644 --- a/tests/functional/test_remote.py +++ b/tests/functional/test_remote.py @@ -18,7 +18,6 @@ import moto import pytest -from disdat.pipe import PipeTask import disdat.api as api from tests.functional.common import run_test, TEST_CONTEXT @@ -27,14 +26,6 @@ TEST_BUCKET_URL = "s3://{}".format(TEST_BUCKET) -class RemoteTest(PipeTask): - def pipe_requires(self, pipeline_input=None): - self.set_bundle_name('remote_test') - - def pipe_run(self, pipeline_input=None): - return 'Hello' - - @moto.mock_s3 def test_push(run_test): s3_client = boto3.client('s3') @@ -48,7 +39,7 @@ def test_push(run_test): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - api.apply(TEST_CONTEXT, RemoteTest) + _ = api.Bundle(TEST_CONTEXT, name='remote_test', data='Hello') bundle = api.get(TEST_CONTEXT, 'remote_test') assert bundle.data == 'Hello' @@ -77,7 +68,7 @@ def test_pull(run_test): assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty' api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL) - api.apply(TEST_CONTEXT, RemoteTest) + _ = api.Bundle(TEST_CONTEXT, name='remote_test', data='Hello') bundle = api.get(TEST_CONTEXT, 'remote_test') assert bundle.data == 'Hello' @@ -95,8 +86,8 @@ def test_pull(run_test): api.pull(TEST_CONTEXT) pulled_bundles = api.search(TEST_CONTEXT) - assert len(pulled_bundles) > 0, 'Pulled bundles down' - assert pulled_bundles[0].data == 'Hello', 'Bundle contains correct data' + assert len(pulled_bundles) > 0, 'No bundles were pulled' + assert pulled_bundles[0].data == 'Hello', 'Bundle contains incorrect data' bucket.objects.all().delete() bucket.delete() diff --git a/tests/functional/test_requires.py b/tests/functional/test_requires.py deleted file mode 100644 index f44d54c..0000000 --- a/tests/functional/test_requires.py +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - - -import pytest - -import disdat.api as api -from tests.functional.common import run_test, TEST_CONTEXT - -from disdat.pipe import PipeTask - -""" Purpose of this test is to show that if you return nothing, you -still need to get the input in the downstream task. 
See git issue - https://github.com/kyocum/disdat/issues/31 - """ - -WORKERS = 2 - -class a(PipeTask): - def pipe_requires(self): - return - - def pipe_run(self): - return - - -class b(PipeTask): - def pipe_requires(self): - self.add_dependency('something', a, {}) - - def pipe_run(self, something=None): - print("Return type {}, object: {}".format(type(something), something)) - assert something is None - - -def test_requires(run_test): - api.apply(TEST_CONTEXT, b, params={}, workers=WORKERS) - - -if __name__ == '__main__': - api.delete_context(TEST_CONTEXT) - api.context(TEST_CONTEXT) - test_requires(run_test) - #pytest.main([__file__]) diff --git a/tests/functional/test_reuse_logic.py b/tests/functional/test_reuse_logic.py deleted file mode 100644 index 9a42be6..0000000 --- a/tests/functional/test_reuse_logic.py +++ /dev/null @@ -1,330 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import pytest -import luigi -from disdat.pipe import PipeTask -import disdat.api as api -from tests.functional.common import run_test, TEST_CONTEXT, setup - -""" This set of tests verifies the behavior of Disdat's re-execution / data re-use logic. - - Luigi has a simple re-run logic: do not re-run tasks when the targets returned by the output function exist. - - Note that because task parameters should define up-stream dependencies, a given task should only re-use its result - when its upstream required tasks have also re-used their results. If a tasks parameters have not changed, its dependencies - should not change, those cached results may be used, and so on. - - But tasks with the same parameters may be re-executed for a variety of real-world reasons. First, the task code may have - changed. Or the task may be reading from an external data source that has changed (a database table). While this - second case should be handled by the programmer (parameterizing by the time of db access), the programmer may forget or - the time unit may not be of sufficient granularity. E.g., parametrize by day read, but the database changes more frequently. - Or an external up-stream input, produced by another pipeline, has been updated. - - In addition, it is possible for a layer above Disdat (or Luigi), could dynamically create tasks, and parameterize them - outside of the normal pattern of "right-to-left" or "last task passes parameters to first task" pattern. - - Finally, Disdat de-couples absolute file paths (used in Luigi) from the logical name of a task's output. What this - means is that absolute file paths alone do not dictate whether a task should re-use that output. Instead, whether any - *version* of a parameterized task's output exists determines if it should re-run. - - In all of these cases, the downstream task should be re-run when new versions of upstream outputs exist. Disdat builds - upon Luigi's straightforward logic to determine whether to re-execute or re-use existing outputs in these conditions. 
- - In particular, Disdat uses the processing_name as the notion of versioning "sameness" for re-use. The processing name - is, a summary of the current task's parameters and a summary of the parameters of its upstream tasks. In essence, it is - is analogous to a Merkle Tree, summarizing the state of the world used to run a single task. - - Note that UUIDs are "lineage" names. They tell us exactly the data used to produce an output. - - Tests: * means new params - 1.) Force and Force all are handled by test_force_one_and_all.py - 2.) Run A, Run A, should re-use - 3.) Run A, Run A*, should re-run - 4.) Run A->B, Re-run A*. Run A->B, nothing should run. - 5.) Run A->B, re-run A (force), Run A->B, B should re-run. - 6.) Run A->B, Re-run A*. Run A*->B, B should re-run. - 7.) Run A->B->C, Run A*->B. Run A->B->C, nothing should run - 8.) Run A->B->C, Run A*->B. Run A*->B->C, C should re-run - 9.) Run A,B -> C(a=A,b=B) Run A,B -> C(a=B,b=A), C should run both times. - - """ - -WORKERS = 2 - -def test_A2_A3(run_test): - """ - 2.) Run A, Run A, should re-use - 3.) Run A, Run A*, should re-run - """ - - result = api.apply(TEST_CONTEXT, A, workers=WORKERS) - assert result['did_work'] is True - first_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - result = api.apply(TEST_CONTEXT, A, workers=WORKERS) - assert result['did_work'] is False - second_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - assert first_A_uuid == second_A_uuid - assert len(api.search(TEST_CONTEXT, 'A')) == 1 - - # Mod args, should re-run - result = api.apply(TEST_CONTEXT, A, params={'a': 2,'b': 3}, workers=WORKERS) - assert result['did_work'] is True - next_A_uuid = api.get(TEST_CONTEXT, 'A').uuid - assert next_A_uuid != second_A_uuid - assert len(api.search(TEST_CONTEXT, 'A')) == 2 - - -def test_AB4(run_test): - """ - 4.) Run A->B, Re-run A*. Run A->B, nothing should run. - """ - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - - result = api.apply(TEST_CONTEXT, A, params={'a':2, 'b':3}, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is False - - -def test_AB5(run_test): - """ - 5.) Run A->B, re-run A (force), Run A->B, B should re-run. - """ - api.delete_context(TEST_CONTEXT) - api.context(TEST_CONTEXT) - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - - result = api.apply(TEST_CONTEXT, A, force=True, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - - -def test_AB6(run_test): - """ - 6.) Run A->B, Re-run A*. Run A*->B, B should re-run. 
- - Args: - run_test: - - Returns: - - """ - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - B_uuid = api.get(TEST_CONTEXT, 'B').uuid - - result = api.apply(TEST_CONTEXT, APrime, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - APrime_uuid = api.get(TEST_CONTEXT, 'APrime').uuid - - def custom_B_requires(self): - self.add_dependency('a', APrime, params={}) - - old_requires = B.pipe_requires - B.pipe_requires = custom_B_requires - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - assert APrime_uuid == api.get(TEST_CONTEXT, 'APrime').uuid - assert B_uuid != api.get(TEST_CONTEXT, 'B').uuid - - B.pipe_requires = old_requires - - -def test_ABC7(run_test): - """ - 7.) Run A->B->C, Run A*->B. Run A->B->C, nothing should run - - Args: - run_test: - - Returns: - - """ - - result = api.apply(TEST_CONTEXT, C, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - B_uuid = api.get(TEST_CONTEXT, 'B').uuid - - def custom_B_requires(self): - self.add_dependency('a', APrime, params={}) - - old_requires = B.pipe_requires - B.pipe_requires = custom_B_requires - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - assert B_uuid != api.get(TEST_CONTEXT, 'B').uuid # should have a new B - - B.pipe_requires = old_requires - - result = api.apply(TEST_CONTEXT, C, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is False - - -def test_ABC8(run_test): - """ - 8.) Run A->B->C, Run A*->B. Run A*->B->C, C should re-run - - Args: - run_test: - - Returns: - - """ - - result = api.apply(TEST_CONTEXT, C, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - B_uuid = api.get(TEST_CONTEXT, 'B').uuid - - def custom_B_requires(self): - self.add_dependency('a', APrime, params={}) - - old_requires = B.pipe_requires - B.pipe_requires = custom_B_requires - - result = api.apply(TEST_CONTEXT, B, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - assert B_uuid != api.get(TEST_CONTEXT, 'B').uuid # should have a new B - B_uuid = api.get(TEST_CONTEXT, 'B').uuid - APrime_uuid = api.get(TEST_CONTEXT, 'APrime').uuid - - result = api.apply(TEST_CONTEXT, C, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - assert B_uuid == api.get(TEST_CONTEXT, 'B').uuid - assert APrime_uuid == api.get(TEST_CONTEXT, 'APrime').uuid - - B.pipe_requires = old_requires - - -def test_bundle_depsABC9(run_test): - """ - 10.) Run A,B -> C(a=A,b=B) Run A,B -> C(a=B,b=A), C should run both times. 
- - Args: - run_test: - - Returns: - - """ - - def custom_C_requires(self): - self.add_dependency('a', A, params={}) - self.add_dependency('b', B, params={}) - - old_requires = C.pipe_requires - C.pipe_requires = custom_C_requires - - result = api.apply(TEST_CONTEXT, C, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - A_uuid = api.get(TEST_CONTEXT, 'A').uuid - B_uuid = api.get(TEST_CONTEXT, 'B').uuid - C_uuid = api.get(TEST_CONTEXT, 'C').uuid - - def custom_C_requires_swap(self): - self.add_dependency('a', B, params={}) - self.add_dependency('b', A, params={}) - - C.pipe_requires = custom_C_requires_swap - - result = api.apply(TEST_CONTEXT, C, workers=WORKERS) - assert result['success'] is True - assert result['did_work'] is True - assert A_uuid == api.get(TEST_CONTEXT, 'A').uuid - assert B_uuid == api.get(TEST_CONTEXT, 'B').uuid - assert C_uuid != api.get(TEST_CONTEXT, 'C').uuid - - C.pipe_requires = old_requires - - -class A(PipeTask): - - a = luigi.IntParameter(default=1) - b = luigi.IntParameter(default=2) - - def pipe_requires(self): - pass - - def pipe_run(self): - return self.a+self.b - - -class APrime(PipeTask): - - a = luigi.IntParameter(default=2) - b = luigi.IntParameter(default=1) - - def pipe_requires(self): - pass - - def pipe_run(self): - return self.a+self.b - - -class B(PipeTask): - - a = luigi.IntParameter(default=3) - b = luigi.IntParameter(default=4) - - def pipe_requires(self): - self.add_dependency('a', A, params={}) - - def pipe_run(self, a): - return a + self.a + self.b - - -class C(PipeTask): - - a = luigi.IntParameter(default=5) - b = luigi.IntParameter(default=6) - - def pipe_requires(self): - self.add_dependency('b', B, params={}) - - def pipe_run(self, **kwargs): - input_sum = sum([v for v in kwargs.values()]) - return input_sum + self.b + self.a - - -if __name__ == '__main__': - pytest.main([__file__]) - - diff --git a/tests/functional/test_tags.py b/tests/functional/test_tags.py deleted file mode 100644 index c3cc8b9..0000000 --- a/tests/functional/test_tags.py +++ /dev/null @@ -1,60 +0,0 @@ -import uuid -import pytest - -from disdat import api, common -from disdat.pipe import PipeTask - - -TAGS = {'tag1': 'omg', 'tag2': 'it works'} - - -class Source(PipeTask): - - def pipe_requires(self): - self.set_bundle_name('tagged') - - def pipe_run(self): - self.add_tags(TAGS) - return 0 - - -class Destination(PipeTask): - - def pipe_requires(self): - self.set_bundle_name('output') - self.add_dependency('tagged', Source, params={}) - - def pipe_run(self, tagged): - tags = self.get_tags('tagged') - assert tags is not TAGS - assert tags == TAGS - return 1 - - -@pytest.fixture -def context(): - - try: - print('ensuring disdat is initialized') - common.DisdatConfig.init() - except: - print('disdat already initialized, no worries...') - - print('creating temporary local context') - context = uuid.uuid1().hex - api.context(context) - - yield context - - print('deleting temporary local context') - api.delete_context(context) - - -class TestContext: - - def test_tags(self, context): - api.apply(context, Destination) - - -if __name__ == '__main__': - pytest.main([__file__]) diff --git a/tests/functional/test_task_inception.py b/tests/functional/test_task_inception.py deleted file mode 100644 index d0610b9..0000000 --- a/tests/functional/test_task_inception.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright 2017 Human Longevity, Inc. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import luigi
-
-from disdat.pipe import PipeTask
-import disdat.api as api
-from tests.functional.common import run_test, TEST_CONTEXT
-
-
-""" Test ability to call pipelines in pipelines
-Test with workers=1 and workers>1
-"""
-
-
-class A(PipeTask):
-    n = luigi.IntParameter()
-
-    def pipe_requires(self):
-        self.set_bundle_name("A[{}]".format(self.n))
-        self.add_dependency("shark", B, params={'n': self.n+1})
-
-    def pipe_run(self, shark=None):
-        return self.n * 2
-
-
-class B(A):
-    def pipe_requires(self):
-        self.set_bundle_name("B[{}]".format(self.n))
-
-
-class Root(PipeTask):
-    """
-    Call another pipeline internally.
-    """
-    n = luigi.IntParameter(default=0)
-    workers = luigi.IntParameter(default=1)
-    inception = luigi.BoolParameter(default=False)
-
-    def pipe_requires(self):
-        self.set_bundle_name('root')
-        self.add_dependency('a', A, params={'n': self.n+1})
-        self.add_dependency('b', B, params={'n': self.n+1})
-
-    def pipe_run(self, a=None, b=None):
-        if self.inception:
-            api.apply(TEST_CONTEXT, Root, output_bundle="inception_result", params={'n': 100}, workers=self.workers)
-            b = api.get(TEST_CONTEXT, "inception_result" )
-            return b.data
-        else:
-            return a + b
-
-
-def _inception_pipeline(inner_workers, outer_workers):
-    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'
-    api.apply(TEST_CONTEXT, Root, output_bundle="test_root", params={'inception': True,
-                                                                     'workers': inner_workers},
-              workers=outer_workers)
-
-    b = api.get(TEST_CONTEXT, "test_root")
-
-    print("FINAL DATA {}".format(b.data))
-
-def test_inception_1_1():
-    _inception_pipeline(1, 1)
-
-def test_inception_3_1():
-    _inception_pipeline(3, 1)
-
-def test_inception_1_3():
-    _inception_pipeline(1, 3)
-
-def test_inception_3_3():
-    _inception_pipeline(3, 3)
-
-
-if __name__ == '__main__':
-    api.delete_context(TEST_CONTEXT)
-    api.context(TEST_CONTEXT)
-    test_inception_3_3()
-    #api.apply(TEST_CONTEXT, A, params={'n': 1})
\ No newline at end of file
diff --git a/tests/functional/test_task_yield.py b/tests/functional/test_task_yield.py
deleted file mode 100644
index 464e5c3..0000000
--- a/tests/functional/test_task_yield.py
+++ /dev/null
@@ -1,188 +0,0 @@
-#
-# Copyright 2017 Human Longevity, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import luigi
-
-from disdat.pipe import PipeTask
-import disdat.api as api
-from tests.functional.common import run_test, TEST_CONTEXT
-
-
-""" Tests for dynamic dependencies using Disdat-Luigi
-
-Run tests with
-A.) 1 or more yields
-B.) yield of lists
-C.) 1 or more workers
-D.) Mixed with other dependencies
-E.) Mixed with other upstream yield dependencies
-
-Need to check lineage as well
-
-
-"""
-
-
-class Static(PipeTask):
-    n = luigi.IntParameter()
-    dyn_yield = luigi.BoolParameter(default=False)
-
-    def pipe_requires(self):
-        self.set_bundle_name("Static[{}]".format(self.n))
-
-    def pipe_run(self):
-        if self.dyn_yield:
-            result = self.yield_dependency(Yielded, params={'n': self.n, 'source': "Static{}Source".format(self.n)})
-            yield result
-        return self.n * 2
-
-
-class Yielded(PipeTask):
-    n = luigi.IntParameter()
-    source = luigi.Parameter(default="RootSource")
-    dyn_yield = luigi.BoolParameter(default=False)
-
-    def pipe_requires(self, pipeline_input=None):
-        self.set_bundle_name('Yielded{}{}'.format(self.source, self.n))
-
-    def pipe_run(self, pipeline_input=None):
-        if self.dyn_yield:
-            result = self.yield_dependency(Yielded, params={'n': self.n, 'source': "Yielded{}Source".format(self.n)})
-            yield result
-        return self.n
-
-
-class Root(PipeTask):
-    """
-    Dynamically yield N tasks
-    """
-    n = luigi.IntParameter(default=0)
-    static_deps = luigi.BoolParameter(default=False)
-    static_dynamic_deps = luigi.BoolParameter(default=False)
-
-    def pipe_requires(self):
-        self.set_bundle_name('root')
-        if self.static_deps:
-            self.add_dependency('a', Static, params={'n': 0, 'dyn_yield': self.static_dynamic_deps})
-            self.add_dependency('b', Static, params={'n': 1, 'dyn_yield': self.static_dynamic_deps})
-
-    def pipe_run(self, a=None, b=None):
-        results = []
-        for i in range(self.n):
-            results.append(self.yield_dependency(Yielded, params={'n': i}))
-            yield results[-1]
-
-        print (sum(d.pipe_output for d in results))
-        return sum(d.pipe_output for d in results)
-
-
-class RootYieldList(Root):
-    def pipe_run(self, a=None, b=None):
-        results = []
-        for i in range(self.n):
-            results.append(self.yield_dependency(Yielded, params={'n': i}))
-        yield results
-        print (sum(d.pipe_output for d in results))
-        return sum(d.pipe_output for d in results)
-
-
-def _yield_pipeline(count, workers, roottask, static_deps=False, static_dynamic=False):
-    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'
-    api.apply(TEST_CONTEXT, roottask, output_bundle="test_root", params={'n': count,
-                                                                         'static_deps': static_deps,
-                                                                         'static_dynamic_deps': static_dynamic},
-              workers=workers)
-    b = api.get(TEST_CONTEXT, "test_root")
-    assert b.data == sum(i for i in range(count))
-
-
-""" One-by-one yield, with many yields, with many workers """
-
-def test_simple_yield_one():
-    _yield_pipeline(1, 1, Root, False)
-
-def test_simple_yield_one_mp():
-    _yield_pipeline(1, 3, Root, False)
-
-def test_simple_yield_many():
-    _yield_pipeline(10, 1, Root, False)
-
-def test_simple_yield_many_mp():
-    _yield_pipeline(10, 3, Root, False)
-
-
-""" One-by-one yield, with many yields, with many workers, with static dependencies """
-
-def test_simple_yield_one_statics():
-    _yield_pipeline(1, 1, Root, True)
-
-def test_simple_yield_one_mp_statics():
-    _yield_pipeline(1, 3, Root, True)
-
-def test_simple_yield_many_statics():
-    _yield_pipeline(10, 1, Root, True)
-
-def test_simple_yield_many_mp_statics():
-    _yield_pipeline(10, 3, Root, True)
-
-def test_simple_yield_many_mp_statics_dynamic():
-    _yield_pipeline(4, 3, Root, True, True)
-
-
-""" One-by-one yield, with many yields, with many workers, with static dependencies """
-
-def test_list_yield_one():
-    _yield_pipeline(1, 1, RootYieldList, False)
-
-def test_list_yield_one_mp():
-    _yield_pipeline(1, 3, RootYieldList, False)
-
-def test_list_yield_many():
-    _yield_pipeline(10, 1, RootYieldList, False)
-
-def test_list_yield_many_mp():
-    _yield_pipeline(10, 3, RootYieldList, False)
-
-
-""" One-by-one yield, with many yields, with many workers, with static dependencies """
-
-def test_list_yield_one_statics():
-    _yield_pipeline(1, 1, RootYieldList, True)
-
-def test_list_yield_one_mp_statics():
-    _yield_pipeline(1, 3, RootYieldList, True)
-
-def test_list_yield_many_statics():
-    _yield_pipeline(10, 1, RootYieldList, True)
-
-def test_list_yield_many_mp_statics():
-    _yield_pipeline(10, 3, RootYieldList, True)
-
-def test_list_yield_many_mp_statics_dynamic():
-    _yield_pipeline(4, 3, RootYieldList, True, True)
-
-
-if __name__ == '__main__':
-    api.delete_context(TEST_CONTEXT)
-    api.context(TEST_CONTEXT)
-    #test_simple_yield_one()
-    test_simple_yield_many_mp_statics_dynamic()
-
-
-    api.delete_context(TEST_CONTEXT)
-    api.context(TEST_CONTEXT)
-    #test_list_yield_many()
-    test_list_yield_many_mp_statics_dynamic()