diff --git a/caliban/builder.py b/caliban/builder.py new file mode 100644 index 0000000..298aa1f --- /dev/null +++ b/caliban/builder.py @@ -0,0 +1,734 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions required to interact with Docker to build and run images, shells +and notebooks in a Docker environment. + +""" + +from __future__ import absolute_import, division, print_function + +import json +import os +import subprocess +import sys +from enum import Enum +from pathlib import Path +from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, NewType, + Optional, Union) + +import tqdm +from absl import logging +from blessings import Terminal +from tqdm.utils import _screen_shape_wrapper + +import caliban.config as c +import caliban.util as u +from caliban.history.types import Experiment, Job, JobSpec, JobStatus, Platform +from caliban.history.utils import (create_experiments, generate_container_spec, + get_mem_engine, get_sql_engine, + session_scope) + +t = Terminal() + +DEV_CONTAINER_ROOT = "gcr.io/blueshift-playground/blueshift" +TF_VERSIONS = {"2.2.0", "1.12.3", "1.14.0", "1.15.0"} +DEFAULT_WORKDIR = "/usr/app" +CREDS_DIR = "/.creds" +CONDA_BIN = "/opt/conda/bin/conda" + +ImageId = NewType('ImageId', str) +ArgSeq = NewType('ArgSeq', List[str]) + + +class DockerError(Exception): + """Exception that passes info on a failed Docker command.""" + + def __init__(self, message, cmd, ret_code): + super().__init__(message) + self.message = message + self.cmd = cmd + self.ret_code = ret_code + + @property + def command(self): + return " ".join(self.cmd) + + +class NotebookInstall(Enum): + """Flag to decide what to do .""" + none = 'none' + lab = 'lab' + jupyter = 'jupyter' + + def __str__(self) -> str: + return self.value + + +class Shell(Enum): + """Add new shells here and below, in SHELL_DICT.""" + bash = 'bash' + zsh = 'zsh' + + def __str__(self) -> str: + return self.value + + +# Tuple to track the information required to install and execute some custom +# shell into a container. +ShellData = NamedTuple("ShellData", [("executable", str), + ("packages", List[str])]) + + +def apt_install(*packages: str) -> str: + """Returns a command that will install the supplied list of packages without + requiring confirmation or any user interaction. + """ + package_str = ' '.join(packages) + no_prompt = "DEBIAN_FRONTEND=noninteractive" + return f"{no_prompt} apt-get install --yes --no-install-recommends {package_str}" + + +def apt_command(commands: List[str]) -> List[str]: + """Pre-and-ap-pends the supplied commands with the appropriate in-container and + cleanup command for aptitude. + + """ + update = ["apt-get update"] + cleanup = ["apt-get clean", "rm -rf /var/lib/apt/lists/*"] + return update + commands + cleanup + + +# Dict linking a particular supported shell to the data required to run and +# install the shell inside a container. 
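+#
+# As an illustration, SHELL_DICT[Shell.zsh] below resolves to
+# ShellData("/bin/zsh", ["zsh"]): the in-container executable to launch, plus
+# the apt package that must be installed to make it available.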
+# +# : Dict[Shell, ShellData] +SHELL_DICT = { + Shell.bash: ShellData("/bin/bash", []), + Shell.zsh: ShellData("/bin/zsh", ["zsh"]) +} + + +def default_shell() -> Shell: + """Returns the shell to load into the container. Defaults to Shell.bash, but if + the user's SHELL variable refers to a supported sub-shell, returns that + instead. + + """ + ret = Shell.bash + + if "zsh" in os.environ.get("SHELL"): + ret = Shell.zsh + + return ret + + +def adc_location(home_dir: Optional[str] = None) -> str: + """Returns the location for application default credentials, INSIDE the + container (so, hardcoded unix separators), given the supplied home directory. + + """ + if home_dir is None: + home_dir = Path.home() + + return "{}/.config/gcloud/application_default_credentials.json".format( + home_dir) + + +def container_home(): + """Returns the location of the home directory inside the generated + container. + + """ + return "/home/{}".format(u.current_user()) + + +def tf_base_image(job_mode: c.JobMode, tensorflow_version: str) -> str: + """Returns the base image to use, depending on whether or not we're using a + GPU. This is JUST for building our base images for Blueshift; not for + actually using in a job. + + List of available tags: https://hub.docker.com/r/tensorflow/tensorflow/tags + + """ + if tensorflow_version not in TF_VERSIONS: + raise Exception("""{} is not a valid tensorflow version. + Try one of: {}""".format(tensorflow_version, TF_VERSIONS)) + + gpu = "-gpu" if c.gpu(job_mode) else "" + return "tensorflow/tensorflow:{}{}-py3".format(tensorflow_version, gpu) + + +def base_image_suffix(job_mode: c.JobMode) -> str: + return "gpu" if c.gpu(job_mode) else "cpu" + + +def base_image_id(job_mode: c.JobMode) -> str: + """Returns the default base image for all caliban Dockerfiles.""" + base_suffix = base_image_suffix(job_mode) + return "{}:{}".format(DEV_CONTAINER_ROOT, base_suffix) + + +def extras_string(extras: List[str]) -> str: + """Returns the argument passed to `pip install` to install a project from its + setup.py and target a specific set of extras_require dependencies. + + Args: + extras: (potentially empty) list of extra_requires deps. + """ + ret = "." + if len(extras) > 0: + ret += "[{}]".format(','.join(extras)) + return ret + + +def base_extras(job_mode: c.JobMode, path: str, + extras: Optional[List[str]]) -> Optional[List[str]]: + """Returns None if the supplied path doesn't exist (it's assumed it points to a + setup.py file). + + If the path DOES exist, generates a list of extras to install. gpu or cpu are + always added to the beginning of the list, depending on the mode. + + """ + ret = None + + if os.path.exists(path): + base = extras or [] + extra = 'gpu' if c.gpu(job_mode) else 'cpu' + ret = base if extra in base else [extra] + base + + return ret + + +def _dependency_entries(workdir: str, + user_id: int, + user_group: int, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None) -> str: + """Returns the Dockerfile entries required to install dependencies from either: + + - a requirements.txt file, path supplied by requirements_path + - a conda environment.yml file, path supplied by conda_env_path. + - a setup.py file, if some sequence of dependencies is supplied. + + An empty list for setup_extras means, run `pip install -c .` with no extras. + None for this argument means do nothing. If a list of strings is supplied, + they'll be treated as extras dependency sets. 
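+
+  As a sketch, setup_extras=["gpu"] produces a COPY of setup.py followed by
+  `pip install --no-cache-dir .[gpu]`; setup_extras=[] installs plain `.`.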
+ """ + ret = "" + + if setup_extras is not None: + ret += f""" +COPY --chown={user_id}:{user_group} setup.py {workdir} +RUN /bin/bash -c "pip install --no-cache-dir {extras_string(setup_extras)}" +""" + + if conda_env_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {conda_env_path} {workdir} +RUN /bin/bash -c "{CONDA_BIN} env update \ + --quiet --name caliban \ + --file {conda_env_path} && \ + {CONDA_BIN} clean -y -q --all" +""" + + if requirements_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {requirements_path} {workdir} +RUN /bin/bash -c "pip install --no-cache-dir -r {requirements_path}" +""" + + return ret + + +def _package_entries(workdir: str, user_id: int, user_group: int, + package: u.Package) -> str: + """Returns the Dockerfile entries required to: + + - copy a directory of code into a docker container + - inject an entrypoint that executes a python module inside that directory. + + Python code runs as modules vs scripts so that we can enforce import hygiene + between files inside a project. + + """ + owner = "{}:{}".format(user_id, user_group) + + arg = package.main_module or package.script_path + + # This needs to use json so that quotes print as double quotes, not single + # quotes. + entrypoint_s = json.dumps(package.executable + [arg]) + + return """ +# Copy project code into the docker container. +COPY --chown={owner} {package_path} {workdir}/{package_path} + +# Declare an entrypoint that actually runs the container. +ENTRYPOINT {entrypoint_s} + """.format_map({ + "owner": owner, + "package_path": package.package_path, + "workdir": workdir, + "entrypoint_s": entrypoint_s + }) + + +def _service_account_entry(user_id: int, user_group: int, credentials_path: str, + docker_credentials_dir: str, + write_adc_placeholder: bool): + """Generates the Dockerfile entries required to transfer a set of Cloud service + account credentials into the Docker container. + + NOTE the write_adc_placeholder variable is here because the "ctpu" script + that we use to interact with TPUs has a bug in it, as of 1/21/2020, where the + script will fail if the application_default_credentials.json file isn't + present, EVEN THOUGH it properly uses the service account credentials + registered with gcloud instead of ADC creds. + + If a service account is present, we write a placeholder string to get past + this problem. This shouldn't matter for anyone else since adc isn't used if a + service account is present. + + """ + container_creds = "{}/credentials.json".format(docker_credentials_dir) + ret = """ +COPY --chown={user_id}:{user_group} {credentials_path} {container_creds} + +# Use the credentials file to activate gcloud, gsutil inside the container. +RUN gcloud auth activate-service-account --key-file={container_creds} && \ + git config --global credential.'https://source.developers.google.com'.helper gcloud.sh + +ENV GOOGLE_APPLICATION_CREDENTIALS={container_creds} +""".format_map({ + "user_id": user_id, + "user_group": user_group, + "credentials_path": credentials_path, + "container_creds": container_creds + }) + + if write_adc_placeholder: + ret += """ +RUN echo "placeholder" >> {} +""".format(adc_location(container_home())) + + return ret + + +def _adc_entry(user_id: int, user_group: int, adc_path: str): + """Returns the Dockerfile line required to transfer the + application_default_credentials.json file into the container's home + directory. 
+ + """ + return """ +COPY --chown={user_id}:{user_group} {adc_path} {adc_loc} + """.format_map({ + "user_id": user_id, + "user_group": user_group, + "adc_path": adc_path, + "adc_loc": adc_location(container_home()) + }) + + +def _credentials_entries(user_id: int, + user_group: int, + adc_path: Optional[str], + credentials_path: Optional[str], + docker_credentials_dir: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to copy a user's Cloud credentials + into the Docker container. + + - adc_path is the relative path inside the current directory to an + application_default_credentials.json file containing... well, you get it. + - credentials_path is the relative path inside the current directory to a + JSON credentials file. + - docker_credentials_dir is the relative path inside the docker container + where the JSON file will be copied on build. + + """ + if docker_credentials_dir is None: + docker_credentials_dir = CREDS_DIR + + ret = "" + if credentials_path is not None: + ret += _service_account_entry(user_id, + user_group, + credentials_path, + docker_credentials_dir, + write_adc_placeholder=adc_path is None) + + if adc_path is not None: + ret += _adc_entry(user_id, user_group, adc_path) + + return ret + + +def _notebook_entries(lab: bool = False, version: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to install Jupyter{lab}. + + Optionally takes a version string. + + """ + version_suffix = "" + + if version is not None: + version_suffix = "=={}".format(version) + + library = "jupyterlab" if lab else "jupyter" + + return """ +RUN pip install {}{} +""".format(library, version_suffix) + + +def _custom_packages( + user_id: int, + user_group: int, + packages: Optional[List[str]] = None, + shell: Optional[Shell] = None, +) -> str: + """Returns the Dockerfile entries necessary to install custom dependencies for + the supplied shell and sequence of aptitude packages. + + """ + if packages is None: + packages = [] + + if shell is None: + shell = Shell.bash + + ret = "" + + to_install = sorted(packages + SHELL_DICT[shell].packages) + + if len(to_install) != 0: + commands = apt_command([apt_install(*to_install)]) + ret = """ +USER root + +RUN {commands} + +USER {user_id}:{user_group} +""".format_map({ + "commands": " && ".join(commands), + "user_id": user_id, + "user_group": user_group + }) + + return ret + + +def _copy_dir_entry(workdir: str, user_id: int, user_group: int, + dirname: str) -> str: + """Returns the Dockerfile entry necessary to copy a single extra subdirectory + from the current directory into a docker container during build. + + """ + owner = "{}:{}".format(user_id, user_group) + return """# Copy {dirname} into the Docker container. +COPY --chown={owner} {dirname} {workdir}/{dirname} +""".format_map({ + "owner": owner, + "workdir": workdir, + "dirname": dirname + }) + + +def _extra_dir_entries(workdir: str, user_id: int, user_group: int, + extra_dirs: List[str]) -> str: + """Returns the Dockerfile entries necessary to copy all directories in the + extra_dirs list into a docker container during build. 
+ + """ + ret = "" + for d in extra_dirs: + ret += "\n{}".format(_copy_dir_entry(workdir, user_id, user_group, d)) + return ret + + +def _dockerfile_template( + job_mode: c.JobMode, + workdir: Optional[str] = None, + base_image_fn: Optional[Callable[[c.JobMode], str]] = None, + package: Optional[Union[List, u.Package]] = None, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None, + adc_path: Optional[str] = None, + credentials_path: Optional[str] = None, + jupyter_version: Optional[str] = None, + inject_notebook: NotebookInstall = NotebookInstall.none, + shell: Optional[Shell] = None, + extra_dirs: Optional[List[str]] = None, + caliban_config: Optional[Dict[str, Any]] = None) -> str: + """Returns a Dockerfile that builds on a local CPU or GPU base image (depending + on the value of job_mode) to create a container that: + + - installs any dependency specified in a requirements.txt file living at + requirements_path, a conda environment at conda_env_path, or any + dependencies in a setup.py file, including extra dependencies, if + setup_extras isn't None + - injects gcloud credentials into the container, so Cloud interaction works + just like it does locally + - potentially installs a custom shell, or jupyterlab for notebook support + - copies all source needed by the main module specified by package, and + potentially injects an entrypoint that, on run, will run that main module + + Most functions that call _dockerfile_template pass along any kwargs that they + receive. It should be enough to add kwargs here, then rely on that mechanism + to pass them along, vs adding kwargs all the way down the call chain. + + Supply a custom base_image_fn (function from job_mode -> image ID) to inject + more complex Docker commands into the Caliban environments by, for example, + building your own image on top of the TF base images, then using that. + + """ + uid = os.getuid() + gid = os.getgid() + username = u.current_user() + + if isinstance(package, list): + package = u.Package(*package) + + if workdir is None: + workdir = DEFAULT_WORKDIR + + if base_image_fn is None: + base_image_fn = base_image_id + + base_image = base_image_fn(job_mode) + + dockerfile = """ +FROM {base_image} + +# Create the same group we're using on the host machine. +RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid} + +# Create the user by name. --no-log-init guards against a crash with large user +# IDs. +RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username} + +# The directory is created by root. This sets permissions so that any user can +# access the folder. 
+RUN mkdir -m 777 {workdir} {creds_dir} {c_home} + +ENV HOME={c_home} + +WORKDIR {workdir} + +USER {uid}:{gid} +""".format_map({ + "base_image": base_image, + "username": username, + "uid": uid, + "gid": gid, + "workdir": workdir, + "c_home": container_home(), + "creds_dir": CREDS_DIR + }) + dockerfile += _credentials_entries(uid, + gid, + adc_path=adc_path, + credentials_path=credentials_path) + + dockerfile += _dependency_entries(workdir, + uid, + gid, + requirements_path=requirements_path, + conda_env_path=conda_env_path, + setup_extras=setup_extras) + + if inject_notebook.value != 'none': + install_lab = inject_notebook == NotebookInstall.lab + dockerfile += _notebook_entries(lab=install_lab, version=jupyter_version) + + if extra_dirs is not None: + dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs) + + dockerfile += _custom_packages(uid, + gid, + packages=c.apt_packages( + caliban_config, job_mode), + shell=shell) + + if package is not None: + # The actual entrypoint and final copied code. + dockerfile += _package_entries(workdir, uid, gid, package) + + return dockerfile + + +def docker_image_id(output: str) -> ImageId: + """Accepts a string containing the output of a successful `docker build` + command and parses the Docker image ID from the stream. + + NOTE this is probably quite brittle! I can imagine this breaking quite easily + on a Docker upgrade. + + """ + return ImageId(output.splitlines()[-1].split()[-1]) + + +def build_image(job_mode: c.JobMode, + build_path: str, + credentials_path: Optional[str] = None, + adc_path: Optional[str] = None, + no_cache: bool = False, + **kwargs) -> str: + """Builds a Docker image by generating a Dockerfile and passing it to `docker + build` via stdin. All output from the `docker build` process prints to + stdout. + + Returns the image ID of the new docker container; if the command fails, + throws on error with information about the command and any issues that caused + the problem. + + """ + with u.TempCopy(credentials_path, + tmp_name=".caliban_default_creds.json") as creds: + with u.TempCopy(adc_path, tmp_name=".caliban_adc_creds.json") as adc: + cache_args = ["--no-cache"] if no_cache else [] + cmd = ["docker", "build"] + cache_args + ["--rm", "-f-", build_path] + + dockerfile = _dockerfile_template(job_mode, + credentials_path=creds, + adc_path=adc, + **kwargs) + + joined_cmd = " ".join(cmd) + logging.info("Running command: {}".format(joined_cmd)) + + try: + output, ret_code = u.capture_stdout(cmd, input_str=dockerfile) + if ret_code == 0: + return docker_image_id(output) + else: + error_msg = "Docker failed with error code {}.".format(ret_code) + raise DockerError(error_msg, cmd, ret_code) + + except subprocess.CalledProcessError as e: + logging.error(e.output) + logging.error(e.stderr) + + +def _image_tag_for_project(project_id: str, image_id: str) -> str: + """Generate the GCR Docker image tag for the supplied pair of project_id and + image_id. + + This function properly handles "domain scoped projects", where the project ID + contains a domain name and project ID separated by : + https://cloud.google.com/container-registry/docs/overview#domain-scoped_projects. + + """ + project_s = project_id.replace(":", "/") + return "gcr.io/{}/{}:latest".format(project_s, image_id) + + +def push_uuid_tag(project_id: str, image_id: str) -> str: + """Takes a base image and tags it for upload, then pushes it to a remote Google + Container Registry. + + Returns the tag on a successful push. 
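+
+  Illustrative sketch with hypothetical IDs: push_uuid_tag("my-project", "abc123")
+  tags the image as gcr.io/my-project/abc123:latest, runs `docker push` on that
+  tag, and returns it.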
+ + TODO should this just check first before attempting to push if the image + exists? Immutable names means that if the tag is up there, we're done. + Potentially use docker-py for this. + + """ + image_tag = _image_tag_for_project(project_id, image_id) + subprocess.run(["docker", "tag", image_id, image_tag], check=True) + subprocess.run(["docker", "push", image_tag], check=True) + return image_tag + + +def _run_cmd(job_mode: c.JobMode, + run_args: Optional[List[str]] = None) -> List[str]: + """Returns the sequence of commands for the subprocess run functions required + to execute `docker run`. in CPU or GPU mode, depending on the value of + job_mode. + + Keyword args: + - run_args: list of args to pass to docker run. + + """ + if run_args is None: + run_args = [] + + runtime = ["--runtime", "nvidia"] if c.gpu(job_mode) else [] + return ["docker", "run"] + runtime + ["--ipc", "host"] + run_args + + +def _home_mount_cmds(enable_home_mount: bool) -> List[str]: + """Returns the argument needed by Docker to mount a user's local home directory + into the home directory location inside their container. + + If enable_home_mount is false returns an empty list. + + """ + ret = [] + if enable_home_mount: + ret = ["-v", "{}:{}".format(Path.home(), container_home())] + return ret + + +def _interactive_opts(workdir: str) -> List[str]: + """Returns the basic arguments we want to run a docker process locally. + + """ + return [ + "-w", workdir, \ + "-u", "{}:{}".format(os.getuid(), os.getgid()), \ + "-v", "{}:{}".format(os.getcwd(), workdir) \ + ] + + +def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec: + """Prints logging as a side effect for the supplied sequence of job specs + generated from an experiment definition; returns the input job spec. + + """ + args = c.experiment_to_args(job_spec.experiment.kwargs, + job_spec.experiment.args) + logging.info("") + logging.info("Job {} - Experiment args: {}".format(i, t.yellow(str(args)))) + return job_spec + + +def logged_job_specs(job_specs: Iterable[JobSpec]) -> Iterable[JobSpec]: + """Generates an iterable of job specs that should be passed to `docker run` to + execute the experiments defined by the supplied iterable. + + """ + for i, s in enumerate(job_specs, 1): + yield log_job_spec_instance(s, i) + + +def execute_dry_run(job_specs: Iterable[JobSpec]) -> None: + """Expands the supplied sequence of experiments into sequences of args and logs + the jobs that WOULD have been executed, had the dry run flag not been + applied. 
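+
+  Each spec is passed through logged_job_specs, so a dry run prints one line
+  per job of the form "Job 1 - Experiment args: [...]", where the bracketed
+  part is whatever c.experiment_to_args returns for that experiment.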
+ + """ + list(logged_job_specs(job_specs)) + + logging.info('') + logging.info( + t.yellow("To build your image and execute these jobs, \ +run your command again without {}.".format(c.DRY_RUN_FLAG))) + logging.info('') + return None diff --git a/caliban/cli.py b/caliban/cli.py index ff26d9c..c6436fe 100644 --- a/caliban/cli.py +++ b/caliban/cli.py @@ -26,6 +26,7 @@ import caliban.cloud.types as ct import caliban.config as conf +import caliban.config.experiment as ce import caliban.docker as docker import caliban.gke as gke import caliban.gke.constants as gke_k @@ -352,7 +353,7 @@ def job_name_arg(parser): def experiment_config_arg(parser): parser.add_argument( "--experiment_config", - type=conf.load_experiment_config, + type=ce.load_experiment_config, help="Path to an experiment config, or 'stdin' to read from stdin.") diff --git a/caliban/config/__init__.py b/caliban/config/__init__.py new file mode 100644 index 0000000..28e00a4 --- /dev/null +++ b/caliban/config/__init__.py @@ -0,0 +1,181 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for our job runner, for working with configs. +""" + +from __future__ import absolute_import, division, print_function + +import argparse +import os +import sys +from enum import Enum +from typing import Any, Dict, List, Optional + +import commentjson +import yaml + +import caliban.cloud.types as ct + + +class JobMode(str, Enum): + CPU = 'CPU' + GPU = 'GPU' + + +# Special config for Caliban. +CalibanConfig = Dict[str, Any] + +DRY_RUN_FLAG = "--dry_run" +CALIBAN_CONFIG = ".calibanconfig.json" + +# Defaults for various input values that we can supply given some partial set +# of info from the CLI. +DEFAULT_REGION = ct.US.central1 + +# : Dict[JobMode, ct.MachineType] +DEFAULT_MACHINE_TYPE = { + JobMode.CPU: ct.MachineType.highcpu_32, + JobMode.GPU: ct.MachineType.standard_8 +} +DEFAULT_GPU = ct.GPU.P100 + +# Config to supply for CPU jobs. +DEFAULT_ACCELERATOR_CONFIG = { + "count": 0, + "type": "ACCELERATOR_TYPE_UNSPECIFIED" +} + + +def gpu(job_mode: JobMode) -> bool: + """Returns True if the supplied JobMode is JobMode.GPU, False otherwise. + + """ + return job_mode == JobMode.GPU + + +def load_yaml_config(path): + """returns the config parsed based on the info in the flags. + + Grabs the config file, written in yaml, slurps it in. + """ + with open(path) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + + return config + + +def load_config(path, mode='yaml'): + """Load a JSON or YAML config. + + """ + if mode == 'json': + with open(path) as f: + return commentjson.load(f) + + return load_yaml_config(path) + + +def valid_json(path: str) -> Dict[str, Any]: + """Loads JSON if the path points to a valid JSON file; otherwise, throws an + exception that's picked up by argparse. + + """ + try: + return load_config(path, mode='json') + except commentjson.JSONLibraryException: + raise argparse.ArgumentTypeError( + """File '{}' doesn't seem to contain valid JSON. 
Try again!""".format( + path)) + + +def extract_script_args(m: Dict[str, Any]) -> List[str]: + """Strip off the "--" argument if it was passed in as a separator.""" + script_args = m.get("script_args") + if script_args is None or script_args == []: + return script_args + + head, *tail = script_args + + return tail if head == "--" else script_args + + +def extract_project_id(m: Dict[str, Any]) -> str: + """Attempts to extract the project_id from the args; falls back to an + environment variable, or exits if this isn't available. There's no sensible + default available. + + """ + project_id = m.get("project_id") or os.environ.get("PROJECT_ID") + + if project_id is None: + print() + print( + "\nNo project_id found. 'caliban cloud' requires that you either set a \n\ +$PROJECT_ID environment variable with the ID of your Cloud project, or pass one \n\ +explicitly via --project_id. Try again, please!") + print() + + sys.exit(1) + + return project_id + + +def extract_region(m: Dict[str, Any]) -> ct.Region: + """Returns the region specified in the args; defaults to an environment + variable. If that's not supplied defaults to the default cloud provider from + caliban.cloud. + + """ + region = m.get("region") or os.environ.get("REGION") + + if region: + return ct.parse_region(region) + + return DEFAULT_REGION + + +def extract_zone(m: Dict[str, Any]) -> str: + return "{}-a".format(extract_region(m)) + + +def extract_cloud_key(m: Dict[str, Any]) -> Optional[str]: + """Returns the Google service account key filepath specified in the args; + defaults to the $GOOGLE_APPLICATION_CREDENTIALS variable. + + """ + return m.get("cloud_key") or \ + os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + + +def apt_packages(conf: CalibanConfig, mode: JobMode) -> List[str]: + """Returns the list of aptitude packages that should be installed to satisfy + the requests in the config. + + """ + packages = conf.get("apt_packages") or {} + + if isinstance(packages, dict): + k = "gpu" if gpu(mode) else "cpu" + return packages.get(k, []) + + elif isinstance(packages, list): + return packages + + else: + raise argparse.ArgumentTypeError( + """{}'s "apt_packages" entry must be a dictionary or list, not '{}'""". + format(CALIBAN_CONFIG, packages)) diff --git a/caliban/config.py b/caliban/config/experiment.py similarity index 60% rename from caliban/config.py rename to caliban/config/experiment.py index a602c40..0fcc1bc 100644 --- a/caliban/config.py +++ b/caliban/config/experiment.py @@ -13,23 +13,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Utilities for our job runner, for working with configs. +"""Utilities for working with experiment.json files. + """ from __future__ import absolute_import, division, print_function import argparse import itertools -import os -import sys -from enum import Enum -from typing import Any, Dict, List, Union, Optional import re +import sys +from collections import ChainMap +from typing import Any, Dict, List, Optional, Tuple, Union + import commentjson -import yaml -import caliban.cloud.types as ct import caliban.util as u # int, str and bool are allowed in a final experiment; lists are markers for @@ -48,168 +46,97 @@ Experiment = Dict[str, ExpValue] -# Mode -class JobMode(str, Enum): - CPU = 'CPU' - GPU = 'GPU' - - -# Special config for Caliban. 
-CalibanConfig = Dict[str, Any] - -DRY_RUN_FLAG = "--dry_run" -CALIBAN_CONFIG = ".calibanconfig.json" - -# Defaults for various input values that we can supply given some partial set -# of info from the CLI. -DEFAULT_REGION = ct.US.central1 - -# : Dict[JobMode, ct.MachineType] -DEFAULT_MACHINE_TYPE = { - JobMode.CPU: ct.MachineType.highcpu_32, - JobMode.GPU: ct.MachineType.standard_8 -} -DEFAULT_GPU = ct.GPU.P100 - -# Config to supply for CPU jobs. -DEFAULT_ACCELERATOR_CONFIG = { - "count": 0, - "type": "ACCELERATOR_TYPE_UNSPECIFIED" -} - - -def gpu(job_mode: JobMode) -> bool: - """Returns True if the supplied JobMode is JobMode.GPU, False otherwise. - - """ - return job_mode == JobMode.GPU - - -def load_yaml_config(path): - """returns the config parsed based on the info in the flags. - - Grabs the config file, written in yaml, slurps it in. - """ - with open(path) as f: - config = yaml.load(f, Loader=yaml.FullLoader) - - return config - - -def load_config(path, mode='yaml'): - """Load a JSON or YAML config. - - """ - if mode == 'json': - with open(path) as f: - return commentjson.load(f) - - return load_yaml_config(path) - - -def valid_json(path: str) -> Dict[str, Any]: - """Loads JSON if the path points to a valid JSON file; otherwise, throws an - exception that's picked up by argparse. - - """ - try: - return load_config(path, mode='json') - except commentjson.JSONLibraryException: - raise argparse.ArgumentTypeError( - """File '{}' doesn't seem to contain valid JSON. Try again!""".format( - path)) - - -def extract_script_args(m: Dict[str, Any]) -> List[str]: - """Strip off the "--" argument if it was passed in as a separator.""" - script_args = m.get("script_args") - if script_args is None or script_args == []: - return script_args - - head, *tail = script_args - - return tail if head == "--" else script_args - - -def extract_project_id(m: Dict[str, Any]) -> str: - """Attempts to extract the project_id from the args; falls back to an - environment variable, or exits if this isn't available. There's no sensible - default available. - +def _is_compound_key(s: Any) -> bool: + """ compound key is defined as a string which uses square brackets to enclose + a comma-separated list, e.g. "[batch_size,learning_rate]" or "[a,b,c]" """ - project_id = m.get("project_id") or os.environ.get("PROJECT_ID") - if project_id is None: - print() - print( - "\nNo project_id found. 'caliban cloud' requires that you either set a \n\ -$PROJECT_ID environment variable with the ID of your Cloud project, or pass one \n\ -explicitly via --project_id. Try again, please!") - print() + if type(s) is not str or len(s) <= 2: + return False + else: + return s[0] == '[' and s[-1] == ']' - sys.exit(1) - return project_id +def _tupleize_compound_key(k: str) -> List[str]: + """ converts a JSON-input compound key into a tuple """ + assert _is_compound_key(k), "{} must be a valid compound key".format(k) + return tuple([x.strip() for x in k.strip('][').split(',')]) -def extract_region(m: Dict[str, Any]) -> ct.Region: - """Returns the region specified in the args; defaults to an environment - variable. If that's not supplied defaults to the default cloud provider from - caliban.cloud. 
+def _tupleize_compound_value( + v: Union[List, bool, str, int, float]) -> Union[List, Tuple]: + """ list of lists -> list of tuples + list of primitives -> tuple of primitives + single primitive -> length-1 tuple of that primitive + E.g., [[0,1],[3,4]] -> [(0,1),(3,4)] + [0,1] -> (0,1) + 0 -> (0, ) """ - region = m.get("region") or os.environ.get("REGION") - - if region: - return ct.parse_region(region) - - return DEFAULT_REGION + if isinstance(v, list): + if isinstance(v[0], list): + # v is list of lists + return [tuple(vi) for vi in v] + else: + # v is list of primitives + return tuple(v) + else: + # v is a single primitive (bool, str, int, float) + return tuple([v]) -def extract_zone(m: Dict[str, Any]) -> str: - return "{}-a".format(extract_region(m)) +def _tupleize_compound_item(k: Union[Tuple, str], v: Any) -> Dict: + """ converts a JSON-input compound key/value pair into a dictionary of tuples """ + if _is_compound_key(k): + return {_tupleize_compound_key(k): _tupleize_compound_value(v)} + else: + return {k: v} -def extract_cloud_key(m: Dict[str, Any]) -> Optional[str]: - """Returns the Google service account key filepath specified in the args; - defaults to the $GOOGLE_APPLICATION_CREDENTIALS variable. +def tupleize_dict(m: Dict) -> Dict: + """ given a dictionary with compound keys, converts those keys to tuples, and + converts the corresponding values to a tuple or list of tuples + Compound key: a string which uses square brackets to enclose + a comma-separated list, e.g. "[batch_size,learning_rate]" or "[a,b,c]" """ - return m.get("cloud_key") or \ - os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + formatted_items = [_tupleize_compound_item(k, v) for k, v in m.items()] + return dict(ChainMap(*formatted_items)) -def apt_packages(conf: CalibanConfig, mode: JobMode) -> List[str]: - """Returns the list of aptitude packages that should be installed to satisfy - the requests in the config. +def _expand_compound_pair(k: Union[Tuple, str], v: Any) -> Dict: + """ given a key-value pair k v, where k is either: + a) a primitive representing a single, e.g. k = 'key', v = 'value', or + b) a tuple of primitives representing multiple keys, e.g. k = ('key1','key2'), v = ('value1', 'value2') + this function returns the corresponding dictionary without compound keys """ - packages = conf.get("apt_packages") or {} - - if isinstance(packages, dict): - k = "gpu" if gpu(mode) else "cpu" - return packages.get(k, []) - - elif isinstance(packages, list): - return packages + if isinstance(k, tuple): + if not isinstance(v, tuple): + raise argparse.ArgumentTypeError( + """function _expand_compound_pair(k, v) requires that if type(k) is tuple, + type(v) must also be tuple.""") + else: + return dict(zip(k, v)) else: - raise argparse.ArgumentTypeError( - """{}'s "apt_packages" entry must be a dictionary or list, not '{}'""". - format(CALIBAN_CONFIG, packages)) + return {k: v} -def caliban_config() -> CalibanConfig: - """Returns a dict that represents a `.calibanconfig.json` file if present, - empty dictionary otherwise. 
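+
+# Illustrative round trip through the compound-key helpers above and below
+# (hypothetical keys):
+#   tupleize_dict({"[a,b]": [[0, 1], [2, 3]]})  => {("a", "b"): [(0, 1), (2, 3)]}
+#   expand_compound_dict({("a", "b"): (0, 1)})  => {"a": 0, "b": 1}
+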
+def expand_compound_dict(m: Union[Dict, List]) -> Union[Dict, List]: + """ given a dictionary with some compound keys, aka tuples, + returns a dictionary which each compound key separated into primitives + given a list of such dictionaries, will apply the transformation + described above to each dictionary and return the list, maintaining + structure """ - if not os.path.isfile(CALIBAN_CONFIG): - return {} - with open(CALIBAN_CONFIG) as f: - conf = commentjson.load(f) - return conf + if isinstance(m, list): + return [expand_compound_dict(mi) for mi in m] + else: + expanded_dicts = [_expand_compound_pair(k, v) for k, v in m.items()] + return dict(ChainMap(*expanded_dicts)) def expand_experiment_config(items: ExpConf) -> List[Experiment]: diff --git a/caliban/cloud/__init__.py b/caliban/docker/__init__.py similarity index 100% rename from caliban/cloud/__init__.py rename to caliban/docker/__init__.py diff --git a/caliban/docker/build.py b/caliban/docker/build.py new file mode 100644 index 0000000..1d16aa5 --- /dev/null +++ b/caliban/docker/build.py @@ -0,0 +1,619 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions required to interact with Docker to build and run images, shells +and notebooks in a Docker environment. + +""" + +from __future__ import absolute_import, division, print_function + +import json +import os +import subprocess +from enum import Enum +from pathlib import Path +from typing import (Any, Callable, Dict, List, NamedTuple, NewType, Optional, + Union) + +from absl import logging +from blessings import Terminal + +import caliban.config as c +import caliban.util as u + +t = Terminal() + +DEV_CONTAINER_ROOT = "gcr.io/blueshift-playground/blueshift" +TF_VERSIONS = {"2.2.0", "1.12.3", "1.14.0", "1.15.0"} +DEFAULT_WORKDIR = "/usr/app" +CREDS_DIR = "/.creds" +CONDA_BIN = "/opt/conda/bin/conda" + +ImageId = NewType('ImageId', str) +ArgSeq = NewType('ArgSeq', List[str]) + + +class DockerError(Exception): + """Exception that passes info on a failed Docker command.""" + + def __init__(self, message, cmd, ret_code): + super().__init__(message) + self.message = message + self.cmd = cmd + self.ret_code = ret_code + + @property + def command(self): + return " ".join(self.cmd) + + +class NotebookInstall(Enum): + """Flag to decide what to do .""" + none = 'none' + lab = 'lab' + jupyter = 'jupyter' + + def __str__(self) -> str: + return self.value + + +class Shell(Enum): + """Add new shells here and below, in SHELL_DICT.""" + bash = 'bash' + zsh = 'zsh' + + def __str__(self) -> str: + return self.value + + +# Tuple to track the information required to install and execute some custom +# shell into a container. +ShellData = NamedTuple("ShellData", [("executable", str), + ("packages", List[str])]) + + +def apt_install(*packages: str) -> str: + """Returns a command that will install the supplied list of packages without + requiring confirmation or any user interaction. 
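+
+  For example, apt_install("curl", "git") returns
+  "DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends curl git".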
+ """ + package_str = ' '.join(packages) + no_prompt = "DEBIAN_FRONTEND=noninteractive" + return f"{no_prompt} apt-get install --yes --no-install-recommends {package_str}" + + +def apt_command(commands: List[str]) -> List[str]: + """Pre-and-ap-pends the supplied commands with the appropriate in-container and + cleanup command for aptitude. + + """ + update = ["apt-get update"] + cleanup = ["apt-get clean", "rm -rf /var/lib/apt/lists/*"] + return update + commands + cleanup + + +# Dict linking a particular supported shell to the data required to run and +# install the shell inside a container. +# +# : Dict[Shell, ShellData] +SHELL_DICT = { + Shell.bash: ShellData("/bin/bash", []), + Shell.zsh: ShellData("/bin/zsh", ["zsh"]) +} + + +def default_shell() -> Shell: + """Returns the shell to load into the container. Defaults to Shell.bash, but if + the user's SHELL variable refers to a supported sub-shell, returns that + instead. + + """ + ret = Shell.bash + + if "zsh" in os.environ.get("SHELL"): + ret = Shell.zsh + + return ret + + +def adc_location(home_dir: Optional[str] = None) -> str: + """Returns the location for application default credentials, INSIDE the + container (so, hardcoded unix separators), given the supplied home directory. + + """ + if home_dir is None: + home_dir = Path.home() + + return "{}/.config/gcloud/application_default_credentials.json".format( + home_dir) + + +def container_home(): + """Returns the location of the home directory inside the generated + container. + + """ + return "/home/{}".format(u.current_user()) + + +def tf_base_image(job_mode: c.JobMode, tensorflow_version: str) -> str: + """Returns the base image to use, depending on whether or not we're using a + GPU. This is JUST for building our base images for Blueshift; not for + actually using in a job. + + List of available tags: https://hub.docker.com/r/tensorflow/tensorflow/tags + + """ + if tensorflow_version not in TF_VERSIONS: + raise Exception("""{} is not a valid tensorflow version. + Try one of: {}""".format(tensorflow_version, TF_VERSIONS)) + + gpu = "-gpu" if c.gpu(job_mode) else "" + return "tensorflow/tensorflow:{}{}-py3".format(tensorflow_version, gpu) + + +def base_image_suffix(job_mode: c.JobMode) -> str: + return "gpu" if c.gpu(job_mode) else "cpu" + + +def base_image_id(job_mode: c.JobMode) -> str: + """Returns the default base image for all caliban Dockerfiles.""" + base_suffix = base_image_suffix(job_mode) + return "{}:{}".format(DEV_CONTAINER_ROOT, base_suffix) + + +def extras_string(extras: List[str]) -> str: + """Returns the argument passed to `pip install` to install a project from its + setup.py and target a specific set of extras_require dependencies. + + Args: + extras: (potentially empty) list of extra_requires deps. + """ + ret = "." + if len(extras) > 0: + ret += "[{}]".format(','.join(extras)) + return ret + + +def base_extras(job_mode: c.JobMode, path: str, + extras: Optional[List[str]]) -> Optional[List[str]]: + """Returns None if the supplied path doesn't exist (it's assumed it points to a + setup.py file). + + If the path DOES exist, generates a list of extras to install. gpu or cpu are + always added to the beginning of the list, depending on the mode. 
+ + """ + ret = None + + if os.path.exists(path): + base = extras or [] + extra = 'gpu' if c.gpu(job_mode) else 'cpu' + ret = base if extra in base else [extra] + base + + return ret + + +def _dependency_entries(workdir: str, + user_id: int, + user_group: int, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None) -> str: + """Returns the Dockerfile entries required to install dependencies from either: + + - a requirements.txt file, path supplied by requirements_path + - a conda environment.yml file, path supplied by conda_env_path. + - a setup.py file, if some sequence of dependencies is supplied. + + An empty list for setup_extras means, run `pip install -c .` with no extras. + None for this argument means do nothing. If a list of strings is supplied, + they'll be treated as extras dependency sets. + """ + ret = "" + + if setup_extras is not None: + ret += f""" +COPY --chown={user_id}:{user_group} setup.py {workdir} +RUN /bin/bash -c "pip install --no-cache-dir {extras_string(setup_extras)}" +""" + + if conda_env_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {conda_env_path} {workdir} +RUN /bin/bash -c "{CONDA_BIN} env update \ + --quiet --name caliban \ + --file {conda_env_path} && \ + {CONDA_BIN} clean -y -q --all" +""" + + if requirements_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {requirements_path} {workdir} +RUN /bin/bash -c "pip install --no-cache-dir -r {requirements_path}" +""" + + return ret + + +def _package_entries(workdir: str, user_id: int, user_group: int, + package: u.Package) -> str: + """Returns the Dockerfile entries required to: + + - copy a directory of code into a docker container + - inject an entrypoint that executes a python module inside that directory. + + Python code runs as modules vs scripts so that we can enforce import hygiene + between files inside a project. + + """ + owner = "{}:{}".format(user_id, user_group) + + arg = package.main_module or package.script_path + + # This needs to use json so that quotes print as double quotes, not single + # quotes. + entrypoint_s = json.dumps(package.executable + [arg]) + + return """ +# Copy project code into the docker container. +COPY --chown={owner} {package_path} {workdir}/{package_path} + +# Declare an entrypoint that actually runs the container. +ENTRYPOINT {entrypoint_s} + """.format_map({ + "owner": owner, + "package_path": package.package_path, + "workdir": workdir, + "entrypoint_s": entrypoint_s + }) + + +def _service_account_entry(user_id: int, user_group: int, credentials_path: str, + docker_credentials_dir: str, + write_adc_placeholder: bool): + """Generates the Dockerfile entries required to transfer a set of Cloud service + account credentials into the Docker container. + + NOTE the write_adc_placeholder variable is here because the "ctpu" script + that we use to interact with TPUs has a bug in it, as of 1/21/2020, where the + script will fail if the application_default_credentials.json file isn't + present, EVEN THOUGH it properly uses the service account credentials + registered with gcloud instead of ADC creds. + + If a service account is present, we write a placeholder string to get past + this problem. This shouldn't matter for anyone else since adc isn't used if a + service account is present. 
+ + """ + container_creds = "{}/credentials.json".format(docker_credentials_dir) + ret = """ +COPY --chown={user_id}:{user_group} {credentials_path} {container_creds} + +# Use the credentials file to activate gcloud, gsutil inside the container. +RUN gcloud auth activate-service-account --key-file={container_creds} && \ + git config --global credential.'https://source.developers.google.com'.helper gcloud.sh + +ENV GOOGLE_APPLICATION_CREDENTIALS={container_creds} +""".format_map({ + "user_id": user_id, + "user_group": user_group, + "credentials_path": credentials_path, + "container_creds": container_creds + }) + + if write_adc_placeholder: + ret += """ +RUN echo "placeholder" >> {} +""".format(adc_location(container_home())) + + return ret + + +def _adc_entry(user_id: int, user_group: int, adc_path: str): + """Returns the Dockerfile line required to transfer the + application_default_credentials.json file into the container's home + directory. + + """ + return """ +COPY --chown={user_id}:{user_group} {adc_path} {adc_loc} + """.format_map({ + "user_id": user_id, + "user_group": user_group, + "adc_path": adc_path, + "adc_loc": adc_location(container_home()) + }) + + +def _credentials_entries(user_id: int, + user_group: int, + adc_path: Optional[str], + credentials_path: Optional[str], + docker_credentials_dir: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to copy a user's Cloud credentials + into the Docker container. + + - adc_path is the relative path inside the current directory to an + application_default_credentials.json file containing... well, you get it. + - credentials_path is the relative path inside the current directory to a + JSON credentials file. + - docker_credentials_dir is the relative path inside the docker container + where the JSON file will be copied on build. + + """ + if docker_credentials_dir is None: + docker_credentials_dir = CREDS_DIR + + ret = "" + if credentials_path is not None: + ret += _service_account_entry(user_id, + user_group, + credentials_path, + docker_credentials_dir, + write_adc_placeholder=adc_path is None) + + if adc_path is not None: + ret += _adc_entry(user_id, user_group, adc_path) + + return ret + + +def _notebook_entries(lab: bool = False, version: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to install Jupyter{lab}. + + Optionally takes a version string. + + """ + version_suffix = "" + + if version is not None: + version_suffix = "=={}".format(version) + + library = "jupyterlab" if lab else "jupyter" + + return """ +RUN pip install {}{} +""".format(library, version_suffix) + + +def _custom_packages( + user_id: int, + user_group: int, + packages: Optional[List[str]] = None, + shell: Optional[Shell] = None, +) -> str: + """Returns the Dockerfile entries necessary to install custom dependencies for + the supplied shell and sequence of aptitude packages. 
+ + """ + if packages is None: + packages = [] + + if shell is None: + shell = Shell.bash + + ret = "" + + to_install = sorted(packages + SHELL_DICT[shell].packages) + + if len(to_install) != 0: + commands = apt_command([apt_install(*to_install)]) + ret = """ +USER root + +RUN {commands} + +USER {user_id}:{user_group} +""".format_map({ + "commands": " && ".join(commands), + "user_id": user_id, + "user_group": user_group + }) + + return ret + + +def _copy_dir_entry(workdir: str, user_id: int, user_group: int, + dirname: str) -> str: + """Returns the Dockerfile entry necessary to copy a single extra subdirectory + from the current directory into a docker container during build. + + """ + owner = "{}:{}".format(user_id, user_group) + return """# Copy {dirname} into the Docker container. +COPY --chown={owner} {dirname} {workdir}/{dirname} +""".format_map({ + "owner": owner, + "workdir": workdir, + "dirname": dirname + }) + + +def _extra_dir_entries(workdir: str, user_id: int, user_group: int, + extra_dirs: List[str]) -> str: + """Returns the Dockerfile entries necessary to copy all directories in the + extra_dirs list into a docker container during build. + + """ + ret = "" + for d in extra_dirs: + ret += "\n{}".format(_copy_dir_entry(workdir, user_id, user_group, d)) + return ret + + +def _dockerfile_template( + job_mode: c.JobMode, + workdir: Optional[str] = None, + base_image_fn: Optional[Callable[[c.JobMode], str]] = None, + package: Optional[Union[List, u.Package]] = None, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None, + adc_path: Optional[str] = None, + credentials_path: Optional[str] = None, + jupyter_version: Optional[str] = None, + inject_notebook: NotebookInstall = NotebookInstall.none, + shell: Optional[Shell] = None, + extra_dirs: Optional[List[str]] = None, + caliban_config: Optional[Dict[str, Any]] = None) -> str: + """Returns a Dockerfile that builds on a local CPU or GPU base image (depending + on the value of job_mode) to create a container that: + + - installs any dependency specified in a requirements.txt file living at + requirements_path, a conda environment at conda_env_path, or any + dependencies in a setup.py file, including extra dependencies, if + setup_extras isn't None + - injects gcloud credentials into the container, so Cloud interaction works + just like it does locally + - potentially installs a custom shell, or jupyterlab for notebook support + - copies all source needed by the main module specified by package, and + potentially injects an entrypoint that, on run, will run that main module + + Most functions that call _dockerfile_template pass along any kwargs that they + receive. It should be enough to add kwargs here, then rely on that mechanism + to pass them along, vs adding kwargs all the way down the call chain. + + Supply a custom base_image_fn (function from job_mode -> image ID) to inject + more complex Docker commands into the Caliban environments by, for example, + building your own image on top of the TF base images, then using that. + + """ + uid = os.getuid() + gid = os.getgid() + username = u.current_user() + + if isinstance(package, list): + package = u.Package(*package) + + if workdir is None: + workdir = DEFAULT_WORKDIR + + if base_image_fn is None: + base_image_fn = base_image_id + + base_image = base_image_fn(job_mode) + + dockerfile = """ +FROM {base_image} + +# Create the same group we're using on the host machine. 
+RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid} + +# Create the user by name. --no-log-init guards against a crash with large user +# IDs. +RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username} + +# The directory is created by root. This sets permissions so that any user can +# access the folder. +RUN mkdir -m 777 {workdir} {creds_dir} {c_home} + +ENV HOME={c_home} + +WORKDIR {workdir} + +USER {uid}:{gid} +""".format_map({ + "base_image": base_image, + "username": username, + "uid": uid, + "gid": gid, + "workdir": workdir, + "c_home": container_home(), + "creds_dir": CREDS_DIR + }) + dockerfile += _credentials_entries(uid, + gid, + adc_path=adc_path, + credentials_path=credentials_path) + + dockerfile += _dependency_entries(workdir, + uid, + gid, + requirements_path=requirements_path, + conda_env_path=conda_env_path, + setup_extras=setup_extras) + + if inject_notebook.value != 'none': + install_lab = inject_notebook == NotebookInstall.lab + dockerfile += _notebook_entries(lab=install_lab, version=jupyter_version) + + if extra_dirs is not None: + dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs) + + dockerfile += _custom_packages(uid, + gid, + packages=c.apt_packages( + caliban_config, job_mode), + shell=shell) + + if package is not None: + # The actual entrypoint and final copied code. + dockerfile += _package_entries(workdir, uid, gid, package) + + return dockerfile + + +def docker_image_id(output: str) -> ImageId: + """Accepts a string containing the output of a successful `docker build` + command and parses the Docker image ID from the stream. + + NOTE this is probably quite brittle! I can imagine this breaking quite easily + on a Docker upgrade. + + """ + return ImageId(output.splitlines()[-1].split()[-1]) + + +def build_image(job_mode: c.JobMode, + build_path: str, + credentials_path: Optional[str] = None, + adc_path: Optional[str] = None, + no_cache: bool = False, + **kwargs) -> str: + """Builds a Docker image by generating a Dockerfile and passing it to `docker + build` via stdin. All output from the `docker build` process prints to + stdout. + + Returns the image ID of the new docker container; if the command fails, + throws on error with information about the command and any issues that caused + the problem. + + """ + with u.TempCopy(credentials_path, + tmp_name=".caliban_default_creds.json") as creds: + with u.TempCopy(adc_path, tmp_name=".caliban_adc_creds.json") as adc: + cache_args = ["--no-cache"] if no_cache else [] + cmd = ["docker", "build"] + cache_args + ["--rm", "-f-", build_path] + + dockerfile = _dockerfile_template(job_mode, + credentials_path=creds, + adc_path=adc, + **kwargs) + + joined_cmd = " ".join(cmd) + logging.info("Running command: {}".format(joined_cmd)) + + try: + output, ret_code = u.capture_stdout(cmd, input_str=dockerfile) + if ret_code == 0: + return docker_image_id(output) + else: + error_msg = "Docker failed with error code {}.".format(ret_code) + raise DockerError(error_msg, cmd, ret_code) + + except subprocess.CalledProcessError as e: + logging.error(e.output) + logging.error(e.stderr) diff --git a/caliban/docker/push.py b/caliban/docker/push.py new file mode 100644 index 0000000..298aa1f --- /dev/null +++ b/caliban/docker/push.py @@ -0,0 +1,734 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions required to interact with Docker to build and run images, shells +and notebooks in a Docker environment. + +""" + +from __future__ import absolute_import, division, print_function + +import json +import os +import subprocess +import sys +from enum import Enum +from pathlib import Path +from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, NewType, + Optional, Union) + +import tqdm +from absl import logging +from blessings import Terminal +from tqdm.utils import _screen_shape_wrapper + +import caliban.config as c +import caliban.util as u +from caliban.history.types import Experiment, Job, JobSpec, JobStatus, Platform +from caliban.history.utils import (create_experiments, generate_container_spec, + get_mem_engine, get_sql_engine, + session_scope) + +t = Terminal() + +DEV_CONTAINER_ROOT = "gcr.io/blueshift-playground/blueshift" +TF_VERSIONS = {"2.2.0", "1.12.3", "1.14.0", "1.15.0"} +DEFAULT_WORKDIR = "/usr/app" +CREDS_DIR = "/.creds" +CONDA_BIN = "/opt/conda/bin/conda" + +ImageId = NewType('ImageId', str) +ArgSeq = NewType('ArgSeq', List[str]) + + +class DockerError(Exception): + """Exception that passes info on a failed Docker command.""" + + def __init__(self, message, cmd, ret_code): + super().__init__(message) + self.message = message + self.cmd = cmd + self.ret_code = ret_code + + @property + def command(self): + return " ".join(self.cmd) + + +class NotebookInstall(Enum): + """Flag to decide what to do .""" + none = 'none' + lab = 'lab' + jupyter = 'jupyter' + + def __str__(self) -> str: + return self.value + + +class Shell(Enum): + """Add new shells here and below, in SHELL_DICT.""" + bash = 'bash' + zsh = 'zsh' + + def __str__(self) -> str: + return self.value + + +# Tuple to track the information required to install and execute some custom +# shell into a container. +ShellData = NamedTuple("ShellData", [("executable", str), + ("packages", List[str])]) + + +def apt_install(*packages: str) -> str: + """Returns a command that will install the supplied list of packages without + requiring confirmation or any user interaction. + """ + package_str = ' '.join(packages) + no_prompt = "DEBIAN_FRONTEND=noninteractive" + return f"{no_prompt} apt-get install --yes --no-install-recommends {package_str}" + + +def apt_command(commands: List[str]) -> List[str]: + """Pre-and-ap-pends the supplied commands with the appropriate in-container and + cleanup command for aptitude. + + """ + update = ["apt-get update"] + cleanup = ["apt-get clean", "rm -rf /var/lib/apt/lists/*"] + return update + commands + cleanup + + +# Dict linking a particular supported shell to the data required to run and +# install the shell inside a container. +# +# : Dict[Shell, ShellData] +SHELL_DICT = { + Shell.bash: ShellData("/bin/bash", []), + Shell.zsh: ShellData("/bin/zsh", ["zsh"]) +} + + +def default_shell() -> Shell: + """Returns the shell to load into the container. Defaults to Shell.bash, but if + the user's SHELL variable refers to a supported sub-shell, returns that + instead. 
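+
+  For example, SHELL=/usr/bin/zsh selects Shell.zsh; any other value falls
+  back to Shell.bash.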
+ + """ + ret = Shell.bash + + if "zsh" in os.environ.get("SHELL"): + ret = Shell.zsh + + return ret + + +def adc_location(home_dir: Optional[str] = None) -> str: + """Returns the location for application default credentials, INSIDE the + container (so, hardcoded unix separators), given the supplied home directory. + + """ + if home_dir is None: + home_dir = Path.home() + + return "{}/.config/gcloud/application_default_credentials.json".format( + home_dir) + + +def container_home(): + """Returns the location of the home directory inside the generated + container. + + """ + return "/home/{}".format(u.current_user()) + + +def tf_base_image(job_mode: c.JobMode, tensorflow_version: str) -> str: + """Returns the base image to use, depending on whether or not we're using a + GPU. This is JUST for building our base images for Blueshift; not for + actually using in a job. + + List of available tags: https://hub.docker.com/r/tensorflow/tensorflow/tags + + """ + if tensorflow_version not in TF_VERSIONS: + raise Exception("""{} is not a valid tensorflow version. + Try one of: {}""".format(tensorflow_version, TF_VERSIONS)) + + gpu = "-gpu" if c.gpu(job_mode) else "" + return "tensorflow/tensorflow:{}{}-py3".format(tensorflow_version, gpu) + + +def base_image_suffix(job_mode: c.JobMode) -> str: + return "gpu" if c.gpu(job_mode) else "cpu" + + +def base_image_id(job_mode: c.JobMode) -> str: + """Returns the default base image for all caliban Dockerfiles.""" + base_suffix = base_image_suffix(job_mode) + return "{}:{}".format(DEV_CONTAINER_ROOT, base_suffix) + + +def extras_string(extras: List[str]) -> str: + """Returns the argument passed to `pip install` to install a project from its + setup.py and target a specific set of extras_require dependencies. + + Args: + extras: (potentially empty) list of extra_requires deps. + """ + ret = "." + if len(extras) > 0: + ret += "[{}]".format(','.join(extras)) + return ret + + +def base_extras(job_mode: c.JobMode, path: str, + extras: Optional[List[str]]) -> Optional[List[str]]: + """Returns None if the supplied path doesn't exist (it's assumed it points to a + setup.py file). + + If the path DOES exist, generates a list of extras to install. gpu or cpu are + always added to the beginning of the list, depending on the mode. + + """ + ret = None + + if os.path.exists(path): + base = extras or [] + extra = 'gpu' if c.gpu(job_mode) else 'cpu' + ret = base if extra in base else [extra] + base + + return ret + + +def _dependency_entries(workdir: str, + user_id: int, + user_group: int, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None) -> str: + """Returns the Dockerfile entries required to install dependencies from either: + + - a requirements.txt file, path supplied by requirements_path + - a conda environment.yml file, path supplied by conda_env_path. + - a setup.py file, if some sequence of dependencies is supplied. + + An empty list for setup_extras means, run `pip install -c .` with no extras. + None for this argument means do nothing. If a list of strings is supplied, + they'll be treated as extras dependency sets. 
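[Illustrative sketch with hypothetical extras names and IDs, assuming the extras_string helper defined above:

assert extras_string([]) == "."
assert extras_string(["gpu", "jax"]) == ".[gpu,jax]"
# With setup_extras=["gpu"], workdir="/usr/app", uid=gid=1000, the generated
# Dockerfile fragment is roughly:
#   COPY --chown=1000:1000 setup.py /usr/app
#   RUN /bin/bash -c "pip install --no-cache-dir .[gpu]"
]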
+ """ + ret = "" + + if setup_extras is not None: + ret += f""" +COPY --chown={user_id}:{user_group} setup.py {workdir} +RUN /bin/bash -c "pip install --no-cache-dir {extras_string(setup_extras)}" +""" + + if conda_env_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {conda_env_path} {workdir} +RUN /bin/bash -c "{CONDA_BIN} env update \ + --quiet --name caliban \ + --file {conda_env_path} && \ + {CONDA_BIN} clean -y -q --all" +""" + + if requirements_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {requirements_path} {workdir} +RUN /bin/bash -c "pip install --no-cache-dir -r {requirements_path}" +""" + + return ret + + +def _package_entries(workdir: str, user_id: int, user_group: int, + package: u.Package) -> str: + """Returns the Dockerfile entries required to: + + - copy a directory of code into a docker container + - inject an entrypoint that executes a python module inside that directory. + + Python code runs as modules vs scripts so that we can enforce import hygiene + between files inside a project. + + """ + owner = "{}:{}".format(user_id, user_group) + + arg = package.main_module or package.script_path + + # This needs to use json so that quotes print as double quotes, not single + # quotes. + entrypoint_s = json.dumps(package.executable + [arg]) + + return """ +# Copy project code into the docker container. +COPY --chown={owner} {package_path} {workdir}/{package_path} + +# Declare an entrypoint that actually runs the container. +ENTRYPOINT {entrypoint_s} + """.format_map({ + "owner": owner, + "package_path": package.package_path, + "workdir": workdir, + "entrypoint_s": entrypoint_s + }) + + +def _service_account_entry(user_id: int, user_group: int, credentials_path: str, + docker_credentials_dir: str, + write_adc_placeholder: bool): + """Generates the Dockerfile entries required to transfer a set of Cloud service + account credentials into the Docker container. + + NOTE the write_adc_placeholder variable is here because the "ctpu" script + that we use to interact with TPUs has a bug in it, as of 1/21/2020, where the + script will fail if the application_default_credentials.json file isn't + present, EVEN THOUGH it properly uses the service account credentials + registered with gcloud instead of ADC creds. + + If a service account is present, we write a placeholder string to get past + this problem. This shouldn't matter for anyone else since adc isn't used if a + service account is present. + + """ + container_creds = "{}/credentials.json".format(docker_credentials_dir) + ret = """ +COPY --chown={user_id}:{user_group} {credentials_path} {container_creds} + +# Use the credentials file to activate gcloud, gsutil inside the container. +RUN gcloud auth activate-service-account --key-file={container_creds} && \ + git config --global credential.'https://source.developers.google.com'.helper gcloud.sh + +ENV GOOGLE_APPLICATION_CREDENTIALS={container_creds} +""".format_map({ + "user_id": user_id, + "user_group": user_group, + "credentials_path": credentials_path, + "container_creds": container_creds + }) + + if write_adc_placeholder: + ret += """ +RUN echo "placeholder" >> {} +""".format(adc_location(container_home())) + + return ret + + +def _adc_entry(user_id: int, user_group: int, adc_path: str): + """Returns the Dockerfile line required to transfer the + application_default_credentials.json file into the container's home + directory. 
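[Illustrative sketch with a hypothetical in-container home of /home/alice, assuming the adc_location helper defined above:

assert adc_location("/home/alice") == (
    "/home/alice/.config/gcloud/application_default_credentials.json")
# The COPY emitted by this helper drops the host ADC file at that path.
]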
+ + """ + return """ +COPY --chown={user_id}:{user_group} {adc_path} {adc_loc} + """.format_map({ + "user_id": user_id, + "user_group": user_group, + "adc_path": adc_path, + "adc_loc": adc_location(container_home()) + }) + + +def _credentials_entries(user_id: int, + user_group: int, + adc_path: Optional[str], + credentials_path: Optional[str], + docker_credentials_dir: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to copy a user's Cloud credentials + into the Docker container. + + - adc_path is the relative path inside the current directory to an + application_default_credentials.json file containing... well, you get it. + - credentials_path is the relative path inside the current directory to a + JSON credentials file. + - docker_credentials_dir is the relative path inside the docker container + where the JSON file will be copied on build. + + """ + if docker_credentials_dir is None: + docker_credentials_dir = CREDS_DIR + + ret = "" + if credentials_path is not None: + ret += _service_account_entry(user_id, + user_group, + credentials_path, + docker_credentials_dir, + write_adc_placeholder=adc_path is None) + + if adc_path is not None: + ret += _adc_entry(user_id, user_group, adc_path) + + return ret + + +def _notebook_entries(lab: bool = False, version: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to install Jupyter{lab}. + + Optionally takes a version string. + + """ + version_suffix = "" + + if version is not None: + version_suffix = "=={}".format(version) + + library = "jupyterlab" if lab else "jupyter" + + return """ +RUN pip install {}{} +""".format(library, version_suffix) + + +def _custom_packages( + user_id: int, + user_group: int, + packages: Optional[List[str]] = None, + shell: Optional[Shell] = None, +) -> str: + """Returns the Dockerfile entries necessary to install custom dependencies for + the supplied shell and sequence of aptitude packages. + + """ + if packages is None: + packages = [] + + if shell is None: + shell = Shell.bash + + ret = "" + + to_install = sorted(packages + SHELL_DICT[shell].packages) + + if len(to_install) != 0: + commands = apt_command([apt_install(*to_install)]) + ret = """ +USER root + +RUN {commands} + +USER {user_id}:{user_group} +""".format_map({ + "commands": " && ".join(commands), + "user_id": user_id, + "user_group": user_group + }) + + return ret + + +def _copy_dir_entry(workdir: str, user_id: int, user_group: int, + dirname: str) -> str: + """Returns the Dockerfile entry necessary to copy a single extra subdirectory + from the current directory into a docker container during build. + + """ + owner = "{}:{}".format(user_id, user_group) + return """# Copy {dirname} into the Docker container. +COPY --chown={owner} {dirname} {workdir}/{dirname} +""".format_map({ + "owner": owner, + "workdir": workdir, + "dirname": dirname + }) + + +def _extra_dir_entries(workdir: str, user_id: int, user_group: int, + extra_dirs: List[str]) -> str: + """Returns the Dockerfile entries necessary to copy all directories in the + extra_dirs list into a docker container during build. 
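[Illustrative sketch with hypothetical directory names, based on _copy_dir_entry above (uid/gid 1000, workdir /usr/app); each extra directory becomes its own chown'd COPY instruction:

print(_extra_dir_entries("/usr/app", 1000, 1000, ["data", "models"]))
# Roughly:
#   # Copy data into the Docker container.
#   COPY --chown=1000:1000 data /usr/app/data
#   # Copy models into the Docker container.
#   COPY --chown=1000:1000 models /usr/app/models
]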
+ + """ + ret = "" + for d in extra_dirs: + ret += "\n{}".format(_copy_dir_entry(workdir, user_id, user_group, d)) + return ret + + +def _dockerfile_template( + job_mode: c.JobMode, + workdir: Optional[str] = None, + base_image_fn: Optional[Callable[[c.JobMode], str]] = None, + package: Optional[Union[List, u.Package]] = None, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None, + adc_path: Optional[str] = None, + credentials_path: Optional[str] = None, + jupyter_version: Optional[str] = None, + inject_notebook: NotebookInstall = NotebookInstall.none, + shell: Optional[Shell] = None, + extra_dirs: Optional[List[str]] = None, + caliban_config: Optional[Dict[str, Any]] = None) -> str: + """Returns a Dockerfile that builds on a local CPU or GPU base image (depending + on the value of job_mode) to create a container that: + + - installs any dependency specified in a requirements.txt file living at + requirements_path, a conda environment at conda_env_path, or any + dependencies in a setup.py file, including extra dependencies, if + setup_extras isn't None + - injects gcloud credentials into the container, so Cloud interaction works + just like it does locally + - potentially installs a custom shell, or jupyterlab for notebook support + - copies all source needed by the main module specified by package, and + potentially injects an entrypoint that, on run, will run that main module + + Most functions that call _dockerfile_template pass along any kwargs that they + receive. It should be enough to add kwargs here, then rely on that mechanism + to pass them along, vs adding kwargs all the way down the call chain. + + Supply a custom base_image_fn (function from job_mode -> image ID) to inject + more complex Docker commands into the Caliban environments by, for example, + building your own image on top of the TF base images, then using that. + + """ + uid = os.getuid() + gid = os.getgid() + username = u.current_user() + + if isinstance(package, list): + package = u.Package(*package) + + if workdir is None: + workdir = DEFAULT_WORKDIR + + if base_image_fn is None: + base_image_fn = base_image_id + + base_image = base_image_fn(job_mode) + + dockerfile = """ +FROM {base_image} + +# Create the same group we're using on the host machine. +RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid} + +# Create the user by name. --no-log-init guards against a crash with large user +# IDs. +RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username} + +# The directory is created by root. This sets permissions so that any user can +# access the folder. 
+RUN mkdir -m 777 {workdir} {creds_dir} {c_home} + +ENV HOME={c_home} + +WORKDIR {workdir} + +USER {uid}:{gid} +""".format_map({ + "base_image": base_image, + "username": username, + "uid": uid, + "gid": gid, + "workdir": workdir, + "c_home": container_home(), + "creds_dir": CREDS_DIR + }) + dockerfile += _credentials_entries(uid, + gid, + adc_path=adc_path, + credentials_path=credentials_path) + + dockerfile += _dependency_entries(workdir, + uid, + gid, + requirements_path=requirements_path, + conda_env_path=conda_env_path, + setup_extras=setup_extras) + + if inject_notebook.value != 'none': + install_lab = inject_notebook == NotebookInstall.lab + dockerfile += _notebook_entries(lab=install_lab, version=jupyter_version) + + if extra_dirs is not None: + dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs) + + dockerfile += _custom_packages(uid, + gid, + packages=c.apt_packages( + caliban_config, job_mode), + shell=shell) + + if package is not None: + # The actual entrypoint and final copied code. + dockerfile += _package_entries(workdir, uid, gid, package) + + return dockerfile + + +def docker_image_id(output: str) -> ImageId: + """Accepts a string containing the output of a successful `docker build` + command and parses the Docker image ID from the stream. + + NOTE this is probably quite brittle! I can imagine this breaking quite easily + on a Docker upgrade. + + """ + return ImageId(output.splitlines()[-1].split()[-1]) + + +def build_image(job_mode: c.JobMode, + build_path: str, + credentials_path: Optional[str] = None, + adc_path: Optional[str] = None, + no_cache: bool = False, + **kwargs) -> str: + """Builds a Docker image by generating a Dockerfile and passing it to `docker + build` via stdin. All output from the `docker build` process prints to + stdout. + + Returns the image ID of the new docker container; if the command fails, + throws on error with information about the command and any issues that caused + the problem. + + """ + with u.TempCopy(credentials_path, + tmp_name=".caliban_default_creds.json") as creds: + with u.TempCopy(adc_path, tmp_name=".caliban_adc_creds.json") as adc: + cache_args = ["--no-cache"] if no_cache else [] + cmd = ["docker", "build"] + cache_args + ["--rm", "-f-", build_path] + + dockerfile = _dockerfile_template(job_mode, + credentials_path=creds, + adc_path=adc, + **kwargs) + + joined_cmd = " ".join(cmd) + logging.info("Running command: {}".format(joined_cmd)) + + try: + output, ret_code = u.capture_stdout(cmd, input_str=dockerfile) + if ret_code == 0: + return docker_image_id(output) + else: + error_msg = "Docker failed with error code {}.".format(ret_code) + raise DockerError(error_msg, cmd, ret_code) + + except subprocess.CalledProcessError as e: + logging.error(e.output) + logging.error(e.stderr) + + +def _image_tag_for_project(project_id: str, image_id: str) -> str: + """Generate the GCR Docker image tag for the supplied pair of project_id and + image_id. + + This function properly handles "domain scoped projects", where the project ID + contains a domain name and project ID separated by : + https://cloud.google.com/container-registry/docs/overview#domain-scoped_projects. + + """ + project_s = project_id.replace(":", "/") + return "gcr.io/{}/{}:latest".format(project_s, image_id) + + +def push_uuid_tag(project_id: str, image_id: str) -> str: + """Takes a base image and tags it for upload, then pushes it to a remote Google + Container Registry. + + Returns the tag on a successful push. 
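[Illustrative sketch with hypothetical project and image IDs, assuming _image_tag_for_project above; domain-scoped projects have their ":" swapped for "/":

assert _image_tag_for_project("my-project", "abc123") == \
    "gcr.io/my-project/abc123:latest"
assert _image_tag_for_project("example.com:my-project", "abc123") == \
    "gcr.io/example.com/my-project/abc123:latest"
]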
+ + TODO should this just check first before attempting to push if the image + exists? Immutable names means that if the tag is up there, we're done. + Potentially use docker-py for this. + + """ + image_tag = _image_tag_for_project(project_id, image_id) + subprocess.run(["docker", "tag", image_id, image_tag], check=True) + subprocess.run(["docker", "push", image_tag], check=True) + return image_tag + + +def _run_cmd(job_mode: c.JobMode, + run_args: Optional[List[str]] = None) -> List[str]: + """Returns the sequence of commands for the subprocess run functions required + to execute `docker run`. in CPU or GPU mode, depending on the value of + job_mode. + + Keyword args: + - run_args: list of args to pass to docker run. + + """ + if run_args is None: + run_args = [] + + runtime = ["--runtime", "nvidia"] if c.gpu(job_mode) else [] + return ["docker", "run"] + runtime + ["--ipc", "host"] + run_args + + +def _home_mount_cmds(enable_home_mount: bool) -> List[str]: + """Returns the argument needed by Docker to mount a user's local home directory + into the home directory location inside their container. + + If enable_home_mount is false returns an empty list. + + """ + ret = [] + if enable_home_mount: + ret = ["-v", "{}:{}".format(Path.home(), container_home())] + return ret + + +def _interactive_opts(workdir: str) -> List[str]: + """Returns the basic arguments we want to run a docker process locally. + + """ + return [ + "-w", workdir, \ + "-u", "{}:{}".format(os.getuid(), os.getgid()), \ + "-v", "{}:{}".format(os.getcwd(), workdir) \ + ] + + +def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec: + """Prints logging as a side effect for the supplied sequence of job specs + generated from an experiment definition; returns the input job spec. + + """ + args = c.experiment_to_args(job_spec.experiment.kwargs, + job_spec.experiment.args) + logging.info("") + logging.info("Job {} - Experiment args: {}".format(i, t.yellow(str(args)))) + return job_spec + + +def logged_job_specs(job_specs: Iterable[JobSpec]) -> Iterable[JobSpec]: + """Generates an iterable of job specs that should be passed to `docker run` to + execute the experiments defined by the supplied iterable. + + """ + for i, s in enumerate(job_specs, 1): + yield log_job_spec_instance(s, i) + + +def execute_dry_run(job_specs: Iterable[JobSpec]) -> None: + """Expands the supplied sequence of experiments into sequences of args and logs + the jobs that WOULD have been executed, had the dry run flag not been + applied. 
+ + """ + list(logged_job_specs(job_specs)) + + logging.info('') + logging.info( + t.yellow("To build your image and execute these jobs, \ +run your command again without {}.".format(c.DRY_RUN_FLAG))) + logging.info('') + return None diff --git a/caliban/expansion.py b/caliban/expansion.py index ebdeeb4..7659aba 100644 --- a/caliban/expansion.py +++ b/caliban/expansion.py @@ -24,7 +24,7 @@ from absl import app, logging from absl.flags import argparse_flags -import caliban.config as c +import caliban.config.experiment as c from caliban import __version__ ll.getLogger('caliban.expansion').setLevel(logging.ERROR) diff --git a/caliban/gke/__init__.py b/caliban/platform/cloud/__init__.py similarity index 100% rename from caliban/gke/__init__.py rename to caliban/platform/cloud/__init__.py diff --git a/caliban/cloud/core.py b/caliban/platform/cloud/core.py similarity index 100% rename from caliban/cloud/core.py rename to caliban/platform/cloud/core.py diff --git a/caliban/cloud/types.py b/caliban/platform/cloud/types.py similarity index 100% rename from caliban/cloud/types.py rename to caliban/platform/cloud/types.py diff --git a/caliban/platform/cloud/util.py b/caliban/platform/cloud/util.py new file mode 100644 index 0000000..c9ad813 --- /dev/null +++ b/caliban/platform/cloud/util.py @@ -0,0 +1,119 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities relevant to AI Platform. +""" +import re +from typing import Dict, List, Optional, Tuple, Union + +import caliban.util as u +import caliban.util.argparse as ua + +# key and value for labels can be at most this-many-characters long. +AI_PLATFORM_MAX_LABEL_LENGTH = 63 + + +def _truncate(s: str, max_length: int) -> str: + """Returns the input string s truncated to be at most max_length characters + long. + + """ + return s if len(s) <= max_length else s[0:max_length] + + +def _clean_label(s: Optional[str], is_key: bool) -> str: + """Processes the string into the sanitized format required by AI platform + labels. + + https://cloud.google.com/ml-engine/docs/resource-labels + + """ + if s is None: + return "" + + # periods are not allowed by AI Platform labels, but often occur in, + # e.g., learning rates + DECIMAL_REPLACEMENT = '_' + s = s.replace('.', DECIMAL_REPLACEMENT) + + # lowercase, letters, - and _ are valid, so strip the leading dashes, make + # everything lowercase and then kill any remaining unallowed characters. + cleaned = re.sub(r'[^a-z0-9_-]', '', s.lower()).lstrip("-") + + # Keys must start with a letter. If is_key is set and the cleaned version + # starts with something else, append `k`. + if is_key and cleaned != "" and not cleaned[0].isalpha(): + cleaned = "k" + cleaned + + return _truncate(cleaned, AI_PLATFORM_MAX_LABEL_LENGTH) + + +def key_label(k: Optional[str]) -> str: + """converts the argument into a valid label, suitable for submission as a label + key to Cloud. 
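[Illustrative sketch of the sanitization with hypothetical flag names and values, assuming _clean_label, key_label and value_label in this module:

assert key_label("--learning.rate") == "learning_rate"  # dashes stripped, '.' -> '_'
assert key_label("3layers") == "k3layers"               # keys must start with a letter
assert value_label("0.01") == "0_01"                    # values are not prefixed
]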
+ + """ + return _clean_label(k, True) + + +def value_label(v: Optional[str]) -> str: + """converts the argument into a valid label, suitable for submission as a label + value to Cloud. + + """ + return _clean_label(v, False) + + +def script_args_to_labels(script_args: Optional[List[str]]) -> Dict[str, str]: + """Converts the arguments supplied to our scripts into a dictionary usable as + labels valid for Cloud submission. + + """ + ret = {} + + def process_pair(k, v): + if ua.is_key(k): + clean_k = key_label(k) + if clean_k != "": + ret[clean_k] = "" if ua.is_key(v) else value_label(v) + + if script_args is None or len(script_args) == 0: + return ret + + elif len(script_args) == 1: + process_pair(script_args[0], None) + + # Handle the case where the final argument in the list is a boolean flag. + # This won't get picked up by partition. + elif len(script_args) > 1: + for k, v in u.partition(script_args, 2): + process_pair(k, v) + + process_pair(script_args[-1], None) + + return ret + + +def sanitize_labels( + pairs: Union[Dict[str, str], List[Tuple[str, str]]]) -> Dict[str, str]: + """Turns a dict, or a list of unsanitized key-value pairs (each represented by + a tuple) into a dictionary suitable to submit to Cloud as a label dict. + + """ + if isinstance(pairs, dict): + return sanitize_labels(pairs.items()) + + return {key_label(k): value_label(v) for (k, v) in pairs if key_label(k)} diff --git a/caliban/platform/gke/__init__.py b/caliban/platform/gke/__init__.py new file mode 100644 index 0000000..79c6a2f --- /dev/null +++ b/caliban/platform/gke/__init__.py @@ -0,0 +1,15 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/caliban/gke/cli.py b/caliban/platform/gke/cli.py similarity index 100% rename from caliban/gke/cli.py rename to caliban/platform/gke/cli.py diff --git a/caliban/gke/cluster.py b/caliban/platform/gke/cluster.py similarity index 100% rename from caliban/gke/cluster.py rename to caliban/platform/gke/cluster.py diff --git a/caliban/gke/constants.py b/caliban/platform/gke/constants.py similarity index 100% rename from caliban/gke/constants.py rename to caliban/platform/gke/constants.py diff --git a/caliban/gke/types.py b/caliban/platform/gke/types.py similarity index 100% rename from caliban/gke/types.py rename to caliban/platform/gke/types.py diff --git a/caliban/gke/utils.py b/caliban/platform/gke/utils.py similarity index 100% rename from caliban/gke/utils.py rename to caliban/platform/gke/utils.py diff --git a/caliban/docker.py b/caliban/platform/notebook.py similarity index 100% rename from caliban/docker.py rename to caliban/platform/notebook.py diff --git a/caliban/platform/run.py b/caliban/platform/run.py new file mode 100644 index 0000000..88c4d02 --- /dev/null +++ b/caliban/platform/run.py @@ -0,0 +1,1056 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions required to interact with Docker to build and run images, shells +and notebooks in a Docker environment. + +""" + +from __future__ import absolute_import, division, print_function + +import json +import os +import subprocess +import sys +from enum import Enum +from pathlib import Path +from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, NewType, + Optional, Union) + +import tqdm +from absl import logging +from blessings import Terminal +from tqdm.utils import _screen_shape_wrapper + +import caliban.config as c +import caliban.util as u +from caliban.history.types import Experiment, Job, JobSpec, JobStatus, Platform +from caliban.history.utils import (create_experiments, generate_container_spec, + get_mem_engine, get_sql_engine, + session_scope) + +t = Terminal() + +DEV_CONTAINER_ROOT = "gcr.io/blueshift-playground/blueshift" +TF_VERSIONS = {"2.2.0", "1.12.3", "1.14.0", "1.15.0"} +DEFAULT_WORKDIR = "/usr/app" +CREDS_DIR = "/.creds" +CONDA_BIN = "/opt/conda/bin/conda" + +ImageId = NewType('ImageId', str) +ArgSeq = NewType('ArgSeq', List[str]) + + +class DockerError(Exception): + """Exception that passes info on a failed Docker command.""" + + def __init__(self, message, cmd, ret_code): + super().__init__(message) + self.message = message + self.cmd = cmd + self.ret_code = ret_code + + @property + def command(self): + return " ".join(self.cmd) + + +class NotebookInstall(Enum): + """Flag to decide what to do .""" + none = 'none' + lab = 'lab' + jupyter = 'jupyter' + + def __str__(self) -> str: + return self.value + + +class Shell(Enum): + """Add new shells here and below, in SHELL_DICT.""" + bash = 'bash' + zsh = 'zsh' + + def __str__(self) -> str: + return self.value + + +# Tuple to track the information required to install and execute some custom +# shell into a container. +ShellData = NamedTuple("ShellData", [("executable", str), + ("packages", List[str])]) + + +def apt_install(*packages: str) -> str: + """Returns a command that will install the supplied list of packages without + requiring confirmation or any user interaction. + """ + package_str = ' '.join(packages) + no_prompt = "DEBIAN_FRONTEND=noninteractive" + return f"{no_prompt} apt-get install --yes --no-install-recommends {package_str}" + + +def apt_command(commands: List[str]) -> List[str]: + """Pre-and-ap-pends the supplied commands with the appropriate in-container and + cleanup command for aptitude. + + """ + update = ["apt-get update"] + cleanup = ["apt-get clean", "rm -rf /var/lib/apt/lists/*"] + return update + commands + cleanup + + +# Dict linking a particular supported shell to the data required to run and +# install the shell inside a container. +# +# : Dict[Shell, ShellData] +SHELL_DICT = { + Shell.bash: ShellData("/bin/bash", []), + Shell.zsh: ShellData("/bin/zsh", ["zsh"]) +} + + +def default_shell() -> Shell: + """Returns the shell to load into the container. Defaults to Shell.bash, but if + the user's SHELL variable refers to a supported sub-shell, returns that + instead. 
+ + """ + ret = Shell.bash + + if "zsh" in os.environ.get("SHELL"): + ret = Shell.zsh + + return ret + + +def adc_location(home_dir: Optional[str] = None) -> str: + """Returns the location for application default credentials, INSIDE the + container (so, hardcoded unix separators), given the supplied home directory. + + """ + if home_dir is None: + home_dir = Path.home() + + return "{}/.config/gcloud/application_default_credentials.json".format( + home_dir) + + +def container_home(): + """Returns the location of the home directory inside the generated + container. + + """ + return "/home/{}".format(u.current_user()) + + +def tf_base_image(job_mode: c.JobMode, tensorflow_version: str) -> str: + """Returns the base image to use, depending on whether or not we're using a + GPU. This is JUST for building our base images for Blueshift; not for + actually using in a job. + + List of available tags: https://hub.docker.com/r/tensorflow/tensorflow/tags + + """ + if tensorflow_version not in TF_VERSIONS: + raise Exception("""{} is not a valid tensorflow version. + Try one of: {}""".format(tensorflow_version, TF_VERSIONS)) + + gpu = "-gpu" if c.gpu(job_mode) else "" + return "tensorflow/tensorflow:{}{}-py3".format(tensorflow_version, gpu) + + +def base_image_suffix(job_mode: c.JobMode) -> str: + return "gpu" if c.gpu(job_mode) else "cpu" + + +def base_image_id(job_mode: c.JobMode) -> str: + """Returns the default base image for all caliban Dockerfiles.""" + base_suffix = base_image_suffix(job_mode) + return "{}:{}".format(DEV_CONTAINER_ROOT, base_suffix) + + +def extras_string(extras: List[str]) -> str: + """Returns the argument passed to `pip install` to install a project from its + setup.py and target a specific set of extras_require dependencies. + + Args: + extras: (potentially empty) list of extra_requires deps. + """ + ret = "." + if len(extras) > 0: + ret += "[{}]".format(','.join(extras)) + return ret + + +def base_extras(job_mode: c.JobMode, path: str, + extras: Optional[List[str]]) -> Optional[List[str]]: + """Returns None if the supplied path doesn't exist (it's assumed it points to a + setup.py file). + + If the path DOES exist, generates a list of extras to install. gpu or cpu are + always added to the beginning of the list, depending on the mode. + + """ + ret = None + + if os.path.exists(path): + base = extras or [] + extra = 'gpu' if c.gpu(job_mode) else 'cpu' + ret = base if extra in base else [extra] + base + + return ret + + +def _dependency_entries(workdir: str, + user_id: int, + user_group: int, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None) -> str: + """Returns the Dockerfile entries required to install dependencies from either: + + - a requirements.txt file, path supplied by requirements_path + - a conda environment.yml file, path supplied by conda_env_path. + - a setup.py file, if some sequence of dependencies is supplied. + + An empty list for setup_extras means, run `pip install -c .` with no extras. + None for this argument means do nothing. If a list of strings is supplied, + they'll be treated as extras dependency sets. 
+ """ + ret = "" + + if setup_extras is not None: + ret += f""" +COPY --chown={user_id}:{user_group} setup.py {workdir} +RUN /bin/bash -c "pip install --no-cache-dir {extras_string(setup_extras)}" +""" + + if conda_env_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {conda_env_path} {workdir} +RUN /bin/bash -c "{CONDA_BIN} env update \ + --quiet --name caliban \ + --file {conda_env_path} && \ + {CONDA_BIN} clean -y -q --all" +""" + + if requirements_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {requirements_path} {workdir} +RUN /bin/bash -c "pip install --no-cache-dir -r {requirements_path}" +""" + + return ret + + +def _package_entries(workdir: str, user_id: int, user_group: int, + package: u.Package) -> str: + """Returns the Dockerfile entries required to: + + - copy a directory of code into a docker container + - inject an entrypoint that executes a python module inside that directory. + + Python code runs as modules vs scripts so that we can enforce import hygiene + between files inside a project. + + """ + owner = "{}:{}".format(user_id, user_group) + + arg = package.main_module or package.script_path + + # This needs to use json so that quotes print as double quotes, not single + # quotes. + entrypoint_s = json.dumps(package.executable + [arg]) + + return """ +# Copy project code into the docker container. +COPY --chown={owner} {package_path} {workdir}/{package_path} + +# Declare an entrypoint that actually runs the container. +ENTRYPOINT {entrypoint_s} + """.format_map({ + "owner": owner, + "package_path": package.package_path, + "workdir": workdir, + "entrypoint_s": entrypoint_s + }) + + +def _service_account_entry(user_id: int, user_group: int, credentials_path: str, + docker_credentials_dir: str, + write_adc_placeholder: bool): + """Generates the Dockerfile entries required to transfer a set of Cloud service + account credentials into the Docker container. + + NOTE the write_adc_placeholder variable is here because the "ctpu" script + that we use to interact with TPUs has a bug in it, as of 1/21/2020, where the + script will fail if the application_default_credentials.json file isn't + present, EVEN THOUGH it properly uses the service account credentials + registered with gcloud instead of ADC creds. + + If a service account is present, we write a placeholder string to get past + this problem. This shouldn't matter for anyone else since adc isn't used if a + service account is present. + + """ + container_creds = "{}/credentials.json".format(docker_credentials_dir) + ret = """ +COPY --chown={user_id}:{user_group} {credentials_path} {container_creds} + +# Use the credentials file to activate gcloud, gsutil inside the container. +RUN gcloud auth activate-service-account --key-file={container_creds} && \ + git config --global credential.'https://source.developers.google.com'.helper gcloud.sh + +ENV GOOGLE_APPLICATION_CREDENTIALS={container_creds} +""".format_map({ + "user_id": user_id, + "user_group": user_group, + "credentials_path": credentials_path, + "container_creds": container_creds + }) + + if write_adc_placeholder: + ret += """ +RUN echo "placeholder" >> {} +""".format(adc_location(container_home())) + + return ret + + +def _adc_entry(user_id: int, user_group: int, adc_path: str): + """Returns the Dockerfile line required to transfer the + application_default_credentials.json file into the container's home + directory. 
+ + """ + return """ +COPY --chown={user_id}:{user_group} {adc_path} {adc_loc} + """.format_map({ + "user_id": user_id, + "user_group": user_group, + "adc_path": adc_path, + "adc_loc": adc_location(container_home()) + }) + + +def _credentials_entries(user_id: int, + user_group: int, + adc_path: Optional[str], + credentials_path: Optional[str], + docker_credentials_dir: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to copy a user's Cloud credentials + into the Docker container. + + - adc_path is the relative path inside the current directory to an + application_default_credentials.json file containing... well, you get it. + - credentials_path is the relative path inside the current directory to a + JSON credentials file. + - docker_credentials_dir is the relative path inside the docker container + where the JSON file will be copied on build. + + """ + if docker_credentials_dir is None: + docker_credentials_dir = CREDS_DIR + + ret = "" + if credentials_path is not None: + ret += _service_account_entry(user_id, + user_group, + credentials_path, + docker_credentials_dir, + write_adc_placeholder=adc_path is None) + + if adc_path is not None: + ret += _adc_entry(user_id, user_group, adc_path) + + return ret + + +def _notebook_entries(lab: bool = False, version: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to install Jupyter{lab}. + + Optionally takes a version string. + + """ + version_suffix = "" + + if version is not None: + version_suffix = "=={}".format(version) + + library = "jupyterlab" if lab else "jupyter" + + return """ +RUN pip install {}{} +""".format(library, version_suffix) + + +def _custom_packages( + user_id: int, + user_group: int, + packages: Optional[List[str]] = None, + shell: Optional[Shell] = None, +) -> str: + """Returns the Dockerfile entries necessary to install custom dependencies for + the supplied shell and sequence of aptitude packages. + + """ + if packages is None: + packages = [] + + if shell is None: + shell = Shell.bash + + ret = "" + + to_install = sorted(packages + SHELL_DICT[shell].packages) + + if len(to_install) != 0: + commands = apt_command([apt_install(*to_install)]) + ret = """ +USER root + +RUN {commands} + +USER {user_id}:{user_group} +""".format_map({ + "commands": " && ".join(commands), + "user_id": user_id, + "user_group": user_group + }) + + return ret + + +def _copy_dir_entry(workdir: str, user_id: int, user_group: int, + dirname: str) -> str: + """Returns the Dockerfile entry necessary to copy a single extra subdirectory + from the current directory into a docker container during build. + + """ + owner = "{}:{}".format(user_id, user_group) + return """# Copy {dirname} into the Docker container. +COPY --chown={owner} {dirname} {workdir}/{dirname} +""".format_map({ + "owner": owner, + "workdir": workdir, + "dirname": dirname + }) + + +def _extra_dir_entries(workdir: str, user_id: int, user_group: int, + extra_dirs: List[str]) -> str: + """Returns the Dockerfile entries necessary to copy all directories in the + extra_dirs list into a docker container during build. 
+ + """ + ret = "" + for d in extra_dirs: + ret += "\n{}".format(_copy_dir_entry(workdir, user_id, user_group, d)) + return ret + + +def _dockerfile_template( + job_mode: c.JobMode, + workdir: Optional[str] = None, + base_image_fn: Optional[Callable[[c.JobMode], str]] = None, + package: Optional[Union[List, u.Package]] = None, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None, + adc_path: Optional[str] = None, + credentials_path: Optional[str] = None, + jupyter_version: Optional[str] = None, + inject_notebook: NotebookInstall = NotebookInstall.none, + shell: Optional[Shell] = None, + extra_dirs: Optional[List[str]] = None, + caliban_config: Optional[Dict[str, Any]] = None) -> str: + """Returns a Dockerfile that builds on a local CPU or GPU base image (depending + on the value of job_mode) to create a container that: + + - installs any dependency specified in a requirements.txt file living at + requirements_path, a conda environment at conda_env_path, or any + dependencies in a setup.py file, including extra dependencies, if + setup_extras isn't None + - injects gcloud credentials into the container, so Cloud interaction works + just like it does locally + - potentially installs a custom shell, or jupyterlab for notebook support + - copies all source needed by the main module specified by package, and + potentially injects an entrypoint that, on run, will run that main module + + Most functions that call _dockerfile_template pass along any kwargs that they + receive. It should be enough to add kwargs here, then rely on that mechanism + to pass them along, vs adding kwargs all the way down the call chain. + + Supply a custom base_image_fn (function from job_mode -> image ID) to inject + more complex Docker commands into the Caliban environments by, for example, + building your own image on top of the TF base images, then using that. + + """ + uid = os.getuid() + gid = os.getgid() + username = u.current_user() + + if isinstance(package, list): + package = u.Package(*package) + + if workdir is None: + workdir = DEFAULT_WORKDIR + + if base_image_fn is None: + base_image_fn = base_image_id + + base_image = base_image_fn(job_mode) + + dockerfile = """ +FROM {base_image} + +# Create the same group we're using on the host machine. +RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid} + +# Create the user by name. --no-log-init guards against a crash with large user +# IDs. +RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username} + +# The directory is created by root. This sets permissions so that any user can +# access the folder. 
+RUN mkdir -m 777 {workdir} {creds_dir} {c_home} + +ENV HOME={c_home} + +WORKDIR {workdir} + +USER {uid}:{gid} +""".format_map({ + "base_image": base_image, + "username": username, + "uid": uid, + "gid": gid, + "workdir": workdir, + "c_home": container_home(), + "creds_dir": CREDS_DIR + }) + dockerfile += _credentials_entries(uid, + gid, + adc_path=adc_path, + credentials_path=credentials_path) + + dockerfile += _dependency_entries(workdir, + uid, + gid, + requirements_path=requirements_path, + conda_env_path=conda_env_path, + setup_extras=setup_extras) + + if inject_notebook.value != 'none': + install_lab = inject_notebook == NotebookInstall.lab + dockerfile += _notebook_entries(lab=install_lab, version=jupyter_version) + + if extra_dirs is not None: + dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs) + + dockerfile += _custom_packages(uid, + gid, + packages=c.apt_packages( + caliban_config, job_mode), + shell=shell) + + if package is not None: + # The actual entrypoint and final copied code. + dockerfile += _package_entries(workdir, uid, gid, package) + + return dockerfile + + +def docker_image_id(output: str) -> ImageId: + """Accepts a string containing the output of a successful `docker build` + command and parses the Docker image ID from the stream. + + NOTE this is probably quite brittle! I can imagine this breaking quite easily + on a Docker upgrade. + + """ + return ImageId(output.splitlines()[-1].split()[-1]) + + +def build_image(job_mode: c.JobMode, + build_path: str, + credentials_path: Optional[str] = None, + adc_path: Optional[str] = None, + no_cache: bool = False, + **kwargs) -> str: + """Builds a Docker image by generating a Dockerfile and passing it to `docker + build` via stdin. All output from the `docker build` process prints to + stdout. + + Returns the image ID of the new docker container; if the command fails, + throws on error with information about the command and any issues that caused + the problem. + + """ + with u.TempCopy(credentials_path, + tmp_name=".caliban_default_creds.json") as creds: + with u.TempCopy(adc_path, tmp_name=".caliban_adc_creds.json") as adc: + cache_args = ["--no-cache"] if no_cache else [] + cmd = ["docker", "build"] + cache_args + ["--rm", "-f-", build_path] + + dockerfile = _dockerfile_template(job_mode, + credentials_path=creds, + adc_path=adc, + **kwargs) + + joined_cmd = " ".join(cmd) + logging.info("Running command: {}".format(joined_cmd)) + + try: + output, ret_code = u.capture_stdout(cmd, input_str=dockerfile) + if ret_code == 0: + return docker_image_id(output) + else: + error_msg = "Docker failed with error code {}.".format(ret_code) + raise DockerError(error_msg, cmd, ret_code) + + except subprocess.CalledProcessError as e: + logging.error(e.output) + logging.error(e.stderr) + + +def _image_tag_for_project(project_id: str, image_id: str) -> str: + """Generate the GCR Docker image tag for the supplied pair of project_id and + image_id. + + This function properly handles "domain scoped projects", where the project ID + contains a domain name and project ID separated by : + https://cloud.google.com/container-registry/docs/overview#domain-scoped_projects. + + """ + project_s = project_id.replace(":", "/") + return "gcr.io/{}/{}:latest".format(project_s, image_id) + + +def push_uuid_tag(project_id: str, image_id: str) -> str: + """Takes a base image and tags it for upload, then pushes it to a remote Google + Container Registry. + + Returns the tag on a successful push. 
+ + TODO should this just check first before attempting to push if the image + exists? Immutable names means that if the tag is up there, we're done. + Potentially use docker-py for this. + + """ + image_tag = _image_tag_for_project(project_id, image_id) + subprocess.run(["docker", "tag", image_id, image_tag], check=True) + subprocess.run(["docker", "push", image_tag], check=True) + return image_tag + + +def _run_cmd(job_mode: c.JobMode, + run_args: Optional[List[str]] = None) -> List[str]: + """Returns the sequence of commands for the subprocess run functions required + to execute `docker run`. in CPU or GPU mode, depending on the value of + job_mode. + + Keyword args: + - run_args: list of args to pass to docker run. + + """ + if run_args is None: + run_args = [] + + runtime = ["--runtime", "nvidia"] if c.gpu(job_mode) else [] + return ["docker", "run"] + runtime + ["--ipc", "host"] + run_args + + +def _home_mount_cmds(enable_home_mount: bool) -> List[str]: + """Returns the argument needed by Docker to mount a user's local home directory + into the home directory location inside their container. + + If enable_home_mount is false returns an empty list. + + """ + ret = [] + if enable_home_mount: + ret = ["-v", "{}:{}".format(Path.home(), container_home())] + return ret + + +def _interactive_opts(workdir: str) -> List[str]: + """Returns the basic arguments we want to run a docker process locally. + + """ + return [ + "-w", workdir, \ + "-u", "{}:{}".format(os.getuid(), os.getgid()), \ + "-v", "{}:{}".format(os.getcwd(), workdir) \ + ] + + +def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec: + """Prints logging as a side effect for the supplied sequence of job specs + generated from an experiment definition; returns the input job spec. + + """ + args = c.experiment_to_args(job_spec.experiment.kwargs, + job_spec.experiment.args) + logging.info("") + logging.info("Job {} - Experiment args: {}".format(i, t.yellow(str(args)))) + return job_spec + + +def logged_job_specs(job_specs: Iterable[JobSpec]) -> Iterable[JobSpec]: + """Generates an iterable of job specs that should be passed to `docker run` to + execute the experiments defined by the supplied iterable. + + """ + for i, s in enumerate(job_specs, 1): + yield log_job_spec_instance(s, i) + + +def execute_dry_run(job_specs: Iterable[JobSpec]) -> None: + """Expands the supplied sequence of experiments into sequences of args and logs + the jobs that WOULD have been executed, had the dry run flag not been + applied. + + """ + list(logged_job_specs(job_specs)) + + logging.info('') + logging.info( + t.yellow("To build your image and execute these jobs, \ +run your command again without {}.".format(c.DRY_RUN_FLAG))) + logging.info('') + return None + + +def local_callback(idx: int, job: Job) -> None: + """Provides logging feedback for jobs run locally. If the return code is 0, + logs success; else, logs the failure as an error and logs the script args + that provided the failure. + + """ + if job.status == JobStatus.SUCCEEDED: + logging.info(t.green(f'Job {idx} succeeded!')) + else: + logging.error( + t.red(f'Job {idx} failed with return code {job.details["ret_code"]}.')) + args = c.experiment_to_args(job.spec.experiment.kwargs, + job.spec.experiment.args) + logging.error(t.red(f'Failing args for job {idx}: {args}')) + + +def window_size_env_cmds(): + """Returns a sequence of `docker run` arguments that will internally configure + the terminal columns and lines, so that progress bars and other terminal + interactions will work properly. 
+ + These aren't required for interactive Docker commands like those triggered by + `caliban shell`. + + """ + ret = [] + cols, lines = _screen_shape_wrapper()(0) + if cols: + ret += ["-e", f"COLUMNS={cols}"] + if lines: + ret += ["-e", f"LINES={lines}"] + return ret + + +# ---------------------------------------------------------------------------- +def _create_job_spec_dict( + experiment: Experiment, + job_mode: c.JobMode, + image_id: str, + run_args: Optional[List[str]] = None, +) -> Dict[str, Any]: + '''creates a job spec dictionary for a local job''' + + # Without the unbuffered environment variable, stderr and stdout won't be + # emitted in the proper order from inside the container. + terminal_cmds = ["-e" "PYTHONUNBUFFERED=1"] + window_size_env_cmds() + + base_cmd = _run_cmd(job_mode, run_args) + terminal_cmds + [image_id] + command = base_cmd + c.experiment_to_args(experiment.kwargs, experiment.args) + return {'command': command, 'container': image_id} + + +# ---------------------------------------------------------------------------- +def execute_jobs( + job_specs: Iterable[JobSpec], + dry_run: bool = False, +): + '''executes a sequence of jobs based on job specs + + Arg: + job_specs: specifications for jobs to be executed + dry_run: if True, only print what would be done + ''' + + with u.tqdm_logging() as orig_stream: + pbar = tqdm.tqdm(logged_job_specs(job_specs), + file=orig_stream, + total=len(job_specs), + ascii=True, + unit="experiment", + desc="Executing") + for idx, job_spec in enumerate(pbar, 1): + command = job_spec.spec['command'] + logging.info(f'Running command: {" ".join(command)}') + if not dry_run: + _, ret_code = u.capture_stdout(command, "", u.TqdmFile(sys.stderr)) + else: + ret_code = 0 + j = Job(spec=job_spec, + container=job_spec.spec['container'], + details={'ret_code': ret_code}, + status=JobStatus.SUCCEEDED if ret_code == 0 else JobStatus.FAILED) + local_callback(idx=idx, job=j) + + if dry_run: + logging.info( + t.yellow(f'\nTo build your image and execute these jobs, ' + f'run your command again without {c.DRY_RUN_FLAG}\n')) + + return None + + +def run_experiments(job_mode: c.JobMode, + run_args: Optional[List[str]] = None, + script_args: Optional[List[str]] = None, + image_id: Optional[str] = None, + dry_run: bool = False, + experiment_config: Optional[c.ExpConf] = None, + xgroup: Optional[str] = None, + **build_image_kwargs) -> None: + """Builds an image using the supplied **build_image_kwargs and calls `docker + run` on the resulting image using sensible defaults. + + Keyword args: + + - job_mode: c.JobMode. + + - run_args: extra arguments to supply to `docker run` after our defaults. + - script_args: extra arguments to supply to the entrypoint. (You can + - override the default container entrypoint by supplying a new one inside + run_args.) + - image_id: ID of the image to run. Supplying this will skip an image build. + - experiment_config: dict of string to list, boolean, string or int. Any + lists will trigger a cartesian product out with the rest of the config. A + job will be executed for every combination of parameters in the experiment + config. + - dry_run: if True, no actual jobs will be executed and docker won't + actually build; logging side effects will show the user what will happen + without dry_run=True. + + any extra kwargs supplied are passed through to build_image. 
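[Illustrative sketch of the cartesian fan-out described above, with hypothetical hyperparameters; the real expansion lives elsewhere in caliban, this only shows the shape of the result:

import itertools

config = {"lr": [0.01, 0.001], "batch_size": 64}
keys = list(config)
values = [v if isinstance(v, list) else [v] for v in config.values()]
for combo in itertools.product(*values):
    print(dict(zip(keys, combo)))
# -> {'lr': 0.01, 'batch_size': 64}
# -> {'lr': 0.001, 'batch_size': 64}
]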
+ """ + if run_args is None: + run_args = [] + + if script_args is None: + script_args = [] + + if experiment_config is None: + experiment_config = {} + + docker_args = {k: v for k, v in build_image_kwargs.items()} + docker_args['job_mode'] = job_mode + + engine = get_mem_engine() if dry_run else get_sql_engine() + + with session_scope(engine) as session: + container_spec = generate_container_spec(session, docker_args, image_id) + + if image_id is None: + if dry_run: + logging.info("Dry run - skipping actual 'docker build'.") + image_id = 'dry_run_tag' + else: + image_id = build_image(**docker_args) + + experiments = create_experiments( + session=session, + container_spec=container_spec, + script_args=script_args, + experiment_config=experiment_config, + xgroup=xgroup, + ) + + job_specs = [ + JobSpec.get_or_create( + experiment=x, + spec=_create_job_spec_dict( + experiment=x, + job_mode=job_mode, + run_args=run_args, + image_id=image_id, + ), + platform=Platform.LOCAL, + ) for x in experiments + ] + + try: + execute_jobs(job_specs=job_specs, dry_run=dry_run) + except Exception as e: + logging.error(f'exception: {e}') + session.commit() # commit here, otherwise will be rolled back + + +def run(job_mode: c.JobMode, + run_args: Optional[List[str]] = None, + script_args: Optional[List[str]] = None, + image_id: Optional[str] = None, + **build_image_kwargs) -> None: + """Builds an image using the supplied **build_image_kwargs and calls `docker + run` on the resulting image using sensible defaults. + Keyword args: + - job_mode: c.JobMode. + - run_args: extra arguments to supply to `docker run` after our defaults. + - script_args: extra arguments to supply to the entrypoint. (You can + - override the default container entrypoint by supplying a new one inside + run_args.) + - image_id: ID of the image to run. Supplying this will skip an image build. + any extra kwargs supplied are passed through to build_image. + """ + if run_args is None: + run_args = [] + + if script_args is None: + script_args = [] + + if image_id is None: + image_id = build_image(job_mode, **build_image_kwargs) + + base_cmd = _run_cmd(job_mode, run_args) + + command = base_cmd + [image_id] + script_args + + logging.info("Running command: {}".format(' '.join(command))) + subprocess.call(command) + return None + + +def run_interactive(job_mode: c.JobMode, + workdir: Optional[str] = None, + image_id: Optional[str] = None, + run_args: Optional[List[str]] = None, + mount_home: Optional[bool] = None, + shell: Optional[Shell] = None, + entrypoint: Optional[str] = None, + entrypoint_args: Optional[List[str]] = None, + **build_image_kwargs) -> None: + """Start a live shell in the terminal, with all dependencies installed and the + current working directory (and optionally the user's home directory) mounted. + + Keyword args: + + - job_mode: c.JobMode. + - image_id: ID of the image to run. Supplying this will skip an image build. + - run_args: extra arguments to supply to `docker run`. + - mount_home: if true, mounts the user's $HOME directory into the container + to `/home/$USERNAME`. If False, nothing. + - shell: name of the shell to install into the container. Also configures the + entrypoint if that's not supplied. + - entrypoint: command to run. Defaults to the executable command for the + supplied shell. + - entrypoint_args: extra arguments to supply to the entrypoint. + + any extra kwargs supplied are passed through to build_image. 
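[Illustrative sketch of the flags an interactive CPU-mode session ends up with, assuming the helpers defined earlier in this module and hypothetical paths/IDs:

workdir = "/usr/app"
flags = (_interactive_opts(workdir)              # -w, -u <uid>:<gid>, -v $PWD:<workdir>
         + ["-it", "--entrypoint", "/bin/bash"]
         + _home_mount_cmds(True))               # -v $HOME:/home/<user>
# `run` then prepends ["docker", "run", "--ipc", "host"] (plus the nvidia
# runtime in GPU mode) and appends the image id and any entrypoint args.
]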
+ + """ + if workdir is None: + workdir = DEFAULT_WORKDIR + + if run_args is None: + run_args = [] + + if entrypoint_args is None: + entrypoint_args = [] + + if mount_home is None: + mount_home = True + + if shell is None: + # Only set a default shell if we're also mounting the home volume. + # Otherwise a custom shell won't have access to the user's profile. + shell = default_shell() if mount_home else Shell.bash + + if entrypoint is None: + entrypoint = SHELL_DICT[shell].executable + + interactive_run_args = _interactive_opts(workdir) + [ + "-it", \ + "--entrypoint", entrypoint + ] + _home_mount_cmds(mount_home) + run_args + + run(job_mode=job_mode, + run_args=interactive_run_args, + script_args=entrypoint_args, + image_id=image_id, + shell=shell, + workdir=workdir, + **build_image_kwargs) + + +def run_notebook(job_mode: c.JobMode, + port: Optional[int] = None, + lab: Optional[bool] = None, + version: Optional[bool] = None, + run_args: Optional[List[str]] = None, + **run_interactive_kwargs) -> None: + """Start a notebook in the current working directory; the process will run + inside of a Docker container that's identical to the environment available to + Cloud jobs that are submitted by `caliban cloud`, or local jobs run with + `caliban run.` + + if you pass mount_home=True your jupyter settings will persist across calls. + + Keyword args: + + - port: the port to pass to Jupyter when it boots, useful if you have + multiple instances running on one machine. + - lab: if True, starts jupyter lab, else jupyter notebook. + - version: explicit Jupyter version to install. + + run_interactive_kwargs are all extra arguments taken by run_interactive. + + """ + + if port is None: + port = u.next_free_port(8888) + + if lab is None: + lab = False + + if run_args is None: + run_args = [] + + inject_arg = NotebookInstall.lab if lab else NotebookInstall.jupyter + jupyter_cmd = "lab" if lab else "notebook" + jupyter_args = [ + "-m", "jupyter", jupyter_cmd, \ + "--ip=0.0.0.0", \ + "--port={}".format(port), \ + "--no-browser" + ] + docker_args = ["-p", "{}:{}".format(port, port)] + run_args + + run_interactive(job_mode, + entrypoint="/opt/conda/envs/caliban/bin/python", + entrypoint_args=jupyter_args, + run_args=docker_args, + inject_notebook=inject_arg, + jupyter_version=version, + **run_interactive_kwargs) diff --git a/caliban/platform/shell.py b/caliban/platform/shell.py new file mode 100644 index 0000000..88c4d02 --- /dev/null +++ b/caliban/platform/shell.py @@ -0,0 +1,1056 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions required to interact with Docker to build and run images, shells +and notebooks in a Docker environment. 
+ +""" + +from __future__ import absolute_import, division, print_function + +import json +import os +import subprocess +import sys +from enum import Enum +from pathlib import Path +from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, NewType, + Optional, Union) + +import tqdm +from absl import logging +from blessings import Terminal +from tqdm.utils import _screen_shape_wrapper + +import caliban.config as c +import caliban.util as u +from caliban.history.types import Experiment, Job, JobSpec, JobStatus, Platform +from caliban.history.utils import (create_experiments, generate_container_spec, + get_mem_engine, get_sql_engine, + session_scope) + +t = Terminal() + +DEV_CONTAINER_ROOT = "gcr.io/blueshift-playground/blueshift" +TF_VERSIONS = {"2.2.0", "1.12.3", "1.14.0", "1.15.0"} +DEFAULT_WORKDIR = "/usr/app" +CREDS_DIR = "/.creds" +CONDA_BIN = "/opt/conda/bin/conda" + +ImageId = NewType('ImageId', str) +ArgSeq = NewType('ArgSeq', List[str]) + + +class DockerError(Exception): + """Exception that passes info on a failed Docker command.""" + + def __init__(self, message, cmd, ret_code): + super().__init__(message) + self.message = message + self.cmd = cmd + self.ret_code = ret_code + + @property + def command(self): + return " ".join(self.cmd) + + +class NotebookInstall(Enum): + """Flag to decide what to do .""" + none = 'none' + lab = 'lab' + jupyter = 'jupyter' + + def __str__(self) -> str: + return self.value + + +class Shell(Enum): + """Add new shells here and below, in SHELL_DICT.""" + bash = 'bash' + zsh = 'zsh' + + def __str__(self) -> str: + return self.value + + +# Tuple to track the information required to install and execute some custom +# shell into a container. +ShellData = NamedTuple("ShellData", [("executable", str), + ("packages", List[str])]) + + +def apt_install(*packages: str) -> str: + """Returns a command that will install the supplied list of packages without + requiring confirmation or any user interaction. + """ + package_str = ' '.join(packages) + no_prompt = "DEBIAN_FRONTEND=noninteractive" + return f"{no_prompt} apt-get install --yes --no-install-recommends {package_str}" + + +def apt_command(commands: List[str]) -> List[str]: + """Pre-and-ap-pends the supplied commands with the appropriate in-container and + cleanup command for aptitude. + + """ + update = ["apt-get update"] + cleanup = ["apt-get clean", "rm -rf /var/lib/apt/lists/*"] + return update + commands + cleanup + + +# Dict linking a particular supported shell to the data required to run and +# install the shell inside a container. +# +# : Dict[Shell, ShellData] +SHELL_DICT = { + Shell.bash: ShellData("/bin/bash", []), + Shell.zsh: ShellData("/bin/zsh", ["zsh"]) +} + + +def default_shell() -> Shell: + """Returns the shell to load into the container. Defaults to Shell.bash, but if + the user's SHELL variable refers to a supported sub-shell, returns that + instead. + + """ + ret = Shell.bash + + if "zsh" in os.environ.get("SHELL"): + ret = Shell.zsh + + return ret + + +def adc_location(home_dir: Optional[str] = None) -> str: + """Returns the location for application default credentials, INSIDE the + container (so, hardcoded unix separators), given the supplied home directory. + + """ + if home_dir is None: + home_dir = Path.home() + + return "{}/.config/gcloud/application_default_credentials.json".format( + home_dir) + + +def container_home(): + """Returns the location of the home directory inside the generated + container. 
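For reference, a small sketch of what the `apt_install`/`apt_command` helpers above produce once joined into a single Dockerfile `RUN` line; the package names are hypothetical, and the `" && "` join mirrors how `_custom_packages` later stitches the commands together:

    commands = apt_command([apt_install("zsh", "git")])
    print(" && ".join(commands))
    # apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes \
    #   --no-install-recommends zsh git && apt-get clean && \
    #   rm -rf /var/lib/apt/lists/*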
+ + """ + return "/home/{}".format(u.current_user()) + + +def tf_base_image(job_mode: c.JobMode, tensorflow_version: str) -> str: + """Returns the base image to use, depending on whether or not we're using a + GPU. This is JUST for building our base images for Blueshift; not for + actually using in a job. + + List of available tags: https://hub.docker.com/r/tensorflow/tensorflow/tags + + """ + if tensorflow_version not in TF_VERSIONS: + raise Exception("""{} is not a valid tensorflow version. + Try one of: {}""".format(tensorflow_version, TF_VERSIONS)) + + gpu = "-gpu" if c.gpu(job_mode) else "" + return "tensorflow/tensorflow:{}{}-py3".format(tensorflow_version, gpu) + + +def base_image_suffix(job_mode: c.JobMode) -> str: + return "gpu" if c.gpu(job_mode) else "cpu" + + +def base_image_id(job_mode: c.JobMode) -> str: + """Returns the default base image for all caliban Dockerfiles.""" + base_suffix = base_image_suffix(job_mode) + return "{}:{}".format(DEV_CONTAINER_ROOT, base_suffix) + + +def extras_string(extras: List[str]) -> str: + """Returns the argument passed to `pip install` to install a project from its + setup.py and target a specific set of extras_require dependencies. + + Args: + extras: (potentially empty) list of extra_requires deps. + """ + ret = "." + if len(extras) > 0: + ret += "[{}]".format(','.join(extras)) + return ret + + +def base_extras(job_mode: c.JobMode, path: str, + extras: Optional[List[str]]) -> Optional[List[str]]: + """Returns None if the supplied path doesn't exist (it's assumed it points to a + setup.py file). + + If the path DOES exist, generates a list of extras to install. gpu or cpu are + always added to the beginning of the list, depending on the mode. + + """ + ret = None + + if os.path.exists(path): + base = extras or [] + extra = 'gpu' if c.gpu(job_mode) else 'cpu' + ret = base if extra in base else [extra] + base + + return ret + + +def _dependency_entries(workdir: str, + user_id: int, + user_group: int, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None) -> str: + """Returns the Dockerfile entries required to install dependencies from either: + + - a requirements.txt file, path supplied by requirements_path + - a conda environment.yml file, path supplied by conda_env_path. + - a setup.py file, if some sequence of dependencies is supplied. + + An empty list for setup_extras means, run `pip install -c .` with no extras. + None for this argument means do nothing. If a list of strings is supplied, + they'll be treated as extras dependency sets. + """ + ret = "" + + if setup_extras is not None: + ret += f""" +COPY --chown={user_id}:{user_group} setup.py {workdir} +RUN /bin/bash -c "pip install --no-cache-dir {extras_string(setup_extras)}" +""" + + if conda_env_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {conda_env_path} {workdir} +RUN /bin/bash -c "{CONDA_BIN} env update \ + --quiet --name caliban \ + --file {conda_env_path} && \ + {CONDA_BIN} clean -y -q --all" +""" + + if requirements_path is not None: + ret += f""" +COPY --chown={user_id}:{user_group} {requirements_path} {workdir} +RUN /bin/bash -c "pip install --no-cache-dir -r {requirements_path}" +""" + + return ret + + +def _package_entries(workdir: str, user_id: int, user_group: int, + package: u.Package) -> str: + """Returns the Dockerfile entries required to: + + - copy a directory of code into a docker container + - inject an entrypoint that executes a python module inside that directory. 
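A quick sketch of how `base_extras` and `extras_string` combine into the `pip install` target used below, assuming a GPU-mode job (written here as `c.JobMode.GPU`), a setup.py that exists at the supplied path, and a hypothetical `tpu` extra:

    extras = base_extras(c.JobMode.GPU, "setup.py", ["tpu"])  # -> ["gpu", "tpu"]
    print(extras_string(extras))                              # -> .[gpu,tpu]
    # The generated Dockerfile line is then:
    #   RUN /bin/bash -c "pip install --no-cache-dir .[gpu,tpu]"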
+ + Python code runs as modules vs scripts so that we can enforce import hygiene + between files inside a project. + + """ + owner = "{}:{}".format(user_id, user_group) + + arg = package.main_module or package.script_path + + # This needs to use json so that quotes print as double quotes, not single + # quotes. + entrypoint_s = json.dumps(package.executable + [arg]) + + return """ +# Copy project code into the docker container. +COPY --chown={owner} {package_path} {workdir}/{package_path} + +# Declare an entrypoint that actually runs the container. +ENTRYPOINT {entrypoint_s} + """.format_map({ + "owner": owner, + "package_path": package.package_path, + "workdir": workdir, + "entrypoint_s": entrypoint_s + }) + + +def _service_account_entry(user_id: int, user_group: int, credentials_path: str, + docker_credentials_dir: str, + write_adc_placeholder: bool): + """Generates the Dockerfile entries required to transfer a set of Cloud service + account credentials into the Docker container. + + NOTE the write_adc_placeholder variable is here because the "ctpu" script + that we use to interact with TPUs has a bug in it, as of 1/21/2020, where the + script will fail if the application_default_credentials.json file isn't + present, EVEN THOUGH it properly uses the service account credentials + registered with gcloud instead of ADC creds. + + If a service account is present, we write a placeholder string to get past + this problem. This shouldn't matter for anyone else since adc isn't used if a + service account is present. + + """ + container_creds = "{}/credentials.json".format(docker_credentials_dir) + ret = """ +COPY --chown={user_id}:{user_group} {credentials_path} {container_creds} + +# Use the credentials file to activate gcloud, gsutil inside the container. +RUN gcloud auth activate-service-account --key-file={container_creds} && \ + git config --global credential.'https://source.developers.google.com'.helper gcloud.sh + +ENV GOOGLE_APPLICATION_CREDENTIALS={container_creds} +""".format_map({ + "user_id": user_id, + "user_group": user_group, + "credentials_path": credentials_path, + "container_creds": container_creds + }) + + if write_adc_placeholder: + ret += """ +RUN echo "placeholder" >> {} +""".format(adc_location(container_home())) + + return ret + + +def _adc_entry(user_id: int, user_group: int, adc_path: str): + """Returns the Dockerfile line required to transfer the + application_default_credentials.json file into the container's home + directory. + + """ + return """ +COPY --chown={user_id}:{user_group} {adc_path} {adc_loc} + """.format_map({ + "user_id": user_id, + "user_group": user_group, + "adc_path": adc_path, + "adc_loc": adc_location(container_home()) + }) + + +def _credentials_entries(user_id: int, + user_group: int, + adc_path: Optional[str], + credentials_path: Optional[str], + docker_credentials_dir: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to copy a user's Cloud credentials + into the Docker container. + + - adc_path is the relative path inside the current directory to an + application_default_credentials.json file containing... well, you get it. + - credentials_path is the relative path inside the current directory to a + JSON credentials file. + - docker_credentials_dir is the relative path inside the docker container + where the JSON file will be copied on build. 
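The `json.dumps` call in `_package_entries` above is what keeps the generated ENTRYPOINT in Docker's exec (JSON) form, which requires double quotes; a sketch with a hypothetical main module:

    import json

    executable = ["python", "-m"]
    arg = "trainer.train"  # hypothetical main module
    print(json.dumps(executable + [arg]))
    # ["python", "-m", "trainer.train"]
    # ...rendered into the Dockerfile as: ENTRYPOINT ["python", "-m", "trainer.train"]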
+ + """ + if docker_credentials_dir is None: + docker_credentials_dir = CREDS_DIR + + ret = "" + if credentials_path is not None: + ret += _service_account_entry(user_id, + user_group, + credentials_path, + docker_credentials_dir, + write_adc_placeholder=adc_path is None) + + if adc_path is not None: + ret += _adc_entry(user_id, user_group, adc_path) + + return ret + + +def _notebook_entries(lab: bool = False, version: Optional[str] = None) -> str: + """Returns the Dockerfile entries necessary to install Jupyter{lab}. + + Optionally takes a version string. + + """ + version_suffix = "" + + if version is not None: + version_suffix = "=={}".format(version) + + library = "jupyterlab" if lab else "jupyter" + + return """ +RUN pip install {}{} +""".format(library, version_suffix) + + +def _custom_packages( + user_id: int, + user_group: int, + packages: Optional[List[str]] = None, + shell: Optional[Shell] = None, +) -> str: + """Returns the Dockerfile entries necessary to install custom dependencies for + the supplied shell and sequence of aptitude packages. + + """ + if packages is None: + packages = [] + + if shell is None: + shell = Shell.bash + + ret = "" + + to_install = sorted(packages + SHELL_DICT[shell].packages) + + if len(to_install) != 0: + commands = apt_command([apt_install(*to_install)]) + ret = """ +USER root + +RUN {commands} + +USER {user_id}:{user_group} +""".format_map({ + "commands": " && ".join(commands), + "user_id": user_id, + "user_group": user_group + }) + + return ret + + +def _copy_dir_entry(workdir: str, user_id: int, user_group: int, + dirname: str) -> str: + """Returns the Dockerfile entry necessary to copy a single extra subdirectory + from the current directory into a docker container during build. + + """ + owner = "{}:{}".format(user_id, user_group) + return """# Copy {dirname} into the Docker container. +COPY --chown={owner} {dirname} {workdir}/{dirname} +""".format_map({ + "owner": owner, + "workdir": workdir, + "dirname": dirname + }) + + +def _extra_dir_entries(workdir: str, user_id: int, user_group: int, + extra_dirs: List[str]) -> str: + """Returns the Dockerfile entries necessary to copy all directories in the + extra_dirs list into a docker container during build. 
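A sketch of the fragment `_notebook_entries` above emits, with a hypothetical pinned version:

    print(_notebook_entries(lab=True, version="2.1.4"))
    # RUN pip install jupyterlab==2.1.4
    print(_notebook_entries(lab=False))
    # RUN pip install jupyter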
+ + """ + ret = "" + for d in extra_dirs: + ret += "\n{}".format(_copy_dir_entry(workdir, user_id, user_group, d)) + return ret + + +def _dockerfile_template( + job_mode: c.JobMode, + workdir: Optional[str] = None, + base_image_fn: Optional[Callable[[c.JobMode], str]] = None, + package: Optional[Union[List, u.Package]] = None, + requirements_path: Optional[str] = None, + conda_env_path: Optional[str] = None, + setup_extras: Optional[List[str]] = None, + adc_path: Optional[str] = None, + credentials_path: Optional[str] = None, + jupyter_version: Optional[str] = None, + inject_notebook: NotebookInstall = NotebookInstall.none, + shell: Optional[Shell] = None, + extra_dirs: Optional[List[str]] = None, + caliban_config: Optional[Dict[str, Any]] = None) -> str: + """Returns a Dockerfile that builds on a local CPU or GPU base image (depending + on the value of job_mode) to create a container that: + + - installs any dependency specified in a requirements.txt file living at + requirements_path, a conda environment at conda_env_path, or any + dependencies in a setup.py file, including extra dependencies, if + setup_extras isn't None + - injects gcloud credentials into the container, so Cloud interaction works + just like it does locally + - potentially installs a custom shell, or jupyterlab for notebook support + - copies all source needed by the main module specified by package, and + potentially injects an entrypoint that, on run, will run that main module + + Most functions that call _dockerfile_template pass along any kwargs that they + receive. It should be enough to add kwargs here, then rely on that mechanism + to pass them along, vs adding kwargs all the way down the call chain. + + Supply a custom base_image_fn (function from job_mode -> image ID) to inject + more complex Docker commands into the Caliban environments by, for example, + building your own image on top of the TF base images, then using that. + + """ + uid = os.getuid() + gid = os.getgid() + username = u.current_user() + + if isinstance(package, list): + package = u.Package(*package) + + if workdir is None: + workdir = DEFAULT_WORKDIR + + if base_image_fn is None: + base_image_fn = base_image_id + + base_image = base_image_fn(job_mode) + + dockerfile = """ +FROM {base_image} + +# Create the same group we're using on the host machine. +RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid} + +# Create the user by name. --no-log-init guards against a crash with large user +# IDs. +RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username} + +# The directory is created by root. This sets permissions so that any user can +# access the folder. 
+RUN mkdir -m 777 {workdir} {creds_dir} {c_home} + +ENV HOME={c_home} + +WORKDIR {workdir} + +USER {uid}:{gid} +""".format_map({ + "base_image": base_image, + "username": username, + "uid": uid, + "gid": gid, + "workdir": workdir, + "c_home": container_home(), + "creds_dir": CREDS_DIR + }) + dockerfile += _credentials_entries(uid, + gid, + adc_path=adc_path, + credentials_path=credentials_path) + + dockerfile += _dependency_entries(workdir, + uid, + gid, + requirements_path=requirements_path, + conda_env_path=conda_env_path, + setup_extras=setup_extras) + + if inject_notebook.value != 'none': + install_lab = inject_notebook == NotebookInstall.lab + dockerfile += _notebook_entries(lab=install_lab, version=jupyter_version) + + if extra_dirs is not None: + dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs) + + dockerfile += _custom_packages(uid, + gid, + packages=c.apt_packages( + caliban_config, job_mode), + shell=shell) + + if package is not None: + # The actual entrypoint and final copied code. + dockerfile += _package_entries(workdir, uid, gid, package) + + return dockerfile + + +def docker_image_id(output: str) -> ImageId: + """Accepts a string containing the output of a successful `docker build` + command and parses the Docker image ID from the stream. + + NOTE this is probably quite brittle! I can imagine this breaking quite easily + on a Docker upgrade. + + """ + return ImageId(output.splitlines()[-1].split()[-1]) + + +def build_image(job_mode: c.JobMode, + build_path: str, + credentials_path: Optional[str] = None, + adc_path: Optional[str] = None, + no_cache: bool = False, + **kwargs) -> str: + """Builds a Docker image by generating a Dockerfile and passing it to `docker + build` via stdin. All output from the `docker build` process prints to + stdout. + + Returns the image ID of the new docker container; if the command fails, + throws on error with information about the command and any issues that caused + the problem. + + """ + with u.TempCopy(credentials_path, + tmp_name=".caliban_default_creds.json") as creds: + with u.TempCopy(adc_path, tmp_name=".caliban_adc_creds.json") as adc: + cache_args = ["--no-cache"] if no_cache else [] + cmd = ["docker", "build"] + cache_args + ["--rm", "-f-", build_path] + + dockerfile = _dockerfile_template(job_mode, + credentials_path=creds, + adc_path=adc, + **kwargs) + + joined_cmd = " ".join(cmd) + logging.info("Running command: {}".format(joined_cmd)) + + try: + output, ret_code = u.capture_stdout(cmd, input_str=dockerfile) + if ret_code == 0: + return docker_image_id(output) + else: + error_msg = "Docker failed with error code {}.".format(ret_code) + raise DockerError(error_msg, cmd, ret_code) + + except subprocess.CalledProcessError as e: + logging.error(e.output) + logging.error(e.stderr) + + +def _image_tag_for_project(project_id: str, image_id: str) -> str: + """Generate the GCR Docker image tag for the supplied pair of project_id and + image_id. + + This function properly handles "domain scoped projects", where the project ID + contains a domain name and project ID separated by : + https://cloud.google.com/container-registry/docs/overview#domain-scoped_projects. + + """ + project_s = project_id.replace(":", "/") + return "gcr.io/{}/{}:latest".format(project_s, image_id) + + +def push_uuid_tag(project_id: str, image_id: str) -> str: + """Takes a base image and tags it for upload, then pushes it to a remote Google + Container Registry. + + Returns the tag on a successful push. 
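To make the domain-scoped handling concrete, a sketch of `_image_tag_for_project` with hypothetical project and image IDs:

    print(_image_tag_for_project("my-project", "ab12cd34"))
    # gcr.io/my-project/ab12cd34:latest
    print(_image_tag_for_project("example.com:research", "ab12cd34"))
    # gcr.io/example.com/research/ab12cd34:latest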
+ + TODO should this just check first before attempting to push if the image + exists? Immutable names means that if the tag is up there, we're done. + Potentially use docker-py for this. + + """ + image_tag = _image_tag_for_project(project_id, image_id) + subprocess.run(["docker", "tag", image_id, image_tag], check=True) + subprocess.run(["docker", "push", image_tag], check=True) + return image_tag + + +def _run_cmd(job_mode: c.JobMode, + run_args: Optional[List[str]] = None) -> List[str]: + """Returns the sequence of commands for the subprocess run functions required + to execute `docker run`. in CPU or GPU mode, depending on the value of + job_mode. + + Keyword args: + - run_args: list of args to pass to docker run. + + """ + if run_args is None: + run_args = [] + + runtime = ["--runtime", "nvidia"] if c.gpu(job_mode) else [] + return ["docker", "run"] + runtime + ["--ipc", "host"] + run_args + + +def _home_mount_cmds(enable_home_mount: bool) -> List[str]: + """Returns the argument needed by Docker to mount a user's local home directory + into the home directory location inside their container. + + If enable_home_mount is false returns an empty list. + + """ + ret = [] + if enable_home_mount: + ret = ["-v", "{}:{}".format(Path.home(), container_home())] + return ret + + +def _interactive_opts(workdir: str) -> List[str]: + """Returns the basic arguments we want to run a docker process locally. + + """ + return [ + "-w", workdir, \ + "-u", "{}:{}".format(os.getuid(), os.getgid()), \ + "-v", "{}:{}".format(os.getcwd(), workdir) \ + ] + + +def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec: + """Prints logging as a side effect for the supplied sequence of job specs + generated from an experiment definition; returns the input job spec. + + """ + args = c.experiment_to_args(job_spec.experiment.kwargs, + job_spec.experiment.args) + logging.info("") + logging.info("Job {} - Experiment args: {}".format(i, t.yellow(str(args)))) + return job_spec + + +def logged_job_specs(job_specs: Iterable[JobSpec]) -> Iterable[JobSpec]: + """Generates an iterable of job specs that should be passed to `docker run` to + execute the experiments defined by the supplied iterable. + + """ + for i, s in enumerate(job_specs, 1): + yield log_job_spec_instance(s, i) + + +def execute_dry_run(job_specs: Iterable[JobSpec]) -> None: + """Expands the supplied sequence of experiments into sequences of args and logs + the jobs that WOULD have been executed, had the dry run flag not been + applied. + + """ + list(logged_job_specs(job_specs)) + + logging.info('') + logging.info( + t.yellow("To build your image and execute these jobs, \ +run your command again without {}.".format(c.DRY_RUN_FLAG))) + logging.info('') + return None + + +def local_callback(idx: int, job: Job) -> None: + """Provides logging feedback for jobs run locally. If the return code is 0, + logs success; else, logs the failure as an error and logs the script args + that provided the failure. + + """ + if job.status == JobStatus.SUCCEEDED: + logging.info(t.green(f'Job {idx} succeeded!')) + else: + logging.error( + t.red(f'Job {idx} failed with return code {job.details["ret_code"]}.')) + args = c.experiment_to_args(job.spec.experiment.kwargs, + job.spec.experiment.args) + logging.error(t.red(f'Failing args for job {idx}: {args}')) + + +def window_size_env_cmds(): + """Returns a sequence of `docker run` arguments that will internally configure + the terminal columns and lines, so that progress bars and other terminal + interactions will work properly. 
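Putting `_run_cmd`, `_interactive_opts` and `_home_mount_cmds` together, this is roughly the interactive CPU-mode invocation before the image ID and entrypoint args are appended; the uid, user name and paths below depend entirely on the caller's environment and are illustrative only:

    opts = _interactive_opts("/usr/app") + ["-it", "--entrypoint", "/bin/bash"]
    opts += _home_mount_cmds(True)
    print(" ".join(_run_cmd(c.JobMode.CPU, opts)))
    # docker run --ipc host -w /usr/app -u 1000:1000 \
    #   -v /home/alice/project:/usr/app -it --entrypoint /bin/bash \
    #   -v /home/alice:/home/alice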
+ + These aren't required for interactive Docker commands like those triggered by + `caliban shell`. + + """ + ret = [] + cols, lines = _screen_shape_wrapper()(0) + if cols: + ret += ["-e", f"COLUMNS={cols}"] + if lines: + ret += ["-e", f"LINES={lines}"] + return ret + + +# ---------------------------------------------------------------------------- +def _create_job_spec_dict( + experiment: Experiment, + job_mode: c.JobMode, + image_id: str, + run_args: Optional[List[str]] = None, +) -> Dict[str, Any]: + '''creates a job spec dictionary for a local job''' + + # Without the unbuffered environment variable, stderr and stdout won't be + # emitted in the proper order from inside the container. + terminal_cmds = ["-e" "PYTHONUNBUFFERED=1"] + window_size_env_cmds() + + base_cmd = _run_cmd(job_mode, run_args) + terminal_cmds + [image_id] + command = base_cmd + c.experiment_to_args(experiment.kwargs, experiment.args) + return {'command': command, 'container': image_id} + + +# ---------------------------------------------------------------------------- +def execute_jobs( + job_specs: Iterable[JobSpec], + dry_run: bool = False, +): + '''executes a sequence of jobs based on job specs + + Arg: + job_specs: specifications for jobs to be executed + dry_run: if True, only print what would be done + ''' + + with u.tqdm_logging() as orig_stream: + pbar = tqdm.tqdm(logged_job_specs(job_specs), + file=orig_stream, + total=len(job_specs), + ascii=True, + unit="experiment", + desc="Executing") + for idx, job_spec in enumerate(pbar, 1): + command = job_spec.spec['command'] + logging.info(f'Running command: {" ".join(command)}') + if not dry_run: + _, ret_code = u.capture_stdout(command, "", u.TqdmFile(sys.stderr)) + else: + ret_code = 0 + j = Job(spec=job_spec, + container=job_spec.spec['container'], + details={'ret_code': ret_code}, + status=JobStatus.SUCCEEDED if ret_code == 0 else JobStatus.FAILED) + local_callback(idx=idx, job=j) + + if dry_run: + logging.info( + t.yellow(f'\nTo build your image and execute these jobs, ' + f'run your command again without {c.DRY_RUN_FLAG}\n')) + + return None + + +def run_experiments(job_mode: c.JobMode, + run_args: Optional[List[str]] = None, + script_args: Optional[List[str]] = None, + image_id: Optional[str] = None, + dry_run: bool = False, + experiment_config: Optional[c.ExpConf] = None, + xgroup: Optional[str] = None, + **build_image_kwargs) -> None: + """Builds an image using the supplied **build_image_kwargs and calls `docker + run` on the resulting image using sensible defaults. + + Keyword args: + + - job_mode: c.JobMode. + + - run_args: extra arguments to supply to `docker run` after our defaults. + - script_args: extra arguments to supply to the entrypoint. (You can + - override the default container entrypoint by supplying a new one inside + run_args.) + - image_id: ID of the image to run. Supplying this will skip an image build. + - experiment_config: dict of string to list, boolean, string or int. Any + lists will trigger a cartesian product out with the rest of the config. A + job will be executed for every combination of parameters in the experiment + config. + - dry_run: if True, no actual jobs will be executed and docker won't + actually build; logging side effects will show the user what will happen + without dry_run=True. + + any extra kwargs supplied are passed through to build_image. 
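A sketch of the dictionary `_create_job_spec_dict` builds for one expanded experiment, assuming `c.experiment_to_args` renders keyword args as `--flag value` pairs; the image ID and hyperparameters are hypothetical. Note that `"-e" "PYTHONUNBUFFERED=1"` relies on implicit string concatenation, so the command carries the single element `-ePYTHONUNBUFFERED=1`, a spelling docker accepts:

    image_id = "ab12cd34ef56"
    spec = {
        "command": [
            "docker", "run", "--ipc", "host",
            "-ePYTHONUNBUFFERED=1",  # plus COLUMNS/LINES flags when detectable
            image_id,
            "--learning_rate", "0.01", "--batch_size", "64",
        ],
        "container": image_id,
    }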
+ """ + if run_args is None: + run_args = [] + + if script_args is None: + script_args = [] + + if experiment_config is None: + experiment_config = {} + + docker_args = {k: v for k, v in build_image_kwargs.items()} + docker_args['job_mode'] = job_mode + + engine = get_mem_engine() if dry_run else get_sql_engine() + + with session_scope(engine) as session: + container_spec = generate_container_spec(session, docker_args, image_id) + + if image_id is None: + if dry_run: + logging.info("Dry run - skipping actual 'docker build'.") + image_id = 'dry_run_tag' + else: + image_id = build_image(**docker_args) + + experiments = create_experiments( + session=session, + container_spec=container_spec, + script_args=script_args, + experiment_config=experiment_config, + xgroup=xgroup, + ) + + job_specs = [ + JobSpec.get_or_create( + experiment=x, + spec=_create_job_spec_dict( + experiment=x, + job_mode=job_mode, + run_args=run_args, + image_id=image_id, + ), + platform=Platform.LOCAL, + ) for x in experiments + ] + + try: + execute_jobs(job_specs=job_specs, dry_run=dry_run) + except Exception as e: + logging.error(f'exception: {e}') + session.commit() # commit here, otherwise will be rolled back + + +def run(job_mode: c.JobMode, + run_args: Optional[List[str]] = None, + script_args: Optional[List[str]] = None, + image_id: Optional[str] = None, + **build_image_kwargs) -> None: + """Builds an image using the supplied **build_image_kwargs and calls `docker + run` on the resulting image using sensible defaults. + Keyword args: + - job_mode: c.JobMode. + - run_args: extra arguments to supply to `docker run` after our defaults. + - script_args: extra arguments to supply to the entrypoint. (You can + - override the default container entrypoint by supplying a new one inside + run_args.) + - image_id: ID of the image to run. Supplying this will skip an image build. + any extra kwargs supplied are passed through to build_image. + """ + if run_args is None: + run_args = [] + + if script_args is None: + script_args = [] + + if image_id is None: + image_id = build_image(job_mode, **build_image_kwargs) + + base_cmd = _run_cmd(job_mode, run_args) + + command = base_cmd + [image_id] + script_args + + logging.info("Running command: {}".format(' '.join(command))) + subprocess.call(command) + return None + + +def run_interactive(job_mode: c.JobMode, + workdir: Optional[str] = None, + image_id: Optional[str] = None, + run_args: Optional[List[str]] = None, + mount_home: Optional[bool] = None, + shell: Optional[Shell] = None, + entrypoint: Optional[str] = None, + entrypoint_args: Optional[List[str]] = None, + **build_image_kwargs) -> None: + """Start a live shell in the terminal, with all dependencies installed and the + current working directory (and optionally the user's home directory) mounted. + + Keyword args: + + - job_mode: c.JobMode. + - image_id: ID of the image to run. Supplying this will skip an image build. + - run_args: extra arguments to supply to `docker run`. + - mount_home: if true, mounts the user's $HOME directory into the container + to `/home/$USERNAME`. If False, nothing. + - shell: name of the shell to install into the container. Also configures the + entrypoint if that's not supplied. + - entrypoint: command to run. Defaults to the executable command for the + supplied shell. + - entrypoint_args: extra arguments to supply to the entrypoint. + + any extra kwargs supplied are passed through to build_image. 
+ + """ + if workdir is None: + workdir = DEFAULT_WORKDIR + + if run_args is None: + run_args = [] + + if entrypoint_args is None: + entrypoint_args = [] + + if mount_home is None: + mount_home = True + + if shell is None: + # Only set a default shell if we're also mounting the home volume. + # Otherwise a custom shell won't have access to the user's profile. + shell = default_shell() if mount_home else Shell.bash + + if entrypoint is None: + entrypoint = SHELL_DICT[shell].executable + + interactive_run_args = _interactive_opts(workdir) + [ + "-it", \ + "--entrypoint", entrypoint + ] + _home_mount_cmds(mount_home) + run_args + + run(job_mode=job_mode, + run_args=interactive_run_args, + script_args=entrypoint_args, + image_id=image_id, + shell=shell, + workdir=workdir, + **build_image_kwargs) + + +def run_notebook(job_mode: c.JobMode, + port: Optional[int] = None, + lab: Optional[bool] = None, + version: Optional[bool] = None, + run_args: Optional[List[str]] = None, + **run_interactive_kwargs) -> None: + """Start a notebook in the current working directory; the process will run + inside of a Docker container that's identical to the environment available to + Cloud jobs that are submitted by `caliban cloud`, or local jobs run with + `caliban run.` + + if you pass mount_home=True your jupyter settings will persist across calls. + + Keyword args: + + - port: the port to pass to Jupyter when it boots, useful if you have + multiple instances running on one machine. + - lab: if True, starts jupyter lab, else jupyter notebook. + - version: explicit Jupyter version to install. + + run_interactive_kwargs are all extra arguments taken by run_interactive. + + """ + + if port is None: + port = u.next_free_port(8888) + + if lab is None: + lab = False + + if run_args is None: + run_args = [] + + inject_arg = NotebookInstall.lab if lab else NotebookInstall.jupyter + jupyter_cmd = "lab" if lab else "notebook" + jupyter_args = [ + "-m", "jupyter", jupyter_cmd, \ + "--ip=0.0.0.0", \ + "--port={}".format(port), \ + "--no-browser" + ] + docker_args = ["-p", "{}:{}".format(port, port)] + run_args + + run_interactive(job_mode, + entrypoint="/opt/conda/envs/caliban/bin/python", + entrypoint_args=jupyter_args, + run_args=docker_args, + inject_notebook=inject_arg, + jupyter_version=version, + **run_interactive_kwargs) diff --git a/caliban/util.py b/caliban/util.py deleted file mode 100644 index 8d5f3a7..0000000 --- a/caliban/util.py +++ /dev/null @@ -1,752 +0,0 @@ -#!/usr/bin/python -# -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utilities for our job runner. 
-""" -import argparse -import contextlib -import getpass -import io -import itertools as it -import os -import platform -import re -import shutil -import socket -import subprocess -import sys -import time -import uuid -from collections import ChainMap -from enum import Enum -from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, - Set, Tuple, Union) - -import tqdm -from absl import logging -from blessings import Terminal -from tqdm._utils import _term_move_up - -t = Terminal() - -# key and value for labels can be at most this-many-characters long. -AI_PLATFORM_MAX_LABEL_LENGTH = 63 - -Package = NamedTuple("Package", [("executable", List[str]), - ("package_path", str), ("script_path", str), - ("main_module", Optional[str])]) - - -def module_package(main_module: str) -> Package: - """Generates a Package instance for a python module executable that should be - executed with python -m. - - """ - script_path = module_to_path(main_module) - root = extract_root_directory(script_path) - return Package(["python", "-m"], - package_path=root, - script_path=script_path, - main_module=main_module) - - -def script_package(path: str, executable: str = "/bin/bash") -> Package: - """Generates a Package instance for a non-python-module executable.""" - root = extract_root_directory(path) - return Package([executable], - package_path=root, - script_path=path, - main_module=None) - - -def err(s: str) -> None: - """Prints the supplied string to stderr in red text.""" - sys.stderr.write(t.red(s)) - - -def current_user() -> str: - return getpass.getuser() - - -def is_mac() -> bool: - """Returns True if the current code is executing on a Mac, False otherwise. - - """ - return platform.system() == "Darwin" - - -def is_linux() -> bool: - """Returns True if the current code is executing on a Linux system, False - otherwise. - - """ - return platform.system() == "Darwin" - - -def enum_vals(enum: Enum) -> List[str]: - """Returns the list of all values for a specific enum.""" - return [v.value for v in enum] - - -def any_of(value_s: str, union_type: Union) -> Any: - """Attempts to parse the supplied string into one of the components of the - supplied Union. Returns the value if possible, else raises a value error. - - union_type must be a union of enums! - - """ - - def attempt(s: str, enum_type: Enum) -> Optional[Any]: - try: - return enum_type(s) - except ValueError: - return None - - enums = union_type.__args__ - ret = None - - for enum_type in enums: - ret = attempt(value_s, enum_type) - if ret is not None: - break - - if ret is None: - raise ValueError("{} isn't a value of any of {}".format(value_s, enums)) - - return ret - - -def _expand_compound_pair(k: Union[Tuple, str], v: Any) -> Dict: - """ given a key-value pair k v, where k is either: - a) a primitive representing a single, e.g. k = 'key', v = 'value', or - b) a tuple of primitives representing multiple keys, e.g. 
k = ('key1','key2'), v = ('value1', 'value2') - this function returns the corresponding dictionary without compound keys - """ - - if isinstance(k, tuple): - if not isinstance(v, tuple): - raise argparse.ArgumentTypeError( - """function _expand_compound_pair(k, v) requires that if type(k) is tuple, - type(v) must also be tuple.""") - else: - return dict(zip(k, v)) - else: - return {k: v} - - -def expand_compound_dict(m: Union[Dict, List]) -> Union[Dict, List]: - """ given a dictionary with some compound keys, aka tuples, - returns a dictionary which each compound key separated into primitives - - given a list of such dictionaries, will apply the transformation - described above to each dictionary and return the list, maintaining - structure - """ - - if isinstance(m, list): - return [expand_compound_dict(mi) for mi in m] - else: - expanded_dicts = [_expand_compound_pair(k, v) for k, v in m.items()] - return dict(ChainMap(*expanded_dicts)) - - -def tupleize_dict(m: Dict) -> Dict: - """ given a dictionary with compound keys, converts those keys to tuples, and - converts the corresponding values to a tuple or list of tuples - - Compound key: a string which uses square brackets to enclose - a comma-separated list, e.g. "[batch_size,learning_rate]" or "[a,b,c]" - """ - - formatted_items = [_tupleize_compound_item(k, v) for k, v in m.items()] - return dict(ChainMap(*formatted_items)) - - -def _tupleize_compound_item(k: Union[Tuple, str], v: Any) -> Dict: - """ converts a JSON-input compound key/value pair into a dictionary of tuples """ - if _is_compound_key(k): - return {_tupleize_compound_key(k): _tupleize_compound_value(v)} - else: - return {k: v} - - -def _tupleize_compound_key(k: str) -> List[str]: - """ converts a JSON-input compound key into a tuple """ - assert _is_compound_key(k), "{} must be a valid compound key".format(k) - return tuple([x.strip() for x in k.strip('][').split(',')]) - - -def _tupleize_compound_value( - v: Union[List, bool, str, int, float]) -> Union[List, Tuple]: - """ list of lists -> list of tuples - list of primitives -> tuple of primitives - single primitive -> length-1 tuple of that primitive - - E.g., [[0,1],[3,4]] -> [(0,1),(3,4)] - [0,1] -> (0,1) - 0 -> (0, ) - """ - if isinstance(v, list): - if isinstance(v[0], list): - # v is list of lists - return [tuple(vi) for vi in v] - else: - # v is list of primitives - return tuple(v) - else: - # v is a single primitive (bool, str, int, float) - return tuple([v]) - - -def _is_compound_key(s: Any) -> bool: - """ compound key is defined as a string which uses square brackets to enclose - a comma-separated list, e.g. "[batch_size,learning_rate]" or "[a,b,c]" - """ - - if type(s) is not str or len(s) <= 2: - return False - else: - return s[0] == '[' and s[-1] == ']' - - -def dict_product(m: Dict[Any, Any]) -> Iterable[Dict[Any, Any]]: - """Returns a dictionary generated by taking the cartesian product of each - list-typed value iterable with all others. - - The iterable of dictionaries returned represents every combination of values. - - If any value is NOT a list it will be treated as a singleton list. - - """ - - def wrap_v(v): - return v if isinstance(v, list) else [v] - - ks = m.keys() - vs = (wrap_v(v) for v in m.values()) - return (dict(zip(ks, x)) for x in it.product(*vs)) - - -def compose(l, r): - """Returns a function that's the composition of the two supplied functions. 
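For context, a worked example of the compound-key machinery above, using the helpers defined in this file and hypothetical hyperparameter names:

    config = {"[learning_rate,batch_size]": [[0.01, 64], [0.001, 128]], "epochs": 5}
    tupled = tupleize_dict(config)
    # {('learning_rate', 'batch_size'): [(0.01, 64), (0.001, 128)], 'epochs': 5}
    jobs = [expand_compound_dict(m) for m in dict_product(tupled)]
    # key order aside, this yields:
    # [{'learning_rate': 0.01, 'batch_size': 64, 'epochs': 5},
    #  {'learning_rate': 0.001, 'batch_size': 128, 'epochs': 5}]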
- - """ - - def inner(*args, **kwargs): - return l(r(*args, **kwargs)) - - return inner - - -def flipm(table: Dict[Any, Dict[Any, Any]]) -> Dict[Any, Dict[Any, Any]]: - """Handles shuffles for a particular kind of table.""" - ret = {} - for k, m in table.items(): - for k2, v in m.items(): - ret.setdefault(k2, {})[k] = v - - return ret - - -def invertm(table: Dict[Any, Iterable[Any]]) -> Dict[Any, Set[Any]]: - """Handles shuffles for a particular kind of table.""" - ret = {} - for k, vs in table.items(): - for v in vs: - ret.setdefault(v, set()).add(k) - - return ret - - -def reorderm(table: Dict[Any, Dict[Any, Iterable[Any]]], - order: Tuple[int, int, int]) -> Dict[Any, Dict[Any, Set[Any]]]: - """Handles shuffles for a particular kind of table.""" - ret = {} - for k, m in table.items(): - for k2, vs in m.items(): - for v in vs: - fields = [k, k2, v] - innerm = ret.setdefault(fields[order[0]], {}) - acc = innerm.setdefault(fields[order[1]], set()) - acc.add(fields[order[2]]) - - return ret - - -def merge(l: Dict[Any, Any], r: Dict[Any, Any]) -> Dict[Any, Any]: - """Returns a new dictionary by merging the two supplied dictionaries.""" - ret = l.copy() - ret.update(r) - return ret - - -def dict_by(keys: Set[str], f: Callable[[str], Any]) -> Dict[str, Any]: - """Returns a dictionary with keys equal to the supplied keyset. Each value is - the result of applying f to a key in keys. - - """ - return {k: f(k) for k in keys} - - -def expand_args(items: Dict[str, str]) -> List[str]: - """Converts the input map into a sequence of k, v pair strings. A None value is - interpreted to mean that the key is a solo flag; it's evicted from the - output. - - """ - pairs = [[k, v] if v is not None else [k] for k, v in items.items()] - return list(it.chain.from_iterable(pairs)) - - -def split_by(items: List[str], - separator: Optional[str] = None) -> Tuple[List[str], List[str]]: - """If the separator is present in the list, returns a 2-tuple of - - - the items before the separator, - - all items after the separator. - - If the separator isn't present, returns a tuple of - - - (the original list, []) - - """ - if separator is None: - separator = '--' - - try: - idx = items.index(separator) - return items[0:idx], items[idx + 1:] - except ValueError: - return (items, []) - - -class TempCopy(object): - """Inside its scope, this class: - - - generates a temporary file at tmp_name containing a copy of the file at - original_path, and - - deletes the new file at tmp_name when the scope exits. - - The temporary file will live inside the current directory where python's - being executed; it's a hidden file, but it will be live for the duration of - TempCopy's scope. - - We did NOT use a tmp directory here because the changing UUID name - invalidates the docker image each time a new temp path / directory is - generated. - - """ - - def __init__(self, original_path=None, tmp_name=None): - if tmp_name is None: - self.tmp_path = ".{}.json".format(str(uuid.uuid1())) - else: - self.tmp_path = tmp_name - - self.original_path = None - if original_path: - # handle tilde! 
- self.original_path = os.path.expanduser(original_path) - - self.path = None - - def __enter__(self): - if self.original_path is None: - return None - - current_dir = os.getcwd() - self.path = os.path.join(current_dir, self.tmp_path) - shutil.copy2(self.original_path, self.path) - return self.tmp_path - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.path is not None: - os.remove(self.path) - self.path = None - - -def capture_stdout(cmd: List[str], - input_str: Optional[str] = None, - file=None) -> str: - """Executes the supplied command with the supplied string of std input, then - streams the output to stdout, and returns it as a string along with the - process's return code. - - Args: - cmd: list of strings to send in as the command - input_str: if supplied, this string will be passed as stdin to the supplied - command. if None, stdin will get closed immediately. - file: optional file-like object (stream): the output from the executed - process's stdout will get sent to this stream. Defaults to sys.stdout. - - Returns: - Pair of - - string of all stdout received during the command's execution - - return code of the process - - """ - if file is None: - file = sys.stdout - - buf = io.StringIO() - ret_code = None - - with subprocess.Popen(cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=False, - bufsize=1) as p: - if input_str: - p.stdin.write(input_str.encode('utf-8')) - p.stdin.close() - - out = io.TextIOWrapper(p.stdout, newline='') - - for line in out: - buf.write(line) - file.write(line) - file.flush() - - # flush to force the contents to display. - file.flush() - - while p.poll() is None: - # Process hasn't exited yet, let's wait some - time.sleep(0.5) - - ret_code = p.returncode - p.stdout.close() - - return buf.getvalue(), ret_code - - -def path_to_module(path_str: str) -> str: - return path_str.replace(".py", "").replace(os.path.sep, ".") - - -def module_to_path(module_name: str) -> str: - """Converts the supplied python module (module names separated by dots) into - the python file represented by the module name. - - """ - return module_name.replace(".", os.path.sep) + ".py" - - -def file_exists_in_cwd(path: str) -> bool: - """Returns True if the current path references a valid file in the current - directory, False otherwise. - - """ - return os.path.isfile(os.path.join(os.getcwd(), path)) - - -def extract_root_directory(path: str) -> str: - """Returns the root directory of the supplied path.""" - items = path.split(os.path.sep) - return "." if len(items) == 1 else items[0] - - -def generate_package(path: str, - executable: Optional[List[str]] = None, - main_module: Optional[str] = None) -> Package: - """Takes in a string and generates a package instance that we can use for - imports. - """ - if executable is None: - _, ext = os.path.splitext(path) - executable = ["python"] if ext == ".py" else ["/bin/bash"] - - if main_module is None and not file_exists_in_cwd(path): - module_path = module_to_path(path) - - if file_exists_in_cwd(module_path): - return generate_package(module_path, - executable=["python", "-m"], - main_module=path_to_module(module_path)) - - root = extract_root_directory(path) - return Package(executable, root, path, main_module) - - -def validated_package(path: str) -> Package: - """similar to generate_package but runs argparse validation on packages that - don't actually exist in the filesystem. 
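`capture_stdout` above is the workhorse that streams a subprocess's output while also returning it; a minimal sketch:

    out, ret_code = capture_stdout(["echo", "hello"])
    # streams "hello" to sys.stdout as it arrives, then returns ("hello\n", 0)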
- - """ - p = generate_package(path) - - if not os.path.isdir(p.package_path): - raise argparse.ArgumentTypeError( - """Directory '{}' doesn't exist in directory. Code must be -nested in a folder that exists in the current directory.""".format( - p.package_path)) - - filename = p.script_path - if not file_exists_in_cwd(filename): - raise argparse.ArgumentTypeError( - """File '{}' doesn't exist locally as a script or python module; code -must live inside the current directory.""".format(filename)) - - return p - - -def parse_kv_pair(s: str) -> Tuple[str, str]: - """ - Parse a key, value pair, separated by '=' - - On the command line (argparse) a declaration will typically look like: - foo=hello - or - foo="hello world" - """ - items = s.split('=') - k = items[0].strip() # Remove whitespace around keys - - if len(items) <= 1: - raise argparse.ArgumentTypeError( - "Couldn't parse label '{}' into k=v format.".format(s)) - - v = '='.join(items[1:]) - return (k, v) - - -def _is_key(k: Optional[str]) -> bool: - """Returns True if the argument is a valid argparse optional arg input, False - otherwise. - - Strings that start with - or -- are considered valid for now. - - """ - return k is not None and len(k) > 0 and k[0] == "-" - - -def _truncate(s: str, max_length: int) -> str: - """Returns the input string s truncated to be at most max_length characters - long. - - """ - return s if len(s) <= max_length else s[0:max_length] - - -def _clean_label(s: Optional[str], is_key: bool) -> str: - """Processes the string into the sanitized format required by AI platform - labels. - - https://cloud.google.com/ml-engine/docs/resource-labels - - """ - if s is None: - return "" - - # periods are not allowed by AI Platform labels, but often occur in, - # e.g., learning rates - DECIMAL_REPLACEMENT = '_' - s = s.replace('.', DECIMAL_REPLACEMENT) - - # lowercase, letters, - and _ are valid, so strip the leading dashes, make - # everything lowercase and then kill any remaining unallowed characters. - cleaned = re.sub(r'[^a-z0-9_-]', '', s.lower()).lstrip("-") - - # Keys must start with a letter. If is_key is set and the cleaned version - # starts with something else, append `k`. - if is_key and cleaned != "" and not cleaned[0].isalpha(): - cleaned = "k" + cleaned - - return _truncate(cleaned, AI_PLATFORM_MAX_LABEL_LENGTH) - - -def key_label(k: Optional[str]) -> str: - """converts the argument into a valid label, suitable for submission as a label - key to Cloud. - - """ - return _clean_label(k, True) - - -def value_label(v: Optional[str]) -> str: - """converts the argument into a valid label, suitable for submission as a label - value to Cloud. - - """ - return _clean_label(v, False) - - -def n_chunks(items: List[Any], n_groups: int) -> List[List[Any]]: - """Returns a list of `n_groups` slices of the original list, guaranteed to - contain all of the original items. - - """ - return [items[i::n_groups] for i in range(n_groups)] - - -def chunks_below_limit(items: List[Any], limit: int) -> List[List[Any]]: - """Breaks the input list into a series of chunks guaranteed to be less than""" - quot, _ = divmod(len(items), limit) - return n_chunks(items, quot + 1) - - -def partition(seq: List[str], n: int) -> List[List[str]]: - """Generate groups of n items from seq by scanning across the sequence and - taking chunks of n, offset by 1. 
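A quick illustration of the label-sanitizing and chunking helpers above, using a hypothetical script flag and value:

    print(key_label("--learning_rate"))  # learning_rate
    print(value_label("0.01"))           # 0_01  (periods aren't valid in labels)
    print(n_chunks(list(range(5)), 2))   # [[0, 2, 4], [1, 3]]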
- - """ - for i in range(0, max(1, len(seq) - n + 1), 1): - yield seq[i:i + n] - - -def script_args_to_labels(script_args: Optional[List[str]]) -> Dict[str, str]: - """Converts the arguments supplied to our scripts into a dictionary usable as - labels valid for Cloud submission. - - """ - ret = {} - - def process_pair(k, v): - if _is_key(k): - clean_k = key_label(k) - if clean_k != "": - ret[clean_k] = "" if _is_key(v) else value_label(v) - - if script_args is None or len(script_args) == 0: - return ret - - elif len(script_args) == 1: - process_pair(script_args[0], None) - - # Handle the case where the final argument in the list is a boolean flag. - # This won't get picked up by partition. - elif len(script_args) > 1: - for k, v in partition(script_args, 2): - process_pair(k, v) - - process_pair(script_args[-1], None) - - return ret - - -def sanitize_labels( - pairs: Union[Dict[str, str], List[Tuple[str, str]]]) -> Dict[str, str]: - """Turns a dict, or a list of unsanitized key-value pairs (each represented by - a tuple) into a dictionary suitable to submit to Cloud as a label dict. - - """ - if isinstance(pairs, dict): - return sanitize_labels(pairs.items()) - - return {key_label(k): value_label(v) for (k, v) in pairs if key_label(k)} - - -def validated_directory(path: str) -> str: - """This validates that the supplied directory exists locally. - - """ - if not os.path.isdir(path): - raise argparse.ArgumentTypeError( - """Directory '{}' doesn't exist in this directory. Check yourself!""". - format(path)) - return path - - -def validated_file(path: str) -> str: - """This validates that the supplied file exists. Tilde expansion is supported. - - """ - expanded = os.path.expanduser(path) - if not os.path.isfile(expanded): - raise argparse.ArgumentTypeError( - """File '{}' isn't a valid file on your system. Try again!""".format( - path)) - return path - - -class TqdmFile(object): - """Dummy file-like that will write to tqdm""" - file = None - prefix = _term_move_up() + '\r' - - def __init__(self, file): - self.file = file - self._carriage_pending = False - - def write(self, line): - if self._carriage_pending: - line = self.prefix + line - self._carriage_pending = False - - if line.endswith('\r'): - self._carriage_pending = True - line = line[:-1] + '\n' - - tqdm.tqdm.write(line, file=self.file, end='') - - def flush(self): - return getattr(self.file, "flush", lambda: None)() - - def isatty(self): - return getattr(self.file, "isatty", lambda: False)() - - def close(self): - return getattr(self.file, "close", lambda: None)() - - -def config_logging(): - """Overrides logging to go through TQDM. - - TODO use this call to kill then restore: - https://github.com/tqdm/tqdm#redirecting-writing - - """ - h = logging.get_absl_handler() - old = h.python_handler - h._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr)) - logging.use_python_logging() - - -@contextlib.contextmanager -def tqdm_logging(): - """Overrides logging to go through TQDM. - - https://github.com/tqdm/tqdm#redirecting-writing - - """ - handler = logging.get_absl_handler() - orig = handler.python_handler - - try: - handler._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr)) - - # The changes won't take effect if this hasn't been called. Defensively - # call it again here. 
- logging.use_python_logging() - yield orig.stream - except Exception as exc: - raise exc - finally: - handler._python_handler = orig - - -def next_free_port(port: int, try_n: int = 1000, max_port=65535): - if try_n == 0 or port <= max_port: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - try: - sock.bind(('', port)) - sock.close() - return port - except OSError: - return next_free_port(port + 1, try_n - 1, max_port=max_port) - else: - raise IOError('no free ports') diff --git a/caliban/util/__init__.py b/caliban/util/__init__.py new file mode 100644 index 0000000..290200e --- /dev/null +++ b/caliban/util/__init__.py @@ -0,0 +1,180 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for our job runner. +""" +import getpass +import itertools as it +import platform +import sys +from enum import Enum +from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, + Set, Tuple, Union) + +from blessings import Terminal + +t = Terminal() + +Package = NamedTuple("Package", [("executable", List[str]), + ("package_path", str), ("script_path", str), + ("main_module", Optional[str])]) + + +def err(s: str) -> None: + """Prints the supplied string to stderr in red text.""" + sys.stderr.write(t.red(s)) + + +def current_user() -> str: + return getpass.getuser() + + +def is_mac() -> bool: + """Returns True if the current code is executing on a Mac, False otherwise. + + """ + return platform.system() == "Darwin" + + +def is_linux() -> bool: + """Returns True if the current code is executing on a Linux system, False + otherwise. + + """ + return platform.system() == "Darwin" + + +def enum_vals(enum: Enum) -> List[str]: + """Returns the list of all values for a specific enum.""" + return [v.value for v in enum] + + +def any_of(value_s: str, union_type: Union) -> Any: + """Attempts to parse the supplied string into one of the components of the + supplied Union. Returns the value if possible, else raises a value error. + + union_type must be a union of enums! + + """ + + def attempt(s: str, enum_type: Enum) -> Optional[Any]: + try: + return enum_type(s) + except ValueError: + return None + + enums = union_type.__args__ + ret = None + + for enum_type in enums: + ret = attempt(value_s, enum_type) + if ret is not None: + break + + if ret is None: + raise ValueError("{} isn't a value of any of {}".format(value_s, enums)) + + return ret + + +def dict_product(m: Dict[Any, Any]) -> Iterable[Dict[Any, Any]]: + """Returns a dictionary generated by taking the cartesian product of each + list-typed value iterable with all others. + + The iterable of dictionaries returned represents every combination of values. + + If any value is NOT a list it will be treated as a singleton list. 
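`dict_product` is the core of experiment expansion; a tiny sketch with hypothetical hyperparameters:

    grid = {"lr": [0.1, 0.01], "optimizer": "adam"}  # non-list treated as singleton
    print(list(dict_product(grid)))
    # [{'lr': 0.1, 'optimizer': 'adam'}, {'lr': 0.01, 'optimizer': 'adam'}]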
+ + """ + + def wrap_v(v): + return v if isinstance(v, list) else [v] + + ks = m.keys() + vs = (wrap_v(v) for v in m.values()) + return (dict(zip(ks, x)) for x in it.product(*vs)) + + +def flipm(table: Dict[Any, Dict[Any, Any]]) -> Dict[Any, Dict[Any, Any]]: + """Handles shuffles for a particular kind of table.""" + ret = {} + for k, m in table.items(): + for k2, v in m.items(): + ret.setdefault(k2, {})[k] = v + + return ret + + +def invertm(table: Dict[Any, Iterable[Any]]) -> Dict[Any, Set[Any]]: + """Handles shuffles for a particular kind of table.""" + ret = {} + for k, vs in table.items(): + for v in vs: + ret.setdefault(v, set()).add(k) + + return ret + + +def reorderm(table: Dict[Any, Dict[Any, Iterable[Any]]], + order: Tuple[int, int, int]) -> Dict[Any, Dict[Any, Set[Any]]]: + """Handles shuffles for a particular kind of table.""" + ret = {} + for k, m in table.items(): + for k2, vs in m.items(): + for v in vs: + fields = [k, k2, v] + innerm = ret.setdefault(fields[order[0]], {}) + acc = innerm.setdefault(fields[order[1]], set()) + acc.add(fields[order[2]]) + + return ret + + +def merge(l: Dict[Any, Any], r: Dict[Any, Any]) -> Dict[Any, Any]: + """Returns a new dictionary by merging the two supplied dictionaries.""" + ret = l.copy() + ret.update(r) + return ret + + +def dict_by(keys: Set[str], f: Callable[[str], Any]) -> Dict[str, Any]: + """Returns a dictionary with keys equal to the supplied keyset. Each value is + the result of applying f to a key in keys. + + """ + return {k: f(k) for k in keys} + + +def split_by(items: List[str], + separator: Optional[str] = None) -> Tuple[List[str], List[str]]: + """If the separator is present in the list, returns a 2-tuple of + + - the items before the separator, + - all items after the separator. + + If the separator isn't present, returns a tuple of + + - (the original list, []) + + """ + if separator is None: + separator = '--' + + try: + idx = items.index(separator) + return items[0:idx], items[idx + 1:] + except ValueError: + return (items, []) diff --git a/caliban/util/argparse.py b/caliban/util/argparse.py new file mode 100644 index 0000000..7d36e3f --- /dev/null +++ b/caliban/util/argparse.py @@ -0,0 +1,129 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for our job runner. +""" +import caliban.util as u +import argparse +import contextlib +import getpass +import io +import itertools as it +import os +import platform +import re +import shutil +import socket +import subprocess +import sys +import time +import uuid +from collections import ChainMap +from enum import Enum +from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, + Set, Tuple, Union) + +import tqdm +from absl import logging +from blessings import Terminal +from tqdm.utils import _term_move_up + +t = Terminal() + + +def expand_args(items: Dict[str, str]) -> List[str]: + """Converts the input map into a sequence of k, v pair strings. 
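`split_by` above is presumably what separates caliban's own CLI arguments from the user script's arguments at the `--` marker; a short sketch with a hypothetical argument list:

    argv = ["--nogpu", "trainer.train", "--", "--epochs", "10"]
    print(split_by(argv))
    # (['--nogpu', 'trainer.train'], ['--epochs', '10'])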
A None value is + interpreted to mean that the key is a solo flag; it's evicted from the + output. + + """ + pairs = [[k, v] if v is not None else [k] for k, v in items.items()] + return list(it.chain.from_iterable(pairs)) + + +def validated_package(path: str) -> u.Package: + """similar to generate_package but runs argparse validation on packages that + don't actually exist in the filesystem. + + """ + p = u.generate_package(path) + + if not os.path.isdir(p.package_path): + raise argparse.ArgumentTypeError( + """Directory '{}' doesn't exist in directory. Code must be +nested in a folder that exists in the current directory.""".format( + p.package_path)) + + filename = p.script_path + if not file_exists_in_cwd(filename): + raise argparse.ArgumentTypeError( + """File '{}' doesn't exist locally as a script or python module; code +must live inside the current directory.""".format(filename)) + + return p + + +def parse_kv_pair(s: str) -> Tuple[str, str]: + """ + Parse a key, value pair, separated by '=' + + On the command line (argparse) a declaration will typically look like: + foo=hello + or + foo="hello world" + """ + items = s.split('=') + k = items[0].strip() # Remove whitespace around keys + + if len(items) <= 1: + raise argparse.ArgumentTypeError( + "Couldn't parse label '{}' into k=v format.".format(s)) + + v = '='.join(items[1:]) + return (k, v) + + +def is_key(k: Optional[str]) -> bool: + """Returns True if the argument is a valid argparse optional arg input, False + otherwise. + + Strings that start with - or -- are considered valid for now. + + """ + return k is not None and len(k) > 0 and k[0] == "-" + + +def validated_directory(path: str) -> str: + """This validates that the supplied directory exists locally. + + """ + if not os.path.isdir(path): + raise argparse.ArgumentTypeError( + """Directory '{}' doesn't exist in this directory. Check yourself!""". + format(path)) + return path + + +def validated_file(path: str) -> str: + """This validates that the supplied file exists. Tilde expansion is supported. + + """ + expanded = os.path.expanduser(path) + if not os.path.isfile(expanded): + raise argparse.ArgumentTypeError( + """File '{}' isn't a valid file on your system. Try again!""".format( + path)) + return path diff --git a/caliban/util/fs.py b/caliban/util/fs.py new file mode 100644 index 0000000..27c80d0 --- /dev/null +++ b/caliban/util/fs.py @@ -0,0 +1,219 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for interacting with the filesystem and packages. 
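
The argparse helpers above are meant to be wired into a parser as type functions; a short sketch of the intended shape (assuming caliban.util.argparse from this patch is importable; the flag names are invented):

import argparse

import caliban.util.argparse as ua

parser = argparse.ArgumentParser()
# parse_kv_pair works as an argparse `type`, so each --label argument arrives
# as a (key, value) tuple.
parser.add_argument('--label', type=ua.parse_kv_pair, action='append')

args = parser.parse_args(['--label', 'job=test', '--label', 'owner=sam'])
assert args.label == [('job', 'test'), ('owner', 'sam')]

# expand_args flattens a {flag: value} map back into a flat CLI list; None
# values mark bare flags and are dropped from the output.
assert ua.expand_args({'--epochs': '10', '--verbose': None}) == [
    '--epochs', '10', '--verbose'
]
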
+ +""" +import io +import os +import shutil +import socket +import subprocess +import sys +import time +import uuid +from typing import List, NamedTuple, Optional + +from blessings import Terminal + +t = Terminal() + +Package = NamedTuple("Package", [("executable", List[str]), + ("package_path", str), ("script_path", str), + ("main_module", Optional[str])]) + + +def file_exists_in_cwd(path: str) -> bool: + """Returns True if the current path references a valid file in the current + directory, False otherwise. + + """ + return os.path.isfile(os.path.join(os.getcwd(), path)) + + +def extract_root_directory(path: str) -> str: + """Returns the root directory of the supplied path.""" + items = path.split(os.path.sep) + return "." if len(items) == 1 else items[0] + + +def module_package(main_module: str) -> Package: + """Generates a Package instance for a python module executable that should be + executed with python -m. + + """ + script_path = module_to_path(main_module) + root = extract_root_directory(script_path) + return Package(["python", "-m"], + package_path=root, + script_path=script_path, + main_module=main_module) + + +def script_package(path: str, executable: str = "/bin/bash") -> Package: + """Generates a Package instance for a non-python-module executable.""" + root = extract_root_directory(path) + return Package([executable], + package_path=root, + script_path=path, + main_module=None) + + +def path_to_module(path_str: str) -> str: + return path_str.replace(".py", "").replace(os.path.sep, ".") + + +def module_to_path(module_name: str) -> str: + """Converts the supplied python module (module names separated by dots) into + the python file represented by the module name. + + """ + return module_name.replace(".", os.path.sep) + ".py" + + +def generate_package(path: str, + executable: Optional[List[str]] = None, + main_module: Optional[str] = None) -> Package: + """Takes in a string and generates a package instance that we can use for + imports. + """ + if executable is None: + _, ext = os.path.splitext(path) + executable = ["python"] if ext == ".py" else ["/bin/bash"] + + if main_module is None and not file_exists_in_cwd(path): + module_path = module_to_path(path) + + if file_exists_in_cwd(module_path): + return generate_package(module_path, + executable=["python", "-m"], + main_module=path_to_module(module_path)) + + root = extract_root_directory(path) + return Package(executable, root, path, main_module) + + +class TempCopy(object): + """Inside its scope, this class: + + - generates a temporary file at tmp_name containing a copy of the file at + original_path, and + - deletes the new file at tmp_name when the scope exits. + + The temporary file will live inside the current directory where python's + being executed; it's a hidden file, but it will be live for the duration of + TempCopy's scope. + + We did NOT use a tmp directory here because the changing UUID name + invalidates the docker image each time a new temp path / directory is + generated. + + """ + + def __init__(self, original_path=None, tmp_name=None): + if tmp_name is None: + self.tmp_path = ".{}.json".format(str(uuid.uuid1())) + else: + self.tmp_path = tmp_name + + self.original_path = None + if original_path: + # handle tilde! 
+      self.original_path = os.path.expanduser(original_path)
+
+    self.path = None
+
+  def __enter__(self):
+    if self.original_path is None:
+      return None
+
+    current_dir = os.getcwd()
+    self.path = os.path.join(current_dir, self.tmp_path)
+    shutil.copy2(self.original_path, self.path)
+    return self.tmp_path
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    if self.path is not None:
+      os.remove(self.path)
+      self.path = None
+
+
+def capture_stdout(cmd: List[str],
+                   input_str: Optional[str] = None,
+                   file=None) -> str:
+  """Executes the supplied command with the supplied string of std input, then
+  streams the output to stdout, and returns it as a string along with the
+  process's return code.
+
+  Args:
+    cmd: list of strings to send in as the command
+    input_str: if supplied, this string will be passed as stdin to the supplied
+      command. if None, stdin will get closed immediately.
+    file: optional file-like object (stream): the output from the executed
+      process's stdout will get sent to this stream. Defaults to sys.stdout.
+
+  Returns:
+    Pair of
+    - string of all stdout received during the command's execution
+    - return code of the process
+
+  """
+  if file is None:
+    file = sys.stdout
+
+  buf = io.StringIO()
+  ret_code = None
+
+  with subprocess.Popen(cmd,
+                        stdin=subprocess.PIPE,
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.STDOUT,
+                        universal_newlines=False,
+                        bufsize=1) as p:
+    if input_str:
+      p.stdin.write(input_str.encode('utf-8'))
+    p.stdin.close()
+
+    out = io.TextIOWrapper(p.stdout, newline='')
+
+    for line in out:
+      buf.write(line)
+      file.write(line)
+      file.flush()
+
+    # flush to force the contents to display.
+    file.flush()
+
+    while p.poll() is None:
+      # Process hasn't exited yet, let's wait some
+      time.sleep(0.5)
+
+    ret_code = p.returncode
+    p.stdout.close()
+
+  return buf.getvalue(), ret_code
+
+
+def next_free_port(port: int, try_n: int = 1000, max_port=65535):
+  if try_n > 0 and port <= max_port:
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+      sock.bind(('', port))
+      sock.close()
+      return port
+    except OSError:
+      return next_free_port(port + 1, try_n - 1, max_port=max_port)
+  else:
+    raise IOError('no free ports')
diff --git a/caliban/util/tqdm.py b/caliban/util/tqdm.py
new file mode 100644
index 0000000..13ba49d
--- /dev/null
+++ b/caliban/util/tqdm.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+#
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Progress bar utilities.
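
As a usage sketch for the filesystem helpers above (assuming caliban.util.fs from this patch is importable; the file names and port are invented for the demo):

import pathlib

import caliban.util.fs as fs

# generate_package picks the executable from the path: existing .py files run
# as scripts, dotted module names that resolve to files run via `python -m`,
# and anything else falls back to /bin/bash. For example, with
# trainer/train.py on disk, generate_package('trainer.train') returns
# Package(['python', '-m'], 'trainer', 'trainer/train.py',
# main_module='trainer.train').

# TempCopy drops a hidden copy of a file into the current directory for the
# duration of the block, then removes it.
pathlib.Path('adc.json').write_text('{}')
with fs.TempCopy('adc.json', tmp_name='.adc_copy.json') as tmp_path:
  assert pathlib.Path(tmp_path).exists()
assert not pathlib.Path('.adc_copy.json').exists()
pathlib.Path('adc.json').unlink()

# capture_stdout streams a subprocess's combined stdout/stderr to the console
# while buffering it, and returns the buffered text plus the return code.
output, code = fs.capture_stdout(['echo', 'hello'])
assert code == 0 and 'hello' in output

# next_free_port walks upward from the starting port until one can be bound,
# raising IOError when it runs out of attempts.
port = fs.next_free_port(8080)
assert 8080 <= port <= 65535
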
+""" + +import contextlib +import sys + +from absl import logging +from blessings import Terminal + +import tqdm +from tqdm.utils import _term_move_up + +t = Terminal() + + +class TqdmFile(object): + """Dummy file-like that will write to tqdm""" + file = None + prefix = _term_move_up() + '\r' + + def __init__(self, file): + self.file = file + self._carriage_pending = False + + def write(self, line): + if self._carriage_pending: + line = self.prefix + line + self._carriage_pending = False + + if line.endswith('\r'): + self._carriage_pending = True + line = line[:-1] + '\n' + + tqdm.tqdm.write(line, file=self.file, end='') + + def flush(self): + return getattr(self.file, "flush", lambda: None)() + + def isatty(self): + return getattr(self.file, "isatty", lambda: False)() + + def close(self): + return getattr(self.file, "close", lambda: None)() + + +def config_logging(): + """Overrides logging to go through TQDM. + + TODO use this call to kill then restore: + https://github.com/tqdm/tqdm#redirecting-writing + + """ + h = logging.get_absl_handler() + old = h.python_handler + h._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr)) + logging.use_python_logging() + + +@contextlib.contextmanager +def tqdm_logging(): + """Overrides logging to go through TQDM. + + https://github.com/tqdm/tqdm#redirecting-writing + + """ + handler = logging.get_absl_handler() + orig = handler.python_handler + + try: + handler._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr)) + + # The changes won't take effect if this hasn't been called. Defensively + # call it again here. + logging.use_python_logging() + yield orig.stream + except Exception as exc: + raise exc + finally: + handler._python_handler = orig diff --git a/tests/caliban/config/test_config.py b/tests/caliban/config/test_config.py new file mode 100644 index 0000000..aa49498 --- /dev/null +++ b/tests/caliban/config/test_config.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from argparse import ArgumentTypeError + +import caliban.cloud.types as ct +import caliban.config as c +import pytest + + +def test_extract_region(monkeypatch): + if os.environ.get('REGION'): + monkeypatch.delenv('REGION') + + assert c.extract_region({}) == c.DEFAULT_REGION + + # You have to provide a valid region. + with pytest.raises(ArgumentTypeError): + c.extract_region({"region": "face"}) + + # Same goes for the environment variable setting approach. + monkeypatch.setenv('REGION', "face") + with pytest.raises(ArgumentTypeError): + c.extract_region({}) + + # an empty string is fine, and ignored. 
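
A usage sketch for the tqdm_logging context manager above (assuming caliban.util.tqdm from this patch is importable): inside the block, absl's handler writes through TqdmFile, so log lines are emitted via tqdm.write and an active progress bar is redrawn cleanly instead of being torn.

import time

import tqdm
from absl import logging

from caliban.util.tqdm import tqdm_logging

with tqdm_logging() as orig_stream:
  # The bar itself writes to the original handler stream that the context
  # manager yields; log calls are routed through tqdm underneath it.
  for i in tqdm.tqdm(range(3), file=orig_stream):
    logging.info('processing step %d', i)
    time.sleep(0.1)
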
+ monkeypatch.setenv('REGION', "") + assert c.extract_region({}) == c.DEFAULT_REGION + + assert c.extract_region({"region": "us-west1"}) == ct.US.west1 diff --git a/tests/caliban/config/test_experiment.py b/tests/caliban/config/test_experiment.py new file mode 100644 index 0000000..30f134d --- /dev/null +++ b/tests/caliban/config/test_experiment.py @@ -0,0 +1,229 @@ +#!/usr/bin/python +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentTypeError + +import caliban.config.experiment as c +import pytest + + +def test_validate_experiment_config(): + """basic examples of validate experiment config.""" + invalid = {1: "face", "2": "3"} + with pytest.raises(ArgumentTypeError): + c.validate_experiment_config(invalid) + + # a dict value is invalid, even if it's hidden in a list. + with pytest.raises(ArgumentTypeError): + c.validate_experiment_config({"key": [{1: 2}, "face"]}) + + valid = {"a": [1.0, 2, 3], "b": True, "c": 1, "d": "e", "f": 1.2} + assert valid == c.validate_experiment_config(valid) + + # Lists are okay too... + items = [valid, valid] + assert items == c.validate_experiment_config(items) + + # As are lists of lists. + lol = [valid, [valid]] + assert lol == c.validate_experiment_config(lol) + + # Invalid types are caught even nested inside lists. + lol_invalid = [valid, valid, [invalid]] + + with pytest.raises(ArgumentTypeError): + c.validate_experiment_config(lol_invalid) + + # Compound keys which violate syntax rules are caught + invalid_compound = [{ + "[": 0 + }, { + "eh[": 0 + }, { + "[test,,fail]": 0 + }, { + "[,test,fail]": 0 + }, { + "[I,will,fail,]": 0 + }, { + "[I,,will,fail]": 0 + }, { + "]I,will,fail]": 0 + }] + + valid_compound = [{ + "[batch_size,learning_rate]": [0, 1] + }, { + "[batch_size,learning_rate,dataset_size]": [0.01, 0.02, 100] + }, { + "[batch_size,learning_rate,dataset_size]": [[0.01, 0.02, 100], + [0.03, 0.05, 200]] + }, { + "[batch_size,learning_rate]": [[0., 1.], [2., 3.]] + }, { + "[batch_size,learning_rate]": [[0., 1.], [2., 3.], [4., 5.]] + }, { + "[batch_size, learning_rate, dataset_size]": [0.01, 0.02, 100] + }, { + "[batch_size , learning_rate,dataset_size]": [[0.01, 0.02, 100], + [0.03, 0.05, 200]] + }, { + "[batch_size, learning_rate]": [[0., 1.], [2., 3.]] + }, { + "[batch_size ,learning_rate]": [[0., 1.], [2., 3.], [4., 5.]] + }] + + for i in invalid_compound: + with pytest.raises(Exception): + c.validate_experiment_config(i) + + for i in valid_compound: + assert i == c.validate_experiment_config(i) + + +def test_expand_experiment_config(): + # An empty config expands to a singleton list. This is important so that + # single job submission without a spec works. 
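+  # (An empty dict has no list-valued entries to cross, so the cartesian
+  # product runs over zero axes and yields exactly one empty assignment --
+  # hence the singleton [{}] rather than [].)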
+ assert [{}] == c.expand_experiment_config({}) + + +def test_compound_key_handling(): + tests = [{ + 'input': { + '[a,b]': [['c', 'd'], ['e', 'f']] + }, + 'after_tupleization': { + ('a', 'b'): [('c', 'd'), ('e', 'f')] + }, + 'after_dictproduct': [{ + ('a', 'b'): ('c', 'd') + }, { + ('a', 'b'): ('e', 'f') + }], + 'after_expansion': [{ + 'a': 'c', + 'b': 'd' + }, { + 'a': 'e', + 'b': 'f' + }] + }, { + 'input': { + '[a,b]': ['c', 'd'] + }, + 'after_tupleization': { + ('a', 'b'): ('c', 'd') + }, + 'after_dictproduct': [{ + ('a', 'b'): ('c', 'd') + }], + 'after_expansion': [{ + 'a': 'c', + 'b': 'd' + }] + }, { + 'input': { + 'hi': 'there', + '[k1,k2]': [['v1a', 'v2a'], ['v1b', 'v2b']] + }, + 'after_tupleization': { + 'hi': 'there', + ('k1', 'k2'): [('v1a', 'v2a'), ('v1b', 'v2b')] + }, + 'after_dictproduct': [{ + 'hi': 'there', + ('k1', 'k2'): ('v1a', 'v2a') + }, { + 'hi': 'there', + ('k1', 'k2'): ('v1b', 'v2b') + }], + 'after_expansion': [{ + 'hi': 'there', + 'k1': 'v1a', + 'k2': 'v2a' + }, { + 'hi': 'there', + 'k1': 'v1b', + 'k2': 'v2b' + }] + }, { + 'input': { + 'hi': 'there', + '[a,b]': ['c', 'd'] + }, + 'after_tupleization': { + 'hi': 'there', + ('a', 'b'): ('c', 'd') + }, + 'after_dictproduct': [{ + 'hi': 'there', + ('a', 'b'): ('c', 'd') + }], + 'after_expansion': [{ + 'hi': 'there', + 'a': 'c', + 'b': 'd' + }] + }, { + 'input': { + '[a,b]': [0, 1] + }, + 'after_tupleization': { + ('a', 'b'): (0, 1) + }, + 'after_dictproduct': [{ + ('a', 'b'): (0, 1) + }], + 'after_expansion': [{ + 'a': 0, + 'b': 1 + }] + }, { + 'input': { + '[a,b]': [[0, 1]] + }, + 'after_tupleization': { + ('a', 'b'): [(0, 1)] + }, + 'after_dictproduct': [{ + ('a', 'b'): (0, 1) + }], + 'after_expansion': [{ + 'a': 0, + 'b': 1 + }] + }, { + 'input': { + 'hi': 'blueshift', + '[a,b]': [[0, 1]] + }, + 'after_tupleization': { + 'hi': 'blueshift', + ('a', 'b'): [(0, 1)] + }, + 'after_dictproduct': [{ + 'hi': 'blueshift', + ('a', 'b'): (0, 1) + }], + 'after_expansion': [{ + 'hi': 'blueshift', + 'a': 0, + 'b': 1 + }] + }] + + for test in tests: + assert test['after_expansion'] == c.expand_experiment_config(test['input']) diff --git a/tests/caliban/test_config.py b/tests/caliban/test_config.py deleted file mode 100644 index df42969..0000000 --- a/tests/caliban/test_config.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/python -# -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest -from argparse import ArgumentTypeError - -import caliban.cloud.types as ct -import caliban.config as c -import pytest - - -def test_extract_region(monkeypatch): - if os.environ.get('REGION'): - monkeypatch.delenv('REGION') - - assert c.extract_region({}) == c.DEFAULT_REGION - - # You have to provide a valid region. - with pytest.raises(ArgumentTypeError): - c.extract_region({"region": "face"}) - - # Same goes for the environment variable setting approach. - monkeypatch.setenv('REGION', "face") - with pytest.raises(ArgumentTypeError): - c.extract_region({}) - - # an empty string is fine, and ignored. 
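
From the user's side, the behavior these compound-key tests pin down looks like the following sketch (assuming caliban.config.experiment from this patch is importable; the parameter names are invented):

import caliban.config.experiment as ce

# A compound key ties several parameters together so they vary in lockstep,
# while plain list-valued keys are crossed against everything else.
config = {
    'batch_size': [16, 32],
    '[learning_rate,dataset_size]': [[0.01, 100], [0.02, 200]]
}

expanded = ce.expand_experiment_config(config)

# Two batch sizes crossed with two (learning_rate, dataset_size) pairs gives
# four experiments; learning_rate and dataset_size never mix across pairs.
assert len(expanded) == 4
assert {'batch_size': 16, 'learning_rate': 0.02, 'dataset_size': 200} in expanded
assert {'batch_size': 32, 'learning_rate': 0.01, 'dataset_size': 100} in expanded
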
- monkeypatch.setenv('REGION', "") - assert c.extract_region({}) == c.DEFAULT_REGION - - assert c.extract_region({"region": "us-west1"}) == ct.US.west1 - - -class ConfigTestSuite(unittest.TestCase): - """Tests for the config package.""" - - def test_validate_experiment_config(self): - """basic examples of validate experiment config.""" - invalid = {1: "face", "2": "3"} - with self.assertRaises(ArgumentTypeError): - c.validate_experiment_config(invalid) - - # a dict value is invalid, even if it's hidden in a list. - with self.assertRaises(ArgumentTypeError): - c.validate_experiment_config({"key": [{1: 2}, "face"]}) - - valid = {"a": [1.0, 2, 3], "b": True, "c": 1, "d": "e", "f": 1.2} - self.assertDictEqual(valid, c.validate_experiment_config(valid)) - - # Lists are okay too... - items = [valid, valid] - self.assertListEqual(items, c.validate_experiment_config(items)) - - # As are lists of lists. - lol = [valid, [valid]] - self.assertListEqual(lol, c.validate_experiment_config(lol)) - - # Invalid types are caught even nested inside lists. - lol_invalid = [valid, valid, [invalid]] - with self.assertRaises(ArgumentTypeError): - c.validate_experiment_config(lol_invalid) - - # Compound keys which violate syntax rules are caught - invalid_compound = [{ - "[": 0 - }, { - "eh[": 0 - }, { - "[test,,fail]": 0 - }, { - "[,test,fail]": 0 - }, { - "[I,will,fail,]": 0 - }, { - "[I,,will,fail]": 0 - }, { - "]I,will,fail]": 0 - }] - valid_compound = [{ - "[batch_size,learning_rate]": [0, 1] - }, { - "[batch_size,learning_rate,dataset_size]": [0.01, 0.02, 100] - }, { - "[batch_size,learning_rate,dataset_size]": [[0.01, 0.02, 100], - [0.03, 0.05, 200]] - }, { - "[batch_size,learning_rate]": [[0., 1.], [2., 3.]] - }, { - "[batch_size,learning_rate]": [[0., 1.], [2., 3.], [4., 5.]] - }, { - "[batch_size, learning_rate, dataset_size]": [0.01, 0.02, 100] - }, { - "[batch_size , learning_rate,dataset_size]": [[0.01, 0.02, 100], - [0.03, 0.05, 200]] - }, { - "[batch_size, learning_rate]": [[0., 1.], [2., 3.]] - }, { - "[batch_size ,learning_rate]": [[0., 1.], [2., 3.], [4., 5.]] - }] - - for i in invalid_compound: - with self.assertRaises(Exception): - c.validate_experiment_config(i) - for i in valid_compound: - self.assertDictEqual(i, c.validate_experiment_config(i)) - - def test_expand_experiment_config(self): - # An empty config expands to a singleton list. This is important so that - # single job submission without a spec works. 
- self.assertListEqual([{}], c.expand_experiment_config({})) - - def test_compound_key_handling(self): - tests = [{ - 'input': { - '[a,b]': [['c', 'd'], ['e', 'f']] - }, - 'after_tupleization': { - ('a', 'b'): [('c', 'd'), ('e', 'f')] - }, - 'after_dictproduct': [{ - ('a', 'b'): ('c', 'd') - }, { - ('a', 'b'): ('e', 'f') - }], - 'after_expansion': [{ - 'a': 'c', - 'b': 'd' - }, { - 'a': 'e', - 'b': 'f' - }] - }, { - 'input': { - '[a,b]': ['c', 'd'] - }, - 'after_tupleization': { - ('a', 'b'): ('c', 'd') - }, - 'after_dictproduct': [{ - ('a', 'b'): ('c', 'd') - }], - 'after_expansion': [{ - 'a': 'c', - 'b': 'd' - }] - }, { - 'input': { - 'hi': 'there', - '[k1,k2]': [['v1a', 'v2a'], ['v1b', 'v2b']] - }, - 'after_tupleization': { - 'hi': 'there', - ('k1', 'k2'): [('v1a', 'v2a'), ('v1b', 'v2b')] - }, - 'after_dictproduct': [{ - 'hi': 'there', - ('k1', 'k2'): ('v1a', 'v2a') - }, { - 'hi': 'there', - ('k1', 'k2'): ('v1b', 'v2b') - }], - 'after_expansion': [{ - 'hi': 'there', - 'k1': 'v1a', - 'k2': 'v2a' - }, { - 'hi': 'there', - 'k1': 'v1b', - 'k2': 'v2b' - }] - }, { - 'input': { - 'hi': 'there', - '[a,b]': ['c', 'd'] - }, - 'after_tupleization': { - 'hi': 'there', - ('a', 'b'): ('c', 'd') - }, - 'after_dictproduct': [{ - 'hi': 'there', - ('a', 'b'): ('c', 'd') - }], - 'after_expansion': [{ - 'hi': 'there', - 'a': 'c', - 'b': 'd' - }] - }, { - 'input': { - '[a,b]': [0, 1] - }, - 'after_tupleization': { - ('a', 'b'): (0, 1) - }, - 'after_dictproduct': [{ - ('a', 'b'): (0, 1) - }], - 'after_expansion': [{ - 'a': 0, - 'b': 1 - }] - }, { - 'input': { - '[a,b]': [[0, 1]] - }, - 'after_tupleization': { - ('a', 'b'): [(0, 1)] - }, - 'after_dictproduct': [{ - ('a', 'b'): (0, 1) - }], - 'after_expansion': [{ - 'a': 0, - 'b': 1 - }] - }, { - 'input': { - 'hi': 'blueshift', - '[a,b]': [[0, 1]] - }, - 'after_tupleization': { - 'hi': 'blueshift', - ('a', 'b'): [(0, 1)] - }, - 'after_dictproduct': [{ - 'hi': 'blueshift', - ('a', 'b'): (0, 1) - }], - 'after_expansion': [{ - 'hi': 'blueshift', - 'a': 0, - 'b': 1 - }] - }] - - for test in tests: - self.assertListEqual(test['after_expansion'], - c.expand_experiment_config(test['input'])) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/caliban/test_util.py b/tests/caliban/test_util.py index c02ea03..d46719b 100644 --- a/tests/caliban/test_util.py +++ b/tests/caliban/test_util.py @@ -302,22 +302,6 @@ def check_expansion(test_dict): check_dictproduct(test) check_expansion(test) - @given(st.integers()) - def test_compose(self, x): - """Functions should compose; the composed function accepts any arguments that the rightmost function accepts.""" - - def plus1(x): - return x + 1 - - def square(x): - return x * x - - square_plus_one = u.compose(plus1, square) - times_plus_one = u.compose(plus1, lambda l, r: l * r) - - self.assertEqual(square_plus_one(x), x * x + 1) - self.assertEqual(square_plus_one(x), times_plus_one(l=x, r=x)) - @given(st.dictionaries(st.text(), st.text()), st.dictionaries(st.text(), st.text())) def test_merge(self, m1, m2):