Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds strict_db flag to run/cloud/cluster commands #28

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions caliban/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def local_run_parser(base):
image_id_arg(parser)
docker_run_arg(parser)
xgroup_submit_arg(parser)
strict_db_arg(parser)


def gpu_spec_arg(parser, validate_count: bool = False):
Expand Down Expand Up @@ -386,6 +387,7 @@ def container_parser(parser):
job_name_arg(parser)
label_arg(parser)
xgroup_submit_arg(parser)
strict_db_arg(parser)


def cloud_parser(base):
Expand Down Expand Up @@ -687,6 +689,7 @@ def cluster_job_submit_cmd(base):
dry_run_arg(parser)
job_export_arg(parser)
xgroup_submit_arg(parser)
strict_db_arg(parser)

require_module(parser)
add_script_args(parser)
Expand Down Expand Up @@ -947,3 +950,12 @@ def max_jobs_arg(parser):
f'then this specifies the total number of jobs to return, ordered '
f'by creation date, or all jobs if max_jobs==0.'),
)


# ----------------------------------------------------------------------------
def strict_db_arg(parser):
  """Register the ``--strict_db`` boolean flag on ``parser``.

  The flag is a ``store_true`` option, so the parsed ``strict_db`` value is
  False unless ``--strict_db`` is passed on the command line.

  Args:
    parser: an argparse parser (or any object exposing a compatible
      ``add_argument`` method) to which the flag is added.
  """
  # Plain string literals: the help text has no interpolated fields, so an
  # f-string prefix is unnecessary. Adjacent literals are concatenated.
  parser.add_argument(
      '--strict_db',
      action='store_true',
      help=('if the caliban database specified by the CALIBAN_DB_URL '
            'environment variable cannot be reached, then fail and exit'))
6 changes: 4 additions & 2 deletions caliban/cloud/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,8 @@ def submit_ml_job(job_mode: conf.JobMode,
experiment_config: Optional[conf.ExpConf] = None,
script_args: Optional[List[str]] = None,
request_retries: Optional[int] = None,
xgroup: Optional[str] = None) -> None:
xgroup: Optional[str] = None,
strict_db: bool = False) -> None:
"""Top level function in the module. This function:

- builds an image using the supplied docker_args, in either CPU or GPU mode
Expand Down Expand Up @@ -606,6 +607,7 @@ def submit_ml_job(job_mode: conf.JobMode,
a timeout or a rate limiting request.
- xgroup: experiment group for this submission, if None a new group will
be created
- strict_db: if database specified by the CALIBAN_DB_URL is not found, exit
"""
if script_args is None:
script_args = []
Expand All @@ -628,7 +630,7 @@ def submit_ml_job(job_mode: conf.JobMode,
if request_retries is None:
request_retries = 10

engine = get_mem_engine() if dry_run else get_sql_engine()
engine = get_mem_engine() if dry_run else get_sql_engine(strict=strict_db)

with session_scope(engine) as session:
container_spec = generate_container_spec(session, docker_args, image_tag)
Expand Down
5 changes: 4 additions & 1 deletion caliban/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,7 @@ def run_experiments(job_mode: c.JobMode,
dry_run: bool = False,
experiment_config: Optional[c.ExpConf] = None,
xgroup: Optional[str] = None,
strict_db: bool = False,
**build_image_kwargs) -> None:
"""Builds an image using the supplied **build_image_kwargs and calls `docker
run` on the resulting image using sensible defaults.
Expand All @@ -830,6 +831,8 @@ def run_experiments(job_mode: c.JobMode,
- dry_run: if True, no actual jobs will be executed and docker won't
actually build; logging side effects will show the user what will happen
without dry_run=True.
- xgroup: experiment group for this command
- strict_db: if database specified by the CALIBAN_DB_URL is not found, exit

any extra kwargs supplied are passed through to build_image.
"""
Expand All @@ -845,7 +848,7 @@ def run_experiments(job_mode: c.JobMode,
docker_args = {k: v for k, v in build_image_kwargs.items()}
docker_args['job_mode'] = job_mode

engine = get_mem_engine() if dry_run else get_sql_engine()
engine = get_mem_engine() if dry_run else get_sql_engine(strict=strict_db)

with session_scope(engine) as session:
container_spec = generate_container_spec(session, docker_args, image_id)
Expand Down
11 changes: 9 additions & 2 deletions caliban/gke/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,13 @@ def wrapper(args: dict,
zone=zone,
creds=creds)

return fn(args, cluster=cluster) if cluster else None
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a minor bug I found while debugging this PR.

if cluster is None:
logging.error(f'unable to resolve cluster')
logging.error(f'you can see your available clusters using the command:')
logging.error(f'caliban cluster ls')
return

return fn(args, cluster=cluster)

return wrapper

Expand Down Expand Up @@ -359,6 +365,7 @@ def _job_submit(args: dict, cluster: Cluster) -> None:
xgroup = args.get('xgroup')
image_tag = args.get('image_tag')
export = args.get('export', None)
strict_db = args.get('strict_db', False)

labels = args.get('label')
if labels is not None:
Expand Down Expand Up @@ -415,7 +422,7 @@ def _job_submit(args: dict, cluster: Cluster) -> None:
accel, accel_count = accel_spec

# --------------------------------------------------------------------------
engine = get_mem_engine() if dry_run else get_sql_engine()
engine = get_mem_engine() if dry_run else get_sql_engine(strict=strict_db)

with session_scope(engine) as session:
container_spec = generate_container_spec(session, docker_m, image_tag)
Expand Down
4 changes: 2 additions & 2 deletions caliban/history/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from sqlalchemy import create_engine
from sqlalchemy.engine.base import Engine
from sqlalchemy.exc import OperationalError
from sqlalchemy.exc import OperationalError, ArgumentError
from sqlalchemy.orm import Session, sessionmaker

import caliban.config as conf
Expand Down Expand Up @@ -108,7 +108,7 @@ def get_sql_engine(
try:
return _create_sqa_engine(url=url, echo=echo)

except (OperationalError, OSError) as e:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another minor issue found while testing this PR.

except (OperationalError, OSError, ArgumentError) as e:
logging.error("")
logging.error(
t.red(
Expand Down
4 changes: 4 additions & 0 deletions caliban/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def run_app(arg_input):
image_id = args.get("image_id")
exp_config = args.get("experiment_config")
xgroup = args.get('xgroup')
strict_db = args.get('strict_db')

docker.run_experiments(job_mode,
run_args=docker_run_args,
Expand All @@ -106,6 +107,7 @@ def run_app(arg_input):
dry_run=dry_run,
package=package,
xgroup=xgroup,
strict_db=strict_db,
**docker_args)

elif command == "cloud":
Expand All @@ -123,6 +125,7 @@ def run_app(arg_input):
exp_config = args.get("experiment_config")
labels = u.sanitize_labels(args.get("label") or [])
xgroup = args.get('xgroup')
strict_db = args.get('strict_db')

# Arguments to internally build the image required to submit to Cloud.
docker_m = {"job_mode": job_mode, "package": package, **docker_args}
Expand All @@ -143,6 +146,7 @@ def run_app(arg_input):
script_args=script_args,
experiment_config=exp_config,
xgroup=xgroup,
strict_db=strict_db,
)
else:
logging.info("Unknown command: {}".format(command))
Expand Down