Skip to content

Commit

Permalink
[v1.x] CI fixes to make more stable and upgradable (apache#19895)
Browse files Browse the repository at this point in the history
* Test moving pipelines from p3 to g4.

* Remove fallback codecov command - the existing (first) command works and the second always fails a few times before finally succeeding (and also doesn't support the -P parameter, which causes an error.)

* Stop using docker python client, since it still doesn't support latest nvidia 'gpus' attribute. Switch to using subprocess calls using list parameter (to avoid shell injections).

See docker/docker-py#2395

* Remove old files.

* Fix comment

* Set default environment variables

* Fix GPU syntax.

* Use subprocess.run and redirect output to stdout, don't run docker in interactive mode.

* Check if codecov works without providing parameters now.

* Send docker stderr to sys.stderr

* Support both nvidia-docker configurations, first try '--gpus all', and if that fails, then try '--runtime nvidia'.

Co-authored-by: Joe Evans <[email protected]>
  • Loading branch information
josephevans and Joe Evans committed Feb 16, 2021
1 parent 7b750a8 commit 0303587
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 740 deletions.
15 changes: 1 addition & 14 deletions ci/Jenkinsfile_utils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -112,20 +112,7 @@ def get_git_commit_hash() {
}

def publish_test_coverage() {
// CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy
git_commit_hash = get_git_commit_hash()

if (env.CHANGE_ID) {
// PR execution
codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
} else {
// Branch execution
codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
}

// To make sure we never fail because test coverage reporting is not available
// Fall back to our own copy of the bash helper if it failed to download the public version
sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
sh "curl -s https://codecov.io/bash | bash"
}

def collect_test_results_unix(original_file_name, new_file_name) {
Expand Down
112 changes: 56 additions & 56 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

import yaml

from safe_docker_run import SafeDockerClient
from util import *


Expand Down Expand Up @@ -104,8 +103,7 @@ def default_ccache_dir() -> str:
return os.path.join(os.path.expanduser("~"), ".ccache")


def container_run(docker_client: SafeDockerClient,
platform: str,
def container_run(platform: str,
nvidia_runtime: bool,
docker_registry: str,
shared_memory_size: str,
Expand All @@ -114,17 +112,12 @@ def container_run(docker_client: SafeDockerClient,
environment: Dict[str, str],
dry_run: bool = False) -> int:
"""Run command in a container"""
container_wait_s = 600
#
# Environment setup
#
# set default environment variables
environment.update({
'CCACHE_MAXSIZE': '500G',
'CCACHE_TEMPDIR': '/tmp/ccache', # temp dir should be local and not shared
'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is
# mounted
'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache
# verification.
'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is mounted
'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache verification.
})
environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})

Expand All @@ -136,13 +129,9 @@ def container_run(docker_client: SafeDockerClient,
os.makedirs(local_ccache_dir, exist_ok=True)
logging.info("Using ccache directory: %s", local_ccache_dir)

# Equivalent command
docker_cmd_list = [
"docker",
'run',
"--gpus all" if nvidia_runtime else "",
"--cap-add",
"SYS_PTRACE", # Required by ASAN
# Build docker command
docker_arg_list = [
"--cap-add", "SYS_PTRACE", # Required by ASAN
'--rm',
'--shm-size={}'.format(shared_memory_size),
# mount mxnet root
Expand All @@ -158,40 +147,27 @@ def container_run(docker_client: SafeDockerClient,
'-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),
# a container-scoped log, useful for ccache verification.
'-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),
'-ti',
tag]
docker_cmd_list.extend(command)
docker_cmd = ' \\\n\t'.join(docker_cmd_list)
logging.info("Running %s in container %s", command, tag)
logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
]
docker_arg_list += [tag]
docker_arg_list.extend(command)

def docker_run_cmd(cmd):
logging.info("Running %s in container %s", command, tag)
logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd))
subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)

if not dry_run:
#############################
#
signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
# noinspection PyShadowingNames
runtime = None
if nvidia_runtime:
# noinspection PyShadowingNames
# runc is default (docker info | grep -i runtime)
runtime = 'nvidia'

return docker_client.run(
tag,
runtime=runtime,
command=command,
shm_size=shared_memory_size,
user='{}:{}'.format(os.getuid(), os.getgid()),
cap_add='SYS_PTRACE',
volumes={
mx_root:
{'bind': '/work/mxnet', 'mode': 'rw'},
local_build_folder:
{'bind': '/work/build', 'mode': 'rw'},
local_ccache_dir:
{'bind': '/work/ccache', 'mode': 'rw'},
},
environment=environment)
if not nvidia_runtime:
docker_run_cmd(['docker', 'run'] + docker_arg_list)
else:
try:
docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list)
except subprocess.CalledProcessError as e:
if e.returncode == 125:
docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list)
else:
raise

return 0


Expand Down Expand Up @@ -292,8 +268,6 @@ def main() -> int:
args = parser.parse_args()

command = list(chain.from_iterable(args.command))
docker_client = SafeDockerClient()

environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
for e in args.environment])

Expand All @@ -318,29 +292,55 @@ def main() -> int:
ret = 0
if command:
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)
elif args.print_docker_run:
command = []
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment)
else:
# With no commands, execute a build function for the target platform
command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
logging.info("No command specified, trying default build: %s", ' '.join(command))
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)

if ret != 0:
logging.critical("Execution of %s failed with status: %d", command, ret)
return ret


elif args.all:
platforms = get_platforms()
platforms = [platform for platform in platforms if 'build.' in platform]
logging.info("Building for all architectures: %s", platforms)
logging.info("Artifacts will be produced in the build/ directory.")
for platform in platforms:
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache,
cache_intermediate=args.cache_intermediate)
if args.build_only:
continue
shutil.rmtree(buildir(), ignore_errors=True)
build_platform = "build_{}".format(platform)
plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
"mxnet_{}".format(build_platform)))
if os.path.exists(plat_buildir):
logging.warning("%s already exists, skipping", plat_buildir)
continue
command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
container_run(
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)
shutil.move(buildir(), plat_buildir)
logging.info("Built files left in: %s", plat_buildir)
else:
parser.print_help()
list_platforms()
Expand Down
Loading

0 comments on commit 0303587

Please sign in to comment.