From 0303587b6bde12bfc86f6af62b5265835bca0c67 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Tue, 16 Feb 2021 08:41:52 -0800 Subject: [PATCH 1/2] [v1.x] CI fixes to make more stable and upgradable (#19895) * Test moving pipelines from p3 to g4. * Remove fallback codecov command - the existing (first) command works and the second always fails a few times before finally succeeding (and also doesn't support the -P parameter, which causes an error.) * Stop using docker python client, since it still doesn't support latest nvidia 'gpus' attribute. Switch to using subprocess calls using list parameter (to avoid shell injections). See /~https://github.com/docker/docker-py/issues/2395 * Remove old files. * Fix comment * Set default environment variables * Fix GPU syntax. * Use subprocess.run and redirect output to stdout, don't run docker in interactive mode. * Check if codecov works without providing parameters now. * Send docker stderr to sys.stderr * Support both nvidia-docker configurations, first try '--gpus all', and if that fails, then try '--runtime nvidia'. Co-authored-by: Joe Evans --- ci/Jenkinsfile_utils.groovy | 15 +- ci/build.py | 112 +++++----- ci/safe_docker_run.py | 248 --------------------- ci/test_safe_docker_run.py | 422 ------------------------------------ 4 files changed, 57 insertions(+), 740 deletions(-) delete mode 100644 ci/safe_docker_run.py delete mode 100644 ci/test_safe_docker_run.py diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index f679cc3f01cf..a77ab1cde15c 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -112,20 +112,7 @@ def get_git_commit_hash() { } def publish_test_coverage() { - // CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy - git_commit_hash = get_git_commit_hash() - - if (env.CHANGE_ID) { - // PR execution - codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}" - } else { - // Branch execution - codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}" - } - - // To make sure we never fail because test coverage reporting is not available - // Fall back to our own copy of the bash helper if it failed to download the public version - sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true" + sh "curl -s https://codecov.io/bash | bash" } def collect_test_results_unix(original_file_name, new_file_name) { diff --git a/ci/build.py b/ci/build.py index 1e9e23fadafb..ef78504a5111 100755 --- a/ci/build.py +++ b/ci/build.py @@ -34,7 +34,6 @@ import yaml -from safe_docker_run import SafeDockerClient from util import * @@ -104,8 +103,7 @@ def default_ccache_dir() -> str: return os.path.join(os.path.expanduser("~"), ".ccache") -def container_run(docker_client: SafeDockerClient, - platform: str, +def container_run(platform: str, nvidia_runtime: bool, docker_registry: str, shared_memory_size: str, @@ -114,17 +112,12 @@ def container_run(docker_client: SafeDockerClient, environment: Dict[str, str], dry_run: bool = False) -> int: """Run command in a container""" - container_wait_s = 600 - # - # Environment setup - # + # set default environment variables environment.update({ 'CCACHE_MAXSIZE': '500G', 'CCACHE_TEMPDIR': '/tmp/ccache', # temp dir should be local and not shared - 'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is - # mounted - 'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache - # verification. + 'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is mounted + 'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache verification. }) environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ}) @@ -136,13 +129,9 @@ def container_run(docker_client: SafeDockerClient, os.makedirs(local_ccache_dir, exist_ok=True) logging.info("Using ccache directory: %s", local_ccache_dir) - # Equivalent command - docker_cmd_list = [ - "docker", - 'run', - "--gpus all" if nvidia_runtime else "", - "--cap-add", - "SYS_PTRACE", # Required by ASAN + # Build docker command + docker_arg_list = [ + "--cap-add", "SYS_PTRACE", # Required by ASAN '--rm', '--shm-size={}'.format(shared_memory_size), # mount mxnet root @@ -158,40 +147,27 @@ def container_run(docker_client: SafeDockerClient, '-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']), # a container-scoped log, useful for ccache verification. '-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']), - '-ti', - tag] - docker_cmd_list.extend(command) - docker_cmd = ' \\\n\t'.join(docker_cmd_list) - logging.info("Running %s in container %s", command, tag) - logging.info("Executing the equivalent of:\n%s\n", docker_cmd) + ] + docker_arg_list += [tag] + docker_arg_list.extend(command) + + def docker_run_cmd(cmd): + logging.info("Running %s in container %s", command, tag) + logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd)) + subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True) if not dry_run: - ############################# - # - signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}) - # noinspection PyShadowingNames - runtime = None - if nvidia_runtime: - # noinspection PyShadowingNames - # runc is default (docker info | grep -i runtime) - runtime = 'nvidia' - - return docker_client.run( - tag, - runtime=runtime, - command=command, - shm_size=shared_memory_size, - user='{}:{}'.format(os.getuid(), os.getgid()), - cap_add='SYS_PTRACE', - volumes={ - mx_root: - {'bind': '/work/mxnet', 'mode': 'rw'}, - local_build_folder: - {'bind': '/work/build', 'mode': 'rw'}, - local_ccache_dir: - {'bind': '/work/ccache', 'mode': 'rw'}, - }, - environment=environment) + if not nvidia_runtime: + docker_run_cmd(['docker', 'run'] + docker_arg_list) + else: + try: + docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list) + except subprocess.CalledProcessError as e: + if e.returncode == 125: + docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list) + else: + raise + return 0 @@ -292,8 +268,6 @@ def main() -> int: args = parser.parse_args() command = list(chain.from_iterable(args.command)) - docker_client = SafeDockerClient() - environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e])) for e in args.environment]) @@ -318,13 +292,13 @@ def main() -> int: ret = 0 if command: ret = container_run( - docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, environment=environment) elif args.print_docker_run: command = [] ret = container_run( - docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment) else: @@ -332,7 +306,7 @@ def main() -> int: command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)] logging.info("No command specified, trying default build: %s", ' '.join(command)) ret = container_run( - docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, environment=environment) @@ -340,7 +314,33 @@ def main() -> int: logging.critical("Execution of %s failed with status: %d", command, ret) return ret - + elif args.all: + platforms = get_platforms() + platforms = [platform for platform in platforms if 'build.' in platform] + logging.info("Building for all architectures: %s", platforms) + logging.info("Artifacts will be produced in the build/ directory.") + for platform in platforms: + tag = get_docker_tag(platform=platform, registry=args.docker_registry) + load_docker_cache(tag=tag, docker_registry=args.docker_registry) + build_docker(platform, registry=args.docker_registry, + num_retries=args.docker_build_retries, no_cache=args.no_cache, + cache_intermediate=args.cache_intermediate) + if args.build_only: + continue + shutil.rmtree(buildir(), ignore_errors=True) + build_platform = "build_{}".format(platform) + plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..', + "mxnet_{}".format(build_platform))) + if os.path.exists(plat_buildir): + logging.warning("%s already exists, skipping", plat_buildir) + continue + command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform] + container_run( + platform=platform, nvidia_runtime=args.nvidiadocker, + shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, + local_ccache_dir=args.ccache_dir, environment=environment) + shutil.move(buildir(), plat_buildir) + logging.info("Built files left in: %s", plat_buildir) else: parser.print_help() list_platforms() diff --git a/ci/safe_docker_run.py b/ci/safe_docker_run.py deleted file mode 100644 index e014e8731dd2..000000000000 --- a/ci/safe_docker_run.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Docker command wrapper to guard against Zombie containers -""" - -import argparse -import atexit -import logging -import os -import signal -import sys -from functools import reduce -from itertools import chain -from typing import Dict, Any - -import docker -from docker.errors import NotFound -from docker.models.containers import Container - -from util import config_logging - -DOCKER_STOP_TIMEOUT_SECONDS = 3 -CONTAINER_WAIT_SECONDS = 600 - - -class SafeDockerClient: - """ - A wrapper around the docker client to ensure that no zombie containers are left hanging around - in case the script is not allowed to finish normally - """ - - @staticmethod - def _trim_container_id(cid): - """:return: trimmed container id""" - return cid[:12] - - def __init__(self): - self._docker_client = docker.from_env(timeout=None) - self._containers = set() - self._docker_stop_timeout = DOCKER_STOP_TIMEOUT_SECONDS - self._container_wait_seconds = CONTAINER_WAIT_SECONDS - - def signal_handler(signum, _): - signal.pthread_sigmask(signal.SIG_BLOCK, {signum}) - logging.warning("Signal %d received, cleaning up...", signum) - self._clean_up() - logging.warning("done. Exiting with error.") - sys.exit(1) - - atexit.register(self._clean_up) - signal.signal(signal.SIGTERM, signal_handler) - signal.signal(signal.SIGINT, signal_handler) - - def _clean_up(self): - if self._containers: - logging.warning("Cleaning up containers") - else: - return - # noinspection PyBroadException - try: - stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self._docker_stop_timeout)) - except Exception: - stop_timeout = 3 - for container in self._containers: - try: - container.stop(timeout=stop_timeout) - logging.info("☠: stopped container %s", self._trim_container_id(container.id)) - container.remove() - logging.info("🚽: removed container %s", self._trim_container_id(container.id)) - except Exception as e: - logging.exception(e) - self._containers.clear() - logging.info("Cleaning up containers finished.") - - def _add_container(self, container: Container) -> Container: - self._containers.add(container) - return container - - def _remove_container(self, container: Container): - self._containers.remove(container) - - def run(self, *args, **kwargs) -> int: - if "detach" in kwargs and kwargs.get("detach") is False: - raise ValueError("Can only safe run with 'detach' set to True") - else: - kwargs["detach"] = True - - # These variables are passed to the container so the process tree killer can find runaway - # process inside the container - # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller - # /~https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 - if "environment" not in kwargs: - kwargs["environment"] = {} - - jenkins_env_vars = ["BUILD_NUMBER", "BUILD_ID", "BUILD_TAG"] - kwargs["environment"].update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ}) - - ret = 0 - try: - # Race condition: - # If the call to docker_client.containers.run is interrupted, it is possible that - # the container won't be cleaned up. We avoid this by temporarily masking the signals. - signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}) - container = self._add_container(self._docker_client.containers.run(*args, **kwargs)) - signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) - logging.info("Started container: %s", self._trim_container_id(container.id)) - stream = container.logs(stream=True, stdout=True, stderr=True) - sys.stdout.flush() - for chunk in stream: - sys.stdout.buffer.write(chunk) - sys.stdout.buffer.flush() - sys.stdout.flush() - stream.close() - - try: - logging.info("Waiting for status of container %s for %d s.", - self._trim_container_id(container.id), - self._container_wait_seconds) - wait_result = container.wait(timeout=self._container_wait_seconds) - logging.info("Container exit status: %s", wait_result) - ret = wait_result.get('StatusCode', 200) - if ret != 0: - logging.error("Container exited with an error 😞") - logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) - else: - logging.info("Container exited with success 👍") - logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) - except Exception as err: - logging.exception(err) - return 150 - - try: - logging.info("Stopping container: %s", self._trim_container_id(container.id)) - container.stop() - except Exception as e: - logging.exception(e) - ret = 151 - - try: - logging.info("Removing container: %s", self._trim_container_id(container.id)) - container.remove() - except Exception as e: - logging.exception(e) - ret = 152 - self._remove_container(container) - containers = self._docker_client.containers.list() - if containers: - logging.info("Other running containers: %s", [self._trim_container_id(x.id) for x in containers]) - except NotFound as e: - logging.info("Container was stopped before cleanup started: %s", e) - - return ret - - -def _volume_mount(volume_dfn: str) -> Dict[str, Any]: - """ - Converts docker volume mount format, e.g. docker run --volume /local/path:/container/path:ro - to an object understood by the python docker library, e.g. {"local/path": {"bind": "/container/path", "mode": "ro"}} - This is used by the argparser for automatic conversion and input validation. - If the mode is not specified, 'rw' is assumed. - :param volume_dfn: A string to convert to a volume mount object in the format :[:ro|rw] - :return: An object in the form {"" : {"bind": "", "mode": "rw|ro"}} - """ - if volume_dfn is None: - raise argparse.ArgumentTypeError("Missing value for volume definition") - - parts = volume_dfn.split(":") - - if len(parts) < 2 or len(parts) > 3: - raise argparse.ArgumentTypeError("Invalid volume definition {}".format(volume_dfn)) - - mode = "rw" - if len(parts) == 3: - mode = parts[2] - - if mode not in ["rw", "ro"]: - raise argparse.ArgumentTypeError("Invalid volume mount mode {} in volume definition {}".format(mode, volume_dfn)) - - return {parts[0]: {"bind": parts[1], "mode": mode}} - - -def main(command_line_arguments): - config_logging() - - parser = argparse.ArgumentParser( - description="""Wrapper around docker run that protects against Zombie containers""", epilog="") - - parser.add_argument("-u", "--user", - help="Username or UID (format: [:])", - default=None) - - parser.add_argument("-v", "--volume", - action='append', - type=_volume_mount, - help="Bind mount a volume", - default=[]) - - parser.add_argument("--cap-add", - help="Add Linux capabilities", - action="append", - type=str, - default=[]) - - parser.add_argument("--runtime", - help="Runtime to use for this container", - default=None) - - parser.add_argument("--name", - help="Assign a name to the container", - default=None) - - parser.add_argument("image", metavar="IMAGE") - parser.add_argument("command", metavar="COMMAND") - parser.add_argument("args", nargs='*', metavar="ARG") - - args = parser.parse_args(args=command_line_arguments) - docker_client = SafeDockerClient() - return docker_client.run(args.image, **{ - "command": " ".join(list(chain([args.command] + args.args))), - "user": args.user, - "runtime": args.runtime, - "name": args.name, - "volumes": reduce(lambda dct, v: {**dct, **v}, args.volume, {}), - "cap_add": args.cap_add - }) - - -if __name__ == "__main__": - exit(main(sys.argv[1:])) diff --git a/ci/test_safe_docker_run.py b/ci/test_safe_docker_run.py deleted file mode 100644 index 3a46d9b3a41d..000000000000 --- a/ci/test_safe_docker_run.py +++ /dev/null @@ -1,422 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Safe docker run tests -""" -import itertools -import os -import signal -import unittest -from typing import Optional -from unittest.mock import create_autospec, patch, call - -from docker import DockerClient -from docker.models.containers import Container, ContainerCollection - -from safe_docker_run import SafeDockerClient, main - - -def create_mock_container(status_code: int = 0): - """ - Creates a mock docker container that exits with the specified status code - """ - mock_container = create_autospec(Container, name="mock_container") - mock_container.wait.return_value = { - "StatusCode": status_code - } - return mock_container - - -def create_mock_container_collection(container: Container): - """ - Creates a mock ContainerCollection that return the supplied container when the 'run' method is called - """ - mock_container_collection = create_autospec(ContainerCollection, name="mock_collection") - mock_container_collection.run.return_value = container - return mock_container_collection - - -class MockDockerClient: - """ - A mock DockerClient when docker.from_env is called - The supplied container will be returned when the client.containers.run method is called - """ - def __init__(self, container: Container): - self._mock_client = create_autospec(DockerClient, name="mock_client") - self._mock_client.containers = create_mock_container_collection(container) - self._patch = patch("docker.from_env", return_value=self._mock_client) - - def __enter__(self): - self._patch.start() - return self._mock_client - - def __exit__(self, _, __, ___): - self._patch.stop() - - -class TestSafeDockerRun(unittest.TestCase): - - @patch("safe_docker_run.signal.pthread_sigmask") - @patch.dict(os.environ, { - "BUILD_NUMBER": "BUILD_NUMBER_5", - "BUILD_ID": "BUILD_ID_1", - "BUILD_TAG": "BUILD_TAG_7" - }) - def test_run_successful(self, mock_pthread_sigmask): - """ - Tests successful run - """ - mock_container = create_mock_container() - - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - - # Check return code is 0 - assert safe_docker.run("image", "command") == 0 - - # Check call to container is correct - assert mock_client.containers.run.call_args_list == [ - call("image", "command", detach=True, environment={ - "BUILD_NUMBER": "BUILD_NUMBER_5", - "BUILD_ID": "BUILD_ID_1", - "BUILD_TAG": "BUILD_TAG_7" - }) - ] - - # Check correct signals are blocked then unblocked - assert mock_pthread_sigmask.call_args_list == [ - call(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}), - call(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) - ] - - # Assert container is stopped and removed - assert mock_container.stop.call_count == 1 - assert mock_container.remove.call_count == 1 - assert len(safe_docker._containers) == 0 - - def test_run_detach(self): - """ - Tests detach=True is passed to the underlying call by default - """ - mock_container = create_mock_container() - - # Test detach=True is passed in even if not specified - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["detach"] is True - - # Test passing in detach=True does not cause any issues - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command", detach=True) == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["detach"] is True - - # Test detach=False fails - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - with self.assertRaises(ValueError): - safe_docker.run("image", "command", detach=False) - assert mock_client.containers.run.call_args_list == [] - - def test_jenkins_vars(self): - """ - Tests jenkins environment variables are appropriately passed to the underlying docker run call - """ - # NOTE: It's important that these variables are passed to the underlying docker container - # These variables are passed to the container so the process tree killer can find runaway - # process inside the container - # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller - # /~https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 - - jenkins_vars = { - "BUILD_NUMBER": "BUILD_NUMBER_5", - "BUILD_ID": "BUILD_ID_1", - "BUILD_TAG": "BUILD_TAG_7" - } - mock_container = create_mock_container() - - # Test environment is empty if the jenkins vars are not present - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["environment"] == {} - - # Test environment contains jenkins env vars if they are present - with MockDockerClient(mock_container) as mock_client: - with patch.dict(os.environ, jenkins_vars): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["environment"] == jenkins_vars - - # Test jenkins env vars are added to callers env vars - user_env = {"key1": "value1", "key2": "value2"} - with MockDockerClient(mock_container) as mock_client: - with patch.dict(os.environ, jenkins_vars): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command", environment=user_env) == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["environment"] == {**jenkins_vars, **user_env} - - def test_run_args_kwargs_passed(self): - """ - Tests args and kwargs are passed to the container run call - """ - mock_container = create_mock_container() - - # Test detach=True is passed in even if not specified - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run( - "image", - "command", - "another_arg", - str_param="value", - bool_param=True, - none_param=None, - int_param=5, - float_param=5.2, - list_param=["this", "is", "a", "list"], - map_param={ - "a": "5", - "b": True, - "c": 2 - }) == 0 - assert mock_client.containers.run.call_args_list == [ - call( - "image", - "command", - "another_arg", - detach=True, - environment={}, - str_param="value", - bool_param=True, - none_param=None, - int_param=5, - float_param=5.2, - list_param=["this", "is", "a", "list"], - map_param={ - "a": "5", - "b": True, - "c": 2 - } - ) - ] - - def test_container_returns_non_zero_status_code(self): - """ - Tests non-zero code from container is returned and the container - is cleaned up - """ - mock_container = create_mock_container(status_code=10) - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - # check return code and that container gets cleaned up - assert safe_docker.run("image", "command") == 10 - assert mock_container.stop.call_count == 1 - assert mock_container.remove.call_count == 1 - assert len(safe_docker._containers) == 0 - - def test_container_wait_raises_returns_150(self): - """ - Tests 150 is returned if an error is raised when calling container.wait - """ - mock_container = create_mock_container() - mock_container.wait.side_effect = RuntimeError("Something bad happened") - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 150 - - def test_container_stop_raises_returns_151(self): - """ - Tests 151 is returned if an error is raised when calling container.stop - """ - mock_container = create_mock_container() - mock_container.stop.side_effect = RuntimeError("Something bad happened") - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 151 - - def test_container_remove_raises_returns_152(self): - """ - Tests 152 is returned if an error is raised when calling container.remove - """ - mock_container = create_mock_container() - mock_container.remove.side_effect = RuntimeError("Something bad happened") - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 152 - - def test_main(self): - """ - Tests main function against different command line arguments - """ - tests = [ - # ( supplied command line arguments, expected call ) - ( - ["image", "command"], - call("image", command="command", runtime=None, user=None, name=None, volumes={}, cap_add=[]) - ), - ( - ["image", "command", "arg1", "arg2"], - call("image", command="command arg1 arg2", runtime=None, user=None, name=None, volumes={}, cap_add=[]) - ), - ( - ["--runtime", "nvidia", "image", "command"], - call("image", command="command", runtime="nvidia", user=None, name=None, volumes={}, cap_add=[]) - ), - ( - ["--user", "1001:1001", "image", "command"], - call("image", command="command", runtime=None, user="1001:1001", name=None, volumes={}, cap_add=[]) - ), - ([ - "--volume", "/local/path1:/container/path1", - "--volume", "/local/path2:/container/path2:ro", - "image", - "command" - ], call("image", command="command", runtime=None, user=None, name=None, volumes={ - "/local/path1": { - "bind": "/container/path1", - "mode": "rw" - }, - "/local/path2": { - "bind": "/container/path2", - "mode": "ro" - } - }, cap_add=[])), - ([ - "--runtime", "nvidia", - "-u", "1001:1001", - "-v", "/local/path1:/container/path1", - "-v", "/local/path2:/container/path2:ro", - "--cap-add", "bob", - "--cap-add", "jimmy", - "--name", - "container_name", - "image", - "command", - "arg1", - "arg2" - ], call( - "image", - command="command arg1 arg2", - runtime="nvidia", - user="1001:1001", - name="container_name", - volumes={ - "/local/path1": { - "bind": "/container/path1", - "mode": "rw" - }, - "/local/path2": { - "bind": "/container/path2", - "mode": "ro" - } - }, cap_add=["bob", "jimmy"]) - ) - ] - - # Tests valid arguments - mock_docker = create_autospec(SafeDockerClient) - mock_docker.run.return_value = 0 - with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker): - for test in tests: - arguments, expected_call = test - main(arguments) - assert mock_docker.run.call_args == expected_call - - # Tests invalid arguments - tests = [ - [], - None, - ["image"], - # Test some bad volume mounts - ["-v", "bob", "image", "args"], - ["-v", "/local/path", "image", "args"], - ["-v", "/local/path:/container/path:blah", "image", "args"], - ["-v", "", "image", "args"], - ["-v", "a:b:c:d", "image", "args"] - ] - - mock_docker = create_autospec(SafeDockerClient) - with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker): - with self.assertRaises(SystemExit): - for test in tests: - main(test) - - def test_clean_up(self): - """ - Tests container clean up in case of SIGTERM and SIGINT - """ - import subprocess - import time - import docker.errors - - docker_client = docker.from_env() - container_name = "safedockertestcontainer1234" - - def get_container(name: str) -> Optional[Container]: - try: - return docker_client.containers.get(name) - except docker.errors.NotFound: - return None - - def remove_container_if_exists(name: str): - container = get_container(name) - if container: - container.stop() - container.remove() - - def wait_for_container(name: str) -> bool: - for _ in itertools.count(5): - if get_container(name): - return True - time.sleep(1) - return False - - # Clear any containers with container name - remove_container_if_exists(container_name) - - # None => not signal is emitted - we should still finish with no containers at the end due - # to the atexit - for sig in [None, signal.SIGTERM, signal.SIGINT]: - # Execute the safe docker run script in a different process - proc = subprocess.Popen(['python3','safe_docker_run.py', "--name", container_name, "ubuntu:18.04", "sleep 10"]) - # NOTE: we need to wait for the container to come up as not all operating systems support blocking signals - if wait_for_container(container_name) is False: - raise RuntimeError("Test container did not come up") - - # Issue the signal and wait for the process to finish - if sig: - proc.send_signal(sig) - proc.wait() - - # The container should no longer exist - assert get_container(container_name) is None From d6db57fe564a07bf712b0aa34b163ed591d63ddd Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Tue, 16 Feb 2021 17:45:25 +0000 Subject: [PATCH 2/2] Fix merge --- ci/build.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/ci/build.py b/ci/build.py index ef78504a5111..f7e8a9e966b0 100755 --- a/ci/build.py +++ b/ci/build.py @@ -314,33 +314,6 @@ def main() -> int: logging.critical("Execution of %s failed with status: %d", command, ret) return ret - elif args.all: - platforms = get_platforms() - platforms = [platform for platform in platforms if 'build.' in platform] - logging.info("Building for all architectures: %s", platforms) - logging.info("Artifacts will be produced in the build/ directory.") - for platform in platforms: - tag = get_docker_tag(platform=platform, registry=args.docker_registry) - load_docker_cache(tag=tag, docker_registry=args.docker_registry) - build_docker(platform, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache, - cache_intermediate=args.cache_intermediate) - if args.build_only: - continue - shutil.rmtree(buildir(), ignore_errors=True) - build_platform = "build_{}".format(platform) - plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..', - "mxnet_{}".format(build_platform))) - if os.path.exists(plat_buildir): - logging.warning("%s already exists, skipping", plat_buildir) - continue - command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform] - container_run( - platform=platform, nvidia_runtime=args.nvidiadocker, - shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, - local_ccache_dir=args.ccache_dir, environment=environment) - shutil.move(buildir(), plat_buildir) - logging.info("Built files left in: %s", plat_buildir) else: parser.print_help() list_platforms()