From 6426fb3fffe56878e26bf35d990a806cb4b3e97b Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Mon, 12 Oct 2020 14:22:51 -0700 Subject: [PATCH] [CI] Fix-Up Docker Build (Use Python) (#11139) --- .travis.yml | 7 +- build-docker.sh | 2 +- ci/travis/build-docker-images.py | 260 ++++++++++++++++++++++++++++ ci/travis/build-docker-images.sh | 7 +- ci/travis/determine_tests_to_run.py | 4 +- docker/README.md | 17 ++ docker/base-deps/Dockerfile | 2 +- docker/ray-deps/Dockerfile | 2 +- docker/ray-ml/Dockerfile | 2 +- docker/ray/Dockerfile | 2 +- 10 files changed, 291 insertions(+), 14 deletions(-) create mode 100644 ci/travis/build-docker-images.py create mode 100644 docker/README.md diff --git a/.travis.yml b/.travis.yml index ad05c641c..adf254fd9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -186,6 +186,7 @@ matrix: # Build Linux wheels and jars. - os: linux env: + # - PYTHON=3.6 - LINUX_WHEELS=1 LINUX_JARS=1 - PYTHONWARNINGS=ignore - RAY_INSTALL_JAVA=1 @@ -197,7 +198,9 @@ matrix: - . ./ci/travis/ci.sh build script: - . ./ci/travis/ci.sh test_wheels - - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then . $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.sh; fi + - export PATH="$HOME/miniconda3/bin:$PATH" + - python -m pip install docker + - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py; fi - bash ./java/build-jar-multiplatform.sh linux cache: false @@ -496,7 +499,7 @@ deploy: - provider: script edge: true # This supposedly opts in to deploy v2. 
- script: ./ci/keep_alive bash $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.sh + script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py skip_cleanup: true on: repo: ray-project/ray diff --git a/build-docker.sh b/build-docker.sh index 0f9a3f19e..9617beb4b 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -16,7 +16,7 @@ key="$1" case $key in --gpu) GPU="-gpu" - BASE_IMAGE="nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" + BASE_IMAGE="nvidia/cuda:10.1-cudnn8-runtime-ubuntu18.04" ;; --no-cache-build) NO_CACHE="--no-cache" diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py new file mode 100644 index 000000000..00ec2e8e0 --- /dev/null +++ b/ci/travis/build-docker-images.py @@ -0,0 +1,260 @@ +import datetime +import functools +import glob +import os +import re +import runpy +import shutil +import sys +from contextlib import redirect_stdout +from io import StringIO +from typing import List + +import docker + +print = functools.partial(print, file=sys.stderr, flush=True) +DOCKER_USERNAME = "raytravisbot" +DOCKER_CLIENT = None +PYTHON_WHL_VERSION = "cp37m" + + +def _merge_build(): + return os.environ.get("TRAVIS_PULL_REQUEST") == "false" + + +def _release_build(): + branch = os.environ.get("TRAVIS_BRANCH") + if not branch: + print("Branch not found!") + print(os.environ) + print("Environment is above ^^") + return False + return branch != "master" and "releases" in branch + + +def _get_curr_dir(): + return os.path.dirname(os.path.realpath(__file__)) + + +def _get_root_dir(): + return os.path.join(_get_curr_dir(), "../../") + + +def _get_wheel_name(): + matches = glob.glob( + f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}-manylinux*") + assert len(matches) == 1, ( + f"Found ({len(matches)}) matches " + f"'*{PYTHON_WHL_VERSION}-manylinux*' instead of 1") + return os.path.basename(matches[0]) + + +def _docker_affected(): + result = StringIO() + with redirect_stdout(result): +
runpy.run_path( + f"{_get_curr_dir()}/determine_tests_to_run.py", + run_name="__main__") + variable_definitions = result.getvalue().split() + env_var_dict = { + x.split("=")[0]: x.split("=")[1] + for x in variable_definitions + } + affected = env_var_dict["RAY_CI_DOCKER_AFFECTED"] == "1" or \ + env_var_dict["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED"] == "1" + print(f"Docker affected: {affected}") + return affected + + +def _build_cpu_gpu_images(image_name) -> List[str]: + built_images = [] + for gpu in ["-cpu", "-gpu"]: + build_args = {} + if image_name == "base-deps": + build_args["BASE_IMAGE"] = ( + "nvidia/cuda:10.1-cudnn8-runtime-ubuntu18.04" + if gpu == "-gpu" else "ubuntu:focal") + else: + build_args["GPU"] = gpu + + if "ray" in image_name: + build_args["WHEEL_PATH"] = f".whl/{_get_wheel_name()}" + + tagged_name = f"rayproject/{image_name}:nightly{gpu}" + for i in range(2): + output = DOCKER_CLIENT.api.build( + path=os.path.join(_get_root_dir(), "docker", image_name), + tag=tagged_name, + nocache=True, + buildargs=build_args) + + full_output = "" + try: + start = datetime.datetime.now() + current_iter = start + for line in output: + # print(line) + if datetime.datetime.now( + ) - current_iter >= datetime.timedelta(minutes=5): + current_iter = datetime.datetime.now() + elapsed = datetime.datetime.now() - start + print(f"Still building {tagged_name} after " + f"{elapsed.seconds} seconds") + full_output += line.decode("utf-8") + except Exception as e: + print(f"FAILURE with error {e}") + + if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: + print(f"ERROR building: {tagged_name} & error below:") + print(full_output) + if (i == 1): + raise Exception("FAILED TO BUILD IMAGE") + print("TRYING AGAIN") + else: + break + + print("BUILT: ", tagged_name) + built_images.append(tagged_name) + return built_images + + +def copy_wheels(): + root_dir = _get_root_dir() + wheel = _get_wheel_name() + source = os.path.join(root_dir, ".whl", wheel) + ray_dst = os.path.join(root_dir, 
"docker/ray/.whl/") + ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") + os.makedirs(ray_dst, exist_ok=True) + shutil.copy(source, ray_dst) + os.makedirs(ray_dep_dst, exist_ok=True) + shutil.copy(source, ray_dep_dst) + + +def build_or_pull_base_images(is_docker_affected: bool) -> bool: + """Returns True if base images were rebuilt, False if only pulled""" + _ = DOCKER_CLIENT.api.pull( + repository="rayproject/base-deps", tag="nightly") + + age = DOCKER_CLIENT.api.inspect_image("rayproject/base-deps:nightly")[ + "Created"] + short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d") + is_stale = ( + datetime.datetime.now() - short_date) > datetime.timedelta(days=14) + + if is_stale or is_docker_affected or _release_build(): + for image in ["base-deps", "ray-deps"]: + _build_cpu_gpu_images(image) + return True + else: + print("Just pulling images!") + _ = DOCKER_CLIENT.api.pull( + repository="rayproject/base-deps", tag="nightly-cpu") + _ = DOCKER_CLIENT.api.pull( + repository="rayproject/base-deps", tag="nightly-gpu") + + _ = DOCKER_CLIENT.api.pull( + repository="rayproject/ray-deps", tag="nightly-gpu") + _ = DOCKER_CLIENT.api.pull( + repository="rayproject/ray-deps", tag="nightly-cpu") + return False + + +def build_ray(): + return _build_cpu_gpu_images("ray") + + +def build_ray_ml(): + root_dir = _get_root_dir() + requirement_files = glob.glob( + f"{_get_root_dir()}/python/requirements*.txt") + for fl in requirement_files: + shutil.copy(fl, os.path.join(root_dir, "docker/ray-ml/")) + ray_ml_images = _build_cpu_gpu_images("ray-ml") + for img in ray_ml_images: + tag = img.split(":")[-1] + DOCKER_CLIENT.api.tag( + image=img, repository="rayproject/autoscaler", tag=tag) + + +# For non-release builds, push "nightly" & "sha" +# For release builds, push "nightly" & "latest" & "x.x.x" +def push_and_tag_images(push_base_images: bool): + if _merge_build(): + docker_password = os.environ.get("DOCKER_PASSWORD") + assert docker_password is not None, "DOCKER_PASSWORD not set."
+ DOCKER_CLIENT.api.login( + username=DOCKER_USERNAME, + password=os.environ.get("DOCKER_PASSWORD")) + + def docker_push(image, tag): + if _merge_build(): + result = DOCKER_CLIENT.api.push(image, tag=tag) + print(f"PUSHING: {image}:{tag}, result:") + print(result) + else: + print( + "This is a PR Build! On a merge build, we would normally push " + f"to: {image}:{tag}") + + def get_new_tag(old_tag, new_tag): + return old_tag.replace("nightly", new_tag) + + date_tag = datetime.datetime.now().strftime("%Y-%m-%d") + sha_tag = os.environ.get("TRAVIS_COMMIT")[:6] + if _release_build(): + release_name = re.search(r"[0-9]\.[0-9]\.[0-9]", + os.environ.get("TRAVIS_BRANCH")).group(0) + date_tag = release_name + sha_tag = release_name + + image_list = ["ray", "ray-ml", "autoscaler"] + if push_base_images: + image_list.extend(["base-deps", "ray-deps"]) + + for image in image_list: + full_image = f"rayproject/{image}" + + # Generate :nightly from nightly-cpu + DOCKER_CLIENT.api.tag( + image=f"{full_image}:nightly-cpu", + repository=full_image, + tag="nightly") + + for arch_tag in ["-cpu", "-gpu", ""]: + full_arch_tag = f"nightly{arch_tag}" + # Tag and push rayproject/{image}:nightly{arch_tag} + docker_push(full_image, full_arch_tag) + + specific_tag = get_new_tag( + full_arch_tag, date_tag if "-deps" in image else sha_tag) + # Tag and push rayproject/{image}:{sha_or_date}{arch_tag} + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=specific_tag) + docker_push(full_image, specific_tag) + + if _release_build(): + latest_tag = get_new_tag(full_arch_tag, "latest") + # Tag and push rayproject/{image}:latest{arch_tag} + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=latest_tag) + docker_push(full_image, latest_tag) + + +# Build base-deps/ray-deps only on file change, 2 weeks, per release +# Build ray, ray-ml, autoscaler every time + +if __name__ == "__main__": + print("RUNNING WITH: ", sys.version) + if os.environ.get("TRAVIS") == "true": + is_docker_affected =
_docker_affected() + if _merge_build() or is_docker_affected: + DOCKER_CLIENT = docker.from_env() + copy_wheels() + freshly_built = build_or_pull_base_images(is_docker_affected) + build_ray() + build_ray_ml() + push_and_tag_images(freshly_built) diff --git a/ci/travis/build-docker-images.sh b/ci/travis/build-docker-images.sh index 5a1aa7655..31cc90358 100755 --- a/ci/travis/build-docker-images.sh +++ b/ci/travis/build-docker-images.sh @@ -36,7 +36,7 @@ build_and_push_tags() { build_or_pull_base_images() { docker pull rayproject/base-deps:nightly - TAG=$(date +%F_%H-00) + TAG=$(date +%F) age=$(docker inspect -f '{{ .Created }}' rayproject/base-deps:nightly) # Build if older than 2 weeks, files have been edited in this PR OR branch release @@ -94,15 +94,12 @@ if [[ "$TRAVIS" == "true" ]]; then docker_push "rayproject/autoscaler:nightly$GPU" docker_push "rayproject/autoscaler:$commit_sha$GPU" done - - docker_push rayproject/autoscaler:nightly - docker_push rayproject/autoscaler:"$commit_sha" # We have a branch build, e.g. 
release/v0.7.0 if [[ "$TRAVIS_BRANCH" != "master" ]]; then # Replace / in branch name to - so it is legal tag name - normalized_branch_name=$(echo "$TRAVIS_BRANCH" | sed -e "s/\//-/") + normalized_branch_name=$(echo "$TRAVIS_BRANCH" | cut -d "/" -f2) # TODO(ilr) Remove autoscaler in the future for IMAGE in "base-deps" "ray-deps" "ray" "ray-ml" "autoscaler" do diff --git a/ci/travis/determine_tests_to_run.py b/ci/travis/determine_tests_to_run.py index 2e95117ca..0c2edf0a9 100644 --- a/ci/travis/determine_tests_to_run.py +++ b/ci/travis/determine_tests_to_run.py @@ -106,9 +106,9 @@ if __name__ == "__main__": RAY_CI_MACOS_WHEELS_AFFECTED = 1 RAY_CI_STREAMING_PYTHON_AFFECTED = 1 RAY_CI_DOC_AFFECTED = 1 - if changed_file.startswith("python/setup.py"): + if changed_file.startswith("python/setup.py") or re.search( + r"requirements.*\.txt", changed_file): RAY_CI_PYTHON_DEPENDENCIES_AFFECTED = 1 - RAY_CI_LINUX_WHEELS_AFFECTED = 1 elif changed_file.startswith("java/"): RAY_CI_JAVA_AFFECTED = 1 RAY_CI_STREAMING_JAVA_AFFECTED = 1 diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 000000000..9287a4f0f --- /dev/null +++ b/docker/README.md @@ -0,0 +1,17 @@ +Overview of how the ray images are built: + +Images without a "-cpu" or "-gpu" tag are built on ``ubuntu/focal``. They are just an alias for **-cpu** (e.g. ``ray:latest`` is the same as ``ray:latest-cpu``).
+ +``` +ubuntu/focal +└── base-deps:cpu + └── ray-deps:cpu + └── ray:cpu + └── ray-ml:cpu + +nvidia/cuda +└── base-deps:gpu + └── ray-deps:gpu + └── ray:gpu + └── ray-ml:gpu +``` diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index 82e3e9a09..f3d453d20 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,6 +1,6 @@ # The base-deps Docker image installs main libraries needed to run Ray -# The GPU option is nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 +# The GPU option is nvidia/cuda:10.1-cudnn8-runtime-ubuntu18.04 ARG BASE_IMAGE="ubuntu:focal" FROM ${BASE_IMAGE} # If this arg is not "autoscaler" then no autoscaler requirements will be included diff --git a/docker/ray-deps/Dockerfile b/docker/ray-deps/Dockerfile index eaa9a04a1..9fc205c0f 100644 --- a/docker/ray-deps/Dockerfile +++ b/docker/ray-deps/Dockerfile @@ -1,5 +1,5 @@ ARG GPU="" -FROM rayproject/base-deps:latest"$GPU" +FROM rayproject/base-deps:nightly"$GPU" # If this arg is not "autoscaler" then no autoscaler requirements will be included ARG AUTOSCALER="autoscaler" ARG WHEEL_PATH diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 6e6542cbf..9d091bdbb 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,5 +1,5 @@ ARG GPU -FROM rayproject/ray:latest"$GPU" +FROM rayproject/ray:nightly"$GPU" # We have to uninstall wrapt this way for Tensorflow compatibility COPY requirements.txt ./ diff --git a/docker/ray/Dockerfile b/docker/ray/Dockerfile index 4d89c8956..0b031d0ec 100644 --- a/docker/ray/Dockerfile +++ b/docker/ray/Dockerfile @@ -1,5 +1,5 @@ ARG GPU -FROM rayproject/ray-deps:latest"$GPU" +FROM rayproject/ray-deps:nightly"$GPU" ARG WHEEL_PATH # For Click ENV LC_ALL=C.UTF-8