diff --git a/.bazelrc b/.bazelrc index 2baaa0fa2..8de20992a 100644 --- a/.bazelrc +++ b/.bazelrc @@ -95,6 +95,7 @@ test:asan --test_env=ASAN_OPTIONS="detect_leaks=0" test:asan --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.2 /usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" # For example, for Ubuntu 18.04 libasan can be found here: # test:asan --test_env=LD_PRELOAD="/usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" +test:asan-buildkite --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.5" # CI configuration: aquery:ci --color=no diff --git a/.buildkite/Dockerfile b/.buildkite/Dockerfile index 2f52fb92d..86bd28148 100644 --- a/.buildkite/Dockerfile +++ b/.buildkite/Dockerfile @@ -5,15 +5,25 @@ ARG BUILDKITE_PULL_REQUEST ENV DEBIAN_FRONTEND=noninteractive ENV TZ=America/Los_Angeles + ENV BUILDKITE=true ENV CI=true ENV PYTHON=3.6 +ENV RAY_USE_RANDOM_PORTS=1 +ENV RAY_DEFAULT_BUILD=1 RUN apt-get update -qq RUN apt-get install -y -qq \ curl python-is-python3 git build-essential \ - sudo unzip apt-utils dialog tzdata wget + sudo unzip apt-utils dialog tzdata wget rsync \ + language-pack-en tmux cmake gdb vim htop \ + libgtk2.0-dev zlib1g-dev libgl1-mesa-dev + +# System conf for tests RUN locale -a +ENV LC_ALL=en_US.utf8 +ENV LANG=en_US.utf8 +RUN echo "ulimit -c 0" >> /root/.bashrc # Setup Bazel caches RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \ diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 91c673d52..0544234af 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,6 +1,141 @@ -- label: "Ray Core Tests (:buildkite: Experimental)" +- label: ":cpp: Tests" commands: - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -- label: "Ray Dashboard Tests" + - bash src/ray/test/run_object_manager_tests.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + -- //:all -rllib/... 
-core_worker_test + +- label: ":cpp: Tests (ASAN)" commands: - - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/new_dashboard/... + - bazel test --config=ci --config=asan $(./scripts/bazel_export_options) + --build_tests_only + --config=asan-buildkite + --jobs=2 + -- //:all -//:core_worker_test + +- label: ":serverless: Dashboard + Serve Tests" + commands: + - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + python/ray/new_dashboard/... + - bazel test --config=ci $(./scripts/bazel_export_options) + python/ray/serve/... + +- label: ":python: (Small & Large)" + commands: + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,-medium_size_python_tests_a_to_j,-medium_size_python_tests_k_to_z + python/ray/tests/... + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,client_tests + --test_env=RAY_CLIENT_MODE=1 + python/ray/tests/... +- label: ":python: (Medium A-J)" + commands: + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_a_to_j + python/ray/tests/... +- label: ":python: (Medium K-Z)" + commands: + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_k_to_z + python/ray/tests/... + +- label: ":brain: RLlib: Learning tests (from rllib/tuned_examples/*.yaml)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=learning_tests_tf + rllib/... 
+ +- label: ":brain: RLlib: Learning tests with tf=1.x (from rllib/tuned_examples/*.yaml)" + commands: + - RLLIB_TESTING=1 TF_VERSION=1.14.0 TFP_VERSION=0.7 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=learning_tests_tf + rllib/... + +- label: ":brain: RLlib: Learning tests with Torch (from rllib/tuned_examples/*.yaml)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=learning_tests_torch + rllib/... + +- label: ":brain: RLlib: Quick Agent train.py runs" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=quick_train + --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + # Test everything that does not have any of the "main" labels: + # "learning_tests|quick_train|examples|tests_dir". + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir + --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + +- label: ":brain: RLlib: rllib/examples/" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... 
+ - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + +- label: ":brain: RLlib: rllib/tests/ (A-L)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + +- label: ":brain: RLlib: rllib/tests/ (M-Z)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + + +- label: ":octopus: Tune tests and examples" + commands: + - TUNE_TESTING=1 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only,-example python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=example,-tf,-pytorch,-py37,-flaky python/ray/tune/... 
+ - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-py37,flaky python/ray/tune/... + +- label: ":octopus: SGD tests and examples" + commands: + - SGD_TESTING=1 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/... + +- label: ":octopus: Tune/SGD tests and examples. Python 3.7" + commands: + - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh + # Because Python version changed, we need to re-install Ray here + - rm -rf ./python/ray/thirdparty_files; ./ci/travis/ci.sh build + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... + +- label: ":book: Doc tests and examples" + commands: + - DOC_TESTING=1 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 doc/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 doc/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/... 
\ No newline at end of file diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 96f4fa95a..498aaf419 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -23,6 +23,13 @@ pkg_install_helper() { } install_bazel() { + if command -v bazel; then + if [ -n "${BUILDKITE-}" ]; then + echo "Bazel exists, skipping the install" + return + fi + fi + "${ROOT_DIR}"/install-bazel.sh if [ -f /etc/profile.d/bazel.sh ]; then . /etc/profile.d/bazel.sh @@ -30,6 +37,11 @@ install_bazel() { } install_base() { + if [ -n "${BUILDKITE-}" ]; then + echo "Skipping install_base in Buildkite" + return + fi + case "${OSTYPE}" in linux*) # Expired apt key error: https://github.com/bazelbuild/bazel/issues/11470#issuecomment-633205152 @@ -188,9 +200,7 @@ install_nvm() { > "${NVM_HOME}/nvm.sh" fi elif [ -n "${BUILDKITE-}" ]; then - # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions - curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - - sudo apt-get install -y nodejs + echo "Skipping nvm on Buildkite because we will use apt-get." else test -f "${NVM_HOME}/nvm.sh" # double-check NVM is already available on other platforms fi @@ -216,10 +226,19 @@ install_upgrade_pip() { } install_node() { + if command -v node; then + if [ -n "${BUILDKITE-}" ]; then + echo "Node existed, skipping install"; + return + fi + fi + if [ "${OSTYPE}" = msys ] ; then { echo "WARNING: Skipping running Node.js due to incompatibilities with Windows"; } 2> /dev/null elif [ -n "${BUILDKITE-}" ] ; then - { echo "WARNING: Skipping running Node.js on buildkite because it's already there"; } 2> /dev/null + # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions + curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - + sudo apt-get install -y nodejs else # Install the latest version of Node.js in order to build the dashboard. 
( @@ -258,7 +277,7 @@ install_dependencies() { if [ -n "${PYTHON-}" ]; then # Remove this entire section once RLlib and Serve dependencies are fixed. - if [ -z "${BUILDKITE-}" ] && [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then + if [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then # PyTorch is installed first since we are using a "-f" directive to find the wheels. # We want to install the CPU version only. local torch_url="https://download.pytorch.org/whl/torch_stable.html" diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 6fecd2dc2..b61c69399 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -739,6 +739,7 @@ def stop(force, verbose, log_style, log_color): total_found = 0 total_stopped = 0 + stopped = [] for keyword, filter_by_cmd in processes_to_kill: if filter_by_cmd and is_linux and len(keyword) > 15: # getting here is an internal bug, so we do not use cli_logger @@ -777,6 +778,7 @@ def stop(force, verbose, log_style, log_color): cf.dimmed("(via SIGTERM)")) total_stopped += 1 + stopped.append(proc) except psutil.NoSuchProcess: cli_logger.verbose( "Attempted to stop `{}`, but process was already dead.", @@ -799,8 +801,8 @@ def stop(force, verbose, log_style, log_color): cli_logger.warning("Try running the command again, or use `{}`.", cf.bold("--force")) - # TODO(maximsmol): we should probably block until the processes actually - # all died somehow + # Wait for the processes to actually stop. 
+ psutil.wait_procs(stopped, timeout=2) @cli.command() diff --git a/python/ray/tests/test_stress.py b/python/ray/tests/test_stress.py index 200788736..99ed18671 100644 --- a/python/ray/tests/test_stress.py +++ b/python/ray/tests/test_stress.py @@ -15,7 +15,7 @@ def ray_start_combination(request): initialize_head=True, head_node_args={ "num_cpus": 10, - "redis_max_memory": 10**7 + "redis_max_memory": 10**8 }) for i in range(num_nodes - 1): cluster.add_node(num_cpus=10) diff --git a/python/ray/tests/test_stress_failure.py b/python/ray/tests/test_stress_failure.py index 01d39afa8..83d9f40f2 100644 --- a/python/ray/tests/test_stress_failure.py +++ b/python/ray/tests/test_stress_failure.py @@ -20,7 +20,7 @@ def ray_start_reconstruction(request): head_node_args={ "num_cpus": 1, "object_store_memory": plasma_store_memory // num_nodes, - "redis_max_memory": 10**7, + "redis_max_memory": 10**8, "_system_config": { "object_timeout_milliseconds": 200 } diff --git a/python/ray/tests/test_stress_sharded.py b/python/ray/tests/test_stress_sharded.py index 7f05f27ac..c6e5cd484 100644 --- a/python/ray/tests/test_stress_sharded.py +++ b/python/ray/tests/test_stress_sharded.py @@ -14,7 +14,7 @@ def ray_start_sharded(request): object_store_memory=int(0.5 * 10**9), num_cpus=10, # _num_redis_shards=num_redis_shards, - _redis_max_memory=10**7) + _redis_max_memory=10**8) yield None diff --git a/python/ray/tests/test_unreconstructable_errors.py b/python/ray/tests/test_unreconstructable_errors.py index 501dce905..24be89b94 100644 --- a/python/ray/tests/test_unreconstructable_errors.py +++ b/python/ray/tests/test_unreconstructable_errors.py @@ -10,7 +10,7 @@ class TestObjectLostErrors(unittest.TestCase): ray.init( num_cpus=1, object_store_memory=150 * 1024 * 1024, - _redis_max_memory=10000000) + _redis_max_memory=10**8) def tearDown(self): ray.shutdown()