[Buildkite] Add all Python tests (#13566)

2026-06-27 15:16:34 +08:00 · 2021-01-25 16:05:59 -08:00
parent 0d75f37c1f
commit 8b8d6b984b
9 changed files with 183 additions and 16 deletions
@@ -95,6 +95,7 @@ test:asan --test_env=ASAN_OPTIONS="detect_leaks=0"
 test:asan --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.2 /usr/lib/gcc/x86_64-linux-gnu/7/libasan.so"
 # For example, for Ubuntu 18.04 libasan can be found here:
 # test:asan --test_env=LD_PRELOAD="/usr/lib/gcc/x86_64-linux-gnu/7/libasan.so"
+test:asan-buildkite --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.5"

 # CI configuration:
 aquery:ci --color=no
@@ -5,15 +5,25 @@ ARG BUILDKITE_PULL_REQUEST

 ENV DEBIAN_FRONTEND=noninteractive
 ENV TZ=America/Los_Angeles
+
 ENV BUILDKITE=true
 ENV CI=true
 ENV PYTHON=3.6
+ENV RAY_USE_RANDOM_PORTS=1
+ENV RAY_DEFAULT_BUILD=1

 RUN apt-get update -qq
 RUN apt-get install -y -qq \
    curl python-is-python3 git build-essential \
-    sudo unzip apt-utils dialog tzdata wget
+    sudo unzip apt-utils dialog tzdata wget rsync \
+    language-pack-en tmux cmake gdb vim htop \
+    libgtk2.0-dev zlib1g-dev libgl1-mesa-dev
+
+# System conf for tests
 RUN locale -a
+ENV LC_ALL=en_US.utf8
+ENV LANG=en_US.utf8
+RUN echo "ulimit -c 0" >> /root/.bashrc

 # Setup Bazel caches
 RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \
@@ -1,6 +1,141 @@
- label: "Ray Core Tests (:buildkite: Experimental)"
+- label: ":cpp: Tests"
  commands:
-  - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/...
- label: "Ray Dashboard Tests"
+  - bash src/ray/test/run_object_manager_tests.sh
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      --build_tests_only
+      -- //:all -rllib/... -core_worker_test
+
+- label: ":cpp: Tests (ASAN)"
  commands:
-  - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/new_dashboard/...
+  - bazel test --config=ci --config=asan $(./scripts/bazel_export_options)
+      --build_tests_only
+      --config=asan-buildkite
+      --jobs=2
+      -- //:all -//:core_worker_test
+
+- label: ":serverless: Dashboard + Serve Tests"
+  commands:
+  - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      python/ray/new_dashboard/...
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      python/ray/serve/...
+
+- label: ":python: (Small & Large)"
+  commands:
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      --test_tag_filters=-kubernetes,-jenkins_only,-medium_size_python_tests_a_to_j,-medium_size_python_tests_k_to_z
+      python/ray/tests/...
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      --test_tag_filters=-kubernetes,-jenkins_only,client_tests
+      --test_env=RAY_CLIENT_MODE=1
+      python/ray/tests/...
+- label: ":python: (Medium A-J)"
+  commands:
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_a_to_j
+      python/ray/tests/...
+- label: ":python: (Medium K-Z)"
+  commands:
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_k_to_z
+      python/ray/tests/...
+
+- label: ":brain: RLlib: Learning tests (from rllib/tuned_examples/*.yaml)"
+  commands:
+  - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+  - bazel test --config=ci $(./scripts/bazel_export_options)
+      --build_tests_only
+      --test_tag_filters=learning_tests_tf
+      rllib/...
+
+- label: ":brain: RLlib: Learning tests with tf=1.x (from rllib/tuned_examples/*.yaml)"
+  commands:
+    - RLLIB_TESTING=1 TF_VERSION=1.14.0 TFP_VERSION=0.7 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options)
+      --build_tests_only
+      --test_tag_filters=learning_tests_tf
+      rllib/...
+
+- label: ":brain: RLlib: Learning tests with Torch (from rllib/tuned_examples/*.yaml)"
+  commands:
+    - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options)
+      --build_tests_only
+      --test_tag_filters=learning_tests_torch
+      rllib/...
+
+- label: ":brain: RLlib: Quick Agent train.py runs"
+  commands:
+    - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options)
+        --build_tests_only
+        --test_tag_filters=quick_train
+        --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
+        rllib/...
+    # Test everything that does not have any of the "main" labels:
+    # "learning_tests|quick_train|examples|tests_dir".
+    - bazel test --config=ci $(./scripts/bazel_export_options)
+        --build_tests_only
+        --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir
+        --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
+        rllib/...
+
+- label: ":brain: RLlib: rllib/examples/"
+  commands:
+    - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
+        --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
+        --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
+        --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
+        rllib/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
+        --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
+        rllib/...
+
+- label: ":brain: RLlib: rllib/tests/ (A-L)"
+  commands:
+    - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
+        --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
+        rllib/...
+
+- label: ":brain: RLlib: rllib/tests/ (M-Z)"
+  commands:
+    - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
+        --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
+        rllib/...
+
+
+- label: ":octopus: Tune tests and examples"
+  commands:
+    - TUNE_TESTING=1 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only,-example python/ray/tune/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=example,-tf,-pytorch,-py37,-flaky python/ray/tune/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky python/ray/tune/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky python/ray/tune/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-py37,flaky python/ray/tune/...
+
+- label: ":octopus: SGD tests and examples"
+  commands:
+    - SGD_TESTING=1 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/...
+
+- label: ":octopus: Tune/SGD tests and examples. Python 3.7"
+  commands:
+    - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh
+    # Because the Python version changed, we need to re-install Ray here
+    - rm -rf ./python/ray/thirdparty_files; ./ci/travis/ci.sh build
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/...
+
+- label: ":book: Doc tests and examples"
+  commands:
+    - DOC_TESTING=1 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 doc/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 doc/...
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/...
@@ -23,6 +23,13 @@ pkg_install_helper() {
 }

 install_bazel() {
+  if command -v bazel; then
+    if [ -n "${BUILDKITE-}" ]; then
+      echo "Bazel exists, skipping the install"
+      return
+    fi
+  fi
+
  "${ROOT_DIR}"/install-bazel.sh
  if [ -f /etc/profile.d/bazel.sh ]; then
    . /etc/profile.d/bazel.sh
@@ -30,6 +37,11 @@ install_bazel() {
 }

 install_base() {
+  if [ -n "${BUILDKITE-}" ]; then
+    echo "Skipping install_base in Buildkite"
+    return
+  fi
+
  case "${OSTYPE}" in
    linux*)
      # Expired apt key error: https://github.com/bazelbuild/bazel/issues/11470#issuecomment-633205152
@@ -188,9 +200,7 @@ install_nvm() {
        > "${NVM_HOME}/nvm.sh"
    fi
  elif [ -n "${BUILDKITE-}" ]; then
-    # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions
-    curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash -
-    sudo apt-get install -y nodejs
+    echo "Skipping nvm on Buildkite because we will use apt-get."
  else
    test -f "${NVM_HOME}/nvm.sh"  # double-check NVM is already available on other platforms
  fi
@@ -216,10 +226,19 @@ install_upgrade_pip() {
 }

 install_node() {
+  if command -v node; then
+    if [ -n "${BUILDKITE-}" ]; then
+      echo "Node existed, skipping install";
+      return
+    fi
+  fi
+
  if [ "${OSTYPE}" = msys ] ; then
    { echo "WARNING: Skipping running Node.js due to incompatibilities with Windows"; } 2> /dev/null
  elif [ -n "${BUILDKITE-}" ] ; then
-    { echo "WARNING: Skipping running Node.js on buildkite because it's already there"; } 2> /dev/null
+    # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions
+    curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash -
+    sudo apt-get install -y nodejs
  else
    # Install the latest version of Node.js in order to build the dashboard.
    (
@@ -258,7 +277,7 @@ install_dependencies() {

  if [ -n "${PYTHON-}" ]; then
    # Remove this entire section once RLlib and Serve dependencies are fixed.
-    if [ -z "${BUILDKITE-}" ] && [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then
+    if [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then
      # PyTorch is installed first since we are using a "-f" directive to find the wheels.
      # We want to install the CPU version only.
      local torch_url="https://download.pytorch.org/whl/torch_stable.html"
@@ -739,6 +739,7 @@ def stop(force, verbose, log_style, log_color):

    total_found = 0
    total_stopped = 0
+    stopped = []
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
@@ -777,6 +778,7 @@ def stop(force, verbose, log_style, log_color):
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
+                stopped.append(proc)
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
@@ -799,8 +801,8 @@ def stop(force, verbose, log_style, log_color):
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))

-    # TODO(maximsmol): we should probably block until the processes actually
-    # all died somehow
+    # Wait for the processes to actually stop.
+    psutil.wait_procs(stopped, timeout=2)


@cli.command()
@@ -15,7 +15,7 @@ def ray_start_combination(request):
        initialize_head=True,
        head_node_args={
            "num_cpus": 10,
-            "redis_max_memory": 10**7
+            "redis_max_memory": 10**8
        })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
@@ -20,7 +20,7 @@ def ray_start_reconstruction(request):
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": plasma_store_memory // num_nodes,
-            "redis_max_memory": 10**7,
+            "redis_max_memory": 10**8,
            "_system_config": {
                "object_timeout_milliseconds": 200
            }
@@ -14,7 +14,7 @@ def ray_start_sharded(request):
        object_store_memory=int(0.5 * 10**9),
        num_cpus=10,
        # _num_redis_shards=num_redis_shards,
-        _redis_max_memory=10**7)
+        _redis_max_memory=10**8)

    yield None

@@ -10,7 +10,7 @@ class TestObjectLostErrors(unittest.TestCase):
        ray.init(
            num_cpus=1,
            object_store_memory=150 * 1024 * 1024,
-            _redis_max_memory=10000000)
+            _redis_max_memory=10**8)

    def tearDown(self):
        ray.shutdown()