From 8804758409a1a416ff8017843a3896b217d6f15c Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Wed, 20 Jan 2021 18:40:23 +0100 Subject: [PATCH] [xgboost] Add XGBoost release tests (#13456) * Add XGBoost release tests * Add more xgboost release tests * Use failure state manager * Add release test documentation * Fix wording * Automate fault tolerance tests --- .../xgboost_tests/cluster_cpu_moderate.yaml | 37 ++++++ release/xgboost_tests/cluster_cpu_small.yaml | 37 ++++++ release/xgboost_tests/cluster_gpu_small.yaml | 37 ++++++ release/xgboost_tests/create_test_data.py | 58 +++++++++ release/xgboost_tests/requirements.txt | 2 + release/xgboost_tests/run.sh | 58 +++++++++ release/xgboost_tests/wait_cluster.py | 49 ++++++++ release/xgboost_tests/workloads/_train.py | 94 +++++++++++++++ .../workloads/distributed_api_test.py | 26 ++++ .../workloads/ft_small_elastic.py | 61 ++++++++++ .../workloads/ft_small_non_elastic.py | 113 ++++++++++++++++++ release/xgboost_tests/workloads/train_gpu.py | 46 +++++++ .../xgboost_tests/workloads/train_moderate.py | 33 +++++ .../xgboost_tests/workloads/train_small.py | 33 +++++ release/xgboost_tests/workloads/tune_32x4.py | 56 +++++++++ release/xgboost_tests/workloads/tune_4x32.py | 56 +++++++++ release/xgboost_tests/workloads/tune_small.py | 56 +++++++++ 17 files changed, 852 insertions(+) create mode 100644 release/xgboost_tests/cluster_cpu_moderate.yaml create mode 100644 release/xgboost_tests/cluster_cpu_small.yaml create mode 100644 release/xgboost_tests/cluster_gpu_small.yaml create mode 100644 release/xgboost_tests/create_test_data.py create mode 100644 release/xgboost_tests/requirements.txt create mode 100755 release/xgboost_tests/run.sh create mode 100644 release/xgboost_tests/wait_cluster.py create mode 100644 release/xgboost_tests/workloads/_train.py create mode 100644 release/xgboost_tests/workloads/distributed_api_test.py create mode 100644 release/xgboost_tests/workloads/ft_small_elastic.py create mode 100644 release/xgboost_tests/workloads/ft_small_non_elastic.py create mode 100644 release/xgboost_tests/workloads/train_gpu.py create mode 100644 release/xgboost_tests/workloads/train_moderate.py create mode 100644 release/xgboost_tests/workloads/train_small.py create mode 100644 release/xgboost_tests/workloads/tune_32x4.py create mode 100644 release/xgboost_tests/workloads/tune_4x32.py create mode 100644 release/xgboost_tests/workloads/tune_small.py diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml new file mode 100644 index 000000000..18a18dceb --- /dev/null +++ b/release/xgboost_tests/cluster_cpu_moderate.yaml @@ -0,0 +1,37 @@ +cluster_name: ray-xgboost-release-cpu-moderate + +min_workers: 31 +max_workers: 31 +initial_workers: 31 + +target_utilization_fraction: 0.8 +idle_timeout_minutes: 15 + +docker: + image: anyscale/ray:latest + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +auth: + ssh_user: ubuntu + +head_node: + # 64 CPUs + InstanceType: m5.xlarge + +worker_nodes: + # 64 CPUs + InstanceType: m5.xlarge + +setup_commands: + - pip install pytest xgboost_ray + - sudo mkdir -p /data || true + - sudo chown ray:1000 /data || true + - rm -rf /data/classification.parquet || true + - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml new file mode 100644 index 000000000..fe9e997f8 --- /dev/null +++ b/release/xgboost_tests/cluster_cpu_small.yaml @@ -0,0 +1,37 @@ +cluster_name: ray-xgboost-release-cpu-small + +min_workers: 3 +max_workers: 3 +initial_workers: 3 + +target_utilization_fraction: 0.8 +idle_timeout_minutes: 15 + +docker: + image: anyscale/ray:latest + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +auth: + ssh_user: ubuntu + +head_node: + # 64 CPUs + InstanceType: m5.xlarge + +worker_nodes: + # 64 CPUs + InstanceType: m5.xlarge + +setup_commands: + - pip install pytest xgboost_ray + - sudo mkdir -p /data || true + - sudo chown ray:1000 /data || true + - rm -rf /data/classification.parquet || true + - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml new file mode 100644 index 000000000..5bea4f19a --- /dev/null +++ b/release/xgboost_tests/cluster_gpu_small.yaml @@ -0,0 +1,37 @@ +cluster_name: ray-xgboost-release-gpu-small + +min_workers: 4 +max_workers: 4 +initial_workers: 4 + +target_utilization_fraction: 0.8 +idle_timeout_minutes: 15 + +docker: + image: anyscale/ray:latest-gpu + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +auth: + ssh_user: ubuntu + +head_node: + # 64 CPUs + InstanceType: m5.xlarge + +worker_nodes: + # 64 CPUs + InstanceType: p2.xlarge + +setup_commands: + - pip install pytest xgboost_ray + - sudo mkdir -p /data || true + - sudo chown ray:1000 /data || true + - rm -rf /data/classification.parquet || true + - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/create_test_data.py b/release/xgboost_tests/create_test_data.py new file mode 100644 index 000000000..21b1c8fdd --- /dev/null +++ b/release/xgboost_tests/create_test_data.py @@ -0,0 +1,58 @@ +import argparse +import numpy as np +import os + +from xgboost_ray.tests.utils import create_parquet + +if __name__ == "__main__": + if "OMP_NUM_THREADS" in os.environ: + del os.environ["OMP_NUM_THREADS"] + + parser = argparse.ArgumentParser(description="Create fake data.") + parser.add_argument( + "filename", type=str, default="/data/parted.parquet/", help="ray/dask") + parser.add_argument( + "-r", + "--num-rows", + required=False, + type=int, + default=1e8, + help="num rows") + parser.add_argument( + "-p", + "--num-partitions", + required=False, + type=int, + default=100, + help="num partitions") + parser.add_argument( + "-c", + "--num-cols", + required=False, + type=int, + default=4, + help="num columns (features)") + parser.add_argument( + "-C", + "--num-classes", + required=False, + type=int, + default=2, + help="num classes") + parser.add_argument( + "-s", + "--seed", + required=False, + type=int, + default=1234, + help="random seed") + + args = parser.parse_args() + + np.random.seed(args.seed) + create_parquet( + args.filename, + num_rows=int(args.num_rows), + num_partitions=int(args.num_partitions), + num_features=int(args.num_cols), + num_classes=int(args.num_classes)) diff --git a/release/xgboost_tests/requirements.txt b/release/xgboost_tests/requirements.txt new file mode 100644 index 000000000..3f41a6379 --- /dev/null +++ b/release/xgboost_tests/requirements.txt @@ -0,0 +1,2 @@ +ray[tune] +xgboost_ray \ No newline at end of file diff --git a/release/xgboost_tests/run.sh b/release/xgboost_tests/run.sh new file mode 100755 index 000000000..f4ba4a62f --- /dev/null +++ b/release/xgboost_tests/run.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +nodes="" +ray_version="" +commit="" +ray_branch="" + +for i in "$@" +do +echo "$i" +case "$i" in + --nodes=*) + nodes="${i#*=}" + ;; + --ray-version=*) + ray_version="${i#*=}" + ;; + --commit=*) + commit="${i#*=}" + ;; + --ray-branch=*) + ray_branch="${i#*=}" + ;; + --workload=*) + workload="${i#*=}" + ;; + --help) + usage + exit + ;; + *) + echo "unknown arg, $i" + exit 1 + ;; +esac +done + +if [[ $nodes == "" || $ray_version == "" || $commit == "" || $ray_branch == "" ]] +then + echo "Provide --nodes --ray-version, --commit, and --ray-branch" + exit 1 +fi + +echo "nodes: $nodes" +echo "version: $ray_version" +echo "commit: $commit" +echo "branch: $ray_branch" +echo "workload: ignored" + +# wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" +# pip install -U "$wheel" + +if ! python "wait_cluster.py" "$nodes" 600; then + echo "Cluster did not come up in time. Aborting test." + exit 1 +fi + +python "workloads/$workload.py" diff --git a/release/xgboost_tests/wait_cluster.py b/release/xgboost_tests/wait_cluster.py new file mode 100644 index 000000000..a0f8260bd --- /dev/null +++ b/release/xgboost_tests/wait_cluster.py @@ -0,0 +1,49 @@ +import argparse +import time + +import ray + +ray.init(address="auto") + +parser = argparse.ArgumentParser() +parser.add_argument( + "num_nodes", + type=int, + help="Wait for this number of nodes (includes head)") + +parser.add_argument( + "max_time_s", type=int, help="Wait for this number of seconds") + +parser.add_argument( + "--feedback_interval_s", + type=int, + default=10, + help="Wait for this number of seconds") + +args = parser.parse_args() + +curr_nodes = 0 +start = time.time() +next_feedback = start +max_time = start + args.max_time_s +while not curr_nodes >= args.num_nodes: + now = time.time() + + if now >= max_time: + raise RuntimeError( + f"Maximum wait time reached, but only " + f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting.") + + if now >= next_feedback: + passed = now - start + print(f"Waiting for more nodes to come up: " + f"{curr_nodes}/{args.num_nodes} " + f"({passed:.0f} seconds passed)") + next_feedback = now + args.feedback_interval_s + + time.sleep(5) + curr_nodes = len(ray.nodes()) + +passed = time.time() - start +print(f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " + f"{passed:.0f} seconds") diff --git a/release/xgboost_tests/workloads/_train.py b/release/xgboost_tests/workloads/_train.py new file mode 100644 index 000000000..586f65991 --- /dev/null +++ b/release/xgboost_tests/workloads/_train.py @@ -0,0 +1,94 @@ +import glob +import os + +import time + +from xgboost_ray import train, RayDMatrix, RayFileType, \ + RayDeviceQuantileDMatrix, RayParams + +if "OMP_NUM_THREADS" in os.environ: + del os.environ["OMP_NUM_THREADS"] + + +def train_ray(path, + num_workers, + num_boost_rounds, + num_files=0, + regression=False, + use_gpu=False, + ray_params=None, + xgboost_params=None, + **kwargs): + if not os.path.exists(path): + raise ValueError(f"Path does not exist: {path}") + + if num_files: + files = sorted(glob.glob(f"{path}/**/*.parquet")) + while num_files > len(files): + files = files + files + path = files[0:num_files] + + use_device_matrix = False + if use_gpu: + try: + import cupy # noqa: F401 + use_device_matrix = True + except ImportError: + use_device_matrix = False + + if use_device_matrix: + dtrain = RayDeviceQuantileDMatrix( + path, + num_actors=num_workers, + label="labels", + ignore=["partition"], + filetype=RayFileType.PARQUET) + else: + dtrain = RayDMatrix( + path, + num_actors=num_workers, + label="labels", + ignore=["partition"], + filetype=RayFileType.PARQUET) + + config = {"tree_method": "hist" if not use_gpu else "gpu_hist"} + + if not regression: + # Classification + config.update({ + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + }) + else: + # Regression + config.update({ + "objective": "reg:squarederror", + "eval_metric": ["logloss", "rmse"], + }) + + if xgboost_params: + config.update(xgboost_params) + + start = time.time() + evals_result = {} + additional_results = {} + bst = train( + config, + dtrain, + evals_result=evals_result, + additional_results=additional_results, + num_boost_round=num_boost_rounds, + ray_params=ray_params or RayParams( + max_actor_restarts=2, + num_actors=num_workers, + cpus_per_actor=1, + gpus_per_actor=1 if not use_gpu else 1), + evals=[(dtrain, "train")], + **kwargs) + taken = time.time() - start + print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") + + bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")) + print("Final training error: {:.4f}".format( + evals_result["train"]["error"][-1])) + return bst, additional_results, taken diff --git a/release/xgboost_tests/workloads/distributed_api_test.py b/release/xgboost_tests/workloads/distributed_api_test.py new file mode 100644 index 000000000..66ae8e928 --- /dev/null +++ b/release/xgboost_tests/workloads/distributed_api_test.py @@ -0,0 +1,26 @@ +"""Distributed XGBoost API test + +This test runs unit tests on a distributed cluster. This will confirm that +XGBoost API features like custom metrics/objectives work with remote +trainables. + +Test owner: krfricke + +Acceptance criteria: Unit tests should pass (requires pytest). +""" + +import ray + +from xgboost_ray.tests.test_xgboost_api import XGBoostAPITest + + +class XGBoostDistributedAPITest(XGBoostAPITest): + def _init_ray(self): + if not ray.is_initialized(): + ray.init(address="auto") + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", f"{__file__}::XGBoostDistributedAPITest"])) diff --git a/release/xgboost_tests/workloads/ft_small_elastic.py b/release/xgboost_tests/workloads/ft_small_elastic.py new file mode 100644 index 000000000..7e81df370 --- /dev/null +++ b/release/xgboost_tests/workloads/ft_small_elastic.py @@ -0,0 +1,61 @@ +"""Fault tolerance test (small cluster, elastic training) + +In this run, two training actors will die after some time. It is expected that +in both cases xgboost_ray stops training, but continues right away with the +remaining three actors. Shortly after, the actors will be restarted and +re-integrated into the training loop. Training should finish with all four +actors. + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results. Intermediate +output should show that training continues with fewer actors when an +actor died. The test will fail if elastic training didn't work. + +Notes: This test seems to be somewhat flaky. This might be due to +race conditions in handling dead actors. This is likely a problem of +the xgboost_ray implementation and not of this test. +""" +import ray + +from xgboost_ray import RayParams + +from _train import train_ray +from ft_small_non_elastic import FailureState, FailureInjection, \ + TrackingCallback + +if __name__ == "__main__": + ray.init(address="auto") + + failure_state = FailureState.remote() + + ray_params = RayParams( + elastic_training=True, + max_failed_actors=2, + max_actor_restarts=3, + num_actors=4, + cpus_per_actor=4, + gpus_per_actor=0) + + _, additional_results, _ = train_ray( + path="/data/classification.parquet", + num_workers=4, + num_boost_rounds=100, + num_files=200, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=None, + callbacks=[ + TrackingCallback(), + FailureInjection( + id="first_fail", state=failure_state, ranks=[2], iteration=14), + FailureInjection( + id="second_fail", state=failure_state, ranks=[0], iteration=34) + ]) + + actor_1_world_size = set(additional_results["callback_returns"][1]) + assert 3 in actor_1_world_size, \ + "No training with only 3 actors observed, but this was elastic " \ + "training. Please check if additional actors died (e.g. via " \ + "node failure), run test again, and report to test owner otherwise." diff --git a/release/xgboost_tests/workloads/ft_small_non_elastic.py b/release/xgboost_tests/workloads/ft_small_non_elastic.py new file mode 100644 index 000000000..246af8a38 --- /dev/null +++ b/release/xgboost_tests/workloads/ft_small_non_elastic.py @@ -0,0 +1,113 @@ +"""Fault tolerance test (small cluster, non-elastic training) + +In this run, two training actors will die after some time. It is expected that +in both cases xgboost_ray stops training, restarts the dead actors, and +continues training with all four actors. + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results. Intermediate +output should show that training halts wenn an actor dies and continues only +when all four actors are available again. The test will fail if fault +tolerance did not work correctly. + +Notes: This test seems to be somewhat flaky. This might be due to +race conditions in handling dead actors. This is likely a problem of +the xgboost_ray implementation and not of this test. +""" +import os +import time + +import ray +from ray.services import get_node_ip_address + +from xgboost_ray import RayParams +from xgboost_ray.session import get_actor_rank, put_queue + +from xgboost.callback import TrainingCallback +from xgboost.rabit import get_world_size + +from _train import train_ray + + +@ray.remote +class FailureState: + def __init__(self): + self._failed_ids = set() + + def set_failed(self, id): + if id in self._failed_ids: + return False + self._failed_ids.add(id) + return True + + def has_failed(self, id): + return id in self._failed_ids + + +class FailureInjection(TrainingCallback): + def __init__(self, id, state, ranks, iteration, allow_ips=None): + self._id = id + self._state = state + self._ranks = ranks or [] + self._iteration = iteration + self._allow_ips = allow_ips + super(FailureInjection).__init__() + + def after_iteration(self, model, epoch, evals_log): + if self._allow_ips and get_node_ip_address() not in self._allow_ips: + return + + if epoch == self._iteration: + rank = get_actor_rank() + if rank in self._ranks: + if not ray.get(self._state.has_failed.remote(id)): + success = ray.get(self._state.set_failed.remote(id)) + if not success: + # Another rank is already about to fail + return + + pid = os.getpid() + print(f"Killing process: {pid} for actor rank {rank}") + time.sleep(1) + os.kill(pid, 9) + + +class TrackingCallback(TrainingCallback): + def after_iteration(self, model, epoch, evals_log): + put_queue(get_world_size()) + + +if __name__ == "__main__": + ray.init(address="auto") + + failure_state = FailureState.remote() + + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=4, + cpus_per_actor=4, + gpus_per_actor=0) + + _, additional_results, _ = train_ray( + path="/data/classification.parquet", + num_workers=4, + num_boost_rounds=100, + num_files=200, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=None, + callbacks=[ + TrackingCallback(), + FailureInjection( + id="first_fail", state=failure_state, ranks=[2], iteration=14), + FailureInjection( + id="second_fail", state=failure_state, ranks=[0], iteration=34) + ]) + + actor_1_world_size = set(additional_results["callback_returns"][1]) + assert len(actor_1_world_size) == 1 and 4 in actor_1_world_size, \ + "Training with fewer than 4 actors observed, but this was " \ + "non-elastic training. Please report to test owner." diff --git a/release/xgboost_tests/workloads/train_gpu.py b/release/xgboost_tests/workloads/train_gpu.py new file mode 100644 index 000000000..556455ef2 --- /dev/null +++ b/release/xgboost_tests/workloads/train_gpu.py @@ -0,0 +1,46 @@ +"""Training on a GPU cluster. + +This will train a small dataset on a distributed GPU cluster. + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results. + +Notes: The test will report output such as this: +``` +[05:14:49] WARNING: ../src/gbm/gbtree.cc:350: Loading from a raw memory buffer +on CPU only machine. Changing tree_method to hist. +[05:14:49] WARNING: ../src/learner.cc:222: No visible GPU is found, setting +`gpu_id` to -1 +``` + +This is _not_ an error. This is due to the checkpoints being loaded on the +XGBoost driver, and since the driver lives on the head node (which has no +GPU), XGBoost warns that it can't use the GPU. Training still happened using +the GPUs. +""" +import ray +from xgboost_ray import RayParams + +from _train import train_ray + +if __name__ == "__main__": + ray.init(address="auto") + + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=4, + cpus_per_actor=4, + gpus_per_actor=1) + + train_ray( + path="/data/classification.parquet", + num_workers=4, + num_boost_rounds=100, + num_files=25, + regression=False, + use_gpu=True, + ray_params=ray_params, + xgboost_params=None, + ) diff --git a/release/xgboost_tests/workloads/train_moderate.py b/release/xgboost_tests/workloads/train_moderate.py new file mode 100644 index 000000000..7963d55ab --- /dev/null +++ b/release/xgboost_tests/workloads/train_moderate.py @@ -0,0 +1,33 @@ +"""Moderate cluster training + +This training run will start 32 workers on 32 nodes (including head node). + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results. +""" +import ray +from xgboost_ray import RayParams + +from _train import train_ray + +if __name__ == "__main__": + ray.init(address="auto") + + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=32, + cpus_per_actor=4, + gpus_per_actor=0) + + train_ray( + path="/data/classification.parquet", + num_workers=32, + num_boost_rounds=100, + num_files=128, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=None, + ) diff --git a/release/xgboost_tests/workloads/train_small.py b/release/xgboost_tests/workloads/train_small.py new file mode 100644 index 000000000..edd3dd542 --- /dev/null +++ b/release/xgboost_tests/workloads/train_small.py @@ -0,0 +1,33 @@ +"""Small cluster training + +This training run will start 4 workers on 4 nodes (including head node). + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results. +""" +import ray +from xgboost_ray import RayParams + +from _train import train_ray + +if __name__ == "__main__": + ray.init(address="auto") + + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=4, + cpus_per_actor=4, + gpus_per_actor=0) + + train_ray( + path="/data/classification.parquet", + num_workers=4, + num_boost_rounds=100, + num_files=25, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=None, + ) diff --git a/release/xgboost_tests/workloads/tune_32x4.py b/release/xgboost_tests/workloads/tune_32x4.py new file mode 100644 index 000000000..53e2886fe --- /dev/null +++ b/release/xgboost_tests/workloads/tune_32x4.py @@ -0,0 +1,56 @@ +"""Moderate Ray Tune run (32 trials, 4 actors). + +This training run will start 32 Ray Tune trials, each starting 4 actors. +The cluster comprises 32 nodes. + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results, as well +as the Ray Tune results table. No trials should error. All trials should +run in parallel. +""" +import ray +from ray import tune + +from xgboost_ray import RayParams + +from _train import train_ray + + +def train_wrapper(config): + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=4, + cpus_per_actor=1, + gpus_per_actor=0) + + train_ray( + path="/data/classification.parquet", + num_workers=4, + num_boost_rounds=100, + num_files=64, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=config, + ) + + +if __name__ == "__main__": + search_space = { + "eta": tune.loguniform(1e-4, 1e-1), + "subsample": tune.uniform(0.5, 1.0), + "max_depth": tune.randint(1, 9) + } + + ray.init(address="auto") + + analysis = tune.run( + train_wrapper, + config=search_space, + num_samples=32, + resources_per_trial={ + "cpu": 1, + "extra_cpu": 3 + }) diff --git a/release/xgboost_tests/workloads/tune_4x32.py b/release/xgboost_tests/workloads/tune_4x32.py new file mode 100644 index 000000000..1c8d439fc --- /dev/null +++ b/release/xgboost_tests/workloads/tune_4x32.py @@ -0,0 +1,56 @@ +"""Moderate Ray Tune run (4 trials, 32 actors). + +This training run will start 4 Ray Tune trials, each starting 32 actors. +The cluster comprises 32 nodes. + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results, as well +as the Ray Tune results table. No trials should error. All trials should +run in parallel. +""" +import ray +from ray import tune + +from xgboost_ray import RayParams + +from _train import train_ray + + +def train_wrapper(config): + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=32, + cpus_per_actor=1, + gpus_per_actor=0) + + train_ray( + path="/data/classification.parquet", + num_workers=32, + num_boost_rounds=100, + num_files=128, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=config, + ) + + +if __name__ == "__main__": + search_space = { + "eta": tune.loguniform(1e-4, 1e-1), + "subsample": tune.uniform(0.5, 1.0), + "max_depth": tune.randint(1, 9) + } + + ray.init(address="auto") + + analysis = tune.run( + train_wrapper, + config=search_space, + num_samples=4, + resources_per_trial={ + "cpu": 1, + "extra_cpu": 31 + }) diff --git a/release/xgboost_tests/workloads/tune_small.py b/release/xgboost_tests/workloads/tune_small.py new file mode 100644 index 000000000..c1915aaaa --- /dev/null +++ b/release/xgboost_tests/workloads/tune_small.py @@ -0,0 +1,56 @@ +"""Small Ray Tune run (4 trials, 4 actors). + +This training run will start 4 Ray Tune Trials, each starting 4 actors. +The cluster comprises 4 nodes. + +Test owner: krfricke + +Acceptance criteria: Should run through and report final results, as well +as the Ray Tune results table. No trials should error. All trials should +run in parallel. +""" +import ray +from ray import tune + +from xgboost_ray import RayParams + +from _train import train_ray + + +def train_wrapper(config): + ray_params = RayParams( + elastic_training=False, + max_actor_restarts=2, + num_actors=4, + cpus_per_actor=1, + gpus_per_actor=0) + + train_ray( + path="/data/classification.parquet", + num_workers=4, + num_boost_rounds=100, + num_files=25, + regression=False, + use_gpu=False, + ray_params=ray_params, + xgboost_params=config, + ) + + +if __name__ == "__main__": + search_space = { + "eta": tune.loguniform(1e-4, 1e-1), + "subsample": tune.uniform(0.5, 1.0), + "max_depth": tune.randint(1, 9) + } + + ray.init(address="auto") + + analysis = tune.run( + train_wrapper, + config=search_space, + num_samples=4, + resources_per_trial={ + "cpu": 1, + "extra_cpu": 3 + })