From fc630813cd804fb36992cbb60f68aa883a603331 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Fri, 5 Feb 2021 21:38:31 +0100 Subject: [PATCH] Update XGBoost release test configs --- .../xgboost_tests/cluster_cpu_moderate.yaml | 27 ++++++---- release/xgboost_tests/cluster_cpu_small.yaml | 27 ++++++---- release/xgboost_tests/cluster_gpu_small.yaml | 35 +++++++++---- .../oss_cluster_cpu_moderate.yaml | 44 ++++++++++++++++ .../xgboost_tests/oss_cluster_cpu_small.yaml | 44 ++++++++++++++++ .../xgboost_tests/oss_cluster_gpu_small.yaml | 50 +++++++++++++++++++ 6 files changed, 199 insertions(+), 28 deletions(-) create mode 100644 release/xgboost_tests/oss_cluster_cpu_moderate.yaml create mode 100644 release/xgboost_tests/oss_cluster_cpu_small.yaml create mode 100644 release/xgboost_tests/oss_cluster_gpu_small.yaml diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml index a65c49336..5792b21dd 100644 --- a/release/xgboost_tests/cluster_cpu_moderate.yaml +++ b/release/xgboost_tests/cluster_cpu_moderate.yaml @@ -1,12 +1,13 @@ cluster_name: ray-xgboost-release-cpu-moderate -min_workers: 31 -max_workers: 31 +max_workers: 32 + +upscaling_speed: 32 idle_timeout_minutes: 15 docker: - image: anyscale/ray:latest + image: anyscale/ray-ml:latest container_name: ray_container pull_before_run: true @@ -16,20 +17,28 @@ provider: availability_zone: us-west-2a cache_stopped_nodes: false +available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m5.xlarge + resources: {"CPU": 4} + min_workers: 31 + max_workers: 31 + auth: ssh_user: ubuntu -head_node: - # 64 CPUs - InstanceType: m5.xlarge +head_node_type: cpu_4_ondemand +worker_default_node_type: cpu_4_ondemand -worker_nodes: - # 64 CPUs - InstanceType: m5.xlarge +file_mounts: { + "~/release-automation-xgboost_tests": "." 
+} setup_commands: - pip install pytest xgboost_ray - sudo mkdir -p /data || true - sudo chown ray:1000 /data || true - rm -rf /data/classification.parquet || true + - cp -R /tmp/ray_tmp_mount/release-automation-xgboost_tests ~/release-automation-xgboost_tests || echo "Copy failed" - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml index 4b97439b9..7959eabee 100644 --- a/release/xgboost_tests/cluster_cpu_small.yaml +++ b/release/xgboost_tests/cluster_cpu_small.yaml @@ -1,12 +1,13 @@ cluster_name: ray-xgboost-release-cpu-small -min_workers: 3 -max_workers: 3 +max_workers: 4 + +upscaling_speed: 32 idle_timeout_minutes: 15 docker: - image: anyscale/ray:latest + image: anyscale/ray-ml:latest container_name: ray_container pull_before_run: true @@ -16,20 +17,28 @@ provider: availability_zone: us-west-2a cache_stopped_nodes: false +available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m5.xlarge + resources: {"CPU": 4} + min_workers: 3 + max_workers: 3 + auth: ssh_user: ubuntu -head_node: - # 64 CPUs - InstanceType: m5.xlarge +head_node_type: cpu_4_ondemand +worker_default_node_type: cpu_4_ondemand -worker_nodes: - # 64 CPUs - InstanceType: m5.xlarge +file_mounts: { + "~/release-automation-xgboost_tests": "." 
+} setup_commands: - pip install pytest xgboost_ray - sudo mkdir -p /data || true - sudo chown ray:1000 /data || true - rm -rf /data/classification.parquet || true + - cp -R /tmp/ray_tmp_mount/release-automation-xgboost_tests ~/release-automation-xgboost_tests || echo "Copy failed" - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml index 535d28490..94c39d5ca 100644 --- a/release/xgboost_tests/cluster_gpu_small.yaml +++ b/release/xgboost_tests/cluster_gpu_small.yaml @@ -1,12 +1,13 @@ -cluster_name: ray-xgboost-release-gpu-small +cluster_name: ray-xgboost-release-gpu-small -min_workers: 4 -max_workers: 4 +max_workers: 5 + +upscaling_speed: 32 idle_timeout_minutes: 15 docker: - image: anyscale/ray:latest-gpu + image: anyscale/ray-ml:latest container_name: ray_container pull_before_run: true @@ -16,20 +17,34 @@ provider: availability_zone: us-west-2a cache_stopped_nodes: false +available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m5.xlarge + resources: {"CPU": 4} + min_workers: 0 + max_workers: 0 + gpu_1_ondemand: + node_config: + InstanceType: p2.xlarge + resources: {"CPU": 4, "GPU": 1} + min_workers: 4 + max_workers: 4 + auth: ssh_user: ubuntu -head_node: - # 64 CPUs - InstanceType: m5.xlarge +head_node_type: cpu_4_ondemand +worker_default_node_type: gpu_1_ondemand -worker_nodes: - # 64 CPUs - InstanceType: p2.xlarge +file_mounts: { + "~/release-automation-xgboost_tests": "." 
+} setup_commands: - pip install pytest xgboost_ray - sudo mkdir -p /data || true - sudo chown ray:1000 /data || true - rm -rf /data/classification.parquet || true + - cp -R /tmp/ray_tmp_mount/release-automation-xgboost_tests ~/release-automation-xgboost_tests || echo "Copy failed" - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/oss_cluster_cpu_moderate.yaml b/release/xgboost_tests/oss_cluster_cpu_moderate.yaml new file mode 100644 index 000000000..6d7e80aa0 --- /dev/null +++ b/release/xgboost_tests/oss_cluster_cpu_moderate.yaml @@ -0,0 +1,44 @@ +cluster_name: ray-xgboost-release-cpu-moderate + +max_workers: 32 + +upscaling_speed: 32 + +idle_timeout_minutes: 15 + +docker: + image: rayproject/ray-ml:1.2.0 + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m5.xlarge + resources: {"CPU": 4} + min_workers: 31 + max_workers: 31 + +auth: + ssh_user: ubuntu + +head_node_type: cpu_4_ondemand +worker_default_node_type: cpu_4_ondemand + +file_mounts: { + "~/release-automation-xgboost_tests": "." 
+} + +setup_commands: + - pip install pytest xgboost_ray + - sudo mkdir -p /data || true + - sudo chown ray:1000 /data || true + - rm -rf /data/classification.parquet || true + - cp -R /tmp/ray_tmp_mount/release-automation-xgboost_tests ~/release-automation-xgboost_tests || echo "Copy failed" + - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/oss_cluster_cpu_small.yaml b/release/xgboost_tests/oss_cluster_cpu_small.yaml new file mode 100644 index 000000000..b2d82a4df --- /dev/null +++ b/release/xgboost_tests/oss_cluster_cpu_small.yaml @@ -0,0 +1,44 @@ +cluster_name: ray-xgboost-release-cpu-small + +max_workers: 4 + +upscaling_speed: 32 + +idle_timeout_minutes: 15 + +docker: + image: rayproject/ray-ml:1.2.0 + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m5.xlarge + resources: {"CPU": 4} + min_workers: 3 + max_workers: 3 + +auth: + ssh_user: ubuntu + +head_node_type: cpu_4_ondemand +worker_default_node_type: cpu_4_ondemand + +file_mounts: { + "~/release-automation-xgboost_tests": "." 
+} + +setup_commands: + - pip install pytest xgboost_ray + - sudo mkdir -p /data || true + - sudo chown ray:1000 /data || true + - rm -rf /data/classification.parquet || true + - cp -R /tmp/ray_tmp_mount/release-automation-xgboost_tests ~/release-automation-xgboost_tests || echo "Copy failed" + - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 diff --git a/release/xgboost_tests/oss_cluster_gpu_small.yaml b/release/xgboost_tests/oss_cluster_gpu_small.yaml new file mode 100644 index 000000000..1bc6098bb --- /dev/null +++ b/release/xgboost_tests/oss_cluster_gpu_small.yaml @@ -0,0 +1,50 @@ +cluster_name: ray-xgboost-release-gpu-small + +max_workers: 5 + +upscaling_speed: 32 + +idle_timeout_minutes: 15 + +docker: + image: rayproject/ray-ml:1.2.0 + container_name: ray_container + pull_before_run: true + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: false + +available_node_types: + cpu_4_ondemand: + node_config: + InstanceType: m5.xlarge + resources: {"CPU": 4} + min_workers: 0 + max_workers: 0 + gpu_1_ondemand: + node_config: + InstanceType: p2.xlarge + resources: {"CPU": 4, "GPU": 1} + min_workers: 4 + max_workers: 4 + +auth: + ssh_user: ubuntu + +head_node_type: cpu_4_ondemand +worker_default_node_type: gpu_1_ondemand + +file_mounts: { + "~/release-automation-xgboost_tests": "." +} + +setup_commands: + - pip install pytest xgboost_ray + - sudo mkdir -p /data || true + - sudo chown ray:1000 /data || true + - rm -rf /data/classification.parquet || true + - cp -R /tmp/ray_tmp_mount/release-automation-xgboost_tests ~/release-automation-xgboost_tests || echo "Copy failed" + - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2