diff --git a/python/ray/tune/utils/mock.py b/python/ray/tune/utils/mock.py index 1ec925519..cc92fae26 100644 --- a/python/ray/tune/utils/mock.py +++ b/python/ray/tune/utils/mock.py @@ -102,11 +102,11 @@ class FailureInjectorCallback(Callback): """Adds random failure injection to the TrialExecutor.""" def __init__(self, - config_path="/home/ubuntu/ray_bootstrap_config.yaml", + config_path="~/ray_bootstrap_config.yaml", probability=0.1, disable=False): self.probability = probability - self.config_path = config_path + self.config_path = os.path.expanduser(config_path) self.disable = disable def on_step_begin(self, **info): diff --git a/release/long_running_distributed_tests/cluster.yaml b/release/long_running_distributed_tests/cluster.yaml index a2ed252a4..152642d55 100644 --- a/release/long_running_distributed_tests/cluster.yaml +++ b/release/long_running_distributed_tests/cluster.yaml @@ -1,64 +1,36 @@ -# This file is generated by `ray project create`. - -# A unique identifier for the head node and workers of this cluster. cluster_name: long-running-distributed-tests -# The minimum number of workers nodes to launch in addition to the head -# node. This number should be >= 0. min_workers: 3 -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. min_workers defaults to 0. max_workers: 3 -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. target_utilization_fraction: 0.8 +idle_timeout_minutes: 15 -# If a node is idle for this many minutes, it will be removed. 
-idle_timeout_minutes: 5 +docker: + image: anyscale/ray-ml:latest-gpu + container_name: ray_container + pull_before_run: True -# Cloud-provider specific configuration. provider: type: aws region: us-west-2 availability_zone: us-west-2a cache_stopped_nodes: False -# How Ray will authenticate with newly launched nodes. auth: ssh_user: ubuntu -# By default Ray creates a new private keypair, but you can also use your own. -# If you do so, make sure to also set "KeyName" in the head and worker node -# configurations below. -# ssh_private_key: /path/to/your/key.pem - -# Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances head_node: InstanceType: g3.8xlarge - ImageId: ami-0888a3b5189309429 # DLAMI 7/1/19 - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 150 worker_nodes: InstanceType: g3.8xlarge - ImageId: ami-0888a3b5189309429 # DLAMI 7/1/19 - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 150 InstanceMarketOptions: MarketType: spot -setup_commands: [] +setup_commands: + - apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1 + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl # Command to start ray on the head node. You don't need to change this. 
head_start_ray_commands: diff --git a/release/long_running_distributed_tests/run.sh b/release/long_running_distributed_tests/run.sh index 386416a08..d0fa4a6c4 100755 --- a/release/long_running_distributed_tests/run.sh +++ b/release/long_running_distributed_tests/run.sh @@ -42,7 +42,7 @@ echo "commit: $commit" echo "branch: $ray_branch" echo "workload: $workload" -wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp36-cp36m-manylinux2014_x86_64.whl" +wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" conda uninstall -y terminado || true pip install -U pip diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index 0fa94cb44..2451fe4a2 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -13,7 +13,7 @@ from ray.tune import CLIReporter from ray.tune.schedulers import PopulationBasedTraining from ray.tune.utils.util import merge_dicts from ray.tune.utils.mock import FailureInjectorCallback -from ray.util.sgd.torch import TorchTrainer +from ray.util.sgd.torch import TorchTrainer, TrainingOperator from ray.util.sgd.torch.resnet import ResNet18 from ray.util.sgd.utils import BATCH_SIZE @@ -74,13 +74,17 @@ def optimizer_creator(model, config): momentum=config.get("momentum", 0.9)) -ray.init(address="auto" if not args.smoke_test else None, _log_to_driver=True) +ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) num_training_workers = 1 if args.smoke_test else 3 -TorchTrainable = TorchTrainer.as_trainable( + +CustomTrainingOperator = TrainingOperator.from_creators( model_creator=ResNet18, - data_creator=cifar_creator, optimizer_creator=optimizer_creator, - loss_creator=nn.CrossEntropyLoss, + data_creator=cifar_creator, + 
loss_creator=nn.CrossEntropyLoss) + +TorchTrainable = TorchTrainer.as_trainable( + training_operator_cls=CustomTrainingOperator, initialization_hook=initialization_hook, num_workers=num_training_workers, config={ diff --git a/release/rllib_tests/regression_tests/cluster.yaml b/release/rllib_tests/regression_tests/cluster.yaml index d0aa94e8c..6a80c80d8 100644 --- a/release/rllib_tests/regression_tests/cluster.yaml +++ b/release/rllib_tests/regression_tests/cluster.yaml @@ -3,6 +3,11 @@ cluster_name: ray-rllib-regression-tests min_workers: 0 max_workers: 0 +docker: + image: anyscale/ray-ml:latest-gpu + container_name: ray_container + pull_before_run: True + # Cloud-provider specific configuration. provider: type: aws @@ -16,24 +21,18 @@ auth: head_node: InstanceType: p3.16xlarge - ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) - - # Set primary volume to 25 GiB - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - # List of shell commands to run to set up nodes. -setup_commands: [] +setup_commands: + - apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1 + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl # Command to start ray on the head node. You don't need to change this. head_start_ray_commands: - - source activate tensorflow_p36 && ray stop - - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + - ray stop + - ulimit -n 65536; OMP_NUM_THREADS=1 ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml # Command to start ray on worker nodes. You don't need to change this. 
worker_start_ray_commands: - - source activate tensorflow_p36 && ray stop - - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 + - ray stop + - ulimit -n 65536; OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/release/rllib_tests/regression_tests/run.sh b/release/rllib_tests/regression_tests/run.sh index 4a692c2e5..abbabcb04 100755 --- a/release/rllib_tests/regression_tests/run.sh +++ b/release/rllib_tests/regression_tests/run.sh @@ -41,15 +41,18 @@ echo "commit: $commit" echo "branch: $ray_branch" echo "workload: ignored" -wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp36-cp36m-manylinux2014_x86_64.whl" +wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" conda uninstall -y terminado -source activate tensorflow_p36 && pip install -U pip -source activate tensorflow_p36 && pip install -U "$wheel" -source activate tensorflow_p36 && pip install "ray[rllib]" "ray[debug]" -source activate tensorflow_p36 && pip install torch==1.6 torchvision -source activate tensorflow_p36 && pip install boto3==1.4.8 cython==0.29.0 +pip install -U pip +pip install -U "$wheel" +pip install "ray[rllib]" "ray[debug]" +pip install terminado +pip install torch==1.6 torchvision +pip install boto3==1.4.8 cython==0.29.0 + # Run tf learning tests. -source activate tensorflow_p36 && rllib train -f compact-regression-tests-tf.yaml +rllib train -f compact-regression-tests-tf.yaml + # Run torch learning tests. 
-source activate tensorflow_p36 && rllib train -f compact-regression-tests-torch.yaml +rllib train -f compact-regression-tests-torch.yaml diff --git a/release/rllib_tests/stress_tests/cluster.yaml b/release/rllib_tests/stress_tests/cluster.yaml index e4fd2d26d..e31ecbdd9 100644 --- a/release/rllib_tests/stress_tests/cluster.yaml +++ b/release/rllib_tests/stress_tests/cluster.yaml @@ -1,105 +1,46 @@ -#################################################################### -# All nodes in this cluster will auto-terminate in 1 hour -#################################################################### - -# An unique identifier for the head node and workers of this cluster. cluster_name: ray-rllib-stress-tests -# The minimum number of workers nodes to launch in addition to the head -# node. This number should be >= 0. min_workers: 9 - -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. max_workers: 9 -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. target_utilization_fraction: 0.8 +idle_timeout_minutes: 15 -# If a node is idle for this many minutes, it will be removed. -idle_timeout_minutes: 5 +docker: + image: anyscale/ray-ml:latest-gpu + container_name: ray_container + pull_before_run: True -# Cloud-provider specific configuration. provider: type: aws region: us-west-2 availability_zone: us-west-2a cache_stopped_nodes: False -# How Ray will authenticate with newly launched nodes. auth: ssh_user: ubuntu -# By default Ray creates a new private keypair, but you can also use your own. -# If you do so, make sure to also set "KeyName" in the head and worker node -# configurations below. 
-# ssh_private_key: /path/to/your/key.pem -# Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances head_node: InstanceType: p3.16xlarge - ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) - # Set primary volume to 25 GiB - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - - # Additional options in the boto docs. - -# Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances worker_nodes: - InstanceType: m4.16xlarge - ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) + InstanceType: m5.16xlarge - - # Set primary volume to 25 GiB - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - - # Run workers on spot by default. Comment this out to use on-demand. - # InstanceMarketOptions: - # MarketType: spot - # Additional options can be found in the boto docs, e.g. - # SpotOptions: - # MaxPrice: MAX_HOURLY_PRICE - - # Additional options in the boto docs. - -# Files or directories to copy to the head and worker nodes. The format is a -# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", } -# List of shell commands to run to set up nodes. 
-setup_commands: [] +setup_commands: + - apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1 + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl -# Custom commands that will be run on the head node after common setup. -head_setup_commands: [] - -# Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] -# Command to start ray on the head node. You don't need to change this. head_start_ray_commands: - - source activate tensorflow_p36 && ray stop - - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + - ray stop + - ulimit -n 65536; OMP_NUM_THREADS=1 ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml -# Command to start ray on worker nodes. You don't need to change this. worker_start_ray_commands: - - source activate tensorflow_p36 && ray stop - - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 + - ray stop + - ulimit -n 65536; OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/release/rllib_tests/stress_tests/run.sh b/release/rllib_tests/stress_tests/run.sh index b038de9fb..704d013a7 100755 --- a/release/rllib_tests/stress_tests/run.sh +++ b/release/rllib_tests/stress_tests/run.sh @@ -42,14 +42,14 @@ echo "commit: $commit" echo "branch: $ray_branch" echo "workload: ignored" -wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp36-cp36m-manylinux2014_x86_64.whl" +wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" conda uninstall -y terminado -source activate tensorflow_p36 && pip install -U pip -source activate tensorflow_p36 && pip 
install -U "$wheel" -source activate tensorflow_p36 && pip install "ray[rllib]" "ray[debug]" -source activate tensorflow_p36 && pip install boto3==1.4.8 cython==0.29.0 -source activate tensorflow_p36 +pip install -U pip +pip install -U "$wheel" +pip install "ray[rllib]" "ray[debug]" +pip install terminado +pip install boto3==1.4.8 cython==0.29.0 python3 wait_cluster.py diff --git a/release/rllib_tests/unit_gpu_tests/cluster.yaml b/release/rllib_tests/unit_gpu_tests/cluster.yaml index 2030bb2ac..23e59b788 100644 --- a/release/rllib_tests/unit_gpu_tests/cluster.yaml +++ b/release/rllib_tests/unit_gpu_tests/cluster.yaml @@ -3,6 +3,11 @@ cluster_name: ray-rllib-regression-tests min_workers: 0 max_workers: 0 +docker: + image: anyscale/ray-ml:latest-gpu + container_name: ray_container + pull_before_run: True + # Cloud-provider specific configuration. provider: type: aws @@ -16,7 +21,6 @@ auth: head_node: InstanceType: p2.xlarge # Cheaper 1GPU K80 instance - ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) # Set primary volume to 25 GiB BlockDeviceMappings: @@ -26,14 +30,15 @@ head_node: # List of shell commands to run to set up nodes. -setup_commands: [] +setup_commands: + - apt-get install -y libglib2.0-0 libcudnn7=7.6.5.32-1+cuda10.1 curl unzip gcc python3-dev # Command to start ray on the head node. You don't need to change this. head_start_ray_commands: - - source activate tensorflow_p36 && ray stop - - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + - ray stop + - ulimit -n 65536; OMP_NUM_THREADS=1 ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml # Command to start ray on worker nodes. You don't need to change this. 
worker_start_ray_commands: - - source activate tensorflow_p36 && ray stop - - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 + - ray stop + - ulimit -n 65536; OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/release/rllib_tests/unit_gpu_tests/requirements.txt b/release/rllib_tests/unit_gpu_tests/requirements.txt index a86eb47e6..e63556bc1 100644 --- a/release/rllib_tests/unit_gpu_tests/requirements.txt +++ b/release/rllib_tests/unit_gpu_tests/requirements.txt @@ -4,3 +4,4 @@ torch==1.6+cu101 torchvision==0.7.0+cu101 boto3==1.4.8 cython==0.29.0 +pytest diff --git a/release/rllib_tests/unit_gpu_tests/run.sh b/release/rllib_tests/unit_gpu_tests/run.sh index db468e789..ff93e5164 100755 --- a/release/rllib_tests/unit_gpu_tests/run.sh +++ b/release/rllib_tests/unit_gpu_tests/run.sh @@ -42,12 +42,33 @@ echo "commit: $commit" echo "branch: $ray_branch" echo "workload: ignored" -wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp36-cp36m-manylinux2014_x86_64.whl" +wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl" conda uninstall -y terminado -source activate tensorflow_p36 && pip install -U pip -source activate tensorflow_p36 && pip install -U "$wheel" +pip install -U pip +pip install -U "$wheel" +pip install -U pytest +pip install terminado +pip install "torch>=1.6" torchvision +pip install -U tensorflow-gpu + +if [ -z "$commit" ]; then + cob="origin/$ray_branch" +else + cob="$commit" +fi + +git clone https://github.com/ray-project/ray.git ray +pushd ray || true +git checkout "$cob" + +bash ./ci/travis/install-bazel.sh +BAZEL_PATH=$HOME/bin/bazel # Run all test cases, but with a forced num_gpus=1. # TODO: (sven) chose correct dir and run over all RLlib tests and example scripts! 
-source activate tensorflow_p36 && export RAY_FORCE_NUM_GPUS=1 && cd ~ && python -m pytest test_attention_net_learning.py +export RLLIB_NUM_GPUS=1 && $BAZEL_PATH test --config="ci $(./scripts/bazel_export_options)" --build_tests_only --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... +export RLLIB_NUM_GPUS=1 && $BAZEL_PATH test --config="ci $(./scripts/bazel_export_options)" --build_tests_only --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... +export RLLIB_NUM_GPUS=1 && $BAZEL_PATH test --config="ci $(./scripts/bazel_export_options)" --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... +export RLLIB_NUM_GPUS=1 && $BAZEL_PATH test --config="ci $(./scripts/bazel_export_options)" --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... +popd || true