From acbd12eabfad258d307bbdd6e820314f8ce1104c Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Mon, 19 Oct 2020 10:53:30 -0700 Subject: [PATCH] [Docker] Set Docker as the Default (#11416) --- BUILD.bazel | 8 +- python/ray/autoscaler/_private/providers.py | 39 +-- python/ray/autoscaler/aws/defaults.yaml | 139 ++++++++ python/ray/autoscaler/aws/example-full.yaml | 23 +- .../autoscaler/aws/example-gpu-docker.yaml | 12 +- python/ray/autoscaler/aws/example-ml.yaml | 6 +- python/ray/autoscaler/azure/defaults.yaml | 136 ++++++++ python/ray/autoscaler/azure/example-full.yaml | 23 +- .../autoscaler/azure/example-gpu-docker.yaml | 16 +- python/ray/autoscaler/azure/example-gpu.yaml | 14 +- python/ray/autoscaler/gcp/defaults.yaml | 165 +++++++++ python/ray/autoscaler/gcp/example-full.yaml | 37 +-- .../autoscaler/gcp/example-gpu-docker.yaml | 12 +- .../ray/autoscaler/kubernetes/defaults.yaml | 301 +++++++++++++++++ python/ray/autoscaler/local/defaults.yaml | 92 ++++++ python/ray/autoscaler/local/example-full.yaml | 12 +- python/ray/autoscaler/ray-schema.json | 2 +- python/ray/autoscaler/staroid/defaults.yaml | 312 ++++++++++++++++++ .../ray/autoscaler/staroid/example-full.yaml | 2 +- python/setup.py | 11 +- 20 files changed, 1247 insertions(+), 115 deletions(-) create mode 100644 python/ray/autoscaler/aws/defaults.yaml create mode 100644 python/ray/autoscaler/azure/defaults.yaml create mode 100644 python/ray/autoscaler/gcp/defaults.yaml create mode 100644 python/ray/autoscaler/kubernetes/defaults.yaml create mode 100644 python/ray/autoscaler/local/defaults.yaml create mode 100644 python/ray/autoscaler/staroid/defaults.yaml diff --git a/BUILD.bazel b/BUILD.bazel index e2739bff7..883a31c3b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1777,10 +1777,10 @@ filegroup( "python/ray/*.py", "python/ray/autoscaler/*.py", "python/ray/autoscaler/_private/*.py", - "python/ray/autoscaler/aws/example-full.yaml", - "python/ray/autoscaler/azure/example-full.yaml", - 
"python/ray/autoscaler/gcp/example-full.yaml", - "python/ray/autoscaler/local/example-full.yaml", + "python/ray/autoscaler/aws/defaults.yaml", + "python/ray/autoscaler/azure/defaults.yaml", + "python/ray/autoscaler/gcp/defaults.yaml", + "python/ray/autoscaler/local/defaults.yaml", "python/ray/cloudpickle/*.py", "python/ray/core/__init__.py", "python/ray/core/generated/__init__.py", diff --git a/python/ray/autoscaler/_private/providers.py b/python/ray/autoscaler/_private/providers.py index 329751916..c9ffccaaa 100644 --- a/python/ray/autoscaler/_private/providers.py +++ b/python/ray/autoscaler/_private/providers.py @@ -50,38 +50,35 @@ def _import_staroid(provider_config): return StaroidNodeProvider -def _load_local_example_config(): +def _load_local_defaults_config(): import ray.autoscaler.local as ray_local - return os.path.join( - os.path.dirname(ray_local.__file__), "example-full.yaml") + return os.path.join(os.path.dirname(ray_local.__file__), "defaults.yaml") -def _load_kubernetes_example_config(): +def _load_kubernetes_defaults_config(): import ray.autoscaler.kubernetes as ray_kubernetes return os.path.join( - os.path.dirname(ray_kubernetes.__file__), "example-full.yaml") + os.path.dirname(ray_kubernetes.__file__), "defaults.yaml") -def _load_aws_example_config(): +def _load_aws_defaults_config(): import ray.autoscaler.aws as ray_aws - return os.path.join(os.path.dirname(ray_aws.__file__), "example-full.yaml") + return os.path.join(os.path.dirname(ray_aws.__file__), "defaults.yaml") -def _load_gcp_example_config(): +def _load_gcp_defaults_config(): import ray.autoscaler.gcp as ray_gcp - return os.path.join(os.path.dirname(ray_gcp.__file__), "example-full.yaml") + return os.path.join(os.path.dirname(ray_gcp.__file__), "defaults.yaml") -def _load_azure_example_config(): +def _load_azure_defaults_config(): import ray.autoscaler.azure as ray_azure - return os.path.join( - os.path.dirname(ray_azure.__file__), "example-full.yaml") + return 
os.path.join(os.path.dirname(ray_azure.__file__), "defaults.yaml") -def _load_staroid_example_config(): +def _load_staroid_defaults_config(): import ray.autoscaler.staroid as ray_staroid - return os.path.join( - os.path.dirname(ray_staroid.__file__), "example-full.yaml") + return os.path.join(os.path.dirname(ray_staroid.__file__), "defaults.yaml") def _import_external(provider_config): @@ -110,12 +107,12 @@ _PROVIDER_PRETTY_NAMES = { } _DEFAULT_CONFIGS = { - "local": _load_local_example_config, - "aws": _load_aws_example_config, - "gcp": _load_gcp_example_config, - "azure": _load_azure_example_config, - "staroid": _load_staroid_example_config, - "kubernetes": _load_kubernetes_example_config, + "local": _load_local_defaults_config, + "aws": _load_aws_defaults_config, + "gcp": _load_gcp_defaults_config, + "azure": _load_azure_defaults_config, + "staroid": _load_staroid_defaults_config, + "kubernetes": _load_kubernetes_defaults_config, } diff --git a/python/ray/autoscaler/aws/defaults.yaml b/python/ray/autoscaler/aws/defaults.yaml new file mode 100644 index 000000000..2d776b7ba --- /dev/null +++ b/python/ray/autoscaler/aws/defaults.yaml @@ -0,0 +1,139 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers.
+autoscaling_mode: default + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: {} + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# The max value allowed is 1.0, which is the most conservative setting. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + # Availability zone(s), comma-separated, that nodes may be launched in. + # Nodes are currently spread between zones by a round-robin approach, + # however this implementation detail should not be relied upon. + availability_zone: us-west-2a,us-west-2b + # Whether to allow node reuse. If set to False, nodes will be terminated + # instead of stopped. + cache_stopped_nodes: True # If not present, the default is True. + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Provider-specific config for the head node, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +head_node: + InstanceType: m5.large + ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 + + # You can provision additional disk space with a conf as follows + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 100 + + # Additional options in the boto docs. + +# Provider-specific config for worker nodes, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. +# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +worker_nodes: + InstanceType: m5.large + ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 + + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + # Additional options can be found in the boto docs, e.g. + # SpotOptions: + # MaxPrice: MAX_HOURLY_PRICE + + # Additional options in the boto docs. + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! 
+cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is set up. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: + # Note: if you're developing Ray, you probably want to create an AMI that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile). + - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # Consider uncommenting these if you also want to run apt-get commands during setup + # - sudo pkill -9 apt-get || true + # - sudo pkill -9 dpkg || true + # - sudo dpkg --configure -a + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install 'boto3>=1.4.8' # 1.4.8 adds InstanceMarketOptions + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/aws/example-full.yaml b/python/ray/autoscaler/aws/example-full.yaml index 189cc144f..a6628f6a0 100644 --- a/python/ray/autoscaler/aws/example-full.yaml +++ b/python/ray/autoscaler/aws/example-full.yaml @@ -23,18 +23,18 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "" # e.g., rayproject/ray:0.8.7 - container_name: "" # e.g. ray_docker + image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup + container_name: "ray_container" # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # if no cached version is present. pull_before_run: True run_options: [] # Extra options to pass into "docker run" # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" # Allow Ray to automatically detect GPUs - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest-cpu" # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource @@ -123,20 +123,15 @@ file_mounts_sync_continuously: False initialization_commands: [] # List of shell commands to run to set up nodes. -setup_commands: - # Note: if you're developing Ray, you probably want to create an AMI that +setup_commands: [] + # Note: if you're developing Ray, you probably want to create a Docker image that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). 
- - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl - # Consider uncommenting these if you also want to run apt-get commands during setup - # - sudo pkill -9 apt-get || true - # - sudo pkill -9 dpkg || true - # - sudo dpkg --configure -a + # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest) + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. -head_setup_commands: - - pip install 'boto3>=1.4.8' # 1.4.8 adds InstanceMarketOptions +head_setup_commands: [] # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/aws/example-gpu-docker.yaml b/python/ray/autoscaler/aws/example-gpu-docker.yaml index 07a48d234..67b5e5b22 100644 --- a/python/ray/autoscaler/aws/example-gpu-docker.yaml +++ b/python/ray/autoscaler/aws/example-gpu-docker.yaml @@ -23,13 +23,13 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "rayproject/ray:0.8.7-gpu" - container_name: "ray-nvidia-docker-test" # e.g. ray_docker + image: "rayproject/ray:latest-gpu" + container_name: "ray_nvidia_docker" # e.g. ray_docker # # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest" # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and @@ -99,8 +99,8 @@ file_mounts: { } # List of shell commands to run to set up nodes. 
-# NOTE: rayproject/ray:0.8.7 has ray 0.8.7 bundled -# setup_commands: +# NOTE: rayproject/ray:latest has ray latest bundled +setup_commands: [] # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl diff --git a/python/ray/autoscaler/aws/example-ml.yaml b/python/ray/autoscaler/aws/example-ml.yaml index 6353f3c84..7802b9808 100644 --- a/python/ray/autoscaler/aws/example-ml.yaml +++ b/python/ray/autoscaler/aws/example-ml.yaml @@ -28,7 +28,7 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "" # e.g., rayproject/ray:0.8.7 + image: "" # e.g., rayproject/ray:latest container_name: "" # e.g. ray_docker # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # if no cached version is present. @@ -36,9 +36,9 @@ docker: run_options: [] # Extra options to pass into "docker run" # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest" # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and diff --git a/python/ray/autoscaler/azure/defaults.yaml b/python/ray/autoscaler/azure/defaults.yaml new file mode 100644 index 000000000..3aa9174cc --- /dev/null +++ b/python/ray/autoscaler/azure/defaults.yaml @@ -0,0 +1,136 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head +# node.
This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: {} + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: azure + # https://azure.microsoft.com/en-us/global-infrastructure/locations + location: westus2 + resource_group: ray-cluster + # set subscription id otherwise the default from az cli will be used + # subscription_id: 00000000-0000-0000-0000-000000000000 + +# How Ray will authenticate with newly launched nodes. 
+auth: + ssh_user: ubuntu + # you must specify paths to matching private and public key pair files + # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair + ssh_private_key: ~/.ssh/id_rsa + ssh_public_key: ~/.ssh/id_rsa.pub + +# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file +# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines +# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs +# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below + +# Provider-specific config for the head node, e.g. instance type. +head_node: + azure_arm_parameters: + vmSize: Standard_D2s_v3 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804-gen2 + imageVersion: 20.02.01 + +# Provider-specific config for worker nodes, e.g. instance type. +worker_nodes: + azure_arm_parameters: + vmSize: Standard_D2s_v3 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804-gen2 + imageVersion: 20.02.01 + # optionally set priority to use Spot instances + priority: Spot + # set a maximum price for spot instances if desired + # billingProfile: + # maxPrice: -1 + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. 
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is set up. +initialization_commands: + # get rid of annoying Ubuntu message + - touch ~/.sudo_as_admin_successful + +# List of shell commands to run to set up nodes. +setup_commands: + # Note: if you're developing Ray, you probably want to create a VM image that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile). + # - echo 'conda activate py37_pytorch' >> ~/.bashrc + - echo 'conda activate py37_tensorflow' >> ~/.bashrc + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + # Consider uncommenting these if you also want to run apt-get commands during setup + # - sudo pkill -9 apt-get || true + # - sudo pkill -9 dpkg || true + # - sudo dpkg --configure -a + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0 + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index 5f8130178..ce07f13da 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -23,17 +23,19 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "" # e.g., rayproject/ray:0.8.7 - container_name: "" # e.g. ray_docker + image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup + container_name: "ray_container" # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # if no cached version is present. pull_before_run: True run_options: [] # Extra options to pass into "docker run" # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" + # Allow Ray to automatically detect GPUs - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest-cpu" + # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and @@ -117,17 +119,12 @@ initialization_commands: - touch ~/.sudo_as_admin_successful # List of shell commands to run to set up nodes. -setup_commands: - # Note: if you're developing Ray, you probably want to create an AMI that +setup_commands: [] + # Note: if you're developing Ray, you probably want to create a Docker image that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). 
- # - echo 'conda activate py37_pytorch' >> ~/.bashrc - - echo 'conda activate py37_tensorflow' >> ~/.bashrc - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl - # Consider uncommenting these if you also want to run apt-get commands during setup - # - sudo pkill -9 apt-get || true - # - sudo pkill -9 dpkg || true - # - sudo dpkg --configure -a + # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest) + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. head_setup_commands: diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 8825f8525..d70b457c2 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -23,13 +23,13 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "rayproject/ray:0.8.7-gpu" - container_name: "ray-nvidia-docker-test" # e.g. ray_docker + image: "rayproject/ray:latest-gpu" + container_name: "ray_nvidia_docker" # e.g. ray_docker # # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest" # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and @@ -55,13 +55,13 @@ auth: ssh_public_key: ~/.ssh/id_rsa.pub # Provider-specific config for the head node, e.g. instance type. 
By default -# Ray will auto-configure unspecified fields using example-full.yaml +# Ray will auto-configure unspecified fields using defaults.yaml head_node: azure_arm_parameters: vmSize: Standard_NC6s_v3 # Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields using example-full.yaml +# Ray will auto-configure unspecified fields using defaults.yaml worker_nodes: azure_arm_parameters: vmSize: Standard_NC6s_v3 @@ -74,8 +74,8 @@ file_mounts: { } # List of shell commands to run to set up nodes. -# NOTE: rayproject/ray:0.8.7 has ray 0.8.7 bundled -# setup_commands: +# NOTE: rayproject/ray:latest has ray latest bundled +setup_commands: [] # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. diff --git a/python/ray/autoscaler/azure/example-gpu.yaml b/python/ray/autoscaler/azure/example-gpu.yaml index 0f204e3a6..77cccdda7 100644 --- a/python/ray/autoscaler/azure/example-gpu.yaml +++ b/python/ray/autoscaler/azure/example-gpu.yaml @@ -23,17 +23,17 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "" # e.g., rayproject/ray:0.8.7-gpu - container_name: "" # e.g. ray_docker + image: "rayproject/ray:latest-gpu" + container_name: "ray_docker" # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # if no cached version is present. - pull_before_run: True + pull_before_run: False run_options: [] # Extra options to pass into "docker run" # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest" # The autoscaler will scale up the cluster to this target fraction of resource # usage. 
For example, if a cluster of 10 nodes is 100% busy and @@ -63,7 +63,7 @@ auth: ssh_public_key: ~/.ssh/id_rsa.pub # Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields using example-full.yaml +# Ray will auto-configure unspecified fields using defaults.yaml head_node: azure_arm_parameters: vmSize: Standard_NC6 @@ -74,7 +74,7 @@ head_node: imageVersion: 20.02.01 # Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields using example-full.yaml +# Ray will auto-configure unspecified fields using defaults.yaml worker_nodes: azure_arm_parameters: vmSize: Standard_NC6 diff --git a/python/ray/autoscaler/gcp/defaults.yaml b/python/ray/autoscaler/gcp/defaults.yaml new file mode 100644 index 000000000..0007939c8 --- /dev/null +++ b/python/ray/autoscaler/gcp/defaults.yaml @@ -0,0 +1,165 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: {} + + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage.
For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: gcp + region: us-west1 + availability_zone: us-west1-a + project_id: null # Globally unique project id + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. This requires that you have added the key into the +# project-wide metadata. +# ssh_private_key: /path/to/your/key.pem + +# Provider-specific config for the head node, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as subnets and ssh-keys. +# For more documentation on available fields, see: +# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert +head_node: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/tf-1-13-cpu + + # Additional options can be found in the compute docs at + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + + # If the network interface is specified as below in both head and worker + # nodes, the manual network config is used. Otherwise an existing subnet is + # used. To use a shared subnet, ask the subnet owner to grant permission + # for 'compute.subnetworks.use' to the ray autoscaler account...
+ # networkInterfaces: + # - kind: compute#networkInterface + # subnetwork: path/to/subnet + # aliasIpRanges: [] + +worker_nodes: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/tf-1-13-cpu + # Run workers on preemptible instances by default. + # Comment this out to use on-demand. + scheduling: + - preemptible: true + + # Additional options can be found in the compute docs at + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is set up. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: + # Note: if you're developing Ray, you probably want to create a custom image that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile).
+ # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc + + # Install MiniConda. + - >- + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/anaconda3.sh + || true + && bash ~/anaconda3.sh -b -p ~/anaconda3 || true + && rm ~/anaconda3.sh + && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile + + # Install ray + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install google-api-python-client==1.7.8 + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - >- + ulimit -n 65536; + ray start + --head + --port=6379 + --object-manager-port=8076 + --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - >- + ulimit -n 65536; + ray start + --address=$RAY_HEAD_IP:6379 + --object-manager-port=8076 diff --git a/python/ray/autoscaler/gcp/example-full.yaml b/python/ray/autoscaler/gcp/example-full.yaml index 9e07111b1..61314f318 100644 --- a/python/ray/autoscaler/gcp/example-full.yaml +++ b/python/ray/autoscaler/gcp/example-full.yaml @@ -23,12 +23,19 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "" # e.g., rayproject/ray:0.8.7 - container_name: "" # e.g. ray_docker - # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image - # if no cached version is present. 
- pull_before_run: True - run_options: [] # Extra options to pass into "docker run" + image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup + container_name: "ray_container" + # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image + # if no cached version is present. + pull_before_run: True + run_options: [] # Extra options to pass into "docker run" + + # Example of running a GPU head with CPU workers + # head_image: "rayproject/ray:latest-gpu" + # Allow Ray to automatically detect GPUs + + # worker_image: "rayproject/ray:latest-cpu" + # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource @@ -125,22 +132,12 @@ file_mounts_sync_continuously: False initialization_commands: [] # List of shell commands to run to set up nodes. -setup_commands: - # Note: if you're developing Ray, you probably want to create an AMI that +setup_commands: [] + # Note: if you're developing Ray, you probably want to create a Docker image that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). - # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - # Install MiniConda. 
- - >- - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/anaconda3.sh - || true - && bash ~/anaconda3.sh -b -p ~/anaconda3 || true - && rm ~/anaconda3.sh - && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile - - # Install ray - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest) + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. diff --git a/python/ray/autoscaler/gcp/example-gpu-docker.yaml b/python/ray/autoscaler/gcp/example-gpu-docker.yaml index 6ad3d916a..fbac3f11e 100644 --- a/python/ray/autoscaler/gcp/example-gpu-docker.yaml +++ b/python/ray/autoscaler/gcp/example-gpu-docker.yaml @@ -23,14 +23,14 @@ autoscaling_mode: default # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: - image: "rayproject/ray:0.8.7-gpu" - container_name: "ray-nvidia-docker-test" # e.g. ray_docker + image: "rayproject/ray:latest-gpu" + container_name: "ray_nvidia_docker" # e.g. ray_docker # # Example of running a GPU head with CPU workers - # head_image: "rayproject/ray:0.8.7-gpu" + # head_image: "rayproject/ray:latest-gpu" - # worker_image: "rayproject/ray:0.8.7" + # worker_image: "rayproject/ray:latest" # The autoscaler will scale up the cluster to this target fraction of resource @@ -129,8 +129,8 @@ initialization_commands: done" # List of shell commands to run to set up nodes. 
-# NOTE: rayproject/ray:0.8.7 has ray 0.8.7 bundled -# setup_commands: +# NOTE: rayproject/ray:latest has ray latest bundled +setup_commands: [] # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl diff --git a/python/ray/autoscaler/kubernetes/defaults.yaml b/python/ray/autoscaler/kubernetes/defaults.yaml new file mode 100644 index 000000000..ba91ee9ae --- /dev/null +++ b/python/ray/autoscaler/kubernetes/defaults.yaml @@ -0,0 +1,301 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. 
+idle_timeout_minutes: 5 + +# Kubernetes resources that need to be configured for the autoscaler to be +# able to manage the Ray cluster. If any of the provided resources don't +# exist, the autoscaler will attempt to create them. If this fails, you may +# not have the required permissions and will have to request them to be +# created by your cluster administrator. +provider: + type: kubernetes + + # Exposing external IP addresses for ray pods isn't currently supported. + use_internal_ips: true + + # Namespace to use for all resources created. + namespace: ray + + # ServiceAccount created by the autoscaler for the head node pod that it + # runs in. If this field isn't provided, the head pod config below must + # contain a user-created service account with the proper permissions. + autoscaler_service_account: + apiVersion: v1 + kind: ServiceAccount + metadata: + name: autoscaler + + # Role created by the autoscaler for the head node pod that it runs in. + # If this field isn't provided, the role referenced in + # autoscaler_role_binding must exist and have at least these permissions. + autoscaler_role: + kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: autoscaler + rules: + - apiGroups: [""] + resources: ["pods", "pods/status", "pods/exec"] + verbs: ["get", "watch", "list", "create", "delete", "patch"] + + # RoleBinding created by the autoscaler for the head node pod that it runs + # in. If this field isn't provided, the head pod config below must contain + # a user-created service account with the proper permissions. + autoscaler_role_binding: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: autoscaler + subjects: + - kind: ServiceAccount + name: autoscaler + roleRef: + kind: Role + name: autoscaler + apiGroup: rbac.authorization.k8s.io + + services: + # Service that maps to the head node of the Ray cluster. 
+ - apiVersion: v1 + kind: Service + metadata: + # NOTE: If you're running multiple Ray clusters with services + # on one Kubernetes cluster, they must have unique service + # names. + name: ray-head + spec: + # This selector must match the head node pod's selector below. + selector: + component: ray-head + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + + # Service that maps to the worker nodes of the Ray cluster. + - apiVersion: v1 + kind: Service + metadata: + # NOTE: If you're running multiple Ray clusters with services + # on one Kubernetes cluster, they must have unique service + # names. + name: ray-workers + spec: + # This selector must match the worker node pods' selector below. + selector: + component: ray-worker + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + spec: + # Change this if you altered the autoscaler_service_account above + # or want to provide your own. + serviceAccountName: autoscaler + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. 
+ volumes: + - name: dshm + emptyDir: + medium: Memory + + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) + image: rayproject/ray + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 1000m + memory: 512Mi + limits: + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. 
+ generateName: ray-worker- + + # Must match the worker node service selector above if a worker node + # service is required. + labels: + component: ray-worker + spec: + serviceAccountName: default + + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumes: + - name: dshm + emptyDir: + medium: Memory + + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + image: rayproject/ray + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 1000m + memory: 512Mi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 
+file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/local/defaults.yaml b/python/ray/autoscaler/local/defaults.yaml new file mode 100644 index 000000000..d569d21a0 --- /dev/null +++ b/python/ray/autoscaler/local/defaults.yaml @@ -0,0 +1,92 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: default + +## NOTE: Typically for local clusters, min_workers == initial_workers == max_workers == len(worker_ips). + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +# Typically, min_workers == initial_workers == max_workers == len(worker_ips). +min_workers: 0 +# The initial number of worker nodes to launch in addition to the head node. +# Typically, min_workers == initial_workers == max_workers == len(worker_ips). +initial_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head node. +# This takes precedence over min_workers. +# Typically, min_workers == initial_workers == max_workers == len(worker_ips). +max_workers: 0 + +# Autoscaling parameters. +# Ignore this if min_workers == initial_workers == max_workers. +autoscaling_mode: default +target_utilization_fraction: 0.8 +idle_timeout_minutes: 5 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. Assumes Docker is installed. +docker: {} + +# Local specific configuration. +provider: + type: local + head_ip: YOUR_HEAD_NODE_HOSTNAME + worker_ips: [WORKER_NODE_1_HOSTNAME, WORKER_NODE_2_HOSTNAME, ... ] + # Optional when running automatic cluster management on prem. If you use a coordinator server, + # then you can launch multiple autoscaling clusters on the same set of machines, and the coordinator + # will assign individual nodes to clusters as needed.
+ # coordinator_address: "<host>:<port>" + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: YOUR_USERNAME + # Optional if an ssh private key is necessary to ssh to the cluster. + # ssh_private_key: ~/.ssh/id_rsa + +# Leave this empty. +head_node: {} + +# Leave this empty. +worker_nodes: {} + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up each node. +setup_commands: + - pip install -U ray + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -c unlimited && ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands: + - ray stop + - ray start --address=$RAY_HEAD_IP:6379 diff --git a/python/ray/autoscaler/local/example-full.yaml b/python/ray/autoscaler/local/example-full.yaml index 037426e08..b09a1aa36 100644 --- a/python/ray/autoscaler/local/example-full.yaml +++ b/python/ray/autoscaler/local/example-full.yaml @@ -26,8 +26,8 @@ idle_timeout_minutes: 5 # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. Assumes Docker is installed. docker: - image: "" # e.g., rayproject/ray:0.8.7 - container_name: "" # e.g. ray_docker + image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup + container_name: "ray_container" # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # if no cached version is present. pull_before_run: True @@ -78,8 +78,12 @@ file_mounts_sync_continuously: False initialization_commands: [] # List of shell commands to run to set up each nodes. -setup_commands: - - pip install -U ray +setup_commands: [] + # Note: if you're developing Ray, you probably want to create a Docker image that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile). + # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest) + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. 
head_setup_commands: [] diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 9c602332d..8f22c180e 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -188,7 +188,7 @@ "image": { "type": "string", "description": "the docker image name", - "default": "rayproject/ray:0.8.7" + "default": "rayproject/ray:latest" }, "container_name": { "type": "string", diff --git a/python/ray/autoscaler/staroid/defaults.yaml b/python/ray/autoscaler/staroid/defaults.yaml new file mode 100644 index 000000000..310ebd612 --- /dev/null +++ b/python/ray/autoscaler/staroid/defaults.yaml @@ -0,0 +1,312 @@ +# A unique identifier for the head node and workers of this cluster. +# A namespace will be automatically created for each cluster_name in SKE. +cluster_name: default + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5 + +# Kubernetes resources that need to be configured for the autoscaler to be +# able to manage the Ray cluster. If any of the provided resources don't +# exist, the autoscaler will attempt to create them. If this fails, you may +# not have the required permissions and will have to request them to be +# created by your cluster administrator. +provider: + type: staroid + + # Access token for Staroid from https://staroid.com/settings/accesstokens. + # Alternatively, you can set STAROID_ACCESS_TOKEN environment variable. + # https://github.com/staroids/staroid-python#configuration + # for more information. + access_token: + + # Staroid account to use. e.g. GITHUB/staroids + # Alternatively, you can set STAROID_ACCOUNT environment variable. + # Leave empty to select default account for given access token. + # https://github.com/staroids/staroid-python#configuration + # for more information. + account: + + # Name of a Staroid Kubernetes Engine (SKE) instance. + # Alternatively, you can set STAROID_SKE environment variable. + # An SKE is a virtualized Kubernetes cluster. + # Will create a new if not exists. + ske: "Ray cluster" + + # Cloud and Region to create an SKE when not exists. + # If SKE already exists, this value will be ignored. + # Supported cloud region can be found + # https://docs.staroid.com/ske/cloudregion.html. + ske_region: "aws us-west2" + + # To create a namespace in SKE, you need to specify a Github project. + # The Github project needs to have a staroid.yaml + # (https://docs.staroid.com/references/staroid_yaml.html). + # staroid.yaml defines various resources for the project, such as + # - Building container images can be accessed from the namespace + # - Kubernetes resources to create (like Persistent volume claim) + # on namespace creation + # You can fork when you need to customize. + # 1. Fork github.com/open-datastudio/ray + # 2. Change .staroid/ directory to cutomize + # 3. 
Connect forked repository (https://staroid.com/projects/settings) + # 4. Release your customized branch + # 4-1. Select project from 'My projects' menu + # 4-2. Select your branch in 'Release' tab + # 4-3. After build success, switch to 'Production' + # 4-4. Switch Launch permission to 'Public' if required + # 5. Change 'project' field to point your + # repository and branch in this file + project: "GITHUB/open-datastudio/ray:master-staroid" + + # 'spec.containers.image' field for ray-node and ray-worker will be + # overrided by the image built from the 'project' field above. + # Set this value to 'false' to not override the image. + image_from_project: true + + # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'. + # 'project' field above provides docker image for each python version. + # Fork 'project' if you'd like to support other python versions. + python_version: 3.7.7 + + # Exposing external IP addresses for ray pods isn't currently supported. + use_internal_ips: true + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + + # https://docs.staroid.com/ske/pod.html#pod + pod.staroid.com/spot: "false" # use on-demand instance for head. + + # Uncomment to locate ray head to dedicated Kubernetes node + # (GPU instance is only available for 'dedicated' isolation) + #pod.staroid.com/isolation: dedicated + #pod.staroid.com/instance-type: gpu-1 + spec: + automountServiceAccountToken: true + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. 
If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumes: + - name: dshm + emptyDir: + medium: Memory + # nfs volume provides a shared volume across all ray-nodes. + - name: nfs-volume + persistentVolumeClaim: + claimName: nfs + + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) + # Image will be overridden when 'image_from_project' is true. + image: rayproject/ray + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /nfs + name: nfs-volume + resources: + requests: + cpu: 1000m + memory: 2Gi + limits: + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. 
+ memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + - name: RAY_ADDRESS + value: "auto" + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node + # service is required. + labels: + component: ray-worker + + # https://docs.staroid.com/ske/pod.html#pod + pod.staroid.com/spot: "true" # use spot instance for workers. + + # Uncomment to locate ray head to dedicated Kubernetes node + # (GPU instance is only available for 'dedicated' isolation) + #pod.staroid.com/isolation: dedicated + #pod.staroid.com/instance-type: gpu-1 + spec: + serviceAccountName: default + + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: nfs-volume + persistentVolumeClaim: + claimName: nfs + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + image: rayproject/autoscaler + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. 
+ - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /nfs + name: nfs-volume + resources: + requests: + cpu: 1000m + memory: 2Gi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + # install staroid and kubernetes packages. Staroid node provider depends on them which autoscaler will use. 
+ - pip install -q staroid kubernetes + # install jupyterlab + - pip install -q jupyterlab + - ln -s /nfs /home/ray/nfs + - bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' & + # show 'notebook' link in staroid management console to access jupyter notebook. + - 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n name: notebook\n annotations:\n service.staroid.com/link: show\nspec:\n ports:\n - name: http\n port: 8888\n selector:\n component: ray-head" | kubectl apply -f -' + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/staroid/example-full.yaml b/python/ray/autoscaler/staroid/example-full.yaml index 25c8fe37d..cc4117059 100644 --- a/python/ray/autoscaler/staroid/example-full.yaml +++ b/python/ray/autoscaler/staroid/example-full.yaml @@ -150,7 +150,7 @@ head_node: # - screen (used for `ray attach`) # - kubectl (used by the autoscaler to manage worker pods) # Image will be overridden when 'image_from_project' is true. - image: rayproject/autoscaler + image: rayproject/ray # Do not change this command - it keeps the pod alive until it is # explicitly killed.
command: ["/bin/bash", "-c", "--"] diff --git a/python/setup.py b/python/setup.py index c629dc604..02c36bb6c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -71,16 +71,13 @@ generated_python_directories = [ optional_ray_files = ["ray/nightly-wheels.yaml"] ray_autoscaler_files = [ - "ray/autoscaler/aws/example-full.yaml", - "ray/autoscaler/azure/example-full.yaml", + "ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml", "ray/autoscaler/azure/azure-vm-template.json", "ray/autoscaler/azure/azure-config-template.json", - "ray/autoscaler/gcp/example-full.yaml", - "ray/autoscaler/local/example-full.yaml", - "ray/autoscaler/kubernetes/example-full.yaml", + "ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml", + "ray/autoscaler/kubernetes/defaults.yaml", "ray/autoscaler/kubernetes/kubectl-rsync.sh", - "ray/autoscaler/staroid/example-full.yaml", - "ray/autoscaler/ray-schema.json" + "ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json" ] ray_project_files = [