[Docker] Set Docker as the Default (#11416)

2026-06-27 23:39:37 +08:00 · 2020-10-19 10:53:30 -07:00
parent f500292d41
commit acbd12eabf
20 changed files with 1247 additions and 115 deletions
@@ -50,38 +50,35 @@ def _import_staroid(provider_config):
    return StaroidNodeProvider


-def _load_local_example_config():
+def _load_local_defaults_config():
    import ray.autoscaler.local as ray_local
-    return os.path.join(
-        os.path.dirname(ray_local.__file__), "example-full.yaml")
+    return os.path.join(os.path.dirname(ray_local.__file__), "defaults.yaml")


-def _load_kubernetes_example_config():
+def _load_kubernetes_defaults_config():
    import ray.autoscaler.kubernetes as ray_kubernetes
    return os.path.join(
-        os.path.dirname(ray_kubernetes.__file__), "example-full.yaml")
+        os.path.dirname(ray_kubernetes.__file__), "defaults.yaml")


-def _load_aws_example_config():
+def _load_aws_defaults_config():
    import ray.autoscaler.aws as ray_aws
-    return os.path.join(os.path.dirname(ray_aws.__file__), "example-full.yaml")
+    return os.path.join(os.path.dirname(ray_aws.__file__), "defaults.yaml")


-def _load_gcp_example_config():
+def _load_gcp_defaults_config():
    import ray.autoscaler.gcp as ray_gcp
-    return os.path.join(os.path.dirname(ray_gcp.__file__), "example-full.yaml")
+    return os.path.join(os.path.dirname(ray_gcp.__file__), "defaults.yaml")


-def _load_azure_example_config():
+def _load_azure_defaults_config():
    import ray.autoscaler.azure as ray_azure
-    return os.path.join(
-        os.path.dirname(ray_azure.__file__), "example-full.yaml")
+    return os.path.join(os.path.dirname(ray_azure.__file__), "defaults.yaml")


-def _load_staroid_example_config():
+def _load_staroid_defaults_config():
    import ray.autoscaler.staroid as ray_staroid
-    return os.path.join(
-        os.path.dirname(ray_staroid.__file__), "example-full.yaml")
+    return os.path.join(os.path.dirname(ray_staroid.__file__), "defaults.yaml")


 def _import_external(provider_config):
@@ -110,12 +107,12 @@ _PROVIDER_PRETTY_NAMES = {
 }

 _DEFAULT_CONFIGS = {
-    "local": _load_local_example_config,
-    "aws": _load_aws_example_config,
-    "gcp": _load_gcp_example_config,
-    "azure": _load_azure_example_config,
-    "staroid": _load_staroid_example_config,
-    "kubernetes": _load_kubernetes_example_config,
+    "local": _load_local_defaults_config,
+    "aws": _load_aws_defaults_config,
+    "gcp": _load_gcp_defaults_config,
+    "azure": _load_azure_defaults_config,
+    "staroid": _load_staroid_defaults_config,
+    "kubernetes": _load_kubernetes_defaults_config,
 }


@@ -0,0 +1,139 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker: {}
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# The maximum value allowed is 1.0, which is the most conservative setting.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    # Availability zone(s), comma-separated, that nodes may be launched in.
+    # Nodes are currently spread between zones by a round-robin approach,
+    # however this implementation detail should not be relied upon.
+    availability_zone: us-west-2a,us-west-2b
+    # Whether to allow node reuse. If set to False, nodes will be terminated
+    # instead of stopped.
+    cache_stopped_nodes: True # If not present, the default is True.
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below.
+#    ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+head_node:
+    InstanceType: m5.large
+    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+
+    # You can provision additional disk space with a conf as follows
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 100
+
+    # Additional options in the boto docs.
+
+# Provider-specific config for worker nodes, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+worker_nodes:
+    InstanceType: m5.large
+    ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+
+    # Run workers on spot by default. Comment this out to use on-demand.
+    InstanceMarketOptions:
+        MarketType: spot
+        # Additional options can be found in the boto docs, e.g.
+        #   SpotOptions:
+        #       MaxPrice: MAX_HOURLY_PRICE
+
+    # Additional options in the boto docs.
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    # Note: if you're developing Ray, you probably want to create an AMI that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
+    # Consider uncommenting these if you also want to run apt-get commands during setup
+    # - sudo pkill -9 apt-get || true
+    # - sudo pkill -9 dpkg || true
+    # - sudo dpkg --configure -a
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install 'boto3>=1.4.8'  # 1.4.8 adds InstanceMarketOptions
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -23,18 +23,18 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "" # e.g., rayproject/ray:0.8.7
-    container_name: "" # e.g. ray_docker
+    image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"
    # Allow Ray to automatically detect GPUs

-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest-cpu"
    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
@@ -123,20 +123,15 @@ file_mounts_sync_continuously: False
 initialization_commands: []

 # List of shell commands to run to set up nodes.
-setup_commands:
-    # Note: if you're developing Ray, you probably want to create an AMI that
+setup_commands: []
+    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
-    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
-    # Consider uncommenting these if you also want to run apt-get commands during setup
-    # - sudo pkill -9 apt-get || true
-    # - sudo pkill -9 dpkg || true
-    # - sudo dpkg --configure -a
+    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
-head_setup_commands:
-    - pip install 'boto3>=1.4.8'  # 1.4.8 adds InstanceMarketOptions
+head_setup_commands: []

 # Custom commands that will be run on worker nodes after common setup.
 worker_setup_commands: []
@@ -23,13 +23,13 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:0.8.7-gpu"
-    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
+    image: "rayproject/ray:latest-gpu"
+    container_name: "ray_nvidia_docker" # e.g. ray_docker

    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"

-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest"

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -99,8 +99,8 @@ file_mounts: {
 }

 # List of shell commands to run to set up nodes.
-# NOTE: rayproject/ray:0.8.7 has ray 0.8.7 bundled
-# setup_commands:
+# NOTE: rayproject/ray:latest has the latest version of Ray bundled
+setup_commands: []
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl

@@ -28,7 +28,7 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "" # e.g., rayproject/ray:0.8.7
+    image: "" # e.g., rayproject/ray:latest
    container_name: "" # e.g. ray_docker
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
@@ -36,9 +36,9 @@ docker:
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"

-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest"

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -0,0 +1,136 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker: {}
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: azure
+    # https://azure.microsoft.com/en-us/global-infrastructure/locations
+    location: westus2
+    resource_group: ray-cluster
+    # set subscription id otherwise the default from az cli will be used
+    # subscription_id: 00000000-0000-0000-0000-000000000000
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+    # you must specify paths to matching private and public key pair files
+    # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
+    ssh_private_key: ~/.ssh/id_rsa
+    ssh_public_key: ~/.ssh/id_rsa.pub
+
+# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
+# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
+# Changes to the local file will be used during deployment of the head node; however, worker node deployment occurs
+# on the head node, so changes to the template must be included in the wheel file used in the setup_commands section below.
+
+# Provider-specific config for the head node, e.g. instance type.
+head_node:
+    azure_arm_parameters:
+        vmSize: Standard_D2s_v3
+        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+        imagePublisher: microsoft-dsvm
+        imageOffer: ubuntu-1804
+        imageSku: 1804-gen2
+        imageVersion: 20.02.01
+
+# Provider-specific config for worker nodes, e.g. instance type.
+worker_nodes:
+    azure_arm_parameters:
+        vmSize: Standard_D2s_v3
+        # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+        imagePublisher: microsoft-dsvm
+        imageOffer: ubuntu-1804
+        imageSku: 1804-gen2
+        imageVersion: 20.02.01
+        # optionally set priority to use Spot instances
+        priority: Spot
+        # set a maximum price for spot instances if desired
+        # billingProfile:
+        #     maxPrice: -1
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands:
+    # get rid of annoying Ubuntu message
+    - touch ~/.sudo_as_admin_successful
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    # Note: if you're developing Ray, you probably want to create a VM image that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    # - echo 'conda activate py37_pytorch' >> ~/.bashrc
+    - echo 'conda activate py37_tensorflow' >> ~/.bashrc
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
+    # Consider uncommenting these if you also want to run apt-get commands during setup
+    # - sudo pkill -9 apt-get || true
+    # - sudo pkill -9 dpkg || true
+    # - sudo dpkg --configure -a
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -23,17 +23,19 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "" # e.g., rayproject/ray:0.8.7
-    container_name: "" # e.g. ray_docker
+    image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"
+    # Allow Ray to automatically detect GPUs

-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest-cpu"
+    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -117,17 +119,12 @@ initialization_commands:
    - touch ~/.sudo_as_admin_successful

 # List of shell commands to run to set up nodes.
-setup_commands:
-    # Note: if you're developing Ray, you probably want to create an AMI that
+setup_commands: []
+    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
-    # - echo 'conda activate py37_pytorch' >> ~/.bashrc
-    - echo 'conda activate py37_tensorflow' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
-    # Consider uncommenting these if you also want to run apt-get commands during setup
-    # - sudo pkill -9 apt-get || true
-    # - sudo pkill -9 dpkg || true
-    # - sudo dpkg --configure -a
+    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -23,13 +23,13 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:0.8.7-gpu"
-    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
+    image: "rayproject/ray:latest-gpu"
+    container_name: "ray_nvidia_docker" # e.g. ray_docker

    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"

-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest"

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -55,13 +55,13 @@ auth:
    ssh_public_key: ~/.ssh/id_rsa.pub

 # Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using example-full.yaml
+# Ray will auto-configure unspecified fields using defaults.yaml
 head_node:
    azure_arm_parameters:
        vmSize: Standard_NC6s_v3

 # Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using example-full.yaml
+# Ray will auto-configure unspecified fields using defaults.yaml
 worker_nodes:
    azure_arm_parameters:
        vmSize: Standard_NC6s_v3
@@ -74,8 +74,8 @@ file_mounts: {
 }

 # List of shell commands to run to set up nodes.
-# NOTE: rayproject/ray:0.8.7 has ray 0.8.7 bundled
-# setup_commands:
+# NOTE: rayproject/ray:latest has the latest version of Ray bundled
+setup_commands: []
 #     - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
 
 # Custom commands that will be run on the head node after common setup.
@@ -23,17 +23,17 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "" # e.g., rayproject/ray:0.8.7-gpu
-    container_name: "" # e.g. ray_docker
+    image: "rayproject/ray:latest-gpu"
+    container_name: "ray_docker"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
-    pull_before_run: True
+    pull_before_run: False
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"

-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest"

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -63,7 +63,7 @@ auth:
    ssh_public_key: ~/.ssh/id_rsa.pub

 # Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using example-full.yaml
+# Ray will auto-configure unspecified fields using defaults.yaml
 head_node:
    azure_arm_parameters:
        vmSize: Standard_NC6
@@ -74,7 +74,7 @@ head_node:
        imageVersion: 20.02.01

 # Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields using example-full.yaml
+# Ray will auto-configure unspecified fields using defaults.yaml
 worker_nodes:
    azure_arm_parameters:
        vmSize: Standard_NC6
@@ -0,0 +1,165 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker: {}
+
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: gcp
+    region: us-west1
+    availability_zone: us-west1-a
+    project_id: null # Globally unique project id
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below. This requires that you have added the key into the
+# project wide meta-data.
+#    ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+# For more documentation on available fields, see:
+# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+head_node:
+    machineType: n1-standard-2
+    disks:
+      - boot: true
+        autoDelete: true
+        type: PERSISTENT
+        initializeParams:
+          diskSizeGb: 50
+          # See https://cloud.google.com/compute/docs/images for more images
+          sourceImage: projects/deeplearning-platform-release/global/images/family/tf-1-13-cpu
+
+    # Additional options can be found in the compute docs at
+    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+    # If the network interface is specified as below in both head and worker
+    # nodes, the manual network config is used.  Otherwise an existing subnet is
+    # used.  To use a shared subnet, ask the subnet owner to grant permission
+    # for 'compute.subnetworks.use' to the ray autoscaler account...
+    # networkInterfaces:
+    #   - kind: compute#networkInterface
+    #     subnetwork: path/to/subnet
+    #     aliasIpRanges: []
+
+worker_nodes:
+    machineType: n1-standard-2
+    disks:
+      - boot: true
+        autoDelete: true
+        type: PERSISTENT
+        initializeParams:
+          diskSizeGb: 50
+          # See https://cloud.google.com/compute/docs/images for more images
+          sourceImage: projects/deeplearning-platform-release/global/images/family/tf-1-13-cpu
+    # Run workers on preemptible instances by default.
+    # Comment this out to use on-demand.
+    scheduling:
+      - preemptible: true
+
+    # Additional options can be found in the compute docs at
+    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    # Note: if you're developing Ray, you probably want to create a VM image that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
+
+    # Install MiniConda.
+    - >-
+      wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/anaconda3.sh
+      || true
+      && bash ~/anaconda3.sh -b -p ~/anaconda3 || true
+      && rm ~/anaconda3.sh
+      && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile
+
+    # Install ray
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
+
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+  - pip install google-api-python-client==1.7.8
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - >-
+      ulimit -n 65536;
+      ray start
+      --head
+      --port=6379
+      --object-manager-port=8076
+      --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - >-
+      ulimit -n 65536;
+      ray start
+      --address=$RAY_HEAD_IP:6379
+      --object-manager-port=8076
@@ -23,12 +23,19 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "" # e.g., rayproject/ray:0.8.7
-    container_name: "" # e.g. ray_docker
-    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
-    # if no cached version is present.
-    pull_before_run: True
-    run_options: []  # Extra options to pass into "docker run"
+  image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+  container_name: "ray_container"
+  # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
+  # if no cached version is present.
+  pull_before_run: True
+  run_options: []  # Extra options to pass into "docker run"
+
+  # Example of running a GPU head with CPU workers
+  # head_image: "rayproject/ray:latest-gpu"
+  # Allow Ray to automatically detect GPUs
+
+  # worker_image: "rayproject/ray:latest-cpu"
+  # worker_run_options: []


 # The autoscaler will scale up the cluster to this target fraction of resource
@@ -125,22 +132,12 @@ file_mounts_sync_continuously: False
 initialization_commands: []

 # List of shell commands to run to set up nodes.
-setup_commands:
-    # Note: if you're developing Ray, you probably want to create an AMI that
+setup_commands: []
+    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
-    # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
-
-    # Install MiniConda.
-    - >-
-      wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/anaconda3.sh
-      || true
-      && bash ~/anaconda3.sh -b -p ~/anaconda3 || true
-      && rm ~/anaconda3.sh
-      && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile
-
-    # Install ray
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
+    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl


 # Custom commands that will be run on the head node after common setup.
@@ -23,14 +23,14 @@ autoscaling_mode: default
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:0.8.7-gpu"
-    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
+    image: "rayproject/ray:latest-gpu"
+    container_name: "ray_nvidia_docker" # e.g. ray_docker

    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:0.8.7-gpu"
+    # head_image: "rayproject/ray:latest-gpu"


-    # worker_image: "rayproject/ray:0.8.7"
+    # worker_image: "rayproject/ray:latest"


 # The autoscaler will scale up the cluster to this target fraction of resource
@@ -129,8 +129,8 @@ initialization_commands:
          done"

 # List of shell commands to run to set up nodes.
-# NOTE: rayproject/ray:0.8.7 has ray 0.8.7 bundled
-# setup_commands:
+# NOTE: rayproject/ray:latest has the latest Ray release bundled
+setup_commands: []
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl

@@ -0,0 +1,301 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Kubernetes resources that need to be configured for the autoscaler to be
+# able to manage the Ray cluster. If any of the provided resources don't
+# exist, the autoscaler will attempt to create them. If this fails, you may
+# not have the required permissions and will have to request them to be
+# created by your cluster administrator.
+provider:
+    type: kubernetes
+
+    # Exposing external IP addresses for ray pods isn't currently supported.
+    use_internal_ips: true
+
+    # Namespace to use for all resources created.
+    namespace: ray
+
+    # ServiceAccount created by the autoscaler for the head node pod that it
+    # runs in. If this field isn't provided, the head pod config below must
+    # contain a user-created service account with the proper permissions.
+    autoscaler_service_account:
+        apiVersion: v1
+        kind: ServiceAccount
+        metadata:
+            name: autoscaler
+
+    # Role created by the autoscaler for the head node pod that it runs in.
+    # If this field isn't provided, the role referenced in
+    # autoscaler_role_binding must exist and have at least these permissions.
+    autoscaler_role:
+        kind: Role
+        apiVersion: rbac.authorization.k8s.io/v1
+        metadata:
+            name: autoscaler
+        rules:
+        - apiGroups: [""]
+          resources: ["pods", "pods/status", "pods/exec"]
+          verbs: ["get", "watch", "list", "create", "delete", "patch"]
+
+    # RoleBinding created by the autoscaler for the head node pod that it runs
+    # in. If this field isn't provided, the head pod config below must contain
+    # a user-created service account with the proper permissions.
+    autoscaler_role_binding:
+        apiVersion: rbac.authorization.k8s.io/v1
+        kind: RoleBinding
+        metadata:
+            name: autoscaler
+        subjects:
+        - kind: ServiceAccount
+          name: autoscaler
+        roleRef:
+            kind: Role
+            name: autoscaler
+            apiGroup: rbac.authorization.k8s.io
+
+    services:
+      # Service that maps to the head node of the Ray cluster.
+      - apiVersion: v1
+        kind: Service
+        metadata:
+            # NOTE: If you're running multiple Ray clusters with services
+            # on one Kubernetes cluster, they must have unique service
+            # names.
+            name: ray-head
+        spec:
+            # This selector must match the head node pod's selector below.
+            selector:
+                component: ray-head
+            ports:
+                - protocol: TCP
+                  port: 8000
+                  targetPort: 8000
+
+      # Service that maps to the worker nodes of the Ray cluster.
+      - apiVersion: v1
+        kind: Service
+        metadata:
+            # NOTE: If you're running multiple Ray clusters with services
+            # on one Kubernetes cluster, they must have unique service
+            # names.
+            name: ray-workers
+        spec:
+            # This selector must match the worker node pods' selector below.
+            selector:
+                component: ray-worker
+            ports:
+                - protocol: TCP
+                  port: 8000
+                  targetPort: 8000
+
+# Kubernetes pod config for the head node pod.
+head_node:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-head-
+
+        # Must match the head node service selector above if a head node
+        # service is required.
+        labels:
+            component: ray-head
+    spec:
+        # Change this if you altered the autoscaler_service_account above
+        # or want to provide your own.
+        serviceAccountName: autoscaler
+
+        # Restarting the head node automatically is not currently supported.
+        # If the head node goes down, `ray up` must be run again.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          #   - screen (used for `ray attach`)
+          #   - kubectl (used by the autoscaler to manage worker pods)
+          image: rayproject/ray
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 6379 # Redis port.
+              - containerPort: 6380 # Redis port.
+              - containerPort: 6381 # Redis port.
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+          resources:
+              requests:
+                  cpu: 1000m
+                  memory: 512Mi
+              limits:
+                  # The maximum memory that this pod is allowed to use. The
+                  # limit will be detected by ray and split to use 10% for
+                  # redis, 30% for the shared memory object store, and the
+                  # rest for application memory. If this limit is not set and
+                  # the object store size is not set manually, ray will
+                  # allocate a very large object store in each pod that may
+                  # cause problems for other pods.
+                  memory: 2Gi
+          env:
+              # This is used in the head_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: requests.cpu
+
+# Kubernetes pod config for worker node pods.
+worker_nodes:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-worker-
+
+        # Must match the worker node service selector above if a worker node
+        # service is required.
+        labels:
+            component: ray-worker
+    spec:
+        serviceAccountName: default
+
+        # Worker nodes will be managed automatically by the head node, so
+        # do not change the restart policy.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          image: rayproject/ray
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+          resources:
+              requests:
+                  cpu: 1000m
+                  memory: 512Mi
+              limits:
+                  # This memory limit will be detected by ray and split into
+                  # 30% for plasma, and 70% for workers.
+                  memory: 2Gi
+          env:
+              # This is used in the worker_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: requests.cpu
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands: []
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands: []
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,92 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+## NOTE: Typically for local clusters, min_workers == initial_workers == max_workers == len(worker_ips).
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+# Typically, min_workers == initial_workers == max_workers == len(worker_ips).
+min_workers: 0
+# The initial number of worker nodes to launch in addition to the head node.
+# Typically, min_workers == initial_workers == max_workers == len(worker_ips).
+initial_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head node.
+# This takes precedence over min_workers.
+# Typically, min_workers == initial_workers == max_workers == len(worker_ips).
+max_workers: 0
+
+# Autoscaling parameters.
+# Ignore this if min_workers == initial_workers == max_workers.
+autoscaling_mode: default
+target_utilization_fraction: 0.8
+idle_timeout_minutes: 5
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled. Assumes Docker is installed.
+docker: {}
+
+# Local specific configuration.
+provider:
+    type: local
+    head_ip: YOUR_HEAD_NODE_HOSTNAME
+    worker_ips: [WORKER_NODE_1_HOSTNAME, WORKER_NODE_2_HOSTNAME, ... ]
+    # Optional when running automatic cluster management on prem. If you use a coordinator server,
+    # then you can launch multiple autoscaling clusters on the same set of machines, and the coordinator
+    # will assign individual nodes to clusters as needed.
+    #    coordinator_address: "<host>:<port>"
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: YOUR_USERNAME
+    # Optional if an ssh private key is necessary to ssh to the cluster.
+    # ssh_private_key: ~/.ssh/id_rsa
+
+# Leave this empty.
+head_node: {}
+
+# Leave this empty.
+worker_nodes: {}
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
+# List of shell commands to run to set up each node.
+setup_commands:
+    - pip install -U ray
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands: []
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -c unlimited && ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ray start --address=$RAY_HEAD_IP:6379
@@ -26,8 +26,8 @@ idle_timeout_minutes: 5
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled. Assumes Docker is installed.
 docker:
-    image: "" # e.g., rayproject/ray:0.8.7
-    container_name: "" # e.g. ray_docker
+    image: "rayproject/ray-gpu:latest" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
@@ -78,8 +78,12 @@ file_mounts_sync_continuously: False
 initialization_commands: []

 # List of shell commands to run to set up each nodes.
-setup_commands:
-    - pip install -U ray
+setup_commands: []
+    # Note: if you're developing Ray, you probably want to create a Docker image that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands: []
@@ -188,7 +188,7 @@
                "image": {
                    "type": "string",
                    "description": "the docker image name",
-                    "default": "rayproject/ray:0.8.7"
+                    "default": "rayproject/ray:latest"
                },
                "container_name": {
                    "type": "string",
@@ -0,0 +1,312 @@
+# A unique identifier for the head node and workers of this cluster.
+# A namespace will be automatically created for each cluster_name in SKE.
+cluster_name: default
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Kubernetes resources that need to be configured for the autoscaler to be
+# able to manage the Ray cluster. If any of the provided resources don't
+# exist, the autoscaler will attempt to create them. If this fails, you may
+# not have the required permissions and will have to request them to be
+# created by your cluster administrator.
+provider:
+    type: staroid
+
+    # Access token for Staroid from https://staroid.com/settings/accesstokens.
+    # Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    access_token:
+
+    # Staroid account to use. e.g. GITHUB/staroids
+    # Alternatively, you can set STAROID_ACCOUNT environment variable.
+    # Leave empty to select default account for given access token.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    account:
+
+    # Name of a Staroid Kubernetes Engine (SKE) instance.
+    # Alternatively, you can set STAROID_SKE environment variable.
+    # An SKE is a virtualized Kubernetes cluster.
+    # A new one will be created if it does not exist.
+    ske: "Ray cluster"
+
+    # Cloud and region in which to create the SKE if it does not exist.
+    # If SKE already exists, this value will be ignored.
+    # Supported cloud regions can be found at
+    # https://docs.staroid.com/ske/cloudregion.html.
+    ske_region: "aws us-west2"
+
+    # To create a namespace in SKE, you need to specify a Github project.
+    # The Github project needs to have a staroid.yaml
+    # (https://docs.staroid.com/references/staroid_yaml.html).
+    # staroid.yaml defines various resources for the project, such as
+    #   - Building container images can be accessed from the namespace
+    #   - Kubernetes resources to create (like Persistent volume claim)
+    #     on namespace creation
+    # You can fork when you need to customize.
+    #   1. Fork github.com/open-datastudio/ray
+    #   2. Change the .staroid/ directory to customize
+    #   3. Connect forked repository (https://staroid.com/projects/settings)
+    #   4. Release your customized branch
+    #      4-1. Select project from 'My projects' menu
+    #      4-2. Select your branch in 'Release' tab
+    #      4-3. After build success, switch to 'Production'
+    #      4-4. Switch Launch permission to 'Public' if required
+    #   5. Change the 'project' field in this file to point to your
+    #      repository and branch
+    project: "GITHUB/open-datastudio/ray:master-staroid"
+
+    # 'spec.containers.image' field for ray-node and ray-worker will be
+    # overridden by the image built from the 'project' field above.
+    # Set this value to 'false' to not override the image.
+    image_from_project: true
+
+    # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
+    # 'project' field above provides docker image for each python version.
+    # Fork 'project' if you'd like to support other python versions.
+    python_version: 3.7.7
+
+    # Exposing external IP addresses for ray pods isn't currently supported.
+    use_internal_ips: true
+
+# Kubernetes pod config for the head node pod.
+head_node:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-head-
+
+        # Must match the head node service selector above if a head node
+        # service is required.
+        labels:
+            component: ray-head
+
+            # https://docs.staroid.com/ske/pod.html#pod
+            pod.staroid.com/spot: "false" # use on-demand instance for head.
+
+            # Uncomment to locate ray head to dedicated Kubernetes node
+            # (GPU instance is only available for 'dedicated' isolation)
+            #pod.staroid.com/isolation: dedicated
+            #pod.staroid.com/instance-type: gpu-1
+    spec:
+        automountServiceAccountToken: true
+
+        # Restarting the head node automatically is not currently supported.
+        # If the head node goes down, `ray up` must be run again.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+        # nfs volume provides a shared volume across all ray-nodes.
+        - name: nfs-volume
+          persistentVolumeClaim:
+            claimName: nfs
+
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          #   - screen (used for `ray attach`)
+          #   - kubectl (used by the autoscaler to manage worker pods)
+          # Image will be overridden when 'image_from_project' is true.
+          image: rayproject/ray
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 6379 # Redis port.
+              - containerPort: 6380 # Redis port.
+              - containerPort: 6381 # Redis port.
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /nfs
+                name: nfs-volume
+          resources:
+              requests:
+                  cpu: 1000m
+                  memory: 2Gi
+              limits:
+                  # The maximum memory that this pod is allowed to use. The
+                  # limit will be detected by ray and split to use 10% for
+                  # redis, 30% for the shared memory object store, and the
+                  # rest for application memory. If this limit is not set and
+                  # the object store size is not set manually, ray will
+                  # allocate a very large object store in each pod that may
+                  # cause problems for other pods.
+                  memory: 2Gi
+          env:
+              # This is used in the head_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: requests.cpu
+              - name: RAY_ADDRESS
+                value: "auto"
+
+# Kubernetes pod config for worker node pods.
+worker_nodes:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-worker-
+
+        # Must match the worker node service selector above if a worker node
+        # service is required.
+        labels:
+            component: ray-worker
+
+            # https://docs.staroid.com/ske/pod.html#pod
+            pod.staroid.com/spot: "true" # use spot instance for workers.
+
+            # Uncomment to place ray workers on a dedicated Kubernetes node
+            # (GPU instance is only available for 'dedicated' isolation)
+            #pod.staroid.com/isolation: dedicated
+            #pod.staroid.com/instance-type: gpu-1
+    spec:
+        serviceAccountName: default
+
+        # Worker nodes will be managed automatically by the head node, so
+        # do not change the restart policy.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+        - name: nfs-volume
+          persistentVolumeClaim:
+            claimName: nfs
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          image: rayproject/autoscaler
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /nfs
+                name: nfs-volume
+          resources:
+              requests:
+                  cpu: 1000m
+                  memory: 2Gi
+              limits:
+                  # This memory limit will be detected by ray and split into
+                  # 30% for plasma, and 70% for workers.
+                  memory: 2Gi
+          env:
+              # This is used in the worker_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: requests.cpu
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands: []
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    # install staroid and kubernetes packages. Staroid node provider depends on them which autoscaler will use.
+    - pip install -q staroid kubernetes
+    # install jupyterlab
+    - pip install -q jupyterlab
+    - ln -s /nfs /home/ray/nfs
+    - bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' &
+    # show 'notebook' link in staroid management console to access jupyter notebook.
+    - 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n  name: notebook\n  annotations:\n    service.staroid.com/link: show\nspec:\n  ports:\n  - name: http\n    port: 8888\n  selector:\n    component: ray-head" | kubectl apply -f -'
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -150,7 +150,7 @@ head_node:
          #   - screen (used for `ray attach`)
          #   - kubectl (used by the autoscaler to manage worker pods)
          # Image will be overridden when 'image_from_project' is true.
-          image: rayproject/autoscaler
+          image: rayproject/ray
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
@@ -71,16 +71,13 @@ generated_python_directories = [
 optional_ray_files = ["ray/nightly-wheels.yaml"]

 ray_autoscaler_files = [
-    "ray/autoscaler/aws/example-full.yaml",
-    "ray/autoscaler/azure/example-full.yaml",
+    "ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml",
    "ray/autoscaler/azure/azure-vm-template.json",
    "ray/autoscaler/azure/azure-config-template.json",
-    "ray/autoscaler/gcp/example-full.yaml",
-    "ray/autoscaler/local/example-full.yaml",
-    "ray/autoscaler/kubernetes/example-full.yaml",
+    "ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml",
+    "ray/autoscaler/kubernetes/defaults.yaml",
    "ray/autoscaler/kubernetes/kubectl-rsync.sh",
-    "ray/autoscaler/staroid/example-full.yaml",
-    "ray/autoscaler/ray-schema.json"
+    "ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json"
 ]

 ray_project_files = [