ray/python/ray/tune/examples/cluster-p3.yaml

# An unique identifier for the head node and workers of this cluster.
cluster_name: sgd-pytorch

# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
min_workers: 0
initial_workers: 0
max_workers: 0

target_utilization_fraction: 0.9

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 20
# docker:
#     image: tensorflow/tensorflow:1.5.0-py3
#     container_name: ray_docker

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu

head_node:
    InstanceType: p3.8xlarge
    ImageId: latest_dlami
    InstanceMarketOptions:
        MarketType: spot
           # SpotOptions:
           #     MaxPrice: "9.0"


worker_nodes:
    InstanceType: p3.8xlarge
    ImageId: latest_dlami
    # Run workers on spot by default. Comment this out to use on-demand.
    InstanceMarketOptions:
        MarketType: spot
        # SpotOptions:
        #     MaxPrice: "9.0"

setup_commands:
    - ray || pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
    - pip install -U ipdb ray[rllib] torch torchvision
    # Install apex.
    # - rm -rf apex || true
    # - git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir  ./ || true


file_mounts: {
}

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# # Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=1000000000

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --object-store-memory=1000000000