Files
ray/python/ray/autoscaler/staroid/example-full.yaml
T

331 lines
14 KiB
YAML

# A unique identifier for the head node and workers of this cluster.
# A namespace is automatically created in SKE for each cluster_name.
cluster_name: "default"  # use only 'a-z' and '-'

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 5

# The initial number of worker nodes to launch in addition to the head
# node. This many nodes are started when the cluster is first brought up,
# and again whenever it is refreshed with a subsequent `ray up`.
initial_workers: 0

# Whether or not to autoscale aggressively. If this is enabled, then whenever
# more workers would be started, at least enough are started to bring the
# cluster up to initial_workers.
autoscaling_mode: "default"

# The autoscaler scales the cluster up to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization_fraction is 0.8, the cluster is resized to 13 nodes.
# Decrease this fraction to make upscaling more aggressive.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8

# A node that has been idle for this many minutes is removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to ask your cluster
# administrator to create them.
provider:
  type: staroid

  # Access token for Staroid from https://staroid.com/settings/accesstokens.
  # Alternatively, you can set the STAROID_ACCESS_TOKEN environment variable.
  # See https://github.com/staroids/staroid-python#configuration
  # for more information.
  access_token:

  # Staroid account to use, e.g. GITHUB/staroids.
  # Alternatively, you can set the STAROID_ACCOUNT environment variable.
  # Leave empty to select the default account for the given access token.
  # See https://github.com/staroids/staroid-python#configuration
  # for more information.
  account:

  # Name of a Staroid Kubernetes Engine (SKE) instance.
  # Alternatively, you can set the STAROID_SKE environment variable.
  # An SKE is a virtualized Kubernetes cluster.
  # A new one is created if it does not exist yet.
  ske: "Ray cluster"

  # Cloud and region in which to create the SKE when it does not exist yet.
  # If the SKE already exists, this value is ignored.
  # Supported cloud regions are listed at
  # https://docs.staroid.com/ske/cloudregion.html.
  ske_region: "aws us-west2"

  # To create a namespace in SKE, you need to specify a GitHub project.
  # The GitHub project needs to have a staroid.yaml
  # (https://docs.staroid.com/references/staroid_yaml.html).
  # staroid.yaml defines various resources for the project, such as
  #   - Container images to build, which can be accessed from the namespace
  #   - Kubernetes resources to create (like a persistent volume claim)
  #     on namespace creation
  # You can fork the project when you need to customize it:
  #   1. Fork github.com/open-datastudio/ray-cluster
  #   2. Change contents
  #   3. Connect the forked repository (https://staroid.com/projects/settings)
  #   4. Release your customized branch
  #     4-1. Select the project from the 'My projects' menu
  #     4-2. Select your branch in the 'Release' tab
  #     4-3. After the build succeeds, switch to 'Production'
  #     4-4. Switch Launch permission to 'Public' if required
  #   5. Change the 'project' field in this file to point to your
  #      repository and branch
  project: "GITHUB/open-datastudio/ray-cluster:master"

  # The 'spec.containers.image' field of the head and worker pods will be
  # overridden by the image built from the 'project' field above.
  # Set this value to 'false' to not override the image.
  image_from_project: true

  # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
  # The 'project' field above provides a docker image for each python version.
  # Fork 'project' if you'd like to support other python versions.
  # Quoted so the value is always read as a string, never as a number.
  python_version: "3.7.7"

  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true
# Kubernetes pod config for the head node pod.
head_node:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-head-
    # Must match the selector of any Service that targets the head pod
    # (the notebook Service created in head_setup_commands selects
    # `component: ray-head`).
    labels:
      component: ray-head
      # https://docs.staroid.com/ske/pod.html
      pod.staroid.com/spot: "false"  # use on-demand instance for head.
      # Place the ray head on a dedicated Kubernetes node.
      # In dedicated mode, resource requests and limits in the pod spec will
      # be automatically overridden based on 'pod.staroid.com/instance-type'
      # below.
      pod.staroid.com/isolation: dedicated  # 'sandboxed' or 'dedicated'
      # Instance type to use in 'dedicated' mode, such as 'standard-4',
      # 'gpu-1'. See the available instance types at
      # https://docs.staroid.com/ske/pod.html.
      pod.staroid.com/instance-type: standard-4
  spec:
    automountServiceAccountToken: true

    # Restarting the head node automatically is not currently supported.
    # If the head node goes down, `ray up` must be run again.
    restartPolicy: Never

    # The dshm volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which causes slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: tmp-volume
        emptyDir: {}
      # The nfs volume provides a shared volume across all ray nodes.
      - name: nfs-volume
        persistentVolumeClaim:
          claimName: nfs

    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        #   - screen (used for `ray attach`)
        #   - kubectl (used by the autoscaler to manage worker pods)
        # The image will be overridden when 'image_from_project' is true.
        image: rayproject/autoscaler
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 6379  # Redis port.
          - containerPort: 6380  # Redis port.
          - containerPort: 6381  # Redis port.
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        # Mount points for the volumes declared in this pod spec. /dev/shm
        # is backed by the in-memory dshm volume used for the plasma object
        # store.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /tmp
            name: tmp-volume
          - mountPath: /nfs
            name: nfs-volume
        resources:
          requests:
            cpu: 4000m
            memory: 8Gi
          limits:
            cpu: 4000m
            # The maximum memory that this pod is allowed to use. The
            # limit will be detected by ray and split to use 10% for
            # redis, 30% for the shared memory object store, and the
            # rest for application memory. If this limit is not set and
            # the object store size is not set manually, ray will
            # allocate a very large object store in each pod that may
            # cause problems for other pods.
            memory: 8Gi
        env:
          # This is used in the head_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance. The value is taken from the
          # container's CPU limit.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: limits.cpu
          - name: RAY_ADDRESS
            value: "auto"
# Kubernetes pod config for worker node pods.
worker_nodes:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-worker-
    # Must match the selector of any Service that targets the worker pods,
    # if such a Service is required.
    labels:
      component: ray-worker
      # https://docs.staroid.com/ske/pod.html
      pod.staroid.com/spot: "true"
      # Place each ray worker on a dedicated Kubernetes node.
      # In dedicated mode, resource requests and limits in the pod spec will
      # be automatically overridden based on 'pod.staroid.com/instance-type'
      # below.
      pod.staroid.com/isolation: dedicated  # 'sandboxed' or 'dedicated'
      # Instance type to use in 'dedicated' mode, such as 'standard-4',
      # 'gpu-1'. See the available instance types at
      # https://docs.staroid.com/ske/pod.html.
      pod.staroid.com/instance-type: standard-4
  spec:
    serviceAccountName: default

    # Worker nodes will be managed automatically by the head node, so
    # do not change the restart policy.
    restartPolicy: Never

    # The dshm volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which causes slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: tmp-volume
        emptyDir: {}
      - name: nfs-volume
        persistentVolumeClaim:
          claimName: nfs

    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        image: rayproject/autoscaler
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        # Mount points for the volumes declared in this pod spec. /dev/shm
        # is backed by the in-memory dshm volume used for the plasma object
        # store.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /tmp
            name: tmp-volume
          - mountPath: /nfs
            name: nfs-volume
        resources:
          requests:
            cpu: 4000m
            memory: 8Gi
          limits:
            cpu: 4000m
            # This memory limit will be detected by ray and split into
            # 30% for plasma, and 70% for workers.
            memory: 8Gi
        env:
          # This is used in the worker_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance. The value is taken from the
          # container's CPU limit.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: limits.cpu
# Files or directories to copy to the head and worker nodes, written as a
# mapping of REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
  # "/path1/on/remote/machine": "/path1/on/local/machine",
  # "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes, given
# as a list of paths. Each path on the head node is copied to the same path
# on the worker node. This is a subset of what file_mounts does; in the vast
# majority of cases you should just use file_mounts. Only use this if you
# know what you're doing!
cluster_synced_files: []

# Commands that run before `setup_commands`. If docker is enabled, these
# commands run outside the container and before docker is set up.
initialization_commands: []

# Shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
  # Install the staroid and kubernetes packages. The Staroid node provider
  # used by the autoscaler depends on them.
  - pip install -q staroid kubernetes
  # Install JupyterLab.
  - pip install -q jupyterlab
  # Link the shared /nfs mount into the home directory. -f and -n replace an
  # existing link in place, so the command also succeeds when setup is run
  # again (e.g. on a subsequent `ray up`).
  - ln -sfn /nfs /home/ray/nfs
  # Start JupyterLab in the background, serving /home/ray. The token and
  # password are set to empty strings.
  - bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' &
  # Show a 'notebook' link in the staroid management console to access the
  # jupyter notebook. This applies a Service named 'notebook' for port 8888
  # that selects the `component: ray-head` pod. The spaces after each \n are
  # the manifest's YAML indentation and must be preserved.
  - 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n  name: notebook\n  annotations:\n    service.staroid.com/link: show\nspec:\n  ports:\n  - name: http\n    port: 8888\n  selector:\n    component: ray-head" | kubectl apply -f -'
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Commands that start ray on the head node. You don't need to change these.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
# The folded scalar (>-) joins its lines with single spaces into one command.
head_start_ray_commands:
  - ray stop
  - >-
    ulimit -n 65536;
    ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379
    --object-manager-port=8076
    --autoscaling-config=~/ray_bootstrap_config.yaml
    --dashboard-host 0.0.0.0

# Commands that start ray on worker nodes. You don't need to change these.
# The folded scalar (>-) joins its lines with single spaces into one command.
worker_start_ray_commands:
  - ray stop
  - >-
    ulimit -n 65536;
    ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379
    --object-manager-port=8076