# Mirror of https://github.com/wassname/ray.git
# Synced 2026-07-02 18:31:37 +08:00 (331 lines, 14 KiB, YAML)

# A unique identifier for the head node and workers of this cluster.
# A namespace will be automatically created for each cluster_name in SKE.
cluster_name: default  # name with 'a-z' and '-'

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 5

# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0

# Whether or not to autoscale aggressively. When this is enabled, any time
# the autoscaler would start more workers it starts at least enough to bring
# the cluster up to initial_workers.
autoscaling_mode: default

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: staroid

  # Access token for Staroid from https://staroid.com/settings/accesstokens.
  # Alternatively, you can set the STAROID_ACCESS_TOKEN environment variable.
  # See https://github.com/staroids/staroid-python#configuration
  # for more information.
  access_token: null

  # Staroid account to use, e.g. GITHUB/staroids.
  # Alternatively, you can set the STAROID_ACCOUNT environment variable.
  # Leave empty to select the default account for the given access token.
  # See https://github.com/staroids/staroid-python#configuration
  # for more information.
  account: null

  # Name of a Staroid Kubernetes Engine (SKE) instance.
  # Alternatively, you can set the STAROID_SKE environment variable.
  # An SKE is a virtualized Kubernetes cluster.
  # A new one is created if it does not exist yet.
  ske: "Ray cluster"

  # Cloud and region in which to create the SKE when it does not exist.
  # If the SKE already exists, this value is ignored.
  # Supported cloud regions are listed at
  # https://docs.staroid.com/ske/cloudregion.html.
  ske_region: "aws us-west2"

  # To create a namespace in SKE, you need to specify a GitHub project.
  # The GitHub project needs to have a staroid.yaml
  # (https://docs.staroid.com/references/staroid_yaml.html).
  # staroid.yaml defines various resources for the project, such as
  #   - Container images to build, which can be accessed from the namespace
  #   - Kubernetes resources to create (like a persistent volume claim)
  #     on namespace creation
  # You can fork the project when you need to customize it:
  #   1. Fork github.com/open-datastudio/ray-cluster
  #   2. Change contents
  #   3. Connect the forked repository (https://staroid.com/projects/settings)
  #   4. Release your customized branch
  #     4-1. Select the project from the 'My projects' menu
  #     4-2. Select your branch in the 'Release' tab
  #     4-3. After the build succeeds, switch to 'Production'
  #     4-4. Switch Launch permission to 'Public' if required
  #   5. Change the 'project' field in this file to point to your
  #      repository and branch
  project: "GITHUB/open-datastudio/ray-cluster:master"

  # The 'spec.containers.image' field for ray-node and ray-worker will be
  # overridden by the image built from the 'project' field above.
  # Set this value to 'false' to not override the image.
  image_from_project: true

  # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
  # The 'project' field above provides a docker image for each python version.
  # Fork 'project' if you'd like to support other python versions.
  # Kept quoted so that a two-component version (e.g. "3.10") is never
  # parsed as a float.
  python_version: "3.7.7"

  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true

# Kubernetes pod config for the head node pod.
head_node:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-head-

    # Must match the head node service selector if a head node service is
    # required. The 'notebook' Service applied in head_setup_commands
    # selects on 'component: ray-head'.
    labels:
      component: ray-head

      # https://docs.staroid.com/ske/pod.html
      pod.staroid.com/spot: "false"  # use on-demand instance for head.

      # Place the ray head on a dedicated Kubernetes node.
      # In dedicated mode, resource requests and limits in the pod spec will
      # be automatically overridden based on 'pod.staroid.com/instance-type'
      # below.
      pod.staroid.com/isolation: dedicated  # 'sandboxed' or 'dedicated'

      # Instance type to use in 'dedicated' mode, such as 'standard-4',
      # 'gpu-1'.
      # See available instance types at https://docs.staroid.com/ske/pod.html.
      pod.staroid.com/instance-type: standard-4
  spec:
    automountServiceAccountToken: true

    # Restarting the head node automatically is not currently supported.
    # If the head node goes down, `ray up` must be run again.
    restartPolicy: Never

    # The dshm volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which causes slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: tmp-volume
        emptyDir: {}
      # The nfs volume provides a shared volume across all ray-nodes.
      - name: nfs-volume
        persistentVolumeClaim:
          claimName: nfs

    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        #   - screen (used for `ray attach`)
        #   - kubectl (used by the autoscaler to manage worker pods)
        # The image will be overridden when 'image_from_project' is true.
        image: rayproject/autoscaler
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 6379  # Redis port.
          - containerPort: 6380  # Redis port.
          - containerPort: 6381  # Redis port.
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.

        # /dev/shm is backed by the in-memory dshm volume declared above so
        # that Ray's plasma object store does not fall back to /tmp.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /tmp
            name: tmp-volume
          - mountPath: /nfs
            name: nfs-volume
        resources:
          requests:
            cpu: 4000m
            memory: 8Gi
          limits:
            cpu: 4000m
            # The maximum memory that this pod is allowed to use. The
            # limit will be detected by ray and split to use 10% for
            # redis, 30% for the shared memory object store, and the
            # rest for application memory. If this limit is not set and
            # the object store size is not set manually, ray will
            # allocate a very large object store in each pod that may
            # cause problems for other pods.
            memory: 8Gi
        env:
          # This is used in the head_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          # Despite the name, the value is taken from the CPU limit.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: limits.cpu
          - name: RAY_ADDRESS
            value: "auto"

# Kubernetes pod config for worker node pods.
worker_nodes:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-worker-

    # Must match the worker node service selector if a worker node
    # service is required.
    labels:
      component: ray-worker

      # https://docs.staroid.com/ske/pod.html
      pod.staroid.com/spot: "true"

      # Place ray workers on dedicated Kubernetes nodes.
      # In dedicated mode, resource requests and limits in the pod spec will
      # be automatically overridden based on 'pod.staroid.com/instance-type'
      # below.
      pod.staroid.com/isolation: dedicated  # 'sandboxed' or 'dedicated'

      # Instance type to use in 'dedicated' mode, such as 'standard-4',
      # 'gpu-1'.
      # See available instance types at https://docs.staroid.com/ske/pod.html.
      pod.staroid.com/instance-type: standard-4
  spec:
    serviceAccountName: default

    # Worker nodes will be managed automatically by the head node, so
    # do not change the restart policy.
    restartPolicy: Never

    # The dshm volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which causes slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: tmp-volume
        emptyDir: {}
      - name: nfs-volume
        persistentVolumeClaim:
          claimName: nfs
    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        image: rayproject/autoscaler
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.

        # /dev/shm is backed by the in-memory dshm volume declared above so
        # that Ray's plasma object store does not fall back to /tmp.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /tmp
            name: tmp-volume
          - mountPath: /nfs
            name: nfs-volume
        resources:
          requests:
            cpu: 4000m
            memory: 8Gi
          limits:
            cpu: 4000m
            # This memory limit will be detected by ray and split into
            # 30% for plasma, and 70% for workers.
            memory: 8Gi
        env:
          # This is used in the worker_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          # Despite the name, the value is taken from the CPU limit.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: limits.cpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
#   "/path1/on/remote/machine": "/path1/on/local/machine",
#   "/path2/on/remote/machine": "/path2/on/local/machine",
file_mounts: {}

# Files or directories to copy from the head node to the worker nodes. The
# format is a list of paths. The same path on the head node will be copied to
# the worker node. This behavior is a subset of the file_mounts behavior. In
# the vast majority of cases you should just use file_mounts. Only use this if
# you know what you're doing!
cluster_synced_files: []

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is set up.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
  # Install the staroid and kubernetes packages. The Staroid node provider,
  # which the autoscaler will use, depends on them.
  - pip install -q staroid kubernetes
  # Install JupyterLab.
  - pip install -q jupyterlab
  # Link the shared NFS mount into the home directory. -f and -n keep the
  # command re-runnable: with a plain `ln -s`, a later run would follow the
  # existing link and create /nfs/nfs, and the run after that would fail
  # with "File exists".
  - ln -sfn /nfs /home/ray/nfs
  # Start JupyterLab in the background, serving /home/ray.
  # NOTE(review): it listens on all interfaces with token and password
  # authentication disabled and any origin allowed - confirm that access is
  # restricted by the platform before exposing it.
  - bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' &
  # Show a 'notebook' link in the Staroid management console to access the
  # Jupyter notebook. The manifest is passed through `echo -e`, so the
  # spaces after each \n are the manifest's YAML indentation and must be
  # kept.
  - 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n  name: notebook\n  annotations:\n    service.staroid.com/link: show\nspec:\n  ports:\n  - name: http\n    port: 8888\n  selector:\n    component: ray-head" | kubectl apply -f -'

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
# MY_CPU_REQUEST is the environment variable defined in the head_node pod spec.
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
# MY_CPU_REQUEST is the environment variable defined in the worker_nodes pod
# spec; workers connect to the head on port 6379.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076