mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:38:19 +08:00
[Autoscaler] Staroid node provider followup improvements (#11408)
This commit is contained in:
@@ -226,41 +226,53 @@ class StaroidNodeProvider(NodeProvider):
|
||||
kube_client = self.__cached[self.cluster_name]["kube_client"]
|
||||
core_api = client.CoreV1Api(kube_client)
|
||||
|
||||
pod = core_api.read_namespaced_pod(node_id, self.namespace)
|
||||
pod.metadata.labels.update(tags)
|
||||
core_api.patch_namespaced_pod(node_id, self.namespace, pod)
|
||||
max_retry = 10
|
||||
for i in range(max_retry):
|
||||
try:
|
||||
pod = core_api.read_namespaced_pod(node_id, self.namespace)
|
||||
pod.metadata.labels.update(tags)
|
||||
core_api.patch_namespaced_pod(node_id, self.namespace, pod)
|
||||
except ApiException as e:
|
||||
if e.status == 409 and max_retry - 1 > i:
|
||||
# conflict. pod modified before apply patch. retry
|
||||
time.sleep(0.2)
|
||||
continue
|
||||
|
||||
raise e
|
||||
|
||||
def create_node(self, node_config, tags, count):
|
||||
instance_name = self.cluster_name
|
||||
|
||||
# get or create ske
|
||||
cluster_api = self.__star.cluster()
|
||||
ske = cluster_api.create(self.__ske, self.__ske_region)
|
||||
if ske is None:
|
||||
raise Exception("Failed to create an SKE '{}' in '{}' region"
|
||||
.format(self.__ske, self.__ske_region))
|
||||
incluster = self._connect_kubeapi(instance_name)
|
||||
if incluster is None:
|
||||
# get or create ske
|
||||
cluster_api = self.__star.cluster()
|
||||
ske = cluster_api.create(self.__ske, self.__ske_region)
|
||||
if ske is None:
|
||||
raise Exception("Failed to create an SKE '{}' in '{}' region"
|
||||
.format(self.__ske, self.__ske_region))
|
||||
|
||||
# create a namespace
|
||||
ns_api = self.__star.namespace(ske)
|
||||
ns = ns_api.create(
|
||||
instance_name,
|
||||
self.provider_config["project"],
|
||||
# create a namespace
|
||||
ns_api = self.__star.namespace(ske)
|
||||
ns = ns_api.create(
|
||||
instance_name,
|
||||
self.provider_config["project"],
|
||||
|
||||
# Configure 'start-head' param to 'false'.
|
||||
# head node will be created using Kubernetes api.
|
||||
params=[{
|
||||
"group": "Misc",
|
||||
"name": "start-head",
|
||||
"value": "false"
|
||||
}])
|
||||
if ns is None:
|
||||
raise Exception("Failed to create a cluster '{}' in SKE '{}'"
|
||||
.format(instance_name, self.__ske))
|
||||
# Configure 'start-head' param to 'false'.
|
||||
# head node will be created using Kubernetes api.
|
||||
params=[{
|
||||
"group": "Misc",
|
||||
"name": "start-head",
|
||||
"value": "false"
|
||||
}])
|
||||
if ns is None:
|
||||
raise Exception("Failed to create a cluster '{}' in SKE '{}'"
|
||||
.format(instance_name, self.__ske))
|
||||
|
||||
# 'ray down' will change staroid namespace status to "PAUSE"
|
||||
# in this case we need to start namespace again.
|
||||
if ns.status() == "PAUSE":
|
||||
ns = ns_api.start(instance_name)
|
||||
# 'ray down' will change staroid namespace status to "PAUSE"
|
||||
# in this case we need to start namespace again.
|
||||
if ns.status() == "PAUSE":
|
||||
ns = ns_api.start(instance_name)
|
||||
|
||||
# kube client
|
||||
kube_client = self._connect_kubeapi(instance_name)
|
||||
@@ -293,6 +305,14 @@ class StaroidNodeProvider(NodeProvider):
|
||||
else:
|
||||
pod_spec["metadata"]["labels"] = tags
|
||||
|
||||
if "generateName" not in pod_spec["metadata"]:
|
||||
pod_spec["metadata"]["generateName"] = \
|
||||
"ray-" + pod_spec["metadata"]["labels"]["ray-node-type"] + "-"
|
||||
|
||||
if "component" not in pod_spec["metadata"]["labels"]:
|
||||
pod_spec["metadata"]["labels"]["component"] = \
|
||||
"ray-" + pod_spec["metadata"]["labels"]["ray-node-type"]
|
||||
|
||||
if image is not None:
|
||||
containers = pod_spec["spec"]["containers"]
|
||||
for c in containers:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
# A namespace will be automatically created for each cluster_name in SKE.
|
||||
cluster_name: default
|
||||
cluster_name: default # name with 'a-z' and '-'
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
@@ -8,7 +8,7 @@ min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
max_workers: 5
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
@@ -71,8 +71,8 @@ provider:
|
||||
# - Kubernetes resources to create (like Persistent volume claim)
|
||||
# on namespace creation
|
||||
# You can fork when you need to customize.
|
||||
# 1. Fork github.com/open-datastudio/ray
|
||||
# 2. Change .staroid/ directory to cutomize
|
||||
# 1. Fork github.com/open-datastudio/ray-cluster
|
||||
# 2. Change contents
|
||||
# 3. Connect forked repository (https://staroid.com/projects/settings)
|
||||
# 4. Release your customized branch
|
||||
# 4-1. Select project from 'My projects' menu
|
||||
@@ -81,7 +81,7 @@ provider:
|
||||
# 4-4. Switch Launch permission to 'Public' if required
|
||||
# 5. Change 'project' field to point your
|
||||
# repository and branch in this file
|
||||
project: "GITHUB/open-datastudio/ray:master-staroid"
|
||||
project: "GITHUB/open-datastudio/ray-cluster:master"
|
||||
|
||||
# 'spec.containers.image' field for ray-node and ray-worker will be
|
||||
# overridden by the image built from the 'project' field above.
|
||||
@@ -109,13 +109,17 @@ head_node:
|
||||
labels:
|
||||
component: ray-head
|
||||
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
# https://docs.staroid.com/ske/pod.html
|
||||
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
||||
|
||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
# Locate ray head to dedicated Kubernetes node
|
||||
# In dedicated mode, resource requests and limits in the pod spec will be
|
||||
# automatically overridden based on 'pod.staroid.com/instance-type' below.
|
||||
pod.staroid.com/isolation: dedicated # 'sandboxed' or 'dedicated'
|
||||
|
||||
# Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
|
||||
# See available instance type from https://docs.staroid.com/ske/pod.html.
|
||||
pod.staroid.com/instance-type: standard-4
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
|
||||
@@ -130,10 +134,12 @@ head_node:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: tmp-volume
|
||||
emptyDir: {}
|
||||
# nfs volume provides a shared volume across all ray-nodes.
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
claimName: nfs
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
@@ -162,13 +168,16 @@ head_node:
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /tmp
|
||||
name: tmp-volume
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
cpu: 4000m
|
||||
memory: 8Gi
|
||||
limits:
|
||||
cpu: 4000m
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
@@ -176,7 +185,7 @@ head_node:
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
memory: 8Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
@@ -184,7 +193,7 @@ head_node:
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
resource: limits.cpu
|
||||
- name: RAY_ADDRESS
|
||||
value: "auto"
|
||||
|
||||
@@ -201,13 +210,17 @@ worker_nodes:
|
||||
labels:
|
||||
component: ray-worker
|
||||
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "true" # use spot instance for workers.
|
||||
# https://docs.staroid.com/ske/pod.html
|
||||
pod.staroid.com/spot: "true"
|
||||
|
||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
# Locate ray worker to dedicated Kubernetes node
|
||||
# In dedicated mode, resource requests and limits in the pod spec will be
|
||||
# automatically overridden based on 'pod.staroid.com/instance-type' below.
|
||||
pod.staroid.com/isolation: dedicated # 'sandboxed' or 'dedicated'
|
||||
|
||||
# Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
|
||||
# See available instance type from https://docs.staroid.com/ske/pod.html.
|
||||
pod.staroid.com/instance-type: standard-4
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -222,9 +235,11 @@ worker_nodes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: tmp-volume
|
||||
emptyDir: {}
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
claimName: nfs
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
@@ -246,16 +261,19 @@ worker_nodes:
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /tmp
|
||||
name: tmp-volume
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
cpu: 4000m
|
||||
memory: 8Gi
|
||||
limits:
|
||||
cpu: 4000m
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
memory: 8Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
@@ -263,7 +281,7 @@ worker_nodes:
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
resource: limits.cpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
|
||||
@@ -0,0 +1,292 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
# A namespace will be automatically created for each cluster_name in SKE.
|
||||
cluster_name: default # name with 'a-z' and '-'
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 5
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
# subsequent `ray up`) this number of nodes will be started.
|
||||
initial_workers: 0
|
||||
|
||||
# Whether or not to autoscale aggressively. If this is enabled, if at any point
|
||||
# we would start more workers, we start at least enough to bring us to
|
||||
# initial_workers.
|
||||
autoscaling_mode: default
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
# exist, the autoscaler will attempt to create them. If this fails, you may
|
||||
# not have the required permissions and will have to request them to be
|
||||
# created by your cluster administrator.
|
||||
provider:
|
||||
type: staroid
|
||||
|
||||
# Access token for Staroid from https://staroid.com/settings/accesstokens.
|
||||
# Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
|
||||
# https://github.com/staroids/staroid-python#configuration
|
||||
# for more information.
|
||||
access_token:
|
||||
|
||||
# Staroid account to use. e.g. GITHUB/staroids
|
||||
# Alternatively, you can set STAROID_ACCOUNT environment variable.
|
||||
# Leave empty to select default account for given access token.
|
||||
# https://github.com/staroids/staroid-python#configuration
|
||||
# for more information.
|
||||
account:
|
||||
|
||||
# Name of a Staroid Kubernetes Engine (SKE) instance.
|
||||
# Alternatively, you can set STAROID_SKE environment variable.
|
||||
# An SKE is a virtualized Kubernetes cluster.
|
||||
# A new SKE will be created if it does not exist.
|
||||
ske: "Ray cluster"
|
||||
|
||||
# Cloud and region in which to create the SKE when it does not exist.
|
||||
# If SKE already exists, this value will be ignored.
|
||||
# Supported cloud regions can be found at
|
||||
# https://docs.staroid.com/ske/cloudregion.html.
|
||||
ske_region: "aws us-west2"
|
||||
|
||||
# To create a namespace in SKE, you need to specify a Github project.
|
||||
# The Github project needs to have a staroid.yaml
|
||||
# (https://docs.staroid.com/references/staroid_yaml.html).
|
||||
# staroid.yaml defines various resources for the project, such as
|
||||
# - Building container images can be accessed from the namespace
|
||||
# - Kubernetes resources to create (like Persistent volume claim)
|
||||
# on namespace creation
|
||||
# You can fork when you need to customize.
|
||||
# 1. Fork github.com/open-datastudio/ray-cluster
|
||||
# 2. Change contents
|
||||
# 3. Connect forked repository (https://staroid.com/projects/settings)
|
||||
# 4. Release your customized branch
|
||||
# 4-1. Select project from 'My projects' menu
|
||||
# 4-2. Select your branch in 'Release' tab
|
||||
# 4-3. After build success, switch to 'Production'
|
||||
# 4-4. Switch Launch permission to 'Public' if required
|
||||
# 5. Change 'project' field to point your
|
||||
# repository and branch in this file
|
||||
project: "GITHUB/open-datastudio/ray-cluster:master"
|
||||
|
||||
# 'spec.containers.image' field for ray-node and ray-worker will be
|
||||
# overridden by the image built from the 'project' field above.
|
||||
# Set this value to 'false' to not override the image.
|
||||
image_from_project: true
|
||||
|
||||
# Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
|
||||
# 'project' field above provides docker image for each python version.
|
||||
# Fork 'project' if you'd like to support other python versions.
|
||||
python_version: 3.7.7
|
||||
|
||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
|
||||
# Kubernetes pod config for the head node pod.
|
||||
head_node:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
|
||||
# Locate this Pod to spot instance or not.
|
||||
# https://docs.staroid.com/ske/pod.html
|
||||
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
||||
|
||||
# Locate ray head to dedicated Kubernetes node or not.
|
||||
# 'sandboxed' (default) or 'dedicated'.
|
||||
pod.staroid.com/isolation: dedicated
|
||||
|
||||
# Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
|
||||
# See available instance type from https://docs.staroid.com/ske/pod.html.
|
||||
pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: tmp-volume
|
||||
emptyDir: {}
|
||||
# nfs volume provides a shared volume across all ray-nodes.
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
# Image will be overridden when 'image_from_project' is true.
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /tmp
|
||||
name: tmp-volume
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
# When 'pod.staroid.com/isolation' is 'dedicated',
|
||||
# cpu and memory requests/limits in resources field will be
|
||||
# automatically configured based on
|
||||
# 'pod.staroid.com/instance-type'
|
||||
requests:
|
||||
cpu: 4000m
|
||||
memory: 8Gi
|
||||
limits:
|
||||
cpu: 4000m
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 8Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: limits.cpu
|
||||
- name: RAY_ADDRESS
|
||||
value: "auto"
|
||||
|
||||
# Kubernetes pod config for worker node pods.
|
||||
worker_nodes:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
|
||||
# Locate this Pod to spot instance or not.
|
||||
# https://docs.staroid.com/ske/pod.html
|
||||
pod.staroid.com/spot: "true" # use spot instance for workers.
|
||||
|
||||
# Locate ray worker to dedicated Kubernetes node or not.
|
||||
# 'sandboxed' (default) or 'dedicated'.
|
||||
pod.staroid.com/isolation: dedicated
|
||||
|
||||
# Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
|
||||
# See available instance type from https://docs.staroid.com/ske/pod.html.
|
||||
pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: tmp-volume
|
||||
emptyDir: {}
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /tmp
|
||||
name: tmp-volume
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
# When 'pod.staroid.com/isolation' is 'dedicated',
|
||||
# cpu and memory requests/limits in resources field will be
|
||||
# automatically configured based on
|
||||
# 'pod.staroid.com/instance-type'
|
||||
requests:
|
||||
cpu: 4000m
|
||||
memory: 8Gi
|
||||
limits:
|
||||
cpu: 4000m
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 8Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: limits.cpu
|
||||
@@ -1,9 +1,9 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal
|
||||
cluster_name: minimal # name with 'a-z' and '-'
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers default to 0.
|
||||
max_workers: 1
|
||||
max_workers: 5
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
@@ -46,8 +46,8 @@ provider:
|
||||
# - Kubernetes resources to create (like Persistent volume claim)
|
||||
# on namespace creation
|
||||
# You can fork when you need to customize.
|
||||
# 1. Fork github.com/open-datastudio/ray
|
||||
# 2. Change .staroid/ directory to cutomize
|
||||
# 1. Fork github.com/open-datastudio/ray-cluster
|
||||
# 2. Change contents
|
||||
# 3. Connect forked repository (https://staroid.com/projects/settings)
|
||||
# 4. Release your customized branch
|
||||
# 4-1. Select project from 'My projects' menu
|
||||
@@ -56,7 +56,7 @@ provider:
|
||||
# 4-4. Switch Launch permission to 'Public' if required
|
||||
# 5. Change 'project' field to point your
|
||||
# repository and branch in this file
|
||||
project: "GITHUB/open-datastudio/ray:master-staroid"
|
||||
project: "GITHUB/open-datastudio/ray-cluster:master"
|
||||
|
||||
# 'spec.containers.image' field for ray-node and ray-worker will be
|
||||
# overridden by the image built from the 'project' field above.
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
# An example of configuring a mixed-node-type cluster.
|
||||
cluster_name: multi-node-type # name with 'a-z' and '-'
|
||||
min_workers: 1
|
||||
max_workers: 40
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: staroid
|
||||
access_token:
|
||||
account:
|
||||
ske: "Ray cluster"
|
||||
ske_region: "aws us-west2"
|
||||
project: "GITHUB/open-datastudio/ray-cluster:master"
|
||||
image_from_project: true
|
||||
python_version: 3.7.7
|
||||
use_internal_ips: true
|
||||
|
||||
# Tell the autoscaler the allowed node types and the resources they provide.
|
||||
# The key is the name of the node type, which is just for debugging purposes.
|
||||
# The node config specifies the launch config and physical instance type.
|
||||
available_node_types:
|
||||
cpu_2_ondemand:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "false"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: standard-2
|
||||
resources: {"CPU": 2}
|
||||
max_workers: 10
|
||||
cpu_4_ondemand:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "false"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: standard-4
|
||||
resources: {"CPU": 4}
|
||||
max_workers: 10
|
||||
cpu_8_ondemand:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "false"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: standard-8
|
||||
resources: {"CPU": 8}
|
||||
max_workers: 10
|
||||
gpu_1_ondemand:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "false"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: gpu-1
|
||||
resources: {"CPU": 8, "GPU": 1, "accelerator_type:V100": 1}
|
||||
max_workers: 10
|
||||
cpu_2_spot:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "true"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: standard-2
|
||||
resources: {"CPU": 2}
|
||||
max_workers: 10
|
||||
cpu_4_spot:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "true"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: standard-4
|
||||
resources: {"CPU": 4}
|
||||
max_workers: 10
|
||||
cpu_8_spot:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "true"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: standard-8
|
||||
resources: {"CPU": 8}
|
||||
max_workers: 10
|
||||
# worker_setup_commands:
|
||||
# - pip install tensorflow-gpu # Example command.
|
||||
gpu_1_spot:
|
||||
node_config:
|
||||
metadata:
|
||||
labels:
|
||||
pod.staroid.com/spot: "true"
|
||||
pod.staroid.com/isolation: dedicated
|
||||
pod.staroid.com/instance-type: gpu-1
|
||||
resources: {"CPU": 8, "GPU": 1, "accelerator_type:V100": 1}
|
||||
max_workers: 10
|
||||
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: cpu_4_ondemand
|
||||
|
||||
# Specify the default type of the worker node (as configured above).
|
||||
worker_default_node_type: cpu_4_spot
|
||||
|
||||
# The default settings for the head node. This will be merged with the per-node
|
||||
# type configs given above.
|
||||
#head_node:
|
||||
|
||||
# The default settings for worker nodes. This will be merged with the per-node
|
||||
# type configs given above.
|
||||
#worker_nodes:
|
||||
|
||||
# Configure the cluster for very conservative auto-scaling otherwise.
|
||||
target_utilization_fraction: 0.9
|
||||
idle_timeout_minutes: 5
|
||||
Reference in New Issue
Block a user