mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:31:08 +08:00
[autoscaler] Service and Ingress per worker pod (#9359)
This commit is contained in:
@@ -4,6 +4,7 @@ from kubernetes.config.config_exception import ConfigException
|
||||
_configured = False
|
||||
_core_api = None
|
||||
_auth_api = None
|
||||
_extensions_beta_api = None
|
||||
|
||||
|
||||
def _load_config():
|
||||
@@ -35,4 +36,13 @@ def auth_api():
|
||||
return _auth_api
|
||||
|
||||
|
||||
def extensions_beta_api():
|
||||
global _extensions_beta_api
|
||||
if _extensions_beta_api is None:
|
||||
_load_config()
|
||||
_extensions_beta_api = kubernetes.client.ExtensionsV1beta1Api()
|
||||
|
||||
return _extensions_beta_api
|
||||
|
||||
|
||||
log_prefix = "KubernetesNodeProvider: "
|
||||
|
||||
@@ -0,0 +1,327 @@
|
||||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
# subsequent `ray up`) this number of nodes will be started.
|
||||
initial_workers: 1
|
||||
|
||||
# Whether or not to autoscale aggressively. If this is enabled, if at any point
|
||||
# we would start more workers, we start at least enough to bring us to
|
||||
# initial_workers.
|
||||
autoscaling_mode: default
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 1
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
# exist, the autoscaler will attempt to create them. If this fails, you may
|
||||
# not have the required permissions and will have to request them to be
|
||||
# created by your cluster administrator.
|
||||
provider:
|
||||
type: kubernetes
|
||||
|
||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
|
||||
# Namespace to use for all resources created.
|
||||
namespace: ray
|
||||
|
||||
# ServiceAccount created by the autoscaler for the head node pod that it
|
||||
# runs in. If this field isn't provided, the head pod config below must
|
||||
# contain a user-created service account with the proper permissions.
|
||||
autoscaler_service_account:
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: autoscaler
|
||||
|
||||
# Role created by the autoscaler for the head node pod that it runs in.
|
||||
# If this field isn't provided, the role referenced in
|
||||
# autoscaler_role_binding must exist and have at least these permissions.
|
||||
autoscaler_role:
|
||||
kind: Role
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: autoscaler
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "pods/status", "pods/exec", "services"]
|
||||
verbs: ["get", "watch", "list", "create", "delete", "patch"]
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["ingresses"]
|
||||
verbs: ["get", "watch", "list", "create", "delete", "patch"]
|
||||
|
||||
# RoleBinding created by the autoscaler for the head node pod that it runs
|
||||
# in. If this field isn't provided, the head pod config below must contain
|
||||
# a user-created service account with the proper permissions.
|
||||
autoscaler_role_binding:
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: autoscaler
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: autoscaler
|
||||
roleRef:
|
||||
kind: Role
|
||||
name: autoscaler
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
|
||||
services:
|
||||
# Service that maps to the head node of the Ray cluster.
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
# NOTE: If you're running multiple Ray clusters with services
|
||||
# on one Kubernetes cluster, they must have unique service
|
||||
# names.
|
||||
name: ray-head
|
||||
spec:
|
||||
# This selector must match the head node pod's selector below.
|
||||
selector:
|
||||
component: ray-head
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
# Service that maps to the worker nodes of the Ray cluster.
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
# NOTE: If you're running multiple Ray clusters with services
|
||||
# on one Kubernetes cluster, they must have unique service
|
||||
# names.
|
||||
name: ray-workers
|
||||
spec:
|
||||
# This selector must match the worker node pods' selector below.
|
||||
selector:
|
||||
component: ray-worker
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
# Kubernetes pod config for the head node pod.
|
||||
head_node:
|
||||
pod:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
spec:
|
||||
# Change this if you altered the autoscaler_service_account above
|
||||
# or want to provide your own.
|
||||
serviceAccountName: autoscaler
|
||||
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: IfNotPresent
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
|
||||
# Kubernetes pod config for worker node pods.
|
||||
worker_nodes:
|
||||
pod:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: IfNotPresent
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
service:
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
# The service name gets automatically set by the
|
||||
# autoscaler and gets the same name as the pod.
|
||||
spec:
|
||||
# The right selector is automatically applied by autoscaler
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
ingress:
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx
|
||||
spec:
|
||||
rules:
|
||||
- host: localhost
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
backend:
|
||||
# The value of the serviceName must be set to `${RAY_POD_NAME} and will be
|
||||
# automatically replaced by the name of the pod.
|
||||
serviceName: ${RAY_POD_NAME}
|
||||
servicePort: 8000
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --webui-host 0.0.0.0
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
@@ -1,7 +1,9 @@
|
||||
import logging
|
||||
from uuid import uuid4
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
from ray.autoscaler.command_runner import KubernetesCommandRunner
|
||||
from ray.autoscaler.kubernetes import core_api, log_prefix
|
||||
from ray.autoscaler.kubernetes import core_api, log_prefix, extensions_beta_api
|
||||
from ray.autoscaler.node_provider import NodeProvider
|
||||
from ray.autoscaler.kubernetes.config import bootstrap_kubernetes
|
||||
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
||||
@@ -69,8 +71,13 @@ class KubernetesNodeProvider(NodeProvider):
|
||||
core_api().patch_namespaced_pod(node_id, self.namespace, pod)
|
||||
|
||||
def create_node(self, node_config, tags, count):
|
||||
pod_spec = node_config.copy()
|
||||
conf = node_config.copy()
|
||||
pod_spec = conf.get("pod", conf)
|
||||
service_spec = conf.get("service")
|
||||
ingress_spec = conf.get("ingress")
|
||||
node_uuid = str(uuid4())
|
||||
tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
|
||||
tags["ray-node-uuid"] = node_uuid
|
||||
pod_spec["metadata"]["namespace"] = self.namespace
|
||||
if "labels" in pod_spec["metadata"]:
|
||||
pod_spec["metadata"]["labels"].update(tags)
|
||||
@@ -78,11 +85,52 @@ class KubernetesNodeProvider(NodeProvider):
|
||||
pod_spec["metadata"]["labels"] = tags
|
||||
logger.info(log_prefix + "calling create_namespaced_pod "
|
||||
"(count={}).".format(count))
|
||||
new_nodes = []
|
||||
for _ in range(count):
|
||||
core_api().create_namespaced_pod(self.namespace, pod_spec)
|
||||
pod = core_api().create_namespaced_pod(self.namespace, pod_spec)
|
||||
new_nodes.append(pod)
|
||||
|
||||
new_svcs = []
|
||||
if service_spec is not None:
|
||||
logger.info(log_prefix + "calling create_namespaced_service "
|
||||
"(count={}).".format(count))
|
||||
|
||||
for new_node in new_nodes:
|
||||
|
||||
metadata = service_spec.get("metadata", {})
|
||||
metadata["name"] = new_node.metadata.name
|
||||
service_spec["metadata"] = metadata
|
||||
service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid}
|
||||
svc = core_api().create_namespaced_service(
|
||||
self.namespace, service_spec)
|
||||
new_svcs.append(svc)
|
||||
|
||||
if ingress_spec is not None:
|
||||
logger.info(log_prefix + "calling create_namespaced_ingress "
|
||||
"(count={}).".format(count))
|
||||
for new_svc in new_svcs:
|
||||
metadata = ingress_spec.get("metadata", {})
|
||||
metadata["name"] = new_svc.metadata.name
|
||||
ingress_spec["metadata"] = metadata
|
||||
ingress_spec = _add_service_name_to_service_port(
|
||||
ingress_spec, new_svc.metadata.name)
|
||||
extensions_beta_api().create_namespaced_ingress(
|
||||
self.namespace, ingress_spec)
|
||||
|
||||
def terminate_node(self, node_id):
|
||||
logger.info(log_prefix + "calling delete_namespaced_pod")
|
||||
core_api().delete_namespaced_pod(node_id, self.namespace)
|
||||
try:
|
||||
core_api().delete_namespaced_service(node_id, self.namespace)
|
||||
except ApiException:
|
||||
pass
|
||||
try:
|
||||
extensions_beta_api().delete_namespaced_ingress(
|
||||
node_id,
|
||||
self.namespace,
|
||||
)
|
||||
except ApiException:
|
||||
pass
|
||||
|
||||
def terminate_nodes(self, node_ids):
|
||||
for node_id in node_ids:
|
||||
@@ -102,3 +150,31 @@ class KubernetesNodeProvider(NodeProvider):
|
||||
@staticmethod
|
||||
def bootstrap_config(cluster_config):
|
||||
return bootstrap_kubernetes(cluster_config)
|
||||
|
||||
|
||||
def _add_service_name_to_service_port(spec, svc_name):
|
||||
"""Goes recursively through the ingress manifest and adds the
|
||||
right serviceName next to every servicePort definition.
|
||||
"""
|
||||
if isinstance(spec, dict):
|
||||
dict_keys = list(spec.keys())
|
||||
for k in dict_keys:
|
||||
spec[k] = _add_service_name_to_service_port(spec[k], svc_name)
|
||||
|
||||
# The magic string ${RAY_POD_NAME} is replaced with
|
||||
# the true service name, which is equal to the worker pod name.
|
||||
if k == "serviceName":
|
||||
if spec[k] != "${RAY_POD_NAME}":
|
||||
raise ValueError(
|
||||
"The value of serviceName must be set to "
|
||||
"${RAY_POD_NAME}. It is automatically replaced "
|
||||
"when using the autoscaler.")
|
||||
else:
|
||||
spec["serviceName"] = svc_name
|
||||
|
||||
elif isinstance(spec, list):
|
||||
spec = [
|
||||
_add_service_name_to_service_port(item, svc_name) for item in spec
|
||||
]
|
||||
|
||||
return spec
|
||||
|
||||
Reference in New Issue
Block a user