mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
[autoscaler/k8s] [CI] Kubernetes test ray up, exec, down (#12514)
This commit is contained in:
@@ -108,6 +108,16 @@ py_test_module_list(
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
# Kubernetes cluster launcher test (ray up / exec / rsync / down). It carries
# the "kubernetes" tag so it can be selected or excluded by tag.
py_test_module_list(
    files = [
        "test_k8s_cluster_launcher.py",
    ],
    size = "small",
    extra_srcs = SRCS,
    deps = ["//:ray_lib"],
    tags = ["kubernetes"],
)
|
||||
|
||||
py_test_module_list(
|
||||
files = [
|
||||
"test_failure.py",
|
||||
|
||||
@@ -0,0 +1,300 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: test
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 1
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscaling_speed: 1.0
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
# exist, the autoscaler will attempt to create them. If this fails, you may
|
||||
# not have the required permissions and will have to request them to be
|
||||
# created by your cluster administrator.
|
||||
provider:
|
||||
type: kubernetes
|
||||
|
||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
|
||||
# Namespace to use for all resources created.
|
||||
namespace: ray-cluster-launcher-unit-test
|
||||
|
||||
# ServiceAccount created by the autoscaler for the head node pod that it
|
||||
# runs in. If this field isn't provided, the head pod config below must
|
||||
# contain a user-created service account with the proper permissions.
|
||||
autoscaler_service_account:
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: autoscaler
|
||||
|
||||
# Role created by the autoscaler for the head node pod that it runs in.
|
||||
# If this field isn't provided, the role referenced in
|
||||
# autoscaler_role_binding must exist and have at least these permissions.
|
||||
autoscaler_role:
|
||||
kind: Role
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: autoscaler
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "pods/status", "pods/exec"]
|
||||
verbs: ["get", "watch", "list", "create", "delete", "patch"]
|
||||
|
||||
# RoleBinding created by the autoscaler for the head node pod that it runs
|
||||
# in. If this field isn't provided, the head pod config below must contain
|
||||
# a user-created service account with the proper permissions.
|
||||
autoscaler_role_binding:
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: autoscaler
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: autoscaler
|
||||
roleRef:
|
||||
kind: Role
|
||||
name: autoscaler
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
|
||||
services:
|
||||
# Service that maps to the head node of the Ray cluster.
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
# NOTE: If you're running multiple Ray clusters with services
|
||||
# on one Kubernetes cluster, they must have unique service
|
||||
# names.
|
||||
name: ray-head
|
||||
spec:
|
||||
# This selector must match the head node pod's selector below.
|
||||
selector:
|
||||
component: ray-head
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
# Service that maps to the worker nodes of the Ray cluster.
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
# NOTE: If you're running multiple Ray clusters with services
|
||||
# on one Kubernetes cluster, they must have unique service
|
||||
# names.
|
||||
name: ray-workers
|
||||
spec:
|
||||
# This selector must match the worker node pods' selector below.
|
||||
selector:
|
||||
component: ray-worker
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
# Kubernetes pod config for the head node pod.
|
||||
head_node:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
spec:
|
||||
# Change this if you altered the autoscaler_service_account above
|
||||
# or want to provide your own.
|
||||
serviceAccountName: autoscaler
|
||||
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: IfNotPresent
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
image: PLACEHOLDER
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
# Kubernetes pod config for worker node pods.
|
||||
worker_nodes:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: IfNotPresent
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: PLACEHOLDER
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the worker_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down.
|
||||
# This is not supported on kubernetes.
|
||||
# rsync_exclude: []
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
# This is not supported on kubernetes.
|
||||
# rsync_filter: []
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is set up.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
@@ -0,0 +1,110 @@
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import kubernetes
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from ray.autoscaler._private.kubernetes.node_provider import \
|
||||
KubernetesNodeProvider
|
||||
from ray.autoscaler import sdk
|
||||
|
||||
# Name of the environment variable that overrides the container image used
# for the pods launched by this test.
IMAGE_ENV = "KUBERNETES_CLUSTER_LAUNCHER_TEST_IMAGE"


def fill_image_field(pod_config):
    """Set the image of the first container of ``pod_config`` in place.

    The image name is read from the environment variable named by
    ``IMAGE_ENV``; when that variable is unset, ``rayproject/ray:nightly``
    is used instead.

    Args:
        pod_config: Pod config dict holding a ``spec.containers`` list.
            Only the first container entry is modified.
    """
    image = os.getenv(IMAGE_ENV, "rayproject/ray:nightly")
    pod_config["spec"]["containers"][0]["image"] = image
|
||||
|
||||
|
||||
def fill_image_fields(cluster_config):
    """Fill in the container image of the worker and head pod configs.

    Args:
        cluster_config: Parsed cluster config dict containing the
            ``worker_nodes`` and ``head_node`` pod configs. Both are
            modified in place.
    """
    for key in ("worker_nodes", "head_node"):
        fill_image_field(cluster_config[key])
|
||||
|
||||
|
||||
def get_config():
    """Load the test cluster config and fill in its container images.

    The YAML file is located relative to the directory of this module.

    Returns:
        The parsed cluster config dict, with the image fields of the head
        and worker pod configs filled in.
    """
    here = os.path.realpath(__file__)
    parent = os.path.dirname(here)
    relative_path = "test_cli_patterns/test_k8s_cluster_launcher.yaml"
    config_path = os.path.join(parent, relative_path)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    fill_image_fields(config)
    return config
|
||||
|
||||
|
||||
class KubernetesTest(unittest.TestCase):
|
||||
def test_up_and_down(self):
|
||||
"""(1) Runs 'ray up' with a Kubernetes config that specifies
|
||||
min_workers=1.
|
||||
(2) Runs 'ray exec' to read monitor logs and confirm that worker and
|
||||
head are connected.
|
||||
(4) Rsyncs files up and down.
|
||||
(3) Runs 'ray down' and confirms that the cluster is gone."""
|
||||
|
||||
# get path to config
|
||||
config = get_config()
|
||||
|
||||
# get a node provider
|
||||
provider_config = config["provider"]
|
||||
cluster_name = config["cluster_name"]
|
||||
self.provider = KubernetesNodeProvider(provider_config, cluster_name)
|
||||
|
||||
# ray up
|
||||
sdk.create_or_update_cluster(config, no_config_cache=True)
|
||||
|
||||
# Check for two pods (worker and head).
|
||||
while True:
|
||||
nodes = self.provider.non_terminated_nodes({})
|
||||
if len(nodes) == 2:
|
||||
break
|
||||
else:
|
||||
time.sleep(1)
|
||||
|
||||
# Read logs with ray exec and check that worker and head are connected.
|
||||
# (Since the config yaml is legacy-style, we check for
|
||||
# ray-legacy-*-node_type.)
|
||||
log_cmd = "tail -n 100 /tmp/ray/session_latest/logs/monitor*"
|
||||
while True:
|
||||
monitor_output = sdk.run_on_cluster(
|
||||
config, cmd=log_cmd, with_output=True).decode()
|
||||
if ("ray-legacy-head-node-type" in monitor_output
|
||||
and "ray-legacy-worker-node-type" in monitor_output):
|
||||
break
|
||||
else:
|
||||
time.sleep(1)
|
||||
|
||||
# rsync
|
||||
with tempfile.NamedTemporaryFile("w") as test_file:
|
||||
test_file.write("test")
|
||||
test_file.flush()
|
||||
sdk.rsync(
|
||||
config, source=test_file.name, target="~/in_pod", down=False)
|
||||
with tempfile.NamedTemporaryFile() as test_file:
|
||||
sdk.rsync(
|
||||
config, target=test_file.name, source="~/in_pod", down=True)
|
||||
contents = open(test_file.name).read()
|
||||
assert contents == "test"
|
||||
|
||||
# ray down
|
||||
sdk.teardown_cluster(config)
|
||||
|
||||
# Check that there are no pods left in namespace ray to confirm that
|
||||
# the cluster is gone.
|
||||
while True:
|
||||
nodes = self.provider.non_terminated_nodes({})
|
||||
if len(nodes) == 0:
|
||||
break
|
||||
else:
|
||||
time.sleep(1)
|
||||
|
||||
def __del__(self):
|
||||
kubernetes.config.load_kube_config()
|
||||
core_api = kubernetes.client.CoreV1Api()
|
||||
core_api.delete_namespace(self.provider.namespace)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Run this file's tests verbosely and exit with pytest's status code.
    sys.exit(pytest.main(["-v", __file__]))
|
||||
Reference in New Issue
Block a user