[autoscaler/k8s] [CI] Kubernetes test ray up, exec, down (#12514)

This commit is contained in:
Dmitri Gekhtman
2021-01-13 15:03:56 -08:00
committed by GitHub
parent 44acbdd82a
commit 1968b2f9d8
5 changed files with 427 additions and 6 deletions
+10
View File
@@ -108,6 +108,16 @@ py_test_module_list(
deps = ["//:ray_lib"],
)
# Test target for the Kubernetes cluster launcher test module. It carries the
# "kubernetes" tag (presumably so CI can select or exclude it -- confirm
# against the CI pipeline definition).
py_test_module_list(
    size = "small",
    files = [
        "test_k8s_cluster_launcher.py",
    ],
    tags = ["kubernetes"],
    extra_srcs = SRCS,
    deps = ["//:ray_lib"],
)
py_test_module_list(
files = [
"test_failure.py",
@@ -0,0 +1,300 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: test
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 1
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will
# gradually scale up the cluster in chunks of
# upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes

  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true

  # Namespace to use for all resources created.
  namespace: ray-cluster-launcher-unit-test

  # ServiceAccount created by the autoscaler for the head node pod that it
  # runs in. If this field isn't provided, the head pod config below must
  # contain a user-created service account with the proper permissions.
  autoscaler_service_account:
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: autoscaler

  # Role created by the autoscaler for the head node pod that it runs in.
  # If this field isn't provided, the role referenced in
  # autoscaler_role_binding must exist and have at least these permissions.
  autoscaler_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: autoscaler
    rules:
      - apiGroups: [""]
        resources: ["pods", "pods/status", "pods/exec"]
        verbs: ["get", "watch", "list", "create", "delete", "patch"]

  # RoleBinding created by the autoscaler for the head node pod that it runs
  # in. If this field isn't provided, the head pod config below must contain
  # a user-created service account with the proper permissions.
  autoscaler_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: autoscaler
    subjects:
      - kind: ServiceAccount
        name: autoscaler
    roleRef:
      kind: Role
      name: autoscaler
      apiGroup: rbac.authorization.k8s.io

  services:
    # Service that maps to the head node of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: ray-head
      spec:
        # This selector must match the labels of the head node pod below.
        selector:
          component: ray-head
        ports:
          - protocol: TCP
            port: 8000
            targetPort: 8000

    # Service that maps to the worker nodes of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: ray-workers
      spec:
        # This selector must match the labels of the worker node pods below.
        selector:
          component: ray-worker
        ports:
          - protocol: TCP
            port: 8000
            targetPort: 8000
# Kubernetes pod config for the head node pod.
head_node:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-head-

    # Must match the head node service selector above if a head node
    # service is required.
    labels:
      component: ray-head
  spec:
    # Change this if you altered the autoscaler_service_account above
    # or want to provide your own.
    serviceAccountName: autoscaler

    # Restarting the head node automatically is not currently supported.
    # If the head node goes down, `ray up` must be run again.
    restartPolicy: Never

    # This volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which will cause slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory

    containers:
      - name: ray-node
        imagePullPolicy: IfNotPresent
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        #   - screen (used for `ray attach`)
        #   - kubectl (used by the autoscaler to manage worker pods)
        image: PLACEHOLDER
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 6379  # Redis port.
          - containerPort: 6380  # Redis port.
          - containerPort: 6381  # Redis port.
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.

        # Mounts the shared memory volume declared under `volumes` above
        # at /dev/shm so that Ray can use it for its plasma object store.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        resources:
          requests:
            cpu: 100m
            memory: 512Mi
          limits:
            # The maximum memory that this pod is allowed to use. The
            # limit will be detected by ray and split to use 10% for
            # redis, 30% for the shared memory object store, and the
            # rest for application memory. If this limit is not set and
            # the object store size is not set manually, ray will
            # allocate a very large object store in each pod that may
            # cause problems for other pods.
            memory: 2Gi
        env:
          # This is used in the head_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
# Kubernetes pod config for worker node pods.
worker_nodes:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-worker-

    # Must match the worker node service selector above if a worker node
    # service is required.
    labels:
      component: ray-worker
  spec:
    serviceAccountName: default

    # Worker nodes will be managed automatically by the head node, so
    # do not change the restart policy.
    restartPolicy: Never

    # This volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which will cause slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory

    containers:
      - name: ray-node
        imagePullPolicy: IfNotPresent
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        image: PLACEHOLDER
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.

        # Mounts the shared memory volume declared under `volumes` above
        # at /dev/shm so that Ray can use it for its plasma object store.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        resources:
          requests:
            cpu: 100m
            memory: 512Mi
          limits:
            # This memory limit will be detected by ray and split into
            # 30% for plasma, and 70% for workers.
            memory: 2Gi
        env:
          # This is used in the worker_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
#   "/path1/on/remote/machine": "/path1/on/local/machine",
#   "/path2/on/remote/machine": "/path2/on/local/machine",
# No files are mounted for this test cluster.
file_mounts: {}

# Files or directories to copy from the head node to the worker nodes. The
# format is a list of paths. The same path on the head node will be copied to
# the worker node. This behavior is a subset of the file_mounts behavior. In
# the vast majority of cases you should just use file_mounts. Only use this if
# you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files on the
# head node should sync to the worker nodes continuously.
file_mounts_sync_continuously: false

# Patterns for files to exclude when running rsync up or rsync down.
# This is not supported on kubernetes.
# rsync_exclude: []

# Pattern files to use for filtering out files when running rsync up or rsync
# down. The file is searched for in the source directory and recursively
# through all subdirectories. For example, if .gitignore is provided as a
# value, the behavior will match git's behavior for finding and using
# .gitignore files.
# This is not supported on kubernetes.
# rsync_filter: []

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is set up.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Commands to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
# The folded scalar (>-) joins its lines with single spaces, so the command
# is passed to the shell as one line.
head_start_ray_commands:
  - ray stop
  - >-
    ulimit -n 65536;
    ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379
    --object-manager-port=8076
    --autoscaling-config=~/ray_bootstrap_config.yaml
    --dashboard-host 0.0.0.0

# Commands to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
  - ray stop
  - >-
    ulimit -n 65536;
    ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379
    --object-manager-port=8076
@@ -0,0 +1,110 @@
import os
import tempfile
import time
import unittest
import kubernetes
import pytest
import yaml
from ray.autoscaler._private.kubernetes.node_provider import \
KubernetesNodeProvider
from ray.autoscaler import sdk
# Name of the environment variable that may override the container image
# used for the pods in this test.
IMAGE_ENV = "KUBERNETES_CLUSTER_LAUNCHER_TEST_IMAGE"


def fill_image_field(pod_config):
    """Set the image of the first container of ``pod_config`` in place.

    The image name is read from the environment variable named by
    ``IMAGE_ENV``; if that variable is unset, "rayproject/ray:nightly" is
    used instead.

    Args:
        pod_config: Pod config dict with a ``["spec"]["containers"]`` list
            holding at least one container dict.
    """
    first_container = pod_config["spec"]["containers"][0]
    first_container["image"] = os.environ.get(IMAGE_ENV,
                                              "rayproject/ray:nightly")
def fill_image_fields(cluster_config):
    """Fill in the container image for both pod configs of a cluster config.

    The "worker_nodes" entry is processed first, then "head_node"; each is
    modified in place by ``fill_image_field``.
    """
    fill_image_field(cluster_config["worker_nodes"])
    fill_image_field(cluster_config["head_node"])
def get_config():
    """Load the cluster config used by this test and return it as a dict.

    Reads test_cli_patterns/test_k8s_cluster_launcher.yaml, resolved relative
    to the directory containing this file, and fills in the container image
    fields of the head and worker pod configs.

    Returns:
        The parsed cluster config with image fields filled in.
    """
    here = os.path.realpath(__file__)
    parent = os.path.dirname(here)
    relative_path = "test_cli_patterns/test_k8s_cluster_launcher.yaml"
    config_path = os.path.join(parent, relative_path)
    # Use a context manager so the file handle is closed deterministically
    # rather than whenever the garbage collector gets to it.
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    fill_image_fields(config)
    return config
class KubernetesTest(unittest.TestCase):
    """End-to-end test of the Kubernetes cluster launcher through the SDK."""

    # Upper bound (seconds) for each polling loop, so that a cluster which
    # never reaches the expected state fails the test with a clear message
    # instead of hanging until an outer CI timeout kills the job.
    POLL_TIMEOUT_S = 600
    # Delay (seconds) between two consecutive polls.
    POLL_INTERVAL_S = 1

    def _wait_until(self, condition, description):
        """Poll ``condition`` until it returns a truthy value.

        Args:
            condition: Zero-argument callable evaluated once per poll.
            description: Human-readable description of the awaited state,
                used in the failure message.

        Fails the test if ``POLL_TIMEOUT_S`` seconds elapse first.
        """
        deadline = time.monotonic() + self.POLL_TIMEOUT_S
        while not condition():
            if time.monotonic() >= deadline:
                self.fail("Timed out after {}s waiting for {}.".format(
                    self.POLL_TIMEOUT_S, description))
            time.sleep(self.POLL_INTERVAL_S)

    def _monitor_sees_head_and_worker(self, config):
        """Return True if the monitor logs mention both legacy node types."""
        log_cmd = "tail -n 100 /tmp/ray/session_latest/logs/monitor*"
        monitor_output = sdk.run_on_cluster(
            config, cmd=log_cmd, with_output=True).decode()
        return ("ray-legacy-head-node-type" in monitor_output
                and "ray-legacy-worker-node-type" in monitor_output)

    def test_up_and_down(self):
        """(1) Runs 'ray up' with a Kubernetes config that specifies
        min_workers=1.
        (2) Runs 'ray exec' to read monitor logs and confirm that worker and
        head are connected.
        (3) Rsyncs files up and down.
        (4) Runs 'ray down' and confirms that the cluster is gone."""

        # Load the cluster config.
        config = get_config()

        # Get a node provider for inspecting the pods of this cluster.
        provider_config = config["provider"]
        cluster_name = config["cluster_name"]
        self.provider = KubernetesNodeProvider(provider_config, cluster_name)

        # ray up
        sdk.create_or_update_cluster(config, no_config_cache=True)

        # Check for two pods (worker and head).
        self._wait_until(
            lambda: len(self.provider.non_terminated_nodes({})) == 2,
            "two non-terminated pods (head and worker)")

        # Read logs with ray exec and check that worker and head are
        # connected. (Since the config yaml is legacy-style, we check for
        # ray-legacy-*-node_type.)
        self._wait_until(
            lambda: self._monitor_sees_head_and_worker(config),
            "head and worker node types to appear in the monitor logs")

        # rsync: push a file into the pod, pull it back, compare contents.
        with tempfile.NamedTemporaryFile("w") as test_file:
            test_file.write("test")
            test_file.flush()
            sdk.rsync(
                config, source=test_file.name, target="~/in_pod", down=False)
        with tempfile.NamedTemporaryFile() as test_file:
            sdk.rsync(
                config, target=test_file.name, source="~/in_pod", down=True)
            # Read by name while the temporary file still exists, and close
            # the handle explicitly instead of leaking it.
            with open(test_file.name) as downloaded:
                contents = downloaded.read()
        self.assertEqual(contents, "test")

        # ray down
        sdk.teardown_cluster(config)

        # Check that the provider reports no non-terminated pods to confirm
        # that the cluster is gone.
        self._wait_until(
            lambda: len(self.provider.non_terminated_nodes({})) == 0,
            "all pods of the cluster to terminate")

    def __del__(self):
        """Delete the test namespace when this test instance is collected."""
        # The provider only exists once test_up_and_down got far enough to
        # create it; without this guard an instance that never ran the test
        # (or failed early) raises AttributeError here.
        provider = getattr(self, "provider", None)
        if provider is None:
            return
        kubernetes.config.load_kube_config()
        core_api = kubernetes.client.CoreV1Api()
        core_api.delete_namespace(provider.namespace)
if __name__ == "__main__":
    import sys

    # Run this file's tests verbosely and exit with pytest's return code.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)