[autoscaler][kubernetes] Ray client setup, example config simplification, example scripts. (#13920)

This commit is contained in:
Dmitri Gekhtman
2021-02-08 18:00:34 -08:00
committed by GitHub
parent 1643bc5c4f
commit 081f3e5f07
18 changed files with 807 additions and 411 deletions
+1 -1
View File
@@ -149,7 +149,7 @@ def create_or_update_cluster(
redirect_command_output: Optional[bool] = False,
use_login_shells: bool = True,
no_monitor_on_head: bool = False) -> Dict[str, Any]:
"""Create or updates an autoscaling Ray cluster from a config json."""
"""Creates or updates an autoscaling Ray cluster from a config json."""
# no_monitor_on_head is an internal flag used by the Ray K8s operator.
# If True, prevents autoscaling config sync to the Ray head during cluster
# creation. See https://github.com/ray-project/ray/pull/13720.
+97 -156
View File
@@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# A unique identifier for the head node and workers of this cluster.
cluster_name: defaults
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -78,52 +74,83 @@ provider:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-head
name: example-cluster-ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: ray-head
component: example-cluster-ray-head
ports:
- protocol: TCP
port: 8000
targetPort: 8000
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265
# Service that maps to the worker nodes of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-workers
spec:
# This selector must match the worker node pods' selector below.
selector:
component: ray-worker
ports:
- protocol: TCP
port: 8000
targetPort: 8000
# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
worker_node:
# Minimum number of Ray workers of this Pod type.
min_workers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
max_workers: 2
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
generateName: example-cluster-ray-worker-
spec:
restartPolicy: Never
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
head_node:
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-head-
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
spec:
component: example-cluster-ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
@@ -132,120 +159,51 @@ head_node:
volumes:
- name: dshm
emptyDir:
medium: Memory
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
spec:
serviceAccountName: default
# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -266,16 +224,6 @@ cluster_synced_files: []
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down.
# This is not supported on kubernetes.
# rsync_exclude: []
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
# This is not supported on kubernetes.
# rsync_filter: []
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
@@ -291,13 +239,6 @@ head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
head_node: {}
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
worker_nodes: {}
@@ -0,0 +1,261 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
type: kubernetes
# Exposing external IP addresses for ray pods isn't currently supported.
use_internal_ips: true
# Namespace to use for all resources created.
namespace: ray
# ServiceAccount created by the autoscaler for the head node pod that it
# runs in. If this field isn't provided, the head pod config below must
# contain a user-created service account with the proper permissions.
autoscaler_service_account:
apiVersion: v1
kind: ServiceAccount
metadata:
name: autoscaler
# Role created by the autoscaler for the head node pod that it runs in.
# If this field isn't provided, the role referenced in
# autoscaler_role_binding must exist and have at least these permissions.
autoscaler_role:
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: autoscaler
rules:
- apiGroups: [""]
resources: ["pods", "pods/status", "pods/exec"]
verbs: ["get", "watch", "list", "create", "delete", "patch"]
# RoleBinding created by the autoscaler for the head node pod that it runs
# in. If this field isn't provided, the head pod config below must contain
# a user-created service account with the proper permissions.
autoscaler_role_binding:
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: autoscaler
subjects:
- kind: ServiceAccount
name: autoscaler
roleRef:
kind: Role
name: autoscaler
apiGroup: rbac.authorization.k8s.io
services:
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: example-cluster-ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: example-cluster-ray-head
ports:
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265
# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-head-
# Must match the head node service selector above if a head node
# service is required.
labels:
component: example-cluster-ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-worker-
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
spec:
serviceAccountName: default
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "~/path1/on/remote/machine": "/path1/on/local/machine",
# "~/path2/on/remote/machine": "/path2/on/local/machine",
}
# Note that the container images in this example have a non-root user.
# To avoid permissions issues, we recommend mounting into a subdirectory of home (~).
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
@@ -1,12 +1,8 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# A unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
# node.
max_workers: 2
# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -78,52 +74,86 @@ provider:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-head
name: example-cluster-ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: ray-head
component: example-cluster-ray-head
ports:
- protocol: TCP
port: 8000
targetPort: 8000
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265
# Service that maps to the worker nodes of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-workers
spec:
# This selector must match the worker node pods' selector below.
selector:
component: ray-worker
ports:
- protocol: TCP
port: 8000
targetPort: 8000
# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
worker_node:
# Minimum number of Ray workers of this Pod type.
min_workers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
max_workers: 2
# User-specified custom resources for use by Ray. Object with string keys and integer values.
# (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
resources: {"foo": 1, "bar": 2}
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
generateName: example-cluster-ray-worker-
spec:
restartPolicy: Never
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
head_node:
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-head-
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
spec:
component: example-cluster-ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
@@ -132,171 +162,48 @@ head_node:
volumes:
- name: dshm
emptyDir:
medium: Memory
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
spec:
serviceAccountName: default
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "~/path1/on/remote/machine": "/path1/on/local/machine",
# "~/path2/on/remote/machine": "/path2/on/local/machine",
}
# Note that the container images in this example have a non-root user.
# To avoid permissions issues, we recommend mounting into a subdirectory of home (~).
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down.
# This is not supported on kubernetes.
# rsync_exclude: []
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
# This is not supported on kubernetes.
# rsync_filter: []
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
- ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
@@ -1,9 +1,9 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: minimal
cluster_name: example-cluster
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
max_workers: 1
# node.
max_workers: 2
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
@@ -56,3 +56,26 @@ provider:
kind: Role
name: autoscaler
apiGroup: rbac.authorization.k8s.io
services:
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: example-cluster-ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: example-cluster-ray-head
ports:
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265
@@ -0,0 +1,71 @@
from collections import Counter
import os
import sys
import time
import ray
""" This script is meant to be run from a pod in the same Kubernetes namespace
as your Ray cluster.
Just below are the environment variables used to access Ray client via a
service targetting the Ray cluster's head node pod.
These environment variables are set by Kubernetes.
See https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables
In the documentation examples, the head service has
"example-cluster-ray-head" and the relevant port is named "client".
Modify the environment variables as needed to match the name of the service
and port.
Note: The default head service set up by the Ray Kubernetes operator is named
<cluster-name>-ray-head,
where <cluster-name> is the metadata.name field you set in the RayCluster
custom resource.
""" # noqa
HEAD_SERVICE_IP_ENV = "EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_HOST"
HEAD_SERVICE_CLIENT_PORT_ENV = "EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_PORT_CLIENT"
@ray.remote
def gethostname(x):
import platform
import time
time.sleep(0.01)
return x + (platform.node(), )
def wait_for_nodes(expected):
# Wait for all nodes to join the cluster.
while True:
resources = ray.cluster_resources()
node_keys = [key for key in resources if "node" in key]
num_nodes = sum(resources[node_key] for node_key in node_keys)
if num_nodes < expected:
print("{} nodes have joined so far, waiting for {} more.".format(
num_nodes, expected - num_nodes))
sys.stdout.flush()
time.sleep(1)
else:
break
def main():
wait_for_nodes(3)
# Check that objects can be transferred from each node to each other node.
for i in range(10):
print("Iteration {}".format(i))
results = [
gethostname.remote(gethostname.remote(())) for _ in range(100)
]
print(Counter(ray.get(results)))
sys.stdout.flush()
print("Success!")
sys.stdout.flush()
if __name__ == "__main__":
head_service_ip = os.environ[HEAD_SERVICE_IP_ENV]
client_port = os.environ[HEAD_SERVICE_CLIENT_PORT_ENV]
ray.util.connect(f"{head_service_ip}:{client_port}")
main()
@@ -0,0 +1,58 @@
from collections import Counter
import sys
import time
import ray
""" Run this script locally to execute a Ray program on your Ray cluster on
Kubernetes.
Before running this script, you must port-forward from the local host to
the relevant Kubernetes head service e.g.
kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001.
Set the constant LOCAL_PORT below to the local port being forwarded.
"""
LOCAL_PORT = 10001
@ray.remote
def gethostname(x):
import platform
import time
time.sleep(0.01)
return x + (platform.node(), )
def wait_for_nodes(expected):
# Wait for all nodes to join the cluster.
while True:
resources = ray.cluster_resources()
node_keys = [key for key in resources if "node" in key]
num_nodes = sum(resources[node_key] for node_key in node_keys)
if num_nodes < expected:
print("{} nodes have joined so far, waiting for {} more.".format(
num_nodes, expected - num_nodes))
sys.stdout.flush()
time.sleep(1)
else:
break
def main():
wait_for_nodes(3)
# Check that objects can be transferred from each node to each other node.
for i in range(10):
print("Iteration {}".format(i))
results = [
gethostname.remote(gethostname.remote(())) for _ in range(100)
]
print(Counter(ray.get(results)))
sys.stdout.flush()
print("Success!")
sys.stdout.flush()
if __name__ == "__main__":
ray.util.connect(f"127.0.0.1:{LOCAL_PORT}")
main()
@@ -0,0 +1,50 @@
from collections import Counter
import sys
import time
import ray
# Run this script on the Ray head node using kubectl exec.
@ray.remote
def gethostname(x):
import platform
import time
time.sleep(0.01)
return x + (platform.node(), )
def wait_for_nodes(expected):
# Wait for all nodes to join the cluster.
while True:
resources = ray.cluster_resources()
node_keys = [key for key in resources if "node" in key]
num_nodes = sum(resources[node_key] for node_key in node_keys)
if num_nodes < expected:
print("{} nodes have joined so far, waiting for {} more.".format(
num_nodes, expected - num_nodes))
sys.stdout.flush()
time.sleep(1)
else:
break
def main():
wait_for_nodes(3)
# Check that objects can be transferred from each node to each other node.
for i in range(10):
print("Iteration {}".format(i))
results = [
gethostname.remote(gethostname.remote(())) for _ in range(100)
]
print(Counter(ray.get(results)))
sys.stdout.flush()
print("Success!")
sys.stdout.flush()
if __name__ == "__main__":
ray.init(address="auto")
main()
@@ -0,0 +1,24 @@
# Job to run a Ray program in its own pod. Assumes that a Ray cluster is already
# running.
apiVersion: batch/v1
kind: Job
metadata:
generateName: ray-test-job-
spec:
template:
spec:
restartPolicy: Never
containers:
- name: ray
image: rayproject/ray:nightly
imagePullPolicy: Always
command: ["python"]
args:
- "$(EXAMPLE_PROGRAM_PATH)"
env:
- name: EXAMPLE_PROGRAM_PATH
value: "/home/ray/anaconda3/lib/python3.7/site-packages/ray/autoscaler/kubernetes/example_scripts/job_example.py"
resources:
requests:
cpu: 100m
memory: 512Mi
@@ -78,9 +78,9 @@ spec:
description: Maximum number of Ray workers of this Pod type.
rayResources:
type: object
description: User-specified custom resources for use by Ray.
# TODO (dmitri): Validate that values are numeric [patternProperties not supported by OpenAPI v3.0]
x-kubernetes-preserve-unknown-fields: true
description: User-specified custom resources for use by Ray. Keys strings, values integers.
# TODO (dmitri): Validate that values are integers [patternProperties not supported by OpenAPI v3.0]
x-kubernetes-preserve-unknown-fields: true
setupCommands:
description: Commands to run before starting the Ray runtime.
type: array
@@ -42,9 +42,9 @@ spec:
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
@@ -65,16 +65,14 @@ spec:
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
- name: worker-nodes
- name: worker-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 2
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 3
# User-specified custom resources for use by Ray
rayResources: {"Custom1": 1, "is_spot": 1}
# Optional commands to run before starting the Ray runtime.
setupCommands:
- pip install numpy # Example
# User-specified custom resources for use by Ray.
# (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
rayResources: {"foo": 1, "bar": 1}
podConfig:
apiVersion: v1
kind: Pod
@@ -93,9 +91,6 @@ spec:
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
@@ -118,9 +113,9 @@ spec:
# Commands to start Ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
headStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --head --no-monitor --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0
- ray stop
- ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
# Commands to start Ray on worker nodes. You don't need to change this.
workerStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
@@ -42,9 +42,9 @@ spec:
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
@@ -65,16 +65,14 @@ spec:
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
- name: worker-nodes
- name: worker-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 1
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 3
# User-specified custom resources for use by Ray
rayResources: {"Custom1": 1, "is_spot": 1}
# Optional commands to run before starting the Ray runtime.
setupCommands:
- pip install numpy # Example
# User-specified custom resources for use by Ray. Object with string keys and integer values.
# (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
rayResources: {"baz": 5, "quux": 17}
podConfig:
apiVersion: v1
kind: Pod
@@ -93,9 +91,6 @@ spec:
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
@@ -118,9 +113,9 @@ spec:
# Commands to start Ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
headStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --head --no-monitor --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0
- ray stop
- ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
# Commands to start Ray on worker nodes. You don't need to change this.
workerStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
@@ -10,7 +10,7 @@ metadata:
name: ray-operator-role
rules:
- apiGroups: ["", "cluster.ray.io"]
resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec"]
resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"]
verbs: ["get", "watch", "list", "create", "delete", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
+6 -2
View File
@@ -337,8 +337,12 @@
"min_workers": {"type": "integer"},
"max_workers": {"type": "integer"},
"resources": {
"type": "object",
".*": {"type": "number"}
"patternProperties": {
".*":{
"type": "integer",
"minimum": 0
}
}
},
"initialization_commands": {
"$ref": "#/definitions/commands",
+43 -14
View File
@@ -6,6 +6,7 @@ from typing import Any, Dict, Iterator
from kubernetes.watch import Watch
from ray.autoscaler._private.kubernetes import custom_objects_api
from ray.autoscaler._private.providers import _get_default_config
RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
@@ -59,36 +60,64 @@ def cr_to_config(cluster_resource: Dict[str, Any]) -> Dict[str, Any]:
"""Convert RayCluster custom resource to a ray cluster config for use by the
autoscaler."""
config = translate(cluster_resource["spec"], dictionary=CONFIG_FIELDS)
config["available_node_types"] = get_node_types(cluster_resource)
config["cluster_name"] = cluster_resource["metadata"]["name"]
config["provider"] = PROVIDER_CONFIG
cluster_name = cluster_resource["metadata"]["name"]
config["available_node_types"] = get_node_types(cluster_resource,
cluster_name)
config["cluster_name"] = cluster_name
config["provider"] = get_provider_config(cluster_name)
return config
def get_node_types(cluster_resource: Dict[str, Any]) -> Dict[str, Any]:
cluster_owner_reference = get_cluster_owner_reference(cluster_resource)
def get_node_types(cluster_resource: Dict[str, Any], cluster_name) ->\
Dict[str, Any]:
cluster_owner_reference = get_cluster_owner_reference(
cluster_resource, cluster_name)
node_types = {}
for pod_type in cluster_resource["spec"]["podTypes"]:
name = pod_type["name"]
pod_type_copy = copy.deepcopy(pod_type)
pod_type_copy.pop("name")
node_types[name] = translate(
pod_type_copy, dictionary=NODE_TYPE_FIELDS)
# Deleting a RayCluster CR will also delete the associated pods.
node_types[name]["node_config"]["metadata"].update({
"ownerReferences": [cluster_owner_reference]
})
node_type = translate(pod_type_copy, dictionary=NODE_TYPE_FIELDS)
metadata = node_type["node_config"]["metadata"]
metadata.update({"ownerReferences": [cluster_owner_reference]})
if name == cluster_resource["spec"]["headPodType"]:
if "labels" not in metadata:
metadata["labels"] = {}
metadata["labels"].update(head_service_selector(cluster_name))
node_types[name] = node_type
return node_types
def get_cluster_owner_reference(
cluster_resource: Dict[str, Any]) -> Dict[str, Any]:
def get_provider_config(cluster_name):
default_kubernetes_config = _get_default_config({"type": "kubernetes"})
default_provider_conf = default_kubernetes_config["provider"]
# Configure head service for dashboard and client
head_service = copy.deepcopy(default_provider_conf["services"][0])
service_name = f"{cluster_name}-ray-head"
head_service["metadata"]["name"] = service_name
head_service["spec"]["selector"] = head_service_selector(cluster_name)
provider_conf = {}
provider_conf["type"] = "kubernetes"
provider_conf["use_internal_ips"] = True
provider_conf["namespace"] = RAY_NAMESPACE
provider_conf["services"] = [head_service]
return provider_conf
def head_service_selector(cluster_name):
return {"component": f"{cluster_name}-ray-head"}
def get_cluster_owner_reference(cluster_resource: Dict[str, Any],
cluster_name: str) -> Dict[str, Any]:
return {
"apiVersion": cluster_resource["apiVersion"],
"kind": cluster_resource["kind"],
"blockOwnerDeletion": True,
"controller": True,
"name": cluster_resource["metadata"]["name"],
"name": cluster_name,
"uid": cluster_resource["metadata"]["uid"]
}
+8 -1
View File
@@ -11,6 +11,8 @@ import pytest
from ray.autoscaler._private.util import prepare_config, validate_config
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private.kubernetes.node_provider import\
KubernetesNodeProvider
from ray.test_utils import recursive_fnmatch
@@ -25,6 +27,7 @@ CONFIG_PATHS += recursive_fnmatch(
def ignore_k8s_operator_configs(paths):
return [
path for path in paths if "kubernetes/operator_configs" not in path
and "kubernetes/job-example.yaml" not in path
]
@@ -40,10 +43,14 @@ class AutoscalingConfigTest(unittest.TestCase):
with open(config_path) as f:
config = yaml.safe_load(f)
config = prepare_config(config)
if config["provider"]["type"] == "kubernetes":
KubernetesNodeProvider.fillout_available_node_types_resources(
config)
try:
validate_config(config)
except Exception:
self.fail("Config did not pass validation test!")
self.fail(
f"Config {config_path} did not pass validation test!")
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Fails on Windows.")
@@ -69,8 +69,8 @@ class KubernetesTest(unittest.TestCase):
while True:
monitor_output = sdk.run_on_cluster(
config, cmd=log_cmd, with_output=True).decode()
if ("ray-legacy-head-node-type" in monitor_output
and "ray-legacy-worker-node-type" in monitor_output):
if ("head-node" in monitor_output
and "worker-node" in monitor_output):
break
else:
time.sleep(1)
+42 -11
View File
@@ -20,7 +20,7 @@ NAMESPACE = "test-k8s-operator-examples"
def retry_until_true(f):
# Retry 60 times with 1 second delay between attempts.
def f_with_retries(*args, **kwargs):
for _ in range(60):
for _ in range(120):
if f(*args, **kwargs):
return
else:
@@ -47,25 +47,38 @@ def wait_for_logs():
cmd = f"kubectl -n {NAMESPACE} logs ray-operator-pod"\
"| grep ^example-cluster: | tail -n 100"
log_tail = subprocess.check_output(cmd, shell=True).decode()
return ("head-node" in log_tail) and ("worker-nodes" in log_tail)
return ("head-node" in log_tail) and ("worker-node" in log_tail)
def operator_configs_directory():
@retry_until_true
def wait_for_job(job_pod):
cmd = f"kubectl -n {NAMESPACE} logs {job_pod}"
out = subprocess.check_output(cmd, shell=True).decode()
return ("success" in out.lower())
def kubernetes_configs_directory():
here = os.path.realpath(__file__)
ray_python_root = os.path.dirname(os.path.dirname(here))
relative_path = "autoscaler/kubernetes/operator_configs"
relative_path = "autoscaler/kubernetes"
return os.path.join(ray_python_root, relative_path)
def get_kubernetes_config_path(name):
return os.path.join(kubernetes_configs_directory(), name)
def get_operator_config_path(file_name):
return os.path.join(operator_configs_directory(), file_name)
operator_configs = get_kubernetes_config_path("operator_configs")
return os.path.join(operator_configs, file_name)
class KubernetesOperatorTest(unittest.TestCase):
def test_examples(self):
with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\
tempfile.NamedTemporaryFile("w+") as operator_file:
tempfile.NamedTemporaryFile("w+") as operator_file,\
tempfile.NamedTemporaryFile("w+") as job_file:
# Get paths to operator configs
example_cluster_config_path = get_operator_config_path(
@@ -73,6 +86,7 @@ class KubernetesOperatorTest(unittest.TestCase):
example_cluster2_config_path = get_operator_config_path(
"example_cluster2.yaml")
operator_config_path = get_operator_config_path("operator.yaml")
job_path = get_kubernetes_config_path("job-example.yaml")
self.crd_path = get_operator_config_path("cluster_crd.yaml")
# Load operator configs
@@ -82,19 +96,23 @@ class KubernetesOperatorTest(unittest.TestCase):
open(example_cluster2_config_path).read())
operator_config = list(
yaml.safe_load_all(open(operator_config_path).read()))
job_config = yaml.safe_load(open(job_path).read())
# Fill image fields
podTypes = example_cluster_config["spec"]["podTypes"]
podTypes2 = example_cluster2_config["spec"]["podTypes"]
pod_configs = ([operator_config[-1]] + [
podType["podConfig"] for podType in podTypes
] + [podType["podConfig"] for podType in podTypes2])
for pod_config in pod_configs:
pod_config["spec"]["containers"][0]["image"] = IMAGE
pod_specs = ([operator_config[-1]["spec"]] + [
job_config["spec"]["template"]["spec"]
] + [podType["podConfig"]["spec"] for podType in podTypes
] + [podType["podConfig"]["spec"] for podType in podTypes2])
for pod_spec in pod_specs:
pod_spec["containers"][0]["image"] = IMAGE
pod_spec["containers"][0]["imagePullPolicy"] = "IfNotPresent"
# Dump to temporary files
yaml.dump(example_cluster_config, example_cluster_file)
yaml.dump(example_cluster2_config, example_cluster2_file)
yaml.dump(job_config, job_file)
yaml.dump_all(operator_config, operator_file)
files = [
example_cluster_file, example_cluster2_file, operator_file
@@ -131,6 +149,19 @@ class KubernetesOperatorTest(unittest.TestCase):
# Four pods remain
wait_for_pods(4)
# Check job submission
cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}"
subprocess.check_call(cmd, shell=True)
cmd = f"kubectl -n {NAMESPACE} get pods --no-headers -o"\
" custom-columns=\":metadata.name\""
pods = subprocess.check_output(cmd, shell=True).decode().split()
job_pod = [pod for pod in pods if "job" in pod].pop()
time.sleep(10)
wait_for_job(job_pod)
cmd = f"kubectl -n {NAMESPACE} delete jobs --all"
subprocess.check_call(cmd, shell=True)
# Check that cluster updates work: increase minWorkers to 3
# and check that one worker is created.
example_cluster_edit = copy.deepcopy(example_cluster_config)