From 8581dd2fb1ff95a2d25aa52feb8a24b43f6597cc Mon Sep 17 00:00:00 2001 From: Lee moon soo Date: Sun, 18 Oct 2020 00:26:11 -0700 Subject: [PATCH] [Autoscaler] Staroid node provider followup improvements (#11408) --- .../_private/staroid/node_provider.py | 76 +++-- .../ray/autoscaler/staroid/example-full.yaml | 70 +++-- .../ray/autoscaler/staroid/example-gpu.yaml | 292 ++++++++++++++++++ .../autoscaler/staroid/example-minimal.yaml | 10 +- .../staroid/example-multi-node-type.yaml | 113 +++++++ 5 files changed, 502 insertions(+), 59 deletions(-) create mode 100644 python/ray/autoscaler/staroid/example-gpu.yaml create mode 100644 python/ray/autoscaler/staroid/example-multi-node-type.yaml diff --git a/python/ray/autoscaler/_private/staroid/node_provider.py b/python/ray/autoscaler/_private/staroid/node_provider.py index 3afc93e0f..1b62fabf5 100644 --- a/python/ray/autoscaler/_private/staroid/node_provider.py +++ b/python/ray/autoscaler/_private/staroid/node_provider.py @@ -226,41 +226,53 @@ class StaroidNodeProvider(NodeProvider): kube_client = self.__cached[self.cluster_name]["kube_client"] core_api = client.CoreV1Api(kube_client) - pod = core_api.read_namespaced_pod(node_id, self.namespace) - pod.metadata.labels.update(tags) - core_api.patch_namespaced_pod(node_id, self.namespace, pod) + max_retry = 10 + for i in range(max_retry): + try: + pod = core_api.read_namespaced_pod(node_id, self.namespace) + pod.metadata.labels.update(tags) + core_api.patch_namespaced_pod(node_id, self.namespace, pod) + except ApiException as e: + if e.status == 409 and max_retry - 1 > i: + # conflict. pod modified before apply patch. 
retry + time.sleep(0.2) + continue + + raise e def create_node(self, node_config, tags, count): instance_name = self.cluster_name - # get or create ske - cluster_api = self.__star.cluster() - ske = cluster_api.create(self.__ske, self.__ske_region) - if ske is None: - raise Exception("Failed to create an SKE '{}' in '{}' region" - .format(self.__ske, self.__ske_region)) + incluster = self._connect_kubeapi(instance_name) + if incluster is None: + # get or create ske + cluster_api = self.__star.cluster() + ske = cluster_api.create(self.__ske, self.__ske_region) + if ske is None: + raise Exception("Failed to create an SKE '{}' in '{}' region" + .format(self.__ske, self.__ske_region)) - # create a namespace - ns_api = self.__star.namespace(ske) - ns = ns_api.create( - instance_name, - self.provider_config["project"], + # create a namespace + ns_api = self.__star.namespace(ske) + ns = ns_api.create( + instance_name, + self.provider_config["project"], - # Configure 'start-head' param to 'false'. - # head node will be created using Kubernetes api. - params=[{ - "group": "Misc", - "name": "start-head", - "value": "false" - }]) - if ns is None: - raise Exception("Failed to create a cluster '{}' in SKE '{}'" - .format(instance_name, self.__ske)) + # Configure 'start-head' param to 'false'. + # head node will be created using Kubernetes api. + params=[{ + "group": "Misc", + "name": "start-head", + "value": "false" + }]) + if ns is None: + raise Exception("Failed to create a cluster '{}' in SKE '{}'" + .format(instance_name, self.__ske)) - # 'ray down' will change staroid namespace status to "PAUSE" - # in this case we need to start namespace again. - if ns.status() == "PAUSE": - ns = ns_api.start(instance_name) + # 'ray down' will change staroid namespace status to "PAUSE" + # in this case we need to start namespace again. 
+ if ns.status() == "PAUSE": + ns = ns_api.start(instance_name) # kube client kube_client = self._connect_kubeapi(instance_name) @@ -293,6 +305,14 @@ class StaroidNodeProvider(NodeProvider): else: pod_spec["metadata"]["labels"] = tags + if "generateName" not in pod_spec["metadata"]: + pod_spec["metadata"]["generateName"] = \ + "ray-" + pod_spec["metadata"]["labels"]["ray-node-type"] + "-" + + if "component" not in pod_spec["metadata"]["labels"]: + pod_spec["metadata"]["labels"]["component"] = \ + "ray-" + pod_spec["metadata"]["labels"]["ray-node-type"] + if image is not None: containers = pod_spec["spec"]["containers"] for c in containers: diff --git a/python/ray/autoscaler/staroid/example-full.yaml b/python/ray/autoscaler/staroid/example-full.yaml index ad3032c44..25c8fe37d 100644 --- a/python/ray/autoscaler/staroid/example-full.yaml +++ b/python/ray/autoscaler/staroid/example-full.yaml @@ -1,6 +1,6 @@ # An unique identifier for the head node and workers of this cluster. # A namespace will be automatically created for each cluster_name in SKE. -cluster_name: default +cluster_name: default # name with 'a-z' and '-' # The minimum number of workers nodes to launch in addition to the head # node. This number should be >= 0. @@ -8,7 +8,7 @@ min_workers: 0 # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. -max_workers: 2 +max_workers: 5 # The initial number of worker nodes to launch in addition to the head # node. When the cluster is first brought up (or when it is refreshed with a @@ -71,8 +71,8 @@ provider: # - Kubernetes resources to create (like Persistent volume claim) # on namespace creation # You can fork when you need to customize. - # 1. Fork github.com/open-datastudio/ray - # 2. Change .staroid/ directory to cutomize + # 1. Fork github.com/open-datastudio/ray-cluster + # 2. Change contents # 3. Connect forked repository (https://staroid.com/projects/settings) # 4. 
Release your customized branch # 4-1. Select project from 'My projects' menu @@ -81,7 +81,7 @@ provider: # 4-4. Switch Launch permission to 'Public' if required # 5. Change 'project' field to point your # repository and branch in this file - project: "GITHUB/open-datastudio/ray:master-staroid" + project: "GITHUB/open-datastudio/ray-cluster:master" # 'spec.containers.image' field for ray-node and ray-worker will be # overrided by the image built from the 'project' field above. @@ -109,13 +109,17 @@ head_node: labels: component: ray-head - # https://docs.staroid.com/ske/pod.html#pod + # https://docs.staroid.com/ske/pod.html pod.staroid.com/spot: "false" # use on-demand instance for head. - # Uncomment to locate ray head to dedicated Kubernetes node - # (GPU instance is only available for 'dedicated' isolation) - #pod.staroid.com/isolation: dedicated - #pod.staroid.com/instance-type: gpu-1 + # Locate ray head to dedicated Kubernetes node + # In dedicated mode, resource requests and limits in the pod spec will be + # automatically overrided based on 'pod.staroid.com/instance-type' below. + pod.staroid.com/isolation: dedicated # 'sandboxed' or 'dedicated' + + # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'. + # See available instance type from https://docs.staroid.com/ske/pod.html. + pod.staroid.com/instance-type: standard-4 spec: automountServiceAccountToken: true @@ -130,10 +134,12 @@ head_node: - name: dshm emptyDir: medium: Memory + - name: tmp-volume + emptyDir: {} # nfs volume provides a shared volume across all ray-nodes. - name: nfs-volume persistentVolumeClaim: - claimName: nfs + claimName: nfs containers: - name: ray-node @@ -162,13 +168,16 @@ head_node: volumeMounts: - mountPath: /dev/shm name: dshm + - mountPath: /tmp + name: tmp-volume - mountPath: /nfs name: nfs-volume resources: requests: - cpu: 1000m - memory: 2Gi + cpu: 4000m + memory: 8Gi limits: + cpu: 4000m # The maximum memory that this pod is allowed to use. 
The # limit will be detected by ray and split to use 10% for # redis, 30% for the shared memory object store, and the @@ -176,7 +185,7 @@ head_node: # the object store size is not set manually, ray will # allocate a very large object store in each pod that may # cause problems for other pods. - memory: 2Gi + memory: 8Gi env: # This is used in the head_start_ray_commands below so that # Ray can spawn the correct number of processes. Omitting this @@ -184,7 +193,7 @@ head_node: - name: MY_CPU_REQUEST valueFrom: resourceFieldRef: - resource: requests.cpu + resource: limits.cpu - name: RAY_ADDRESS value: "auto" @@ -201,13 +210,17 @@ worker_nodes: labels: component: ray-worker - # https://docs.staroid.com/ske/pod.html#pod - pod.staroid.com/spot: "true" # use spot instance for workers. + # https://docs.staroid.com/ske/pod.html + pod.staroid.com/spot: "true" - # Uncomment to locate ray head to dedicated Kubernetes node - # (GPU instance is only available for 'dedicated' isolation) - #pod.staroid.com/isolation: dedicated - #pod.staroid.com/instance-type: gpu-1 + # Locate ray worker to dedicated Kubernetes node + # In dedicated mode, resource requests and limits in the pod spec will be + # automatically overridden based on 'pod.staroid.com/instance-type' below. 
+ pod.staroid.com/instance-type: standard-4 spec: serviceAccountName: default @@ -222,9 +235,11 @@ worker_nodes: - name: dshm emptyDir: medium: Memory + - name: tmp-volume + emptyDir: {} - name: nfs-volume persistentVolumeClaim: - claimName: nfs + claimName: nfs containers: - name: ray-node imagePullPolicy: Always @@ -246,16 +261,19 @@ worker_nodes: volumeMounts: - mountPath: /dev/shm name: dshm + - mountPath: /tmp + name: tmp-volume - mountPath: /nfs name: nfs-volume resources: requests: - cpu: 1000m - memory: 2Gi + cpu: 4000m + memory: 8Gi limits: + cpu: 4000m # This memory limit will be detected by ray and split into # 30% for plasma, and 70% for workers. - memory: 2Gi + memory: 8Gi env: # This is used in the head_start_ray_commands below so that # Ray can spawn the correct number of processes. Omitting this @@ -263,7 +281,7 @@ worker_nodes: - name: MY_CPU_REQUEST valueFrom: resourceFieldRef: - resource: requests.cpu + resource: limits.cpu # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. diff --git a/python/ray/autoscaler/staroid/example-gpu.yaml b/python/ray/autoscaler/staroid/example-gpu.yaml new file mode 100644 index 000000000..b61aaa664 --- /dev/null +++ b/python/ray/autoscaler/staroid/example-gpu.yaml @@ -0,0 +1,292 @@ +# An unique identifier for the head node and workers of this cluster. +# A namespace will be automatically created for each cluster_name in SKE. +cluster_name: default # name with 'a-z' and '-' + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 5 + +# The initial number of worker nodes to launch in addition to the head +# node. 
When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Kubernetes resources that need to be configured for the autoscaler to be +# able to manage the Ray cluster. If any of the provided resources don't +# exist, the autoscaler will attempt to create them. If this fails, you may +# not have the required permissions and will have to request them to be +# created by your cluster administrator. +provider: + type: staroid + + # Access token for Staroid from https://staroid.com/settings/accesstokens. + # Alternatively, you can set STAROID_ACCESS_TOKEN environment variable. + # https://github.com/staroids/staroid-python#configuration + # for more information. + access_token: + + # Staroid account to use. e.g. GITHUB/staroids + # Alternatively, you can set STAROID_ACCOUNT environment variable. + # Leave empty to select default account for given access token. + # https://github.com/staroids/staroid-python#configuration + # for more information. + account: + + # Name of a Staroid Kubernetes Engine (SKE) instance. + # Alternatively, you can set STAROID_SKE environment variable. + # An SKE is a virtualized Kubernetes cluster. + # Will create a new if not exists. 
+ ske: "Ray cluster" + + # Cloud and Region to create an SKE in when it does not exist. + # If SKE already exists, this value will be ignored. + # Supported cloud regions can be found at + # https://docs.staroid.com/ske/cloudregion.html. + ske_region: "aws us-west2" + + # To create a namespace in SKE, you need to specify a Github project. + # The Github project needs to have a staroid.yaml + # (https://docs.staroid.com/references/staroid_yaml.html). + # staroid.yaml defines various resources for the project, such as + # - Building container images that can be accessed from the namespace + # - Kubernetes resources to create (like Persistent volume claim) + # on namespace creation + # You can fork when you need to customize. + # 1. Fork github.com/open-datastudio/ray-cluster + # 2. Change contents + # 3. Connect forked repository (https://staroid.com/projects/settings) + # 4. Release your customized branch + # 4-1. Select project from 'My projects' menu + # 4-2. Select your branch in 'Release' tab + # 4-3. After build success, switch to 'Production' + # 4-4. Switch Launch permission to 'Public' if required + # 5. Change 'project' field to point to your + # repository and branch in this file + project: "GITHUB/open-datastudio/ray-cluster:master" + + # 'spec.containers.image' field for ray-node and ray-worker will be + # overridden by the image built from the 'project' field above. + # Set this value to 'false' to not override the image. + image_from_project: true + + # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'. + # 'project' field above provides docker image for each python version. + # Fork 'project' if you'd like to support other python versions. + python_version: 3.7.7 + + # Exposing external IP addresses for ray pods isn't currently supported. + use_internal_ips: true + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. 
+ generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + + # Whether to place this Pod on a spot instance. + # https://docs.staroid.com/ske/pod.html + pod.staroid.com/spot: "false" # use on-demand instance for head. + + # Locate ray head to dedicated Kubernetes node or not. + # 'sandboxed' (default) or 'dedicated'. + pod.staroid.com/isolation: dedicated + + # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'. + # See available instance types at https://docs.staroid.com/ske/pod.html. + pod.staroid.com/instance-type: gpu-1 + spec: + automountServiceAccountToken: true + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp, which causes slowdowns if it is not a shared memory volume. + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: tmp-volume + emptyDir: {} + # nfs volume provides a shared volume across all ray-nodes. + - name: nfs-volume + persistentVolumeClaim: + claimName: nfs + + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) + # Image will be overridden when 'image_from_project' is true. + image: rayproject/autoscaler + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. 
+ - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp, which causes slowdowns if it is not a shared memory volume. + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /tmp + name: tmp-volume + - mountPath: /nfs + name: nfs-volume + resources: + # in case of 'pod.staroid.com/isolation' is 'dedicated', + # cpu and memory requests/limits in resources field will be + # automatically configured based on + # 'pod.staroid.com/instance-type' + requests: + cpu: 4000m + memory: 8Gi + limits: + cpu: 4000m + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 8Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: limits.cpu + - name: RAY_ADDRESS + value: "auto" + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node + # service is required. + labels: + component: ray-worker + + # Locate this Pod to spot instance or not. + # https://docs.staroid.com/ske/pod.html + pod.staroid.com/spot: "true" # use spot instance for workers. + + # Locate ray worker to dedicated Kubernetes node or not. 
+ # 'sandboxed' (default) or 'dedicated'. + pod.staroid.com/isolation: dedicated + + # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'. + # See available instance type from https://docs.staroid.com/ske/pod.html. + pod.staroid.com/instance-type: gpu-1 + spec: + serviceAccountName: default + + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: tmp-volume + emptyDir: {} + - name: nfs-volume + persistentVolumeClaim: + claimName: nfs + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + image: rayproject/autoscaler + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. 
+ volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /tmp + name: tmp-volume + - mountPath: /nfs + name: nfs-volume + resources: + # in case of 'pod.staroid.com/isolation' is 'dedicated', + # cpu and memory requests/limits in resources field will be + # automatically configured based on + # 'pod.staroid.com/instance-type' + requests: + cpu: 4000m + memory: 8Gi + limits: + cpu: 4000m + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 8Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: limits.cpu diff --git a/python/ray/autoscaler/staroid/example-minimal.yaml b/python/ray/autoscaler/staroid/example-minimal.yaml index bc32883a2..e25fc7a38 100644 --- a/python/ray/autoscaler/staroid/example-minimal.yaml +++ b/python/ray/autoscaler/staroid/example-minimal.yaml @@ -1,9 +1,9 @@ # An unique identifier for the head node and workers of this cluster. -cluster_name: minimal +cluster_name: minimal # name with 'a-z' and '-' # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. -max_workers: 1 +max_workers: 5 # Kubernetes resources that need to be configured for the autoscaler to be # able to manage the Ray cluster. If any of the provided resources don't @@ -46,8 +46,8 @@ provider: # - Kubernetes resources to create (like Persistent volume claim) # on namespace creation # You can fork when you need to customize. - # 1. Fork github.com/open-datastudio/ray - # 2. Change .staroid/ directory to cutomize + # 1. Fork github.com/open-datastudio/ray-cluster + # 2. Change contents # 3. Connect forked repository (https://staroid.com/projects/settings) # 4. Release your customized branch # 4-1. 
Select project from 'My projects' menu @@ -56,7 +56,7 @@ provider: # 4-4. Switch Launch permission to 'Public' if required # 5. Change 'project' field to point your # repository and branch in this file - project: "GITHUB/open-datastudio/ray:master-staroid" + project: "GITHUB/open-datastudio/ray-cluster:master" # 'spec.containers.image' field for ray-node and ray-worker will be # overrided by the image built from the 'project' field above. diff --git a/python/ray/autoscaler/staroid/example-multi-node-type.yaml b/python/ray/autoscaler/staroid/example-multi-node-type.yaml new file mode 100644 index 000000000..958f32ded --- /dev/null +++ b/python/ray/autoscaler/staroid/example-multi-node-type.yaml @@ -0,0 +1,113 @@ +# an example of configuring a mixed-node-type cluster. +cluster_name: multi-node-type # name with 'a-z' and '-' +min_workers: 1 +max_workers: 40 + +# Cloud-provider specific configuration. +provider: + type: staroid + access_token: + account: + ske: "Ray cluster" + ske_region: "aws us-west2" + project: "GITHUB/open-datastudio/ray-cluster:master" + image_from_project: true + python_version: 3.7.7 + use_internal_ips: true + +# Tell the autoscaler the allowed node types and the resources they provide. +# The key is the name of the node type, which is just for debugging purposes. +# The node config specifies the launch config and physical instance type. 
+available_node_types: + cpu_2_ondemand: + node_config: + metadata: + labels: + pod.staroid.com/spot: "false" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: standard-2 + resources: {"CPU": 2} + max_workers: 10 + cpu_4_ondemand: + node_config: + metadata: + labels: + pod.staroid.com/spot: "false" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: standard-4 + resources: {"CPU": 4} + max_workers: 10 + cpu_8_ondemand: + node_config: + metadata: + labels: + pod.staroid.com/spot: "false" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: standard-8 + resources: {"CPU": 8} + max_workers: 10 + gpu_1_ondemand: + node_config: + metadata: + labels: + pod.staroid.com/spot: "false" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: gpu-1 + resources: {"CPU": 8, "GPU": 1, "accelerator_type:V100": 1} + max_workers: 10 + cpu_2_spot: + node_config: + metadata: + labels: + pod.staroid.com/spot: "true" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: standard-2 + resources: {"CPU": 2} + max_workers: 10 + cpu_4_spot: + node_config: + metadata: + labels: + pod.staroid.com/spot: "true" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: standard-4 + resources: {"CPU": 4} + max_workers: 10 + cpu_8_spot: + node_config: + metadata: + labels: + pod.staroid.com/spot: "true" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: standard-8 + resources: {"CPU": 8} + max_workers: 10 + # worker_setup_commands: + # - pip install tensorflow-gpu # Example command. + gpu_1_spot: + node_config: + metadata: + labels: + pod.staroid.com/spot: "true" + pod.staroid.com/isolation: dedicated + pod.staroid.com/instance-type: gpu-1 + resources: {"CPU": 8, "GPU": 1, "accelerator_type:V100": 1} + max_workers: 10 + +# Specify the node type of the head node (as configured above). 
+head_node_type: cpu_4_ondemand + +# Specify the default type of the worker node (as configured above). +worker_default_node_type: cpu_4_spot + +# The default settings for the head node. This will be merged with the per-node +# type configs given above. +#head_node: + +# The default settings for worker nodes. This will be merged with the per-node +# type configs given above. +#worker_nodes: + +# Configure the cluster for very conservative auto-scaling otherwise. +target_utilization_fraction: 0.9 +idle_timeout_minutes: 5