[Autoscaler] Staroid node provider followup improvements (#11408)

2026-06-27 20:38:19 +08:00 · 2020-10-18 00:26:11 -07:00
parent 48b75a6922
commit 8581dd2fb1
5 changed files with 502 additions and 59 deletions
@@ -226,41 +226,53 @@ class StaroidNodeProvider(NodeProvider):
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)

-        pod = core_api.read_namespaced_pod(node_id, self.namespace)
-        pod.metadata.labels.update(tags)
-        core_api.patch_namespaced_pod(node_id, self.namespace, pod)
+        max_retry = 10
+        for i in range(max_retry):
+            try:
+                pod = core_api.read_namespaced_pod(node_id, self.namespace)
+                pod.metadata.labels.update(tags)
+                core_api.patch_namespaced_pod(node_id, self.namespace, pod)
+            except ApiException as e:
+                if e.status == 409 and max_retry - 1 > i:
+                    # conflict. pod modified before apply patch. retry
+                    time.sleep(0.2)
+                    continue
+
+                raise e

    def create_node(self, node_config, tags, count):
        instance_name = self.cluster_name

-        # get or create ske
-        cluster_api = self.__star.cluster()
-        ske = cluster_api.create(self.__ske, self.__ske_region)
-        if ske is None:
-            raise Exception("Failed to create an SKE '{}' in '{}' region"
-                            .format(self.__ske, self.__ske_region))
+        incluster = self._connect_kubeapi(instance_name)
+        if incluster is None:
+            # get or create ske
+            cluster_api = self.__star.cluster()
+            ske = cluster_api.create(self.__ske, self.__ske_region)
+            if ske is None:
+                raise Exception("Failed to create an SKE '{}' in '{}' region"
+                                .format(self.__ske, self.__ske_region))

-        # create a namespace
-        ns_api = self.__star.namespace(ske)
-        ns = ns_api.create(
-            instance_name,
-            self.provider_config["project"],
+            # create a namespace
+            ns_api = self.__star.namespace(ske)
+            ns = ns_api.create(
+                instance_name,
+                self.provider_config["project"],

-            # Configure 'start-head' param to 'false'.
-            # head node will be created using Kubernetes api.
-            params=[{
-                "group": "Misc",
-                "name": "start-head",
-                "value": "false"
-            }])
-        if ns is None:
-            raise Exception("Failed to create a cluster '{}' in SKE '{}'"
-                            .format(instance_name, self.__ske))
+                # Configure 'start-head' param to 'false'.
+                # head node will be created using Kubernetes api.
+                params=[{
+                    "group": "Misc",
+                    "name": "start-head",
+                    "value": "false"
+                }])
+            if ns is None:
+                raise Exception("Failed to create a cluster '{}' in SKE '{}'"
+                                .format(instance_name, self.__ske))

-        # 'ray down' will change staroid namespace status to "PAUSE"
-        # in this case we need to start namespace again.
-        if ns.status() == "PAUSE":
-            ns = ns_api.start(instance_name)
+            # 'ray down' will change staroid namespace status to "PAUSE"
+            # in this case we need to start namespace again.
+            if ns.status() == "PAUSE":
+                ns = ns_api.start(instance_name)

        # kube client
        kube_client = self._connect_kubeapi(instance_name)
@@ -293,6 +305,14 @@ class StaroidNodeProvider(NodeProvider):
        else:
            pod_spec["metadata"]["labels"] = tags

+        if "generateName" not in pod_spec["metadata"]:
+            pod_spec["metadata"]["generateName"] = \
+                "ray-" + pod_spec["metadata"]["labels"]["ray-node-type"] + "-"
+
+        if "component" not in pod_spec["metadata"]["labels"]:
+            pod_spec["metadata"]["labels"]["component"] = \
+                "ray-" + pod_spec["metadata"]["labels"]["ray-node-type"]
+
        if image is not None:
            containers = pod_spec["spec"]["containers"]
            for c in containers:
@@ -1,6 +1,6 @@
 # An unique identifier for the head node and workers of this cluster.
 # A namespace will be automatically created for each cluster_name in SKE.
-cluster_name: default
+cluster_name: default # name with 'a-z' and '-'

 # The minimum number of workers nodes to launch in addition to the head
 # node. This number should be >= 0.
@@ -8,7 +8,7 @@ min_workers: 0

 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers.
-max_workers: 2
+max_workers: 5

 # The initial number of worker nodes to launch in addition to the head
 # node. When the cluster is first brought up (or when it is refreshed with a
@@ -71,8 +71,8 @@ provider:
    #   - Kubernetes resources to create (like Persistent volume claim)
    #     on namespace creation
    # You can fork when you need to customize.
-    #   1. Fork github.com/open-datastudio/ray
-    #   2. Change .staroid/ directory to cutomize
+    #   1. Fork github.com/open-datastudio/ray-cluster
+    #   2. Change contents
    #   3. Connect forked repository (https://staroid.com/projects/settings)
    #   4. Release your customized branch
    #      4-1. Select project from 'My projects' menu
@@ -81,7 +81,7 @@ provider:
    #      4-4. Switch Launch permission to 'Public' if required
    #   5. Change 'project' field to point your 
    #      repository and branch in this file
-    project: "GITHUB/open-datastudio/ray:master-staroid"
+    project: "GITHUB/open-datastudio/ray-cluster:master"

    # 'spec.containers.image' field for ray-node and ray-worker will be
    # overrided by the image built from the 'project' field above.
@@ -109,13 +109,17 @@ head_node:
        labels:
            component: ray-head

-            # https://docs.staroid.com/ske/pod.html#pod
+            # https://docs.staroid.com/ske/pod.html
            pod.staroid.com/spot: "false" # use on-demand instance for head.

-            # Uncomment to locate ray head to dedicated Kubernetes node
-            # (GPU instance is only available for 'dedicated' isolation)
-            #pod.staroid.com/isolation: dedicated
-            #pod.staroid.com/instance-type: gpu-1
+            # Place the ray head on a dedicated Kubernetes node.
+            # In dedicated mode, resource requests and limits in the pod spec will be
+            # automatically overridden based on 'pod.staroid.com/instance-type' below.
+            pod.staroid.com/isolation: dedicated # 'sandboxed' or 'dedicated'
+
+            # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
+            # See available instance type from https://docs.staroid.com/ske/pod.html.
+            pod.staroid.com/instance-type: standard-4
    spec:
        automountServiceAccountToken: true

@@ -130,10 +134,12 @@ head_node:
        - name: dshm
          emptyDir:
              medium: Memory
+        - name: tmp-volume
+          emptyDir: {}
        # nfs volume provides a shared volume across all ray-nodes.
        - name: nfs-volume
          persistentVolumeClaim:
-            claimName: nfs
+              claimName: nfs

        containers:
        - name: ray-node
@@ -162,13 +168,16 @@ head_node:
          volumeMounts:
              - mountPath: /dev/shm
                name: dshm
+              - mountPath: /tmp
+                name: tmp-volume
              - mountPath: /nfs
                name: nfs-volume
          resources:
              requests:
-                  cpu: 1000m
-                  memory: 2Gi
+                  cpu: 4000m
+                  memory: 8Gi
              limits:
+                  cpu: 4000m
                  # The maximum memory that this pod is allowed to use. The
                  # limit will be detected by ray and split to use 10% for
                  # redis, 30% for the shared memory object store, and the
@@ -176,7 +185,7 @@ head_node:
                  # the object store size is not set manually, ray will
                  # allocate a very large object store in each pod that may
                  # cause problems for other pods.
-                  memory: 2Gi
+                  memory: 8Gi
          env:
              # This is used in the head_start_ray_commands below so that
              # Ray can spawn the correct number of processes. Omitting this
@@ -184,7 +193,7 @@ head_node:
              - name: MY_CPU_REQUEST
                valueFrom:
                    resourceFieldRef:
-                        resource: requests.cpu
+                        resource: limits.cpu
              - name: RAY_ADDRESS
                value: "auto"

@@ -201,13 +210,17 @@ worker_nodes:
        labels:
            component: ray-worker

-            # https://docs.staroid.com/ske/pod.html#pod
-            pod.staroid.com/spot: "true" # use spot instance for workers.
+            # https://docs.staroid.com/ske/pod.html
+            pod.staroid.com/spot: "true"

-            # Uncomment to locate ray head to dedicated Kubernetes node
-            # (GPU instance is only available for 'dedicated' isolation)
-            #pod.staroid.com/isolation: dedicated
-            #pod.staroid.com/instance-type: gpu-1
+            # Place ray workers on dedicated Kubernetes nodes.
+            # In dedicated mode, resource requests and limits in the pod spec will be
+            # automatically overridden based on 'pod.staroid.com/instance-type' below.
+            pod.staroid.com/isolation: dedicated # 'sandboxed' or 'dedicated'
+
+            # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
+            # See available instance type from https://docs.staroid.com/ske/pod.html.
+            pod.staroid.com/instance-type: standard-4
    spec:
        serviceAccountName: default

@@ -222,9 +235,11 @@ worker_nodes:
        - name: dshm
          emptyDir:
              medium: Memory
+        - name: tmp-volume
+          emptyDir: {}
        - name: nfs-volume
          persistentVolumeClaim:
-            claimName: nfs
+              claimName: nfs
        containers:
        - name: ray-node
          imagePullPolicy: Always
@@ -246,16 +261,19 @@ worker_nodes:
          volumeMounts:
              - mountPath: /dev/shm
                name: dshm
+              - mountPath: /tmp
+                name: tmp-volume
              - mountPath: /nfs
                name: nfs-volume
          resources:
              requests:
-                  cpu: 1000m
-                  memory: 2Gi
+                  cpu: 4000m
+                  memory: 8Gi
              limits:
+                  cpu: 4000m
                  # This memory limit will be detected by ray and split into
                  # 30% for plasma, and 70% for workers.
-                  memory: 2Gi
+                  memory: 8Gi
          env:
              # This is used in the head_start_ray_commands below so that
              # Ray can spawn the correct number of processes. Omitting this
@@ -263,7 +281,7 @@ worker_nodes:
              - name: MY_CPU_REQUEST
                valueFrom:
                    resourceFieldRef:
-                        resource: requests.cpu
+                        resource: limits.cpu

 # Files or directories to copy to the head and worker nodes. The format is a
 # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -0,0 +1,292 @@
+# A unique identifier for the head node and workers of this cluster.
+# A namespace will be automatically created for each cluster_name in SKE.
+cluster_name: default # name with 'a-z' and '-'
+
+# The minimum number of workers nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of workers nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 5
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Kubernetes resources that need to be configured for the autoscaler to be
+# able to manage the Ray cluster. If any of the provided resources don't
+# exist, the autoscaler will attempt to create them. If this fails, you may
+# not have the required permissions and will have to request them to be
+# created by your cluster administrator.
+provider:
+    type: staroid
+
+    # Access token for Staroid from https://staroid.com/settings/accesstokens.
+    # Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    access_token:
+
+    # Staroid account to use. e.g. GITHUB/staroids
+    # Alternatively, you can set STAROID_ACCOUNT environment variable.
+    # Leave empty to select default account for given access token.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    account:
+
+    # Name of a Staroid Kubernetes Engine (SKE) instance.
+    # Alternatively, you can set STAROID_SKE environment variable.
+    # An SKE is a virtualized Kubernetes cluster.
+    # A new SKE will be created if it does not already exist.
+    ske: "Ray cluster"
+
+    # Cloud and region in which to create the SKE when it does not exist.
+    # If the SKE already exists, this value will be ignored.
+    # Supported cloud regions can be found at
+    # https://docs.staroid.com/ske/cloudregion.html.
+    ske_region: "aws us-west2"
+
+    # To create a namespace in SKE, you need to specify a Github project.
+    # The Github project needs to have a staroid.yaml
+    # (https://docs.staroid.com/references/staroid_yaml.html).
+    # staroid.yaml defines various resources for the project, such as
+    #   - Container images to build, accessible from the namespace
+    #   - Kubernetes resources to create (like Persistent volume claim)
+    #     on namespace creation
+    # You can fork when you need to customize.
+    #   1. Fork github.com/open-datastudio/ray-cluster
+    #   2. Change contents
+    #   3. Connect forked repository (https://staroid.com/projects/settings)
+    #   4. Release your customized branch
+    #      4-1. Select project from 'My projects' menu
+    #      4-2. Select your branch in 'Release' tab
+    #      4-3. After build success, switch to 'Production'
+    #      4-4. Switch Launch permission to 'Public' if required
+    #   5. Change 'project' field to point to your
+    #      repository and branch in this file
+    project: "GITHUB/open-datastudio/ray-cluster:master"
+
+    # 'spec.containers.image' field for ray-node and ray-worker will be
+    # overridden by the image built from the 'project' field above.
+    # Set this value to 'false' to not override the image.
+    image_from_project: true
+
+    # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
+    # 'project' field above provides docker image for each python version.
+    # Fork 'project' if you'd like to support other python versions.
+    python_version: 3.7.7
+
+    # Exposing external IP addresses for ray pods isn't currently supported.
+    use_internal_ips: true
+
+# Kubernetes pod config for the head node pod.
+head_node:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-head-
+
+        # Must match the head node service selector above if a head node
+        # service is required.
+        labels:
+            component: ray-head
+
+            # Whether to place this Pod on a spot instance.
+            # https://docs.staroid.com/ske/pod.html
+            pod.staroid.com/spot: "false" # use on-demand instance for head.
+
+            # Whether to place the ray head on a dedicated Kubernetes node.
+            # 'sandboxed' (default) or 'dedicated'.
+            pod.staroid.com/isolation: dedicated
+
+            # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
+            # See available instance type from https://docs.staroid.com/ske/pod.html.
+            pod.staroid.com/instance-type: gpu-1
+    spec:
+        automountServiceAccountToken: true
+
+        # Restarting the head node automatically is not currently supported.
+        # If the head node goes down, `ray up` must be run again.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+        - name: tmp-volume
+          emptyDir: {}
+        # nfs volume provides a shared volume across all ray-nodes.
+        - name: nfs-volume
+          persistentVolumeClaim:
+              claimName: nfs
+
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          #   - screen (used for `ray attach`)
+          #   - kubectl (used by the autoscaler to manage worker pods)
+          # Image will be overridden when 'image_from_project' is true.
+          image: rayproject/autoscaler
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 6379 # Redis port.
+              - containerPort: 6380 # Redis port.
+              - containerPort: 6381 # Redis port.
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /tmp
+                name: tmp-volume
+              - mountPath: /nfs
+                name: nfs-volume
+          resources:
+              # When 'pod.staroid.com/isolation' is 'dedicated', the cpu and
+              # memory requests/limits in the resources field will be
+              # automatically configured based on
+              # 'pod.staroid.com/instance-type'.
+              requests:
+                  cpu: 4000m
+                  memory: 8Gi
+              limits:
+                  cpu: 4000m
+                  # The maximum memory that this pod is allowed to use. The
+                  # limit will be detected by ray and split to use 10% for
+                  # redis, 30% for the shared memory object store, and the
+                  # rest for application memory. If this limit is not set and
+                  # the object store size is not set manually, ray will
+                  # allocate a very large object store in each pod that may
+                  # cause problems for other pods.
+                  memory: 8Gi
+          env:
+              # This is used in the head_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: limits.cpu
+              - name: RAY_ADDRESS
+                value: "auto"
+
+# Kubernetes pod config for worker node pods.
+worker_nodes:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-worker-
+
+        # Must match the worker node service selector above if a worker node
+        # service is required.
+        labels:
+            component: ray-worker
+
+            # Whether to place this Pod on a spot instance.
+            # https://docs.staroid.com/ske/pod.html
+            pod.staroid.com/spot: "true" # use spot instance for workers.
+
+            # Whether to place ray workers on dedicated Kubernetes nodes.
+            # 'sandboxed' (default) or 'dedicated'.
+            pod.staroid.com/isolation: dedicated
+
+            # Instance type to use in 'dedicated' mode, such as 'standard-4', 'gpu-1'.
+            # See available instance type from https://docs.staroid.com/ske/pod.html.
+            pod.staroid.com/instance-type: gpu-1
+    spec:
+        serviceAccountName: default
+
+        # Worker nodes will be managed automatically by the head node, so
+        # do not change the restart policy.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+        - name: tmp-volume
+          emptyDir: {}
+        - name: nfs-volume
+          persistentVolumeClaim:
+              claimName: nfs
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          image: rayproject/autoscaler
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /tmp
+                name: tmp-volume
+              - mountPath: /nfs
+                name: nfs-volume
+          resources:
+              # When 'pod.staroid.com/isolation' is 'dedicated', the cpu and
+              # memory requests/limits in the resources field will be
+              # automatically configured based on
+              # 'pod.staroid.com/instance-type'.
+              requests:
+                  cpu: 4000m
+                  memory: 8Gi
+              limits:
+                  cpu: 4000m
+                  # This memory limit will be detected by ray and split into
+                  # 30% for plasma, and 70% for workers.
+                  memory: 8Gi
+          env:
+              # This is used in the head_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: limits.cpu
@@ -1,9 +1,9 @@
 # An unique identifier for the head node and workers of this cluster.
-cluster_name: minimal
+cluster_name: minimal # name with 'a-z' and '-'

 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
-max_workers: 1
+max_workers: 5

 # Kubernetes resources that need to be configured for the autoscaler to be
 # able to manage the Ray cluster. If any of the provided resources don't
@@ -46,8 +46,8 @@ provider:
    #   - Kubernetes resources to create (like Persistent volume claim)
    #     on namespace creation
    # You can fork when you need to customize.
-    #   1. Fork github.com/open-datastudio/ray
-    #   2. Change .staroid/ directory to cutomize
+    #   1. Fork github.com/open-datastudio/ray-cluster
+    #   2. Change contents
    #   3. Connect forked repository (https://staroid.com/projects/settings)
    #   4. Release your customized branch
    #      4-1. Select project from 'My projects' menu
@@ -56,7 +56,7 @@ provider:
    #      4-4. Switch Launch permission to 'Public' if required
    #   5. Change 'project' field to point your 
    #      repository and branch in this file
-    project: "GITHUB/open-datastudio/ray:master-staroid"
+    project: "GITHUB/open-datastudio/ray-cluster:master"

    # 'spec.containers.image' field for ray-node and ray-worker will be
    # overrided by the image built from the 'project' field above.
@@ -0,0 +1,113 @@
+# an example of configuring a mixed-node-type cluster.
+cluster_name: multi-node-type # name with 'a-z' and '-'
+min_workers: 1
+max_workers: 40
+
+# Cloud-provider specific configuration.
+provider:
+    type: staroid
+    access_token:
+    account:
+    ske: "Ray cluster"
+    ske_region: "aws us-west2"
+    project: "GITHUB/open-datastudio/ray-cluster:master"
+    image_from_project: true
+    python_version: 3.7.7
+    use_internal_ips: true
+
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    cpu_2_ondemand:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "false"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: standard-2
+        resources: {"CPU": 2}
+        max_workers: 10
+    cpu_4_ondemand:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "false"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: standard-4
+        resources: {"CPU": 4}
+        max_workers: 10
+    cpu_8_ondemand:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "false"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: standard-8
+        resources: {"CPU": 8}
+        max_workers: 10
+    gpu_1_ondemand:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "false"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: gpu-1
+        resources: {"CPU": 8, "GPU": 1, "accelerator_type:V100": 1}
+        max_workers: 10
+    cpu_2_spot:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "true"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: standard-2
+        resources: {"CPU": 2}
+        max_workers: 10
+    cpu_4_spot:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "true"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: standard-4
+        resources: {"CPU": 4}
+        max_workers: 10
+    cpu_8_spot:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "true"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: standard-8
+        resources: {"CPU": 8}
+        max_workers: 10
+        # worker_setup_commands:
+        #    - pip install tensorflow-gpu  # Example command.
+    gpu_1_spot:
+        node_config:
+            metadata:
+                labels:
+                    pod.staroid.com/spot: "true"
+                    pod.staroid.com/isolation: dedicated
+                    pod.staroid.com/instance-type: gpu-1
+        resources: {"CPU": 8, "GPU": 1, "accelerator_type:V100": 1}
+        max_workers: 10
+
+# Specify the node type of the head node (as configured above).
+head_node_type: cpu_4_ondemand
+
+# Specify the default type of the worker node (as configured above).
+worker_default_node_type: cpu_4_spot
+
+# The default settings for the head node. This will be merged with the per-node
+# type configs given above.
+#head_node:
+
+# The default settings for worker nodes. This will be merged with the per-node
+# type configs given above.
+#worker_nodes:
+
+# Configure the cluster for very conservative auto-scaling otherwise.
+target_utilization_fraction: 0.9
+idle_timeout_minutes: 5