[autoscaler] Staroid node provider (#10956)

2026-06-28 13:19:38 +08:00 · 2020-09-22 21:25:29 -07:00
parent 4872ffb44c
commit df4c3abe30
9 changed files with 859 additions and 1 deletions
@@ -23,4 +23,8 @@ if [ "X$pod" = "X-l" ]; then
    shift
 fi

-exec kubectl "$namespace" exec -i "$pod" -- "$@"
+if [ -z "$KUBE_API_SERVER" ]; then
+  exec kubectl "$namespace" exec -i "$pod" -- "$@"
+else
+  exec kubectl --server "$KUBE_API_SERVER" "$namespace" exec -i "$pod" -- "$@"
+fi
@@ -0,0 +1 @@
+# Prefix prepended to the log messages emitted by the Staroid node provider.
+log_prefix = "StaroidNodeProvider: "
@@ -0,0 +1,37 @@
+import os
+from ray.autoscaler._private.command_runner import KubernetesCommandRunner
+
+
+class StaroidCommandRunner(KubernetesCommandRunner):
+    """Kubernetes command runner adapted to Staroid pods.
+
+    Compared to the base runner it can point kubectl at an explicit API
+    server and it maps a leading "~" of rsync targets to ``/home/ray``.
+    """
+
+    # Directory substituted for "~" in rsync targets.
+    _HOME_DIR = "/home/ray"
+
+    def __init__(self,
+                 log_prefix,
+                 namespace,
+                 node_id,
+                 auth_config,
+                 process_runner,
+                 kube_api_server=None):
+        """Create the runner, optionally bound to a specific API server.
+
+        Args:
+            log_prefix, namespace, node_id, auth_config, process_runner:
+                forwarded unchanged to KubernetesCommandRunner.
+            kube_api_server: address of the Kubernetes API server. When
+                given, it is appended to the kubectl command line as
+                ``--server`` and exported as KUBE_API_SERVER.
+        """
+        super().__init__(log_prefix, namespace, node_id, auth_config,
+                         process_runner)
+
+        if kube_api_server is not None:
+            self.kubectl.extend(["--server", kube_api_server])
+            # Process-wide side effect: the variable is visible to every
+            # child process started afterwards (presumably read by the
+            # kubectl rsync wrapper script -- TODO confirm).
+            os.environ["KUBE_API_SERVER"] = kube_api_server
+
+    def _rewrite_target_home_dir(self, target):
+        """Replace a leading "~" of ``target`` with ``/home/ray``.
+
+        Staroid forces containers to run with non-root permission, so the
+        home directory is rewritten explicitly to /home/ray here.
+        Paths that do not start with "~/" (and are not exactly "~") are
+        returned unchanged, including "~user/..." forms.
+        """
+        if target == "~":
+            return self._HOME_DIR
+        if target.startswith("~/"):
+            return self._HOME_DIR + target[1:]
+        return target
+
+    def run_rsync_up(self, source, target, options=None):
+        """Rsync ``source`` to ``target`` after rewriting the target home."""
+        target = self._rewrite_target_home_dir(target)
+        super().run_rsync_up(source, target, options)
+
+    def run_rsync_down(self, source, target, options=None):
+        """Rsync ``source`` to ``target`` after rewriting the target home.
+
+        NOTE(review): the rewrite is applied to ``target`` here as well; if
+        ``source`` is the path inside the pod for downloads, it may be the
+        one that needs rewriting -- confirm against the base class.
+        """
+        target = self._rewrite_target_home_dir(target)
+        super().run_rsync_down(source, target, options)
@@ -0,0 +1,387 @@
+import os
+import logging
+import time
+import requests
+from staroid import Staroid
+from kubernetes import client, config
+import socket
+from contextlib import closing
+
+from uuid import uuid4
+from kubernetes.client.rest import ApiException
+
+from ray.autoscaler._private.staroid.command_runner import StaroidCommandRunner
+from ray.autoscaler._private.staroid import log_prefix
+from ray.autoscaler.node_provider import NodeProvider
+from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
+
+logger = logging.getLogger(__name__)
+
+
+def find_free_port():
+    """Return a TCP port number that the OS reported as free on localhost.
+
+    A throwaway socket is bound to port 0 so that the OS picks the port. The
+    socket is closed before returning, so nothing reserves the port
+    afterwards.
+    """
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with closing(probe):
+        probe.bind(("localhost", 0))
+        probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        _, port = probe.getsockname()
+        return port
+
+
+def to_label_selector(tags):
+    """Render ``tags`` as a comma separated list of ``key=value`` pairs.
+
+    Pairs appear in the iteration order of the mapping; an empty mapping
+    gives an empty string.
+    """
+    pairs = ["{}={}".format(key, value) for key, value in tags.items()]
+    return ",".join(pairs)
+
+
+class StaroidNodeProvider(NodeProvider):
+    """Node provider that runs Ray nodes as pods in a Staroid namespace.
+
+    The Ray cluster name is used as the Staroid namespace (instance) name
+    inside the configured SKE. Pods are managed through the Kubernetes API,
+    reached either with the in-cluster service account or through a local
+    tunnel opened with the Staroid client.
+    """
+
+    # Seconds to wait for the namespace to start and, separately, for the
+    # tunnel to answer.
+    _CONNECT_TIMEOUT_S = 300
+    # Seconds to sleep between two polls while waiting.
+    _POLL_INTERVAL_S = 3
+    # Fixed port number for kube api access through the shell service in
+    # staroid.
+    _REMOTE_KUBE_API_PORT = 57683
+    # Directory checked to decide whether in-cluster config can be used.
+    _SERVICE_ACCOUNT_DIR = "/var/run/secrets/kubernetes.io/serviceaccount"
+    # Variables copied from this process' environment into the head
+    # container.
+    _HEAD_ENV_VARS = ("STAROID_ACCESS_TOKEN", "STAROID_ACCOUNT",
+                      "STAROID_SKE")
+
+    def __init__(self, provider_config, cluster_name):
+        """Create the Staroid client and resolve the SKE settings.
+
+        Args:
+            provider_config: provider section of the cluster config. Must
+                contain the "access_token" and "account" keys; "ske" and
+                "ske_region" fall back to the STAROID_SKE and
+                STAROID_SKE_REGION environment variables.
+            cluster_name: name of the Ray cluster, also used as the Staroid
+                instance name.
+        """
+        NodeProvider.__init__(self, provider_config, cluster_name)
+        # instance name -> {"kube_client": ApiClient, "api_server": str|None}
+        self.__cached = {}
+
+        self.__star = Staroid(
+            access_token=provider_config["access_token"],
+            account=provider_config["account"])
+
+        self.__ske = self._get_config_or_env(provider_config, "ske",
+                                             "STAROID_SKE")
+        self.__ske_region = self._get_config_or_env(
+            provider_config, "ske_region", "STAROID_SKE_REGION")
+
+    def _get_config_or_env(self, config, config_key, env_name):
+        """Return ``config[config_key]``, else ``$env_name``, else None.
+
+        A config value that is missing or None does not override the
+        environment variable.
+        """
+        value = config.get(config_key)
+        if value is not None:
+            return value
+        return os.environ.get(env_name)
+
+    def _core_api(self):
+        """Return a CoreV1Api bound to the cached client of this cluster.
+
+        Raises:
+            KeyError: if no connection is cached for ``self.cluster_name``.
+        """
+        kube_client = self.__cached[self.cluster_name]["kube_client"]
+        return client.CoreV1Api(kube_client)
+
+    def _connect_kubeapi_incluster(self, instance_name):
+        """Connect with the mounted service account, if there is one.
+
+        Returns:
+            The Kubernetes ApiClient, or None when the service account
+            directory does not exist. On success the client is cached under
+            ``instance_name`` and ``self.namespace`` is set from the mounted
+            namespace file.
+        """
+        if not os.path.isdir(self._SERVICE_ACCOUNT_DIR):
+            return None
+
+        kube_conf = config.load_incluster_config()
+        kube_client = client.ApiClient(kube_conf)
+
+        namespace_file = os.path.join(self._SERVICE_ACCOUNT_DIR, "namespace")
+        with open(namespace_file, "r") as file:
+            namespace = file.read().replace("\n", "")
+
+        self.__cached[instance_name] = {
+            "kube_client": kube_client,
+            "api_server": None
+        }
+        self.namespace = namespace
+        return kube_client
+
+    def _connect_kubeapi(self, instance_name):
+        """Return a Kubernetes ApiClient for ``instance_name``, or None.
+
+        A cached client is reused. Otherwise the in-cluster configuration is
+        tried first; failing that, a tunnel to the kube api of the Staroid
+        namespace is started and polled until ``/version`` answers 200.
+
+        None is returned when the SKE or the namespace does not exist, the
+        namespace is not ACTIVE, it does not reach the RUNNING phase in
+        time, or the tunnel does not answer in time. Failures are never
+        cached, so a later call tries again.
+        """
+        cached = self.__cached.get(instance_name)
+        if cached is not None:
+            return cached["kube_client"]
+
+        # try incluster configuration first
+        kube_client = self._connect_kubeapi_incluster(instance_name)
+        if kube_client is not None:
+            return kube_client
+
+        # check if ske exists
+        cluster_api = self.__star.cluster()
+        ske = cluster_api.get(self.__ske)
+        if ske is None:  # ske not exists
+            return None
+
+        # check if ray cluster instance exists
+        ns_api = self.__star.namespace(ske)
+        ns = ns_api.get(instance_name)
+        if ns is None:  # instance not exists
+            return None
+
+        # check if staroid namespace is not PAUSED (stopped)
+        # or INACTIVE (terminated)
+        if ns.status() != "ACTIVE":
+            return None
+
+        # wait for the staroid namespace to be started
+        timeout = self._CONNECT_TIMEOUT_S
+        start_time = time.time()
+        started = False
+        while time.time() - start_time < timeout:
+            if ns.phase() == "RUNNING":
+                started = True
+                break
+            time.sleep(self._POLL_INTERVAL_S)
+            ns = ns_api.get(instance_name)
+            if ns is None:  # instance disappeared while waiting
+                break
+
+        if not started:
+            logger.info(log_prefix + "fail to start namespace")
+            return None
+
+        # start a shell service to create secure tunnel
+        ns_api.shell_start(instance_name)
+
+        local_port = find_free_port()
+
+        # start a secure tunnel
+        ns_api.start_tunnel(instance_name, [
+            "{}:localhost:{}".format(local_port, self._REMOTE_KUBE_API_PORT)
+        ])
+
+        # wait for tunnel to be established by checking /version
+        local_kube_api_addr = "http://localhost:{}".format(local_port)
+        start_time = time.time()
+        established = False
+        while time.time() - start_time < timeout:
+            try:
+                r = requests.get(
+                    "{}/version".format(local_kube_api_addr), timeout=(3, 5))
+                if r.status_code == 200:
+                    established = True
+                    break
+            except requests.exceptions.RequestException:
+                # Covers refused connections as well as read timeouts
+                # while the tunnel is still coming up; keep polling.
+                pass
+            time.sleep(self._POLL_INTERVAL_S)
+
+        if not established:
+            logger.info(log_prefix + "fail to establish tunnel")
+            # Do not cache the failure: a None entry would break every
+            # later lookup that subscripts the cached value.
+            self.__cached.pop(instance_name, None)
+            return None
+
+        kube_conf = client.Configuration()
+        kube_conf.host = local_kube_api_addr
+        kube_client = client.ApiClient(kube_conf)
+        self.__cached[instance_name] = {
+            "kube_client": kube_client,
+            "api_server": local_kube_api_addr
+        }
+        self.namespace = ns.namespace()
+        return kube_client
+
+    def non_terminated_nodes(self, tag_filters):
+        """Return the names of the pods matching ``tag_filters``.
+
+        The cluster name tag is always added to the filter. The caller's
+        dict is left untouched. An empty list is returned when the
+        Kubernetes API cannot be reached.
+        """
+        instance_name = self.cluster_name
+
+        kube_client = self._connect_kubeapi(instance_name)
+        if kube_client is None:
+            return []
+        core_api = client.CoreV1Api(kube_client)
+
+        # Match pods that are in the 'Pending' or 'Running' phase.
+        # Unfortunately there is no OR operator in field selectors, so we
+        # have to match on NOT any of the other phases.
+        field_selector = ",".join([
+            "status.phase!=Failed",
+            "status.phase!=Unknown",
+            "status.phase!=Succeeded",
+            "status.phase!=Terminating",
+        ])
+
+        filters = dict(tag_filters or {})
+        filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name
+        pod_list = core_api.list_namespaced_pod(
+            self.namespace,
+            field_selector=field_selector,
+            label_selector=to_label_selector(filters))
+
+        return [pod.metadata.name for pod in pod_list.items]
+
+    def is_running(self, node_id):
+        """Return True when the pod phase is "Running"."""
+        pod = self._core_api().read_namespaced_pod_status(
+            node_id, self.namespace)
+        return pod.status.phase == "Running"
+
+    def is_terminated(self, node_id):
+        """Return True when the pod phase is neither Running nor Pending."""
+        pod = self._core_api().read_namespaced_pod_status(
+            node_id, self.namespace)
+        return pod.status.phase not in ["Running", "Pending"]
+
+    def node_tags(self, node_id):
+        """Return the labels of the pod."""
+        pod = self._core_api().read_namespaced_pod_status(
+            node_id, self.namespace)
+        return pod.metadata.labels
+
+    def external_ip(self, node_id):
+        """Not supported: always raises NotImplementedError."""
+        raise NotImplementedError("Must use internal IPs with Kubernetes.")
+
+    def internal_ip(self, node_id):
+        """Return the pod IP reported in the pod status."""
+        pod = self._core_api().read_namespaced_pod_status(
+            node_id, self.namespace)
+        return pod.status.pod_ip
+
+    def set_node_tags(self, node_id, tags):
+        """Merge ``tags`` into the pod labels and patch the pod."""
+        core_api = self._core_api()
+
+        pod = core_api.read_namespaced_pod_status(node_id, self.namespace)
+        pod.metadata.labels.update(tags)
+        core_api.patch_namespaced_pod(node_id, self.namespace, pod)
+
+    def _find_project_image(self, apps_api):
+        """Return the project image for the configured python version.
+
+        Reads the "ray-images" deployment of the namespace and returns the
+        image of the first container whose image name contains the python
+        version with dots replaced by dashes, or None when none matches.
+        """
+        ray_images = apps_api.read_namespaced_deployment(
+            name="ray-images", namespace=self.namespace)
+        # str(): a YAML value such as 3.7 is loaded as a float.
+        py_ver = str(self.provider_config["python_version"]).replace(
+            ".", "-")
+        for c in ray_images.spec.template.spec.containers:
+            if py_ver in c.image:
+                return c.image
+        return None
+
+    def create_node(self, node_config, tags, count):
+        """Create ``count`` pods (and optional services) from ``node_config``.
+
+        The SKE and the namespace are created first when needed, and a
+        paused namespace is started again. ``node_config`` and ``tags`` are
+        copied, so the caller's objects are not modified.
+
+        Args:
+            node_config: pod spec, or a dict with a "pod" spec and an
+                optional "service" spec.
+            tags: labels to put on the created pods.
+            count: number of pods to create.
+
+        Raises:
+            Exception: if the SKE or the namespace cannot be created, or if
+                the Kubernetes API of the namespace cannot be reached.
+        """
+        import copy
+
+        instance_name = self.cluster_name
+
+        # get or create ske
+        cluster_api = self.__star.cluster()
+        ske = cluster_api.create(self.__ske, self.__ske_region)
+        if ske is None:
+            raise Exception("Failed to create an SKE '{}' in '{}' region"
+                            .format(self.__ske, self.__ske_region))
+
+        # create a namespace
+        ns_api = self.__star.namespace(ske)
+        ns = ns_api.create(
+            instance_name,
+            self.provider_config["project"],
+
+            # Configure 'start-head' param to 'false'.
+            # head node will be created using Kubernetes api.
+            params=[{
+                "group": "Misc",
+                "name": "start-head",
+                "value": "false"
+            }])
+        if ns is None:
+            raise Exception("Failed to create a cluster '{}' in SKE '{}'"
+                            .format(instance_name, self.__ske))
+
+        # 'ray down' will change staroid namespace status to "PAUSE"
+        # in this case we need to start namespace again.
+        if ns.status() == "PAUSE":
+            ns = ns_api.start(instance_name)
+
+        # kube client
+        kube_client = self._connect_kubeapi(instance_name)
+        if kube_client is None:
+            # Without this check the Kubernetes client would silently fall
+            # back to its default configuration.
+            raise Exception(
+                "Failed to connect to the Kubernetes API of cluster '{}' "
+                "in SKE '{}'".format(instance_name, self.__ske))
+        core_api = client.CoreV1Api(kube_client)
+        apps_api = client.AppsV1Api(kube_client)
+
+        # retrieve container image
+        image = None
+        if self.provider_config.get("image_from_project"):
+            image = self._find_project_image(apps_api)
+            logger.info(log_prefix + "use image {}".format(image))
+
+        # Deep copy: the nested metadata, env and service dicts are edited
+        # below and must not leak into the shared cluster config.
+        conf = copy.deepcopy(node_config)
+        pod_spec = conf.get("pod", conf)
+        service_spec = conf.get("service")
+        node_uuid = str(uuid4())
+
+        tags = dict(tags)
+        tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
+        tags["ray-node-uuid"] = node_uuid
+
+        pod_metadata = pod_spec["metadata"]
+        pod_metadata["namespace"] = self.namespace
+        labels = pod_metadata.get("labels") or {}
+        labels.update(tags)
+        pod_metadata["labels"] = labels
+
+        # NOTE(review): the environment is only forwarded when the image is
+        # overridden, as in the original nesting -- confirm this is intended.
+        if image is not None:
+            for c in pod_spec["spec"]["containers"]:
+                if c["name"] != "ray-node":
+                    continue
+                c["image"] = image
+
+                if labels.get("ray-node-type") != "head":
+                    continue
+                env = c.get("env")
+                if env is None:
+                    env = c["env"] = []
+                for name in self._HEAD_ENV_VARS:
+                    if name in os.environ:
+                        env.append({
+                            "name": name,
+                            "value": os.environ[name]
+                        })
+
+        logger.info(log_prefix + "calling create_namespaced_pod "
+                    "(count={}).".format(count))
+        new_nodes = [
+            core_api.create_namespaced_pod(self.namespace, pod_spec)
+            for _ in range(count)
+        ]
+
+        if service_spec is not None:
+            logger.info(log_prefix + "calling create_namespaced_service "
+                        "(count={}).".format(count))
+
+            for new_node in new_nodes:
+                # One service per pod, named after the pod.
+                svc_metadata = service_spec.get("metadata", {})
+                svc_metadata["name"] = new_node.metadata.name
+                service_spec["metadata"] = svc_metadata
+                service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid}
+                core_api.create_namespaced_service(self.namespace,
+                                                   service_spec)
+
+    def terminate_node(self, node_id):
+        """Delete the pod and its service; stop the namespace for a head.
+
+        Deleting the service is best effort. When ``node_id`` starts with
+        "ray-head", the cached connection is dropped and the tunnel and the
+        Staroid namespace are stopped.
+        """
+        logger.info(log_prefix + "calling delete_namespaced_pod")
+        core_api = self._core_api()
+
+        core_api.delete_namespaced_pod(node_id, self.namespace)
+        try:
+            core_api.delete_namespaced_service(node_id, self.namespace)
+        except ApiException as e:
+            # Best effort: failing to delete the service must not abort
+            # the termination.
+            logger.debug(log_prefix + "service for {} not deleted: {}".format(
+                node_id, e))
+
+        if node_id.startswith("ray-head"):
+            # Stop namespace on staroid after remove ray-head node.
+            instance_name = self.cluster_name
+
+            cluster_api = self.__star.cluster()
+            ske = cluster_api.get(self.__ske)
+
+            ns_api = self.__star.namespace(ske)
+            # NOTE(review): the result is not used; kept in case the call
+            # matters to the Staroid client -- confirm and remove.
+            ns_api.get(instance_name)
+
+            self.__cached.pop(instance_name, None)
+
+            ns_api.stop_tunnel(instance_name)
+            ns_api.stop(instance_name)
+
+    def terminate_nodes(self, node_ids):
+        """Terminate every node of ``node_ids``, one after the other."""
+        for node_id in node_ids:
+            self.terminate_node(node_id)
+
+    def get_command_runner(self,
+                           log_prefix,
+                           node_id,
+                           auth_config,
+                           cluster_name,
+                           process_runner,
+                           use_internal_ip,
+                           docker_config=None):
+        """Return a StaroidCommandRunner for ``node_id``.
+
+        The connection is looked up under ``self.cluster_name``, the key
+        used when it is cached. ``cluster_name``, ``use_internal_ip`` and
+        ``docker_config`` are not used by this implementation.
+
+        Raises:
+            Exception: if the Kubernetes API of the cluster cannot be
+                reached.
+        """
+        instance_name = self.cluster_name
+
+        # initialize connection
+        self._connect_kubeapi(instance_name)
+        cached = self.__cached.get(instance_name)
+        if cached is None:
+            raise Exception(
+                "Failed to connect to the Kubernetes API of cluster '{}'"
+                .format(instance_name))
+
+        return StaroidCommandRunner(log_prefix, self.namespace, node_id,
+                                    auth_config, process_runner,
+                                    cached["api_server"])
+
+    @staticmethod
+    def bootstrap_config(cluster_config):
+        """Return ``cluster_config`` unchanged."""
+        return cluster_config
@@ -43,6 +43,12 @@ def _import_kubernetes(provider_config):
    return KubernetesNodeProvider


+def _import_staroid(provider_config):
+    """Import the Staroid provider on demand and return its class.
+
+    ``provider_config`` is not used.
+    """
+    from ray.autoscaler._private.staroid import node_provider
+    return node_provider.StaroidNodeProvider
+
+
 def _load_local_example_config():
    import ray.autoscaler.local as ray_local
    return os.path.join(
@@ -71,6 +77,12 @@ def _load_azure_example_config():
        os.path.dirname(ray_azure.__file__), "example-full.yaml")


+def _load_staroid_example_config():
+    """Return the path of example-full.yaml in ray.autoscaler.staroid."""
+    import ray.autoscaler.staroid as ray_staroid
+    package_dir = os.path.dirname(ray_staroid.__file__)
+    return os.path.join(package_dir, "example-full.yaml")
+
+
 def _import_external(provider_config):
    provider_cls = _load_class(path=provider_config["module"])
    return provider_cls
@@ -81,6 +93,7 @@ _NODE_PROVIDERS = {
    "aws": _import_aws,
    "gcp": _import_gcp,
    "azure": _import_azure,
+    "staroid": _import_staroid,
    "kubernetes": _import_kubernetes,
    "external": _import_external  # Import an external module
 }
@@ -90,6 +103,7 @@ _PROVIDER_PRETTY_NAMES = {
    "aws": "AWS",
    "gcp": "GCP",
    "azure": "Azure",
+    "staroid": "Staroid",
    "kubernetes": "Kubernetes",
    "external": "External"
 }
@@ -99,6 +113,7 @@ _DEFAULT_CONFIGS = {
    "aws": _load_aws_example_config,
    "gcp": _load_gcp_example_config,
    "azure": _load_azure_example_config,
+    "staroid": _load_staroid_example_config,
    "kubernetes": _load_kubernetes_example_config,
 }

@@ -0,0 +1,312 @@
+# A unique identifier for the head node and workers of this cluster.
+# A namespace will be automatically created for each cluster_name in SKE.
+cluster_name: default
+
+# The minimum number of workers nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of workers nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+#   we would start more workers, we start at least enough to bring us to
+#   initial_workers.
+autoscaling_mode: default
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Kubernetes resources that need to be configured for the autoscaler to be
+# able to manage the Ray cluster. If any of the provided resources don't
+# exist, the autoscaler will attempt to create them. If this fails, you may
+# not have the required permissions and will have to request them to be
+# created by your cluster administrator.
+provider:
+    type: staroid
+
+    # Access token for Staroid from https://staroid.com/settings/accesstokens.
+    # Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    access_token:
+
+    # Staroid account to use. e.g. GITHUB/staroids
+    # Alternatively, you can set STAROID_ACCOUNT environment variable.
+    # Leave empty to select default account for given access token.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    account:
+
+    # Name of a Staroid Kubernetes Engine (SKE) instance.
+    # Alternatively, you can set STAROID_SKE environment variable.
+    # An SKE is a virtualized Kubernetes cluster.
+    # A new SKE will be created if it does not exist.
+    ske: "Ray cluster"
+
+    # Cloud and Region to create an SKE when not exists.
+    # If SKE already exists, this value will be ignored.
+    # Supported cloud region can be found
+    # https://docs.staroid.com/ske/cloudregion.html.
+    ske_region: "aws us-west2"
+
+    # To create a namespace in SKE, you need to specify a Github project.
+    # The Github project needs to have a staroid.yaml
+    # (https://docs.staroid.com/references/staroid_yaml.html).
+    # staroid.yaml defines various resources for the project, such as
+    #   - Building container images can be accessed from the namespace
+    #   - Kubernetes resources to create (like Persistent volume claim)
+    #     on namespace creation
+    # You can fork when you need to customize.
+    #   1. Fork github.com/open-datastudio/ray
+    #   2. Change .staroid/ directory to customize
+    #   3. Connect forked repository (https://staroid.com/projects/settings)
+    #   4. Release your customized branch
+    #      4-1. Select project from 'My projects' menu
+    #      4-2. Select your branch in 'Release' tab
+    #      4-3. After build success, switch to 'Production'
+    #      4-4. Switch Launch permission to 'Public' if required
+    #   5. Change 'project' field to point to your
+    #      repository and branch in this file
+    project: "GITHUB/open-datastudio/ray:master-staroid"
+
+    # 'spec.containers.image' field for ray-node and ray-worker will be
+    # overridden by the image built from the 'project' field above.
+    # Set this value to 'false' to not override the image.
+    image_from_project: true
+
+    # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
+    # 'project' field above provides docker image for each python version.
+    # Fork 'project' if you'd like to support other python versions.
+    python_version: 3.7.7
+
+    # Exposing external IP addresses for ray pods isn't currently supported.
+    use_internal_ips: true
+
+# Kubernetes pod config for the head node pod.
+head_node:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-head-
+
+        # Must match the head node service selector above if a head node
+        # service is required.
+        labels:
+            component: ray-head
+
+            # https://docs.staroid.com/ske/pod.html#pod
+            pod.staroid.com/spot: "false" # use on-demand instance for head.
+
+            # Uncomment to locate ray head to dedicated Kubernetes node
+            # (GPU instance is only available for 'dedicated' isolation)
+            #pod.staroid.com/isolation: dedicated
+            #pod.staroid.com/instance-type: gpu-1
+    spec:
+        automountServiceAccountToken: true
+
+        # Restarting the head node automatically is not currently supported.
+        # If the head node goes down, `ray up` must be run again.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp which cause slowdowns if is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+        # nfs volume provides a shared volume across all ray-nodes.
+        - name: nfs-volume
+          persistentVolumeClaim:
+            claimName: nfs
+
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          #   - screen (used for `ray attach`)
+          #   - kubectl (used by the autoscaler to manage worker pods)
+          # Image will be overridden when 'image_from_project' is true.
+          image: rayproject/autoscaler
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 6379 # Redis port.
+              - containerPort: 6380 # Redis port.
+              - containerPort: 6381 # Redis port.
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp which cause slowdowns if is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /nfs
+                name: nfs-volume
+          resources:
+              requests:
+                  cpu: 1000m
+                  memory: 2Gi
+              limits:
+                  # The maximum memory that this pod is allowed to use. The
+                  # limit will be detected by ray and split to use 10% for
+                  # redis, 30% for the shared memory object store, and the
+                  # rest for application memory. If this limit is not set and
+                  # the object store size is not set manually, ray will
+                  # allocate a very large object store in each pod that may
+                  # cause problems for other pods.
+                  memory: 2Gi
+          env:
+              # This is used in the head_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: requests.cpu
+              - name: RAY_ADDRESS
+                value: "auto"
+
+# Kubernetes pod config for worker node pods.
+worker_nodes:
+    apiVersion: v1
+    kind: Pod
+    metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: ray-worker-
+
+        # Must match the worker node service selector above if a worker node
+        # service is required.
+        labels:
+            component: ray-worker
+
+            # https://docs.staroid.com/ske/pod.html#pod
+            pod.staroid.com/spot: "true" # use spot instance for workers.
+
+            # Uncomment to locate ray worker to dedicated Kubernetes node
+            # (GPU instance is only available for 'dedicated' isolation)
+            #pod.staroid.com/isolation: dedicated
+            #pod.staroid.com/instance-type: gpu-1
+    spec:
+        serviceAccountName: default
+
+        # Worker nodes will be managed automatically by the head node, so
+        # do not change the restart policy.
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp which cause slowdowns if is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+              medium: Memory
+        - name: nfs-volume
+          persistentVolumeClaim:
+            claimName: nfs
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          # You are free (and encouraged) to use your own container image,
+          # but it should have the following installed:
+          #   - rsync (used for `ray rsync` commands and file mounts)
+          image: rayproject/autoscaler
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
+          ports:
+              - containerPort: 12345 # Ray internal communication.
+              - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp which cause slowdowns if is not a shared memory volume.
+          volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /nfs
+                name: nfs-volume
+          resources:
+              requests:
+                  cpu: 1000m
+                  memory: 2Gi
+              limits:
+                  # This memory limit will be detected by ray and split into
+                  # 30% for plasma, and 70% for workers.
+                  memory: 2Gi
+          env:
+              # This is used in the worker_start_ray_commands below so that
+              # Ray can spawn the correct number of processes. Omitting this
+              # may lead to degraded performance.
+              - name: MY_CPU_REQUEST
+                valueFrom:
+                    resourceFieldRef:
+                        resource: requests.cpu
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is setup.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands: []
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    # Install the staroid and kubernetes packages. The Staroid node provider,
+    # which the autoscaler uses, depends on them.
+    - pip install -q staroid kubernetes
+    # install jupyterlab
+    - pip install -q jupyterlab
+    - ln -s /nfs /home/ray/nfs
+    - bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' &
+    # show 'notebook' link in staroid management console to access jupyter notebook.
+    - 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n  name: notebook\n  annotations:\n    service.staroid.com/link: show\nspec:\n  ports:\n  - name: http\n    port: 8888\n  selector:\n    component: ray-head" | kubectl apply -f -'
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,72 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: minimal
+
+# The maximum number of workers nodes to launch in addition to the head
+# node. This takes precedence over min_workers. min_workers defaults to 0.
+max_workers: 1
+
+# Kubernetes resources that need to be configured for the autoscaler to be
+# able to manage the Ray cluster. If any of the provided resources don't
+# exist, the autoscaler will attempt to create them. If this fails, you may
+# not have the required permissions and will have to request them to be
+# created by your cluster administrator.
+provider:
+    type: staroid
+
+    # Access token for Staroid from https://staroid.com/settings/accesstokens.
+    # Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    access_token:
+
+    # Staroid account to use. e.g. GITHUB/staroids
+    # Alternatively, you can set STAROID_ACCOUNT environment variable.
+    # Leave empty to select default account for given access token.
+    # https://github.com/staroids/staroid-python#configuration
+    # for more information.
+    account:
+
+    # Name of a Staroid Kubernetes Engine (SKE) instance.
+    # Alternatively, you can set STAROID_SKE environment variable.
+    # An SKE is a virtualized Kubernetes cluster.
+    # A new SKE will be created if it does not exist.
+    ske: "Ray cluster"
+
+    # Cloud and Region to create an SKE when not exists.
+    # If SKE already exists, this value will be ignored.
+    # Supported cloud region can be found
+    # https://docs.staroid.com/ske/cloudregion.html.
+    ske_region: "aws us-west2"
+
+    # To create a namespace in SKE, you need to specify a Github project.
+    # The Github project needs to have a staroid.yaml
+    # (https://docs.staroid.com/references/staroid_yaml.html).
+    # staroid.yaml defines various resources for the project, such as
+    #   - Building container images can be accessed from the namespace
+    #   - Kubernetes resources to create (like Persistent volume claim)
+    #     on namespace creation
+    # You can fork when you need to customize.
+    #   1. Fork github.com/open-datastudio/ray
+    #   2. Change .staroid/ directory to customize
+    #   3. Connect forked repository (https://staroid.com/projects/settings)
+    #   4. Release your customized branch
+    #      4-1. Select project from 'My projects' menu
+    #      4-2. Select your branch in 'Release' tab
+    #      4-3. After build success, switch to 'Production'
+    #      4-4. Switch Launch permission to 'Public' if required
+    #   5. Change 'project' field to point to your
+    #      repository and branch in this file
+    project: "GITHUB/open-datastudio/ray:master-staroid"
+
+    # 'spec.containers.image' field for ray-node and ray-worker will be
+    # overridden by the image built from the 'project' field above.
+    # Set this value to 'false' to not override the image.
+    image_from_project: true
+
+    # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
+    # 'project' field above provides docker image for each python version.
+    # Fork 'project' if you'd like to support other python versions.
+    python_version: 3.7.7
+
+    # Exposing external IP addresses for ray pods isn't currently supported.
+    use_internal_ips: true