diff --git a/doc/source/cluster/cloud.rst b/doc/source/cluster/cloud.rst index 8fccec6a8..9257859e9 100644 --- a/doc/source/cluster/cloud.rst +++ b/doc/source/cluster/cloud.rst @@ -180,6 +180,36 @@ Test that it works by running the following commands from your local machine: .. tip:: If you would like to use Ray Tune in your Kubernetes cluster, have a look at :ref:`this short guide to make it work `. +Staroid +------- + +First, install the Staroid client package (``pip install staroid``), then get an `access token <https://staroid.com/settings/accesstokens>`_. +Once you have an access token, you are ready to launch your cluster. + +The provided `ray/python/ray/autoscaler/staroid/example-full.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/staroid/example-full.yaml>`__ cluster config file will create a cluster with: + +- a Jupyter notebook running on the head node + (Staroid management console -> Kubernetes -> ``<SKE name>`` -> ``<cluster name>`` -> Click "notebook"). +- a shared NFS volume mounted under the ``/nfs`` directory on all Ray nodes. + +Test that it works by running the following commands from your local machine: + +.. code-block:: bash + + # Configure the access token through an environment variable. + $ export STAROID_ACCESS_TOKEN=<your access token> + + # Create or update the cluster. When the command finishes, + # you can attach a screen to the head node. + $ ray up ray/python/ray/autoscaler/staroid/example-full.yaml + + # Get a remote screen on the head node. + $ ray attach ray/python/ray/autoscaler/staroid/example-full.yaml + $ # Try running a Ray program with 'ray.init(address="auto")'. + + # Tear down the cluster. + $ ray down ray/python/ray/autoscaler/staroid/example-full.yaml + .. 
_cluster-private-setup: Local On Premise Cluster (List of nodes) diff --git a/python/ray/autoscaler/_private/kubernetes/kubectl-rsync.sh b/python/ray/autoscaler/_private/kubernetes/kubectl-rsync.sh index 3bda9ca9d..361eb6d85 100755 --- a/python/ray/autoscaler/_private/kubernetes/kubectl-rsync.sh +++ b/python/ray/autoscaler/_private/kubernetes/kubectl-rsync.sh @@ -23,4 +23,8 @@ if [ "X$pod" = "X-l" ]; then shift fi -exec kubectl "$namespace" exec -i "$pod" -- "$@" +if [ -z "$KUBE_API_SERVER" ]; then + exec kubectl "$namespace" exec -i "$pod" -- "$@" +else + exec kubectl --server "$KUBE_API_SERVER" "$namespace" exec -i "$pod" -- "$@" +fi diff --git a/python/ray/autoscaler/_private/staroid/__init__.py b/python/ray/autoscaler/_private/staroid/__init__.py new file mode 100644 index 000000000..174c2cac8 --- /dev/null +++ b/python/ray/autoscaler/_private/staroid/__init__.py @@ -0,0 +1 @@ +log_prefix = "StaroidNodeProvider: " diff --git a/python/ray/autoscaler/_private/staroid/command_runner.py b/python/ray/autoscaler/_private/staroid/command_runner.py new file mode 100644 index 000000000..28f17b2a4 --- /dev/null +++ b/python/ray/autoscaler/_private/staroid/command_runner.py @@ -0,0 +1,37 @@ +import os +from ray.autoscaler._private.command_runner import KubernetesCommandRunner + + +class StaroidCommandRunner(KubernetesCommandRunner): + def __init__(self, + log_prefix, + namespace, + node_id, + auth_config, + process_runner, + kube_api_server=None): + + super(StaroidCommandRunner, self).__init__( + log_prefix, namespace, node_id, auth_config, process_runner) + + if kube_api_server is not None: + self.kubectl.extend(["--server", kube_api_server]) + os.environ["KUBE_API_SERVER"] = kube_api_server + + def _rewrite_target_home_dir(self, target): + # Staroid forces containers to run non-root permission. Ray docker + # image does not have a support for non-root user at the moment. + # Use /tmp/ray as a home directory until docker image supports + # non-root user. 
+ + if target.startswith("~/"): + return "/home/ray" + target[1:] + return target + + def run_rsync_up(self, source, target, options=None): + target = self._rewrite_target_home_dir(target) + super().run_rsync_up(source, target, options) + + def run_rsync_down(self, source, target, options=None): + target = self._rewrite_target_home_dir(target) + super().run_rsync_down(source, target, options) diff --git a/python/ray/autoscaler/_private/staroid/node_provider.py b/python/ray/autoscaler/_private/staroid/node_provider.py new file mode 100644 index 000000000..4748bc1f9 --- /dev/null +++ b/python/ray/autoscaler/_private/staroid/node_provider.py @@ -0,0 +1,387 @@ +import os +import logging +import time +import requests +from staroid import Staroid +from kubernetes import client, config +import socket +from contextlib import closing + +from uuid import uuid4 +from kubernetes.client.rest import ApiException + +from ray.autoscaler._private.staroid.command_runner import StaroidCommandRunner +from ray.autoscaler._private.staroid import log_prefix +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME + +logger = logging.getLogger(__name__) + + +def find_free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("localhost", 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + +def to_label_selector(tags): + label_selector = "" + for k, v in tags.items(): + if label_selector != "": + label_selector += "," + label_selector += "{}={}".format(k, v) + return label_selector + + +class StaroidNodeProvider(NodeProvider): + def __init__(self, provider_config, cluster_name): + NodeProvider.__init__(self, provider_config, cluster_name) + self.__cached = {} + + self.__star = Staroid( + access_token=provider_config["access_token"], + account=provider_config["account"]) + + self.__ske = self._get_config_or_env(provider_config, "ske", + "STAROID_SKE") + 
self.__ske_region = self._get_config_or_env( + provider_config, "ske_region", "STAROID_SKE_REGION") + + def _get_config_or_env(self, config, config_key, env_name): + value = None + # check env first, so config can override env later + if env_name in os.environ: + value = os.environ[env_name] + + if config_key in config and config[config_key] is not None: + value = config[config_key] + + return value + + def _connect_kubeapi_incluster(self, instance_name): + if not os.path.isdir("/var/run/secrets/kubernetes.io/serviceaccount"): + return None + + kube_conf = config.load_incluster_config() + kube_client = client.ApiClient(kube_conf) + + with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", + "r") as file: + namespace = file.read().replace("\n", "") + + self.__cached[instance_name] = { + "kube_client": kube_client, + "api_server": None + } + self.namespace = namespace + return kube_client + + def _connect_kubeapi(self, instance_name): + if instance_name in self.__cached: + return self.__cached[instance_name]["kube_client"] + + # try incluster configuration first + kube_client = self._connect_kubeapi_incluster(instance_name) + if kube_client is not None: + return kube_client + + # check if ske exists + cluster_api = self.__star.cluster() + ske = cluster_api.get(self.__ske) + if ske is None: # ske not exists + return None + + # check if ray cluster instance exists + ns_api = self.__star.namespace(ske) + ns = ns_api.get(instance_name) + if ns is None: # instance not exists + return None + + # check if staroid namespace is not PAUSED (stopped) + # or INACTIVE (terminated) + if ns.status() != "ACTIVE": + return None + + # wait for the staroid namespace to be started + start_time = time.time() + timeout = 300 + started = False + while time.time() - start_time < timeout: + if ns.phase() == "RUNNING": + started = True + break + time.sleep(3) + ns = ns_api.get(instance_name) + + if started is False: + logger.info(log_prefix + "fail to start namespace") + return 
None + + # start a shell service to create secure tunnel + ns_api.shell_start(instance_name) + + local_port = find_free_port() + # fixed port number for kube api access through + # shell service in staroid + remote_port = 57683 + + # start a secure tunnel + ns_api.start_tunnel( + instance_name, ["{}:localhost:{}".format(local_port, remote_port)]) + + # wait for tunnel to be established by checking /version + local_kube_api_addr = "http://localhost:{}".format(local_port) + start_time = time.time() + established = False + while time.time() - start_time < timeout: + try: + r = requests.get( + "{}/version".format(local_kube_api_addr), timeout=(3, 5)) + if r.status_code == 200: + established = True + break + except requests.exceptions.ConnectionError: + pass + time.sleep(3) + + if established: + kube_conf = client.Configuration() + kube_conf.host = local_kube_api_addr + kube_client = client.ApiClient(kube_conf) + self.__cached[instance_name] = { + "kube_client": kube_client, + "api_server": local_kube_api_addr + } + self.namespace = ns.namespace() + return kube_client + else: + self.__cached[instance_name] = None + return None + + def non_terminated_nodes(self, tag_filters): + instance_name = self.cluster_name + + kube_client = self._connect_kubeapi(instance_name) + if kube_client is None: + return [] + core_api = client.CoreV1Api(kube_client) + + # Match pods that are in the 'Pending' or 'Running' phase. + # Unfortunately there is no OR operator in field selectors, so we + # have to match on NOT any of the other phases. 
+ field_selector = ",".join([ + "status.phase!=Failed", + "status.phase!=Unknown", + "status.phase!=Succeeded", + "status.phase!=Terminating", + ]) + + tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name + label_selector = to_label_selector(tag_filters) + pod_list = core_api.list_namespaced_pod( + self.namespace, + field_selector=field_selector, + label_selector=label_selector) + + return [pod.metadata.name for pod in pod_list.items] + + def is_running(self, node_id): + kube_client = self.__cached[self.cluster_name]["kube_client"] + core_api = client.CoreV1Api(kube_client) + + pod = core_api.read_namespaced_pod_status(node_id, self.namespace) + return pod.status.phase == "Running" + + def is_terminated(self, node_id): + kube_client = self.__cached[self.cluster_name]["kube_client"] + core_api = client.CoreV1Api(kube_client) + + pod = core_api.read_namespaced_pod_status(node_id, self.namespace) + return pod.status.phase not in ["Running", "Pending"] + + def node_tags(self, node_id): + kube_client = self.__cached[self.cluster_name]["kube_client"] + core_api = client.CoreV1Api(kube_client) + + pod = core_api.read_namespaced_pod_status(node_id, self.namespace) + return pod.metadata.labels + + def external_ip(self, node_id): + raise NotImplementedError("Must use internal IPs with Kubernetes.") + + def internal_ip(self, node_id): + kube_client = self.__cached[self.cluster_name]["kube_client"] + core_api = client.CoreV1Api(kube_client) + + pod = core_api.read_namespaced_pod_status(node_id, self.namespace) + return pod.status.pod_ip + + def set_node_tags(self, node_id, tags): + kube_client = self.__cached[self.cluster_name]["kube_client"] + core_api = client.CoreV1Api(kube_client) + + pod = core_api.read_namespaced_pod_status(node_id, self.namespace) + pod.metadata.labels.update(tags) + core_api.patch_namespaced_pod(node_id, self.namespace, pod) + + def create_node(self, node_config, tags, count): + instance_name = self.cluster_name + + # get or create ske + cluster_api = 
self.__star.cluster() + ske = cluster_api.create(self.__ske, self.__ske_region) + if ske is None: + raise Exception("Failed to create an SKE '{}' in '{}' region" + .format(self.__ske, self.__ske_region)) + + # create a namespace + ns_api = self.__star.namespace(ske) + ns = ns_api.create( + instance_name, + self.provider_config["project"], + + # Configure 'start-head' param to 'false'. + # head node will be created using Kubernetes api. + params=[{ + "group": "Misc", + "name": "start-head", + "value": "false" + }]) + if ns is None: + raise Exception("Failed to create a cluster '{}' in SKE '{}'" + .format(instance_name, self.__ske)) + + # 'ray down' will change staroid namespace status to "PAUSE" + # in this case we need to start namespace again. + if ns.status() == "PAUSE": + ns = ns_api.start(instance_name) + + # kube client + kube_client = self._connect_kubeapi(instance_name) + core_api = client.CoreV1Api(kube_client) + apps_api = client.AppsV1Api(kube_client) + + # retrieve container image + image = None + if self.provider_config["image_from_project"]: + ray_images = apps_api.read_namespaced_deployment( + name="ray-images", namespace=self.namespace) + py_ver = self.provider_config["python_version"].replace(".", "-") + containers = ray_images.spec.template.spec.containers + for c in containers: + if py_ver in c.image: + image = c.image + break + logger.info(log_prefix + "use image {}".format(image)) + + # create head node + conf = node_config.copy() + pod_spec = conf.get("pod", conf) + service_spec = conf.get("service") + node_uuid = str(uuid4()) + tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name + tags["ray-node-uuid"] = node_uuid + pod_spec["metadata"]["namespace"] = self.namespace + if "labels" in pod_spec["metadata"]: + pod_spec["metadata"]["labels"].update(tags) + else: + pod_spec["metadata"]["labels"] = tags + + if image is not None: + containers = pod_spec["spec"]["containers"] + for c in containers: + if c["name"] == "ray-node": + c["image"] = image + + 
node_type = pod_spec["metadata"]["labels"]["ray-node-type"] + if node_type == "head": + if "STAROID_ACCESS_TOKEN" in os.environ: + c["env"].append({ + "name": "STAROID_ACCESS_TOKEN", + "value": os.environ["STAROID_ACCESS_TOKEN"] + }) + if "STAROID_ACCOUNT" in os.environ: + c["env"].append({ + "name": "STAROID_ACCOUNT", + "value": os.environ["STAROID_ACCOUNT"] + }) + if "STAROID_SKE" in os.environ: + c["env"].append({ + "name": "STAROID_SKE", + "value": os.environ["STAROID_SKE"] + }) + + logger.info(log_prefix + "calling create_namespaced_pod " + "(count={}).".format(count)) + new_nodes = [] + for _ in range(count): + pod = core_api.create_namespaced_pod(self.namespace, pod_spec) + new_nodes.append(pod) + + new_svcs = [] + if service_spec is not None: + logger.info(log_prefix + "calling create_namespaced_service " + "(count={}).".format(count)) + + for new_node in new_nodes: + metadata = service_spec.get("metadata", {}) + metadata["name"] = new_node.metadata.name + service_spec["metadata"] = metadata + service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid} + svc = core_api.create_namespaced_service( + self.namespace, service_spec) + new_svcs.append(svc) + + def terminate_node(self, node_id): + logger.info(log_prefix + "calling delete_namespaced_pod") + kube_client = self.__cached[self.cluster_name]["kube_client"] + core_api = client.CoreV1Api(kube_client) + + core_api.delete_namespaced_pod(node_id, self.namespace) + try: + core_api.delete_namespaced_service(node_id, self.namespace) + except ApiException: + pass + + if node_id.startswith("ray-head"): + # Stop namespace on staroid after remove ray-head node. 
+ instance_name = self.cluster_name + + cluster_api = self.__star.cluster() + ske = cluster_api.get(self.__ske) + + ns_api = self.__star.namespace(ske) + ns_api.get(instance_name) + + del self.__cached[instance_name] + + ns_api.stop_tunnel(instance_name) + ns_api.stop(instance_name) + + def terminate_nodes(self, node_ids): + for node_id in node_ids: + self.terminate_node(node_id) + + def get_command_runner(self, + log_prefix, + node_id, + auth_config, + cluster_name, + process_runner, + use_internal_ip, + docker_config=None): + instance_name = self.cluster_name + + # initialize connection + self._connect_kubeapi(instance_name) + + command_runner = StaroidCommandRunner( + log_prefix, self.namespace, node_id, auth_config, process_runner, + self.__cached[cluster_name]["api_server"]) + return command_runner + + @staticmethod + def bootstrap_config(cluster_config): + """Bootstraps the cluster config by adding env defaults if needed.""" + return cluster_config diff --git a/python/ray/autoscaler/node_provider.py b/python/ray/autoscaler/node_provider.py index 5bbbf0fdb..6f651a9da 100644 --- a/python/ray/autoscaler/node_provider.py +++ b/python/ray/autoscaler/node_provider.py @@ -43,6 +43,12 @@ def _import_kubernetes(provider_config): return KubernetesNodeProvider +def _import_staroid(provider_config): + from ray.autoscaler._private.staroid.node_provider import \ + StaroidNodeProvider + return StaroidNodeProvider + + def _load_local_example_config(): import ray.autoscaler.local as ray_local return os.path.join( @@ -71,6 +77,12 @@ def _load_azure_example_config(): os.path.dirname(ray_azure.__file__), "example-full.yaml") +def _load_staroid_example_config(): + import ray.autoscaler.staroid as ray_staroid + return os.path.join( + os.path.dirname(ray_staroid.__file__), "example-full.yaml") + + def _import_external(provider_config): provider_cls = _load_class(path=provider_config["module"]) return provider_cls @@ -81,6 +93,7 @@ _NODE_PROVIDERS = { "aws": _import_aws, "gcp": 
_import_gcp, "azure": _import_azure, + "staroid": _import_staroid, "kubernetes": _import_kubernetes, "external": _import_external # Import an external module } @@ -90,6 +103,7 @@ _PROVIDER_PRETTY_NAMES = { "aws": "AWS", "gcp": "GCP", "azure": "Azure", + "staroid": "Staroid", "kubernetes": "Kubernetes", "external": "External" } @@ -99,6 +113,7 @@ _DEFAULT_CONFIGS = { "aws": _load_aws_example_config, "gcp": _load_gcp_example_config, "azure": _load_azure_example_config, + "staroid": _load_staroid_example_config, "kubernetes": _load_kubernetes_example_config, } diff --git a/python/ray/autoscaler/staroid/__init__.py b/python/ray/autoscaler/staroid/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/ray/autoscaler/staroid/example-full.yaml b/python/ray/autoscaler/staroid/example-full.yaml new file mode 100644 index 000000000..d17f6378a --- /dev/null +++ b/python/ray/autoscaler/staroid/example-full.yaml @@ -0,0 +1,312 @@ +# An unique identifier for the head node and workers of this cluster. +# A namespace will be automatically created for each cluster_name in SKE. +cluster_name: default + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# Whether or not to autoscale aggressively. If this is enabled, if at any point +# we would start more workers, we start at least enough to bring us to +# initial_workers. +autoscaling_mode: default + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. 
For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Kubernetes resources that need to be configured for the autoscaler to be +# able to manage the Ray cluster. If any of the provided resources don't +# exist, the autoscaler will attempt to create them. If this fails, you may +# not have the required permissions and will have to request them to be +# created by your cluster administrator. +provider: + type: staroid + + # Access token for Staroid from https://staroid.com/settings/accesstokens. + # Alternatively, you can set the STAROID_ACCESS_TOKEN environment variable. + # See https://github.com/staroids/staroid-python#configuration + # for more information. + access_token: + + # Staroid account to use, e.g. GITHUB/staroids. + # Alternatively, you can set the STAROID_ACCOUNT environment variable. + # Leave empty to select the default account for the given access token. + # See https://github.com/staroids/staroid-python#configuration + # for more information. + account: + + # Name of a Staroid Kubernetes Engine (SKE) instance. + # Alternatively, you can set the STAROID_SKE environment variable. + # An SKE is a virtualized Kubernetes cluster. + # A new one will be created if it does not exist. + ske: "Ray cluster" + + # Cloud and region in which to create the SKE if it does not exist. + # If the SKE already exists, this value is ignored. + # Supported cloud regions are listed at + # https://docs.staroid.com/ske/cloudregion.html. + ske_region: "aws us-west2" + + # To create a namespace in SKE, you need to specify a GitHub project. + # The GitHub project needs to have a staroid.yaml + # (https://docs.staroid.com/references/staroid_yaml.html). 
+ # staroid.yaml defines various resources for the project, such as + # - Container images to build, which can be accessed from the namespace + # - Kubernetes resources to create (like a persistent volume claim) + # on namespace creation + # You can fork the project when you need to customize it. + # 1. Fork github.com/open-datastudio/ray + # 2. Change the .staroid/ directory to customize + # 3. Connect the forked repository (https://staroid.com/projects/settings) + # 4. Release your customized branch + # 4-1. Select the project from the 'My projects' menu + # 4-2. Select your branch in the 'Release' tab + # 4-3. After the build succeeds, switch to 'Production' + # 4-4. Switch Launch permission to 'Public' if required + # 5. Change the 'project' field to point to your + # repository and branch in this file + project: "GITHUB/open-datastudio/ray:master-staroid" + + # The 'spec.containers.image' field for ray-node and ray-worker will be + # overridden by the image built from the 'project' field above. + # Set this value to 'false' to not override the image. + image_from_project: true + + # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'. + # The 'project' field above provides a docker image for each python version. + # Fork 'project' if you'd like to support other python versions. + python_version: 3.7.7 + + # Exposing external IP addresses for ray pods isn't currently supported. + use_internal_ips: true + +# Kubernetes pod config for the head node pod. +head_node: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. + generateName: ray-head- + + # Must match the head node service selector above if a head node + # service is required. + labels: + component: ray-head + + # https://docs.staroid.com/ske/pod.html#pod + pod.staroid.com/spot: "false" # use on-demand instance for head. 
+ + # Uncomment to place the ray head on a dedicated Kubernetes node + # (GPU instance is only available for 'dedicated' isolation) + #pod.staroid.com/isolation: dedicated + #pod.staroid.com/instance-type: gpu-1 + spec: + automountServiceAccountToken: true + + # Restarting the head node automatically is not currently supported. + # If the head node goes down, `ray up` must be run again. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp, which causes slowdowns if it is not a shared memory volume. + volumes: + - name: dshm + emptyDir: + medium: Memory + # The nfs volume provides a shared volume across all ray-nodes. + - name: nfs-volume + persistentVolumeClaim: + claimName: nfs + + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + # - screen (used for `ray attach`) + # - kubectl (used by the autoscaler to manage worker pods) + # Image will be overridden when 'image_from_project' is true. + image: rayproject/autoscaler + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 6379 # Redis port. + - containerPort: 6380 # Redis port. + - containerPort: 6381 # Redis port. + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp, which causes slowdowns if it is not a shared memory volume. 
+ volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /nfs + name: nfs-volume + resources: + requests: + cpu: 1000m + memory: 2Gi + limits: + # The maximum memory that this pod is allowed to use. The + # limit will be detected by ray and split to use 10% for + # redis, 30% for the shared memory object store, and the + # rest for application memory. If this limit is not set and + # the object store size is not set manually, ray will + # allocate a very large object store in each pod that may + # cause problems for other pods. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + - name: RAY_ADDRESS + value: "auto" + +# Kubernetes pod config for worker node pods. +worker_nodes: + apiVersion: v1 + kind: Pod + metadata: + # Automatically generates a name for the pod with this prefix. + generateName: ray-worker- + + # Must match the worker node service selector above if a worker node + # service is required. + labels: + component: ray-worker + + # https://docs.staroid.com/ske/pod.html#pod + pod.staroid.com/spot: "true" # use spot instance for workers. + + # Uncomment to place ray workers on a dedicated Kubernetes node + # (GPU instance is only available for 'dedicated' isolation) + #pod.staroid.com/isolation: dedicated + #pod.staroid.com/instance-type: gpu-1 + spec: + serviceAccountName: default + + # Worker nodes will be managed automatically by the head node, so + # do not change the restart policy. + restartPolicy: Never + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp, which causes slowdowns if it is not a shared memory volume. 
+ volumes: + - name: dshm + emptyDir: + medium: Memory + - name: nfs-volume + persistentVolumeClaim: + claimName: nfs + containers: + - name: ray-node + imagePullPolicy: Always + # You are free (and encouraged) to use your own container image, + # but it should have the following installed: + # - rsync (used for `ray rsync` commands and file mounts) + image: rayproject/autoscaler + # Do not change this command - it keeps the pod alive until it is + # explicitly killed. + command: ["/bin/bash", "-c", "--"] + args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"] + ports: + - containerPort: 12345 # Ray internal communication. + - containerPort: 12346 # Ray internal communication. + + # This volume allocates shared memory for Ray to use for its plasma + # object store. If you do not provide this, Ray will fall back to + # /tmp which cause slowdowns if is not a shared memory volume. + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /nfs + name: nfs-volume + resources: + requests: + cpu: 1000m + memory: 2Gi + limits: + # This memory limit will be detected by ray and split into + # 30% for plasma, and 70% for workers. + memory: 2Gi + env: + # This is used in the head_start_ray_commands below so that + # Ray can spawn the correct number of processes. Omitting this + # may lead to degraded performance. + - name: MY_CPU_REQUEST + valueFrom: + resourceFieldRef: + resource: requests.cpu + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. 
Only use this if you know what you're doing! +cluster_synced_files: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + # install staroid and kubernetes packages. Staroid node provider depends on them which autoscaler will use. + - pip install -q staroid kubernetes + # install jupyterlab + - pip install -q jupyterlab + - ln -s /nfs /home/ray/nfs + - bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' & + # show 'notebook' link in staroid management console to access jupyter notebook. + - 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n name: notebook\n annotations:\n service.staroid.com/link: show\nspec:\n ports:\n - name: http\n port: 8888\n selector:\n component: ray-head" | kubectl apply -f -' + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/staroid/example-minimal.yaml b/python/ray/autoscaler/staroid/example-minimal.yaml new file mode 100644 index 000000000..bc32883a2 --- /dev/null +++ b/python/ray/autoscaler/staroid/example-minimal.yaml @@ -0,0 +1,72 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: minimal + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. min_workers defaults to 0. +max_workers: 1 + +# Kubernetes resources that need to be configured for the autoscaler to be +# able to manage the Ray cluster. If any of the provided resources don't +# exist, the autoscaler will attempt to create them. If this fails, you may +# not have the required permissions and will have to request them to be +# created by your cluster administrator. +provider: + type: staroid + + # Access token for Staroid from https://staroid.com/settings/accesstokens. + # Alternatively, you can set the STAROID_ACCESS_TOKEN environment variable. + # See https://github.com/staroids/staroid-python#configuration + # for more information. + access_token: + + # Staroid account to use, e.g. GITHUB/staroids. + # Alternatively, you can set the STAROID_ACCOUNT environment variable. + # Leave empty to select the default account for the given access token. + # See https://github.com/staroids/staroid-python#configuration + # for more information. + account: + + # Name of a Staroid Kubernetes Engine (SKE) instance. + # Alternatively, you can set the STAROID_SKE environment variable. + # An SKE is a virtualized Kubernetes cluster. + # A new one will be created if it does not exist. + ske: "Ray cluster" + + # Cloud and region in which to create the SKE if it does not exist. + # If the SKE already exists, this value is ignored. 
+ # Supported cloud regions are listed at + # https://docs.staroid.com/ske/cloudregion.html. + ske_region: "aws us-west2" + + # To create a namespace in SKE, you need to specify a GitHub project. + # The GitHub project needs to have a staroid.yaml + # (https://docs.staroid.com/references/staroid_yaml.html). + # staroid.yaml defines various resources for the project, such as + # - Container images to build, which can be accessed from the namespace + # - Kubernetes resources to create (like a persistent volume claim) + # on namespace creation + # You can fork the project when you need to customize it. + # 1. Fork github.com/open-datastudio/ray + # 2. Change the .staroid/ directory to customize + # 3. Connect the forked repository (https://staroid.com/projects/settings) + # 4. Release your customized branch + # 4-1. Select the project from the 'My projects' menu + # 4-2. Select your branch in the 'Release' tab + # 4-3. After the build succeeds, switch to 'Production' + # 4-4. Switch Launch permission to 'Public' if required + # 5. Change the 'project' field to point to your + # repository and branch in this file + project: "GITHUB/open-datastudio/ray:master-staroid" + + # The 'spec.containers.image' field for ray-node and ray-worker will be + # overridden by the image built from the 'project' field above. + # Set this value to 'false' to not override the image. + image_from_project: true + + # Python version to use. One of '3.6.9', '3.7.7', '3.8.3'. + # The 'project' field above provides a docker image for each python version. + # Fork 'project' if you'd like to support other python versions. + python_version: 3.7.7 + + # Exposing external IP addresses for ray pods isn't currently supported. + use_internal_ips: true