[autoscaler] Staroid node provider (#10956)

This commit is contained in:
Lee moon soo
2020-09-22 21:25:29 -07:00
committed by GitHub
parent 4872ffb44c
commit df4c3abe30
9 changed files with 859 additions and 1 deletions
@@ -23,4 +23,8 @@ if [ "X$pod" = "X-l" ]; then
shift
fi
exec kubectl "$namespace" exec -i "$pod" -- "$@"
if [ -z "$KUBE_API_SERVER" ]; then
exec kubectl "$namespace" exec -i "$pod" -- "$@"
else
exec kubectl --server "$KUBE_API_SERVER" "$namespace" exec -i "$pod" -- "$@"
fi
@@ -0,0 +1 @@
# Prefix prepended to log messages emitted by the Staroid node provider.
log_prefix = "StaroidNodeProvider: "
@@ -0,0 +1,37 @@
import os
from ray.autoscaler._private.command_runner import KubernetesCommandRunner
class StaroidCommandRunner(KubernetesCommandRunner):
    """Kubernetes command runner adapted for pods running on Staroid.

    Compared to the base runner it can point ``kubectl`` at an explicit API
    server address, and it rewrites ``~`` based rsync targets to the
    non-root home directory used inside the pods.
    """

    # Home directory that a leading "~" in an rsync target is expanded to.
    _HOME_DIR = "/home/ray"

    def __init__(self,
                 log_prefix,
                 namespace,
                 node_id,
                 auth_config,
                 process_runner,
                 kube_api_server=None):
        """Create a command runner for the pod ``node_id``.

        Args:
            log_prefix: Forwarded unchanged to the base class.
            namespace: Forwarded unchanged to the base class.
            node_id: Forwarded unchanged to the base class.
            auth_config: Forwarded unchanged to the base class.
            process_runner: Forwarded unchanged to the base class.
            kube_api_server: Optional address of the Kubernetes API server.
                When given, ``--server <address>`` is appended to
                ``self.kubectl`` and the address is exported through the
                ``KUBE_API_SERVER`` environment variable.
        """
        super(StaroidCommandRunner, self).__init__(
            log_prefix, namespace, node_id, auth_config, process_runner)

        if kube_api_server is not None:
            self.kubectl.extend(["--server", kube_api_server])
            # Process-wide side effect: child processes inherit this value.
            # NOTE(review): presumably consumed by the kubectl rsync helper
            # script, which checks $KUBE_API_SERVER - confirm.
            os.environ["KUBE_API_SERVER"] = kube_api_server

    def _rewrite_target_home_dir(self, target):
        """Expand a leading ``~`` in ``target`` to ``/home/ray``.

        Staroid forces containers to run without root permission, so the
        home directory is ``/home/ray``. Both a bare ``~`` and paths starting
        with ``~/`` are rewritten; any other path is returned unchanged.
        """
        if target == "~" or target.startswith("~/"):
            return self._HOME_DIR + target[1:]
        return target

    def run_rsync_up(self, source, target, options=None):
        """Rewrite the home directory in ``target`` and delegate upward."""
        target = self._rewrite_target_home_dir(target)
        super().run_rsync_up(source, target, options)

    def run_rsync_down(self, source, target, options=None):
        """Rewrite the home directory in ``target`` and delegate upward.

        NOTE(review): for a download the remote path is presumably
        ``source``, yet it is ``target`` that gets rewritten here - confirm
        against the base class implementation.
        """
        target = self._rewrite_target_home_dir(target)
        super().run_rsync_down(source, target, options)
@@ -0,0 +1,387 @@
import os
import logging
import time
import requests
from staroid import Staroid
from kubernetes import client, config
import socket
from contextlib import closing
from uuid import uuid4
from kubernetes.client.rest import ApiException
from ray.autoscaler._private.staroid.command_runner import StaroidCommandRunner
from ray.autoscaler._private.staroid import log_prefix
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
logger = logging.getLogger(__name__)
def find_free_port():
    """Return a TCP port number the OS reported as free on localhost.

    A temporary socket is bound to port 0 so the OS picks the port; the
    socket is closed again before returning, so the port is only known to
    have been free at the time of the call.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(("localhost", 0))
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        _host, port = sock.getsockname()
        return port
    finally:
        sock.close()
def to_label_selector(tags):
    """Render ``tags`` as a comma separated ``key=value`` selector string.

    Entries appear in the iteration order of ``tags``; an empty mapping
    yields an empty string.
    """
    pairs = ("{}={}".format(key, value) for key, value in tags.items())
    return ",".join(pairs)
class StaroidNodeProvider(NodeProvider):
    """Node provider that runs Ray nodes as pods in a Staroid namespace.

    The cluster name doubles as the Staroid namespace ("instance") name.
    Pods are managed through the Kubernetes API, which is reached either
    with the in-cluster service account or through a tunnel opened by the
    Staroid client.
    """

    # Environment variables copied from the local process into the head
    # node container when they are set.
    _HEAD_ENV_NAMES = ("STAROID_ACCESS_TOKEN", "STAROID_ACCOUNT",
                       "STAROID_SKE")

    def __init__(self, provider_config, cluster_name):
        """Set up the Staroid client and read the SKE settings.

        Args:
            provider_config: The ``provider`` section of the cluster config.
                ``access_token`` and ``account`` may be missing or ``None``;
                they are then passed to ``Staroid`` as ``None``.
            cluster_name: Name of the Ray cluster.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # instance name -> {"kube_client": ApiClient, "api_server": str|None}
        self.__cached = {}
        self.__star = Staroid(
            access_token=provider_config.get("access_token"),
            account=provider_config.get("account"))

        self.__ske = self._get_config_or_env(provider_config, "ske",
                                             "STAROID_SKE")
        self.__ske_region = self._get_config_or_env(
            provider_config, "ske_region", "STAROID_SKE_REGION")

    def _get_config_or_env(self, config, config_key, env_name):
        """Return ``config[config_key]``, falling back to ``env_name``.

        A config value that is present and not ``None`` wins over the
        environment variable. ``None`` is returned when neither is set.
        """
        value = config.get(config_key)
        if value is not None:
            return value
        return os.environ.get(env_name)

    def _connect_kubeapi_incluster(self, instance_name):
        """Connect with the pod's service account, if one is mounted.

        Returns:
            A Kubernetes ``ApiClient``, or ``None`` when the service account
            directory does not exist. On success the client is cached for
            ``instance_name`` and ``self.namespace`` is set from the mounted
            ``namespace`` file.
        """
        sa_dir = "/var/run/secrets/kubernetes.io/serviceaccount"
        if not os.path.isdir(sa_dir):
            return None

        # NOTE(review): load_incluster_config() presumably returns None and
        # installs the settings as the client default, which ApiClient then
        # picks up - confirm against the kubernetes client docs.
        kube_conf = config.load_incluster_config()
        kube_client = client.ApiClient(kube_conf)

        with open(sa_dir + "/namespace", "r") as file:
            namespace = file.read().replace("\n", "")

        self.__cached[instance_name] = {
            "kube_client": kube_client,
            "api_server": None
        }
        self.namespace = namespace
        return kube_client

    def _wait_for_namespace_running(self, ns_api, ns, instance_name, timeout):
        """Poll until the namespace phase is "RUNNING".

        Returns the latest namespace object once it is running, or ``None``
        when ``timeout`` seconds pass or the namespace can no longer be
        fetched.
        """
        deadline = time.time() + timeout
        while ns is not None and time.time() < deadline:
            if ns.phase() == "RUNNING":
                return ns
            time.sleep(3)
            ns = ns_api.get(instance_name)
        return None

    def _wait_for_kube_api(self, api_addr, timeout):
        """Poll ``<api_addr>/version`` until it answers with HTTP 200.

        Returns ``True`` on success and ``False`` after ``timeout`` seconds.
        Connection errors and request timeouts are both treated as "not
        ready yet" so a slow tunnel does not abort the wait.
        """
        version_url = "{}/version".format(api_addr)
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                r = requests.get(version_url, timeout=(3, 5))
                if r.status_code == 200:
                    return True
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                pass
            time.sleep(3)
        return False

    def _connect_kubeapi(self, instance_name):
        """Return a Kubernetes ``ApiClient`` for ``instance_name``.

        A cached client is reused. Otherwise the in-cluster configuration is
        tried first, then a tunnel to the Staroid namespace is opened.

        Returns:
            The client, or ``None`` when the SKE or namespace does not
            exist, the namespace is not "ACTIVE", it does not reach the
            "RUNNING" phase in time, or the tunnel cannot be established.
            Failures are not cached, so a later call retries.
        """
        cached = self.__cached.get(instance_name)
        if cached is not None:
            return cached["kube_client"]

        # try incluster configuration first
        kube_client = self._connect_kubeapi_incluster(instance_name)
        if kube_client is not None:
            return kube_client

        # check if ske exists
        cluster_api = self.__star.cluster()
        ske = cluster_api.get(self.__ske)
        if ske is None:  # ske not exists
            return None

        # check if ray cluster instance exists
        ns_api = self.__star.namespace(ske)
        ns = ns_api.get(instance_name)
        if ns is None:  # instance not exists
            return None

        # check if staroid namespace is not PAUSED (stopped)
        # or INACTIVE (terminated)
        if ns.status() != "ACTIVE":
            return None

        # wait for the staroid namespace to be started
        timeout = 300
        ns = self._wait_for_namespace_running(ns_api, ns, instance_name,
                                              timeout)
        if ns is None:
            logger.info(log_prefix + "fail to start namespace")
            return None

        # start a shell service to create secure tunnel
        ns_api.shell_start(instance_name)

        local_port = find_free_port()
        # fixed port number for kube api access through
        # shell service in staroid
        remote_port = 57683

        # start a secure tunnel
        ns_api.start_tunnel(
            instance_name, ["{}:localhost:{}".format(local_port, remote_port)])

        # wait for tunnel to be established by checking /version
        local_kube_api_addr = "http://localhost:{}".format(local_port)
        if not self._wait_for_kube_api(local_kube_api_addr, timeout):
            # Do not cache the failure: a cached None would break every
            # later lookup of the "kube_client" entry.
            self.__cached.pop(instance_name, None)
            return None

        kube_conf = client.Configuration()
        kube_conf.host = local_kube_api_addr
        kube_client = client.ApiClient(kube_conf)
        self.__cached[instance_name] = {
            "kube_client": kube_client,
            "api_server": local_kube_api_addr
        }
        self.namespace = ns.namespace()
        return kube_client

    def _core_api(self):
        """Return a ``CoreV1Api`` built on the cached client.

        Raises ``KeyError`` when no connection has been cached for this
        cluster yet.
        """
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        return client.CoreV1Api(kube_client)

    def non_terminated_nodes(self, tag_filters):
        """Return the names of pods matching ``tag_filters``.

        Returns an empty list when the Kubernetes API cannot be reached.
        ``tag_filters`` is not modified.
        """
        instance_name = self.cluster_name
        kube_client = self._connect_kubeapi(instance_name)
        if kube_client is None:
            return []

        core_api = client.CoreV1Api(kube_client)

        # Match pods that are in the 'Pending' or 'Running' phase.
        # Unfortunately there is no OR operator in field selectors, so we
        # have to match on NOT any of the other phases.
        field_selector = ",".join([
            "status.phase!=Failed",
            "status.phase!=Unknown",
            "status.phase!=Succeeded",
            "status.phase!=Terminating",
        ])

        # Work on a copy so the caller's dict is left untouched.
        selector_tags = dict(tag_filters)
        selector_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        label_selector = to_label_selector(selector_tags)
        pod_list = core_api.list_namespaced_pod(
            self.namespace,
            field_selector=field_selector,
            label_selector=label_selector)

        return [pod.metadata.name for pod in pod_list.items]

    def is_running(self, node_id):
        """Return True when the pod's phase is "Running"."""
        pod = self._core_api().read_namespaced_pod_status(
            node_id, self.namespace)
        return pod.status.phase == "Running"

    def is_terminated(self, node_id):
        """Return True when the pod is neither "Running" nor "Pending"."""
        pod = self._core_api().read_namespaced_pod_status(
            node_id, self.namespace)
        return pod.status.phase not in ["Running", "Pending"]

    def node_tags(self, node_id):
        """Return the labels of the pod."""
        pod = self._core_api().read_namespaced_pod_status(
            node_id, self.namespace)
        return pod.metadata.labels

    def external_ip(self, node_id):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Must use internal IPs with Kubernetes.")

    def internal_ip(self, node_id):
        """Return the pod IP of the pod."""
        pod = self._core_api().read_namespaced_pod_status(
            node_id, self.namespace)
        return pod.status.pod_ip

    def set_node_tags(self, node_id, tags):
        """Merge ``tags`` into the pod's labels and patch the pod."""
        core_api = self._core_api()
        pod = core_api.read_namespaced_pod_status(node_id, self.namespace)
        pod.metadata.labels.update(tags)
        core_api.patch_namespaced_pod(node_id, self.namespace, pod)

    def _find_project_image(self, apps_api):
        """Pick the image for the configured Python version.

        Reads the "ray-images" deployment and returns the first container
        image whose name contains the Python version with dots replaced by
        dashes, or ``None`` when no image matches.
        """
        ray_images = apps_api.read_namespaced_deployment(
            name="ray-images", namespace=self.namespace)
        # str() guards against YAML parsing a value such as 3.7 as a float.
        py_ver = str(self.provider_config["python_version"]).replace(
            ".", "-")
        for c in ray_images.spec.template.spec.containers:
            if py_ver in c.image:
                return c.image
        return None

    def _add_head_env(self, container):
        """Copy the Staroid environment variables into ``container``.

        Only variables that are set in the local environment are added.
        """
        env = container.setdefault("env", [])
        for name in self._HEAD_ENV_NAMES:
            if name in os.environ:
                env.append({"name": name, "value": os.environ[name]})

    def create_node(self, node_config, tags, count):
        """Create ``count`` pods (and optional services) from ``node_config``.

        The SKE and the Staroid namespace are created first when needed.
        ``node_config`` and ``tags`` are copied, so the caller's objects are
        not modified.

        Raises:
            Exception: When the SKE or namespace cannot be created, or the
                Kubernetes API of the namespace cannot be reached.
        """
        import copy  # local: keeps the file's import block untouched

        instance_name = self.cluster_name

        # get or create ske
        cluster_api = self.__star.cluster()
        ske = cluster_api.create(self.__ske, self.__ske_region)
        if ske is None:
            raise Exception("Failed to create an SKE '{}' in '{}' region"
                            .format(self.__ske, self.__ske_region))

        # create a namespace
        ns_api = self.__star.namespace(ske)
        ns = ns_api.create(
            instance_name,
            self.provider_config["project"],
            # Configure 'start-head' param to 'false'.
            # head node will be created using Kubernetes api.
            params=[{
                "group": "Misc",
                "name": "start-head",
                "value": "false"
            }])
        if ns is None:
            raise Exception("Failed to create a cluster '{}' in SKE '{}'"
                            .format(instance_name, self.__ske))

        # 'ray down' will change staroid namespace status to "PAUSE"
        # in this case we need to start namespace again.
        # NOTE(review): _connect_kubeapi's comment spells this state
        # "PAUSED" - confirm the exact status string.
        if ns.status() == "PAUSE":
            ns = ns_api.start(instance_name)

        # kube client
        kube_client = self._connect_kubeapi(instance_name)
        if kube_client is None:
            raise Exception(
                "Failed to connect to the Kubernetes API of cluster '{}' "
                "in SKE '{}'".format(instance_name, self.__ske))
        core_api = client.CoreV1Api(kube_client)
        apps_api = client.AppsV1Api(kube_client)

        # retrieve container image
        image = None
        if self.provider_config["image_from_project"]:
            image = self._find_project_image(apps_api)
            logger.info(log_prefix + "use image {}".format(image))

        # Deep copy: the spec is edited below and must not leak back into
        # the caller's config (env entries would pile up across calls).
        conf = copy.deepcopy(node_config)
        pod_spec = conf.get("pod", conf)
        service_spec = conf.get("service")
        node_uuid = str(uuid4())
        tags = dict(tags)
        tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        tags["ray-node-uuid"] = node_uuid
        pod_spec["metadata"]["namespace"] = self.namespace
        if "labels" in pod_spec["metadata"]:
            pod_spec["metadata"]["labels"].update(tags)
        else:
            pod_spec["metadata"]["labels"] = tags

        if image is not None:
            node_type = pod_spec["metadata"]["labels"].get("ray-node-type")
            for c in pod_spec["spec"]["containers"]:
                if c["name"] != "ray-node":
                    continue
                c["image"] = image
                if node_type == "head":
                    self._add_head_env(c)

        logger.info(log_prefix + "calling create_namespaced_pod "
                    "(count={}).".format(count))
        new_nodes = [
            core_api.create_namespaced_pod(self.namespace, pod_spec)
            for _ in range(count)
        ]

        if service_spec is not None:
            logger.info(log_prefix + "calling create_namespaced_service "
                        "(count={}).".format(count))

            for new_node in new_nodes:
                metadata = service_spec.get("metadata", {})
                metadata["name"] = new_node.metadata.name
                service_spec["metadata"] = metadata
                service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid}
                core_api.create_namespaced_service(self.namespace,
                                                   service_spec)

    def terminate_node(self, node_id):
        """Delete the pod and its service; stop the namespace for the head.

        A failure to delete the service is ignored because not every pod
        has one. When ``node_id`` starts with "ray-head" the cached
        connection is dropped, the tunnel is stopped and the Staroid
        namespace is stopped.
        """
        logger.info(log_prefix + "calling delete_namespaced_pod")
        core_api = self._core_api()
        core_api.delete_namespaced_pod(node_id, self.namespace)
        try:
            core_api.delete_namespaced_service(node_id, self.namespace)
        except ApiException:
            pass

        if node_id.startswith("ray-head"):
            # Stop namespace on staroid after remove ray-head node.
            instance_name = self.cluster_name
            cluster_api = self.__star.cluster()
            ske = cluster_api.get(self.__ske)
            ns_api = self.__star.namespace(ske)
            # NOTE(review): the result is unused; presumably a leftover
            # lookup - confirm before removing the call.
            ns_api.get(instance_name)
            self.__cached.pop(instance_name, None)
            ns_api.stop_tunnel(instance_name)
            ns_api.stop(instance_name)

    def terminate_nodes(self, node_ids):
        """Terminate every node in ``node_ids``, one after another."""
        for node_id in node_ids:
            self.terminate_node(node_id)

    def get_command_runner(self,
                           log_prefix,
                           node_id,
                           auth_config,
                           cluster_name,
                           process_runner,
                           use_internal_ip,
                           docker_config=None):
        """Return a ``StaroidCommandRunner`` for the pod ``node_id``.

        The connection is looked up under ``self.cluster_name``; the
        ``cluster_name``, ``use_internal_ip`` and ``docker_config``
        arguments are accepted for interface compatibility and not used.

        Raises:
            Exception: When the Kubernetes API cannot be reached.
        """
        instance_name = self.cluster_name

        # initialize connection
        if self._connect_kubeapi(instance_name) is None:
            raise Exception(
                "Failed to connect to the Kubernetes API of cluster '{}'"
                .format(instance_name))

        # Use the same key that _connect_kubeapi populated.
        api_server = self.__cached[instance_name]["api_server"]
        return StaroidCommandRunner(log_prefix, self.namespace, node_id,
                                    auth_config, process_runner, api_server)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Return ``cluster_config`` unchanged."""
        return cluster_config
+15
View File
@@ -43,6 +43,12 @@ def _import_kubernetes(provider_config):
return KubernetesNodeProvider
def _import_staroid(provider_config):
    """Import the Staroid provider module on demand and return its class."""
    from ray.autoscaler._private.staroid import (
        node_provider as staroid_node_provider)
    return staroid_node_provider.StaroidNodeProvider
def _load_local_example_config():
import ray.autoscaler.local as ray_local
return os.path.join(
@@ -71,6 +77,12 @@ def _load_azure_example_config():
os.path.dirname(ray_azure.__file__), "example-full.yaml")
def _load_staroid_example_config():
    """Return the path of ``example-full.yaml`` in the staroid package."""
    import ray.autoscaler.staroid as ray_staroid
    package_dir = os.path.dirname(ray_staroid.__file__)
    return os.path.join(package_dir, "example-full.yaml")
def _import_external(provider_config):
    """Load the provider class named by ``provider_config["module"]``."""
    return _load_class(path=provider_config["module"])
@@ -81,6 +93,7 @@ _NODE_PROVIDERS = {
"aws": _import_aws,
"gcp": _import_gcp,
"azure": _import_azure,
"staroid": _import_staroid,
"kubernetes": _import_kubernetes,
"external": _import_external # Import an external module
}
@@ -90,6 +103,7 @@ _PROVIDER_PRETTY_NAMES = {
"aws": "AWS",
"gcp": "GCP",
"azure": "Azure",
"staroid": "Staroid",
"kubernetes": "Kubernetes",
"external": "External"
}
@@ -99,6 +113,7 @@ _DEFAULT_CONFIGS = {
"aws": _load_aws_example_config,
"gcp": _load_gcp_example_config,
"azure": _load_azure_example_config,
"staroid": _load_staroid_example_config,
"kubernetes": _load_kubernetes_example_config,
}
@@ -0,0 +1,312 @@
# A unique identifier for the head node and workers of this cluster.
# A namespace will be automatically created for each cluster_name in SKE.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0
# Whether or not to autoscale aggressively. If this is enabled, if at any point
# we would start more workers, we start at least enough to bring us to
# initial_workers.
autoscaling_mode: default
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
type: staroid
# Access token for Staroid from https://staroid.com/settings/accesstokens.
# Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
# https://github.com/staroids/staroid-python#configuration
# for more information.
access_token:
# Staroid account to use. e.g. GITHUB/staroids
# Alternatively, you can set STAROID_ACCOUNT environment variable.
# Leave empty to select default account for given access token.
# https://github.com/staroids/staroid-python#configuration
# for more information.
account:
# Name of a Staroid Kubernetes Engine (SKE) instance.
# Alternatively, you can set STAROID_SKE environment variable.
# An SKE is a virtualized Kubernetes cluster.
# A new SKE will be created if it does not exist.
ske: "Ray cluster"
# Cloud and region in which to create the SKE when it does not exist.
# If the SKE already exists, this value is ignored.
# Supported cloud regions are listed at
# https://docs.staroid.com/ske/cloudregion.html.
ske_region: "aws us-west2"
# To create a namespace in SKE, you need to specify a Github project.
# The Github project needs to have a staroid.yaml
# (https://docs.staroid.com/references/staroid_yaml.html).
# staroid.yaml defines various resources for the project, such as
# - Building container images can be accessed from the namespace
# - Kubernetes resources to create (like Persistent volume claim)
# on namespace creation
# You can fork when you need to customize.
# 1. Fork github.com/open-datastudio/ray
# 2. Change .staroid/ directory to customize
# 3. Connect forked repository (https://staroid.com/projects/settings)
# 4. Release your customized branch
# 4-1. Select project from 'My projects' menu
# 4-2. Select your branch in 'Release' tab
# 4-3. After build success, switch to 'Production'
# 4-4. Switch Launch permission to 'Public' if required
# 5. Change 'project' field to point your
# repository and branch in this file
project: "GITHUB/open-datastudio/ray:master-staroid"
# 'spec.containers.image' field for ray-node and ray-worker will be
# overridden by the image built from the 'project' field above.
# Set this value to 'false' to not override the image.
image_from_project: true
# Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
# 'project' field above provides docker image for each python version.
# Fork 'project' if you'd like to support other python versions.
python_version: 3.7.7
# Exposing external IP addresses for ray pods isn't currently supported.
use_internal_ips: true
# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "false" # use on-demand instance for head.
# Uncomment to place the ray head on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
automountServiceAccountToken: true
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
# nfs volume provides a shared volume across all ray-nodes.
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
# Image will be overridden when 'image_from_project' is true.
image: rayproject/autoscaler
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
- name: RAY_ADDRESS
value: "auto"
# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
# https://docs.staroid.com/ske/pod.html#pod
pod.staroid.com/spot: "true" # use spot instance for workers.
# Uncomment to place ray workers on a dedicated Kubernetes node
# (GPU instance is only available for 'dedicated' isolation)
#pod.staroid.com/isolation: dedicated
#pod.staroid.com/instance-type: gpu-1
spec:
serviceAccountName: default
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
- name: nfs-volume
persistentVolumeClaim:
claimName: nfs
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/autoscaler
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /nfs
name: nfs-volume
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the worker_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
# Install the staroid and kubernetes packages. The Staroid node provider used by the autoscaler depends on them.
- pip install -q staroid kubernetes
# install jupyterlab
- pip install -q jupyterlab
- ln -s /nfs /home/ray/nfs
- bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' &
# show 'notebook' link in staroid management console to access jupyter notebook.
- 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n name: notebook\n annotations:\n service.staroid.com/link: show\nspec:\n ports:\n - name: http\n port: 8888\n selector:\n component: ray-head" | kubectl apply -f -'
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,72 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: minimal
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers defaults to 0.
max_workers: 1
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
type: staroid
# Access token for Staroid from https://staroid.com/settings/accesstokens.
# Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
# https://github.com/staroids/staroid-python#configuration
# for more information.
access_token:
# Staroid account to use. e.g. GITHUB/staroids
# Alternatively, you can set STAROID_ACCOUNT environment variable.
# Leave empty to select default account for given access token.
# https://github.com/staroids/staroid-python#configuration
# for more information.
account:
# Name of a Staroid Kubernetes Engine (SKE) instance.
# Alternatively, you can set STAROID_SKE environment variable.
# An SKE is a virtualized Kubernetes cluster.
# A new SKE will be created if it does not exist.
ske: "Ray cluster"
# Cloud and region in which to create the SKE when it does not exist.
# If the SKE already exists, this value is ignored.
# Supported cloud regions are listed at
# https://docs.staroid.com/ske/cloudregion.html.
ske_region: "aws us-west2"
# To create a namespace in SKE, you need to specify a Github project.
# The Github project needs to have a staroid.yaml
# (https://docs.staroid.com/references/staroid_yaml.html).
# staroid.yaml defines various resources for the project, such as
# - Building container images can be accessed from the namespace
# - Kubernetes resources to create (like Persistent volume claim)
# on namespace creation
# You can fork when you need to customize.
# 1. Fork github.com/open-datastudio/ray
# 2. Change .staroid/ directory to customize
# 3. Connect forked repository (https://staroid.com/projects/settings)
# 4. Release your customized branch
# 4-1. Select project from 'My projects' menu
# 4-2. Select your branch in 'Release' tab
# 4-3. After build success, switch to 'Production'
# 4-4. Switch Launch permission to 'Public' if required
# 5. Change 'project' field to point your
# repository and branch in this file
project: "GITHUB/open-datastudio/ray:master-staroid"
# 'spec.containers.image' field for ray-node and ray-worker will be
# overridden by the image built from the 'project' field above.
# Set this value to 'false' to not override the image.
image_from_project: true
# Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
# 'project' field above provides docker image for each python version.
# Fork 'project' if you'd like to support other python versions.
python_version: 3.7.7
# Exposing external IP addresses for ray pods isn't currently supported.
use_internal_ips: true