mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 13:19:38 +08:00
[autoscaler] Staroid node provider (#10956)
This commit is contained in:
@@ -23,4 +23,8 @@ if [ "X$pod" = "X-l" ]; then
|
||||
shift
|
||||
fi
|
||||
|
||||
exec kubectl "$namespace" exec -i "$pod" -- "$@"
|
||||
if [ -z "$KUBE_API_SERVER" ]; then
|
||||
exec kubectl "$namespace" exec -i "$pod" -- "$@"
|
||||
else
|
||||
exec kubectl --server "$KUBE_API_SERVER" "$namespace" exec -i "$pod" -- "$@"
|
||||
fi
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
# Prefix prepended to log messages emitted by the Staroid node provider.
log_prefix = "StaroidNodeProvider: "
|
||||
@@ -0,0 +1,37 @@
|
||||
import os
|
||||
from ray.autoscaler._private.command_runner import KubernetesCommandRunner
|
||||
|
||||
|
||||
class StaroidCommandRunner(KubernetesCommandRunner):
    """Kubernetes command runner with Staroid-specific adjustments.

    Compared to the base runner it can target an explicit Kubernetes API
    server and it rewrites ``~/`` rsync targets to ``/home/ray``.
    """

    def __init__(self,
                 log_prefix,
                 namespace,
                 node_id,
                 auth_config,
                 process_runner,
                 kube_api_server=None):
        """Set up the runner, optionally pinning the Kubernetes API server.

        Args:
            log_prefix: Forwarded unchanged to the base runner.
            namespace: Forwarded unchanged to the base runner.
            node_id: Forwarded unchanged to the base runner.
            auth_config: Forwarded unchanged to the base runner.
            process_runner: Forwarded unchanged to the base runner.
            kube_api_server: Address of the Kubernetes API server to use,
                or None to keep the base runner's kubectl defaults.
        """
        super().__init__(log_prefix, namespace, node_id, auth_config,
                         process_runner)

        if kube_api_server is None:
            return

        # Point every kubectl invocation built from self.kubectl at the
        # given API server.
        self.kubectl.extend(["--server", kube_api_server])
        # Also export the address process-wide; presumably this is for
        # child processes (e.g. a kubectl wrapper script) that read
        # KUBE_API_SERVER -- verify against the rsync helper in use.
        os.environ["KUBE_API_SERVER"] = kube_api_server

    def _rewrite_target_home_dir(self, target):
        """Return `target` with a leading ``~/`` replaced by ``/home/ray/``.

        Per the original author's note, Staroid forces containers to run
        without root permission and the Ray docker image has no non-root
        user support yet, so a fixed directory stands in for the home
        directory. Paths that do not start with ``~/`` are returned as is.
        """
        if not target.startswith("~/"):
            return target
        # Drop only the "~"; the following "/" is kept.
        return "/home/ray" + target[1:]

    def run_rsync_up(self, source, target, options=None):
        """Rsync `source` to `target`, rewriting a ``~/`` target first."""
        super().run_rsync_up(source, self._rewrite_target_home_dir(target),
                             options)

    def run_rsync_down(self, source, target, options=None):
        """Rsync `source` to `target`, rewriting a ``~/`` target first."""
        # NOTE(review): for a download the remote path is presumably
        # `source`, yet it is `target` that gets rewritten here -- confirm
        # that this is intended.
        super().run_rsync_down(source, self._rewrite_target_home_dir(target),
                               options)
|
||||
@@ -0,0 +1,387 @@
|
||||
import copy
import logging
import os
import socket
import time
from contextlib import closing
from uuid import uuid4

import requests
from kubernetes import client, config
from kubernetes.client.rest import ApiException
from staroid import Staroid

from ray.autoscaler._private.staroid.command_runner import StaroidCommandRunner
from ray.autoscaler._private.staroid import log_prefix
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def find_free_port():
    """Return a TCP port number the OS reported as free on localhost.

    A throwaway socket is bound to port 0 so the operating system picks
    the port; the socket is closed again before returning, so the port is
    only known to have been free at the time of the probe.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with closing(probe):
        probe.bind(("localhost", 0))
        probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        # For AF_INET sockets getsockname() yields a (host, port) pair.
        _host, port = probe.getsockname()
    return port
|
||||
|
||||
|
||||
def to_label_selector(tags):
    """Render a tag mapping as a comma-separated ``key=value`` selector.

    Args:
        tags: Mapping of label name to label value.

    Returns:
        A string such as ``"a=1,b=2"`` in the mapping's iteration order,
        or an empty string when `tags` is empty.
    """
    return ",".join("{}={}".format(k, v) for k, v in tags.items())
|
||||
|
||||
|
||||
class StaroidNodeProvider(NodeProvider):
    """Node provider that runs Ray nodes as pods in a Staroid namespace.

    One Staroid namespace, named after the Ray cluster, is created inside
    a Staroid Kubernetes Engine (SKE) instance. Pods are then managed
    through the Kubernetes API, reached either in-cluster or through a
    tunnel opened by the Staroid client. ``self.namespace`` is only set
    once a connection has been established.
    """

    def __init__(self, provider_config, cluster_name):
        """Create the Staroid client and resolve the SKE settings.

        Args:
            provider_config: The ``provider`` section of the cluster
                config. ``access_token`` and ``account`` must be present
                as keys; ``ske`` and ``ske_region`` may instead come from
                the STAROID_SKE / STAROID_SKE_REGION environment variables.
            cluster_name: Name of the Ray cluster; also used as the
                Staroid namespace (instance) name.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # instance name -> {"kube_client": ApiClient, "api_server": str|None}
        self.__cached = {}

        self.__star = Staroid(
            access_token=provider_config["access_token"],
            account=provider_config["account"])

        self.__ske = self._get_config_or_env(provider_config, "ske",
                                             "STAROID_SKE")
        self.__ske_region = self._get_config_or_env(
            provider_config, "ske_region", "STAROID_SKE_REGION")

    def _get_config_or_env(self, config, config_key, env_name):
        """Return a setting from `config`, falling back to the environment.

        A non-None ``config[config_key]`` wins over the environment
        variable `env_name`; None is returned when neither is set.
        """
        value = config.get(config_key)
        if value is not None:
            return value
        return os.environ.get(env_name)

    def _connect_kubeapi_incluster(self, instance_name):
        """Connect with the pod's service account when running in-cluster.

        Returns:
            The Kubernetes ApiClient, or None when the service account
            directory does not exist (i.e. not running inside a pod).
        """
        sa_dir = "/var/run/secrets/kubernetes.io/serviceaccount"
        if not os.path.isdir(sa_dir):
            return None

        # NOTE(review): load_incluster_config() appears to configure the
        # library-wide default and return None, in which case ApiClient
        # falls back to that default configuration -- confirm.
        kube_conf = config.load_incluster_config()
        kube_client = client.ApiClient(kube_conf)

        with open(sa_dir + "/namespace", "r") as file:
            namespace = file.read().replace("\n", "")

        # No explicit API server address is needed in-cluster.
        self.__cached[instance_name] = {
            "kube_client": kube_client,
            "api_server": None
        }
        self.namespace = namespace
        return kube_client

    def _connect_kubeapi(self, instance_name):
        """Return a Kubernetes ApiClient for `instance_name`, or None.

        A previously established connection is reused. Otherwise the
        in-cluster configuration is tried first; failing that, a tunnel
        to the namespace's Kubernetes API is opened through Staroid.

        Returns:
            The ApiClient, or None when the SKE or namespace does not
            exist, the namespace is not ACTIVE, it does not reach the
            RUNNING phase in time, or the tunnel never answers.
        """
        # A failed attempt is never cached, so a missing or empty entry
        # simply means "not connected yet".
        cached = self.__cached.get(instance_name)
        if cached is not None:
            return cached["kube_client"]

        # try incluster configuration first
        kube_client = self._connect_kubeapi_incluster(instance_name)
        if kube_client is not None:
            return kube_client

        # check if ske exists
        cluster_api = self.__star.cluster()
        ske = cluster_api.get(self.__ske)
        if ske is None:  # ske not exists
            return None

        # check if ray cluster instance exists
        ns_api = self.__star.namespace(ske)
        ns = ns_api.get(instance_name)
        if ns is None:  # instance not exists
            return None

        # check if staroid namespace is not PAUSED (stopped)
        # or INACTIVE (terminated)
        if ns.status() != "ACTIVE":
            return None

        # wait for the staroid namespace to be started
        timeout = 300
        start_time = time.time()
        started = False
        while time.time() - start_time < timeout:
            # The refreshed namespace may be None; keep polling then.
            if ns is not None and ns.phase() == "RUNNING":
                started = True
                break
            time.sleep(3)
            ns = ns_api.get(instance_name)

        if not started:
            logger.info(log_prefix + "fail to start namespace")
            return None

        # start a shell service to create secure tunnel
        ns_api.shell_start(instance_name)

        local_port = find_free_port()
        # fixed port number for kube api access through
        # shell service in staroid
        remote_port = 57683

        # start a secure tunnel
        ns_api.start_tunnel(
            instance_name, ["{}:localhost:{}".format(local_port, remote_port)])

        # wait for tunnel to be established by checking /version
        local_kube_api_addr = "http://localhost:{}".format(local_port)
        start_time = time.time()
        established = False
        while time.time() - start_time < timeout:
            try:
                r = requests.get(
                    "{}/version".format(local_kube_api_addr), timeout=(3, 5))
                if r.status_code == 200:
                    established = True
                    break
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                # The tunnel is not up yet (refused or timed out); retry.
                pass
            time.sleep(3)

        if not established:
            # Do not cache the failure: a cached None would break the
            # lookup above on the next call.
            self.__cached.pop(instance_name, None)
            return None

        kube_conf = client.Configuration()
        kube_conf.host = local_kube_api_addr
        kube_client = client.ApiClient(kube_conf)
        self.__cached[instance_name] = {
            "kube_client": kube_client,
            "api_server": local_kube_api_addr
        }
        self.namespace = ns.namespace()
        return kube_client

    def _core_api(self):
        """Return a CoreV1Api bound to the already cached connection.

        Raises KeyError when no connection has been cached for this
        cluster yet.
        """
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        return client.CoreV1Api(kube_client)

    def _read_pod(self, node_id):
        """Read the pod status object for `node_id` in our namespace."""
        return self._core_api().read_namespaced_pod_status(
            node_id, self.namespace)

    def non_terminated_nodes(self, tag_filters):
        """Return the names of pods matching `tag_filters`.

        Only pods of this cluster that are not in a finished phase are
        listed. An empty list is returned when no connection to the
        Kubernetes API can be made. `tag_filters` is not modified.
        """
        kube_client = self._connect_kubeapi(self.cluster_name)
        if kube_client is None:
            return []
        core_api = client.CoreV1Api(kube_client)

        # Match pods that are in the 'Pending' or 'Running' phase.
        # Unfortunately there is no OR operator in field selectors, so we
        # have to match on NOT any of the other phases.
        field_selector = ",".join([
            "status.phase!=Failed",
            "status.phase!=Unknown",
            "status.phase!=Succeeded",
            "status.phase!=Terminating",
        ])

        # Work on a copy so the caller's filter dict is left untouched.
        selector_tags = dict(tag_filters)
        selector_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        label_selector = to_label_selector(selector_tags)
        pod_list = core_api.list_namespaced_pod(
            self.namespace,
            field_selector=field_selector,
            label_selector=label_selector)

        return [pod.metadata.name for pod in pod_list.items]

    def is_running(self, node_id):
        """Return True when the pod's phase is ``Running``."""
        return self._read_pod(node_id).status.phase == "Running"

    def is_terminated(self, node_id):
        """Return True when the pod is neither ``Running`` nor ``Pending``."""
        phase = self._read_pod(node_id).status.phase
        return phase not in ["Running", "Pending"]

    def node_tags(self, node_id):
        """Return the labels of the pod `node_id`."""
        return self._read_pod(node_id).metadata.labels

    def external_ip(self, node_id):
        """Not supported; always raises NotImplementedError."""
        raise NotImplementedError("Must use internal IPs with Kubernetes.")

    def internal_ip(self, node_id):
        """Return the pod IP of `node_id`."""
        return self._read_pod(node_id).status.pod_ip

    def set_node_tags(self, node_id, tags):
        """Merge `tags` into the pod's labels and patch the pod."""
        core_api = self._core_api()
        pod = core_api.read_namespaced_pod_status(node_id, self.namespace)
        pod.metadata.labels.update(tags)
        core_api.patch_namespaced_pod(node_id, self.namespace, pod)

    def create_node(self, node_config, tags, count):
        """Create `count` pods (and optionally services) for the cluster.

        The SKE and the Staroid namespace are created (or fetched) first.
        Neither `node_config` nor `tags` is modified.

        Args:
            node_config: Pod config, or a dict with ``pod`` and optional
                ``service`` entries.
            tags: Labels to attach to every created pod; must include
                ``ray-node-type``.
            count: Number of pods to create.

        Raises:
            Exception: When the SKE or namespace cannot be created, or the
                Kubernetes API of the namespace cannot be reached.
        """
        instance_name = self.cluster_name

        # get or create ske
        cluster_api = self.__star.cluster()
        ske = cluster_api.create(self.__ske, self.__ske_region)
        if ske is None:
            raise Exception("Failed to create an SKE '{}' in '{}' region"
                            .format(self.__ske, self.__ske_region))

        # create a namespace
        ns_api = self.__star.namespace(ske)
        ns = ns_api.create(
            instance_name,
            self.provider_config["project"],

            # Configure 'start-head' param to 'false'.
            # head node will be created using Kubernetes api.
            params=[{
                "group": "Misc",
                "name": "start-head",
                "value": "false"
            }])
        if ns is None:
            raise Exception("Failed to create a cluster '{}' in SKE '{}'"
                            .format(instance_name, self.__ske))

        # 'ray down' will change staroid namespace status to "PAUSE"
        # in this case we need to start namespace again.
        if ns.status() == "PAUSE":
            ns = ns_api.start(instance_name)

        # kube client
        kube_client = self._connect_kubeapi(instance_name)
        if kube_client is None:
            # Without a connection self.namespace may not even be set.
            raise Exception(
                "Failed to connect to the Kubernetes API of cluster '{}' "
                "in SKE '{}'".format(instance_name, self.__ske))
        core_api = client.CoreV1Api(kube_client)
        apps_api = client.AppsV1Api(kube_client)

        # retrieve container image
        image = None
        if self.provider_config["image_from_project"]:
            ray_images = apps_api.read_namespaced_deployment(
                name="ray-images", namespace=self.namespace)
            # str() guards against YAML parsing e.g. 3.7 as a float.
            py_ver = str(self.provider_config["python_version"]).replace(
                ".", "-")
            for image_container in ray_images.spec.template.spec.containers:
                if py_ver in image_container.image:
                    image = image_container.image
                    break
            logger.info(log_prefix + "use image {}".format(image))

        # Deep copy: the nested pod/service specs are edited below and the
        # caller's config must not accumulate those edits across calls.
        conf = copy.deepcopy(node_config)
        pod_spec = conf.get("pod", conf)
        service_spec = conf.get("service")
        node_uuid = str(uuid4())

        labels = dict(tags)
        labels[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        labels["ray-node-uuid"] = node_uuid
        pod_spec["metadata"]["namespace"] = self.namespace
        if "labels" in pod_spec["metadata"]:
            pod_spec["metadata"]["labels"].update(labels)
        else:
            pod_spec["metadata"]["labels"] = labels

        # The head node gets the Staroid settings from the local
        # environment forwarded into its container environment.
        node_type = pod_spec["metadata"]["labels"]["ray-node-type"]
        forwarded_env = []
        if node_type == "head":
            forwarded_env = [{
                "name": env_name,
                "value": os.environ[env_name]
            } for env_name in ("STAROID_ACCESS_TOKEN", "STAROID_ACCOUNT",
                               "STAROID_SKE") if env_name in os.environ]

        if image is not None or forwarded_env:
            for container in pod_spec["spec"]["containers"]:
                if container["name"] != "ray-node":
                    continue
                if image is not None:
                    container["image"] = image
                if forwarded_env:
                    container["env"] = (
                        (container.get("env") or []) + forwarded_env)

        logger.info(log_prefix + "calling create_namespaced_pod "
                    "(count={}).".format(count))
        new_nodes = []
        for _ in range(count):
            pod = core_api.create_namespaced_pod(self.namespace, pod_spec)
            new_nodes.append(pod)

        if service_spec is not None:
            logger.info(log_prefix + "calling create_namespaced_service "
                        "(count={}).".format(count))

            for new_node in new_nodes:
                # One service per pod, named after the pod.
                metadata = service_spec.get("metadata", {})
                metadata["name"] = new_node.metadata.name
                service_spec["metadata"] = metadata
                service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid}
                core_api.create_namespaced_service(self.namespace,
                                                   service_spec)

    def terminate_node(self, node_id):
        """Delete the pod `node_id` and, if present, its service.

        When the pod name starts with ``ray-head`` the tunnel is stopped
        and the Staroid namespace is stopped as well.
        """
        logger.info(log_prefix + "calling delete_namespaced_pod")
        core_api = self._core_api()

        core_api.delete_namespaced_pod(node_id, self.namespace)
        try:
            core_api.delete_namespaced_service(node_id, self.namespace)
        except ApiException:
            # Best effort: the service is only created when a service
            # spec was configured, so the delete is allowed to fail.
            pass

        if node_id.startswith("ray-head"):
            # Stop namespace on staroid after remove ray-head node.
            instance_name = self.cluster_name

            cluster_api = self.__star.cluster()
            ske = cluster_api.get(self.__ske)

            ns_api = self.__star.namespace(ske)
            # NOTE(review): the result of this lookup is unused; kept in
            # case the call matters to the Staroid client -- confirm.
            ns_api.get(instance_name)

            # Forget the connection; it is torn down right below.
            self.__cached.pop(instance_name, None)

            ns_api.stop_tunnel(instance_name)
            ns_api.stop(instance_name)

    def terminate_nodes(self, node_ids):
        """Terminate every node in `node_ids`, one after another."""
        for node_id in node_ids:
            self.terminate_node(node_id)

    def get_command_runner(self,
                           log_prefix,
                           node_id,
                           auth_config,
                           cluster_name,
                           process_runner,
                           use_internal_ip,
                           docker_config=None):
        """Return a StaroidCommandRunner for the pod `node_id`.

        The Kubernetes connection is established first if necessary.
        `use_internal_ip` and `docker_config` are accepted but not used.

        Raises:
            Exception: When the Kubernetes API cannot be reached.
        """
        # Connections are cached under self.cluster_name, so use that key
        # rather than the `cluster_name` argument for the lookup.
        instance_name = self.cluster_name

        # initialize connection
        if self._connect_kubeapi(instance_name) is None:
            raise Exception(
                "Failed to connect to the Kubernetes API of cluster '{}'"
                .format(instance_name))

        return StaroidCommandRunner(
            log_prefix, self.namespace, node_id, auth_config, process_runner,
            self.__cached[instance_name]["api_server"])

    @staticmethod
    def bootstrap_config(cluster_config):
        """Return `cluster_config` unchanged; no bootstrapping is done."""
        return cluster_config
|
||||
@@ -43,6 +43,12 @@ def _import_kubernetes(provider_config):
|
||||
return KubernetesNodeProvider
|
||||
|
||||
|
||||
def _import_staroid(provider_config):
    """Import the Staroid provider on demand and return its class.

    `provider_config` is accepted but not used here.
    """
    # Imported lazily so the module is only loaded when this provider
    # type is actually requested.
    import ray.autoscaler._private.staroid.node_provider as staroid_provider
    return staroid_provider.StaroidNodeProvider
|
||||
|
||||
|
||||
def _load_local_example_config():
|
||||
import ray.autoscaler.local as ray_local
|
||||
return os.path.join(
|
||||
@@ -71,6 +77,12 @@ def _load_azure_example_config():
|
||||
os.path.dirname(ray_azure.__file__), "example-full.yaml")
|
||||
|
||||
|
||||
def _load_staroid_example_config():
    """Return the path of ``example-full.yaml`` in ray.autoscaler.staroid."""
    import ray.autoscaler.staroid as ray_staroid
    package_dir = os.path.dirname(ray_staroid.__file__)
    return os.path.join(package_dir, "example-full.yaml")
|
||||
|
||||
|
||||
def _import_external(provider_config):
    """Load the class named by ``provider_config["module"]`` and return it."""
    return _load_class(path=provider_config["module"])
|
||||
@@ -81,6 +93,7 @@ _NODE_PROVIDERS = {
|
||||
"aws": _import_aws,
|
||||
"gcp": _import_gcp,
|
||||
"azure": _import_azure,
|
||||
"staroid": _import_staroid,
|
||||
"kubernetes": _import_kubernetes,
|
||||
"external": _import_external # Import an external module
|
||||
}
|
||||
@@ -90,6 +103,7 @@ _PROVIDER_PRETTY_NAMES = {
|
||||
"aws": "AWS",
|
||||
"gcp": "GCP",
|
||||
"azure": "Azure",
|
||||
"staroid": "Staroid",
|
||||
"kubernetes": "Kubernetes",
|
||||
"external": "External"
|
||||
}
|
||||
@@ -99,6 +113,7 @@ _DEFAULT_CONFIGS = {
|
||||
"aws": _load_aws_example_config,
|
||||
"gcp": _load_gcp_example_config,
|
||||
"azure": _load_azure_example_config,
|
||||
"staroid": _load_staroid_example_config,
|
||||
"kubernetes": _load_kubernetes_example_config,
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,312 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
# A namespace will be automatically created for each cluster_name in SKE.
|
||||
cluster_name: default
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
# subsequent `ray up`) this number of nodes will be started.
|
||||
initial_workers: 0
|
||||
|
||||
# Whether or not to autoscale aggressively. If this is enabled, if at any point
|
||||
# we would start more workers, we start at least enough to bring us to
|
||||
# initial_workers.
|
||||
autoscaling_mode: default
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
# exist, the autoscaler will attempt to create them. If this fails, you may
|
||||
# not have the required permissions and will have to request them to be
|
||||
# created by your cluster administrator.
|
||||
provider:
|
||||
type: staroid
|
||||
|
||||
# Access token for Staroid from https://staroid.com/settings/accesstokens.
|
||||
# Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
|
||||
# https://github.com/staroids/staroid-python#configuration
|
||||
# for more information.
|
||||
access_token:
|
||||
|
||||
# Staroid account to use. e.g. GITHUB/staroids
|
||||
# Alternatively, you can set STAROID_ACCOUNT environment variable.
|
||||
# Leave empty to select default account for given access token.
|
||||
# https://github.com/staroids/staroid-python#configuration
|
||||
# for more information.
|
||||
account:
|
||||
|
||||
# Name of a Staroid Kubernetes Engine (SKE) instance.
|
||||
# Alternatively, you can set STAROID_SKE environment variable.
|
||||
# An SKE is a virtualized Kubernetes cluster.
|
||||
# A new SKE will be created if it does not exist.
|
||||
ske: "Ray cluster"
|
||||
|
||||
# Cloud and Region to create an SKE when not exists.
|
||||
# If SKE already exists, this value will be ignored.
|
||||
# Supported cloud region can be found
|
||||
# https://docs.staroid.com/ske/cloudregion.html.
|
||||
ske_region: "aws us-west2"
|
||||
|
||||
# To create a namespace in SKE, you need to specify a Github project.
|
||||
# The Github project needs to have a staroid.yaml
|
||||
# (https://docs.staroid.com/references/staroid_yaml.html).
|
||||
# staroid.yaml defines various resources for the project, such as
|
||||
# - Building container images can be accessed from the namespace
|
||||
# - Kubernetes resources to create (like Persistent volume claim)
|
||||
# on namespace creation
|
||||
# You can fork when you need to customize.
|
||||
# 1. Fork github.com/open-datastudio/ray
|
||||
# 2. Change the .staroid/ directory to customize
|
||||
# 3. Connect forked repository (https://staroid.com/projects/settings)
|
||||
# 4. Release your customized branch
|
||||
# 4-1. Select project from 'My projects' menu
|
||||
# 4-2. Select your branch in 'Release' tab
|
||||
# 4-3. After build success, switch to 'Production'
|
||||
# 4-4. Switch Launch permission to 'Public' if required
|
||||
# 5. Change 'project' field to point your
|
||||
# repository and branch in this file
|
||||
project: "GITHUB/open-datastudio/ray:master-staroid"
|
||||
|
||||
# 'spec.containers.image' field for ray-node and ray-worker will be
|
||||
# overridden by the image built from the 'project' field above.
|
||||
# Set this value to 'false' to not override the image.
|
||||
image_from_project: true
|
||||
|
||||
# Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
|
||||
# 'project' field above provides docker image for each python version.
|
||||
# Fork 'project' if you'd like to support other python versions.
|
||||
python_version: 3.7.7
|
||||
|
||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
|
||||
# Kubernetes pod config for the head node pod.
|
||||
head_node:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "false" # use on-demand instance for head.
|
||||
|
||||
# Uncomment to locate ray head to dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
# nfs volume provides a shared volume across all ray-nodes.
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
# Image will be overridden when 'image_from_project' is true.
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
- name: RAY_ADDRESS
|
||||
value: "auto"
|
||||
|
||||
# Kubernetes pod config for worker node pods.
|
||||
worker_nodes:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
|
||||
# https://docs.staroid.com/ske/pod.html#pod
|
||||
pod.staroid.com/spot: "true" # use spot instance for workers.
|
||||
|
||||
# Uncomment to locate ray workers on a dedicated Kubernetes node
|
||||
# (GPU instance is only available for 'dedicated' isolation)
|
||||
#pod.staroid.com/isolation: dedicated
|
||||
#pod.staroid.com/instance-type: gpu-1
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: nfs-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: nfs
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/autoscaler
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["touch ~/.bashrc; trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp which cause slowdowns if is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /nfs
|
||||
name: nfs-volume
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the worker_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
# install staroid and kubernetes packages. Staroid node provider depends on them which autoscaler will use.
|
||||
- pip install -q staroid kubernetes
|
||||
# install jupyterlab
|
||||
- pip install -q jupyterlab
|
||||
- ln -s /nfs /home/ray/nfs
|
||||
- bash -c 'jupyter-lab --ip="*" --NotebookApp.token="" --NotebookApp.password="" --NotebookApp.allow_origin="*" --NotebookApp.notebook_dir="/home/ray"' &
|
||||
# show 'notebook' link in staroid management console to access jupyter notebook.
|
||||
- 'echo -e "kind: Service\napiVersion: v1\nmetadata:\n name: notebook\n annotations:\n service.staroid.com/link: show\nspec:\n ports:\n - name: http\n port: 8888\n selector:\n component: ray-head" | kubectl apply -f -'
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
@@ -0,0 +1,72 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers default to 0.
|
||||
max_workers: 1
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
# exist, the autoscaler will attempt to create them. If this fails, you may
|
||||
# not have the required permissions and will have to request them to be
|
||||
# created by your cluster administrator.
|
||||
provider:
|
||||
type: staroid
|
||||
|
||||
# Access token for Staroid from https://staroid.com/settings/accesstokens.
|
||||
# Alternatively, you can set STAROID_ACCESS_TOKEN environment variable.
|
||||
# https://github.com/staroids/staroid-python#configuration
|
||||
# for more information.
|
||||
access_token:
|
||||
|
||||
# Staroid account to use. e.g. GITHUB/staroids
|
||||
# Alternatively, you can set STAROID_ACCOUNT environment variable.
|
||||
# Leave empty to select default account for given access token.
|
||||
# https://github.com/staroids/staroid-python#configuration
|
||||
# for more information.
|
||||
account:
|
||||
|
||||
# Name of a Staroid Kubernetes Engine (SKE) instance.
|
||||
# Alternatively, you can set STAROID_SKE environment variable.
|
||||
# An SKE is a virtualized Kubernetes cluster.
|
||||
# A new SKE will be created if it does not exist.
|
||||
ske: "Ray cluster"
|
||||
|
||||
# Cloud and Region to create an SKE when not exists.
|
||||
# If SKE already exists, this value will be ignored.
|
||||
# Supported cloud region can be found
|
||||
# https://docs.staroid.com/ske/cloudregion.html.
|
||||
ske_region: "aws us-west2"
|
||||
|
||||
# To create a namespace in SKE, you need to specify a Github project.
|
||||
# The Github project needs to have a staroid.yaml
|
||||
# (https://docs.staroid.com/references/staroid_yaml.html).
|
||||
# staroid.yaml defines various resources for the project, such as
|
||||
# - Building container images can be accessed from the namespace
|
||||
# - Kubernetes resources to create (like Persistent volume claim)
|
||||
# on namespace creation
|
||||
# You can fork when you need to customize.
|
||||
# 1. Fork github.com/open-datastudio/ray
|
||||
# 2. Change the .staroid/ directory to customize
|
||||
# 3. Connect forked repository (https://staroid.com/projects/settings)
|
||||
# 4. Release your customized branch
|
||||
# 4-1. Select project from 'My projects' menu
|
||||
# 4-2. Select your branch in 'Release' tab
|
||||
# 4-3. After build success, switch to 'Production'
|
||||
# 4-4. Switch Launch permission to 'Public' if required
|
||||
# 5. Change 'project' field to point your
|
||||
# repository and branch in this file
|
||||
project: "GITHUB/open-datastudio/ray:master-staroid"
|
||||
|
||||
# 'spec.containers.image' field for ray-node and ray-worker will be
|
||||
# overridden by the image built from the 'project' field above.
|
||||
# Set this value to 'false' to not override the image.
|
||||
image_from_project: true
|
||||
|
||||
# Python version to use. One of '3.6.9', '3.7.7', '3.8.3'.
|
||||
# 'project' field above provides docker image for each python version.
|
||||
# Fork 'project' if you'd like to support other python versions.
|
||||
python_version: 3.7.7
|
||||
|
||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
Reference in New Issue
Block a user