Merge branch 'master' into py39

This commit is contained in:
Akash Patel
2020-12-16 11:04:27 -05:00
committed by GitHub
489 changed files with 36844 additions and 8744 deletions
+1 -1
View File
@@ -101,7 +101,7 @@ from ray import util # noqa: E402
# Replaced with the current commit when building the wheels.
__commit__ = "{{RAY_COMMIT_SHA}}"
__version__ = "1.1.0.dev0"
__version__ = "1.2.0.dev0"
__all__ = [
"__version__",
+19 -31
View File
@@ -136,7 +136,7 @@ def find_redis_address(address=None):
# --redis_address=123.456.78.910 --node_ip_address=123.456.78.910
# --raylet_socket_name=... --store_socket_name=... --object_manager_port=0
# --min_worker_port=10000 --max_worker_port=10999
# --node_manager_port=58578 --redis_port=6379 --num_initial_workers=8
# --node_manager_port=58578 --redis_port=6379
# --maximum_startup_concurrency=8
# --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
# --config_list=plasma_store_as_thread,True
@@ -279,7 +279,8 @@ def get_address_info_from_redis_helper(redis_address,
def get_address_info_from_redis(redis_address,
node_ip_address,
num_retries=5,
redis_password=None):
redis_password=None,
no_warning=False):
counter = 0
while True:
try:
@@ -290,10 +291,11 @@ def get_address_info_from_redis(redis_address,
raise
# Some of the information may not be in Redis yet, so wait a little
# bit.
logger.warning(
"Some processes that the driver needs to connect to have "
"not registered with Redis, so retrying. Have you run "
"'ray start' on this node?")
if not no_warning:
logger.warning(
"Some processes that the driver needs to connect to have "
"not registered with Redis, so retrying. Have you run "
"'ray start' on this node?")
time.sleep(1)
counter += 1
@@ -1251,13 +1253,11 @@ def start_raylet(redis_address,
stderr_file=None,
config=None,
java_worker_options=None,
load_code_from_local=False,
huge_pages=False,
fate_share=None,
socket_to_use=None,
head_node=False,
start_initial_python_workers_for_first_job=False,
code_search_path=None):
start_initial_python_workers_for_first_job=False):
"""Start a raylet, which is a combined local scheduler and object manager.
Args:
@@ -1294,9 +1294,6 @@ def start_raylet(redis_address,
config (dict|None): Optional Raylet configuration that will
override defaults in RayConfig.
java_worker_options (list): The command options for Java worker.
code_search_path (list): Code search path for worker. code_search_path
is added to worker command in non-multi-tenancy mode and job_config
in multi-tenancy mode.
Returns:
ProcessInfo for the process that was started.
"""
@@ -1309,7 +1306,6 @@ def start_raylet(redis_address,
raise ValueError("Cannot use valgrind and profiler at the same time.")
assert resource_spec.resolved()
num_initial_workers = resource_spec.num_cpus
static_resources = resource_spec.to_resource_dict()
# Limit the number of workers that can be started in parallel by the
@@ -1346,7 +1342,6 @@ def start_raylet(redis_address,
raylet_name,
redis_password,
session_dir,
code_search_path,
)
else:
java_worker_command = []
@@ -1366,15 +1361,18 @@ def start_raylet(redis_address,
# Create the command that the Raylet will use to start workers.
start_worker_command = [
sys.executable, worker_path, f"--node-ip-address={node_ip_address}",
sys.executable,
worker_path,
f"--node-ip-address={node_ip_address}",
f"--node-manager-port={node_manager_port}",
f"--object-store-name={plasma_store_name}",
f"--raylet-name={raylet_name}", f"--redis-address={redis_address}",
f"--config-list={config_str}", f"--temp-dir={temp_dir}",
f"--metrics-agent-port={metrics_agent_port}"
f"--raylet-name={raylet_name}",
f"--redis-address={redis_address}",
f"--config-list={config_str}",
f"--temp-dir={temp_dir}",
f"--metrics-agent-port={metrics_agent_port}",
"RAY_WORKER_DYNAMIC_OPTION_PLACEHOLDER",
]
if code_search_path:
start_worker_command.append(f"--code-search-path={code_search_path}")
if redis_password:
start_worker_command += [f"--redis-password={redis_password}"]
@@ -1389,12 +1387,6 @@ def start_raylet(redis_address,
if max_worker_port is None:
max_worker_port = 0
if code_search_path is not None and len(code_search_path) > 0:
load_code_from_local = True
if load_code_from_local:
start_worker_command += ["--load-code-from-local"]
# Create agent command
agent_command = [
sys.executable,
@@ -1425,7 +1417,6 @@ def start_raylet(redis_address,
f"--node_ip_address={node_ip_address}",
f"--redis_address={gcs_ip_address}",
f"--redis_port={gcs_port}",
f"--num_initial_workers={num_initial_workers}",
f"--maximum_startup_concurrency={maximum_startup_concurrency}",
f"--static_resource_list={resource_argument}",
f"--config_list={config_str}",
@@ -1485,8 +1476,7 @@ def get_ray_jars_dir():
def build_java_worker_command(java_worker_options, redis_address,
node_manager_port, plasma_store_name,
raylet_name, redis_password, session_dir,
code_search_path):
raylet_name, redis_password, session_dir):
"""This method assembles the command used to start a Java worker.
Args:
@@ -1497,7 +1487,6 @@ def build_java_worker_command(java_worker_options, redis_address,
raylet_name (str): The name of the raylet socket to create.
redis_password (str): The password of connect to redis.
session_dir (str): The path of this session.
code_search_path (list): Teh job code search path.
Returns:
The command string for starting Java worker.
"""
@@ -1518,7 +1507,6 @@ def build_java_worker_command(java_worker_options, redis_address,
pairs.append(("ray.home", RAY_HOME))
pairs.append(("ray.logging.dir", os.path.join(session_dir, "logs")))
pairs.append(("ray.session-dir", session_dir))
pairs.append(("ray.job.code-search-path", code_search_path))
command = ["java"] + ["-D{}={}".format(*pair) for pair in pairs]
command += ["RAY_WORKER_RAYLET_CONFIG_PLACEHOLDER"]
+79 -18
View File
@@ -336,6 +336,7 @@ cdef execute_task(
const c_vector[shared_ptr[CRayObject]] &c_args,
const c_vector[CObjectID] &c_arg_reference_ids,
const c_vector[CObjectID] &c_return_ids,
const c_string debugger_breakpoint,
c_vector[shared_ptr[CRayObject]] *returns):
worker = ray.worker.global_worker
@@ -351,6 +352,18 @@ cdef execute_task(
# Automatically restrict the GPUs available to this task.
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
# Helper method used to exit current asyncio actor.
# This is called when a KeyboardInterrupt is received by the main thread.
# Upon receiving a KeyboardInterrupt signal, Ray will exit the current
# worker. If the worker is processing normal tasks, Ray treats it as task
# cancellation from ray.cancel(object_ref). If the worker is an asyncio
# actor, Ray will exit the actor.
def exit_current_actor_if_asyncio():
    """Raise a flagged ``SystemExit(0)`` if the current actor is asyncio.

    Returns normally (doing nothing) when
    ``core_worker.current_actor_is_asyncio()`` is false.

    Raises:
        SystemExit: With exit code 0 and ``is_ray_terminate`` set to True,
            when the current actor is an asyncio actor.
    """
    if not core_worker.current_actor_is_asyncio():
        return
    termination = SystemExit(0)
    # Tag the exception before raising it; presumably whoever catches it
    # uses this flag to recognise a Ray-initiated exit (verify at the
    # catch site).
    termination.is_ray_terminate = True
    raise termination
function_descriptor = CFunctionDescriptorToPython(
ray_function.GetFunctionDescriptor())
@@ -457,9 +470,26 @@ cdef execute_task(
task_exception = True
try:
with ray.worker._changeproctitle(title, next_title):
if debugger_breakpoint != b"":
ray.util.pdb.set_trace(
breakpoint_uuid=debugger_breakpoint)
outputs = function_executor(*args, **kwargs)
next_breakpoint = (
ray.worker.global_worker.debugger_breakpoint)
if next_breakpoint != b"":
# If this happens, the user typed "remote" and
# there were no more remote calls left in this
# task. In that case we just exit the debugger.
ray.experimental.internal_kv._internal_kv_put(
"RAY_PDB_{}".format(next_breakpoint),
"{\"exit_debugger\": true}")
ray.experimental.internal_kv._internal_kv_del(
"RAY_PDB_CONTINUE_{}".format(next_breakpoint)
)
ray.worker.global_worker.debugger_breakpoint = b""
task_exception = False
except KeyboardInterrupt as e:
exit_current_actor_if_asyncio()
raise TaskCancelledError(
core_worker.get_current_task_id())
if c_return_ids.size() == 1:
@@ -467,6 +497,7 @@ cdef execute_task(
# Check for a cancellation that was called when the function
# was exiting and was raised after the except block.
if not check_signals().ok():
exit_current_actor_if_asyncio()
task_exception = True
raise TaskCancelledError(
core_worker.get_current_task_id())
@@ -523,6 +554,7 @@ cdef CRayStatus task_execution_handler(
const c_vector[shared_ptr[CRayObject]] &c_args,
const c_vector[CObjectID] &c_arg_reference_ids,
const c_vector[CObjectID] &c_return_ids,
const c_string debugger_breakpoint,
c_vector[shared_ptr[CRayObject]] *returns) nogil:
with gil:
@@ -532,7 +564,7 @@ cdef CRayStatus task_execution_handler(
# it does, that indicates that there was an internal error.
execute_task(task_type, task_name, ray_function, c_resources,
c_args, c_arg_reference_ids, c_return_ids,
returns)
debugger_breakpoint, returns)
except Exception:
traceback_str = traceback.format_exc() + (
"An unexpected internal error occurred while the worker "
@@ -1041,6 +1073,7 @@ cdef class CoreWorker:
PlacementGroupID placement_group_id,
int64_t placement_group_bundle_index,
c_bool placement_group_capture_child_tasks,
c_string debugger_breakpoint,
override_environment_variables):
cdef:
unordered_map[c_string, double] c_resources
@@ -1059,15 +1092,18 @@ cdef class CoreWorker:
language.lang, function_descriptor.descriptor)
prepare_args(self, language, args, &args_vector)
with nogil:
CCoreWorkerProcess.GetCoreWorker().SubmitTask(
ray_function, args_vector, CTaskOptions(
name, num_returns, c_resources,
c_override_environment_variables),
&return_ids, max_retries,
c_pair[CPlacementGroupID, int64_t](
c_placement_group_id, placement_group_bundle_index),
placement_group_capture_child_tasks)
# NOTE(edoakes): releasing the GIL while calling this method causes
# segfaults. See relevant issue for details:
# https://github.com/ray-project/ray/pull/12803
CCoreWorkerProcess.GetCoreWorker().SubmitTask(
ray_function, args_vector, CTaskOptions(
name, num_returns, c_resources,
c_override_environment_variables),
&return_ids, max_retries,
c_pair[CPlacementGroupID, int64_t](
c_placement_group_id, placement_group_bundle_index),
placement_group_capture_child_tasks,
debugger_breakpoint)
return VectorToObjectRefs(return_ids)
@@ -1170,6 +1206,21 @@ cdef class CoreWorker:
CCoreWorkerProcess.GetCoreWorker().
RemovePlacementGroup(c_placement_group_id))
def wait_placement_group_ready(self,
                               PlacementGroupID placement_group_id,
                               int32_t timeout_seconds):
    """Wait on a placement group through the core worker.

    Calls ``WaitPlacementGroupReady`` on the core worker with the GIL
    released and translates the returned status.

    Args:
        placement_group_id: ID of the placement group to wait for.
        timeout_seconds: Timeout value forwarded to the core worker call.

    Returns:
        The result of ``status.ok()`` for the returned status.

    Raises:
        Exception: If the returned status reports "not found", i.e. the
            placement group does not exist.
    """
    cdef CRayStatus status
    # Convert the Python-level ID into its C++ counterpart before dropping
    # the GIL, since Python objects cannot be touched inside ``nogil``.
    cdef CPlacementGroupID cplacement_group_id = (
        CPlacementGroupID.FromBinary(placement_group_id.binary()))
    cdef int ctimeout_seconds = timeout_seconds
    with nogil:
        status = CCoreWorkerProcess.GetCoreWorker() \
            .WaitPlacementGroupReady(cplacement_group_id, ctimeout_seconds)
    if status.IsNotFound():
        raise Exception("Placement group {} does not exist.".format(
            placement_group_id))
    return status.ok()
def submit_actor_task(self,
Language language,
ActorID actor_id,
@@ -1193,12 +1244,14 @@ cdef class CoreWorker:
language.lang, function_descriptor.descriptor)
prepare_args(self, language, args, &args_vector)
with nogil:
CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
c_actor_id,
ray_function,
args_vector, CTaskOptions(name, num_returns, c_resources),
&return_ids)
# NOTE(edoakes): releasing the GIL while calling this method causes
# segfaults. See relevant issue for details:
# https://github.com/ray-project/ray/pull/12803
CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
c_actor_id,
ray_function,
args_vector, CTaskOptions(name, num_returns, c_resources),
&return_ids)
return VectorToObjectRefs(return_ids)
@@ -1400,8 +1453,16 @@ cdef class CoreWorker:
context = worker.get_serialization_context()
serialized_object = context.serialize(output)
data_sizes.push_back(serialized_object.total_bytes)
metadatas.push_back(
string_to_buffer(serialized_object.metadata))
metadata = serialized_object.metadata
if ray.worker.global_worker.debugger_get_breakpoint:
breakpoint = (
ray.worker.global_worker.debugger_get_breakpoint)
metadata += (
b"," + ray_constants.OBJECT_METADATA_DEBUG_PREFIX +
breakpoint.encode())
# Reset debugging context of this worker.
ray.worker.global_worker.debugger_get_breakpoint = b""
metadatas.push_back(string_to_buffer(metadata))
serialized_objects.append(serialized_object)
contained_ids.push_back(
ObjectRefsToVector(serialized_object.contained_object_refs)
+10
View File
@@ -1,6 +1,7 @@
import inspect
import logging
import weakref
import _thread
import ray.ray_constants as ray_constants
import ray._raylet
@@ -1006,6 +1007,7 @@ def exit_actor():
"""Intentionally exit the current actor.
This function is used to disconnect an actor and exit the worker.
Any ``atexit`` handlers installed in the actor will be run.
Raises:
Exception: An exception is raised if this is a driver or this
@@ -1018,6 +1020,14 @@ def exit_actor():
ray.disconnect()
# Disconnect global state from GCS.
ray.state.state.disconnect()
# In asyncio actor mode, we can't raise SystemExit because it will just
# quit the asyncio event loop thread, not the main thread. Instead, we
# raise an interrupt signal to the main thread to tell it to exit.
if worker.core_worker.current_actor_is_asyncio():
_thread.interrupt_main()
return
# Set a flag to indicate this is an intentional actor exit. This
# reduces log verbosity.
exit = SystemExit(0)
+129 -28
View File
@@ -13,18 +13,19 @@ import collections
from ray.experimental.internal_kv import _internal_kv_put, \
_internal_kv_initialized
from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
TAG_RAY_FILE_MOUNTS_CONTENTS,
TAG_RAY_NODE_STATUS, TAG_RAY_NODE_KIND,
TAG_RAY_USER_NODE_TYPE, STATUS_UP_TO_DATE,
NODE_KIND_WORKER, NODE_KIND_UNMANAGED)
from ray.autoscaler.tags import (
TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
TAG_RAY_FILE_MOUNTS_CONTENTS, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_KIND,
TAG_RAY_USER_NODE_TYPE, STATUS_UP_TO_DATE, NODE_KIND_WORKER,
NODE_KIND_UNMANAGED, NODE_KIND_HEAD)
from ray.autoscaler._private.providers import _get_node_provider
from ray.autoscaler._private.updater import NodeUpdaterThread
from ray.autoscaler._private.node_launcher import NodeLauncher
from ray.autoscaler._private.resource_demand_scheduler import \
ResourceDemandScheduler, NodeType, NodeID
get_bin_pack_residual, ResourceDemandScheduler, NodeType, NodeID, NodeIP, \
ResourceDict
from ray.autoscaler._private.util import ConcurrentCounter, validate_config, \
with_head_node_ip, hash_launch_conf, hash_runtime_conf, \
with_head_node_ip, hash_launch_conf, hash_runtime_conf, add_prefix, \
DEBUG_AUTOSCALING_STATUS, DEBUG_AUTOSCALING_ERROR
from ray.autoscaler._private.constants import \
AUTOSCALER_MAX_NUM_FAILURES, AUTOSCALER_MAX_LAUNCH_BATCH, \
@@ -47,7 +48,7 @@ class StandardAutoscaler:
There are two ways to start an autoscaling cluster: manually by running
`ray start --head --autoscaling-config=/path/to/config.yaml` on an
instance that has permission to launch other instances, or you can also use
`ray create_or_update /path/to/config.yaml` from your laptop, which will
`ray up /path/to/config.yaml` from your laptop, which will
configure the right AWS/Cloud roles automatically.
StandardAutoscaler's `update` method is periodically called by `monitor.py`
@@ -66,8 +67,11 @@ class StandardAutoscaler:
max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
max_failures=AUTOSCALER_MAX_NUM_FAILURES,
process_runner=subprocess,
update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
prefix_cluster_info=False):
self.config_path = config_path
# Prefix each line of info string with cluster name if True
self.prefix_cluster_info = prefix_cluster_info
# Keep this before self.reset (self.provider needs to be created
# exactly once).
self.provider = None
@@ -164,27 +168,35 @@ class StandardAutoscaler:
last_used = self.load_metrics.last_used_time_by_ip
horizon = now - (60 * self.config["idle_timeout_minutes"])
nodes_to_terminate = []
nodes_to_terminate: Dict[NodeID, bool] = []
node_type_counts = collections.defaultdict(int)
# Sort based on last used to make sure to keep min_workers that
# were most recently used. Otherwise, _keep_min_workers_of_node_type
# might keep a node that should be terminated.
for node_id in self._sort_based_on_last_used(nodes, last_used):
sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)
# Don't terminate nodes needed by request_resources()
nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
if self.resource_demand_vector:
nodes_allowed_to_terminate = self._get_nodes_allowed_to_terminate(
sorted_node_ids)
for node_id in sorted_node_ids:
# Make sure to not kill idle node types if the number of workers
# of that type is lower/equal to the min_workers of that type.
if self._keep_min_worker_of_node_type(
node_id,
node_type_counts) and self.launch_config_ok(node_id):
# of that type is lower/equal to the min_workers of that type
# or it is needed for request_resources().
if (self._keep_min_worker_of_node_type(node_id, node_type_counts)
or not nodes_allowed_to_terminate.get(
node_id, True)) and self.launch_config_ok(node_id):
continue
node_ip = self.provider.internal_ip(node_id)
if node_ip in last_used and last_used[node_ip] < horizon:
logger.info("StandardAutoscaler: "
"{}: Terminating idle node".format(node_id))
"{}: Terminating idle node.".format(node_id))
nodes_to_terminate.append(node_id)
elif not self.launch_config_ok(node_id):
logger.info("StandardAutoscaler: "
"{}: Terminating outdated node".format(node_id))
"{}: Terminating outdated node.".format(node_id))
nodes_to_terminate.append(node_id)
if nodes_to_terminate:
@@ -198,7 +210,7 @@ class StandardAutoscaler:
len(nodes_to_terminate)) > self.config["max_workers"] and nodes:
to_terminate = nodes.pop()
logger.info("StandardAutoscaler: "
"{}: Terminating unneeded node".format(to_terminate))
"{}: Terminating unneeded node.".format(to_terminate))
nodes_to_terminate.append(to_terminate)
if nodes_to_terminate:
@@ -226,15 +238,23 @@ class StandardAutoscaler:
if not updater.is_alive():
completed.append(node_id)
if completed:
nodes_to_terminate: List[NodeID] = []
for node_id in completed:
if self.updaters[node_id].exitcode == 0:
self.num_successful_updates[node_id] += 1
# Mark the node as active to prevent the node recovery
# logic immediately trying to restart Ray on the new node.
self.load_metrics.mark_active(
self.provider.internal_ip(node_id))
else:
logger.error(f"StandardAutoscaler: {node_id}: Terminating "
"failed to setup/initialize node.")
nodes_to_terminate.append(node_id)
self.num_failed_updates[node_id] += 1
del self.updaters[node_id]
# Mark the node as active to prevent the node recovery logic
# immediately trying to restart Ray on the new node.
self.load_metrics.mark_active(self.provider.internal_ip(node_id))
if nodes_to_terminate:
self.provider.terminate_nodes(nodes_to_terminate)
nodes = self.workers()
self.log_info_string(nodes)
@@ -266,14 +286,16 @@ class StandardAutoscaler:
last_used: Dict[str, float]) -> List[NodeID]:
"""Sort the nodes based on the last time they were used.
The first item in the return list is the least recently used.
The first item in the return list is the most recently used.
"""
updated_last_used = copy.deepcopy(last_used)
now = time.time()
# Add the unconnected nodes as the least recently used (the end of
# list). This prioritizes connected nodes.
least_recently_used = -1
for node_id in nodes:
node_ip = self.provider.internal_ip(node_id)
if node_ip not in updated_last_used:
updated_last_used[node_ip] = now
updated_last_used[node_ip] = least_recently_used
def last_time_used(node_id: NodeID):
node_ip = self.provider.internal_ip(node_id)
@@ -281,9 +303,86 @@ class StandardAutoscaler:
return sorted(nodes, key=last_time_used, reverse=True)
def _keep_min_worker_of_node_type(self, node_id: NodeID,
node_type_counts: Dict[NodeType, int]):
"""Returns if workers of node_type should be terminated.
def _get_nodes_allowed_to_terminate(
        self, sorted_node_ids: List[NodeID]) -> Dict[NodeID, bool]:
    """Returns the nodes allowed to terminate for request_resources().

    Bin-packs ``self.resource_demand_vector`` onto the head node followed
    by every worker that carries a user node type tag (in the order given),
    then reports per worker whether the packing left its resources
    untouched.

    Args:
        sorted_node_ids: the node ids sorted based on last used (LRU last).

    Returns:
        nodes_allowed_to_terminate: whether the node id is allowed to
            terminate or not. Only nodes tagged with
            TAG_RAY_USER_NODE_TYPE get an entry; callers should treat a
            missing entry as "no opinion".
    """
    # TODO(ameer): try merging this with resource_demand_scheduler
    # code responsible for adding nodes for request_resources().
    nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
    head_node_resources: ResourceDict = copy.deepcopy(
        self.available_node_types[self.config["head_node_type"]][
            "resources"])
    if not head_node_resources:
        # Legacy yaml might include {} in the resources field.
        # TODO(ameer): this is somewhat duplicated in
        # resource_demand_scheduler.py.
        head_id: List[NodeID] = self.provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        })
        if head_id:
            head_ip = self.provider.internal_ip(head_id[0])
            static_nodes: Dict[NodeIP, ResourceDict] = \
                self.load_metrics.get_static_node_resources_by_ip()
            # Tolerate a head IP that is absent from the static resources
            # instead of raising KeyError; this mirrors the worker lookup
            # below, which already falls back to {}.
            head_node_resources = static_nodes.get(head_ip, {})
        else:
            head_node_resources = {}

    # Index 0 is always the head node; workers follow in sorted order.
    max_node_resources: List[ResourceDict] = [head_node_resources]
    resource_demand_vector_worker_node_ids = []
    # Get max resources on all the non terminated nodes.
    for node_id in sorted_node_ids:
        tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in tags:
            node_type = tags[TAG_RAY_USER_NODE_TYPE]
            node_resources: ResourceDict = copy.deepcopy(
                self.available_node_types[node_type]["resources"])
            if not node_resources:
                # Legacy yaml might include {} in the resources field.
                static_nodes: Dict[NodeIP, ResourceDict] = \
                    self.load_metrics.get_static_node_resources_by_ip()
                node_ip = self.provider.internal_ip(node_id)
                node_resources = static_nodes.get(node_ip, {})
            max_node_resources.append(node_resources)
            resource_demand_vector_worker_node_ids.append(node_id)

    # Since it is sorted based on last used, we "keep" nodes that are
    # most recently used when we binpack. We assume get_bin_pack_residual
    # is following the given order here.
    used_resource_requests: List[ResourceDict]
    _, used_resource_requests = \
        get_bin_pack_residual(max_node_resources,
                              self.resource_demand_vector)
    # Remove the first entry (the head node) from both lists so that the
    # remaining indices line up with resource_demand_vector_worker_node_ids.
    max_node_resources.pop(0)
    used_resource_requests.pop(0)
    for i, node_id in enumerate(resource_demand_vector_worker_node_ids):
        if used_resource_requests[i] == max_node_resources[i] \
                and max_node_resources[i]:
            # No resources of the node were needed for request_resources().
            # max_node_resources[i] is an empty dict for legacy yamls
            # before the node is connected.
            nodes_allowed_to_terminate[node_id] = True
        else:
            nodes_allowed_to_terminate[node_id] = False
    return nodes_allowed_to_terminate
def _keep_min_worker_of_node_type(
self, node_id: NodeID,
node_type_counts: Dict[NodeType, int]) -> bool:
"""Returns if workers of node_type can be terminated.
The worker cannot be terminated to respect min_workers constraint.
Receives the counters of running nodes so far and determines if idle
node_id should be terminated or not. It also updates the counters
@@ -293,7 +392,7 @@ class StandardAutoscaler:
node_type_counts(Dict[NodeType, int]): The non_terminated node
types counted so far.
Returns:
bool: if workers of node_types should be terminated or not.
bool: if workers of node_types can be terminated or not.
"""
tags = self.provider.node_tags(node_id)
if TAG_RAY_USER_NODE_TYPE in tags:
@@ -589,6 +688,8 @@ class StandardAutoscaler:
self.load_metrics.get_resource_utilization())
if _internal_kv_initialized():
_internal_kv_put(DEBUG_AUTOSCALING_STATUS, tmp, overwrite=True)
if self.prefix_cluster_info:
tmp = add_prefix(tmp, self.config["cluster_name"])
logger.debug(tmp)
def info_string(self, nodes):
@@ -29,8 +29,6 @@ from ray.autoscaler._private.subprocess_output_util import (
from ray.autoscaler._private.cli_logger import cli_logger, cf
from ray.util.debug import log_once
from ray.autoscaler._private.constants import RAY_HOME
logger = logging.getLogger(__name__)
# How long to wait for a node to start, in seconds
@@ -114,6 +112,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
self.node_id = str(node_id)
self.namespace = namespace
self.kubectl = ["kubectl", "-n", self.namespace]
self._home_cached = None
def run(
self,
@@ -195,7 +194,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
logger.warning("'rsync_filter' detected but is currently "
"unsupported for k8s.")
if target.startswith("~"):
target = RAY_HOME + target[1:]
target = self._home + target[1:]
try:
flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -211,7 +210,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
"rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
UserWarning)
if target.startswith("~"):
target = RAY_HOME + target[1:]
target = self._home + target[1:]
self.process_runner.check_call(self.kubectl + [
"cp", source, "{}/{}:{}".format(self.namespace, self.node_id,
@@ -219,8 +218,8 @@ class KubernetesCommandRunner(CommandRunnerInterface):
])
def run_rsync_down(self, source, target, options=None):
if target.startswith("~"):
target = RAY_HOME + target[1:]
if source.startswith("~"):
source = self._home + source[1:]
try:
flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -236,7 +235,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
"rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
UserWarning)
if target.startswith("~"):
target = RAY_HOME + target[1:]
target = self._home + target[1:]
self.process_runner.check_call(self.kubectl + [
"cp", "{}/{}:{}".format(self.namespace, self.node_id, source),
@@ -244,8 +243,21 @@ class KubernetesCommandRunner(CommandRunnerInterface):
])
def remote_shell_command_str(self):
return "{} exec -it {} bash".format(" ".join(self.kubectl),
self.node_id)
return "{} exec -it {} -- bash".format(" ".join(self.kubectl),
self.node_id)
@property
def _home(self):
    """Return the value of ``HOME`` on the node, looked up once and cached.

    The first access runs ``printenv HOME`` on the node through
    ``kubectl exec`` using ``self.process_runner``; later accesses return
    the value stored in ``self._home_cached``.
    """
    # TODO (Dmitri): Think about how to use the node's HOME variable
    # without making an extra kubectl exec call.
    cached = self._home_cached
    if cached is not None:
        return cached
    printenv_args = ["exec", "-it", self.node_id, "--", "printenv", "HOME"]
    shell_line = " ".join(self.kubectl + printenv_args)
    output_bytes = self.process_runner.check_output(shell_line, shell=True)
    # Drop the trailing newline / carriage return from the command output.
    self._home_cached = output_bytes.decode().strip("\n\r")
    return self._home_cached
class SSHOptions:
@@ -5,6 +5,7 @@ _configured = False
_core_api = None
_auth_api = None
_extensions_beta_api = None
_custom_objects_api = None
def _load_config():
@@ -45,4 +46,13 @@ def extensions_beta_api():
return _extensions_beta_api
def custom_objects_api():
    """Return the module-wide ``CustomObjectsApi`` client.

    The client is created on first use (after ``_load_config()`` has been
    called) and the same object is returned on every later call.
    """
    global _custom_objects_api
    if _custom_objects_api is not None:
        return _custom_objects_api
    _load_config()
    _custom_objects_api = kubernetes.client.CustomObjectsApi()
    return _custom_objects_api
log_prefix = "KubernetesNodeProvider: "
@@ -1,4 +1,6 @@
import copy
import logging
import math
from kubernetes import client
from kubernetes.client.rest import ApiException
@@ -45,9 +47,10 @@ def not_provided_msg(resource_type):
def bootstrap_kubernetes(config):
if not config["provider"]["use_internal_ips"]:
return ValueError("Exposing external IP addresses for ray pods isn't "
"currently supported. Please set "
"'use_internal_ips' to false.")
return ValueError(
"Exposing external IP addresses for ray containers isn't "
"currently supported. Please set "
"'use_internal_ips' to false.")
namespace = _configure_namespace(config["provider"])
_configure_autoscaler_service_account(namespace, config["provider"])
_configure_autoscaler_role(namespace, config["provider"])
@@ -56,6 +59,62 @@ def bootstrap_kubernetes(config):
return config
def fillout_resources_kubernetes(config):
    """Fill in autodetected resources for each available node type.

    For every entry in ``config["available_node_types"]`` the first
    container of the node's pod spec is inspected and the detected CPU/GPU
    amounts are merged into that node type's ``"resources"`` dict (created
    if missing). Detected values overwrite existing "CPU"/"GPU" entries.

    Args:
        config: The cluster config dict. It is modified in place.

    Returns:
        The same ``config`` object. If it has no "available_node_types"
        key it is returned unchanged.
    """
    if "available_node_types" not in config:
        # Nothing to fill out. Indexing the missing key here would raise
        # KeyError, so hand the config back untouched.
        return config
    # Iterate over a copy so the config can be mutated inside the loop.
    node_types = copy.deepcopy(config["available_node_types"])
    for node_type in node_types:
        container_data = node_types[node_type]["node_config"]["spec"][
            "containers"][0]
        autodetected_resources = get_autodetected_resources(container_data)
        config["available_node_types"][node_type].setdefault(
            "resources", {}).update(autodetected_resources)
        logger.debug(
            "Updating the resources of node type {} to include {}.".format(
                node_type, autodetected_resources))
    return config
def get_autodetected_resources(container_data):
    """Return the CPU and GPU amounts detected for one container spec.

    Args:
        container_data: A container dict; its optional "resources" entry
            is what gets inspected.

    Returns:
        A dict with the keys "CPU" and "GPU". Both are 0 when the
        container has no "resources" entry (or it is None).
    """
    resource_spec = container_data.get("resources")
    if resource_spec is None:
        return {"CPU": 0, "GPU": 0}
    detected = {}
    for key in ("cpu", "gpu"):
        detected[key.upper()] = get_resource(resource_spec, key)
    return detected
def get_resource(container_resources, resource_name):
    """Return the smaller of the request and limit for one resource.

    Args:
        container_resources: The container's "resources" dict.
        resource_name: Name of the resource to look up (e.g. "cpu").

    Returns:
        The minimum of the "requests" and "limits" values as an int, or 0
        when neither field specifies the resource.
    """
    bounds = [
        _get_resource(container_resources, resource_name, field_name=field)
        for field in ("requests", "limits")
    ]
    smallest = min(bounds)
    # Infinity is the "not specified" marker returned by _get_resource.
    if smallest == float("inf"):
        return 0
    return int(smallest)
def _get_resource(container_resources, resource_name, field_name):
    """Look up one resource under one field of a container's resources.

    Args:
        container_resources: The container's "resources" dict.
        resource_name: Name of the resource to look up (e.g. "cpu").
        field_name: Which sub-dict to read, e.g. "requests" or "limits".

    Returns:
        The parsed resource amount, or ``float("inf")`` when the field or
        the resource is not present.
    """
    if field_name not in container_resources:
        return float("inf")
    field_values = container_resources[field_name]
    if resource_name not in field_values:
        return float("inf")
    return _parse_resource(field_values[resource_name])
def _parse_resource(resource):
    """Convert a Kubernetes resource quantity into a whole number of units.

    Args:
        resource: The quantity as found in the pod spec, e.g. ``2``,
            ``"2"``, ``"500m"`` (milli-units) or a fractional value such
            as ``"0.5"`` / ``0.5``.

    Returns:
        An int. Milli and fractional quantities are rounded up.

    Raises:
        ValueError: If the quantity is not numeric (other suffixes are not
            understood).
    """
    resource_str = str(resource)
    if resource_str.endswith("m"):
        # Milli-units: "1500m" means 1.5 units, rounded up to 2.
        return math.ceil(int(resource_str[:-1]) / 1000)
    try:
        return int(resource_str)
    except ValueError:
        # Fractional quantities such as "0.5" are valid in Kubernetes;
        # round up, consistent with the milli-unit branch above.
        return math.ceil(float(resource_str))
def _configure_namespace(provider_config):
namespace_field = "namespace"
if namespace_field not in provider_config:
@@ -6,7 +6,8 @@ from kubernetes.client.rest import ApiException
from ray.autoscaler._private.command_runner import KubernetesCommandRunner
from ray.autoscaler._private.kubernetes import core_api, log_prefix, \
extensions_beta_api
from ray.autoscaler._private.kubernetes.config import bootstrap_kubernetes
from ray.autoscaler._private.kubernetes.config import bootstrap_kubernetes, \
fillout_resources_kubernetes
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
@@ -177,6 +178,11 @@ class KubernetesNodeProvider(NodeProvider):
def bootstrap_config(cluster_config):
return bootstrap_kubernetes(cluster_config)
@staticmethod
def fillout_available_node_types_resources(cluster_config):
    """Fills out missing "resources" field for available_node_types.

    Delegates to ``fillout_resources_kubernetes`` and returns whatever it
    returns.

    Args:
        cluster_config: The cluster config dict passed through to
            ``fillout_resources_kubernetes``.
    """
    return fillout_resources_kubernetes(cluster_config)
def _add_service_name_to_service_port(spec, svc_name):
"""Goes recursively through the ingress manifest and adds the
@@ -82,12 +82,14 @@ class LoadMetrics:
def prune(mapping):
unwanted = set(mapping) - active_ips
for unwanted_key in unwanted:
logger.info("LoadMetrics: "
"Removed mapping: {} - {}".format(
unwanted_key, mapping[unwanted_key]))
# TODO (Alex): Change this back to info after #12138.
logger.debug("LoadMetrics: "
"Removed mapping: {} - {}".format(
unwanted_key, mapping[unwanted_key]))
del mapping[unwanted_key]
if unwanted:
logger.info(
# TODO (Alex): Change this back to info after #12138.
logger.debug(
"LoadMetrics: "
"Removed {} stale ip mappings: {} not in {}".format(
len(unwanted), unwanted, active_ips))
@@ -135,24 +135,6 @@ class ResourceDemandScheduler:
this set of resources. This differs from resources_demands in
that we don't take into account existing usage.
"""
# If the user is using request_resources() API, calculate the remaining
# delta resources required to meet their requested cluster size.
if ensure_min_cluster_size is not None:
used_resources = []
for ip, max_res in max_resources_by_ip.items():
res = copy.deepcopy(max_res)
_inplace_subtract(res, unused_resources_by_ip.get(ip, {}))
used_resources.append(res)
# Example: user requests 1000 CPUs, but the cluster is currently
# 500 CPUs in size with 250 used. Then, the delta is 750 CPUs that
# we need to fit to get the cluster to scale to 1000.
resource_requests, _ = get_bin_pack_residual(
used_resources, ensure_min_cluster_size)
resource_demands += resource_requests
else:
resource_requests = []
if self.is_legacy_yaml():
# When using legacy yaml files we need to infer the head & worker
# node resources from the static node resources from LoadMetrics.
@@ -166,9 +148,12 @@ class ResourceDemandScheduler:
logger.info("Cluster resources: {}".format(node_resources))
logger.info("Node counts: {}".format(node_type_counts))
# Step 2: add nodes to add to satisfy min_workers for each type
node_resources, node_type_counts, min_workers_nodes_to_add = \
(node_resources,
node_type_counts,
adjusted_min_workers) = \
_add_min_workers_nodes(
node_resources, node_type_counts, self.node_types)
node_resources, node_type_counts, self.node_types,
self.max_workers, ensure_min_cluster_size)
# Step 3: add nodes for strict spread groups
logger.info(f"Placement group demands: {pending_placement_groups}")
@@ -180,8 +165,16 @@ class ResourceDemandScheduler:
not self.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]:
# Need to launch worker nodes to later infer their
# resources.
# We add request_resources() demands here to make sure we launch
# a single worker sometimes even if min_workers = 0 and resource
# demands is empty.
if ensure_min_cluster_size:
request_resources_demands = ensure_min_cluster_size
else:
request_resources_demands = []
return self._legacy_worker_node_to_launch(
nodes, launching_nodes, node_resources, resource_demands)
nodes, launching_nodes, node_resources,
resource_demands + request_resources_demands)
placement_group_nodes_to_add, node_resources, node_type_counts = \
self.reserve_and_allocate_spread(
strict_spreads, node_resources, node_type_counts)
@@ -194,20 +187,15 @@ class ResourceDemandScheduler:
logger.info("Unfulfilled demands: {}".format(unfulfilled))
# Add 1 to account for the head node.
max_to_add = self.max_workers + 1 - sum(node_type_counts.values())
if resource_requests:
nodes_to_add_based_on_requests = get_nodes_for(
self.node_types, node_type_counts, max_to_add,
resource_requests)
else:
nodes_to_add_based_on_requests = {}
nodes_to_add_based_on_demand = get_nodes_for(
self.node_types, node_type_counts, max_to_add, unfulfilled)
# Merge nodes to add based on demand and nodes to add based on
# min_workers constraint. We add them because nodes to add based on
# demand was calculated after the min_workers constraint was respected.
total_nodes_to_add = {}
for node_type in self.node_types:
nodes_to_add = (min_workers_nodes_to_add.get(
nodes_to_add = (adjusted_min_workers.get(
node_type, 0) + placement_group_nodes_to_add.get(node_type, 0)
+ nodes_to_add_based_on_demand.get(node_type, 0))
if nodes_to_add > 0:
@@ -216,7 +204,7 @@ class ResourceDemandScheduler:
# Limit the number of concurrent launches
total_nodes_to_add = self._get_concurrent_resource_demand_to_launch(
total_nodes_to_add, unused_resources_by_ip.keys(), nodes,
launching_nodes, nodes_to_add_based_on_requests)
launching_nodes, adjusted_min_workers)
logger.info("Node requests: {}".format(total_nodes_to_add))
return total_nodes_to_add
@@ -294,7 +282,7 @@ class ResourceDemandScheduler:
connected_nodes: List[NodeIP],
non_terminated_nodes: List[NodeID],
pending_launches_nodes: Dict[NodeType, int],
nodes_to_add_based_on_requests: Dict[NodeType, int],
adjusted_min_workers: Dict[NodeType, int],
) -> Dict[NodeType, int]:
"""Updates the max concurrent resources to launch for each node type.
@@ -314,9 +302,10 @@ class ResourceDemandScheduler:
connected_nodes: Running nodes (from LoadMetrics).
non_terminated_nodes: Non terminated nodes (pending/running).
pending_launches_nodes: Nodes that are in the launch queue.
nodes_to_add_based_on_requests: Nodes to launch to satisfy
request_resources(). This overrides the launch limits since the
user is hinting to immediately scale up to this size.
adjusted_min_workers: Nodes to launch to satisfy
min_workers and request_resources(). This overrides the launch
limits since the user is hinting to immediately scale up to
this size.
Returns:
Dict[NodeType, int]: Maximum number of nodes to launch for each
node type.
@@ -338,13 +327,9 @@ class ResourceDemandScheduler:
upper_bound = max(
max_allowed_pending_nodes - total_pending_nodes,
# Allow more nodes if this is to respect min_workers.
self.node_types[node_type].get("min_workers", 0) -
total_pending_nodes - running_nodes[node_type],
# Allow more nodes from request_resources API.
nodes_to_add_based_on_requests.get(node_type,
0) - total_pending_nodes)
# Allow more nodes if this is to respect min_workers or
# request_resources().
adjusted_min_workers.get(node_type, 0))
if upper_bound > 0:
updated_nodes_to_launch[node_type] = min(
@@ -504,21 +489,26 @@ def _node_type_counts_to_node_resources(
def _add_min_workers_nodes(
node_resources: List[ResourceDict],
node_type_counts: Dict[NodeType, int],
node_types: Dict[NodeType, NodeTypeConfigDict],
node_types: Dict[NodeType, NodeTypeConfigDict], max_workers: int,
ensure_min_cluster_size: List[ResourceDict]
) -> (List[ResourceDict], Dict[NodeType, int], Dict[NodeType, int]):
"""Updates resource demands to respect the min_workers constraint.
"""Updates resource demands to respect the min_workers and
request_resources() constraints.
Args:
node_resources: Resources of existing nodes already launched/pending.
node_type_counts: Counts of existing nodes already launched/pending.
node_types: Node types config.
max_workers: global max_workers constraint.
ensure_min_cluster_size: resource demands from request_resources().
Returns:
node_resources: The updated node resources after adding min_workers
constraint per node type.
and request_resources() constraints per node type.
node_type_counts: The updated node counts after adding min_workers
constraint per node type.
total_nodes_to_add: The nodes to add to respect min_workers constraint.
and request_resources() constraints per node type.
total_nodes_to_add_dict: The nodes to add to respect min_workers and
request_resources() constraints.
"""
total_nodes_to_add_dict = {}
for node_type, config in node_types.items():
@@ -528,10 +518,41 @@ def _add_min_workers_nodes(
if existing < target:
total_nodes_to_add_dict[node_type] = target - existing
node_type_counts[node_type] = target
available = copy.deepcopy(node_types[node_type]["resources"])
node_resources.extend(
[available] * total_nodes_to_add_dict[node_type])
node_resources.extend([
copy.deepcopy(node_types[node_type]["resources"])
for _ in range(total_nodes_to_add_dict[node_type])
])
if ensure_min_cluster_size:
max_to_add = max_workers + 1 - sum(node_type_counts.values())
max_node_resources = []
# Fit request_resources() on all the resources as if they are idle.
for node_type in node_type_counts:
max_node_resources.extend([
copy.deepcopy(node_types[node_type]["resources"])
for _ in range(node_type_counts[node_type])
])
# Get the unfulfilled to ensure min cluster size.
resource_requests_unfulfilled, _ = get_bin_pack_residual(
max_node_resources, ensure_min_cluster_size)
# Get the nodes to meet the unfulfilled.
nodes_to_add_request_resources = get_nodes_for(
node_types, node_type_counts, max_to_add,
resource_requests_unfulfilled)
# Update the resources, counts and total nodes to add.
for node_type in nodes_to_add_request_resources:
nodes_to_add = nodes_to_add_request_resources.get(node_type, 0)
if nodes_to_add > 0:
node_type_counts[
node_type] = nodes_to_add + node_type_counts.get(
node_type, 0)
node_resources.extend([
copy.deepcopy(node_types[node_type]["resources"])
for _ in range(nodes_to_add)
])
total_nodes_to_add_dict[
node_type] = nodes_to_add + total_nodes_to_add_dict.get(
node_type, 0)
return node_resources, node_type_counts, total_nodes_to_add_dict
@@ -623,7 +644,8 @@ def _utilization_score(node_resources: ResourceDict,
def get_bin_pack_residual(node_resources: List[ResourceDict],
resource_demands: List[ResourceDict],
strict_spread: bool = False) -> List[ResourceDict]:
strict_spread: bool = False
) -> (List[ResourceDict], List[ResourceDict]):
"""Return a subset of resource_demands that cannot fit in the cluster.
TODO(ekl): this currently does not guarantee the resources will be packed
@@ -638,7 +660,7 @@ def get_bin_pack_residual(node_resources: List[ResourceDict],
placed on a different entry in `node_resources`.
Returns:
List[ResourceDict] the residual list resources that do not fit.
List[ResourceDict]: the residual list resources that do not fit.
List[ResourceDict]: The updated node_resources after the method.
"""
+9 -1
View File
@@ -256,8 +256,16 @@ class NodeUpdater:
retry_str = "(" + str(e) + ")"
if hasattr(e, "cmd"):
if isinstance(e.cmd, str):
cmd_ = e.cmd
elif isinstance(e.cmd, list):
cmd_ = " ".join(e.cmd)
else:
logger.debug(f"e.cmd type ({type(e.cmd)}) not "
"list or str.")
cmd_ = str(e.cmd)
retry_str = "(Exit Status {}): {}".format(
e.returncode, " ".join(e.cmd))
e.returncode, cmd_)
cli_logger.print(
"SSH still not available {}, "
+11
View File
@@ -244,3 +244,14 @@ def hash_runtime_conf(file_mounts,
file_mounts_contents_hash = None
return (_hash_cache[conf_str], file_mounts_contents_hash)
def add_prefix(info_string, prefix):
    """Prepend "<prefix>:" to every line of info_string except the first.

    Args:
        info_string: Text to process; lines are separated by "\\n".
        prefix: String joined to each continuation line with a ":".

    Returns:
        The text with the first line untouched and every following line
        rewritten as "<prefix>:<line>", re-joined with "\\n".
    """
    # str.split always yields at least one element, so the head exists even
    # for an empty input string.
    first_line, *continuation = info_string.split("\n")
    tagged = [":".join((prefix, text)) for text in continuation]
    return "\n".join([first_line, *tagged])
+1 -1
View File
@@ -112,7 +112,7 @@ setup_commands:
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
+5 -4
View File
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
@@ -27,10 +28,10 @@ docker:
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray:latest-cpu"
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
@@ -128,7 +129,7 @@ setup_commands: []
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
@@ -19,13 +19,14 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu"
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray:latest"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@@ -90,8 +91,8 @@ file_mounts: {
# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray:latest has ray latest bundled
setup_commands: []
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
+4 -4
View File
@@ -2,7 +2,7 @@
cluster_name: java
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 1
min_workers: 1
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 1
@@ -72,10 +72,10 @@ worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --code-search-path=~/ray-word-count/target
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --code-search-path=ray-word-count/target
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
# To run the program, run `ray exec java.yaml "java -jar ray-word-count/target/ray-word-count-1.0-SNAPSHOT-jar-with-dependencies.jar"`
# To run the program, run `ray exec java.yaml "java -Dray.job.code-search-path=ray-word-count/target -jar ray-word-count/target/ray-word-count-1.0-SNAPSHOT-jar-with-dependencies.jar"`
+4 -4
View File
@@ -24,7 +24,7 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "" # e.g., rayproject/ray:latest
image: "" # e.g., rayproject/ray-ml:latest
container_name: "" # e.g. ray_docker
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
@@ -32,9 +32,9 @@ docker:
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray:latest"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@@ -120,7 +120,7 @@ setup_commands:
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- source activate pytorch_p36 && pip install -U ray
- source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray[debug]
- source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
+1 -1
View File
@@ -112,7 +112,7 @@ setup_commands:
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
@@ -27,10 +28,10 @@ docker:
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray:latest-cpu"
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
@@ -128,7 +129,7 @@ setup_commands:
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@@ -19,13 +19,14 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu"
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray:latest"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@@ -65,9 +66,9 @@ file_mounts: {
}
# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray:latest has ray latest bundled
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
+5 -4
View File
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu"
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_docker"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
@@ -27,9 +28,9 @@ docker:
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray:latest"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@@ -97,7 +98,7 @@ setup_commands:
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
+1 -1
View File
@@ -130,7 +130,7 @@ setup_commands:
&& echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile
# Install ray
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
+5 -4
View File
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
@@ -27,10 +28,10 @@ docker:
run_options: [] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# Allow Ray to automatically detect GPUs
# worker_image: "rayproject/ray:latest-cpu"
# worker_image: "rayproject/ray-ml:latest-cpu"
# worker_run_options: []
# If a node is idle for this many minutes, it will be removed.
@@ -136,7 +137,7 @@ setup_commands: []
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
@@ -19,14 +19,15 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray:latest-gpu"
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker
# # Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# head_image: "rayproject/ray-ml:latest-gpu"
# worker_image: "rayproject/ray:latest"
# worker_image: "rayproject/ray-ml:latest"
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
@@ -117,10 +118,10 @@ initialization_commands:
done"
# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray:latest has ray latest bundled
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@@ -142,7 +142,7 @@ head_node:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
@@ -215,7 +215,7 @@ worker_nodes:
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
@@ -142,7 +142,7 @@ head_node:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
@@ -215,7 +215,7 @@ worker_nodes:
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
@@ -146,7 +146,7 @@ head_node:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
@@ -221,7 +221,7 @@ worker_nodes:
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,128 @@
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
name: example-cluster
spec:
# The maximum number of worker nodes to launch in addition to the head node.
maxWorkers: 3
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscalingSpeed: 1.0
# If a node is idle for this many minutes, it will be removed.
idleTimeoutMinutes: 5
# Specify the pod type for the ray head node (as configured below).
headPodType: head-node
# Specify the default pod type for the ray worker nodes (as configured below).
workerDefaultPodType: worker-nodes
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
podConfig:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-head-
spec:
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
- name: worker-nodes
# Minimum number of Ray workers of this Pod type.
minWorkers: 2
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 3
# User-specified custom resources for use by Ray
rayResources: {"Custom1": 1, "is_spot": 1}
# Optional commands to run before starting the Ray runtime.
setupCommands:
- pip install numpy # Example
podConfig:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-worker-
spec:
restartPolicy: Never
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
# Commands to start Ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
headStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0
# Commands to start Ray on worker nodes. You don't need to change this.
workerStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,128 @@
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
name: example-cluster2
spec:
# The maximum number of worker nodes to launch in addition to the head node.
maxWorkers: 3
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes, then the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscalingSpeed: 1.0
# If a node is idle for this many minutes, it will be removed.
idleTimeoutMinutes: 5
# Specify the pod type for the ray head node (as configured below).
headPodType: head-node
# Specify the default pod type for the ray worker nodes (as configured below).
workerDefaultPodType: worker-nodes
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
podConfig:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster2-ray-head-
spec:
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
- name: worker-nodes
# Minimum number of Ray workers of this Pod type.
minWorkers: 1
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 3
# User-specified custom resources for use by Ray
rayResources: {"Custom1": 1, "is_spot": 1}
# Optional commands to run before starting the Ray runtime.
setupCommands:
- pip install numpy # Example
podConfig:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster2-ray-worker-
spec:
restartPolicy: Never
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp, which causes slowdowns if it is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
# Commands to start Ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
headStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0
# Commands to start Ray on worker nodes. You don't need to change this.
workerStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -9,8 +9,8 @@ apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: ray-operator-role
rules:
- apiGroups: ["", "rbac.authorization.k8s.io"]
resources: ["configmaps", "pods", "pods/exec", "services", "serviceaccounts", "roles", "rolebindings"]
- apiGroups: ["", "cluster.ray.io"]
resources: ["rayclusters", "pods", "pods/exec"]
verbs: ["get", "watch", "list", "create", "delete", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
@@ -35,8 +35,7 @@ spec:
- name: ray
imagePullPolicy: Always
image: rayproject/ray:nightly
command: ["/bin/bash", "-c", "--"]
args: ["ray-operator; trap : TERM INT; sleep infinity & wait;"]
command: ["ray-operator"]
env:
- name: RAY_OPERATOR_POD_NAMESPACE
valueFrom:
@@ -1,260 +0,0 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
type: kubernetes
# Exposing external IP addresses for ray pods isn't currently supported.
use_internal_ips: true
# Namespace to use for all resources created.
namespace: ray
services:
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: ray-head
ports:
- protocol: TCP
port: 8000
targetPort: 8000
# Service that maps to the worker nodes of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-workers
spec:
# This selector must match the worker node pods' selector below.
selector:
component: ray-worker
ports:
- protocol: TCP
port: 8000
targetPort: 8000
# Kubernetes pod config for the head node pod.
available_node_types:
head_node:
resources: {}
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
spec:
# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
worker_nodes:
resources: {}
min_workers: 1
max_workers: 2
node_config:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-worker-
# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: ray-worker
spec:
serviceAccountName: default
# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu
head_node_type:
head_node
worker_default_node_type:
worker_nodes
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
}
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down.
# This is not supported on kubernetes.
rsync_exclude: []
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
# This is not supported on kubernetes.
rsync_filter: []
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --object-manager-port=8076 --dashboard-host 0.0.0.0
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -25,7 +25,8 @@ idle_timeout_minutes: 5
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled. Assumes Docker is installed.
docker:
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
@@ -93,7 +94,7 @@ setup_commands: []
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
+1 -1
View File
@@ -20,7 +20,7 @@
"additionalProperties": false,
"properties": {
"cluster_name": {
"description": "An unique identifier for the head node and workers of this cluster.",
"description": "A unique identifier for the head node and workers of this cluster.",
"type": "string"
},
"min_workers": {
+4 -4
View File
@@ -3,9 +3,8 @@ from traceback import format_exception
import colorama
import ray
import ray.cloudpickle as pickle
from ray.core.generated.common_pb2 import RayException, Language
from ray.core.generated.common_pb2 import RayException, Language, PYTHON
import setproctitle
@@ -17,7 +16,7 @@ class RayError(Exception):
exc_info = (type(self), self, self.__traceback__)
formatted_exception_string = "\n".join(format_exception(*exc_info))
return RayException(
language=ray.Language.PYTHON.value(),
language=PYTHON,
serialized_exception=pickle.dumps(self),
formatted_exception_string=formatted_exception_string
).SerializeToString()
@@ -26,7 +25,7 @@ class RayError(Exception):
def from_bytes(b):
ray_exception = RayException()
ray_exception.ParseFromString(b)
if ray_exception.language == ray.Language.PYTHON.value():
if ray_exception.language == PYTHON:
return pickle.loads(ray_exception.serialized_exception)
else:
return CrossLanguageError(ray_exception)
@@ -81,6 +80,7 @@ class RayTaskError(RayError):
pid=None,
ip=None):
"""Initialize a RayTaskError."""
import ray
if proctitle:
self.proctitle = proctitle
else:
+72 -25
View File
@@ -7,34 +7,88 @@ import logging
logger = logging.getLogger(__name__)
# _client_api has to be external to the API stub, below.
# Otherwise, ray.remote() that contains ray.remote()
# contains a reference to the RayAPIStub, therefore a
# reference to the _client_api, and then tries to pickle
# the thing.
# About these global variables: Ray 1.0 uses exported module functions to
# provide its API, and we need to match that. However, we want different
# behaviors depending on where, exactly, in the client stack this is running.
#
# The reason for these differences depends on what's being pickled and passed
# to functions, or functions inside functions. So there are three cases to care
# about
#
# (Python Client)-->(Python ClientServer)-->(Internal Raylet Process)
#
# * _client_api should be set if we're inside the client
# * _server_api should be set if we're inside the clientserver
# * Both will be set if we're running both (as in a test)
# * Neither should be set if we're inside the raylet (but we still need to shim
# from the client API surface to the Ray API)
#
# The job of RayAPIStub (below) is to delegate to the appropriate one of these
# depending on what's set or not. Then, all users importing the ray object
# from this package get the stub which routes them to the appropriate APIImpl.
_client_api: Optional[APIImpl] = None
_server_api: Optional[APIImpl] = None
# The reason for _is_server is a hack around the above comment while running
# tests. If we have both a client and a server trying to control these static
# variables then we need a way to decide which to use. In this case, both
# _client_api and _server_api are set.
# This boolean flips between the two
_is_server: bool = False
@contextmanager
def stash_api_for_tests(in_test: bool):
api = None
global _is_server
is_server = _is_server
if in_test:
api = stash_api()
yield api
_is_server = True
yield _server_api
if in_test:
restore_api(api)
_is_server = is_server
def stash_api() -> Optional[APIImpl]:
def _set_client_api(val: Optional[APIImpl]):
global _client_api
a = _client_api
global _is_server
if _client_api is not None:
raise Exception("Trying to set more than one client API")
_client_api = val
_is_server = False
def _set_server_api(val: Optional[APIImpl]):
    """Install ``val`` as the server-side API and flag it as the active one.

    Args:
        val: The APIImpl to store in the module-level ``_server_api``.

    Raises:
        Exception: If a server API has already been set.
    """
    global _server_api, _is_server
    already_set = _server_api is not None
    if already_set:
        raise Exception("Trying to set more than one server API")
    _server_api, _is_server = val, True
def reset_api():
global _client_api
global _server_api
global _is_server
_client_api = None
return a
_server_api = None
_is_server = False
def restore_api(api: Optional[APIImpl]):
def _get_client_api() -> APIImpl:
global _client_api
_client_api = api
global _server_api
global _is_server
api = None
if _is_server:
api = _server_api
else:
api = _client_api
if api is None:
# We're inside a raylet worker
from ray.experimental.client.server.core_ray_api import CoreRayAPI
return CoreRayAPI()
return api
class RayAPIStub:
@@ -43,11 +97,10 @@ class RayAPIStub:
secure: bool = False,
metadata: List[Tuple[str, str]] = None,
stub=None):
global _client_api
from ray.experimental.client.worker import Worker
_client_worker = Worker(
conn_str, secure=secure, metadata=metadata, stub=stub)
_client_api = ClientAPI(_client_worker)
_set_client_api(ClientAPI(_client_worker))
def disconnect(self):
global _client_api
@@ -56,15 +109,9 @@ class RayAPIStub:
_client_api = None
def __getattr__(self, key: str):
global _client_api
self.__check_client_api()
return getattr(_client_api, key)
def __check_client_api(self):
global _client_api
if _client_api is None:
from ray.experimental.client.server.core_ray_api import CoreRayAPI
_client_api = CoreRayAPI()
global _get_client_api
api = _get_client_api()
return getattr(api, key)
ray = RayAPIStub()
+167 -9
View File
@@ -11,40 +11,145 @@
from abc import ABC
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Union, Optional
import ray.core.generated.ray_client_pb2 as ray_client_pb2
if TYPE_CHECKING:
from ray.experimental.client.common import ClientActorHandle
from ray.experimental.client.common import ClientStub
from ray.experimental.client.common import ClientObjectRef
from ray._raylet import ObjectRef
# Use the imports for type checking. This is a python 3.6 limitation.
# See https://www.python.org/dev/peps/pep-0563/
PutType = Union[ClientObjectRef, ObjectRef]
class APIImpl(ABC):
"""
APIImpl is the interface to implement for whichever version of the core
Ray API that needs abstracting when run in client mode.
"""
@abstractmethod
def get(self, *args, **kwargs):
def get(self, vals, *, timeout: Optional[float] = None) -> Any:
"""
get is the hook stub passed on to replace `ray.get`
Args:
vals: [Client]ObjectRef or list of these refs to retrieve.
timeout: Optional timeout in milliseconds
"""
pass
@abstractmethod
def put(self, *args, **kwargs):
def put(self, vals: Any, *args,
**kwargs) -> Union["ClientObjectRef", "ObjectRef"]:
"""
put is the hook stub passed on to replace `ray.put`
Args:
vals: The value or list of values to `put`.
args: opaque arguments
kwargs: opaque keyword arguments
"""
pass
@abstractmethod
def wait(self, *args, **kwargs):
"""
wait is the hook stub passed on to replace `ray.wait`
Args:
args: opaque arguments
kwargs: opaque keyword arguments
"""
pass
@abstractmethod
def remote(self, *args, **kwargs):
"""
remote is the hook stub passed on to replace `ray.remote`.
This sets up remote functions or actors, as the decorator,
but does not execute them.
Args:
args: opaque arguments
kwargs: opaque keyword arguments
"""
pass
@abstractmethod
def call_remote(self, f, kind, *args, **kwargs):
def call_remote(self, instance: "ClientStub", *args, **kwargs):
"""
call_remote is called by stub objects to execute them remotely.
This is used by stub objects in situations where they're called
with .remote, eg, `f.remote()` or `actor_cls.remote()`.
This allows the client stub objects to delegate execution to be
implemented in the most effective way whether it's in the client,
clientserver, or raylet worker.
Args:
instance: The Client-side stub reference to a remote object
args: opaque arguments
kwargs: opaque keyword arguments
"""
pass
@abstractmethod
def close(self, *args, **kwargs):
def close(self) -> None:
"""
close cleans up an API connection by closing any channels or
shutting down any servers gracefully.
"""
pass
@abstractmethod
def kill(self, actor, *, no_restart=True):
"""
kill forcibly stops an actor running in the cluster
Args:
no_restart: Whether this actor should be restarted if it's a
restartable actor.
"""
pass
@abstractmethod
def cancel(self, obj, *, force=False, recursive=True):
"""
Cancels a task on the cluster.
If the specified task is pending execution, it will not be executed. If
the task is currently executing, the behavior depends on the ``force``
flag, as per `ray.cancel()`
Only non-actor tasks can be canceled. Canceled tasks will not be
retried (max_retries will not be respected).
Args:
obj (ObjectRef): ObjectRef returned by the task
that should be canceled.
force (boolean): Whether to force-kill a running task by killing
the worker that is running the task.
recursive (boolean): Whether to try to cancel tasks submitted by
the task specified.
"""
pass
class ClientAPI(APIImpl):
"""
The Client-side methods corresponding to the ray API. Delegates
to the Client Worker that contains the connection to the ClientServer.
"""
def __init__(self, worker):
self.worker = worker
def get(self, *args, **kwargs):
return self.worker.get(*args, **kwargs)
def get(self, vals, *, timeout=None):
return self.worker.get(vals, timeout=timeout)
def put(self, *args, **kwargs):
return self.worker.put(*args, **kwargs)
@@ -55,12 +160,65 @@ class ClientAPI(APIImpl):
def remote(self, *args, **kwargs):
return self.worker.remote(*args, **kwargs)
def call_remote(self, f, kind, *args, **kwargs):
return self.worker.call_remote(f, kind, *args, **kwargs)
def call_remote(self, instance: "ClientStub", *args, **kwargs):
return self.worker.call_remote(instance, *args, **kwargs)
def close(self, *args, **kwargs):
def close(self) -> None:
return self.worker.close()
def kill(self, actor: "ClientActorHandle", *, no_restart=True):
return self.worker.terminate_actor(actor, no_restart)
def cancel(self, obj: "ClientObjectRef", *, force=False, recursive=True):
return self.worker.terminate_task(obj, force, recursive)
# Various metadata methods for the client that are defined in the protocol.
def is_initialized(self) -> bool:
""" True if our client is connected, and if the server is initialized.
Returns:
A boolean determining if the client is connected and
server initialized.
"""
return self.worker.is_initialized()
def nodes(self):
"""Get a list of the nodes in the cluster (for debugging only).
Returns:
Information about the Ray clients in the cluster.
"""
return self.worker.get_cluster_info(
ray_client_pb2.ClusterInfoType.NODES)
def cluster_resources(self):
"""Get the current total cluster resources.
Note that this information can grow stale as nodes are added to or
removed from the cluster.
Returns:
A dictionary mapping resource name to the total quantity of that
resource in the cluster.
"""
return self.worker.get_cluster_info(
ray_client_pb2.ClusterInfoType.CLUSTER_RESOURCES)
def available_resources(self):
"""Get the current available cluster resources.
This is different from `cluster_resources` in that this will return
idle (available) resources rather than total resources.
Note that this information can grow stale as tasks start and finish.
Returns:
A dictionary mapping resource name to the currently available
quantity of that resource in the cluster.
"""
return self.worker.get_cluster_info(
ray_client_pb2.ClusterInfoType.AVAILABLE_RESOURCES)
def __getattr__(self, key: str):
if not key.startswith("_"):
raise NotImplementedError(
+195 -21
View File
@@ -1,12 +1,16 @@
import ray.core.generated.ray_client_pb2 as ray_client_pb2
from ray.experimental.client import ray
from typing import Any
from typing import Dict
from ray import cloudpickle
import base64
class ClientBaseRef:
def __init__(self, id):
def __init__(self, id, handle=None):
self.id = id
self.handle = handle
def __repr__(self):
return "%s(%s)" % (
@@ -17,83 +21,243 @@ class ClientBaseRef:
def __eq__(self, other):
return self.id == other.id
def binary(self):
return self.id
@classmethod
def from_remote_ref(cls, ref: ray_client_pb2.RemoteRef):
return cls(id=ref.id, handle=ref.handle)
class ClientObjectRef(ClientBaseRef):
pass
def _unpack_ref(self):
return cloudpickle.loads(self.handle)
class ClientActorRef(ClientBaseRef):
pass
class ClientRemoteFunc:
class ClientStub:
pass
class ClientRemoteFunc(ClientStub):
"""
A stub created on the Ray Client to represent a remote
function that can be executed on the cluster.
This class is allowed to be passed around between remote functions.
Args:
_func: The actual function to execute remotely
_name: The original name of the function
_ref: The ClientObjectRef of the pickled code of the function, _func
_raylet_remote: The Raylet-side ray.remote_function.RemoteFunction
for this object
"""
def __init__(self, f):
self._func = f
self._name = f.__name__
self.id = None
self._raylet_remote_func = None
# self._ref can be lazily instantiated. Rather than eagerly creating
# function data objects in the server we can put them just before we
# execute the function, especially in cases where many @ray.remote
# functions exist in a library and only a handful are ever executed by
# a user of the library.
#
# TODO(barakmich): This ref might actually be better as a serialized
# ObjectRef. This requires being able to serialize the ref without
# pinning it (as the lifetime of the ref is tied with the server, not
# the client)
self._ref = None
self._raylet_remote = None
def __call__(self, *args, **kwargs):
raise TypeError(f"Remote function cannot be called directly. "
"Use {self._name}.remote method instead")
def remote(self, *args, **kwargs):
return ray.call_remote(self, ray_client_pb2.ClientTask.FUNCTION, *args,
**kwargs)
return ray.call_remote(self, *args, **kwargs)
def _get_ray_remote_impl(self):
if self._raylet_remote is None:
self._raylet_remote = ray.remote(self._func)
return self._raylet_remote
def __repr__(self):
return "ClientRemoteFunc(%s, %s)" % (self._name, self.id)
return "ClientRemoteFunc(%s, %s)" % (self._name, self._ref)
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
if self._ref is None:
self._ref = ray.put(self._func)
task = ray_client_pb2.ClientTask()
task.type = ray_client_pb2.ClientTask.FUNCTION
task.name = self._name
task.payload_id = self._ref.handle
return task
class ClientActorClass:
class ClientActorClass(ClientStub):
""" A stub created on the Ray Client to represent an actor class.
It is wrapped by ray.remote and can be executed on the cluster.
Args:
actor_cls: The actual class to execute remotely
_name: The original name of the class
_ref: The ClientObjectRef of the pickled `actor_cls`
_raylet_remote: The Raylet-side ray.ActorClass for this object
"""
def __init__(self, actor_cls):
self.actor_cls = actor_cls
self._name = actor_cls.__name__
self._ref = None
self._raylet_remote = None
def __call__(self, *args, **kwargs):
raise TypeError(f"Remote actor cannot be instantiated directly. "
"Use {self._name}.remote() instead")
def __getstate__(self) -> Dict:
state = {
"actor_cls": self.actor_cls,
"_name": self._name,
"_ref": self._ref,
}
return state
def __setstate__(self, state: Dict) -> None:
self.actor_cls = state["actor_cls"]
self._name = state["_name"]
self._ref = state["_ref"]
def remote(self, *args, **kwargs):
# Actually instantiate the actor
ref = ray.call_remote(self, ray_client_pb2.ClientTask.ACTOR, *args,
**kwargs)
return ClientActorHandle(ref, self)
ref = ray.call_remote(self, *args, **kwargs)
return ClientActorHandle(ClientActorRef(ref.id, ref.handle), self)
def __repr__(self):
return "ClientRemoteActor(%s, %s)" % (self._name, self.id)
return "ClientRemoteActor(%s, %s)" % (self._name, self._ref)
def __getattr__(self, key):
if key not in self.__dict__:
raise AttributeError("Not a class attribute")
raise NotImplementedError("static methods")
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
if self._ref is None:
self._ref = ray.put(self.actor_cls)
task = ray_client_pb2.ClientTask()
task.type = ray_client_pb2.ClientTask.ACTOR
task.name = self._name
task.payload_id = self._ref.handle
return task
class ClientActorHandle:
def __init__(self, actor_id: ClientActorRef,
class ClientActorHandle(ClientStub):
"""Client-side stub for instantiated actor.
A stub created on the Ray Client to represent a remote actor that
has been started on the cluster. This class is allowed to be passed
around between remote functions.
Args:
actor_ref: A reference to the running actor given to the client. This
is a serialized version of the actual handle as an opaque token.
actor_class: A reference to the ClientActorClass that this actor was
instantiated from.
_real_actor_handle: Cached copy of the Raylet-side
ray.actor.ActorHandle contained in actor_ref.
"""
def __init__(self, actor_ref: ClientActorRef,
actor_class: ClientActorClass):
self.actor_id = actor_id
self.actor_ref = actor_ref
self.actor_class = actor_class
self._real_actor_handle = None
def _get_ray_remote_impl(self):
if self._real_actor_handle is None:
self._real_actor_handle = cloudpickle.loads(self.actor_ref.handle)
return self._real_actor_handle
def __getstate__(self) -> Dict:
state = {
"actor_ref": self.actor_ref,
"actor_class": self.actor_class,
"_real_actor_handle": self._real_actor_handle,
}
return state
def __setstate__(self, state: Dict) -> None:
self.actor_ref = state["actor_ref"]
self.actor_class = state["actor_class"]
self._real_actor_handle = state["_real_actor_handle"]
@property
def _actor_id(self):
return self.actor_ref.id
def __getattr__(self, key):
return ClientRemoteMethod(self, key)
def __repr__(self):
return "ClientActorHandle(%s)" % (self.actor_ref.id.hex())
class ClientRemoteMethod(ClientStub):
    """A stub for a method on a remote actor.

    Can be annotated with execution options.

    Args:
        actor_handle: A reference to the ClientActorHandle that generated
          this method and will have this method called upon it.
        method_name: The name of this method
    """

    def __init__(self, actor_handle: ClientActorHandle, method_name: str):
        self.actor_handle = actor_handle
        self.method_name = method_name

    @property
    def _name(self) -> str:
        """Qualified ``ActorClass.method`` name.

        Computed on demand rather than stored, so it is also available on
        instances rebuilt by ``__setstate__`` (which restores only
        ``actor_handle`` and ``method_name``).
        """
        return "%s.%s" % (self.actor_handle.actor_class._name,
                          self.method_name)

    def __call__(self, *args, **kwargs):
        # The ``f`` prefix must be on the literal that holds the placeholder;
        # adjacent literals are not formatted as one f-string.
        raise TypeError("Remote method cannot be called directly. "
                        f"Use {self._name}.remote() instead")

    def _get_ray_remote_impl(self):
        """Return the same-named method of the real (raylet-side) handle."""
        return getattr(self.actor_handle._get_ray_remote_impl(),
                       self.method_name)

    def __getstate__(self) -> Dict:
        """Return the two instance fields as a plain dict for pickling."""
        state = {
            "actor_handle": self.actor_handle,
            "method_name": self.method_name,
        }
        return state

    def __setstate__(self, state: Dict) -> None:
        """Restore the fields written by ``__getstate__``."""
        self.actor_handle = state["actor_handle"]
        self.method_name = state["method_name"]

    def remote(self, *args, **kwargs):
        """Invoke the method through the active ``ray.call_remote`` API."""
        # No task-kind argument: ``_prepare_client_task`` carries the type.
        return ray.call_remote(self, *args, **kwargs)

    def __repr__(self):
        # Use ``actor_ref``: the handle has no ``actor_id`` attribute, and its
        # catch-all ``__getattr__`` would turn that lookup into another
        # ClientRemoteMethod whose repr recurses back into this method.
        return "ClientRemoteMethod(%s, %s)" % (self._name,
                                               self.actor_handle.actor_ref)

    def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
        """Build the METHOD task message targeting this actor method."""
        task = ray_client_pb2.ClientTask()
        task.type = ray_client_pb2.ClientTask.METHOD
        task.name = self.method_name
        task.payload_id = self.actor_handle.actor_ref.handle
        return task
def convert_from_arg(pb) -> Any:
@@ -114,3 +278,13 @@ def convert_to_arg(val):
out.local = ray_client_pb2.Arg.Locality.INTERNED
out.data = cloudpickle.dumps(val)
return out
def encode_exception(exception) -> str:
    """Serialize ``exception`` to text: cloudpickle bytes, base64-encoded.

    Returns:
        The standard-alphabet base64 encoding of the pickled exception,
        decoded to ``str``.
    """
    pickled = cloudpickle.dumps(exception)
    encoded = base64.standard_b64encode(pickled)
    return encoded.decode()
def decode_exception(data) -> Exception:
    """Inverse of ``encode_exception``: base64-decode, then unpickle.

    Args:
        data: base64 text (or bytes) holding a cloudpickled object.

    Returns:
        Whatever object the payload unpickles to.
    """
    # NOTE(review): unpickling executes code chosen by whoever produced
    # ``data``; only use on payloads from a trusted peer.
    return cloudpickle.loads(base64.standard_b64decode(data))
@@ -7,18 +7,36 @@
# While the stub is trivial, it allows us to check that the calls we're
# making into the core-ray module are contained and well-defined.
from typing import Any
from typing import Optional
from typing import Union
import ray
from ray.experimental.client.api import APIImpl
from ray.experimental.client.common import ClientRemoteFunc
from ray.experimental.client.common import ClientObjectRef
from ray.experimental.client.common import ClientStub
class CoreRayAPI(APIImpl):
    """
    Implements the equivalent client-side Ray API by simply passing along to
    the Core Ray API. Primarily used inside of Ray Workers as a trampoline back
    to core ray when passed client stubs.
    """

    def get(self, vals, *, timeout: Optional[float] = None) -> Any:
        """Fetch one object or a list of objects via ``ray.get``.

        Any ``ClientObjectRef`` (alone or inside a list) is first converted
        with ``_unpack_ref()``; everything else is passed through unchanged.
        """
        if isinstance(vals, list):
            # Convert per element: this copes with an empty list (no
            # ``vals[0]`` lookup) and with lists that mix client refs and
            # core refs.
            vals = [
                val._unpack_ref() if isinstance(val, ClientObjectRef) else val
                for val in vals
            ]
        elif isinstance(vals, ClientObjectRef):
            vals = vals._unpack_ref()
        return ray.get(vals, timeout=timeout)

    def put(self, vals: Any, *args,
            **kwargs) -> Union[ClientObjectRef, ray._raylet.ObjectRef]:
        return ray.put(vals, *args, **kwargs)

    def wait(self, *args, **kwargs):
        return ray.wait(*args, **kwargs)

    def remote(self, *args, **kwargs):
        return ray.remote(*args, **kwargs)

    def call_remote(self, instance: ClientStub, *args, **kwargs):
        """Call ``.remote`` on the core-Ray object behind a client stub."""
        return instance._get_ray_remote_impl().remote(*args, **kwargs)

    def close(self) -> None:
        return None

    def kill(self, actor, *, no_restart=True):
        return ray.kill(actor, no_restart=no_restart)

    def cancel(self, obj, *, force=False, recursive=True):
        return ray.cancel(obj, force=force, recursive=recursive)

    def is_initialized(self) -> bool:
        return ray.is_initialized()

    # Allow for generic fallback to ray.* in remote methods. This allows calls
    # like ray.nodes() to be run in remote functions even though the client
    # doesn't currently support them.
    def __getattr__(self, key: str):
        return getattr(ray, key)
class RayServerAPI(CoreRayAPI):
    """
    Ray Client server-side API shim. By default, simply calls the default Core
    Ray API calls, but also accepts scheduling calls from functions running
    inside of other remote functions that need to create more work.
    """

    def __init__(self, server_instance):
        self.server = server_instance

    # Wrap single item into list if needed before calling server put.
    def put(self, vals: Any, *args, **kwargs) -> ClientObjectRef:
        """Store ``vals`` through the server.

        A list argument is stored element by element and a list of refs is
        returned; any other value yields a single ref.  Extra positional and
        keyword arguments are accepted but unused.
        """
        if isinstance(vals, list):
            return [self._put(x) for x in vals]
        return self._put(vals)

    def _put(self, val: Any):
        resp = self.server._put_and_retain_obj(val)
        # NOTE(review): this reads ``.id`` from whatever
        # ``_put_and_retain_obj`` returns -- confirm that object exposes it.
        return ClientObjectRef(resp.id)

    def call_remote(self, instance: ClientStub, *args, **kwargs):
        """Schedule the stub's task directly on the in-process server.

        ``args`` are handed over as ``prepared_args`` so the server skips
        protobuf argument conversion.
        """
        task = instance._prepare_client_task()
        ticket = self.server.Schedule(task, prepared_args=args)
        # Tickets carry their result in ``return_ref`` (a RemoteRef with id
        # and handle); build the client ref from that.
        return ClientObjectRef.from_remote_ref(ticket.return_ref)
+153 -44
View File
@@ -3,14 +3,17 @@ from concurrent import futures
import grpc
from ray import cloudpickle
import ray
import ray.state
import ray.core.generated.ray_client_pb2 as ray_client_pb2
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
import time
import inspect
from ray.experimental.client import stash_api_for_tests
import json
from ray.experimental.client import stash_api_for_tests, _set_server_api
from ray.experimental.client.common import convert_from_arg
from ray.experimental.client.common import encode_exception
from ray.experimental.client.common import ClientObjectRef
from ray.experimental.client.common import ClientRemoteFunc
from ray.experimental.client.server.core_ray_api import RayServerAPI
logger = logging.getLogger(__name__)
@@ -23,31 +26,98 @@ class RayletServicer(ray_client_pb2_grpc.RayletDriverServicer):
self.registered_actor_classes = {}
self._test_mode = test_mode
def ClusterInfo(self, request,
                context=None) -> ray_client_pb2.ClusterInfoResponse:
    """Answer a cluster-info query.

    CLUSTER_RESOURCES and AVAILABLE_RESOURCES are returned as a resource
    table; every other request type is delegated to
    ``_return_debug_cluster_info`` and returned as JSON text.
    """
    info_type = ray_client_pb2.ClusterInfoType
    resp = ray_client_pb2.ClusterInfoResponse()
    resp.type = request.type
    if request.type == info_type.CLUSTER_RESOURCES:
        raw = ray.cluster_resources()
    elif request.type == info_type.AVAILABLE_RESOURCES:
        raw = ray.available_resources()
    else:
        resp.json = self._return_debug_cluster_info(request, context)
        return resp
    # Normalize resources into floats
    # (the function may return values that are ints)
    as_floats = {name: float(amount) for name, amount in raw.items()}
    resp.resource_table.CopyFrom(
        ray_client_pb2.ClusterInfoResponse.ResourceTable(table=as_floats))
    return resp
def _return_debug_cluster_info(self, request, context=None) -> str:
    """Return ``ray.nodes()`` or ``ray.is_initialized()`` as a JSON string.

    Raises:
        TypeError: if ``request.type`` is neither NODES nor IS_INITIALIZED.
    """
    info_type = ray_client_pb2.ClusterInfoType
    if request.type == info_type.NODES:
        payload = ray.nodes()
    elif request.type == info_type.IS_INITIALIZED:
        payload = ray.is_initialized()
    else:
        raise TypeError("Unsupported cluster info type")
    return json.dumps(payload)
def Terminate(self, request, context=None):
    """Cancel a task or kill an actor named by a pickled handle.

    Returns:
        ``TerminateResponse(ok=True)`` when the ray call succeeded, and
        ``ok=False`` when it raised (the error is also reported through
        ``context`` when one is supplied).

    Raises:
        RuntimeError: if the request sets neither ``task_object`` nor
            ``actor``.
    """
    terminate_type = request.WhichOneof("terminate_type")
    if terminate_type not in ("task_object", "actor"):
        raise RuntimeError(
            "Client requested termination without providing a valid "
            "terminate_type")
    try:
        if terminate_type == "task_object":
            object_ref = cloudpickle.loads(request.task_object.handle)
            ray.cancel(
                object_ref,
                force=request.task_object.force,
                recursive=request.task_object.recursive)
        else:
            actor_ref = cloudpickle.loads(request.actor.handle)
            ray.kill(actor_ref, no_restart=request.actor.no_restart)
    except Exception as e:
        return_exception_in_context(e, context)
        # Do not claim success after a failure; without a context this
        # flag is the only signal the caller gets.
        return ray_client_pb2.TerminateResponse(ok=False)
    return ray_client_pb2.TerminateResponse(ok=True)
def GetObject(self, request, context=None):
    """Fetch a retained object and return it cloudpickled.

    The request carries a pickled object ref in ``handle``; it must match a
    ref this servicer has retained in ``self.object_refs``.

    Returns:
        ``GetResponse(valid=True, data=...)`` on success, or
        ``GetResponse(valid=False)`` when the ref is unknown or ``ray.get``
        raised (the error is also reported through ``context`` if given).
    """
    request_ref = cloudpickle.loads(request.handle)
    if request_ref.binary() not in self.object_refs:
        return ray_client_pb2.GetResponse(valid=False)
    objectref = self.object_refs[request_ref.binary()]
    logger.info("get: %s" % objectref)
    try:
        item = ray.get(objectref, timeout=request.timeout)
    except Exception as e:
        return_exception_in_context(e, context)
        # Stop here: ``item`` was never bound, so continuing would raise
        # UnboundLocalError instead of reporting the real failure.
        return ray_client_pb2.GetResponse(valid=False)
    item_ser = cloudpickle.dumps(item)
    return ray_client_pb2.GetResponse(valid=True, data=item_ser)
def PutObject(self, request, context=None) -> ray_client_pb2.PutResponse:
    """Unpickle ``request.data``, store it, and return a RemoteRef to it.

    The returned ref carries both the binary object id and the pickled
    ObjectRef as its handle.
    """
    obj = cloudpickle.loads(request.data)
    objectref = self._put_and_retain_obj(obj)
    pickled_ref = cloudpickle.dumps(objectref)
    return ray_client_pb2.PutResponse(
        ref=make_remote_ref(objectref.binary(), pickled_ref))
def _put_and_retain_obj(self, obj) -> ray.ObjectRef:
    """``ray.put`` the object and keep its ref in ``self.object_refs``.

    Returns:
        The ObjectRef itself (not a protobuf response), so callers can
        both pickle it and read its binary id.
    """
    objectref = ray.put(obj)
    # Keyed by binary id, which is how the other handlers look refs up.
    self.object_refs[objectref.binary()] = objectref
    logger.info("put: %s" % objectref)
    return objectref
def WaitObject(self, request, context=None) -> ray_client_pb2.WaitResponse:
object_refs = [cloudpickle.loads(o) for o in request.object_refs]
object_refs = [cloudpickle.loads(o) for o in request.object_handles]
num_returns = request.num_returns
timeout = request.timeout
object_refs_ids = []
for object_ref in object_refs:
if object_ref.id not in self.object_refs:
if object_ref.binary() not in self.object_refs:
return ray_client_pb2.WaitResponse(valid=False)
object_refs_ids.append(self.object_refs[object_ref.id])
object_refs_ids.append(self.object_refs[object_ref.binary()])
try:
ready_object_refs, remaining_object_refs = ray.wait(
object_refs_ids,
@@ -59,94 +129,133 @@ class RayletServicer(ray_client_pb2_grpc.RayletDriverServicer):
logger.info("wait: %s %s" % (str(ready_object_refs),
str(remaining_object_refs)))
ready_object_ids = [
ready_object_ref.binary() for ready_object_ref in ready_object_refs
make_remote_ref(
id=ready_object_ref.binary(),
handle=cloudpickle.dumps(ready_object_ref),
) for ready_object_ref in ready_object_refs
]
remaining_object_ids = [
remaining_object_ref.binary()
for remaining_object_ref in remaining_object_refs
make_remote_ref(
id=remaining_object_ref.binary(),
handle=cloudpickle.dumps(remaining_object_ref),
) for remaining_object_ref in remaining_object_refs
]
return ray_client_pb2.WaitResponse(
valid=True,
ready_object_ids=ready_object_ids,
remaining_object_ids=remaining_object_ids)
def Schedule(self, task, context=None,
             prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
    """Dispatch a client task to the handler for its type.

    Args:
        task: the ``ClientTask`` to run.
        context: gRPC context, forwarded to the handler.
        prepared_args: already-converted arguments; when not None the
            handlers use them instead of converting ``task.args``.

    Raises:
        NotImplementedError: for a task type other than FUNCTION, ACTOR
            or METHOD.
    """
    logger.info("schedule: %s %s" %
                (task.name,
                 ray_client_pb2.ClientTask.RemoteExecType.Name(task.type)))
    if task.type == ray_client_pb2.ClientTask.FUNCTION:
        return self._schedule_function(task, context, prepared_args)
    elif task.type == ray_client_pb2.ClientTask.ACTOR:
        return self._schedule_actor(task, context, prepared_args)
    elif task.type == ray_client_pb2.ClientTask.METHOD:
        return self._schedule_method(task, context, prepared_args)
    else:
        raise NotImplementedError(
            "Unimplemented Schedule task type: %s" %
            ray_client_pb2.ClientTask.RemoteExecType.Name(task.type))
def _schedule_method(
        self,
        task: ray_client_pb2.ClientTask,
        context=None,
        prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
    """Invoke method ``task.name`` on the actor named by ``task.payload_id``.

    The result ref is retained in ``self.object_refs`` and returned to the
    caller as a RemoteRef (binary id plus pickled ref).

    Raises:
        Exception: if no actor is registered under ``task.payload_id``.
    """
    actor_handle = self.actor_refs.get(task.payload_id)
    if actor_handle is None:
        raise Exception(
            "Can't run an actor the server doesn't have a handle for")
    arglist = _convert_args(task.args, prepared_args)
    with stash_api_for_tests(self._test_mode):
        output = getattr(actor_handle, task.name).remote(*arglist)
    self.object_refs[output.binary()] = output
    pickled_ref = cloudpickle.dumps(output)
    return ray_client_pb2.ClientTaskTicket(
        return_ref=make_remote_ref(output.binary(), pickled_ref))
def _schedule_actor(self,
                    task: ray_client_pb2.ClientTask,
                    context=None,
                    prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
    """Instantiate the actor class referenced by ``task.payload_id``.

    The class is fetched from the retained object and wrapped with
    ``ray.remote`` once, then cached in ``self.registered_actor_classes``.
    The new actor is stored in ``self.actor_refs`` under its pickled handle,
    which is also the handle returned to the client.

    Raises:
        Exception: if the referenced object is not a class.
    """
    with stash_api_for_tests(self._test_mode):
        payload_ref = cloudpickle.loads(task.payload_id)
        class_key = payload_ref.binary()
        if class_key not in self.registered_actor_classes:
            actor_class_ref = self.object_refs[class_key]
            actor_class = ray.get(actor_class_ref)
            if not inspect.isclass(actor_class):
                raise Exception("Attempting to schedule actor that "
                                "isn't a class.")
            reg_class = ray.remote(actor_class)
            self.registered_actor_classes[class_key] = reg_class
        remote_class = self.registered_actor_classes[class_key]
        arglist = _convert_args(task.args, prepared_args)
        actor = remote_class.remote(*arglist)
        actorhandle = cloudpickle.dumps(actor)
        self.actor_refs[actorhandle] = actor
    return ray_client_pb2.ClientTaskTicket(
        return_ref=make_remote_ref(actor._actor_id.binary(), actorhandle))
def _schedule_function(
        self,
        task: ray_client_pb2.ClientTask,
        context=None,
        prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
    """Run the function referenced by ``task.payload_id`` as a Ray task.

    The function is fetched from the retained object and wrapped with
    ``ray.remote`` once, then cached in ``self.function_refs``.  The result
    ref is retained and returned as a RemoteRef.

    Raises:
        Exception: if the referenced object is not a plain function, or if
            the result ref is already retained.
    """
    payload_ref = cloudpickle.loads(task.payload_id)
    func_key = payload_ref.binary()
    if func_key not in self.function_refs:
        funcref = self.object_refs[func_key]
        func = ray.get(funcref)
        if not inspect.isfunction(func):
            raise Exception("Attempting to schedule function that "
                            "isn't a function.")
        self.function_refs[func_key] = ray.remote(func)
    remote_func = self.function_refs[func_key]
    arglist = _convert_args(task.args, prepared_args)
    # Prepare call if we're in a test
    with stash_api_for_tests(self._test_mode):
        output = remote_func.remote(*arglist)
    if output.binary() in self.object_refs:
        raise Exception("already found it")
    self.object_refs[output.binary()] = output
    pickled_output = cloudpickle.dumps(output)
    return ray_client_pb2.ClientTaskTicket(
        return_ref=make_remote_ref(output.binary(), pickled_output))
def _convert_args(arg_list, prepared_args=None):
    """Turn protobuf task arguments into values to pass to ``.remote``.

    Args:
        arg_list: iterable of protobuf args, decoded with
            ``convert_from_arg``.
        prepared_args: arguments that need no conversion; when not None
            they are returned as-is and ``arg_list`` is ignored.

    Returns:
        ``prepared_args`` unchanged, or a new list in which every
        ``ClientObjectRef`` has been replaced by ``_unpack_ref()``.
    """
    if prepared_args is not None:
        return prepared_args
    out = []
    for arg in arg_list:
        t = convert_from_arg(arg)
        # Exactly one entry per argument: refs are unpacked, other values
        # are passed through.
        out.append(t._unpack_ref() if isinstance(t, ClientObjectRef) else t)
    return out
def make_remote_ref(id: bytes, handle: bytes) -> ray_client_pb2.RemoteRef:
    """Pack an id and a handle into a ``RemoteRef`` protobuf message."""
    fields = {"id": id, "handle": handle}
    return ray_client_pb2.RemoteRef(**fields)
def return_exception_in_context(err, context):
    """Report ``err`` on a gRPC context; do nothing when there is none.

    The encoded exception becomes the status details and the status code
    is set to INTERNAL.
    """
    if context is None:
        return
    context.set_details(encode_exception(err))
    context.set_code(grpc.StatusCode.INTERNAL)
def serve(connection_str, test_mode=False):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
task_servicer = RayletServicer(test_mode=test_mode)
_set_server_api(RayServerAPI(task_servicer))
ray_client_pb2_grpc.add_RayletDriverServicer_to_server(
task_servicer, server)
server.add_insecure_port(connection_str)
+84 -60
View File
@@ -3,22 +3,29 @@ It implements the Ray API functions that are forwarded through grpc calls
to the server.
"""
import inspect
import json
import logging
from typing import Any
from typing import List
from typing import Tuple
from typing import Optional
import ray.cloudpickle as cloudpickle
from ray.util.inspect import is_cython
import grpc
from ray.exceptions import TaskCancelledError
import ray.core.generated.ray_client_pb2 as ray_client_pb2
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
from ray.experimental.client.common import convert_to_arg
from ray.experimental.client.common import decode_exception
from ray.experimental.client.common import ClientObjectRef
from ray.experimental.client.common import ClientActorRef
from ray.experimental.client.common import ClientActorClass
from ray.experimental.client.common import ClientRemoteMethod
from ray.experimental.client.common import ClientActorHandle
from ray.experimental.client.common import ClientRemoteFunc
logger = logging.getLogger(__name__)
class Worker:
def __init__(self,
@@ -34,6 +41,7 @@ class Worker:
metadata: additional metadata passed in the grpc request headers.
"""
self.metadata = metadata
self.channel = None
if stub is None:
if secure:
credentials = grpc.ssl_channel_credentials()
@@ -44,28 +52,32 @@ class Worker:
else:
self.server = stub
def get(self, vals, *, timeout: Optional[float] = None) -> Any:
    """Fetch the value(s) behind one ref or a list of refs.

    Args:
        vals: a ``ClientObjectRef`` or a list of objects exposing
            ``.handle``.
        timeout: forwarded with each request; ``None`` is sent as 0.

    Returns:
        A single value for a single ref, otherwise a list of values.

    Raises:
        Exception: if ``vals`` is neither a list nor a ClientObjectRef.
    """
    to_get = []
    single = False
    if isinstance(vals, list):
        to_get = [x.handle for x in vals]
    elif isinstance(vals, ClientObjectRef):
        to_get = [vals.handle]
        single = True
    else:
        raise Exception("Can't get something that's not a "
                        "list of IDs or just an ID: %s" % type(vals))
    if timeout is None:
        timeout = 0
    out = [self._get(x, timeout) for x in to_get]
    if single:
        out = out[0]
    return out
def _get(self, handle: bytes, timeout: float):
    """Issue one GetObject request and unpickle the returned data.

    Raises:
        The exception decoded from the RPC error details, if the call
            fails with ``grpc.RpcError``.
        TaskCancelledError: if the server marks the response invalid.
    """
    req = ray_client_pb2.GetRequest(handle=handle, timeout=timeout)
    try:
        data = self.server.GetObject(req, metadata=self.metadata)
    except grpc.RpcError as e:
        raise decode_exception(e.details())
    if not data.valid:
        raise TaskCancelledError(handle)
    return cloudpickle.loads(data.data)
def put(self, vals):
@@ -86,7 +98,7 @@ class Worker:
data = cloudpickle.dumps(val)
req = ray_client_pb2.PutRequest(data=data)
resp = self.server.PutObject(req, metadata=self.metadata)
return ClientObjectRef(resp.id)
return ClientObjectRef.from_remote_ref(resp.ref)
def wait(self,
object_refs: List[ClientObjectRef],
@@ -98,8 +110,8 @@ class Worker:
for ref in object_refs:
assert isinstance(ref, ClientObjectRef)
data = {
"object_refs": [
cloudpickle.dumps(object_ref) for object_ref in object_refs
"object_handles": [
object_ref.handle for object_ref in object_refs
],
"num_returns": num_returns,
"timeout": timeout if timeout else -1
@@ -110,10 +122,12 @@ class Worker:
# TODO(ameer): improve error/exceptions messages.
raise Exception("Client Wait request failed. Reference invalid?")
client_ready_object_ids = [
ClientObjectRef(id) for id in resp.ready_object_ids
ClientObjectRef.from_remote_ref(ref)
for ref in resp.ready_object_ids
]
client_remaining_object_ids = [
ClientObjectRef(id) for id in resp.remaining_object_ids
ClientObjectRef.from_remote_ref(ref)
for ref in resp.remaining_object_ids
]
return (client_ready_object_ids, client_remaining_object_ids)
@@ -130,50 +144,60 @@ class Worker:
raise TypeError("The @ray.remote decorator must be applied to "
"either a function or to a class.")
def call_remote(self, instance, *args, **kwargs):
    """Schedule the task described by a client stub and return its ref.

    The stub builds its own ``ClientTask`` via ``_prepare_client_task``
    (so no task-kind argument or per-kind helper is needed here); each
    positional argument is converted with ``convert_to_arg`` and appended.
    Keyword arguments are accepted but not sent.

    Returns:
        A ``ClientObjectRef`` built from the ticket's ``return_ref``.
    """
    task = instance._prepare_client_task()
    for arg in args:
        pb_arg = convert_to_arg(arg)
        task.args.append(pb_arg)
    # Use the module logger (not the root logger) and defer formatting
    # until the record is actually emitted.
    logger.debug("Scheduling %s", task)
    ticket = self.server.Schedule(task, metadata=self.metadata)
    return ClientObjectRef.from_remote_ref(ticket.return_ref)
def close(self):
    """Drop the server stub and close the gRPC channel if there is one."""
    self.server = None
    # ``channel`` may be unset (falsy) -- e.g. when no channel was
    # opened -- so it must be checked before closing.
    if self.channel:
        self.channel.close()
def terminate_actor(self, actor: ClientActorHandle,
                    no_restart: bool) -> None:
    """Ask the server to kill the actor behind ``actor``.

    Raises:
        ValueError: if ``actor`` is not a ClientActorHandle.
        The exception decoded from the RPC error details, if the call
            fails with ``grpc.RpcError``.
    """
    if not isinstance(actor, ClientActorHandle):
        raise ValueError("ray.kill() only supported for actors. "
                         "Got: {}.".format(type(actor)))
    kill_spec = ray_client_pb2.TerminateRequest.ActorTerminate()
    kill_spec.no_restart = no_restart
    kill_spec.handle = actor.actor_ref.handle
    try:
        request = ray_client_pb2.TerminateRequest(actor=kill_spec)
        self.server.Terminate(request)
    except grpc.RpcError as e:
        raise decode_exception(e.details())
def terminate_task(self, obj: ClientObjectRef, force: bool,
                   recursive: bool) -> None:
    """Ask the server to cancel the task that produces ``obj``.

    Raises:
        TypeError: if ``obj`` is not a ClientObjectRef.
        The exception decoded from the RPC error details, if the call
            fails with ``grpc.RpcError``.
    """
    if not isinstance(obj, ClientObjectRef):
        raise TypeError(
            "ray.cancel() only supported for non-actor object refs. "
            f"Got: {type(obj)}.")
    cancel_spec = ray_client_pb2.TerminateRequest.TaskObjectTerminate()
    cancel_spec.recursive = recursive
    cancel_spec.force = force
    cancel_spec.handle = obj.handle
    try:
        request = ray_client_pb2.TerminateRequest(task_object=cancel_spec)
        self.server.Terminate(request)
    except grpc.RpcError as e:
        raise decode_exception(e.details())
def get_cluster_info(self, type: ray_client_pb2.ClusterInfoType.TypeEnum):
    """Query the server for cluster information of the given type.

    Returns:
        The resource table when the response carries one, otherwise the
        value parsed from the response's JSON text.
    """
    request = ray_client_pb2.ClusterInfoRequest()
    request.type = type
    response = self.server.ClusterInfo(request)
    has_table = response.WhichOneof("response_type") == "resource_table"
    if not has_table:
        return json.loads(response.json)
    return response.resource_table.table
def is_initialized(self) -> bool:
    """Report the server's IS_INITIALIZED cluster info.

    Returns False without contacting anything when ``self.server`` is None.
    """
    if self.server is None:
        return False
    return self.get_cluster_info(
        ray_client_pb2.ClusterInfoType.IS_INITIALIZED)
+6 -6
View File
@@ -7,8 +7,8 @@ from ray.core.generated.gcs_pb2 import (
JobConfig,
ErrorTableData,
GcsEntry,
HeartbeatBatchTableData,
HeartbeatTableData,
ResourceUsageBatchData,
ResourcesData,
ObjectTableData,
ProfileTableData,
TablePrefix,
@@ -33,8 +33,8 @@ __all__ = [
"ErrorTableData",
"ErrorType",
"GcsEntry",
"HeartbeatBatchTableData",
"HeartbeatTableData",
"ResourceUsageBatchData",
"ResourcesData",
"ObjectTableData",
"ProfileTableData",
"TablePrefix",
@@ -55,8 +55,8 @@ FUNCTION_PREFIX = "RemoteFunction:"
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
REPORTER_CHANNEL = "RAY_REPORTER"
# xray heartbeats
XRAY_HEARTBEAT_BATCH_PATTERN = "HEARTBEAT_BATCH:".encode("ascii")
# xray resource usages
XRAY_RESOURCES_BATCH_PATTERN = "RESOURCES_BATCH:".encode("ascii")
# xray job updates
XRAY_JOB_PATTERN = "JOB:*".encode("ascii")
@@ -23,7 +23,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
c_vector[c_string] GetAllProfileInfo()
c_vector[c_string] GetAllObjectInfo()
unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
unique_ptr[c_string] GetAllHeartbeat()
unique_ptr[c_string] GetAllResourceUsage()
c_vector[c_string] GetAllActorInfo()
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
c_string GetNodeResourceInfo(const CNodeID &node_id)
@@ -78,11 +78,11 @@ cdef class GlobalStateAccessor:
return c_string(object_info.get().data(), object_info.get().size())
return None
def get_all_heartbeat(self):
"""Get newest heartbeat of all nodes from GCS service."""
def get_all_resource_usage(self):
"""Get newest resource usage of all nodes from GCS service."""
cdef unique_ptr[c_string] result
with nogil:
result = self.inner.get().GetAllHeartbeat()
result = self.inner.get().GetAllResourceUsage()
if result:
return c_string(result.get().data(), result.get().size())
return None
+5 -1
View File
@@ -90,7 +90,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const CTaskOptions &options, c_vector[CObjectID] *return_ids,
int max_retries,
c_pair[CPlacementGroupID, int64_t] placement_options,
c_bool placement_group_capture_child_tasks)
c_bool placement_group_capture_child_tasks,
c_string debugger_breakpoint)
CRayStatus CreateActor(
const CRayFunction &function,
const c_vector[unique_ptr[CTaskArg]] &args,
@@ -101,6 +102,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CPlacementGroupID *placement_group_id)
CRayStatus RemovePlacementGroup(
const CPlacementGroupID &placement_group_id)
CRayStatus WaitPlacementGroupReady(
const CPlacementGroupID &placement_group_id, int timeout_ms)
void SubmitActorTask(
const CActorID &actor_id, const CRayFunction &function,
const c_vector[unique_ptr[CTaskArg]] &args,
@@ -222,6 +225,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const c_vector[shared_ptr[CRayObject]] &args,
const c_vector[CObjectID] &arg_reference_ids,
const c_vector[CObjectID] &return_ids,
const c_string debugger_breakpoint,
c_vector[shared_ptr[CRayObject]] *returns) nogil
) task_execution_callback
(void(const CWorkerID &) nogil) on_worker_shutdown
+3 -5
View File
@@ -15,7 +15,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
int64_t raylet_heartbeat_timeout_milliseconds() const
c_bool light_heartbeat_enabled() const
c_bool light_report_resource_usage_enabled() const
int64_t debug_dump_period_milliseconds() const
@@ -51,10 +51,6 @@ cdef extern from "ray/common/ray_config.h" nogil:
uint64_t object_manager_default_chunk_size() const
int num_workers_per_process_python() const
int num_workers_per_process_java() const
uint32_t maximum_gcs_deletion_batch_size() const
int64_t max_direct_call_object_size() const
@@ -68,3 +64,5 @@ cdef extern from "ray/common/ray_config.h" nogil:
c_bool enable_timeline() const
c_bool automatic_object_deletion_enabled() const
uint32_t max_grpc_message_size() const
+6 -10
View File
@@ -14,8 +14,8 @@ cdef class Config:
return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()
@staticmethod
def light_heartbeat_enabled():
return RayConfig.instance().light_heartbeat_enabled()
def light_report_resource_usage_enabled():
return RayConfig.instance().light_report_resource_usage_enabled()
@staticmethod
def debug_dump_period_milliseconds():
@@ -88,14 +88,6 @@ cdef class Config:
def object_manager_default_chunk_size():
return RayConfig.instance().object_manager_default_chunk_size()
@staticmethod
def num_workers_per_process_python():
return RayConfig.instance().num_workers_per_process_python()
@staticmethod
def num_workers_per_process_java():
return RayConfig.instance().num_workers_per_process_java()
@staticmethod
def maximum_gcs_deletion_batch_size():
return RayConfig.instance().maximum_gcs_deletion_batch_size()
@@ -119,3 +111,7 @@ cdef class Config:
@staticmethod
def automatic_object_deletion_enabled():
return RayConfig.instance().automatic_object_deletion_enabled()
@staticmethod
def max_grpc_message_size():
return RayConfig.instance().max_grpc_message_size()
+9 -1
View File
@@ -1,7 +1,9 @@
import ray
import ray.worker
from ray import profiling
__all__ = ["free", "global_gc"]
MAX_MESSAGE_LENGTH = ray._config.max_grpc_message_size()
def global_gc():
@@ -22,7 +24,13 @@ def memory_summary():
raylet = ray.nodes()[0]
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
ray.nodes()[0]["NodeManagerPort"])
channel = grpc.insecure_channel(raylet_address)
channel = grpc.insecure_channel(
raylet_address,
options=[
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
],
)
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
reply = stub.FormatGlobalMemoryInfo(
node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
+2 -1
View File
@@ -133,7 +133,7 @@ class LogMonitor:
job_match = JOB_LOG_PATTERN.match(file_path)
if job_match:
job_id = job_match.group(2)
worker_pid = job_match.group(3)
worker_pid = int(job_match.group(3))
else:
job_id = None
worker_pid = None
@@ -361,4 +361,5 @@ if __name__ == "__main__":
f"failed with the following error:\n{traceback_str}")
ray.utils.push_error_to_driver_through_redis(
redis_client, ray_constants.LOG_MONITOR_DIED_ERROR, message)
logger.error(message)
raise e
+1 -1
View File
@@ -91,7 +91,7 @@ class MemoryMonitor:
if not psutil:
logger.warn("WARNING: Not monitoring node memory since `psutil` "
"is not installed. Install this with "
"`pip install psutil` (or ray[debug]) to enable "
"`pip install psutil` to enable "
"debugging of memory-related crashes.")
def get_memory_usage(self):
+20 -14
View File
@@ -85,7 +85,11 @@ class Monitor:
This is used to receive notifications about failed components.
"""
def __init__(self, redis_address, autoscaling_config, redis_password=None):
def __init__(self,
redis_address,
autoscaling_config,
redis_password=None,
prefix_cluster_info=False):
# Initialize the Redis clients.
ray.state.state._initialize_global_state(
redis_address, redis_password=redis_password)
@@ -107,8 +111,10 @@ class Monitor:
head_node_ip = redis_address.split(":")[0]
self.load_metrics = LoadMetrics(local_ip=head_node_ip)
if autoscaling_config:
self.autoscaler = StandardAutoscaler(autoscaling_config,
self.load_metrics)
self.autoscaler = StandardAutoscaler(
autoscaling_config,
self.load_metrics,
prefix_cluster_info=prefix_cluster_info)
self.autoscaling_config = autoscaling_config
else:
self.autoscaler = None
@@ -139,24 +145,24 @@ class Monitor:
self.primary_subscribe_client.subscribe(channel)
def update_load_metrics(self):
"""Fetches heartbeat data from GCS and updates load metrics."""
"""Fetches resource usage data from GCS and updates load metrics."""
all_heartbeat = self.global_state_accessor.get_all_heartbeat()
heartbeat_batch_data = \
ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
for heartbeat_message in heartbeat_batch_data.batch:
resource_load = dict(heartbeat_message.resource_load)
total_resources = dict(heartbeat_message.resources_total)
available_resources = dict(heartbeat_message.resources_available)
all_resources = self.global_state_accessor.get_all_resource_usage()
resources_batch_data = \
ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources)
for resource_message in resources_batch_data.batch:
resource_load = dict(resource_message.resource_load)
total_resources = dict(resource_message.resources_total)
available_resources = dict(resource_message.resources_available)
waiting_bundles, infeasible_bundles = parse_resource_demands(
heartbeat_batch_data.resource_load_by_shape)
resources_batch_data.resource_load_by_shape)
pending_placement_groups = list(
heartbeat_batch_data.placement_group_load.placement_group_data)
resources_batch_data.placement_group_load.placement_group_data)
# Update the load metrics for this raylet.
node_id = ray.utils.binary_to_hex(heartbeat_message.node_id)
node_id = ray.utils.binary_to_hex(resource_message.node_id)
ip = self.raylet_id_to_ip_map.get(node_id)
if ip:
self.load_metrics.update(ip, total_resources,
+9 -9
View File
@@ -1,14 +1,14 @@
linux:
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
darwin:
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-macosx_10_13_x86_64.whl
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-macosx_10_13_intel.whl
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-macosx_10_13_x86_64.whl
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-macosx_10_13_intel.whl
win32:
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-win_amd64.whl
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-win_amd64.whl
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-win_amd64.whl
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-win_amd64.whl
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-win_amd64.whl
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-win_amd64.whl
+11 -10
View File
@@ -339,10 +339,6 @@ class Node:
"""Get the cluster Redis password"""
return self._ray_params.redis_password
@property
def load_code_from_local(self):
return self._ray_params.load_code_from_local
@property
def object_ref_seed(self):
"""Get the seed for deterministic generation of object refs"""
@@ -723,14 +719,12 @@ class Node:
stderr_file=stderr_file,
config=self._config,
java_worker_options=self._ray_params.java_worker_options,
load_code_from_local=self._ray_params.load_code_from_local,
huge_pages=self._ray_params.huge_pages,
fate_share=self.kernel_fate_share,
socket_to_use=self.socket,
head_node=self.head,
start_initial_python_workers_for_first_job=self._ray_params.
start_initial_python_workers_for_first_job,
code_search_path=self._ray_params.code_search_path)
start_initial_python_workers_for_first_job)
assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
@@ -739,12 +733,19 @@ class Node:
raise NotImplementedError
def start_monitor(self):
"""Start the monitor."""
"""Start the monitor.
Autoscaling output goes to these monitor.err/out files, and
any modification to these files may break existing
cluster launching commands.
"""
stdout_file, stderr_file = self.get_log_file_handles(
"monitor", unique=True)
process_info = ray._private.services.start_monitor(
self._redis_address,
self._logs_dir,
stdout_file=subprocess.DEVNULL,
stderr_file=subprocess.DEVNULL,
stdout_file=stdout_file,
stderr_file=stderr_file,
autoscaling_config=self._ray_params.autoscaling_config,
redis_password=self._ray_params.redis_password,
fate_share=self.kernel_fate_share)
-108
View File
@@ -1,108 +0,0 @@
"""
Ray operator for Kubernetes.
Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
create_or_update_cluster(), then runs an autoscaling loop in the operator pod
executing this script. Writes autoscaling logs to the directory
/root/ray-operator-logs.
In this setup, the ray head node does not run an autoscaler. It is important
NOT to supply an --autoscaling-config argument to head node's ray start command
in the cluster config when using this operator.
To run, first create a ConfigMap named ray-operator-configmap from a ray
cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
For example:
kubectl create namespace raytest
kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
""" # noqa
import os
from typing import Any, Dict, IO, Tuple
import kubernetes
import yaml
from ray._private import services
from ray.autoscaler._private.commands import create_or_update_cluster
from ray.autoscaler._private.kubernetes import core_api
from ray.utils import open_log
from ray import ray_constants
RAY_CLUSTER_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
RAY_CONFIG_MAP = "ray-operator-configmap"
RAY_CONFIG_DIR = "/root"
LOG_DIR = "/root/ray-operator-logs"
ERR_NAME, OUT_NAME = "ray-operator.err", "ray-operator.out"
def prepare_ray_cluster_config() -> str:
    """Materialize the cluster config stored in the operator ConfigMap.

    Reads the ConfigMap named by RAY_CONFIG_MAP in RAY_CLUSTER_NAMESPACE,
    parses each entry as YAML, overrides the provider namespace with
    RAY_CLUSTER_NAMESPACE and writes the result under RAY_CONFIG_DIR using
    the entry's key as the file name.

    Returns:
        Path of the (last) config file written.
    """
    entries = core_api().read_namespaced_config_map(
        name=RAY_CONFIG_MAP, namespace=RAY_CLUSTER_NAMESPACE).data
    # The ConfigMap data is expected to hold a single key:value pair.
    for file_name, raw_yaml in entries.items():
        cluster_config = yaml.safe_load(raw_yaml)
        cluster_config["provider"]["namespace"] = RAY_CLUSTER_NAMESPACE
        written_path = os.path.join(RAY_CONFIG_DIR, file_name)
        with open(written_path, "w") as out:
            yaml.dump(cluster_config, out)
    return written_path
def get_ray_head_pod_ip(config: Dict[str, Any]) -> str:
    """Return the pod IP of the head pod of the cluster described by *config*.

    Pods are looked up in RAY_CLUSTER_NAMESPACE by the ``component=ray-head``
    and ``ray-cluster-name`` labels; exactly one match is asserted.
    """
    cluster_name = config["cluster_name"]
    head_pods = core_api().list_namespaced_pod(
        namespace=RAY_CLUSTER_NAMESPACE,
        label_selector=f"component=ray-head,ray-cluster-name={cluster_name}",
    ).items
    assert (len(head_pods)) == 1
    return head_pods[-1].status.pod_ip
def get_logs() -> Tuple[IO, IO]:
    """Open the operator's log files under LOG_DIR.

    Returns:
        A ``(stderr_log, stdout_log)`` pair -- note the error log comes first.
    """
    try:
        os.makedirs(LOG_DIR)
    except OSError:
        # Best effort: directory creation failures (e.g. it already exists)
        # are ignored here.
        pass
    stderr_log, stdout_log = (
        open_log(os.path.join(LOG_DIR, log_name))
        for log_name in (ERR_NAME, OUT_NAME))
    return stderr_log, stdout_log
def main():
    """Launch the Ray head from the ConfigMap config and start a monitor.

    Loads the in-cluster Kubernetes credentials, creates or updates the
    cluster, rewrites the config file with the config returned by
    create_or_update_cluster(), then starts the monitor against the head
    pod's redis address with output going to the operator log files.
    """
    kubernetes.config.load_incluster_config()
    config_path = prepare_ray_cluster_config()
    launch_options = dict(
        override_min_workers=None,
        override_max_workers=None,
        no_restart=False,
        restart_only=False,
        yes=True,
        no_config_cache=True)
    resolved_config = create_or_update_cluster(config_path, **launch_options)
    with open(config_path, "w") as config_file:
        yaml.dump(resolved_config, config_file)
    head_ip = get_ray_head_pod_ip(resolved_config)
    # TODO: Add support for user-specified redis port and password
    redis_address = services.address(head_ip, ray_constants.DEFAULT_PORT)
    # get_logs() returns the error log first.
    stderr_file, stdout_file = get_logs()
    services.start_monitor(
        redis_address,
        stdout_file=stdout_file,
        stderr_file=stderr_file,
        autoscaling_config=config_path,
        redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
if __name__ == "__main__":
main()
View File
+154
View File
@@ -0,0 +1,154 @@
"""
Ray operator for Kubernetes.
Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
create_or_update_cluster(), then runs an autoscaling loop in the operator pod
executing this script. Writes autoscaling logs to the directory
/root/ray-operator-logs.
In this setup, the ray head node does not run an autoscaler. It is important
NOT to supply an --autoscaling-config argument to head node's ray start command
in the cluster config when using this operator.
To run, first create a ConfigMap named ray-operator-configmap from a ray
cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
For example:
kubectl create namespace raytest
kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
""" # noqa
import logging
import multiprocessing as mp
import os
from typing import Any, Callable, Dict, Optional
from kubernetes.client.exceptions import ApiException
import yaml
from ray._private import services
from ray.autoscaler._private import commands
from ray import monitor
from ray.operator import operator_utils
from ray import ray_constants
class RayCluster():
    """One Ray cluster managed by the operator.

    Holds the cluster config, the path of its on-disk config file, a log
    handler dedicated to the cluster, and at most one subprocess in which
    the head node is launched and the monitor is run.
    """

    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.name = self.config["cluster_name"]
        self.config_path = operator_utils.config_path(self.name)

        self.setup_logging()

        self.subprocess = None  # type: Optional[mp.Process]

    def do_in_subprocess(self,
                         f: Callable[[], None],
                         wait_to_finish: bool = False) -> None:
        """Run *f* in a fresh subprocess named after the cluster.

        Any subprocess that is still alive is terminated first. When
        *wait_to_finish* is true this call blocks until *f* has finished.
        """
        self.clean_up_subprocess()
        # daemon=True: the child is terminated when the parent process exits.
        worker = mp.Process(name=self.name, target=f, daemon=True)
        self.subprocess = worker
        worker.start()
        if wait_to_finish:
            worker.join()

    def clean_up_subprocess(self):
        """Terminate and join the cluster's subprocess if it is alive."""
        worker = self.subprocess
        if not worker or not worker.is_alive():
            return
        worker.terminate()
        worker.join()

    def create_or_update(self) -> None:
        """Start the head and the monitor in a new subprocess."""
        self.do_in_subprocess(self._create_or_update)

    def _create_or_update(self) -> None:
        """Subprocess target: launch the head, then run the monitor."""
        self.start_head()
        self.start_monitor()

    def start_head(self) -> None:
        """Write the config file and create or update the cluster from it.

        The config returned by create_or_update_cluster() replaces
        ``self.config`` and is written back to the config file.
        """
        self.write_config()
        launch_options = dict(
            override_min_workers=None,
            override_max_workers=None,
            no_restart=False,
            restart_only=False,
            yes=True,
            no_config_cache=True)
        self.config = commands.create_or_update_cluster(
            self.config_path, **launch_options)
        self.write_config()

    def start_monitor(self) -> None:
        """Create a Monitor for the head node's redis address and run it."""
        head_ip = commands.get_head_node_ip(self.config_path)
        # TODO: Add support for user-specified redis port and password
        redis_address = services.address(head_ip, ray_constants.DEFAULT_PORT)
        self.mtr = monitor.Monitor(
            redis_address=redis_address,
            autoscaling_config=self.config_path,
            redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
            prefix_cluster_info=True)
        self.mtr.run()

    def clean_up(self) -> None:
        """Stop the subprocess, detach the log handler, remove the config."""
        self.clean_up_subprocess()
        self.clean_up_logging()
        self.delete_config()

    def setup_logging(self) -> None:
        """Attach a stream handler that only emits this cluster's records.

        Records are selected by process name, which do_in_subprocess() sets
        to the cluster name; each line is prefixed with the cluster name.
        """

        def from_this_cluster(rec):
            return rec.processName == self.name

        self.handler = logging.StreamHandler()
        self.handler.addFilter(from_this_cluster)
        prefixed_format = self.name + ":" + ray_constants.LOGGER_FORMAT
        self.handler.setFormatter(logging.Formatter(prefixed_format))
        operator_utils.root_logger.addHandler(self.handler)

    def clean_up_logging(self) -> None:
        """Detach this cluster's handler from the operator's root logger."""
        operator_utils.root_logger.removeHandler(self.handler)

    def write_config(self) -> None:
        """Dump ``self.config`` as YAML to ``self.config_path``."""
        with open(self.config_path, "w") as config_file:
            yaml.dump(self.config, config_file)

    def delete_config(self) -> None:
        """Remove the cluster's config file from disk."""
        os.remove(self.config_path)
ray_clusters = {}
def cluster_action(cluster_config: Dict[str, Any], event_type: str) -> None:
    """Apply one RayCluster watch event to the operator's cluster registry.

    Args:
        cluster_config: Autoscaler config derived from the custom resource
            carried by the event; must contain "cluster_name".
        event_type: Watch event type. "ADDED", "MODIFIED" and "DELETED" are
            handled; any other value is ignored.

    Raises:
        KeyError: If a "MODIFIED" or "DELETED" event names a cluster that is
            not registered.
    """
    cluster_name = cluster_config["cluster_name"]
    if event_type == "ADDED":
        ray_clusters[cluster_name] = RayCluster(cluster_config)
        ray_clusters[cluster_name].create_or_update()
    elif event_type == "MODIFIED":
        ray_cluster = ray_clusters[cluster_name]
        # Adopt the config from the modified resource; otherwise the update
        # would re-apply the config the cluster was originally created with.
        ray_cluster.config = cluster_config
        ray_cluster.create_or_update()
    elif event_type == "DELETED":
        ray_clusters[cluster_name].clean_up()
        del ray_clusters[cluster_name]
def main() -> None:
    """Run the operator control loop.

    Ensures the cluster config directory exists, then consumes the stream of
    RayCluster custom resource events, converting each resource to an
    autoscaler config and dispatching it to cluster_action().

    Raises:
        Exception: If the API server answers 404, which suggests the
            RayCluster CRD has not been created.
        ApiException: Any other Kubernetes API error is re-raised unchanged.
    """
    # Make directory for ray cluster configs. exist_ok avoids the
    # check-then-create race and also creates missing parent directories.
    os.makedirs(operator_utils.RAY_CONFIG_DIR, exist_ok=True)
    # Control loop
    cluster_cr_stream = operator_utils.cluster_cr_stream()
    try:
        for event in cluster_cr_stream:
            cluster_cr = event["object"]
            event_type = event["type"]
            cluster_config = operator_utils.cr_to_config(cluster_cr)
            cluster_action(cluster_config, event_type)
    except ApiException as e:
        if e.status == 404:
            # Chain explicitly so the original API error is reported as the
            # cause of the friendlier message.
            raise Exception(
                "Caught a 404 error. Has the RayCluster CRD been created?"
            ) from e
        else:
            raise
if __name__ == "__main__":
main()
+114
View File
@@ -0,0 +1,114 @@
import copy
import logging
import os
from typing import Any, Dict, Iterator, List
from kubernetes.watch import Watch
from ray.autoscaler._private.kubernetes import custom_objects_api
RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
RAY_CONFIG_DIR = os.path.expanduser("~/ray_cluster_configs")
CONFIG_SUFFIX = "_config.yaml"
CONFIG_FIELDS = {
"maxWorkers": "max_workers",
"upscalingSpeed": "upscaling_speed",
"idleTimeoutMinutes": "idle_timeout_minutes",
"headPodType": "head_node_type",
"workerDefaultPodType": "worker_default_node_type",
"workerStartRayCommands": "worker_start_ray_commands",
"headStartRayCommands": "head_start_ray_commands",
"podTypes": "available_node_types"
}
NODE_TYPE_FIELDS = {
"minWorkers": "min_workers",
"maxWorkers": "max_workers",
"podConfig": "node_config",
"rayResources": "resources",
"setupCommands": "worker_setup_commands"
}
PROVIDER_CONFIG = {
"type": "kubernetes",
"use_internal_ips": True,
"namespace": RAY_NAMESPACE
}
root_logger = logging.getLogger("ray")
root_logger.setLevel(logging.getLevelName("DEBUG"))
"""
ownerReferences:
- apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
kind: ReplicaSet
name: my-repset
uid: d9607e19-f88f-11e6-a518-42010a800195
"""
def config_path(cluster_name: str) -> str:
    """Return the path of the config file for *cluster_name*.

    The file lives in RAY_CONFIG_DIR and is named after the cluster with
    CONFIG_SUFFIX appended.
    """
    return os.path.join(RAY_CONFIG_DIR, cluster_name + CONFIG_SUFFIX)
def cluster_cr_stream() -> Iterator:
    """Return a watch stream of RayCluster custom resource events.

    Watches the ``rayclusters`` resources of group ``cluster.ray.io``,
    version ``v1``, in RAY_NAMESPACE.
    """
    watcher = Watch()
    resource_selector = dict(
        namespace=RAY_NAMESPACE,
        group="cluster.ray.io",
        version="v1",
        plural="rayclusters")
    return watcher.stream(
        custom_objects_api().list_namespaced_custom_object,
        **resource_selector)
def cr_to_config(cluster_resource: Dict[str, Any]) -> Dict[str, Any]:
    """Convert RayCluster custom resource to a ray cluster config for use by the
    autoscaler.

    Args:
        cluster_resource: The custom resource object; its "spec" and
            "metadata" ("name", "uid") entries are read.

    Returns:
        A new config dict with the translated spec fields, the node types
        built from "podTypes", the cluster name and a provider section.

    Raises:
        KeyError: If a required entry is missing or the spec contains a
            field that has no translation in CONFIG_FIELDS.
    """
    cr_spec = cluster_resource["spec"]
    cr_meta = cluster_resource["metadata"]
    config = translate(cr_spec, dictionary=CONFIG_FIELDS)
    pod_types = cr_spec["podTypes"]
    # Replaces the raw "podTypes" list that translate() copied over.
    config["available_node_types"] = get_node_types(
        pod_types, cluster_name=cr_meta["name"], cluster_uid=cr_meta["uid"])
    config["cluster_name"] = cr_meta["name"]
    # Each config gets its own provider section; sharing the module-level
    # PROVIDER_CONFIG dict would let a change made through one config leak
    # into every other cluster's config.
    config["provider"] = copy.deepcopy(PROVIDER_CONFIG)
    return config
def get_node_types(pod_types: List[Dict[str, Any]], cluster_name: str,
                   cluster_uid: str) -> Dict[str, Any]:
    """Build the autoscaler node types from the custom resource pod types.

    Each pod type is deep-copied, stripped of its "name" (which becomes the
    node type key) and translated with NODE_TYPE_FIELDS. The pod metadata of
    every node type is tagged with an owner reference to the cluster.

    Raises:
        KeyError: If a pod type lacks "name", uses an unknown field, or its
            pod config has no "metadata" section.
    """
    owner_reference = get_cluster_owner_reference(cluster_name, cluster_uid)
    node_types = {}
    for pod_type in pod_types:
        type_name = pod_type["name"]
        fields = copy.deepcopy(pod_type)
        del fields["name"]
        node_type = translate(fields, dictionary=NODE_TYPE_FIELDS)
        node_types[type_name] = node_type
        # Deleting a RayCluster CR will also delete the associated pods.
        pod_metadata = node_type["node_config"]["metadata"]
        pod_metadata["ownerReferences"] = [owner_reference]
    return node_types
def get_cluster_owner_reference(cluster_name: str,
                                cluster_uid: str) -> Dict[str, Any]:
    """Return the ownerReferences entry pointing at a RayCluster resource.

    Args:
        cluster_name: Name of the owning RayCluster custom resource.
        cluster_uid: UID of the owning RayCluster custom resource.

    Returns:
        A dict suitable for a pod's ``metadata.ownerReferences`` list.
    """
    return {
        # Must be the owner's own API group/version: RayCluster resources
        # are watched under group "cluster.ray.io", version "v1"
        # ("apps/v1" is the group of ReplicaSets, not of RayCluster).
        "apiVersion": "cluster.ray.io/v1",
        "controller": True,
        "blockOwnerDeletion": True,
        "kind": "RayCluster",
        "name": cluster_name,
        "uid": cluster_uid
    }
def translate(configuration: Dict[str, Any],
              dictionary: Dict[str, str]) -> Dict[str, Any]:
    """Return a copy of *configuration* with its keys renamed via *dictionary*.

    Values are carried over unchanged.

    Raises:
        KeyError: If a key of *configuration* has no entry in *dictionary*.
    """
    return {
        dictionary[field]: value
        for field, value in configuration.items()
    }
+1 -8
View File
@@ -89,7 +89,6 @@ class RayParams:
contents to Redis.
autoscaling_config: path to autoscaling config file.
java_worker_options (list): The command options for Java worker.
load_code_from_local: Whether load code from local file or from GCS.
metrics_agent_port(int): The port to bind metrics agent.
metrics_export_port(int): The port at which metrics are exposed
through a Prometheus endpoint.
@@ -142,14 +141,12 @@ class RayParams:
include_log_monitor=None,
autoscaling_config=None,
java_worker_options=None,
load_code_from_local=False,
start_initial_python_workers_for_first_job=False,
_system_config=None,
enable_object_reconstruction=False,
metrics_agent_port=None,
metrics_export_port=None,
lru_evict=False,
code_search_path=None):
lru_evict=False):
self.object_ref_seed = object_ref_seed
self.redis_address = redis_address
self.num_cpus = num_cpus
@@ -186,7 +183,6 @@ class RayParams:
self.include_log_monitor = include_log_monitor
self.autoscaling_config = autoscaling_config
self.java_worker_options = java_worker_options
self.load_code_from_local = load_code_from_local
self.metrics_agent_port = metrics_agent_port
self.metrics_export_port = metrics_export_port
self.start_initial_python_workers_for_first_job = (
@@ -195,9 +191,6 @@ class RayParams:
self._lru_evict = lru_evict
self._enable_object_reconstruction = enable_object_reconstruction
self._check_usage()
self.code_search_path = code_search_path
if code_search_path is None:
self.code_search_path = []
# Set the internal config options for LRU eviction.
if lru_evict:
+5 -1
View File
@@ -197,7 +197,8 @@ LOG_MONITOR_MAX_OPEN_FILES = 200
# The object metadata field uses the following format: It is a comma
# separated list of fields. The first field is mandatory and is the
# type of the object (see types below) or an integer, which is interpreted
# as an error value.
# as an error value. The second part is optional and if present has the
# form DEBUG:<breakpoint_id>, it is used for implementing the debugger.
# A constant used as object metadata to indicate the object is cross language.
OBJECT_METADATA_TYPE_CROSS_LANGUAGE = b"XLANG"
@@ -213,6 +214,9 @@ OBJECT_METADATA_TYPE_RAW = b"RAW"
# of XLANG.
OBJECT_METADATA_TYPE_ACTOR_HANDLE = b"ACTOR_HANDLE"
# A constant indicating the debugging part of the metadata (see above).
OBJECT_METADATA_DEBUG_PREFIX = b"DEBUG:"
AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"
# The default password to prevent redis port scanning attack.
+40
View File
@@ -153,6 +153,46 @@ class StandardFdRedirectionRotatingFileHandler(RotatingFileHandler):
os.dup2(self.stream.fileno(), self.get_original_stream().fileno())
def get_worker_log_file_name(worker_type):
    """Build the log file base name for the current worker process.

    The name has the form ``<worker_name>-<worker id hex>-<job id>-<pid>``.
    For ``worker_type == "WORKER"`` the job id comes from the RAY_JOB_ID
    environment variable (which must be set); any other worker type is
    named "io_worker" and uses the nil job id.
    """
    if worker_type == "WORKER":
        job_id = os.environ.get("RAY_JOB_ID")
        assert job_id is not None, (
            "RAY_JOB_ID should be set as an env "
            "variable within default_worker.py. If you see this error, "
            "please report it to Ray's Github issue.")
        worker_name = "worker"
    else:
        job_id = ray.JobID.nil()
        worker_name = "io_worker"

    # Make sure these values are set already.
    assert ray.worker._global_node is not None
    assert ray.worker.global_worker is not None
    worker_id_hex = binary_to_hex(ray.worker.global_worker.worker_id)
    return f"{worker_name}-{worker_id_hex}-{job_id}-{os.getpid()}"
def configure_log_file(out_file, err_file):
    """Redirect this process's stdout and stderr to the given open files.

    Args:
        out_file: Open file object that stdout is redirected to.
        err_file: Open file object that stderr is redirected to.
    """
    stdout_fd = sys.stdout.fileno()
    stderr_fd = sys.stderr.fileno()
    # C++ logging requires redirecting the file descriptors themselves. Note
    # that dup2 will automatically close the old file descriptor before
    # overriding it.
    for log_file, target_fd in ((out_file, stdout_fd), (err_file, stderr_fd)):
        os.dup2(log_file.fileno(), target_fd)
    # sys.stdout and sys.stderr are also replaced by hand because that seems
    # to affect output buffering: without it both streams are heavily
    # buffered, resulting in seemingly lost logging statements. closefd=False
    # because the descriptors must never be closed by Python's GC; dup2
    # closes them when necessary.
    sys.stdout = ray.utils.open_log(
        stdout_fd, unbuffered=True, closefd=False)
    sys.stderr = ray.utils.open_log(
        stderr_fd, unbuffered=True, closefd=False)
def setup_and_get_worker_interceptor_logger(args,
max_bytes=0,
backup_count=0,
+4
View File
@@ -258,8 +258,12 @@ class RemoteFunction:
placement_group.id,
placement_group_bundle_index,
placement_group_capture_child_tasks,
worker.debugger_breakpoint,
override_environment_variables=override_environment_variables
or dict())
# Reset worker's debug context from the last "remote" command
# (which applies only to this .remote call).
worker.debugger_breakpoint = b""
if len(object_refs) == 1:
return object_refs[0]
elif len(object_refs) > 1:
+38 -21
View File
@@ -6,6 +6,7 @@ import logging
import os
import subprocess
import sys
from telnetlib import Telnet
import time
import urllib
import urllib.parse
@@ -150,6 +151,35 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
from None
def continue_debug_session():
    """Continue active debugging session.

    This function will connect 'ray debug' to the right debugger
    when a user is stepping between Ray tasks.

    It looks for a ``RAY_PDB_CONTINUE_*`` entry in the internal KV store,
    polls the matching ``RAY_PDB_*`` key once per second until session data
    appears, and then either stops (the session carries "exit_debugger") or
    attaches to the advertised pdb address over telnet and afterwards looks
    for a further continuation.
    """
    kv = ray.experimental.internal_kv
    for entry in kv._internal_kv_list("RAY_PDB_"):
        if not entry.startswith(b"RAY_PDB_CONTINUE"):
            continue
        print("Continuing pdb session in different process...")
        key = b"RAY_PDB_" + entry[len("RAY_PDB_CONTINUE_"):]
        while True:
            data = kv._internal_kv_get(key)
            if not data:
                # Session data not published yet; poll again shortly.
                time.sleep(1.0)
                continue
            session = json.loads(data)
            if "exit_debugger" in session:
                kv._internal_kv_del(key)
                return
            host, port = session["pdb_address"].split(":")
            with Telnet(host, int(port)) as tn:
                tn.interact()
            kv._internal_kv_del(key)
            # The user may have stepped into yet another task.
            continue_debug_session()
            return
@cli.command()
@click.option(
"--address",
@@ -158,12 +188,13 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
help="Override the address to connect to.")
def debug(address):
"""Show all active breakpoints and exceptions in the Ray debugger."""
from telnetlib import Telnet
if not address:
address = services.get_ray_address_to_use_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address)
ray.init(address=address, log_to_driver=False)
while True:
continue_debug_session()
active_sessions = ray.experimental.internal_kv._internal_kv_list(
"RAY_PDB_")
print("Active breakpoints:")
@@ -358,25 +389,12 @@ def debug(address):
default=None,
type=str,
help="Overwrite the options to start Java workers.")
@click.option(
"--code-search-path",
default=None,
hidden=True,
type=str,
help="A list of directories or jar files separated by colon that specify "
"the search path for user code. This will be used as `CLASSPATH` in "
"Java and `PYTHONPATH` in Python.")
@click.option(
"--system-config",
default=None,
hidden=True,
type=json.loads,
help="Override system configuration defaults.")
@click.option(
"--load-code-from-local",
is_flag=True,
default=False,
help="Specify whether load code from local file or GCS serialization.")
@click.option(
"--lru-evict",
is_flag=True,
@@ -405,8 +423,7 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
head, include_dashboard, dashboard_host, dashboard_port, block,
plasma_directory, autoscaling_config, no_redirect_worker_output,
no_redirect_output, plasma_store_socket_name, raylet_socket_name,
temp_dir, java_worker_options, load_code_from_local,
code_search_path, system_config, lru_evict,
temp_dir, java_worker_options, system_config, lru_evict,
enable_object_reconstruction, metrics_export_port, log_style,
log_color, verbose):
"""Start Ray processes manually on the local machine."""
@@ -465,8 +482,6 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
dashboard_host=dashboard_host,
dashboard_port=dashboard_port,
java_worker_options=java_worker_options,
load_code_from_local=load_code_from_local,
code_search_path=code_search_path,
_system_config=system_config,
lru_evict=lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
@@ -537,6 +552,8 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
with cli_logger.group("Next steps"):
cli_logger.print(
"To connect to this Ray runtime from another node, run")
# NOTE(kfstorm): The Java driver relies on this line to get the address
# of the cluster. Please be careful when updating this line.
cli_logger.print(
cf.bold(" ray start --address='{}'{}"), redis_address,
f" --redis-password='{redis_password}'"
@@ -632,7 +649,7 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
cli_logger.print(
"This command will now block until terminated by a signal.")
cli_logger.print(
"Runing subprocesses are monitored and a message will be "
"Running subprocesses are monitored and a message will be "
"printed if any of them terminate unexpectedly.")
while True:
@@ -1273,7 +1290,7 @@ def stack():
COMMAND = """
pyspy=`which py-spy`
if [ ! -e "$pyspy" ]; then
echo "ERROR: Please 'pip install py-spy' (or ray[debug]) first"
echo "ERROR: Please 'pip install py-spy' first"
exit 1
fi
# Set IFS to iterate over lines instead of over words.
+15 -8
View File
@@ -17,6 +17,14 @@ py_test(
deps = [":serve_lib"],
)
py_test(
name = "test_controller",
size = "small",
srcs = serve_tests_srcs,
tags = ["exclusive"],
deps = [":serve_lib"],
)
py_test(
name = "test_backend_worker",
size = "small",
@@ -35,14 +43,13 @@ py_test(
)
# TODO(simon): Test skipped until #11683 fixed.
# py_test(
# name = "test_failure",
# size = "medium",
# srcs = serve_tests_srcs,
# tags = ["exclusive"],
# deps = [":serve_lib"],
# )
py_test(
name = "test_failure",
size = "medium",
srcs = serve_tests_srcs,
tags = ["exclusive"],
deps = [":serve_lib"],
)
py_test(
+81 -15
View File
@@ -1,6 +1,9 @@
import asyncio
import atexit
import time
from functools import wraps
import os
from uuid import UUID
import ray
from ray.serve.constants import (DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT,
@@ -42,6 +45,8 @@ class Client:
self._controller_name = controller_name
self._detached = detached
self._shutdown = False
self._http_host, self._http_port = ray.get(
controller.get_http_config.remote())
# NOTE(simon): Used to cache client.get_handle(endpoint) call. It will
# mostly grow in size, it will only shrink when user calls the
@@ -62,9 +67,9 @@ class Client:
def __del__(self):
if not self._detached:
logger.info("Shutting down Ray Serve because client went out of "
"scope. To prevent this, either keep a reference to "
"the client object or use serve.start(detached=True).")
logger.debug("Shutting down Ray Serve because client went out of "
"scope. To prevent this, either keep a reference to "
"the client or use serve.start(detached=True).")
self.shutdown()
def __reduce__(self):
@@ -78,11 +83,34 @@ class Client:
Shuts down all processes and deletes all state associated with the
instance.
"""
if not self._shutdown:
if (not self._shutdown) and ray.is_initialized():
ray.get(self._controller.shutdown.remote())
ray.kill(self._controller, no_restart=True)
# Wait for the named actor entry to be removed as well.
started = time.time()
while True:
try:
ray.get_actor(self._controller_name)
if time.time() - started > 5:
logger.warning(
"Waited 5s for Serve to shutdown gracefully but "
"the controller is still not cleaned up. "
"You can ignore this warning if you are shutting "
"down the Ray cluster.")
break
except ValueError: # actor name is removed
break
self._shutdown = True
@_ensure_connected
def _get_result(self, result_object_id: ray.ObjectRef) -> bool:
    """Resolve a controller call to its final result.

    *result_object_id* resolves to a result id; the controller's
    ``wait_for_event`` is then awaited for that id and its value returned.
    """
    result_id: UUID = ray.get(result_object_id)
    event_ref = self._controller.wait_for_event.remote(result_id)
    result = ray.get(event_ref)
    logger.debug(f"Getting result_id ({result_id}) with result: {result}")
    return result
@_ensure_connected
def create_endpoint(self,
endpoint_name: str,
@@ -137,10 +165,33 @@ class Client:
"an element of type {}".format(type(method)))
upper_methods.append(method.upper())
ray.get(
self._get_result(
self._controller.create_endpoint.remote(
endpoint_name, {backend: 1.0}, route, upper_methods))
# Block until the route table has been propagated to all HTTP proxies.
if route is not None:
def check_ready(http_response):
return route in http_response.json()
futures = []
for node_id in ray.state.node_ids():
future = block_until_http_ready.options(
num_cpus=0, resources={
node_id: 0.01
}).remote(
"http://{}:{}/-/routes".format(self._http_host,
self._http_port),
check_ready=check_ready,
timeout=HTTP_PROXY_TIMEOUT)
futures.append(future)
try:
    ray.get(futures)
except ray.exceptions.RayTaskError:
    # The second literal needs the f-prefix; without it the message shows
    # the placeholder text "{HTTP_PROXY_TIMEOUT}" instead of the timeout.
    raise TimeoutError("Route not available at HTTP proxies "
                       f"after {HTTP_PROXY_TIMEOUT}s.")
@_ensure_connected
def delete_endpoint(self, endpoint: str) -> None:
"""Delete the given endpoint.
@@ -149,7 +200,7 @@ class Client:
"""
if endpoint in self._handle_cache:
del self._handle_cache[endpoint]
ray.get(self._controller.delete_endpoint.remote(endpoint))
self._get_result(self._controller.delete_endpoint.remote(endpoint))
@_ensure_connected
def list_endpoints(self) -> Dict[str, Dict[str, Any]]:
@@ -193,7 +244,7 @@ class Client:
"config_options must be a BackendConfig or dictionary.")
if isinstance(config_options, dict):
config_options = BackendConfig.parse_obj(config_options)
ray.get(
self._get_result(
self._controller.update_backend_config.remote(
backend_tag, config_options))
@@ -222,7 +273,8 @@ class Client:
Args:
backend_tag (str): a unique tag assign to identify this backend.
func_or_class (callable, class): a function or a class implementing
__call__.
__call__, returning a JSON-serializable object or a
Starlette Response object.
actor_init_args (optional): the arguments to pass to the class.
initialization method.
ray_actor_options (optional): options to be passed into the
@@ -290,7 +342,7 @@ class Client:
raise TypeError("config must be a BackendConfig or a dictionary.")
backend_config._validate_complete()
ray.get(
self._get_result(
self._controller.create_backend.remote(backend_tag, backend_config,
replica_config))
@@ -308,7 +360,7 @@ class Client:
The backend must not currently be used by any endpoints.
"""
ray.get(self._controller.delete_backend.remote(backend_tag))
self._get_result(self._controller.delete_backend.remote(backend_tag))
@_ensure_connected
def set_traffic(self, endpoint_name: str,
@@ -327,7 +379,7 @@ class Client:
traffic_policy_dictionary (dict): a dictionary maps backend names
to their traffic weights. The weights must sum to 1.
"""
ray.get(
self._get_result(
self._controller.set_traffic.remote(endpoint_name,
traffic_policy_dictionary))
@@ -353,20 +405,24 @@ class Client:
(float, int)) or not 0 <= proportion <= 1:
raise TypeError("proportion must be a float from 0 to 1.")
ray.get(
self._get_result(
self._controller.shadow_traffic.remote(endpoint_name, backend_tag,
proportion))
@_ensure_connected
def get_handle(self,
endpoint_name: str,
missing_ok: Optional[bool] = False) -> RayServeHandle:
missing_ok: Optional[bool] = False,
sync: bool = True) -> RayServeHandle:
"""Retrieve RayServeHandle for service endpoint to invoke it from Python.
Args:
endpoint_name (str): A registered service endpoint.
missing_ok (bool): If true, then Serve won't check the endpoint is
registered. False by default.
sync (bool): If true, then Serve will return a ServeHandle that
works everywhere. Otherwise, Serve will return a ServeHandle
that's only usable in asyncio loop.
Returns:
RayServeHandle
@@ -375,8 +431,14 @@ class Client:
self._controller.get_all_endpoints.remote()):
raise KeyError(f"Endpoint '{endpoint_name}' does not exist.")
if asyncio.get_event_loop().is_running() and sync:
logger.warning(
"You are retrieving a ServeHandle inside an asyncio loop. "
"Try getting client.get_handle(.., sync=False) to get better "
"performance.")
if endpoint_name not in self._handle_cache:
handle = RayServeHandle(self._controller, endpoint_name, sync=True)
handle = RayServeHandle(self._controller, endpoint_name, sync=sync)
self._handle_cache[endpoint_name] = handle
return self._handle_cache[endpoint_name]
@@ -445,7 +507,11 @@ def start(detached: bool = False,
"http://{}:{}/-/routes".format(http_host, http_port),
timeout=HTTP_PROXY_TIMEOUT)
futures.append(future)
ray.get(futures)
try:
    ray.get(futures)
except ray.exceptions.RayTaskError:
    # f-prefix added so the timeout value is interpolated into the message
    # rather than printing the literal "{HTTP_PROXY_TIMEOUT}".
    raise TimeoutError(
        f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")
return Client(controller, controller_name, detached=detached)
+8 -5
View File
@@ -15,10 +15,13 @@ from ray.serve.utils import (parse_request_item, _get_logger, chain_future,
from ray.serve.exceptions import RayServeException
from ray.util import metrics
from ray.serve.config import BackendConfig
from ray.serve.long_poll import LongPollerAsyncClient
from ray.serve.long_poll import LongPollAsyncClient
from ray.serve.router import Query
from ray.serve.constants import (DEFAULT_LATENCY_BUCKET_MS,
BACKEND_RECONFIGURE_METHOD)
from ray.serve.constants import (
BACKEND_RECONFIGURE_METHOD,
DEFAULT_LATENCY_BUCKET_MS,
LongPollKey,
)
from ray.exceptions import RayTaskError
logger = _get_logger()
@@ -168,8 +171,8 @@ class RayServeReplica:
tag_keys=("backend", ))
self.request_counter.set_default_tags({"backend": self.backend_tag})
self.long_poll_client = LongPollerAsyncClient(controller_handle, {
"backend_configs": self._update_backend_configs,
self.long_poll_client = LongPollAsyncClient(controller_handle, {
LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
})
self.error_counter = metrics.Count(
+4 -3
View File
@@ -1,7 +1,7 @@
cluster_name: default
min_workers: 22
max_workers: 22
initial_workers: 22
min_workers: 5
max_workers: 5
initial_workers: 5
autoscaling_mode: default
docker:
image: 'anyscale/ray-ml:latest'
@@ -28,6 +28,7 @@ initialization_commands: []
setup_commands:
- apt-get install build-essential libssl-dev git -y
- 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
- ray install-nightly
head_setup_commands: []
worker_setup_commands: []
head_start_ray_commands:
+55 -31
View File
@@ -23,64 +23,88 @@
# 2 forwarders and 5 worker replicas: 620 requests/s
# 2 forwarders and 10 worker replicas: 609 requests/s
import asyncio
import time
import ray
from ray import serve
from ray.serve import BackendConfig
from ray.serve.utils import logger
import time
num_queries = 2000
num_queries = 10000
max_concurrent_queries = 100000
ray.init(address="auto")
client = serve.start()
def hello_world(_):
def worker(_):
return b"Hello World"
class ForwardActor:
def __init__(self):
def __init__(self, sync: bool):
client = serve.connect()
self.handle = client.get_handle("hello_world")
self.sync = sync
self.handle = client.get_handle("worker", sync=sync)
async def __call__(self, _):
await self.handle.remote()
if self.sync:
await self.handle.remote()
else:
await (await self.handle.remote_async())
client.create_backend("hello_world", hello_world)
client.create_endpoint("hello_world", backend="hello_world")
async def run_test(num_replicas, num_forwarders, sync):
client = serve.start()
client.create_backend(
"worker",
worker,
config=BackendConfig(
num_replicas=num_replicas,
max_concurrent_queries=max_concurrent_queries,
))
client.create_endpoint("worker", backend="worker")
endpoint_name = "worker"
client.create_backend("ForwardActor", ForwardActor)
client.create_endpoint("ForwardActor", backend="ForwardActor")
if num_forwarders > 0:
client.create_backend(
"ForwardActor",
ForwardActor,
sync,
config=BackendConfig(
num_replicas=num_forwarders,
max_concurrent_queries=max_concurrent_queries))
client.create_endpoint("ForwardActor", backend="ForwardActor")
endpoint_name = "ForwardActor"
def run_test(num_replicas, num_forwarders):
replicas_config = BackendConfig(num_replicas=num_replicas)
client.update_backend_config("hello_world", replicas_config)
if (num_forwarders == 0):
handle = client.get_handle("hello_world")
else:
forwarders_config = BackendConfig(num_replicas=num_forwarders)
client.update_backend_config("ForwardActor", forwarders_config)
handle = client.get_handle("ForwardActor")
handle = client.get_handle(endpoint_name, sync=sync)
# warmup - helpful to wait for gc.collect() and actors to start
start = time.time()
while time.time() - start < 1:
ray.get(handle.remote())
if sync:
ray.get(handle.remote())
else:
ray.get(await handle.remote_async())
# real test
start = time.time()
ray.get([handle.remote() for _ in range(num_queries)])
if sync:
ray.get([handle.remote() for _ in range(num_queries)])
else:
ray.get([(await handle.remote_async()) for _ in range(num_queries)])
qps = num_queries / (time.time() - start)
logger.info("{} forwarders and {} worker replicas: {} requests/s".format(
num_forwarders, num_replicas, int(qps)))
print(
f"Sync: {sync}, {num_forwarders} forwarders and {num_replicas} worker "
f"replicas: {int(qps)} requests/s")
client.shutdown()
for num_forwarders in [0, 1, 2]:
for num_replicas in [1, 5, 10]:
run_test(num_replicas, num_forwarders)
async def main():
    """Run the benchmark once per (sync, forwarders, replicas) combination.

    Combinations are visited in the same nested order as a triple loop:
    sync mode outermost, then the number of forwarders, then the number of
    worker replicas. Each run is awaited before the next one starts.
    """
    configurations = [(sync, num_forwarders, num_replicas)
                      for sync in [True, False]
                      for num_forwarders in [0, 1, 2]
                      for num_replicas in [1, 5, 10]]
    for sync, num_forwarders, num_replicas in configurations:
        await run_test(num_replicas, num_forwarders, sync)
asyncio.get_event_loop().run_until_complete(main())
@@ -86,13 +86,14 @@ async def main():
client.create_backend("backend", backend)
client.create_endpoint("endpoint", backend="backend", route="/api")
for intermediate_handles in [False, True]:
if (intermediate_handles):
if intermediate_handles:
client.create_endpoint(
"backend", backend="backend", route="/backend")
class forwardActor:
def __init__(self):
client = serve.connect()
self.handle = client.get_handle("backend")
def __call__(self, _):
+32 -29
View File
@@ -36,73 +36,76 @@ from ray import serve
from ray.serve import BackendConfig
from ray.serve.utils import logger
from ray.util.placement_group import (placement_group, remove_placement_group)
from ray.util.placement_group import placement_group, remove_placement_group
ray.shutdown()
ray.init(address="auto")
client = serve.start()
# These numbers need to correspond with the autoscaler config file.
# The number of remote nodes in the autoscaler should upper bound
# these because sometimes nodes fail to update.
num_workers = 20
expected_num_nodes = num_workers + 1
cpus_per_node = 4
num_remote_cpus = expected_num_nodes * cpus_per_node
# We ask for more worker but only need to run on smaller subset.
# This should account for worker nodes failed to launch.
expected_num_nodes = 6
num_replicas = 11
# wrk HTTP load testing config
num_connections = 20
num_threads = 2
time_to_run = "20s"
# Wait until the expected number of nodes have joined the cluster.
while True:
num_nodes = len(ray.nodes())
num_nodes = len(list(filter(lambda node: node["Alive"], ray.nodes())))
logger.info("Waiting for nodes {}/{}".format(num_nodes,
expected_num_nodes))
if num_nodes >= expected_num_nodes:
break
time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
ray.cluster_resources())
client = serve.start()
def hey(_):
    """Backend used by the load test: pause briefly, then reply b"hey".

    The single argument is ignored.
    """
    simulated_latency_s = 0.01  # Sleep for 10ms
    time.sleep(simulated_latency_s)
    return b"hey"
num_connections = int(num_remote_cpus * 0.75)
num_threads = 2
time_to_run = "10s"
pg = placement_group(
[{
"CPU": 1
} for _ in range(expected_num_nodes)], strategy="STRICT_SPREAD")
ray.get(pg.ready())
# The number of replicas is the number of cores remaining after accounting
# for the one HTTP proxy actor on each node, the "hey" requester task on each
# node, and the serve controller.
# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
num_replicas = ray.available_resources()["CPU"]
logger.info("Starting %i replicas", num_replicas)
client.create_backend(
"hey", hey, config=BackendConfig(num_replicas=num_replicas))
client.create_endpoint("hey", backend="hey", route="/hey")
@ray.remote
@ray.remote(num_cpus=0)
def run_wrk():
logger.info("Warming up for ~3 seconds")
for _ in range(5):
resp = requests.get("http://127.0.0.1:8000/hey").text
logger.info("Received response \'" + resp + "\'")
time.sleep(0.5)
logger.info("Warming up")
for _ in range(10):
try:
resp = requests.get("http://127.0.0.1:8000/hey").text
logger.info("Received response '" + resp + "'")
time.sleep(0.5)
except Exception as e:
logger.info(f"Got exception {e}")
result = subprocess.run(
[
"wrk", "-c",
str(num_connections), "-t",
str(num_threads), "-d", time_to_run, "http://127.0.0.1:8000/hey"
"wrk",
"-c",
str(num_connections),
"-t",
str(num_threads),
"-d",
time_to_run,
"http://127.0.0.1:8000/hey",
],
stdout=subprocess.PIPE)
stdout=subprocess.PIPE,
)
return result.stdout.decode()
+1
View File
@@ -23,6 +23,7 @@ initialization_commands: []
setup_commands:
- apt-get install build-essential libssl-dev git -y
- 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
- ray install-nightly
head_setup_commands: []
worker_setup_commands: []
head_start_ray_commands:
+12
View File
@@ -1,3 +1,5 @@
from enum import auto, Enum
#: Actor name used to register controller
SERVE_CONTROLLER_NAME = "SERVE_CONTROLLER_ACTOR"
@@ -37,3 +39,13 @@ DEFAULT_LATENCY_BUCKET_MS = [
#: Name of backend reconfiguration method implemented by user.
BACKEND_RECONFIGURE_METHOD = "reconfigure"
class LongPollKey(Enum):
    """Keys naming the pieces of state that are published via long polling."""

    REPLICA_HANDLES = auto()
    TRAFFIC_POLICIES = auto()
    BACKEND_CONFIGS = auto()
    ROUTE_TABLE = auto()

    def __repr__(self):
        # Render as "LongPollKey.<NAME>", omitting the auto-generated value.
        return f"{type(self).__name__}.{self.name}"
+191 -129
View File
@@ -6,20 +6,22 @@ import random
import time
from dataclasses import dataclass, field
from typing import Dict, Any, List, Optional, Tuple
from uuid import uuid4, UUID
from pydantic import BaseModel
import ray
import ray.cloudpickle as pickle
from ray.serve.autoscaling_policy import BasicAutoscalingPolicy
from ray.serve.backend_worker import create_backend_replica
from ray.serve.constants import ASYNC_CONCURRENCY, SERVE_PROXY_NAME
from ray.serve.constants import (ASYNC_CONCURRENCY, SERVE_PROXY_NAME,
LongPollKey)
from ray.serve.http_proxy import HTTPProxyActor
from ray.serve.kv_store import RayInternalKVStore
from ray.serve.exceptions import RayServeException
from ray.serve.utils import (format_actor_name, get_random_letters, logger,
try_schedule_resources_on_nodes, get_all_node_ids)
from ray.serve.config import BackendConfig, ReplicaConfig
from ray.serve.long_poll import LongPollerHost
from ray.serve.long_poll import LongPollHost
from ray.actor import ActorHandle
import numpy as np
@@ -144,7 +146,7 @@ class ActorStateReconciler:
controller_name: str = field(init=True)
detached: bool = field(init=True)
routers_cache: Dict[NodeId, ActorHandle] = field(default_factory=dict)
http_proxy_cache: Dict[NodeId, ActorHandle] = field(default_factory=dict)
backend_replicas: Dict[BackendTag, Dict[ReplicaTag, ActorHandle]] = field(
default_factory=lambda: defaultdict(dict))
backend_replicas_to_start: Dict[BackendTag, List[ReplicaTag]] = field(
@@ -156,8 +158,8 @@ class ActorStateReconciler:
# TODO(edoakes): consider removing this and just using the names.
def router_handles(self) -> List[ActorHandle]:
return list(self.routers_cache.values())
def http_proxy_handles(self) -> List[ActorHandle]:
    """Return the cached HTTP proxy actor handles as a new list."""
    cached_proxies = self.http_proxy_cache.values()
    return [*cached_proxies]
def get_replica_handles(self) -> List[ActorHandle]:
return list(
@@ -302,7 +304,7 @@ class ActorStateReconciler:
async def _stop_pending_backend_replicas(self) -> None:
"""Stops the pending backend replicas in self.backend_replicas_to_stop.
Removes backend_replicas from the router, kills them, and clears
Removes backend_replicas from the http_proxy, kills them, and clears
self.backend_replicas_to_stop.
"""
for backend_tag, replicas_list in self.backend_replicas_to_stop.items(
@@ -326,26 +328,26 @@ class ActorStateReconciler:
self.backend_replicas_to_stop.clear()
def _start_routers_if_needed(self, http_host: str, http_port: str,
http_middlewares: List[Any]) -> None:
"""Start a router on every node if it doesn't already exist."""
def _start_http_proxies_if_needed(self, http_host: str, http_port: str,
http_middlewares: List[Any]) -> None:
"""Start an HTTP proxy on every node if it doesn't already exist."""
if http_host is None:
return
for node_id, node_resource in get_all_node_ids():
if node_id in self.routers_cache:
if node_id in self.http_proxy_cache:
continue
router_name = format_actor_name(SERVE_PROXY_NAME,
self.controller_name, node_id)
name = format_actor_name(SERVE_PROXY_NAME, self.controller_name,
node_id)
try:
router = ray.get_actor(router_name)
proxy = ray.get_actor(name)
except ValueError:
logger.info("Starting router with name '{}' on node '{}' "
logger.info("Starting HTTP proxy with name '{}' on node '{}' "
"listening on '{}:{}'".format(
router_name, node_id, http_host, http_port))
router = HTTPProxyActor.options(
name=router_name,
name, node_id, http_host, http_port))
proxy = HTTPProxyActor.options(
name=name,
lifetime="detached" if self.detached else None,
max_concurrency=ASYNC_CONCURRENCY,
max_restarts=-1,
@@ -359,10 +361,10 @@ class ActorStateReconciler:
controller_name=self.controller_name,
http_middlewares=http_middlewares)
self.routers_cache[node_id] = router
self.http_proxy_cache[node_id] = proxy
def _stop_routers_if_needed(self) -> bool:
"""Removes router actors from any nodes that no longer exist.
def _stop_http_proxies_if_needed(self) -> bool:
"""Removes HTTP proxy actors from any nodes that no longer exist.
Returns whether or not any actors were removed (a checkpoint should
be taken).
@@ -370,25 +372,25 @@ class ActorStateReconciler:
actor_stopped = False
all_node_ids = {node_id for node_id, _ in get_all_node_ids()}
to_stop = []
for node_id in self.routers_cache:
for node_id in self.http_proxy_cache:
if node_id not in all_node_ids:
logger.info(
"Removing router on removed node '{}'.".format(node_id))
logger.info("Removing HTTP proxy on removed node '{}'.".format(
node_id))
to_stop.append(node_id)
for node_id in to_stop:
router_handle = self.routers_cache.pop(node_id)
ray.kill(router_handle, no_restart=True)
proxy = self.http_proxy_cache.pop(node_id)
ray.kill(proxy, no_restart=True)
actor_stopped = True
return actor_stopped
def _recover_actor_handles(self) -> None:
# Refresh the RouterCache
for node_id in self.routers_cache.keys():
router_name = format_actor_name(SERVE_PROXY_NAME,
self.controller_name, node_id)
self.routers_cache[node_id] = ray.get_actor(router_name)
for node_id in self.http_proxy_cache.keys():
name = format_actor_name(SERVE_PROXY_NAME, self.controller_name,
node_id)
self.http_proxy_cache[node_id] = ray.get_actor(name)
# Fetch actor handles for all of the backend replicas in the system.
# All of these backend_replicas are guaranteed to already exist because
@@ -420,12 +422,19 @@ class ActorStateReconciler:
return autoscaling_policies
@dataclass
class FutureResult:
    """Serializable record describing one in-flight request to the controller."""
    # Goal requested when this future was created
    requested_goal: Dict[str, Any]
@dataclass
class Checkpoint:
    """Controller state that is pickled and written to the KV store."""
    goal_state: SystemState
    current_state: SystemState
    reconciler: ActorStateReconciler
    # TODO(ilr) Rename reconciler to PendingState
    # In-flight results keyed by their UUID. The key type is UUID (the class);
    # uuid4 is only the function that generates such keys.
    inflight_reqs: Dict[UUID, FutureResult]
@ray.remote
@@ -474,7 +483,7 @@ class ServeController:
# backend -> AutoscalingPolicy
self.autoscaling_policies = dict()
# Dictionary of backend_tag -> router_name -> most recent queue length.
# Dictionary of backend_tag -> proxy_name -> most recent queue length.
self.backend_stats = defaultdict(lambda: defaultdict(dict))
# Used to ensure that only a single state-changing operation happens
@@ -487,56 +496,87 @@ class ServeController:
# If starting the actor for the first time, starts up the other system
# components. If recovering, fetches their actor handles.
self.actor_reconciler._start_routers_if_needed(
self.actor_reconciler._start_http_proxies_if_needed(
self.http_host, self.http_port, self.http_middlewares)
# NOTE(edoakes): unfortunately, we can't completely recover from a
# checkpoint in the constructor because we block while waiting for
# other actors to start up, and those actors fetch soft state from
# this actor. Because no other tasks will start executing until after
# the constructor finishes, if we were to run this logic in the
# constructor it could lead to deadlock between this actor and a child.
# However we do need to guarantee that we have fully recovered from a
# checkpoint before any other state-changing calls run. We address this
# by acquiring the write_lock and then posting the task to recover from
# a checkpoint to the event loop. Other state-changing calls acquire
# this lock and will be blocked until recovering from the checkpoint
# finishes.
# Map of awaiting results
# TODO(ilr): Checkpoint this once this becomes asynchronous
self.inflight_results: Dict[UUID, asyncio.Event] = dict()
self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()
checkpoint = self.kv_store.get(CHECKPOINT_KEY)
if checkpoint is None:
logger.debug("No checkpoint found")
else:
await self.write_lock.acquire()
asyncio.get_event_loop().create_task(
self._recover_from_checkpoint(checkpoint))
await self._recover_from_checkpoint(checkpoint)
# NOTE(simon): Currently we do all-to-all broadcast. This means
# any listeners will receive notification for all changes. This
# can be problem at scale, e.g. updating a single backend config
# will send over the entire configs. In the future, we should
# optimize the logic to support subscription by key.
self.long_poll_host = LongPollerHost()
self.long_poll_host = LongPollHost()
# The configs pushed out here get updated by
# self._recover_from_checkpoint in the failure scenario, so that must
# be run before we notify the changes.
self.notify_backend_configs_changed()
self.notify_replica_handles_changed()
self.notify_traffic_policies_changed()
self.notify_route_table_changed()
asyncio.get_event_loop().create_task(self.run_control_loop())
async def wait_for_event(self, uuid: UUID) -> bool:
    """Wait until the in-flight result identified by ``uuid`` has completed.

    Once the event fires, the entry is dropped from both in-flight maps and
    a checkpoint is written while holding the write lock.

    Args:
        uuid: Identifier of the in-flight result to wait for.

    Returns:
        Always True; returns immediately when ``uuid`` is not tracked.
    """
    event = self.inflight_results.get(uuid)
    if event is None:
        return True
    await event.wait()
    # Several callers may be waiting on the same uuid. Only the first one to
    # wake up still finds the entries, so removing them must tolerate their
    # absence instead of raising KeyError.
    self.inflight_results.pop(uuid, None)
    self._serializable_inflight_results.pop(uuid, None)
    async with self.write_lock:
        self._checkpoint()
    return True
def _create_event_with_result(
        self,
        goal_state: Dict[str, Any],
        recreation_uuid: Optional[UUID] = None) -> UUID:
    """Register an in-flight result for ``goal_state`` and return its id.

    NOTE(ilr) Must be called before checkpointing!

    Args:
        goal_state: The requested goal; wrapped in a FutureResult.
        recreation_uuid: Existing id to reuse, e.g. when entries are
            re-created while recovering from a checkpoint. A fresh uuid4
            is generated when omitted.

    Returns:
        The UUID under which the event and its FutureResult are tracked.
    """
    event = asyncio.Event()
    event.result = FutureResult(goal_state)
    # The event is set right away, so waiting on it does not block.
    event.set()
    uuid_val = recreation_uuid or uuid4()
    self.inflight_results[uuid_val] = event
    self._serializable_inflight_results[uuid_val] = event.result
    return uuid_val
async def _num_inflight_results(self) -> int:
return len(self.inflight_results)
def notify_replica_handles_changed(self):
self.long_poll_host.notify_changed(
"worker_handles", {
LongPollKey.REPLICA_HANDLES, {
backend_tag: list(replica_dict.values())
for backend_tag, replica_dict in
self.actor_reconciler.backend_replicas.items()
})
def notify_traffic_policies_changed(self):
self.long_poll_host.notify_changed("traffic_policies",
self.current_state.traffic_policies)
self.long_poll_host.notify_changed(
LongPollKey.TRAFFIC_POLICIES,
self.current_state.traffic_policies,
)
def notify_backend_configs_changed(self):
self.long_poll_host.notify_changed(
"backend_configs", self.current_state.get_backend_configs())
LongPollKey.BACKEND_CONFIGS,
self.current_state.get_backend_configs())
def notify_route_table_changed(self):
    """Publish the current route table to long-poll listeners."""
    host = self.long_poll_host
    host.notify_changed(
        LongPollKey.ROUTE_TABLE,
        self.current_state.routes,
    )
async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
"""Proxy long poll client's listen request.
@@ -549,13 +589,9 @@ class ServeController:
return await (
self.long_poll_host.listen_for_change(keys_to_snapshot_ids))
def get_routers(self) -> Dict[str, ActorHandle]:
"""Returns a dictionary of node ID to router actor handles."""
return self.actor_reconciler.routers_cache
def get_router_config(self) -> Dict[str, Tuple[str, List[str]]]:
"""Called by the router on startup to fetch required state."""
return self.current_state.routes
def get_http_proxies(self) -> Dict[str, ActorHandle]:
    """Return the reconciler's mapping of node ID to HTTP proxy actor handle."""
    proxies_by_node = self.actor_reconciler.http_proxy_cache
    return proxies_by_node
def _checkpoint(self) -> None:
"""Checkpoint internal state and write it to the KV store."""
@@ -565,7 +601,8 @@ class ServeController:
checkpoint = pickle.dumps(
Checkpoint(self.goal_state, self.current_state,
self.actor_reconciler))
self.actor_reconciler,
self._serializable_inflight_results))
self.kv_store.put(CHECKPOINT_KEY, checkpoint)
logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start))
@@ -578,35 +615,51 @@ class ServeController:
async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
"""Recover the instance state from the provided checkpoint.
This should be called in the constructor to ensure that the internal
state is updated before any other operations run. After running this,
internal state will be updated and long-poll clients may be notified.
Performs the following operations:
1) Deserializes the internal state from the checkpoint.
2) Pushes the latest configuration to the routers
in case we crashed before updating them.
3) Starts/stops any replicas that are pending creation or
2) Starts/stops any replicas that are pending creation or
deletion.
NOTE: this requires that self.write_lock is already acquired and will
release it before returning.
"""
assert self.write_lock.locked()
start = time.time()
logger.info("Recovering from checkpoint")
restored_checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
# Restore SystemState
self.current_state = restored_checkpoint.current_state
# Restore ActorStateReconciler
self.actor_reconciler = restored_checkpoint.reconciler
self.autoscaling_policies = await self.actor_reconciler.\
_recover_from_checkpoint(self.current_state, self)
self._serializable_inflight_results = restored_checkpoint.inflight_reqs
for uuid, fut_result in self._serializable_inflight_results.items():
self._create_event_with_result(fut_result.requested_goal, uuid)
logger.info(
"Recovered from checkpoint in {:.3f}s".format(time.time() - start))
# NOTE(edoakes): unfortunately, we can't completely recover from a
# checkpoint in the constructor because we block while waiting for
# other actors to start up, and those actors fetch soft state from
# this actor. Because no other tasks will start executing until after
# the constructor finishes, if we were to run this logic in the
# constructor it could lead to deadlock between this actor and a child.
# However, we do need to guarantee that we have fully recovered from a
# checkpoint before any other state-changing calls run. We address this
# by acquiring the write_lock and then posting the task to recover from
# a checkpoint to the event loop. Other state-changing calls acquire
# this lock and will be blocked until recovering from the checkpoint
# finishes. This can be removed once we move to the async control loop.
self.write_lock.release()
async def finish_recover_from_checkpoint():
assert self.write_lock.locked()
self.autoscaling_policies = await self.actor_reconciler.\
_recover_from_checkpoint(self.current_state, self)
self.write_lock.release()
logger.info(
"Recovered from checkpoint in {:.3f}s".format(time.time() -
start))
await self.write_lock.acquire()
asyncio.get_event_loop().create_task(finish_recover_from_checkpoint())
async def do_autoscale(self) -> None:
for backend, info in self.current_state.backends.items():
@@ -623,44 +676,30 @@ class ServeController:
while True:
await self.do_autoscale()
async with self.write_lock:
self.actor_reconciler._start_routers_if_needed(
self.actor_reconciler._start_http_proxies_if_needed(
self.http_host, self.http_port, self.http_middlewares)
checkpoint_required = self.actor_reconciler.\
_stop_routers_if_needed()
_stop_http_proxies_if_needed()
if checkpoint_required:
self._checkpoint()
await asyncio.sleep(CONTROL_LOOP_PERIOD_S)
def get_backend_configs(self) -> Dict[str, BackendConfig]:
"""Fetched by the router on startup."""
return self.current_state.get_backend_configs()
def get_traffic_policies(self) -> Dict[str, TrafficPolicy]:
"""Fetched by the router on startup."""
return self.current_state.traffic_policies
def _list_replicas(self, backend_tag: BackendTag) -> List[ReplicaTag]:
    """Return the replica tags tracked for ``backend_tag``.

    Used only for testing.
    """
    replicas = self.actor_reconciler.backend_replicas[backend_tag]
    return list(replicas)
def get_traffic_policy(self, endpoint: str) -> TrafficPolicy:
"""Fetched by serve handles."""
return self.current_state.traffic_policies[endpoint]
def get_all_replica_handles(self) -> Dict[str, Dict[str, ActorHandle]]:
"""Fetched by the router on startup."""
def _all_replica_handles(
self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
"""Used for testing."""
return self.actor_reconciler.backend_replicas
def get_all_backends(self) -> Dict[str, BackendConfig]:
def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
"""Returns a dictionary of backend tag to backend config."""
return self.current_state.get_backend_configs()
def get_all_endpoints(self) -> Dict[str, Dict[str, Any]]:
def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
"""Returns a dictionary of endpoint tag to endpoint info."""
return self.current_state.get_endpoints()
async def _set_traffic(self, endpoint_name: str,
traffic_dict: Dict[str, float]) -> None:
traffic_dict: Dict[str, float]) -> UUID:
if endpoint_name not in self.current_state.get_endpoints():
raise ValueError("Attempted to assign traffic for an endpoint '{}'"
" that is not registered.".format(endpoint_name))
@@ -677,21 +716,25 @@ class ServeController:
traffic_policy = TrafficPolicy(traffic_dict)
self.current_state.traffic_policies[endpoint_name] = traffic_policy
return_uuid = self._create_event_with_result({
endpoint_name: traffic_policy
})
# NOTE(edoakes): we must write a checkpoint before pushing the
# update to avoid inconsistent state if we crash after pushing the
# update.
self._checkpoint()
self.notify_traffic_policies_changed()
return return_uuid
async def set_traffic(self, endpoint_name: str,
traffic_dict: Dict[str, float]) -> None:
traffic_dict: Dict[str, float]) -> UUID:
"""Sets the traffic policy for the specified endpoint."""
async with self.write_lock:
await self._set_traffic(endpoint_name, traffic_dict)
return_uuid = await self._set_traffic(endpoint_name, traffic_dict)
return return_uuid
async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
proportion: float) -> None:
proportion: float) -> UUID:
"""Shadow traffic from the endpoint to the backend."""
async with self.write_lock:
if endpoint_name not in self.current_state.get_endpoints():
@@ -707,16 +750,22 @@ class ServeController:
self.current_state.traffic_policies[endpoint_name].set_shadow(
backend_tag, proportion)
traffic_policy = self.current_state.traffic_policies[endpoint_name]
return_uuid = self._create_event_with_result({
endpoint_name: traffic_policy
})
# NOTE(edoakes): we must write a checkpoint before pushing the
# update to avoid inconsistent state if we crash after pushing the
# update.
self._checkpoint()
self.notify_traffic_policies_changed()
return return_uuid
# TODO(architkulkarni): add Optional for route after cloudpickle upgrade
async def create_endpoint(self, endpoint: str,
traffic_dict: Dict[str, float], route,
methods) -> None:
methods) -> UUID:
"""Create a new endpoint with the specified route and methods.
If the route is None, this is a "headless" endpoint that will not
@@ -755,13 +804,11 @@ class ServeController:
self.current_state.routes[route] = (endpoint, methods)
# NOTE(edoakes): checkpoint is written in self._set_traffic.
await self._set_traffic(endpoint, traffic_dict)
await asyncio.gather(*[
router.set_route_table.remote(self.current_state.routes)
for router in self.actor_reconciler.router_handles()
])
return_uuid = await self._set_traffic(endpoint, traffic_dict)
self.notify_route_table_changed()
return return_uuid
async def delete_endpoint(self, endpoint: str) -> None:
async def delete_endpoint(self, endpoint: str) -> UUID:
"""Delete the specified endpoint.
Does not modify any corresponding backends.
@@ -788,19 +835,20 @@ class ServeController:
self.actor_reconciler.endpoints_to_remove.append(endpoint)
return_uuid = self._create_event_with_result({
route_to_delete: None,
endpoint: None
})
# NOTE(edoakes): we must write a checkpoint before pushing the
# updates to the routers to avoid inconsistent state if we crash
# updates to the proxies to avoid inconsistent state if we crash
# after pushing the update.
self._checkpoint()
await asyncio.gather(*[
router.set_route_table.remote(self.current_state.routes)
for router in self.actor_reconciler.router_handles()
])
self.notify_route_table_changed()
return return_uuid
async def create_backend(self, backend_tag: BackendTag,
backend_config: BackendConfig,
replica_config: ReplicaConfig) -> None:
replica_config: ReplicaConfig) -> UUID:
"""Register a new backend under the specified tag."""
async with self.write_lock:
# Ensures this method is idempotent.
@@ -815,12 +863,11 @@ class ServeController:
# Save creator that starts replicas, the arguments to be passed in,
# and the configuration for the backends.
self.current_state.add_backend(
backend_tag,
BackendInfo(
worker_class=backend_replica,
backend_config=backend_config,
replica_config=replica_config))
backend_info = BackendInfo(
worker_class=backend_replica,
backend_config=backend_config,
replica_config=replica_config)
self.current_state.add_backend(backend_tag, backend_info)
metadata = backend_config.internal_metadata
if metadata.autoscaling_config is not None:
self.autoscaling_policies[
@@ -835,6 +882,9 @@ class ServeController:
del self.current_state.backends[backend_tag]
raise e
return_uuid = self._create_event_with_result({
backend_tag: backend_info
})
# NOTE(edoakes): we must write a checkpoint before starting new
# or pushing the updated config to avoid inconsistent state if we
# crash while making the change.
@@ -844,11 +894,12 @@ class ServeController:
self.notify_replica_handles_changed()
# Set the backend config inside the router
# Set the backend config inside routers
# (particularly for max_concurrent_queries).
self.notify_backend_configs_changed()
return return_uuid
async def delete_backend(self, backend_tag: BackendTag) -> None:
async def delete_backend(self, backend_tag: BackendTag) -> UUID:
async with self.write_lock:
# This method must be idempotent. We should validate that the
# specified backend exists on the client.
@@ -876,19 +927,21 @@ class ServeController:
if backend_tag in self.autoscaling_policies:
del self.autoscaling_policies[backend_tag]
# Add the intention to remove the backend from the router.
# Add the intention to remove the backend from the routers.
self.actor_reconciler.backends_to_remove.append(backend_tag)
return_uuid = self._create_event_with_result({backend_tag: None})
# NOTE(edoakes): we must write a checkpoint before removing the
# backend from the router to avoid inconsistent state if we crash
# backend from the routers to avoid inconsistent state if we crash
# after pushing the update.
self._checkpoint()
await self.actor_reconciler._stop_pending_backend_replicas()
self.notify_replica_handles_changed()
return return_uuid
async def update_backend_config(self, backend_tag: BackendTag,
config_options: BackendConfig) -> None:
config_options: BackendConfig) -> UUID:
"""Set the config for the specified backend."""
async with self.write_lock:
assert (self.current_state.get_backend(backend_tag)
@@ -902,18 +955,22 @@ class ServeController:
backend_config._validate_complete()
self.current_state.get_backend(
backend_tag).backend_config = backend_config
backend_info = self.current_state.get_backend(backend_tag)
# Scale the replicas with the new configuration.
self.actor_reconciler._scale_backend_replicas(
self.current_state.backends, backend_tag,
backend_config.num_replicas)
return_uuid = self._create_event_with_result({
backend_tag: backend_info
})
# NOTE(edoakes): we must write a checkpoint before pushing the
# update to avoid inconsistent state if we crash after pushing the
# update.
self._checkpoint()
# Inform the router about change in configuration
# Inform the routers about change in configuration
# (particularly for setting max_batch_size).
await self.actor_reconciler._start_pending_backend_replicas(
@@ -922,6 +979,7 @@ class ServeController:
self.notify_replica_handles_changed()
self.notify_backend_configs_changed()
return return_uuid
def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
"""Get the current config for the specified backend."""
@@ -929,11 +987,15 @@ class ServeController:
), "Backend {} is not registered.".format(backend_tag)
return self.current_state.get_backend(backend_tag).backend_config
def get_http_config(self):
    """Return the HTTP proxy configuration as a ``(host, port)`` tuple."""
    host, port = self.http_host, self.http_port
    return host, port
async def shutdown(self) -> None:
"""Shuts down the serve instance completely."""
async with self.write_lock:
for router in self.actor_reconciler.router_handles():
ray.kill(router, no_restart=True)
for http_proxy in self.actor_reconciler.http_proxy_handles():
ray.kill(http_proxy, no_restart=True)
for replica in self.actor_reconciler.get_replica_handles():
ray.kill(replica, no_restart=True)
self.kv_store.delete(CHECKPOINT_KEY)
+2 -1
View File
@@ -89,5 +89,6 @@ class RandomEndpointPolicy(EndpointPolicy):
query.metadata.shard_key.encode("utf-8"))
chosen_backend, shadow_backends = self._select_backends(value)
logger.debug(f"Chosen backend {chosen_backend} for query {query}")
logger.debug(f"Assigning query {query.metadata.request_id} "
f"to backend {chosen_backend}.")
return [chosen_backend] + shadow_backends
+13 -3
View File
@@ -7,6 +7,7 @@ import ray
from ray.serve.context import TaskContext
from ray.serve.router import RequestMetadata, Router
from ray.serve.utils import get_random_letters
from ray.serve.exceptions import RayServeException
global_async_loop = None
@@ -109,16 +110,25 @@ class RayServeHandle:
``**kwargs``: All keyword arguments will be available in
``request.args``.
"""
assert self.sync, "handle.remote() should be called from sync handle."
if not self.sync:
raise RayServeException(
"You are trying to call handle.remote() with async handle. "
"Please use `await handle.remote_async()` instead.")
coro = self._remote(request_data, kwargs)
future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe(
coro, self.async_loop)
# Block until the result is ready.
return future.result()
async def _remote_async(self, request_data, **kwargs) -> ray.ObjectRef:
async def remote_async(self,
request_data: Optional[Union[Dict, Any]] = None,
**kwargs) -> ray.ObjectRef:
"""Experimental API for enqueuing a request in an async context."""
assert not self.sync, "_remote_async must be called inside async loop."
if not asyncio.get_event_loop().is_running():
raise RayServeException(
"remote_async must be called from a running event loop.")
return await self._remote(request_data, kwargs)
def options(self,
+34 -27
View File
@@ -3,46 +3,46 @@ import socket
from typing import List
import uvicorn
import starlette.responses
import ray
from ray.exceptions import RayTaskError
from ray.serve.constants import LongPollKey
from ray.serve.context import TaskContext
from ray.util import metrics
from ray.serve.utils import _get_logger, get_random_letters
from ray.serve.http_util import Response
from ray.serve.long_poll import LongPollAsyncClient
from ray.serve.router import Router, RequestMetadata
# The maximum number of times to retry a request due to actor failure.
# TODO(edoakes): this should probably be configurable.
MAX_ACTOR_DEAD_RETRIES = 10
logger = _get_logger()
class HTTPProxy:
"""
This class should be instantiated and ran by ASGI server.
"""This class is meant to be instantiated and run by an ASGI HTTP server.
>>> import uvicorn
>>> uvicorn.run(HTTPProxy(kv_store_actor_handle, router_handle))
# blocks forever
"""
async def fetch_config_from_controller(self, controller_name):
assert ray.is_initialized()
def __init__(self, controller_name):
controller = ray.get_actor(controller_name)
self.route_table = await controller.get_router_config.remote()
self.route_table = {} # Should be updated via long polling.
self.router = Router(controller)
self.long_poll_client = LongPollAsyncClient(controller, {
LongPollKey.ROUTE_TABLE: self._update_route_table,
})
self.request_counter = metrics.Count(
"num_http_requests",
description="The number of HTTP requests processed",
tag_keys=("route", ))
self.router = Router(controller)
async def setup(self):
    """Run the asynchronous part of the router's initialization.

    Awaits ``self.router.setup_in_async_loop()``, so it has to be called
    from a coroutine.
    """
    await self.router.setup_in_async_loop()
def set_route_table(self, route_table):
async def _update_route_table(self, route_table):
logger.debug(f"HTTP Proxy: Get updated route table: {route_table}.")
self.route_table = route_table
async def receive_http_body(self, scope, receive, send):
@@ -74,8 +74,11 @@ class HTTPProxy:
status_code=404).send(scope, receive, send)
async def __call__(self, scope, receive, send):
# NOTE: This implements ASGI protocol specified in
# https://asgi.readthedocs.io/en/latest/specs/index.html
"""Implements the ASGI protocol.
See details at:
https://asgi.readthedocs.io/en/latest/specs/index.html.
"""
error_sender = self._make_error_sender(scope, receive, send)
@@ -126,6 +129,18 @@ class HTTPProxy:
if isinstance(result, RayTaskError):
error_message = "Task Error. Traceback: {}.".format(result)
await error_sender(error_message, 500)
elif isinstance(result, starlette.responses.Response):
if isinstance(result, starlette.responses.StreamingResponse):
raise TypeError("Starlette StreamingResponse returned by "
f"backend for endpoint {endpoint_name}. "
"StreamingResponse is unserializable and not "
"supported by Ray Serve. Consider using "
"another Starlette response type such as "
"Response, HTMLResponse, PlainTextResponse, "
"or JSONResponse. If support for "
"StreamingResponse is desired, please let "
"the Ray team know by making a Github issue!")
await result(scope, receive, send)
else:
await Response(result).send(scope, receive, send)
@@ -137,12 +152,13 @@ class HTTPProxyActor:
host,
port,
controller_name,
http_middlewares: List["starlette.middleware.Middleware"] = []):
http_middlewares: List[
"starlette.middleware.Middleware"] = []): # noqa: F821
self.host = host
self.port = port
self.app = HTTPProxy()
await self.app.fetch_config_from_controller(controller_name)
self.app = HTTPProxy(controller_name)
await self.app.setup()
self.wrapped_app = self.app
for middleware in http_middlewares:
@@ -180,12 +196,3 @@ class HTTPProxyActor:
# the main thread and uvicorn doesn't expose a way to configure it.
server.install_signal_handlers = lambda: None
await server.serve(sockets=[sock])
async def set_route_table(self, route_table):
    """Forward a new route table to the wrapped ``HTTPProxy`` app."""
    proxy_app = self.app
    proxy_app.set_route_table(route_table)
# ------ Proxy router logic ------ #
async def assign_request(self, request_meta, *request_args,
                         **request_kwargs):
    """Send a request through the proxy's router and return its result.

    The router's ``assign_request`` is awaited first; what it resolves to
    is itself awaitable, so that is awaited in turn to obtain the result.
    """
    router = self.app.router
    pending = await router.assign_request(request_meta, *request_args,
                                          **request_kwargs)
    return await pending
+1 -1
View File
@@ -117,7 +117,7 @@ class Response:
elif content_type == "json":
self.raw_headers.append([b"content-type", b"application/json"])
else:
raise ValueError("Invalid content type {}".foramt(content_type))
raise ValueError("Invalid content type {}".format(content_type))
async def send(self, scope, receive, send):
await send({
+20 -19
View File
@@ -1,4 +1,5 @@
import asyncio
from inspect import iscoroutinefunction
import random
from collections import defaultdict
from dataclasses import dataclass
@@ -22,7 +23,7 @@ class UpdatedObject:
UpdateStateAsyncCallable = Callable[[Any], Awaitable[None]]
class LongPollerAsyncClient:
class LongPollAsyncClient:
"""The asynchronous long polling client.
Internally, it runs `await object_ref` in a `while True` loop. When a
@@ -31,7 +32,7 @@ class LongPollerAsyncClient:
the next poll.
Args:
host_actor(ray.ActorHandle): handle to actor embedding LongPollerHost.
host_actor(ray.ActorHandle): handle to actor embedding LongPollHost.
key_listeners(Dict[str, AsyncCallable]): a dictionary mapping keys to
callbacks to be called on state update for the corresponding keys.
"""
@@ -40,6 +41,10 @@ class LongPollerAsyncClient:
key_listeners: Dict[str, UpdateStateAsyncCallable]) -> None:
self.host_actor = host_actor
self.key_listeners = key_listeners
for callback in key_listeners.values():
if not iscoroutinefunction(callback):
raise ValueError(
"Callbacks to async long poller must be 'async def'.")
self.snapshot_ids: Dict[str, int] = {
key: -1
@@ -56,34 +61,31 @@ class LongPollerAsyncClient:
self.snapshot_ids)
return object_ref
def _update(self, updates: Dict[str, UpdatedObject]):
    """Record the newest snapshot and snapshot id for each updated key."""
    for key, changed in updates.items():
        # Keep both maps in step: the object itself and the id that will be
        # sent back to the host on the next poll.
        self.object_snapshots[key] = changed.object_snapshot
        self.snapshot_ids[key] = changed.snapshot_id
async def _do_long_poll(self):
while True:
try:
updates: Dict[str, UpdatedObject] = await self._poll_once()
self._update(updates)
logger.debug(f"LongPollerClient received udpates: {updates}")
for key, updated_object in updates.items():
logger.debug("LongPollClient received updates for keys: "
f"{list(updates.keys())}.")
for key, update in updates.items():
self.object_snapshots[key] = update.object_snapshot
self.snapshot_ids[key] = update.snapshot_id
# NOTE(simon):
# This blocks the loop from doing another poll. Consider
# use loop.create_task here or poll first then call the
# callbacks.
callback = self.key_listeners[key]
await callback(updated_object.object_snapshot)
await callback(update.object_snapshot)
except ray.exceptions.RayActorError:
# This can happen during shutdown where the controller is
# intentionally killed, the client should just gracefully
# exit.
logger.debug("LongPollerClient failed to connect to host. "
logger.debug("LongPollClient failed to connect to host. "
"Shutting down.")
break
class LongPollerHost:
class LongPollHost:
"""The server side object that manages long polling requests.
The desired use case is to embed this in an Ray actor. Client will be
@@ -115,11 +117,10 @@ class LongPollerHost:
immediately if the snapshot_ids are outdated, otherwise it will block
until there's one updates.
"""
# 1. Figure out which keys do we care about
watched_keys = set(self.snapshot_ids.keys()).intersection(
keys_to_snapshot_ids.keys())
if len(watched_keys) == 0:
raise ValueError("Keys not found.")
watched_keys = keys_to_snapshot_ids.keys()
nonexistent_keys = set(watched_keys) - set(self.snapshot_ids.keys())
if len(nonexistent_keys) > 0:
raise ValueError(f"Keys not found: {nonexistent_keys}.")
# 2. If there are any outdated keys (by comparing snapshot ids)
# return immediately.
@@ -159,7 +160,7 @@ class LongPollerHost:
def notify_changed(self, object_key: str, updated_object: Any):
self.snapshot_ids[object_key] += 1
self.object_snapshots[object_key] = updated_object
logger.debug(f"LongPollerHost: {object_key} = {updated_object}")
logger.debug(f"LongPollHost: Notify change for key {object_key}.")
if object_key in self.notifier_events:
for event in self.notifier_events.pop(object_key):
+15 -12
View File
@@ -6,9 +6,10 @@ from typing import Any, DefaultDict, Dict, Iterable, List, Optional
import ray
from ray.actor import ActorHandle
from ray.serve.constants import LongPollKey
from ray.serve.context import TaskContext
from ray.serve.endpoint_policy import EndpointPolicy, RandomEndpointPolicy
from ray.serve.long_poll import LongPollerAsyncClient
from ray.serve.long_poll import LongPollAsyncClient
from ray.serve.utils import logger
from ray.util import metrics
@@ -106,7 +107,8 @@ class ReplicaSet:
) >= self.max_concurrent_queries:
# This replica is overloaded, try next one
continue
logger.debug(f"Replica set assigned {query} to {replica}")
logger.debug(f"Assigned query {query.metadata.request_id} "
f"to replica {replica}.")
ref = replica.handle_request.remote(query)
self.in_flight_queries[replica].add(ref)
return ref
@@ -133,7 +135,8 @@ class ReplicaSet:
"""
assigned_ref = self._try_assign_replica(query)
while assigned_ref is None: # Can't assign a replica right now.
logger.debug(f"Failed to assign a replica for query {query}")
logger.debug("Failed to assign a replica for "
f"query {query.metadata.request_id}")
# Maybe there exists a free replica, we just need to refresh our
# query tracker.
num_finished = self._drain_completed_object_refs()
@@ -141,7 +144,7 @@ class ReplicaSet:
# config to be updated.
if num_finished == 0:
logger.debug(
f"All replicas are busy, waiting for a free replica.")
"All replicas are busy, waiting for a free replica.")
await asyncio.wait(
self._all_query_refs + [self.config_updated_event.wait()],
return_when=asyncio.FIRST_COMPLETED)
@@ -176,14 +179,14 @@ class Router:
async def setup_in_async_loop(self):
# NOTE(simon): Instead of performing initialization in __init__,
# We separated the init of LongPollerAsyncClient to this method because
# __init__ might be called in sync context. LongPollerAsyncClient
# We separated the init of LongPollAsyncClient to this method because
# __init__ might be called in sync context. LongPollAsyncClient
# requires async context.
self.long_pull_client = LongPollerAsyncClient(
self.long_poll_client = LongPollAsyncClient(
self.controller, {
"traffic_policies": self._update_traffic_policies,
"worker_handles": self._update_worker_handles,
"backend_configs": self._update_backend_configs,
LongPollKey.TRAFFIC_POLICIES: self._update_traffic_policies,
LongPollKey.REPLICA_HANDLES: self._update_replica_handles,
LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
})
async def _update_traffic_policies(self, traffic_policies):
@@ -194,8 +197,8 @@ class Router:
event = self._pending_endpoints.pop(endpoint)
event.set()
async def _update_worker_handles(self, worker_handles):
for backend_tag, replica_handles in worker_handles.items():
async def _update_replica_handles(self, replica_handles):
for backend_tag, replica_handles in replica_handles.items():
self.backend_replicas[backend_tag].update_worker_replicas(
replica_handles)
+45
View File
@@ -0,0 +1,45 @@
#!/usr/bin/env python
import click
import ray
from ray import serve
from ray.serve.constants import DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT
@click.group(
    help="[EXPERIMENTAL] CLI for managing Serve instances on a Ray cluster.")
@click.option(
    "--address",
    "-a",
    type=str,
    required=False,
    default="auto",
    help="Address of the running Ray cluster to connect to. "
    "Defaults to \"auto\".")
def cli(address):
    """Connect this process to the Ray cluster at ``address``.

    The user-facing help text comes from the explicit ``help=`` argument
    above, not from this docstring.
    """
    ray.init(address=address)
@cli.command(help="Start a detached Serve instance on the Ray cluster.")
@click.option(
    "--http-host",
    type=str,
    required=False,
    default=DEFAULT_HTTP_HOST,
    help="Host for HTTP servers to listen on. "
    f"Defaults to {DEFAULT_HTTP_HOST}.")
@click.option(
    "--http-port",
    type=int,
    required=False,
    default=DEFAULT_HTTP_PORT,
    help="Port for HTTP servers to listen on. "
    f"Defaults to {DEFAULT_HTTP_PORT}.")
def start(http_host, http_port):
    """Start Serve in detached mode with the given HTTP host and port."""
    http_options = {"http_host": http_host, "http_port": http_port}
    serve.start(detached=True, **http_options)
@cli.command(help="Shutdown the running Serve instance on the Ray cluster.")
def shutdown():
    """Connect to the running Serve instance and shut it down."""
    client = serve.connect()
    client.shutdown()
+12 -9
View File
@@ -7,6 +7,7 @@ import pytest
import ray
from ray import serve
from ray.serve.config import BackendConfig
from ray.serve.constants import LongPollKey
# Opt-in fault injection for the test suite: when the environment variable is
# set to "1", the Serve controller is configured to crash after a checkpoint
# with probability 0.5.
# Environment variable values are always strings, so the comparison must be
# against "1"; comparing with the integer 1 could never be true, which left
# this switch permanently off.
if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH") == "1":
    serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5
@@ -42,22 +43,22 @@ def mock_controller_with_name():
@ray.remote(num_cpus=0)
class MockControllerActor:
def __init__(self):
from ray.serve.long_poll import LongPollerHost
self.host = LongPollerHost()
from ray.serve.long_poll import LongPollHost
self.host = LongPollHost()
self.backend_replicas = defaultdict(list)
self.backend_configs = dict()
self.clear()
def clear(self):
self.host.notify_changed("worker_handles", {})
self.host.notify_changed("traffic_policies", {})
self.host.notify_changed("backend_configs", {})
self.host.notify_changed(LongPollKey.REPLICA_HANDLES, {})
self.host.notify_changed(LongPollKey.TRAFFIC_POLICIES, {})
self.host.notify_changed(LongPollKey.BACKEND_CONFIGS, {})
async def listen_for_change(self, snapshot_ids):
return await self.host.listen_for_change(snapshot_ids)
def set_traffic(self, endpoint, traffic_policy):
self.host.notify_changed("traffic_policies",
self.host.notify_changed(LongPollKey.TRAFFIC_POLICIES,
{endpoint: traffic_policy})
def add_new_replica(self,
@@ -68,15 +69,17 @@ def mock_controller_with_name():
self.backend_configs[backend_tag] = backend_config
self.host.notify_changed(
"worker_handles",
LongPollKey.REPLICA_HANDLES,
self.backend_replicas,
)
self.host.notify_changed("backend_configs", self.backend_configs)
self.host.notify_changed(LongPollKey.BACKEND_CONFIGS,
self.backend_configs)
def update_backend(self, backend_tag: str,
backend_config: BackendConfig):
self.backend_configs[backend_tag] = backend_config
self.host.notify_changed("backend_configs", self.backend_configs)
self.host.notify_changed(LongPollKey.BACKEND_CONFIGS,
self.backend_configs)
name = f"MockController{random.randint(0,10e4)}"
yield name, MockControllerActor.options(name=name).remote()
+74 -45
View File
@@ -4,6 +4,7 @@ import time
import os
import pytest
import requests
import starlette.responses
import ray
from ray import serve
@@ -25,22 +26,6 @@ def test_e2e(serve_instance):
client.create_endpoint(
"endpoint", backend="echo:v1", route="/api", methods=["GET", "POST"])
retry_count = 5
timeout_sleep = 0.5
while True:
try:
resp = requests.get(
"http://127.0.0.1:8000/-/routes", timeout=0.5).json()
assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
break
except Exception as e:
time.sleep(timeout_sleep)
timeout_sleep *= 2
retry_count -= 1
if retry_count == 0:
assert False, ("Route table hasn't been updated after 3 tries."
"The latest error was {}").format(e)
resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
assert resp == "GET"
@@ -48,6 +33,63 @@ def test_e2e(serve_instance):
assert resp == "POST"
def test_starlette_response(serve_instance):
    """Check that backends returning Starlette responses serve the right body.

    Exercises Response, HTMLResponse, PlainTextResponse, JSONResponse and
    RedirectResponse, each deployed behind its own route.
    """
    client = serve_instance

    # Plain Response with an explicit media type.
    def basic_response(_):
        return starlette.responses.Response(
            "Hello, world!", media_type="text/plain")

    client.create_backend("basic_response", basic_response)
    client.create_endpoint(
        "basic_response", backend="basic_response", route="/basic_response")
    basic_reply = requests.get("http://127.0.0.1:8000/basic_response")
    assert basic_reply.text == "Hello, world!"

    # HTMLResponse.
    def html_response(_):
        return starlette.responses.HTMLResponse(
            "<html><body><h1>Hello, world!</h1></body></html>")

    client.create_backend("html_response", html_response)
    client.create_endpoint(
        "html_response", backend="html_response", route="/html_response")
    html_reply = requests.get("http://127.0.0.1:8000/html_response")
    assert html_reply.text == (
        "<html><body><h1>Hello, world!</h1></body></html>")

    # PlainTextResponse.
    def plain_text_response(_):
        return starlette.responses.PlainTextResponse("Hello, world!")

    client.create_backend("plain_text_response", plain_text_response)
    client.create_endpoint(
        "plain_text_response",
        backend="plain_text_response",
        route="/plain_text_response")
    plain_reply = requests.get("http://127.0.0.1:8000/plain_text_response")
    assert plain_reply.text == "Hello, world!"

    # JSONResponse.
    def json_response(_):
        return starlette.responses.JSONResponse({"hello": "world"})

    client.create_backend("json_response", json_response)
    client.create_endpoint(
        "json_response", backend="json_response", route="/json_response")
    json_reply = requests.get("http://127.0.0.1:8000/json_response")
    assert json_reply.json()["hello"] == "world"

    # RedirectResponse pointing at the first endpoint; the expected body is
    # the one served by /basic_response.
    def redirect_response(_):
        return starlette.responses.RedirectResponse(
            url="http://127.0.0.1:8000/basic_response")

    client.create_backend("redirect_response", redirect_response)
    client.create_endpoint(
        "redirect_response",
        backend="redirect_response",
        route="/redirect_response")
    redirect_reply = requests.get("http://127.0.0.1:8000/redirect_response")
    assert redirect_reply.text == "Hello, world!"
def test_backend_user_config(serve_instance):
client = serve_instance
@@ -63,25 +105,26 @@ def test_backend_user_config(serve_instance):
config = BackendConfig(num_replicas=2, user_config={"count": 123, "b": 2})
client.create_backend("counter", Counter, config=config)
client.create_endpoint("counter", backend="counter", route="/counter")
client.create_endpoint("counter", backend="counter")
handle = client.get_handle("counter")
def check(val, num_replicas):
pids_seen = set()
for i in range(100):
result = ray.get(handle.remote())
assert (str(result[0]) == val), result[0]
if str(result[0]) != val:
return False
pids_seen.add(result[1])
assert (len(pids_seen) == num_replicas)
return len(pids_seen) == num_replicas
check("123", 2)
wait_for_condition(lambda: check("123", 2))
client.update_backend_config("counter", BackendConfig(num_replicas=3))
check("123", 3)
wait_for_condition(lambda: check("123", 3))
config = BackendConfig(user_config={"count": 456})
client.update_backend_config("counter", config)
check("456", 3)
wait_for_condition(lambda: check("456", 3))
def test_call_method(serve_instance):
@@ -183,7 +226,7 @@ def test_reject_duplicate_endpoint_and_route(serve_instance):
def test_no_http(serve_instance):
client = serve.start(http_host=None)
assert len(ray.get(client._controller.get_routers.remote())) == 0
assert len(ray.get(client._controller.get_http_proxies.remote())) == 0
def hello(*args):
return "hello"
@@ -223,11 +266,6 @@ def test_scaling_replicas(serve_instance):
client.create_endpoint("counter", backend="counter:v1", route="/increment")
# Keep checking the routing table until /increment is populated
while "/increment" not in requests.get(
"http://127.0.0.1:8000/-/routes").json():
time.sleep(0.2)
counter_result = []
for _ in range(10):
resp = requests.get("http://127.0.0.1:8000/increment").json()
@@ -267,11 +305,6 @@ def test_batching(serve_instance):
client.create_endpoint(
"counter1", backend="counter:v11", route="/increment2")
# Keep checking the routing table until /increment is populated
while "/increment2" not in requests.get(
"http://127.0.0.1:8000/-/routes").json():
time.sleep(0.2)
future_list = []
handle = client.get_handle("counter1")
for _ in range(20):
@@ -299,8 +332,7 @@ def test_batching_exception(serve_instance):
# Set the max batch size.
config = BackendConfig(max_batch_size=5)
client.create_backend("exception:v1", NoListReturned, config=config)
client.create_endpoint(
"exception-test", backend="exception:v1", route="/noListReturned")
client.create_endpoint("exception-test", backend="exception:v1")
handle = client.get_handle("exception-test")
with pytest.raises(ray.exceptions.RayTaskError):
@@ -323,16 +355,16 @@ def test_updating_config(serve_instance):
client.create_endpoint("bsimple", backend="bsimple:v1", route="/bsimple")
controller = client._controller
old_replica_tag_list = ray.get(
controller._list_replicas.remote("bsimple:v1"))
old_replica_tag_list = list(
ray.get(controller._all_replica_handles.remote())["bsimple:v1"].keys())
update_config = BackendConfig(max_batch_size=5)
client.update_backend_config("bsimple:v1", update_config)
new_replica_tag_list = ray.get(
controller._list_replicas.remote("bsimple:v1"))
new_replica_tag_list = list(
ray.get(controller._all_replica_handles.remote())["bsimple:v1"].keys())
new_all_tag_list = []
for worker_dict in ray.get(
controller.get_all_replica_handles.remote()).values():
controller._all_replica_handles.remote()).values():
new_all_tag_list.extend(list(worker_dict.keys()))
# the old and new replica tag list should be identical
@@ -648,7 +680,7 @@ def test_create_infeasible_error(serve_instance):
"MagicMLResource": 100
}})
# Even each replica might be feasible, the total might not be.
# Even though each replica might be feasible, the total might not be.
current_cpus = int(ray.nodes()[0]["Resources"]["CPU"])
num_replicas = current_cpus + 20
config = BackendConfig(num_replicas=num_replicas)
@@ -661,10 +693,6 @@ def test_create_infeasible_error(serve_instance):
}},
config=config)
# No replica should be created!
replicas = ray.get(client._controller._list_replicas.remote("f1"))
assert len(replicas) == 0
def test_shutdown():
def f():
@@ -797,6 +825,7 @@ def test_serve_metrics(serve_instance):
client.create_backend("metrics", batcher)
client.create_endpoint("metrics", backend="metrics", route="/metrics")
# send 10 concurrent requests
url = "http://127.0.0.1:8000/metrics"
ray.get([block_until_http_ready.remote(url) for _ in range(10)])
@@ -48,7 +48,7 @@ def setup_worker(name,
async def add_servable_to_router(servable, router, controller_name, **kwargs):
worker = setup_worker(
"backend", servable, controller_name=controller_name, **kwargs)
await router._update_worker_handles.remote({"backend": [worker]})
await router._update_replica_handles.remote({"backend": [worker]})
await router._update_traffic_policies.remote({
"endpoint": TrafficPolicy({
"backend": 1.0
+23
View File
@@ -0,0 +1,23 @@
import pytest
import ray
def test_controller_inflight_requests_clear(serve_instance):
    """Creating a backend and an endpoint must leave no in-flight results.

    Compares the controller's in-flight result count before and after the
    two create calls; the difference has to be zero.
    """
    client = serve_instance
    controller = client._controller

    baseline = ray.get(controller._num_inflight_results.remote())

    def function(_):
        return "hello"

    client.create_backend("tst", function)
    client.create_endpoint("end_pt", backend="tst")

    current = ray.get(controller._num_inflight_results.remote())
    assert current - baseline == 0
if __name__ == "__main__":
    # Allow running this file directly: hand it to pytest (verbose, with
    # output capturing disabled) and exit with pytest's return code.
    import sys
    sys.exit(pytest.main(["-v", "-s", __file__]))
+13 -10
View File
@@ -4,6 +4,7 @@ import tempfile
import time
import ray
from ray.test_utils import wait_for_condition
from ray import serve
from ray.serve.config import BackendConfig, ReplicaConfig
@@ -53,9 +54,11 @@ def test_controller_failure(serve_instance):
client.create_backend("controller_failure:v2", function)
client.set_traffic("controller_failure", {"controller_failure:v2": 1.0})
for _ in range(10):
def check_controller_failure():
response = request_with_retries("/controller_failure", timeout=30)
assert response.text == "hello2"
return response.text == "hello2"
wait_for_condition(check_controller_failure)
def function(_):
return "hello3"
@@ -76,10 +79,10 @@ def test_controller_failure(serve_instance):
assert response.text == "hello3"
def _kill_routers(client):
    """Kill every router returned by the controller, with restarts allowed."""
    router_map = ray.get(client._controller.get_routers.remote())
    for handle in router_map.values():
        # no_restart=False: the kill itself does not forbid a restart.
        ray.kill(handle, no_restart=False)
def _kill_http_proxies(client):
    """Kill every HTTP proxy returned by the controller, restarts allowed."""
    proxy_map = ray.get(client._controller.get_http_proxies.remote())
    for handle in proxy_map.values():
        # no_restart=False: the kill itself does not forbid a restart.
        ray.kill(handle, no_restart=False)
def test_http_proxy_failure(serve_instance):
@@ -98,7 +101,7 @@ def test_http_proxy_failure(serve_instance):
response = request_with_retries("/proxy_failure", timeout=30)
assert response.text == "hello1"
_kill_routers(client)
_kill_http_proxies(client)
def function(_):
return "hello2"
@@ -113,7 +116,7 @@ def test_http_proxy_failure(serve_instance):
def _get_worker_handles(client, backend):
controller = client._controller
backend_dict = ray.get(controller.get_all_replica_handles.remote())
backend_dict = ray.get(controller._all_replica_handles.remote())
return list(backend_dict[backend].values())
@@ -124,7 +127,7 @@ def test_worker_restart(serve_instance):
client = serve_instance
class Worker1:
def __call__(self):
def __call__(self, *args):
return os.getpid()
client.create_backend("worker_failure:v1", Worker1)
@@ -176,7 +179,7 @@ def test_worker_replica_failure(serve_instance):
while True:
pass
def __call__(self):
def __call__(self, *args):
pass
temp_path = os.path.join(tempfile.gettempdir(),
+22 -14
View File
@@ -1,5 +1,4 @@
import sys
import functools
import time
import asyncio
import os
@@ -8,12 +7,12 @@ from typing import Dict
import pytest
import ray
from ray.serve.long_poll import (LongPollerAsyncClient, LongPollerHost,
from ray.serve.long_poll import (LongPollAsyncClient, LongPollHost,
UpdatedObject)
def test_host_standalone(serve_instance):
host = ray.remote(LongPollerHost).remote()
host = ray.remote(LongPollHost).remote()
# Write two values
ray.get(host.notify_changed.remote("key_1", 999))
@@ -44,10 +43,10 @@ def test_long_poll_restarts(serve_instance):
max_restarts=-1,
max_task_retries=-1,
)
class RestartableLongPollerHost:
class RestartableLongPollHost:
def __init__(self) -> None:
print("actor started")
self.host = LongPollerHost()
self.host = LongPollHost()
self.host.notify_changed("timer", time.time())
self.should_exit = False
@@ -63,7 +62,7 @@ def test_long_poll_restarts(serve_instance):
print("actor exit")
os._exit(1)
host = RestartableLongPollerHost.remote()
host = RestartableLongPollHost.remote()
updated_values = ray.get(host.listen_for_change.remote({"timer": -1}))
timer: UpdatedObject = updated_values["timer"]
@@ -81,22 +80,31 @@ def test_long_poll_restarts(serve_instance):
@pytest.mark.asyncio
async def test_async_client(serve_instance):
host = ray.remote(LongPollerHost).remote()
host = ray.remote(LongPollHost).remote()
# Write two values
ray.get(host.notify_changed.remote("key_1", 100))
ray.get(host.notify_changed.remote("key_2", 999))
# Check that construction fails with a sync callback.
def callback(result, key):
pass
with pytest.raises(ValueError):
client = LongPollAsyncClient(host, {"key": callback})
callback_results = dict()
async def callback(result, key):
callback_results[key] = result
async def key_1_callback(result):
callback_results["key_1"] = result
client = LongPollerAsyncClient(
host, {
"key_1": functools.partial(callback, key="key_1"),
"key_2": functools.partial(callback, key="key_2")
})
async def key_2_callback(result):
callback_results["key_2"] = result
client = LongPollAsyncClient(host, {
"key_1": key_1_callback,
"key_2": key_2_callback,
})
while len(client.object_snapshots) == 0:
# Yield the loop for client to get the result
+5 -1
View File
@@ -144,6 +144,7 @@ class ServeEncoder(json.JSONEncoder):
@ray.remote(num_cpus=0)
def block_until_http_ready(http_endpoint,
backoff_time_s=1,
check_ready=None,
timeout=HTTP_PROXY_TIMEOUT):
http_is_ready = False
start_time = time.time()
@@ -152,7 +153,10 @@ def block_until_http_ready(http_endpoint,
try:
resp = requests.get(http_endpoint)
assert resp.status_code == 200
http_is_ready = True
if check_ready is None:
http_is_ready = True
else:
http_is_ready = check_ready(resp)
except Exception:
pass
+2
View File
@@ -95,6 +95,8 @@ py_test_module_list(
"test_dask_callback.py",
"test_debug_tools.py",
"test_experimental_client.py",
"test_experimental_client_metadata.py",
"test_experimental_client_terminate.py",
"test_job.py",
"test_memstat.py",
"test_metrics_agent.py",
+57
View File
@@ -8,6 +8,7 @@ try:
except ImportError:
pytest_timeout = None
import sys
import tempfile
import datetime
import ray
@@ -867,5 +868,61 @@ def test_actor_creation_latency(ray_start_regular_shared):
actor_create_time - start, end - start))
@pytest.mark.parametrize(
    "exit_condition",
    [
        # "out_of_scope", TODO(edoakes): enable this once fixed.
        "__ray_terminate__",
        "ray.actor.exit_actor",
        "ray.kill"
    ])
def test_atexit_handler(ray_start_regular_shared, exit_condition):
    """Check which actor exit paths run handlers registered with atexit.

    The actor registers an atexit handler that writes ``data`` to a temp
    file; the file's contents then show whether the handler ran.
    """

    @ray.remote
    class A():
        def __init__(self, tmpfile, data):
            import atexit

            def write_marker(*args, **kwargs):
                with open(tmpfile, "w") as out:
                    out.write(data)
                    out.flush()

            atexit.register(write_marker)

        def ready(self):
            pass

        def exit(self):
            ray.actor.exit_actor()

    data = "hello"
    tmpfile = tempfile.NamedTemporaryFile()
    a = A.remote(tmpfile.name, data)
    # Make sure the constructor (and so the atexit registration) has run.
    ray.get(a.ready.remote())

    if exit_condition == "out_of_scope":
        del a
    elif exit_condition == "__ray_terminate__":
        ray.wait([a.__ray_terminate__.remote()])
    elif exit_condition == "ray.actor.exit_actor":
        ray.wait([a.exit.remote()])
    elif exit_condition == "ray.kill":
        ray.kill(a)
    else:
        assert False, "Unrecognized condition"

    def check_file_written():
        with open(tmpfile.name) as written:
            return written.read() == data

    # ray.kill() should not trigger atexit handlers, all other methods should.
    if exit_condition != "ray.kill":
        ray.test_utils.wait_for_condition(check_file_written)
    else:
        assert not check_file_written()
if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
+15 -14
View File
@@ -1055,11 +1055,11 @@ def test_actor_resource_demand(shutdown_only):
ray.get(a.foo.remote())
time.sleep(1)
message = global_state_accessor.get_all_heartbeat()
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
message = global_state_accessor.get_all_resource_usage()
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
# The actor is scheduled so there should be no more demands left.
assert len(heartbeat.resource_load_by_shape.resource_demands) == 0
assert len(resource_usages.resource_load_by_shape.resource_demands) == 0
@ray.remote(num_cpus=80)
class Actor2:
@@ -1070,23 +1070,24 @@ def test_actor_resource_demand(shutdown_only):
time.sleep(1)
# This actor cannot be scheduled.
message = global_state_accessor.get_all_heartbeat()
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
assert (heartbeat.resource_load_by_shape.resource_demands[0].shape == {
"CPU": 80.0
})
assert (heartbeat.resource_load_by_shape.resource_demands[0]
message = global_state_accessor.get_all_resource_usage()
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
assert (
resource_usages.resource_load_by_shape.resource_demands[0].shape == {
"CPU": 80.0
})
assert (resource_usages.resource_load_by_shape.resource_demands[0]
.num_infeasible_requests_queued == 1)
actors.append(Actor2.remote())
time.sleep(1)
# Two actors cannot be scheduled.
message = global_state_accessor.get_all_heartbeat()
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
assert (heartbeat.resource_load_by_shape.resource_demands[0]
message = global_state_accessor.get_all_resource_usage()
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
assert (resource_usages.resource_load_by_shape.resource_demands[0]
.num_infeasible_requests_queued == 2)
global_state_accessor.disconnect()
+61
View File
@@ -1,3 +1,4 @@
import asyncio
import collections
import numpy as np
import os
@@ -211,6 +212,66 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
ray.get(actor.increase.remote())
def test_named_actor_max_task_retries(ray_init_with_task_retry_delay):
    """Check task retries for an actor that is reached through its name.

    A named actor is created with unlimited restarts and task retries. A
    second actor looks it up with ``ray.get_actor`` and calls into it. The
    named actor is then killed (with restart allowed) while that call is
    in flight, and the test verifies that both the constructor and the
    in-flight ``run`` call execute a second time and that the caller
    eventually gets its result.
    """

    # Counts events and lets a caller block until a given count is reached.
    @ray.remote(num_cpus=0)
    class Counter:
        def __init__(self):
            self.count = 0
            self.event = asyncio.Event()

        def increment(self):
            self.count += 1
            # Wake up any coroutine parked in wait_for_count().
            self.event.set()

        async def wait_for_count(self, count):
            while True:
                if self.count >= count:
                    return
                await self.event.wait()
                # Re-arm the event so the next loop iteration blocks again
                # until another increment() arrives.
                self.event.clear()

    # The actor under test: reports every constructor run and every start of
    # run() to the counters it is handed.
    @ray.remote
    class ActorToKill:
        def __init__(self, counter):
            counter.increment.remote()

        def run(self, counter, signal):
            counter.increment.remote()
            # Blocks until the signal actor is released.
            # NOTE(review): SignalActor is defined outside this block;
            # presumably wait() returns once send() has been called.
            ray.get(signal.wait.remote())

    # Reaches ActorToKill by name rather than through the original handle.
    @ray.remote
    class CallingActor:
        def __init__(self):
            self.actor = ray.get_actor("a")

        def call_other(self, counter, signal):
            return ray.get(self.actor.run.remote(counter, signal))

    init_counter = Counter.remote()
    run_counter = Counter.remote()
    signal = SignalActor.remote()

    # Start the two actors, wait for ActorToKill's constructor to run.
    a = ActorToKill.options(
        name="a", max_restarts=-1, max_task_retries=-1).remote(init_counter)
    c = CallingActor.remote()
    ray.get(init_counter.wait_for_count.remote(1), timeout=30)

    # Signal the CallingActor to call ActorToKill, wait for it to be running,
    # then kill ActorToKill.
    # Verify that this causes ActorToKill's constructor to run a second time
    # and the run method to begin a second time.
    ref = c.call_other.remote(run_counter, signal)
    ray.get(run_counter.wait_for_count.remote(1), timeout=30)
    ray.kill(a, no_restart=False)
    ray.get(init_counter.wait_for_count.remote(2), timeout=30)
    ray.get(run_counter.wait_for_count.remote(2), timeout=30)

    # Signal the run method to finish, verify that the CallingActor returns.
    signal.send.remote()
    ray.get(ref, timeout=30)
def test_actor_restart_on_node_failure(ray_start_cluster):
config = {
"num_heartbeats_timeout": 10,
+11 -3
View File
@@ -94,8 +94,13 @@ def test_local_scheduling_first(ray_start_cluster):
assert local()
@pytest.mark.skipif(new_scheduler_enabled(), reason="flakes more often")
def test_load_balancing_with_dependencies(ray_start_cluster):
@pytest.mark.parametrize("fast", [True, False])
def test_load_balancing_with_dependencies(ray_start_cluster, fast):
    # `fast` selects the per-task sleep used further down in this test:
    # short tasks when True, longer tasks when False.
    #
    # `new_scheduler_enabled` is a function and must be *called* here. A bare
    # function object is always truthy, which would skip the fast variant
    # unconditionally, even when the new scheduler is not in use.
    if fast and new_scheduler_enabled():
        # Load-balancing on new scheduler can be inefficient if (task
        # duration:heartbeat interval) is small enough.
        pytest.skip()
# This test ensures that tasks are being assigned to all raylets in a
# roughly equal manner even when the tasks have dependencies.
cluster = ray_start_cluster
@@ -106,7 +111,10 @@ def test_load_balancing_with_dependencies(ray_start_cluster):
@ray.remote
def f(x):
time.sleep(0.010)
if fast:
time.sleep(0.010)
else:
time.sleep(0.1)
return ray.worker.global_worker.node.unique_id
# This object will be local to one of the raylets. Make sure
+26
View File
@@ -198,6 +198,32 @@ async def test_asyncio_double_await(ray_start_regular_shared):
await waiting
@pytest.mark.asyncio
async def test_asyncio_exit_actor(ray_start_regular_shared):
    """Check that exit_actor() takes effect while another task is running.

    An async actor is kept busy with a never-ending coroutine, then asked to
    exit. A later call on the same handle must fail with RayActorError.
    """
    # https://github.com/ray-project/ray/issues/12649
    # The test should just hang without the fix.

    @ray.remote
    class Actor:
        async def exit(self):
            ray.actor.exit_actor()

        async def ping(self):
            return "pong"

        async def loop_forever(self):
            # Never returns, so the actor always has an unfinished task.
            while True:
                await asyncio.sleep(5)

    a = Actor.options(max_task_retries=0).remote()
    # Fire and forget: this task is still pending when exit() is requested.
    a.loop_forever.remote()
    # Make sure exit_actor exits immediately, not once all tasks completed.
    ray.get(a.exit.remote())

    # Once the actor has exited, further calls must raise instead of hanging.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ping.remote())
if __name__ == "__main__":
    import pytest

    # Run this file's tests verbosely and hand pytest's status to the shell.
    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
+94 -7
View File
@@ -537,6 +537,7 @@ class AutoscalingTest(unittest.TestCase):
self.provider = MockProvider()
self.provider.create_node({}, {TAG_RAY_NODE_KIND: "worker"}, 10)
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -558,6 +559,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(11)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -613,6 +615,70 @@ class AutoscalingTest(unittest.TestCase):
autoscaler.update()
self.waitForNodes(0)
def testLegacyYamlWithRequestResources(self):
    """Test that request_resources() adds workers when using legacy yamls.

    Makes sure that requested resources are added for legacy yamls when
    necessary. So if requested resources for instance fit on the headnode
    we don't add more nodes. But we add more nodes when they don't fit.
    """
    # Legacy-style cluster config that may scale from 0 up to 100 workers.
    config = SMALL_CLUSTER.copy()
    config["min_workers"] = 0
    config["max_workers"] = 100
    config["idle_timeout_minutes"] = 0
    config["upscaling_speed"] = 1
    config_path = self.write_config(config)

    # Register a single head node with the mock provider and look up its IP.
    self.provider = MockProvider()
    self.provider.create_node({}, {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
    }, 1)
    head_ip = self.provider.non_terminated_node_ips(
        tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, )[0]
    runner = MockProcessRunner()
    # Queue ten canned "[]" replies for calls matching "json .Config.Env".
    runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])

    # Report the head node to the load metrics with a single CPU.
    # NOTE(review): the two resource dicts are presumably (total, available)
    # and the trailing dict the load -- confirm against LoadMetrics.update.
    lm = LoadMetrics()
    lm.local_ip = head_ip
    lm.update(head_ip, {"CPU": 1}, {"CPU": 1}, {})
    autoscaler = StandardAutoscaler(
        config_path,
        lm,
        max_launch_batch=5,
        max_concurrent_launches=5,
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    autoscaler.update()
    # 1 head node.
    self.waitForNodes(1)
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    # still 1 head node because request_resources fits in the headnode.
    self.waitForNodes(1)
    # Ask for more than the head node can hold: 1 CPU plus nine 2-CPU bundles.
    autoscaler.request_resources([{"CPU": 1}] + [{"CPU": 2}] * 9)
    autoscaler.update()
    self.waitForNodes(2)  # Adds a single worker to get its resources.
    autoscaler.update()
    self.waitForNodes(2)  # Still 1 worker because its resources
    # aren't known.
    # Report 2 CPUs for "172.0.0.1".
    # NOTE(review): presumably the IP MockProvider gave the first worker --
    # confirm.
    lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {})
    autoscaler.update()
    self.waitForNodes(10)  # 9 workers and 1 head node, scaled immediately.
    # Report pending demand that matches the outstanding resource request.
    lm.update(
        "172.0.0.1", {"CPU": 2}, {"CPU": 2}, {},
        waiting_bundles=[{
            "CPU": 2
        }] * 9,
        infeasible_bundles=[{
            "CPU": 1
        }] * 1)
    autoscaler.update()
    # Make sure that if all the resources fit on the existing nodes we do
    # not add any more.
    self.waitForNodes(10)
def testAggressiveAutoscaling(self):
config = SMALL_CLUSTER.copy()
config["min_workers"] = 0
@@ -629,7 +695,7 @@ class AutoscalingTest(unittest.TestCase):
head_ip = self.provider.non_terminated_node_ips(
tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, )[0]
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(11)])
lm = LoadMetrics()
lm.local_ip = head_ip
@@ -782,6 +848,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -817,6 +884,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(config)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -896,6 +964,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -949,6 +1018,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -989,6 +1059,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(config)
self.provider = MockProvider()
runner = MockProcessRunner(fail_cmds=["setup_cmd"])
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -1000,14 +1071,18 @@ class AutoscalingTest(unittest.TestCase):
self.waitForNodes(2)
self.provider.finish_starting_nodes()
autoscaler.update()
self.waitForNodes(
2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
try:
self.waitForNodes(
2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
except AssertionError:
# The failed nodes might have been already terminated by autoscaler
assert len(self.provider.non_terminated_nodes({})) == 0
def testConfiguresOutdatedNodes(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -1038,6 +1113,7 @@ class AutoscalingTest(unittest.TestCase):
self.provider = MockProvider()
lm = LoadMetrics()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(5)])
autoscaler = StandardAutoscaler(
config_path,
lm,
@@ -1087,12 +1163,22 @@ class AutoscalingTest(unittest.TestCase):
autoscaler.update()
assert autoscaler.pending_launches.value == 0
assert len(self.provider.non_terminated_nodes({})) == 3
# This actually remained 4 instead of 3, because the other 2 nodes
# are not connected and hence we rely more on connected nodes for
# min_workers. When the "pending" nodes show up as connected,
# then we can terminate the ones connected before.
assert len(self.provider.non_terminated_nodes({})) == 4
lm.last_used_time_by_ip["172.0.0.2"] = 0
lm.last_used_time_by_ip["172.0.0.3"] = 0
autoscaler.update()
assert autoscaler.pending_launches.value == 0
assert len(self.provider.non_terminated_nodes({})) == 1
# 2 nodes and not 1 because 1 is needed for min_worker and the other 1
# is still not connected.
self.waitForNodes(2)
# when we connect it, we will see 1 node.
lm.last_used_time_by_ip["172.0.0.4"] = 0
autoscaler.update()
self.waitForNodes(1)
def testTargetUtilizationFraction(self):
config = SMALL_CLUSTER.copy()
@@ -1103,6 +1189,7 @@ class AutoscalingTest(unittest.TestCase):
self.provider = MockProvider()
lm = LoadMetrics()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(12)])
autoscaler = StandardAutoscaler(
config_path,
lm,
@@ -1161,7 +1248,7 @@ class AutoscalingTest(unittest.TestCase):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
runner.respond_to_call("json .Config.Env", ["[]" for i in range(3)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
+3 -2
View File
@@ -1,4 +1,5 @@
import pytest
import sys
import ray
import ray.cluster_utils
@@ -6,7 +7,7 @@ import ray.test_utils
def test_cross_language_raise_kwargs(shutdown_only):
ray.init(_load_code_from_local=True)
ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
with pytest.raises(Exception, match="kwargs"):
ray.java_function("a", "b").remote(x="arg1")
@@ -16,7 +17,7 @@ def test_cross_language_raise_kwargs(shutdown_only):
def test_cross_language_raise_exception(shutdown_only):
ray.init(_load_code_from_local=True)
ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
class PythonObject(object):
pass
+71 -7
View File
@@ -2,7 +2,7 @@ import pytest
from contextlib import contextmanager
import ray.experimental.client.server.server as ray_client_server
from ray.experimental.client import ray
from ray.experimental.client import ray, reset_api
from ray.experimental.client.common import ClientObjectRef
@@ -10,9 +10,12 @@ from ray.experimental.client.common import ClientObjectRef
def ray_start_client_server():
server = ray_client_server.serve("localhost:50051", test_mode=True)
ray.connect("localhost:50051")
yield ray
ray.disconnect()
server.stop(0)
try:
yield ray
finally:
ray.disconnect()
server.stop(0)
reset_api()
def test_real_ray_fallback(ray_start_regular_shared):
@@ -34,9 +37,6 @@ def test_real_ray_fallback(ray_start_regular_shared):
nodes = ray.get(get_nodes.remote())
assert len(nodes) == 1, nodes
with pytest.raises(NotImplementedError):
print(ray.nodes())
def test_nested_function(ray_start_regular_shared):
with ray_start_client_server() as ray:
@@ -170,6 +170,70 @@ def test_basic_actor(ray_start_regular_shared):
assert count == 2
def test_pass_handles(ray_start_regular_shared):
    """
    Check that client-side handles keep working after being shipped around.

    Remote functions and actor handles created through the client are passed
    as arguments (directly, or tucked inside a dict) to other remote
    functions and actors, which then invoke them from the server or raylet
    side. The caller should not notice any difference.
    """
    with ray_start_client_server() as ray:

        @ray.remote
        class ExecActor:
            def exec(self, f, x):
                result_ref = f.remote(x)
                return ray.get(result_ref)

            def exec_exec(self, actor, f, x):
                result_ref = actor.exec.remote(f, x)
                return ray.get(result_ref)

        @ray.remote
        def fact(x):
            product = 1
            for factor in range(x, 0, -1):
                product = product * factor
            return product

        @ray.remote
        def func_exec(f, x):
            result_ref = f.remote(x)
            return ray.get(result_ref)

        @ray.remote
        def func_actor_exec(actor, f, x):
            result_ref = actor.exec.remote(f, x)
            return ray.get(result_ref)

        @ray.remote
        def sneaky_func_exec(obj, x):
            hidden_func = obj["f"]
            return ray.get(hidden_func.remote(x))

        @ray.remote
        def sneaky_actor_exec(obj, x):
            hidden_actor = obj["actor"]
            return ray.get(hidden_actor.exec.remote(obj["f"], x))

        def local_fact(x):
            # Plain local reference implementation to compare against.
            expected = 1
            while x > 0:
                expected *= x
                x -= 1
            return expected

        # Direct call, then a remote function handed to another function.
        direct = ray.get(fact.remote(7))
        assert direct == local_fact(7)
        via_function = ray.get(func_exec.remote(fact, 8))
        assert via_function == local_fact(8)

        # The handle travels nested inside a container.
        func_holder = {"f": fact}
        via_dict = ray.get(sneaky_func_exec.remote(func_holder, 5))
        assert via_dict == local_fact(5)

        # A remote function handed to an actor, and an actor handle handed
        # to a remote function.
        actor_handle = ExecActor.remote()
        via_actor = ray.get(actor_handle.exec.remote(fact, 7))
        assert via_actor == local_fact(7)
        via_function_and_actor = ray.get(
            func_actor_exec.remote(actor_handle, fact, 10))
        assert via_function_and_actor == local_fact(10)

        # An actor handle handed to another actor.
        second_actor = ExecActor.remote()
        via_two_actors = ray.get(
            actor_handle.exec_exec.remote(second_actor, fact, 9))
        assert via_two_actors == local_fact(9)

        # Both an actor handle and a function nested inside a container.
        handle_holder = {"actor": second_actor, "f": fact}
        via_nested_handles = ray.get(
            sneaky_actor_exec.remote(handle_holder, 4))
        assert via_nested_handles == local_fact(4)
if __name__ == "__main__":
    import sys

    # Run this file's tests verbosely and hand pytest's status to the shell.
    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
@@ -0,0 +1,25 @@
from ray.tests.test_experimental_client import ray_start_client_server
def test_get_ray_metadata(ray_start_regular_shared):
    """
    Exercise the ClusterInfo client data pathway and its API surface:
    initialization state, node listing, and cluster/available resources.
    """
    with ray_start_client_server() as ray:
        node_ip = ray_start_regular_shared["node_ip_address"]

        assert ray.is_initialized()

        # Exactly one node, and it is the one the fixture started.
        node_list = ray.nodes()
        assert len(node_list) == 1, node_list
        assert node_list[0]["NodeManagerAddress"] == node_ip

        # The per-node resource key must appear in both resource views.
        node_resource_key = "node:" + node_ip
        total_resources = ray.cluster_resources()
        free_resources = ray.available_resources()

        assert total_resources["CPU"] == 1.0
        assert node_resource_key in total_resources
        assert node_resource_key in free_resources

Some files were not shown because too many files have changed in this diff Show More