mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 02:30:34 +08:00
Merge branch 'master' into py39
This commit is contained in:
@@ -101,7 +101,7 @@ from ray import util # noqa: E402
|
||||
|
||||
# Replaced with the current commit when building the wheels.
|
||||
__commit__ = "{{RAY_COMMIT_SHA}}"
|
||||
__version__ = "1.1.0.dev0"
|
||||
__version__ = "1.2.0.dev0"
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
|
||||
@@ -136,7 +136,7 @@ def find_redis_address(address=None):
|
||||
# --redis_address=123.456.78.910 --node_ip_address=123.456.78.910
|
||||
# --raylet_socket_name=... --store_socket_name=... --object_manager_port=0
|
||||
# --min_worker_port=10000 --max_worker_port=10999
|
||||
# --node_manager_port=58578 --redis_port=6379 --num_initial_workers=8
|
||||
# --node_manager_port=58578 --redis_port=6379
|
||||
# --maximum_startup_concurrency=8
|
||||
# --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
|
||||
# --config_list=plasma_store_as_thread,True
|
||||
@@ -279,7 +279,8 @@ def get_address_info_from_redis_helper(redis_address,
|
||||
def get_address_info_from_redis(redis_address,
|
||||
node_ip_address,
|
||||
num_retries=5,
|
||||
redis_password=None):
|
||||
redis_password=None,
|
||||
no_warning=False):
|
||||
counter = 0
|
||||
while True:
|
||||
try:
|
||||
@@ -290,10 +291,11 @@ def get_address_info_from_redis(redis_address,
|
||||
raise
|
||||
# Some of the information may not be in Redis yet, so wait a little
|
||||
# bit.
|
||||
logger.warning(
|
||||
"Some processes that the driver needs to connect to have "
|
||||
"not registered with Redis, so retrying. Have you run "
|
||||
"'ray start' on this node?")
|
||||
if not no_warning:
|
||||
logger.warning(
|
||||
"Some processes that the driver needs to connect to have "
|
||||
"not registered with Redis, so retrying. Have you run "
|
||||
"'ray start' on this node?")
|
||||
time.sleep(1)
|
||||
counter += 1
|
||||
|
||||
@@ -1251,13 +1253,11 @@ def start_raylet(redis_address,
|
||||
stderr_file=None,
|
||||
config=None,
|
||||
java_worker_options=None,
|
||||
load_code_from_local=False,
|
||||
huge_pages=False,
|
||||
fate_share=None,
|
||||
socket_to_use=None,
|
||||
head_node=False,
|
||||
start_initial_python_workers_for_first_job=False,
|
||||
code_search_path=None):
|
||||
start_initial_python_workers_for_first_job=False):
|
||||
"""Start a raylet, which is a combined local scheduler and object manager.
|
||||
|
||||
Args:
|
||||
@@ -1294,9 +1294,6 @@ def start_raylet(redis_address,
|
||||
config (dict|None): Optional Raylet configuration that will
|
||||
override defaults in RayConfig.
|
||||
java_worker_options (list): The command options for Java worker.
|
||||
code_search_path (list): Code search path for worker. code_search_path
|
||||
is added to worker command in non-multi-tenancy mode and job_config
|
||||
in multi-tenancy mode.
|
||||
Returns:
|
||||
ProcessInfo for the process that was started.
|
||||
"""
|
||||
@@ -1309,7 +1306,6 @@ def start_raylet(redis_address,
|
||||
raise ValueError("Cannot use valgrind and profiler at the same time.")
|
||||
|
||||
assert resource_spec.resolved()
|
||||
num_initial_workers = resource_spec.num_cpus
|
||||
static_resources = resource_spec.to_resource_dict()
|
||||
|
||||
# Limit the number of workers that can be started in parallel by the
|
||||
@@ -1346,7 +1342,6 @@ def start_raylet(redis_address,
|
||||
raylet_name,
|
||||
redis_password,
|
||||
session_dir,
|
||||
code_search_path,
|
||||
)
|
||||
else:
|
||||
java_worker_command = []
|
||||
@@ -1366,15 +1361,18 @@ def start_raylet(redis_address,
|
||||
|
||||
# Create the command that the Raylet will use to start workers.
|
||||
start_worker_command = [
|
||||
sys.executable, worker_path, f"--node-ip-address={node_ip_address}",
|
||||
sys.executable,
|
||||
worker_path,
|
||||
f"--node-ip-address={node_ip_address}",
|
||||
f"--node-manager-port={node_manager_port}",
|
||||
f"--object-store-name={plasma_store_name}",
|
||||
f"--raylet-name={raylet_name}", f"--redis-address={redis_address}",
|
||||
f"--config-list={config_str}", f"--temp-dir={temp_dir}",
|
||||
f"--metrics-agent-port={metrics_agent_port}"
|
||||
f"--raylet-name={raylet_name}",
|
||||
f"--redis-address={redis_address}",
|
||||
f"--config-list={config_str}",
|
||||
f"--temp-dir={temp_dir}",
|
||||
f"--metrics-agent-port={metrics_agent_port}",
|
||||
"RAY_WORKER_DYNAMIC_OPTION_PLACEHOLDER",
|
||||
]
|
||||
if code_search_path:
|
||||
start_worker_command.append(f"--code-search-path={code_search_path}")
|
||||
if redis_password:
|
||||
start_worker_command += [f"--redis-password={redis_password}"]
|
||||
|
||||
@@ -1389,12 +1387,6 @@ def start_raylet(redis_address,
|
||||
if max_worker_port is None:
|
||||
max_worker_port = 0
|
||||
|
||||
if code_search_path is not None and len(code_search_path) > 0:
|
||||
load_code_from_local = True
|
||||
|
||||
if load_code_from_local:
|
||||
start_worker_command += ["--load-code-from-local"]
|
||||
|
||||
# Create agent command
|
||||
agent_command = [
|
||||
sys.executable,
|
||||
@@ -1425,7 +1417,6 @@ def start_raylet(redis_address,
|
||||
f"--node_ip_address={node_ip_address}",
|
||||
f"--redis_address={gcs_ip_address}",
|
||||
f"--redis_port={gcs_port}",
|
||||
f"--num_initial_workers={num_initial_workers}",
|
||||
f"--maximum_startup_concurrency={maximum_startup_concurrency}",
|
||||
f"--static_resource_list={resource_argument}",
|
||||
f"--config_list={config_str}",
|
||||
@@ -1485,8 +1476,7 @@ def get_ray_jars_dir():
|
||||
|
||||
def build_java_worker_command(java_worker_options, redis_address,
|
||||
node_manager_port, plasma_store_name,
|
||||
raylet_name, redis_password, session_dir,
|
||||
code_search_path):
|
||||
raylet_name, redis_password, session_dir):
|
||||
"""This method assembles the command used to start a Java worker.
|
||||
|
||||
Args:
|
||||
@@ -1497,7 +1487,6 @@ def build_java_worker_command(java_worker_options, redis_address,
|
||||
raylet_name (str): The name of the raylet socket to create.
|
||||
redis_password (str): The password of connect to redis.
|
||||
session_dir (str): The path of this session.
|
||||
code_search_path (list): The job code search path.
|
||||
Returns:
|
||||
The command string for starting Java worker.
|
||||
"""
|
||||
@@ -1518,7 +1507,6 @@ def build_java_worker_command(java_worker_options, redis_address,
|
||||
pairs.append(("ray.home", RAY_HOME))
|
||||
pairs.append(("ray.logging.dir", os.path.join(session_dir, "logs")))
|
||||
pairs.append(("ray.session-dir", session_dir))
|
||||
pairs.append(("ray.job.code-search-path", code_search_path))
|
||||
command = ["java"] + ["-D{}={}".format(*pair) for pair in pairs]
|
||||
|
||||
command += ["RAY_WORKER_RAYLET_CONFIG_PLACEHOLDER"]
|
||||
|
||||
+79
-18
@@ -336,6 +336,7 @@ cdef execute_task(
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &c_arg_reference_ids,
|
||||
const c_vector[CObjectID] &c_return_ids,
|
||||
const c_string debugger_breakpoint,
|
||||
c_vector[shared_ptr[CRayObject]] *returns):
|
||||
|
||||
worker = ray.worker.global_worker
|
||||
@@ -351,6 +352,18 @@ cdef execute_task(
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
|
||||
|
||||
# Helper method used to exit current asyncio actor.
|
||||
# This is called when a KeyboardInterrupt is received by the main thread.
|
||||
# Upon receiving a KeyboardInterrupt signal, Ray will exit the current
|
||||
# worker. If the worker is processing normal tasks, Ray treat it as task
|
||||
# cancellation from ray.cancel(object_ref). If the worker is an asyncio
|
||||
# actor, Ray will exit the actor.
|
||||
def exit_current_actor_if_asyncio():
|
||||
if core_worker.current_actor_is_asyncio():
|
||||
error = SystemExit(0)
|
||||
error.is_ray_terminate = True
|
||||
raise error
|
||||
|
||||
function_descriptor = CFunctionDescriptorToPython(
|
||||
ray_function.GetFunctionDescriptor())
|
||||
|
||||
@@ -457,9 +470,26 @@ cdef execute_task(
|
||||
task_exception = True
|
||||
try:
|
||||
with ray.worker._changeproctitle(title, next_title):
|
||||
if debugger_breakpoint != b"":
|
||||
ray.util.pdb.set_trace(
|
||||
breakpoint_uuid=debugger_breakpoint)
|
||||
outputs = function_executor(*args, **kwargs)
|
||||
next_breakpoint = (
|
||||
ray.worker.global_worker.debugger_breakpoint)
|
||||
if next_breakpoint != b"":
|
||||
# If this happens, the user typed "remote" and
|
||||
# there were no more remote calls left in this
|
||||
# task. In that case we just exit the debugger.
|
||||
ray.experimental.internal_kv._internal_kv_put(
|
||||
"RAY_PDB_{}".format(next_breakpoint),
|
||||
"{\"exit_debugger\": true}")
|
||||
ray.experimental.internal_kv._internal_kv_del(
|
||||
"RAY_PDB_CONTINUE_{}".format(next_breakpoint)
|
||||
)
|
||||
ray.worker.global_worker.debugger_breakpoint = b""
|
||||
task_exception = False
|
||||
except KeyboardInterrupt as e:
|
||||
exit_current_actor_if_asyncio()
|
||||
raise TaskCancelledError(
|
||||
core_worker.get_current_task_id())
|
||||
if c_return_ids.size() == 1:
|
||||
@@ -467,6 +497,7 @@ cdef execute_task(
|
||||
# Check for a cancellation that was called when the function
|
||||
# was exiting and was raised after the except block.
|
||||
if not check_signals().ok():
|
||||
exit_current_actor_if_asyncio()
|
||||
task_exception = True
|
||||
raise TaskCancelledError(
|
||||
core_worker.get_current_task_id())
|
||||
@@ -523,6 +554,7 @@ cdef CRayStatus task_execution_handler(
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &c_arg_reference_ids,
|
||||
const c_vector[CObjectID] &c_return_ids,
|
||||
const c_string debugger_breakpoint,
|
||||
c_vector[shared_ptr[CRayObject]] *returns) nogil:
|
||||
|
||||
with gil:
|
||||
@@ -532,7 +564,7 @@ cdef CRayStatus task_execution_handler(
|
||||
# it does, that indicates that there was an internal error.
|
||||
execute_task(task_type, task_name, ray_function, c_resources,
|
||||
c_args, c_arg_reference_ids, c_return_ids,
|
||||
returns)
|
||||
debugger_breakpoint, returns)
|
||||
except Exception:
|
||||
traceback_str = traceback.format_exc() + (
|
||||
"An unexpected internal error occurred while the worker "
|
||||
@@ -1041,6 +1073,7 @@ cdef class CoreWorker:
|
||||
PlacementGroupID placement_group_id,
|
||||
int64_t placement_group_bundle_index,
|
||||
c_bool placement_group_capture_child_tasks,
|
||||
c_string debugger_breakpoint,
|
||||
override_environment_variables):
|
||||
cdef:
|
||||
unordered_map[c_string, double] c_resources
|
||||
@@ -1059,15 +1092,18 @@ cdef class CoreWorker:
|
||||
language.lang, function_descriptor.descriptor)
|
||||
prepare_args(self, language, args, &args_vector)
|
||||
|
||||
with nogil:
|
||||
CCoreWorkerProcess.GetCoreWorker().SubmitTask(
|
||||
ray_function, args_vector, CTaskOptions(
|
||||
name, num_returns, c_resources,
|
||||
c_override_environment_variables),
|
||||
&return_ids, max_retries,
|
||||
c_pair[CPlacementGroupID, int64_t](
|
||||
c_placement_group_id, placement_group_bundle_index),
|
||||
placement_group_capture_child_tasks)
|
||||
# NOTE(edoakes): releasing the GIL while calling this method causes
|
||||
# segfaults. See relevant issue for details:
|
||||
# https://github.com/ray-project/ray/pull/12803
|
||||
CCoreWorkerProcess.GetCoreWorker().SubmitTask(
|
||||
ray_function, args_vector, CTaskOptions(
|
||||
name, num_returns, c_resources,
|
||||
c_override_environment_variables),
|
||||
&return_ids, max_retries,
|
||||
c_pair[CPlacementGroupID, int64_t](
|
||||
c_placement_group_id, placement_group_bundle_index),
|
||||
placement_group_capture_child_tasks,
|
||||
debugger_breakpoint)
|
||||
|
||||
return VectorToObjectRefs(return_ids)
|
||||
|
||||
@@ -1170,6 +1206,21 @@ cdef class CoreWorker:
|
||||
CCoreWorkerProcess.GetCoreWorker().
|
||||
RemovePlacementGroup(c_placement_group_id))
|
||||
|
||||
def wait_placement_group_ready(self,
                               PlacementGroupID placement_group_id,
                               int32_t timeout_seconds):
    """Wait on the core worker for a placement group to become ready.

    Args:
        placement_group_id: ID of the placement group to wait for.
        timeout_seconds: Timeout forwarded to the core worker's
            WaitPlacementGroupReady call (presumably the maximum time to
            block -- confirm against the C++ implementation).

    Returns:
        The value of ``status.ok()`` for the status returned by the
        core worker.

    Raises:
        Exception: If the returned status reports NotFound, i.e. the
            placement group does not exist.
    """
    cdef CRayStatus status
    # Convert the Python-level ID into its C++ counterpart via its binary
    # representation.
    cdef CPlacementGroupID cplacement_group_id = (
        CPlacementGroupID.FromBinary(placement_group_id.binary()))
    cdef int ctimeout_seconds = timeout_seconds
    # The GIL is released for the duration of the core worker call.
    with nogil:
        status = CCoreWorkerProcess.GetCoreWorker() \
            .WaitPlacementGroupReady(cplacement_group_id, ctimeout_seconds)
    if status.IsNotFound():
        raise Exception("Placement group {} does not exist.".format(
            placement_group_id))
    return status.ok()
|
||||
|
||||
def submit_actor_task(self,
|
||||
Language language,
|
||||
ActorID actor_id,
|
||||
@@ -1193,12 +1244,14 @@ cdef class CoreWorker:
|
||||
language.lang, function_descriptor.descriptor)
|
||||
prepare_args(self, language, args, &args_vector)
|
||||
|
||||
with nogil:
|
||||
CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
|
||||
c_actor_id,
|
||||
ray_function,
|
||||
args_vector, CTaskOptions(name, num_returns, c_resources),
|
||||
&return_ids)
|
||||
# NOTE(edoakes): releasing the GIL while calling this method causes
|
||||
# segfaults. See relevant issue for details:
|
||||
# https://github.com/ray-project/ray/pull/12803
|
||||
CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
|
||||
c_actor_id,
|
||||
ray_function,
|
||||
args_vector, CTaskOptions(name, num_returns, c_resources),
|
||||
&return_ids)
|
||||
|
||||
return VectorToObjectRefs(return_ids)
|
||||
|
||||
@@ -1400,8 +1453,16 @@ cdef class CoreWorker:
|
||||
context = worker.get_serialization_context()
|
||||
serialized_object = context.serialize(output)
|
||||
data_sizes.push_back(serialized_object.total_bytes)
|
||||
metadatas.push_back(
|
||||
string_to_buffer(serialized_object.metadata))
|
||||
metadata = serialized_object.metadata
|
||||
if ray.worker.global_worker.debugger_get_breakpoint:
|
||||
breakpoint = (
|
||||
ray.worker.global_worker.debugger_get_breakpoint)
|
||||
metadata += (
|
||||
b"," + ray_constants.OBJECT_METADATA_DEBUG_PREFIX +
|
||||
breakpoint.encode())
|
||||
# Reset debugging context of this worker.
|
||||
ray.worker.global_worker.debugger_get_breakpoint = b""
|
||||
metadatas.push_back(string_to_buffer(metadata))
|
||||
serialized_objects.append(serialized_object)
|
||||
contained_ids.push_back(
|
||||
ObjectRefsToVector(serialized_object.contained_object_refs)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import inspect
|
||||
import logging
|
||||
import weakref
|
||||
import _thread
|
||||
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray._raylet
|
||||
@@ -1006,6 +1007,7 @@ def exit_actor():
|
||||
"""Intentionally exit the current actor.
|
||||
|
||||
This function is used to disconnect an actor and exit the worker.
|
||||
Any ``atexit`` handlers installed in the actor will be run.
|
||||
|
||||
Raises:
|
||||
Exception: An exception is raised if this is a driver or this
|
||||
@@ -1018,6 +1020,14 @@ def exit_actor():
|
||||
ray.disconnect()
|
||||
# Disconnect global state from GCS.
|
||||
ray.state.state.disconnect()
|
||||
|
||||
# In asyncio actor mode, we can't raise SystemExit because it will just
|
||||
# quit the asyncio event loop thread, not the main thread. Instead, we
|
||||
# raise an interrupt signal to the main thread to tell it to exit.
|
||||
if worker.core_worker.current_actor_is_asyncio():
|
||||
_thread.interrupt_main()
|
||||
return
|
||||
|
||||
# Set a flag to indicate this is an intentional actor exit. This
|
||||
# reduces log verbosity.
|
||||
exit = SystemExit(0)
|
||||
|
||||
@@ -13,18 +13,19 @@ import collections
|
||||
|
||||
from ray.experimental.internal_kv import _internal_kv_put, \
|
||||
_internal_kv_initialized
|
||||
from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
|
||||
TAG_RAY_FILE_MOUNTS_CONTENTS,
|
||||
TAG_RAY_NODE_STATUS, TAG_RAY_NODE_KIND,
|
||||
TAG_RAY_USER_NODE_TYPE, STATUS_UP_TO_DATE,
|
||||
NODE_KIND_WORKER, NODE_KIND_UNMANAGED)
|
||||
from ray.autoscaler.tags import (
|
||||
TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
|
||||
TAG_RAY_FILE_MOUNTS_CONTENTS, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_KIND,
|
||||
TAG_RAY_USER_NODE_TYPE, STATUS_UP_TO_DATE, NODE_KIND_WORKER,
|
||||
NODE_KIND_UNMANAGED, NODE_KIND_HEAD)
|
||||
from ray.autoscaler._private.providers import _get_node_provider
|
||||
from ray.autoscaler._private.updater import NodeUpdaterThread
|
||||
from ray.autoscaler._private.node_launcher import NodeLauncher
|
||||
from ray.autoscaler._private.resource_demand_scheduler import \
|
||||
ResourceDemandScheduler, NodeType, NodeID
|
||||
get_bin_pack_residual, ResourceDemandScheduler, NodeType, NodeID, NodeIP, \
|
||||
ResourceDict
|
||||
from ray.autoscaler._private.util import ConcurrentCounter, validate_config, \
|
||||
with_head_node_ip, hash_launch_conf, hash_runtime_conf, \
|
||||
with_head_node_ip, hash_launch_conf, hash_runtime_conf, add_prefix, \
|
||||
DEBUG_AUTOSCALING_STATUS, DEBUG_AUTOSCALING_ERROR
|
||||
from ray.autoscaler._private.constants import \
|
||||
AUTOSCALER_MAX_NUM_FAILURES, AUTOSCALER_MAX_LAUNCH_BATCH, \
|
||||
@@ -47,7 +48,7 @@ class StandardAutoscaler:
|
||||
There are two ways to start an autoscaling cluster: manually by running
|
||||
`ray start --head --autoscaling-config=/path/to/config.yaml` on a
|
||||
instance that has permission to launch other instances, or you can also use
|
||||
`ray create_or_update /path/to/config.yaml` from your laptop, which will
|
||||
`ray up /path/to/config.yaml` from your laptop, which will
|
||||
configure the right AWS/Cloud roles automatically.
|
||||
|
||||
StandardAutoscaler's `update` method is periodically called by `monitor.py`
|
||||
@@ -66,8 +67,11 @@ class StandardAutoscaler:
|
||||
max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
|
||||
max_failures=AUTOSCALER_MAX_NUM_FAILURES,
|
||||
process_runner=subprocess,
|
||||
update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
|
||||
update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
|
||||
prefix_cluster_info=False):
|
||||
self.config_path = config_path
|
||||
# Prefix each line of info string with cluster name if True
|
||||
self.prefix_cluster_info = prefix_cluster_info
|
||||
# Keep this before self.reset (self.provider needs to be created
|
||||
# exactly once).
|
||||
self.provider = None
|
||||
@@ -164,27 +168,35 @@ class StandardAutoscaler:
|
||||
last_used = self.load_metrics.last_used_time_by_ip
|
||||
horizon = now - (60 * self.config["idle_timeout_minutes"])
|
||||
|
||||
nodes_to_terminate = []
|
||||
nodes_to_terminate: Dict[NodeID, bool] = []
|
||||
node_type_counts = collections.defaultdict(int)
|
||||
# Sort based on last used to make sure to keep min_workers that
|
||||
# were most recently used. Otherwise, _keep_min_workers_of_node_type
|
||||
# might keep a node that should be terminated.
|
||||
for node_id in self._sort_based_on_last_used(nodes, last_used):
|
||||
sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)
|
||||
# Don't terminate nodes needed by request_resources()
|
||||
nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
|
||||
if self.resource_demand_vector:
|
||||
nodes_allowed_to_terminate = self._get_nodes_allowed_to_terminate(
|
||||
sorted_node_ids)
|
||||
|
||||
for node_id in sorted_node_ids:
|
||||
# Make sure to not kill idle node types if the number of workers
|
||||
# of that type is lower/equal to the min_workers of that type.
|
||||
if self._keep_min_worker_of_node_type(
|
||||
node_id,
|
||||
node_type_counts) and self.launch_config_ok(node_id):
|
||||
# of that type is lower/equal to the min_workers of that type
|
||||
# or it is needed for request_resources().
|
||||
if (self._keep_min_worker_of_node_type(node_id, node_type_counts)
|
||||
or not nodes_allowed_to_terminate.get(
|
||||
node_id, True)) and self.launch_config_ok(node_id):
|
||||
continue
|
||||
|
||||
node_ip = self.provider.internal_ip(node_id)
|
||||
if node_ip in last_used and last_used[node_ip] < horizon:
|
||||
logger.info("StandardAutoscaler: "
|
||||
"{}: Terminating idle node".format(node_id))
|
||||
"{}: Terminating idle node.".format(node_id))
|
||||
nodes_to_terminate.append(node_id)
|
||||
elif not self.launch_config_ok(node_id):
|
||||
logger.info("StandardAutoscaler: "
|
||||
"{}: Terminating outdated node".format(node_id))
|
||||
"{}: Terminating outdated node.".format(node_id))
|
||||
nodes_to_terminate.append(node_id)
|
||||
|
||||
if nodes_to_terminate:
|
||||
@@ -198,7 +210,7 @@ class StandardAutoscaler:
|
||||
len(nodes_to_terminate)) > self.config["max_workers"] and nodes:
|
||||
to_terminate = nodes.pop()
|
||||
logger.info("StandardAutoscaler: "
|
||||
"{}: Terminating unneeded node".format(to_terminate))
|
||||
"{}: Terminating unneeded node.".format(to_terminate))
|
||||
nodes_to_terminate.append(to_terminate)
|
||||
|
||||
if nodes_to_terminate:
|
||||
@@ -226,15 +238,23 @@ class StandardAutoscaler:
|
||||
if not updater.is_alive():
|
||||
completed.append(node_id)
|
||||
if completed:
|
||||
nodes_to_terminate: List[NodeID] = []
|
||||
for node_id in completed:
|
||||
if self.updaters[node_id].exitcode == 0:
|
||||
self.num_successful_updates[node_id] += 1
|
||||
# Mark the node as active to prevent the node recovery
|
||||
# logic immediately trying to restart Ray on the new node.
|
||||
self.load_metrics.mark_active(
|
||||
self.provider.internal_ip(node_id))
|
||||
else:
|
||||
logger.error(f"StandardAutoscaler: {node_id}: Terminating "
|
||||
"failed to setup/initialize node.")
|
||||
nodes_to_terminate.append(node_id)
|
||||
self.num_failed_updates[node_id] += 1
|
||||
del self.updaters[node_id]
|
||||
# Mark the node as active to prevent the node recovery logic
|
||||
# immediately trying to restart Ray on the new node.
|
||||
self.load_metrics.mark_active(self.provider.internal_ip(node_id))
|
||||
if nodes_to_terminate:
|
||||
self.provider.terminate_nodes(nodes_to_terminate)
|
||||
|
||||
nodes = self.workers()
|
||||
self.log_info_string(nodes)
|
||||
|
||||
@@ -266,14 +286,16 @@ class StandardAutoscaler:
|
||||
last_used: Dict[str, float]) -> List[NodeID]:
|
||||
"""Sort the nodes based on the last time they were used.
|
||||
|
||||
The first item in the return list is the least recently used.
|
||||
The first item in the return list is the most recently used.
|
||||
"""
|
||||
updated_last_used = copy.deepcopy(last_used)
|
||||
now = time.time()
|
||||
# Add the unconnected nodes as the least recently used (the end of
|
||||
# list). This prioritizes connected nodes.
|
||||
least_recently_used = -1
|
||||
for node_id in nodes:
|
||||
node_ip = self.provider.internal_ip(node_id)
|
||||
if node_ip not in updated_last_used:
|
||||
updated_last_used[node_ip] = now
|
||||
updated_last_used[node_ip] = least_recently_used
|
||||
|
||||
def last_time_used(node_id: NodeID):
|
||||
node_ip = self.provider.internal_ip(node_id)
|
||||
@@ -281,9 +303,86 @@ class StandardAutoscaler:
|
||||
|
||||
return sorted(nodes, key=last_time_used, reverse=True)
|
||||
|
||||
def _keep_min_worker_of_node_type(self, node_id: NodeID,
|
||||
node_type_counts: Dict[NodeType, int]):
|
||||
"""Returns if workers of node_type should be terminated.
|
||||
def _get_nodes_allowed_to_terminate(
        self, sorted_node_ids: List[NodeID]) -> Dict[NodeID, bool]:
    """Returns the nodes allowed to terminate for request_resources().

    The head node's resources, followed by those of every worker that
    carries a user node type tag, are bin-packed against
    ``self.resource_demand_vector``. A worker whose resources come back
    from the bin packing untouched is not needed for the outstanding
    resource requests and may be terminated.

    Args:
        sorted_node_ids: the node ids sorted based on last used (LRU last).

    Returns:
        nodes_allowed_to_terminate: whether the node id is allowed to
            terminate or not. Only nodes tagged with a user node type
            appear in the mapping.
    """
    # TODO(ameer): try merging this with resource_demand_scheduler
    # code responsible for adding nodes for request_resources().
    nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
    head_node_resources: ResourceDict = copy.deepcopy(
        self.available_node_types[self.config["head_node_type"]][
            "resources"])
    if not head_node_resources:
        # Legacy yaml might include {} in the resources field.
        # TODO(ameer): this is somewhat duplicated in
        # resource_demand_scheduler.py.
        head_id: List[NodeID] = self.provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        })
        if head_id:
            head_ip = self.provider.internal_ip(head_id[0])
            static_nodes: Dict[
                NodeIP,
                ResourceDict] = \
                self.load_metrics.get_static_node_resources_by_ip()
            # The head may not have reported its static resources yet;
            # fall back to {} (as done for workers below) rather than
            # raising KeyError.
            head_node_resources = static_nodes.get(head_ip, {})
        else:
            head_node_resources = {}

    max_node_resources: List[ResourceDict] = [head_node_resources]
    resource_demand_vector_worker_node_ids = []
    # Get max resources on all the non terminated nodes.
    for node_id in sorted_node_ids:
        tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in tags:
            node_type = tags[TAG_RAY_USER_NODE_TYPE]
            node_resources: ResourceDict = copy.deepcopy(
                self.available_node_types[node_type]["resources"])
            if not node_resources:
                # Legacy yaml might include {} in the resources field.
                static_nodes: Dict[
                    NodeIP,
                    ResourceDict] = \
                    self.load_metrics.get_static_node_resources_by_ip()
                node_ip = self.provider.internal_ip(node_id)
                node_resources = static_nodes.get(node_ip, {})
            max_node_resources.append(node_resources)
            resource_demand_vector_worker_node_ids.append(node_id)
    # Since it is sorted based on last used, we "keep" nodes that are
    # most recently used when we binpack. We assume get_bin_pack_residual
    # is following the given order here.
    used_resource_requests: List[ResourceDict]
    _, used_resource_requests = \
        get_bin_pack_residual(max_node_resources,
                              self.resource_demand_vector)
    # Remove the first entry (the head node) from both lists so that the
    # remaining entries line up with
    # resource_demand_vector_worker_node_ids.
    max_node_resources.pop(0)
    used_resource_requests.pop(0)
    for i, node_id in enumerate(resource_demand_vector_worker_node_ids):
        if used_resource_requests[i] == max_node_resources[i] \
                and max_node_resources[i]:
            # No resources of the node were needed for request_resources().
            # max_node_resources[i] is an empty dict for legacy yamls
            # before the node is connected.
            nodes_allowed_to_terminate[node_id] = True
        else:
            nodes_allowed_to_terminate[node_id] = False
    return nodes_allowed_to_terminate
|
||||
|
||||
def _keep_min_worker_of_node_type(
|
||||
self, node_id: NodeID,
|
||||
node_type_counts: Dict[NodeType, int]) -> bool:
|
||||
"""Returns if workers of node_type can be terminated.
|
||||
The worker cannot be terminated to respect min_workers constraint.
|
||||
|
||||
Receives the counters of running nodes so far and determines if idle
|
||||
node_id should be terminated or not. It also updates the counters
|
||||
@@ -293,7 +392,7 @@ class StandardAutoscaler:
|
||||
node_type_counts(Dict[NodeType, int]): The non_terminated node
|
||||
types counted so far.
|
||||
Returns:
|
||||
bool: if workers of node_types should be terminated or not.
|
||||
bool: if workers of node_types can be terminated or not.
|
||||
"""
|
||||
tags = self.provider.node_tags(node_id)
|
||||
if TAG_RAY_USER_NODE_TYPE in tags:
|
||||
@@ -589,6 +688,8 @@ class StandardAutoscaler:
|
||||
self.load_metrics.get_resource_utilization())
|
||||
if _internal_kv_initialized():
|
||||
_internal_kv_put(DEBUG_AUTOSCALING_STATUS, tmp, overwrite=True)
|
||||
if self.prefix_cluster_info:
|
||||
tmp = add_prefix(tmp, self.config["cluster_name"])
|
||||
logger.debug(tmp)
|
||||
|
||||
def info_string(self, nodes):
|
||||
|
||||
@@ -29,8 +29,6 @@ from ray.autoscaler._private.subprocess_output_util import (
|
||||
from ray.autoscaler._private.cli_logger import cli_logger, cf
|
||||
from ray.util.debug import log_once
|
||||
|
||||
from ray.autoscaler._private.constants import RAY_HOME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# How long to wait for a node to start, in seconds
|
||||
@@ -114,6 +112,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
|
||||
self.node_id = str(node_id)
|
||||
self.namespace = namespace
|
||||
self.kubectl = ["kubectl", "-n", self.namespace]
|
||||
self._home_cached = None
|
||||
|
||||
def run(
|
||||
self,
|
||||
@@ -195,7 +194,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
|
||||
logger.warning("'rsync_filter' detected but is currently "
|
||||
"unsupported for k8s.")
|
||||
if target.startswith("~"):
|
||||
target = RAY_HOME + target[1:]
|
||||
target = self._home + target[1:]
|
||||
|
||||
try:
|
||||
flags = "-aqz" if is_rsync_silent() else "-avz"
|
||||
@@ -211,7 +210,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
|
||||
"rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
|
||||
UserWarning)
|
||||
if target.startswith("~"):
|
||||
target = RAY_HOME + target[1:]
|
||||
target = self._home + target[1:]
|
||||
|
||||
self.process_runner.check_call(self.kubectl + [
|
||||
"cp", source, "{}/{}:{}".format(self.namespace, self.node_id,
|
||||
@@ -219,8 +218,8 @@ class KubernetesCommandRunner(CommandRunnerInterface):
|
||||
])
|
||||
|
||||
def run_rsync_down(self, source, target, options=None):
|
||||
if target.startswith("~"):
|
||||
target = RAY_HOME + target[1:]
|
||||
if source.startswith("~"):
|
||||
source = self._home + source[1:]
|
||||
|
||||
try:
|
||||
flags = "-aqz" if is_rsync_silent() else "-avz"
|
||||
@@ -236,7 +235,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
|
||||
"rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
|
||||
UserWarning)
|
||||
if target.startswith("~"):
|
||||
target = RAY_HOME + target[1:]
|
||||
target = self._home + target[1:]
|
||||
|
||||
self.process_runner.check_call(self.kubectl + [
|
||||
"cp", "{}/{}:{}".format(self.namespace, self.node_id, source),
|
||||
@@ -244,8 +243,21 @@ class KubernetesCommandRunner(CommandRunnerInterface):
|
||||
])
|
||||
|
||||
def remote_shell_command_str(self):
|
||||
return "{} exec -it {} bash".format(" ".join(self.kubectl),
|
||||
self.node_id)
|
||||
return "{} exec -it {} -- bash".format(" ".join(self.kubectl),
|
||||
self.node_id)
|
||||
|
||||
@property
def _home(self):
    """Return the HOME directory reported by the node, cached after first use.

    The value is obtained by running ``printenv HOME`` on the node through
    ``kubectl exec`` and is stored in ``self._home_cached`` so later
    accesses do not issue another command.
    """
    # TODO (Dmitri): Think about how to use the node's HOME variable
    # without making an extra kubectl exec call.
    if self._home_cached is not None:
        return self._home_cached

    printenv_args = ["exec", "-it", self.node_id, "--", "printenv", "HOME"]
    shell_line = " ".join(self.kubectl + printenv_args)
    output = self.process_runner.check_output(shell_line, shell=True)
    # Drop the trailing newline / carriage return from the command output.
    self._home_cached = output.decode().strip("\n\r")
    return self._home_cached
|
||||
|
||||
|
||||
class SSHOptions:
|
||||
|
||||
@@ -5,6 +5,7 @@ _configured = False
|
||||
_core_api = None
|
||||
_auth_api = None
|
||||
_extensions_beta_api = None
|
||||
_custom_objects_api = None
|
||||
|
||||
|
||||
def _load_config():
|
||||
@@ -45,4 +46,13 @@ def extensions_beta_api():
|
||||
return _extensions_beta_api
|
||||
|
||||
|
||||
def custom_objects_api():
    """Return the module-wide ``CustomObjectsApi`` client, creating it lazily.

    On first use the Kubernetes configuration is loaded via
    ``_load_config()`` and the client is stored in the module global
    ``_custom_objects_api``; later calls return that same object.
    """
    global _custom_objects_api
    if _custom_objects_api is not None:
        return _custom_objects_api

    _load_config()
    _custom_objects_api = kubernetes.client.CustomObjectsApi()
    return _custom_objects_api
|
||||
|
||||
|
||||
log_prefix = "KubernetesNodeProvider: "
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
|
||||
from kubernetes import client
|
||||
from kubernetes.client.rest import ApiException
|
||||
@@ -45,9 +47,10 @@ def not_provided_msg(resource_type):
|
||||
|
||||
def bootstrap_kubernetes(config):
|
||||
if not config["provider"]["use_internal_ips"]:
|
||||
return ValueError("Exposing external IP addresses for ray pods isn't "
|
||||
"currently supported. Please set "
|
||||
"'use_internal_ips' to false.")
|
||||
return ValueError(
|
||||
"Exposing external IP addresses for ray containers isn't "
|
||||
"currently supported. Please set "
|
||||
"'use_internal_ips' to false.")
|
||||
namespace = _configure_namespace(config["provider"])
|
||||
_configure_autoscaler_service_account(namespace, config["provider"])
|
||||
_configure_autoscaler_role(namespace, config["provider"])
|
||||
@@ -56,6 +59,62 @@ def bootstrap_kubernetes(config):
|
||||
return config
|
||||
|
||||
|
||||
def fillout_resources_kubernetes(config):
    """Fill in autodetected CPU/GPU resources for each available node type.

    For every entry of ``config["available_node_types"]`` the first
    container of the node's pod spec is passed to
    ``get_autodetected_resources`` and the result is merged into that node
    type's "resources" dict (created when absent). Autodetected values
    overwrite existing entries that share the same key.

    Args:
        config: The cluster config dict. It is updated in place.

    Returns:
        The same ``config`` object. A config without an
        "available_node_types" section is returned unchanged.
    """
    if "available_node_types" not in config:
        # Nothing to fill out. Indexing the missing key here (as the
        # previous code did) raised KeyError instead of returning.
        return config

    # Read container data from a snapshot while the live config is updated.
    node_types = copy.deepcopy(config["available_node_types"])
    for node_type in node_types:
        container_data = node_types[node_type]["node_config"]["spec"][
            "containers"][0]
        autodetected_resources = get_autodetected_resources(container_data)
        if "resources" not in config["available_node_types"][node_type]:
            config["available_node_types"][node_type]["resources"] = {}
        config["available_node_types"][node_type]["resources"].update(
            autodetected_resources)
        logger.debug(
            "Updating the resources of node type {} to include {}.".format(
                node_type, autodetected_resources))
    return config
|
||||
|
||||
|
||||
def get_autodetected_resources(container_data):
    """Derive the "CPU" and "GPU" resource counts for one container spec.

    Args:
        container_data: A container entry of a pod spec (a dict).

    Returns:
        A dict with the keys "CPU" and "GPU". Both are 0 when the container
        has no "resources" section (or it is None); otherwise each value
        comes from ``get_resource`` for the "cpu" / "gpu" resource name.
    """
    container_resources = container_data.get("resources", None)
    if container_resources is None:
        return {"CPU": 0, "GPU": 0}

    detected = {}
    for name in ("cpu", "gpu"):
        detected[name.upper()] = get_resource(container_resources, name)
    return detected
|
||||
|
||||
|
||||
def get_resource(container_resources, resource_name):
    """Return the effective integer amount of one resource for a container.

    The amount is the smaller of the container's "requests" and "limits"
    values for ``resource_name``, as reported by ``_get_resource``. When
    neither is specified (both come back as infinity) the result is 0.
    """
    amounts = [
        _get_resource(container_resources, resource_name, field_name=field)
        for field in ("requests", "limits")
    ]
    smallest = min(amounts)
    if smallest == float("inf"):
        return 0
    return int(smallest)
|
||||
|
||||
|
||||
def _get_resource(container_resources, resource_name, field_name):
    """Look up one resource amount in a container's requests or limits.

    Args:
        container_resources: The container's "resources" dict.
        resource_name: Key to look up inside the chosen field, e.g. "cpu".
        field_name: Which sub-dict to read, e.g. "requests" or "limits".

    Returns:
        The parsed amount from ``_parse_resource``, or ``float("inf")`` when
        the field or the resource key is absent.
    """
    # NOTE(review): Kubernetes usually exposes GPUs under vendor-qualified
    # keys such as "nvidia.com/gpu"; an exact "gpu" key lookup would not
    # match those -- confirm whether that is intended.
    if field_name not in container_resources:
        return float("inf")
    if resource_name not in container_resources[field_name]:
        return float("inf")
    return _parse_resource(container_resources[field_name][resource_name])
|
||||
|
||||
|
||||
def _parse_resource(resource):
    """Convert a cpu/gpu quantity into a whole number of units.

    Accepted forms are milli-units with an "m" suffix (e.g. "500m"),
    integers (e.g. 2 or "2") and fractional values (e.g. 0.5 or "1.5").
    Anything that is not a whole number is rounded up, so "500m" and "0.5"
    both yield 1.

    Args:
        resource: The quantity as found in the container spec (str or
            number).

    Returns:
        The amount as an int, rounded up.

    Raises:
        ValueError: If the value cannot be parsed as a number.
    """
    resource_str = str(resource)
    if resource_str.endswith("m"):
        # Milli-units: for example, "500m" rounds up to 1.
        return math.ceil(int(resource_str[:-1]) / 1000)
    try:
        return int(resource_str)
    except ValueError:
        # Fractional quantities such as "0.5" are not valid int literals;
        # round them up the same way milli-units are rounded.
        return math.ceil(float(resource_str))
|
||||
|
||||
|
||||
def _configure_namespace(provider_config):
|
||||
namespace_field = "namespace"
|
||||
if namespace_field not in provider_config:
|
||||
|
||||
@@ -6,7 +6,8 @@ from kubernetes.client.rest import ApiException
|
||||
from ray.autoscaler._private.command_runner import KubernetesCommandRunner
|
||||
from ray.autoscaler._private.kubernetes import core_api, log_prefix, \
|
||||
extensions_beta_api
|
||||
from ray.autoscaler._private.kubernetes.config import bootstrap_kubernetes
|
||||
from ray.autoscaler._private.kubernetes.config import bootstrap_kubernetes, \
|
||||
fillout_resources_kubernetes
|
||||
from ray.autoscaler.node_provider import NodeProvider
|
||||
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
||||
|
||||
@@ -177,6 +178,11 @@ class KubernetesNodeProvider(NodeProvider):
|
||||
def bootstrap_config(cluster_config):
|
||||
return bootstrap_kubernetes(cluster_config)
|
||||
|
||||
@staticmethod
def fillout_available_node_types_resources(cluster_config):
    """Fills out missing "resources" field for available_node_types.

    Delegates to ``fillout_resources_kubernetes`` and returns its result.
    """
    filled_config = fillout_resources_kubernetes(cluster_config)
    return filled_config
|
||||
|
||||
|
||||
def _add_service_name_to_service_port(spec, svc_name):
|
||||
"""Goes recursively through the ingress manifest and adds the
|
||||
|
||||
@@ -82,12 +82,14 @@ class LoadMetrics:
|
||||
def prune(mapping):
|
||||
unwanted = set(mapping) - active_ips
|
||||
for unwanted_key in unwanted:
|
||||
logger.info("LoadMetrics: "
|
||||
"Removed mapping: {} - {}".format(
|
||||
unwanted_key, mapping[unwanted_key]))
|
||||
# TODO (Alex): Change this back to info after #12138.
|
||||
logger.debug("LoadMetrics: "
|
||||
"Removed mapping: {} - {}".format(
|
||||
unwanted_key, mapping[unwanted_key]))
|
||||
del mapping[unwanted_key]
|
||||
if unwanted:
|
||||
logger.info(
|
||||
# TODO (Alex): Change this back to info after #12138.
|
||||
logger.debug(
|
||||
"LoadMetrics: "
|
||||
"Removed {} stale ip mappings: {} not in {}".format(
|
||||
len(unwanted), unwanted, active_ips))
|
||||
|
||||
@@ -135,24 +135,6 @@ class ResourceDemandScheduler:
|
||||
this set of resources. This differs from resources_demands in
|
||||
that we don't take into account existing usage.
|
||||
"""
|
||||
|
||||
# If the user is using request_resources() API, calculate the remaining
|
||||
# delta resources required to meet their requested cluster size.
|
||||
if ensure_min_cluster_size is not None:
|
||||
used_resources = []
|
||||
for ip, max_res in max_resources_by_ip.items():
|
||||
res = copy.deepcopy(max_res)
|
||||
_inplace_subtract(res, unused_resources_by_ip.get(ip, {}))
|
||||
used_resources.append(res)
|
||||
# Example: user requests 1000 CPUs, but the cluster is currently
|
||||
# 500 CPUs in size with 250 used. Then, the delta is 750 CPUs that
|
||||
# we need to fit to get the cluster to scale to 1000.
|
||||
resource_requests, _ = get_bin_pack_residual(
|
||||
used_resources, ensure_min_cluster_size)
|
||||
resource_demands += resource_requests
|
||||
else:
|
||||
resource_requests = []
|
||||
|
||||
if self.is_legacy_yaml():
|
||||
# When using legacy yaml files we need to infer the head & worker
|
||||
# node resources from the static node resources from LoadMetrics.
|
||||
@@ -166,9 +148,12 @@ class ResourceDemandScheduler:
|
||||
logger.info("Cluster resources: {}".format(node_resources))
|
||||
logger.info("Node counts: {}".format(node_type_counts))
|
||||
# Step 2: add nodes to add to satisfy min_workers for each type
|
||||
node_resources, node_type_counts, min_workers_nodes_to_add = \
|
||||
(node_resources,
|
||||
node_type_counts,
|
||||
adjusted_min_workers) = \
|
||||
_add_min_workers_nodes(
|
||||
node_resources, node_type_counts, self.node_types)
|
||||
node_resources, node_type_counts, self.node_types,
|
||||
self.max_workers, ensure_min_cluster_size)
|
||||
|
||||
# Step 3: add nodes for strict spread groups
|
||||
logger.info(f"Placement group demands: {pending_placement_groups}")
|
||||
@@ -180,8 +165,16 @@ class ResourceDemandScheduler:
|
||||
not self.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]:
|
||||
# Need to launch worker nodes to later infer their
|
||||
# resources.
|
||||
# We add request_resources() demands here to make sure we launch
|
||||
# a single worker sometimes even if min_workers = 0 and resource
|
||||
# demands is empty.
|
||||
if ensure_min_cluster_size:
|
||||
request_resources_demands = ensure_min_cluster_size
|
||||
else:
|
||||
request_resources_demands = []
|
||||
return self._legacy_worker_node_to_launch(
|
||||
nodes, launching_nodes, node_resources, resource_demands)
|
||||
nodes, launching_nodes, node_resources,
|
||||
resource_demands + request_resources_demands)
|
||||
placement_group_nodes_to_add, node_resources, node_type_counts = \
|
||||
self.reserve_and_allocate_spread(
|
||||
strict_spreads, node_resources, node_type_counts)
|
||||
@@ -194,20 +187,15 @@ class ResourceDemandScheduler:
|
||||
logger.info("Unfulfilled demands: {}".format(unfulfilled))
|
||||
# Add 1 to account for the head node.
|
||||
max_to_add = self.max_workers + 1 - sum(node_type_counts.values())
|
||||
if resource_requests:
|
||||
nodes_to_add_based_on_requests = get_nodes_for(
|
||||
self.node_types, node_type_counts, max_to_add,
|
||||
resource_requests)
|
||||
else:
|
||||
nodes_to_add_based_on_requests = {}
|
||||
nodes_to_add_based_on_demand = get_nodes_for(
|
||||
self.node_types, node_type_counts, max_to_add, unfulfilled)
|
||||
# Merge nodes to add based on demand and nodes to add based on
|
||||
# min_workers constraint. We add them because nodes to add based on
|
||||
# demand was calculated after the min_workers constraint was respected.
|
||||
total_nodes_to_add = {}
|
||||
|
||||
for node_type in self.node_types:
|
||||
nodes_to_add = (min_workers_nodes_to_add.get(
|
||||
nodes_to_add = (adjusted_min_workers.get(
|
||||
node_type, 0) + placement_group_nodes_to_add.get(node_type, 0)
|
||||
+ nodes_to_add_based_on_demand.get(node_type, 0))
|
||||
if nodes_to_add > 0:
|
||||
@@ -216,7 +204,7 @@ class ResourceDemandScheduler:
|
||||
# Limit the number of concurrent launches
|
||||
total_nodes_to_add = self._get_concurrent_resource_demand_to_launch(
|
||||
total_nodes_to_add, unused_resources_by_ip.keys(), nodes,
|
||||
launching_nodes, nodes_to_add_based_on_requests)
|
||||
launching_nodes, adjusted_min_workers)
|
||||
|
||||
logger.info("Node requests: {}".format(total_nodes_to_add))
|
||||
return total_nodes_to_add
|
||||
@@ -294,7 +282,7 @@ class ResourceDemandScheduler:
|
||||
connected_nodes: List[NodeIP],
|
||||
non_terminated_nodes: List[NodeID],
|
||||
pending_launches_nodes: Dict[NodeType, int],
|
||||
nodes_to_add_based_on_requests: Dict[NodeType, int],
|
||||
adjusted_min_workers: Dict[NodeType, int],
|
||||
) -> Dict[NodeType, int]:
|
||||
"""Updates the max concurrent resources to launch for each node type.
|
||||
|
||||
@@ -314,9 +302,10 @@ class ResourceDemandScheduler:
|
||||
connected_nodes: Running nodes (from LoadMetrics).
|
||||
non_terminated_nodes: Non terminated nodes (pending/running).
|
||||
pending_launches_nodes: Nodes that are in the launch queue.
|
||||
nodes_to_add_based_on_requests: Nodes to launch to satisfy
|
||||
request_resources(). This overrides the launch limits since the
|
||||
user is hinting to immediately scale up to this size.
|
||||
adjusted_min_workers: Nodes to launch to satisfy
|
||||
min_workers and request_resources(). This overrides the launch
|
||||
limits since the user is hinting to immediately scale up to
|
||||
this size.
|
||||
Returns:
|
||||
Dict[NodeType, int]: Maximum number of nodes to launch for each
|
||||
node type.
|
||||
@@ -338,13 +327,9 @@ class ResourceDemandScheduler:
|
||||
upper_bound = max(
|
||||
max_allowed_pending_nodes - total_pending_nodes,
|
||||
|
||||
# Allow more nodes if this is to respect min_workers.
|
||||
self.node_types[node_type].get("min_workers", 0) -
|
||||
total_pending_nodes - running_nodes[node_type],
|
||||
|
||||
# Allow more nodes from request_resources API.
|
||||
nodes_to_add_based_on_requests.get(node_type,
|
||||
0) - total_pending_nodes)
|
||||
# Allow more nodes if this is to respect min_workers or
|
||||
# request_resources().
|
||||
adjusted_min_workers.get(node_type, 0))
|
||||
|
||||
if upper_bound > 0:
|
||||
updated_nodes_to_launch[node_type] = min(
|
||||
@@ -504,21 +489,26 @@ def _node_type_counts_to_node_resources(
|
||||
def _add_min_workers_nodes(
|
||||
node_resources: List[ResourceDict],
|
||||
node_type_counts: Dict[NodeType, int],
|
||||
node_types: Dict[NodeType, NodeTypeConfigDict],
|
||||
node_types: Dict[NodeType, NodeTypeConfigDict], max_workers: int,
|
||||
ensure_min_cluster_size: List[ResourceDict]
|
||||
) -> (List[ResourceDict], Dict[NodeType, int], Dict[NodeType, int]):
|
||||
"""Updates resource demands to respect the min_workers constraint.
|
||||
"""Updates resource demands to respect the min_workers and
|
||||
request_resources() constraints.
|
||||
|
||||
Args:
|
||||
node_resources: Resources of existing nodes already launched/pending.
|
||||
node_type_counts: Counts of existing nodes already launched/pending.
|
||||
node_types: Node types config.
|
||||
max_workers: global max_workers constraint.
|
||||
ensure_min_cluster_size: resource demands from request_resources().
|
||||
|
||||
Returns:
|
||||
node_resources: The updated node resources after adding min_workers
|
||||
constraint per node type.
|
||||
and request_resources() constraints per node type.
|
||||
node_type_counts: The updated node counts after adding min_workers
|
||||
constraint per node type.
|
||||
total_nodes_to_add: The nodes to add to respect min_workers constraint.
|
||||
and request_resources() constraints per node type.
|
||||
total_nodes_to_add_dict: The nodes to add to respect min_workers and
|
||||
request_resources() constraints.
|
||||
"""
|
||||
total_nodes_to_add_dict = {}
|
||||
for node_type, config in node_types.items():
|
||||
@@ -528,10 +518,41 @@ def _add_min_workers_nodes(
|
||||
if existing < target:
|
||||
total_nodes_to_add_dict[node_type] = target - existing
|
||||
node_type_counts[node_type] = target
|
||||
available = copy.deepcopy(node_types[node_type]["resources"])
|
||||
node_resources.extend(
|
||||
[available] * total_nodes_to_add_dict[node_type])
|
||||
node_resources.extend([
|
||||
copy.deepcopy(node_types[node_type]["resources"])
|
||||
for _ in range(total_nodes_to_add_dict[node_type])
|
||||
])
|
||||
|
||||
if ensure_min_cluster_size:
|
||||
max_to_add = max_workers + 1 - sum(node_type_counts.values())
|
||||
max_node_resources = []
|
||||
# Fit request_resources() on all the resources as if they are idle.
|
||||
for node_type in node_type_counts:
|
||||
max_node_resources.extend([
|
||||
copy.deepcopy(node_types[node_type]["resources"])
|
||||
for _ in range(node_type_counts[node_type])
|
||||
])
|
||||
# Get the unfulfilled to ensure min cluster size.
|
||||
resource_requests_unfulfilled, _ = get_bin_pack_residual(
|
||||
max_node_resources, ensure_min_cluster_size)
|
||||
# Get the nodes to meet the unfulfilled.
|
||||
nodes_to_add_request_resources = get_nodes_for(
|
||||
node_types, node_type_counts, max_to_add,
|
||||
resource_requests_unfulfilled)
|
||||
# Update the resources, counts and total nodes to add.
|
||||
for node_type in nodes_to_add_request_resources:
|
||||
nodes_to_add = nodes_to_add_request_resources.get(node_type, 0)
|
||||
if nodes_to_add > 0:
|
||||
node_type_counts[
|
||||
node_type] = nodes_to_add + node_type_counts.get(
|
||||
node_type, 0)
|
||||
node_resources.extend([
|
||||
copy.deepcopy(node_types[node_type]["resources"])
|
||||
for _ in range(nodes_to_add)
|
||||
])
|
||||
total_nodes_to_add_dict[
|
||||
node_type] = nodes_to_add + total_nodes_to_add_dict.get(
|
||||
node_type, 0)
|
||||
return node_resources, node_type_counts, total_nodes_to_add_dict
|
||||
|
||||
|
||||
@@ -623,7 +644,8 @@ def _utilization_score(node_resources: ResourceDict,
|
||||
|
||||
def get_bin_pack_residual(node_resources: List[ResourceDict],
|
||||
resource_demands: List[ResourceDict],
|
||||
strict_spread: bool = False) -> List[ResourceDict]:
|
||||
strict_spread: bool = False
|
||||
) -> (List[ResourceDict], List[ResourceDict]):
|
||||
"""Return a subset of resource_demands that cannot fit in the cluster.
|
||||
|
||||
TODO(ekl): this currently does not guarantee the resources will be packed
|
||||
@@ -638,7 +660,7 @@ def get_bin_pack_residual(node_resources: List[ResourceDict],
|
||||
placed on a different entry in `node_resources`.
|
||||
|
||||
Returns:
|
||||
List[ResourceDict] the residual list resources that do not fit.
|
||||
List[ResourceDict]: the residual list resources that do not fit.
|
||||
List[ResourceDict]: The updated node_resources after the method.
|
||||
"""
|
||||
|
||||
|
||||
@@ -256,8 +256,16 @@ class NodeUpdater:
|
||||
|
||||
retry_str = "(" + str(e) + ")"
|
||||
if hasattr(e, "cmd"):
|
||||
if isinstance(e.cmd, str):
|
||||
cmd_ = e.cmd
|
||||
elif isinstance(e.cmd, list):
|
||||
cmd_ = " ".join(e.cmd)
|
||||
else:
|
||||
logger.debug(f"e.cmd type ({type(e.cmd)}) not "
|
||||
"list or str.")
|
||||
cmd_ = str(e.cmd)
|
||||
retry_str = "(Exit Status {}): {}".format(
|
||||
e.returncode, " ".join(e.cmd))
|
||||
e.returncode, cmd_)
|
||||
|
||||
cli_logger.print(
|
||||
"SSH still not available {}, "
|
||||
|
||||
@@ -244,3 +244,14 @@ def hash_runtime_conf(file_mounts,
|
||||
file_mounts_contents_hash = None
|
||||
|
||||
return (_hash_cache[conf_str], file_mounts_contents_hash)
|
||||
|
||||
|
||||
def add_prefix(info_string, prefix):
    """Prefix every line of info_string after the first one.

    Each continuation line becomes ``prefix``, a colon, then the original
    line; the first line is left untouched. Lines are split and re-joined
    on "\\n".
    """
    first_line, *continuation = info_string.split("\n")
    prefixed = [":".join([prefix, line]) for line in continuation]
    return "\n".join([first_line] + prefixed)
|
||||
|
||||
@@ -112,7 +112,7 @@ setup_commands:
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
|
||||
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
@@ -27,10 +28,10 @@ docker:
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray:latest-cpu"
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
@@ -128,7 +129,7 @@ setup_commands: []
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
@@ -19,13 +19,14 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu"
|
||||
image: "rayproject/ray-ml:latest-gpu"
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_nvidia_docker" # e.g. ray_docker
|
||||
|
||||
# # Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
||||
# worker_image: "rayproject/ray:latest"
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
@@ -90,8 +91,8 @@ file_mounts: {
|
||||
# List of shell commands to run to set up nodes.
|
||||
# NOTE: rayproject/ray:latest has ray latest bundled
|
||||
setup_commands: []
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
cluster_name: java
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 1
|
||||
min_workers: 1
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 1
|
||||
@@ -72,10 +72,10 @@ worker_setup_commands: []
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --code-search-path=~/ray-word-count/target
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --code-search-path=ray-word-count/target
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
# To run the program, run `ray exec java.yaml "java -jar ray-word-count/target/ray-word-count-1.0-SNAPSHOT-jar-with-dependencies.jar"`
|
||||
# To run the program, run `ray exec java.yaml "java -jar ray-word-count/target/ray-word-count-1.0-SNAPSHOT-jar-with-dependencies.jar -Dray.job.code-search-path=ray-word-count/target"`
|
||||
|
||||
@@ -24,7 +24,7 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "" # e.g., rayproject/ray:latest
|
||||
image: "" # e.g., rayproject/ray-ml:latest
|
||||
container_name: "" # e.g. ray_docker
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
@@ -32,9 +32,9 @@ docker:
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
||||
# worker_image: "rayproject/ray:latest"
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
@@ -120,7 +120,7 @@ setup_commands:
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- source activate pytorch_p36 && pip install -U ray
|
||||
- source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray[debug]
|
||||
- source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
|
||||
@@ -112,7 +112,7 @@ setup_commands:
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
|
||||
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
@@ -27,10 +28,10 @@ docker:
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray:latest-cpu"
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
@@ -128,7 +129,7 @@ setup_commands:
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -19,13 +19,14 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu"
|
||||
image: "rayproject/ray-ml:latest-gpu"
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_nvidia_docker" # e.g. ray_docker
|
||||
|
||||
# # Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
||||
# worker_image: "rayproject/ray:latest"
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
@@ -65,9 +66,9 @@ file_mounts: {
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
# NOTE: rayproject/ray:latest has ray latest bundled
|
||||
# NOTE: rayproject/ray-ml:latest has ray latest bundled
|
||||
setup_commands: []
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu"
|
||||
image: "rayproject/ray-ml:latest-gpu"
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_docker"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
@@ -27,9 +28,9 @@ docker:
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
||||
# worker_image: "rayproject/ray:latest"
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
@@ -97,7 +98,7 @@ setup_commands:
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
|
||||
@@ -130,7 +130,7 @@ setup_commands:
|
||||
&& echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile
|
||||
|
||||
# Install ray
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
|
||||
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
@@ -27,10 +28,10 @@ docker:
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
# Allow Ray to automatically detect GPUs
|
||||
|
||||
# worker_image: "rayproject/ray:latest-cpu"
|
||||
# worker_image: "rayproject/ray-ml:latest-cpu"
|
||||
# worker_run_options: []
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
@@ -136,7 +137,7 @@ setup_commands: []
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
|
||||
@@ -19,14 +19,15 @@ upscaling_speed: 1.0
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu"
|
||||
image: "rayproject/ray-ml:latest-gpu"
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_nvidia_docker" # e.g. ray_docker
|
||||
|
||||
# # Example of running a GPU head with CPU workers
|
||||
# head_image: "rayproject/ray:latest-gpu"
|
||||
# head_image: "rayproject/ray-ml:latest-gpu"
|
||||
|
||||
|
||||
# worker_image: "rayproject/ray:latest"
|
||||
# worker_image: "rayproject/ray-ml:latest"
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
@@ -117,10 +118,10 @@ initialization_commands:
|
||||
done"
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
# NOTE: rayproject/ray:latest has ray latest bundled
|
||||
# NOTE: rayproject/ray-ml:latest has ray latest bundled
|
||||
setup_commands: []
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -142,7 +142,7 @@ head_node:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
image: rayproject/ray
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
@@ -215,7 +215,7 @@ worker_nodes:
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/ray
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
|
||||
@@ -142,7 +142,7 @@ head_node:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
image: rayproject/ray
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
@@ -215,7 +215,7 @@ worker_nodes:
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/ray
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
|
||||
@@ -146,7 +146,7 @@ head_node:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
image: rayproject/ray
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
@@ -221,7 +221,7 @@ worker_nodes:
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/ray
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,128 @@
|
||||
apiVersion: cluster.ray.io/v1
|
||||
kind: RayCluster
|
||||
metadata:
|
||||
name: example-cluster
|
||||
spec:
|
||||
# The maximum number of worker nodes to launch in addition to the head node.
|
||||
maxWorkers: 3
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscalingSpeed: 1.0
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idleTimeoutMinutes: 5
|
||||
# Specify the pod type for the ray head node (as configured below).
|
||||
headPodType: head-node
|
||||
# Specify the default pod type for the ray worker nodes (as configured below).
|
||||
workerDefaultPodType: worker-nodes
|
||||
# Specify the allowed pod types for this ray cluster and the resources they provide.
|
||||
podTypes:
|
||||
- name: head-node
|
||||
podConfig:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: example-cluster-ray-head-
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ['trap : TERM INT; sleep infinity & wait;']
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 512Mi
|
||||
- name: worker-nodes
|
||||
# Minimum number of Ray workers of this Pod type.
|
||||
minWorkers: 2
|
||||
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
|
||||
maxWorkers: 3
|
||||
# User-specified custom resources for use by Ray
|
||||
rayResources: {"Custom1": 1, "is_spot": 1}
|
||||
# Optional commands to run before starting the Ray runtime.
|
||||
setupCommands:
|
||||
- pip install numpy # Example
|
||||
podConfig:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: example-cluster-ray-worker-
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
image: rayproject/ray:nightly
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 512Mi
|
||||
# Commands to start Ray on the head node. You don't need to change this.
|
||||
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
|
||||
headStartRayCommands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0
|
||||
# Commands to start Ray on worker nodes. You don't need to change this.
|
||||
workerStartRayCommands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
@@ -0,0 +1,128 @@
|
||||
apiVersion: cluster.ray.io/v1
|
||||
kind: RayCluster
|
||||
metadata:
|
||||
name: example-cluster2
|
||||
spec:
|
||||
# The maximum number of worker nodes to launch in addition to the head node.
|
||||
maxWorkers: 3
|
||||
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
||||
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
||||
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
||||
# This number should be > 0.
|
||||
upscalingSpeed: 1.0
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idleTimeoutMinutes: 5
|
||||
# Specify the pod type for the ray head node (as configured below).
|
||||
headPodType: head-node
|
||||
# Specify the default pod type for the ray worker nodes (as configured below).
|
||||
workerDefaultPodType: worker-nodes
|
||||
# Specify the allowed pod types for this ray cluster and the resources they provide.
|
||||
podTypes:
|
||||
- name: head-node
|
||||
podConfig:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: example-cluster2-ray-head-
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ['trap : TERM INT; sleep infinity & wait;']
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 512Mi
|
||||
- name: worker-nodes
|
||||
# Minimum number of Ray workers of this Pod type.
|
||||
minWorkers: 1
|
||||
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
|
||||
maxWorkers: 3
|
||||
# User-specified custom resources for use by Ray
|
||||
rayResources: {"Custom1": 1, "is_spot": 1}
|
||||
# Optional commands to run before starting the Ray runtime.
|
||||
setupCommands:
|
||||
- pip install numpy # Example
|
||||
podConfig:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: example-cluster2-ray-worker-
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
image: rayproject/ray:nightly
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 512Mi
|
||||
# Commands to start Ray on the head node. You don't need to change this.
|
||||
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
|
||||
headStartRayCommands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --dashboard-host 0.0.0.0
|
||||
# Commands to start Ray on worker nodes. You don't need to change this.
|
||||
workerStartRayCommands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
+3
-4
@@ -9,8 +9,8 @@ apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: ray-operator-role
|
||||
rules:
|
||||
- apiGroups: ["", "rbac.authorization.k8s.io"]
|
||||
resources: ["configmaps", "pods", "pods/exec", "services", "serviceaccounts", "roles", "rolebindings"]
|
||||
- apiGroups: ["", "cluster.ray.io"]
|
||||
resources: ["rayclusters", "pods", "pods/exec"]
|
||||
verbs: ["get", "watch", "list", "create", "delete", "patch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
@@ -35,8 +35,7 @@ spec:
|
||||
- name: ray
|
||||
imagePullPolicy: Always
|
||||
image: rayproject/ray:nightly
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["ray-operator; trap : TERM INT; sleep infinity & wait;"]
|
||||
command: ["ray-operator"]
|
||||
env:
|
||||
- name: RAY_OPERATOR_POD_NAMESPACE
|
||||
valueFrom:
|
||||
@@ -1,260 +0,0 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: default
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Kubernetes resources that need to be configured for the autoscaler to be
|
||||
# able to manage the Ray cluster. If any of the provided resources don't
|
||||
# exist, the autoscaler will attempt to create them. If this fails, you may
|
||||
# not have the required permissions and will have to request them to be
|
||||
# created by your cluster administrator.
|
||||
provider:
|
||||
type: kubernetes
|
||||
|
||||
# Exposing external IP addresses for ray pods isn't currently supported.
|
||||
use_internal_ips: true
|
||||
|
||||
# Namespace to use for all resources created.
|
||||
namespace: ray
|
||||
|
||||
services:
|
||||
# Service that maps to the head node of the Ray cluster.
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
# NOTE: If you're running multiple Ray clusters with services
|
||||
# on one Kubernetes cluster, they must have unique service
|
||||
# names.
|
||||
name: ray-head
|
||||
spec:
|
||||
# This selector must match the head node pod's selector below.
|
||||
selector:
|
||||
component: ray-head
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
# Service that maps to the worker nodes of the Ray cluster.
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
# NOTE: If you're running multiple Ray clusters with services
|
||||
# on one Kubernetes cluster, they must have unique service
|
||||
# names.
|
||||
name: ray-workers
|
||||
spec:
|
||||
# This selector must match the worker node pods' selector below.
|
||||
selector:
|
||||
component: ray-worker
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
|
||||
# Kubernetes pod config for the head node pod.
|
||||
available_node_types:
|
||||
head_node:
|
||||
resources: {}
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-head-
|
||||
|
||||
# Must match the head node service selector above if a head node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-head
|
||||
spec:
|
||||
# Restarting the head node automatically is not currently supported.
|
||||
# If the head node goes down, `ray up` must be run again.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
# - screen (used for `ray attach`)
|
||||
# - kubectl (used by the autoscaler to manage worker pods)
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 6379 # Redis port.
|
||||
- containerPort: 6380 # Redis port.
|
||||
- containerPort: 6381 # Redis port.
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# The maximum memory that this pod is allowed to use. The
|
||||
# limit will be detected by ray and split to use 10% for
|
||||
# redis, 30% for the shared memory object store, and the
|
||||
# rest for application memory. If this limit is not set and
|
||||
# the object store size is not set manually, ray will
|
||||
# allocate a very large object store in each pod that may
|
||||
# cause problems for other pods.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the head_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
worker_nodes:
|
||||
resources: {}
|
||||
min_workers: 1
|
||||
max_workers: 2
|
||||
node_config:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
# Automatically generates a name for the pod with this prefix.
|
||||
generateName: ray-worker-
|
||||
|
||||
# Must match the worker node service selector above if a worker node
|
||||
# service is required.
|
||||
labels:
|
||||
component: ray-worker
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
|
||||
# Worker nodes will be managed automatically by the head node, so
|
||||
# do not change the restart policy.
|
||||
restartPolicy: Never
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
containers:
|
||||
- name: ray-node
|
||||
imagePullPolicy: Always
|
||||
# You are free (and encouraged) to use your own container image,
|
||||
# but it should have the following installed:
|
||||
# - rsync (used for `ray rsync` commands and file mounts)
|
||||
image: rayproject/ray:nightly
|
||||
# Do not change this command - it keeps the pod alive until it is
|
||||
# explicitly killed.
|
||||
command: ["/bin/bash", "-c", "--"]
|
||||
args: ["trap : TERM INT; sleep infinity & wait;"]
|
||||
ports:
|
||||
- containerPort: 12345 # Ray internal communication.
|
||||
- containerPort: 12346 # Ray internal communication.
|
||||
|
||||
# This volume allocates shared memory for Ray to use for its plasma
|
||||
# object store. If you do not provide this, Ray will fall back to
|
||||
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
# This memory limit will be detected by ray and split into
|
||||
# 30% for plasma, and 70% for workers.
|
||||
memory: 2Gi
|
||||
env:
|
||||
# This is used in the worker_start_ray_commands below so that
|
||||
# Ray can spawn the correct number of processes. Omitting this
|
||||
# may lead to degraded performance.
|
||||
- name: MY_CPU_REQUEST
|
||||
valueFrom:
|
||||
resourceFieldRef:
|
||||
resource: requests.cpu
|
||||
|
||||
head_node_type:
|
||||
head_node
|
||||
|
||||
worker_default_node_type:
|
||||
worker_nodes
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
# list of paths. The same path on the head node will be copied to the worker node.
|
||||
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
||||
# you should just use file_mounts. Only use this if you know what you're doing!
|
||||
cluster_synced_files: []
|
||||
|
||||
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
||||
# should sync to the worker node continuously
|
||||
file_mounts_sync_continuously: False
|
||||
|
||||
# Patterns for files to exclude when running rsync up or rsync down.
|
||||
# This is not supported on kubernetes.
|
||||
rsync_exclude: []
|
||||
|
||||
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
||||
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
||||
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
||||
# This is not supported on kubernetes.
|
||||
rsync_filter: []
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --object-manager-port=8076 --dashboard-host 0.0.0.0
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
@@ -25,7 +25,8 @@ idle_timeout_minutes: 5
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled. Assumes Docker is installed.
|
||||
docker:
|
||||
image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
|
||||
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
|
||||
container_name: "ray_container"
|
||||
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
|
||||
# if no cached version is present.
|
||||
@@ -93,7 +94,7 @@ setup_commands: []
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"cluster_name": {
|
||||
"description": "An unique identifier for the head node and workers of this cluster.",
|
||||
"description": "A unique identifier for the head node and workers of this cluster.",
|
||||
"type": "string"
|
||||
},
|
||||
"min_workers": {
|
||||
|
||||
@@ -3,9 +3,8 @@ from traceback import format_exception
|
||||
|
||||
import colorama
|
||||
|
||||
import ray
|
||||
import ray.cloudpickle as pickle
|
||||
from ray.core.generated.common_pb2 import RayException, Language
|
||||
from ray.core.generated.common_pb2 import RayException, Language, PYTHON
|
||||
import setproctitle
|
||||
|
||||
|
||||
@@ -17,7 +16,7 @@ class RayError(Exception):
|
||||
exc_info = (type(self), self, self.__traceback__)
|
||||
formatted_exception_string = "\n".join(format_exception(*exc_info))
|
||||
return RayException(
|
||||
language=ray.Language.PYTHON.value(),
|
||||
language=PYTHON,
|
||||
serialized_exception=pickle.dumps(self),
|
||||
formatted_exception_string=formatted_exception_string
|
||||
).SerializeToString()
|
||||
@@ -26,7 +25,7 @@ class RayError(Exception):
|
||||
def from_bytes(b):
|
||||
ray_exception = RayException()
|
||||
ray_exception.ParseFromString(b)
|
||||
if ray_exception.language == ray.Language.PYTHON.value():
|
||||
if ray_exception.language == PYTHON:
|
||||
return pickle.loads(ray_exception.serialized_exception)
|
||||
else:
|
||||
return CrossLanguageError(ray_exception)
|
||||
@@ -81,6 +80,7 @@ class RayTaskError(RayError):
|
||||
pid=None,
|
||||
ip=None):
|
||||
"""Initialize a RayTaskError."""
|
||||
import ray
|
||||
if proctitle:
|
||||
self.proctitle = proctitle
|
||||
else:
|
||||
|
||||
@@ -7,34 +7,88 @@ import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# _client_api has to be external to the API stub, below.
|
||||
# Otherwise, ray.remote() that contains ray.remote()
|
||||
# contains a reference to the RayAPIStub, therefore a
|
||||
# reference to the _client_api, and then tries to pickle
|
||||
# the thing.
|
||||
# About these global variables: Ray 1.0 uses exported module functions to
|
||||
# provide its API, and we need to match that. However, we want different
|
||||
# behaviors depending on where, exactly, in the client stack this is running.
|
||||
#
|
||||
# The reason for these differences depends on what's being pickled and passed
|
||||
# to functions, or functions inside functions. So there are three cases to care
|
||||
# about
|
||||
#
|
||||
# (Python Client)-->(Python ClientServer)-->(Internal Raylet Process)
|
||||
#
|
||||
# * _client_api should be set if we're inside the client
|
||||
# * _server_api should be set if we're inside the clientserver
|
||||
# * Both will be set if we're running both (as in a test)
|
||||
# * Neither should be set if we're inside the raylet (but we still need to shim
|
||||
# from the client API surface to the Ray API)
|
||||
#
|
||||
# The job of RayAPIStub (below) is to delegate to the appropriate one of these
|
||||
# depending on what's set or not. Then, all users importing the ray object
|
||||
# from this package get the stub which routes them to the appropriate APIImpl.
|
||||
_client_api: Optional[APIImpl] = None
|
||||
_server_api: Optional[APIImpl] = None
|
||||
|
||||
# The reason for _is_server is a hack around the above comment while running
|
||||
# tests. If we have both a client and a server trying to control these static
|
||||
# variables then we need a way to decide which to use. In this case, both
|
||||
# _client_api and _server_api are set.
|
||||
# This boolean flips between the two
|
||||
_is_server: bool = False
|
||||
|
||||
|
||||
@contextmanager
|
||||
def stash_api_for_tests(in_test: bool):
|
||||
api = None
|
||||
global _is_server
|
||||
is_server = _is_server
|
||||
if in_test:
|
||||
api = stash_api()
|
||||
yield api
|
||||
_is_server = True
|
||||
yield _server_api
|
||||
if in_test:
|
||||
restore_api(api)
|
||||
_is_server = is_server
|
||||
|
||||
|
||||
def stash_api() -> Optional[APIImpl]:
|
||||
def _set_client_api(val: Optional[APIImpl]):
|
||||
global _client_api
|
||||
a = _client_api
|
||||
global _is_server
|
||||
if _client_api is not None:
|
||||
raise Exception("Trying to set more than one client API")
|
||||
_client_api = val
|
||||
_is_server = False
|
||||
|
||||
|
||||
def _set_server_api(val: Optional[APIImpl]):
|
||||
global _server_api
|
||||
global _is_server
|
||||
if _server_api is not None:
|
||||
raise Exception("Trying to set more than one server API")
|
||||
_server_api = val
|
||||
_is_server = True
|
||||
|
||||
|
||||
def reset_api():
|
||||
global _client_api
|
||||
global _server_api
|
||||
global _is_server
|
||||
_client_api = None
|
||||
return a
|
||||
_server_api = None
|
||||
_is_server = False
|
||||
|
||||
|
||||
def restore_api(api: Optional[APIImpl]):
|
||||
def _get_client_api() -> APIImpl:
|
||||
global _client_api
|
||||
_client_api = api
|
||||
global _server_api
|
||||
global _is_server
|
||||
api = None
|
||||
if _is_server:
|
||||
api = _server_api
|
||||
else:
|
||||
api = _client_api
|
||||
if api is None:
|
||||
# We're inside a raylet worker
|
||||
from ray.experimental.client.server.core_ray_api import CoreRayAPI
|
||||
return CoreRayAPI()
|
||||
return api
|
||||
|
||||
|
||||
class RayAPIStub:
|
||||
@@ -43,11 +97,10 @@ class RayAPIStub:
|
||||
secure: bool = False,
|
||||
metadata: List[Tuple[str, str]] = None,
|
||||
stub=None):
|
||||
global _client_api
|
||||
from ray.experimental.client.worker import Worker
|
||||
_client_worker = Worker(
|
||||
conn_str, secure=secure, metadata=metadata, stub=stub)
|
||||
_client_api = ClientAPI(_client_worker)
|
||||
_set_client_api(ClientAPI(_client_worker))
|
||||
|
||||
def disconnect(self):
|
||||
global _client_api
|
||||
@@ -56,15 +109,9 @@ class RayAPIStub:
|
||||
_client_api = None
|
||||
|
||||
def __getattr__(self, key: str):
|
||||
global _client_api
|
||||
self.__check_client_api()
|
||||
return getattr(_client_api, key)
|
||||
|
||||
def __check_client_api(self):
|
||||
global _client_api
|
||||
if _client_api is None:
|
||||
from ray.experimental.client.server.core_ray_api import CoreRayAPI
|
||||
_client_api = CoreRayAPI()
|
||||
global _get_client_api
|
||||
api = _get_client_api()
|
||||
return getattr(api, key)
|
||||
|
||||
|
||||
ray = RayAPIStub()
|
||||
|
||||
@@ -11,40 +11,145 @@
|
||||
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Union, Optional
|
||||
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
||||
if TYPE_CHECKING:
|
||||
from ray.experimental.client.common import ClientActorHandle
|
||||
from ray.experimental.client.common import ClientStub
|
||||
from ray.experimental.client.common import ClientObjectRef
|
||||
from ray._raylet import ObjectRef
|
||||
|
||||
# Use the imports for type checking. This is a python 3.6 limitation.
|
||||
# See https://www.python.org/dev/peps/pep-0563/
|
||||
PutType = Union[ClientObjectRef, ObjectRef]
|
||||
|
||||
|
||||
class APIImpl(ABC):
|
||||
"""
|
||||
APIImpl is the interface to implement for whichever version of the core
|
||||
Ray API that needs abstracting when run in client mode.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get(self, *args, **kwargs):
|
||||
def get(self, vals, *, timeout: Optional[float] = None) -> Any:
|
||||
"""
|
||||
get is the hook stub passed on to replace `ray.get`
|
||||
|
||||
Args:
|
||||
vals: [Client]ObjectRef or list of these refs to retrieve.
|
||||
timeout: Optional timeout in seconds
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def put(self, *args, **kwargs):
|
||||
def put(self, vals: Any, *args,
|
||||
**kwargs) -> Union["ClientObjectRef", "ObjectRef"]:
|
||||
"""
|
||||
put is the hook stub passed on to replace `ray.put`
|
||||
|
||||
Args:
|
||||
vals: The value or list of values to `put`.
|
||||
args: opaque arguments
|
||||
kwargs: opaque keyword arguments
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def wait(self, *args, **kwargs):
|
||||
"""
|
||||
wait is the hook stub passed on to replace `ray.wait`
|
||||
|
||||
Args:
|
||||
args: opaque arguments
|
||||
kwargs: opaque keyword arguments
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def remote(self, *args, **kwargs):
|
||||
"""
|
||||
remote is the hook stub passed on to replace `ray.remote`.
|
||||
|
||||
This sets up remote functions or actors, as the decorator,
|
||||
but does not execute them.
|
||||
|
||||
Args:
|
||||
args: opaque arguments
|
||||
kwargs: opaque keyword arguments
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def call_remote(self, f, kind, *args, **kwargs):
|
||||
def call_remote(self, instance: "ClientStub", *args, **kwargs):
|
||||
"""
|
||||
call_remote is called by stub objects to execute them remotely.
|
||||
|
||||
This is used by stub objects in situations where they're called
|
||||
with .remote, eg, `f.remote()` or `actor_cls.remote()`.
|
||||
This allows the client stub objects to delegate execution to be
|
||||
implemented in the most effective way whether it's in the client,
|
||||
clientserver, or raylet worker.
|
||||
|
||||
Args:
|
||||
instance: The Client-side stub reference to a remote object
|
||||
args: opaque arguments
|
||||
kwargs: opaque keyword arguments
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def close(self, *args, **kwargs):
|
||||
def close(self) -> None:
|
||||
"""
|
||||
close cleans up an API connection by closing any channels or
|
||||
shutting down any servers gracefully.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def kill(self, actor, *, no_restart=True):
|
||||
"""
|
||||
kill forcibly stops an actor running in the cluster
|
||||
|
||||
Args:
|
||||
no_restart: Whether this actor should be restarted if it's a
|
||||
restartable actor.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def cancel(self, obj, *, force=False, recursive=True):
|
||||
"""
|
||||
Cancels a task on the cluster.
|
||||
|
||||
If the specified task is pending execution, it will not be executed. If
|
||||
the task is currently executing, the behavior depends on the ``force``
|
||||
flag, as per `ray.cancel()`
|
||||
|
||||
Only non-actor tasks can be canceled. Canceled tasks will not be
|
||||
retried (max_retries will not be respected).
|
||||
|
||||
Args:
|
||||
obj (ObjectRef): ObjectRef returned by the task
|
||||
that should be canceled.
|
||||
force (boolean): Whether to force-kill a running task by killing
|
||||
the worker that is running the task.
|
||||
recursive (boolean): Whether to try to cancel tasks submitted by
|
||||
the task specified.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ClientAPI(APIImpl):
|
||||
"""
|
||||
The Client-side methods corresponding to the ray API. Delegates
|
||||
to the Client Worker that contains the connection to the ClientServer.
|
||||
"""
|
||||
|
||||
def __init__(self, worker):
|
||||
self.worker = worker
|
||||
|
||||
def get(self, *args, **kwargs):
|
||||
return self.worker.get(*args, **kwargs)
|
||||
def get(self, vals, *, timeout=None):
|
||||
return self.worker.get(vals, timeout=timeout)
|
||||
|
||||
def put(self, *args, **kwargs):
|
||||
return self.worker.put(*args, **kwargs)
|
||||
@@ -55,12 +160,65 @@ class ClientAPI(APIImpl):
|
||||
def remote(self, *args, **kwargs):
|
||||
return self.worker.remote(*args, **kwargs)
|
||||
|
||||
def call_remote(self, f, kind, *args, **kwargs):
|
||||
return self.worker.call_remote(f, kind, *args, **kwargs)
|
||||
def call_remote(self, instance: "ClientStub", *args, **kwargs):
|
||||
return self.worker.call_remote(instance, *args, **kwargs)
|
||||
|
||||
def close(self, *args, **kwargs):
|
||||
def close(self) -> None:
|
||||
return self.worker.close()
|
||||
|
||||
def kill(self, actor: "ClientActorHandle", *, no_restart=True):
|
||||
return self.worker.terminate_actor(actor, no_restart)
|
||||
|
||||
def cancel(self, obj: "ClientObjectRef", *, force=False, recursive=True):
|
||||
return self.worker.terminate_task(obj, force, recursive)
|
||||
|
||||
# Various metadata methods for the client that are defined in the protocol.
|
||||
def is_initialized(self) -> bool:
|
||||
""" True if our client is connected, and if the server is initialized.
|
||||
|
||||
Returns:
|
||||
A boolean determining if the client is connected and
|
||||
server initialized.
|
||||
"""
|
||||
return self.worker.is_initialized()
|
||||
|
||||
def nodes(self):
|
||||
"""Get a list of the nodes in the cluster (for debugging only).
|
||||
|
||||
Returns:
|
||||
Information about the Ray clients in the cluster.
|
||||
"""
|
||||
return self.worker.get_cluster_info(
|
||||
ray_client_pb2.ClusterInfoType.NODES)
|
||||
|
||||
def cluster_resources(self):
|
||||
"""Get the current total cluster resources.
|
||||
|
||||
Note that this information can grow stale as nodes are added to or
|
||||
removed from the cluster.
|
||||
|
||||
Returns:
|
||||
A dictionary mapping resource name to the total quantity of that
|
||||
resource in the cluster.
|
||||
"""
|
||||
return self.worker.get_cluster_info(
|
||||
ray_client_pb2.ClusterInfoType.CLUSTER_RESOURCES)
|
||||
|
||||
def available_resources(self):
|
||||
"""Get the current available cluster resources.
|
||||
|
||||
This is different from `cluster_resources` in that this will return
|
||||
idle (available) resources rather than total resources.
|
||||
|
||||
Note that this information can grow stale as tasks start and finish.
|
||||
|
||||
Returns:
|
||||
A dictionary mapping resource name to the total quantity of that
|
||||
resource in the cluster.
|
||||
"""
|
||||
return self.worker.get_cluster_info(
|
||||
ray_client_pb2.ClusterInfoType.AVAILABLE_RESOURCES)
|
||||
|
||||
def __getattr__(self, key: str):
|
||||
if not key.startswith("_"):
|
||||
raise NotImplementedError(
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
||||
from ray.experimental.client import ray
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from ray import cloudpickle
|
||||
|
||||
import base64
|
||||
|
||||
|
||||
class ClientBaseRef:
|
||||
def __init__(self, id):
|
||||
def __init__(self, id, handle=None):
|
||||
self.id = id
|
||||
self.handle = handle
|
||||
|
||||
def __repr__(self):
|
||||
return "%s(%s)" % (
|
||||
@@ -17,83 +21,243 @@ class ClientBaseRef:
|
||||
def __eq__(self, other):
    """Compare two references by their ``id``.

    Args:
        other: The object to compare against.

    Returns:
        The result of comparing the two ``id`` values, or
        ``NotImplemented`` when ``other`` has no ``id`` attribute so that
        Python falls back to its default comparison (e.g. ``ref == None``
        evaluates to ``False`` instead of raising).
    """
    try:
        return self.id == other.id
    except AttributeError:
        # Not a reference-like object; let the interpreter decide.
        return NotImplemented
|
||||
|
||||
def binary(self):
|
||||
return self.id
|
||||
|
||||
@classmethod
|
||||
def from_remote_ref(cls, ref: ray_client_pb2.RemoteRef):
|
||||
return cls(id=ref.id, handle=ref.handle)
|
||||
|
||||
|
||||
class ClientObjectRef(ClientBaseRef):
|
||||
pass
|
||||
def _unpack_ref(self):
|
||||
return cloudpickle.loads(self.handle)
|
||||
|
||||
|
||||
class ClientActorRef(ClientBaseRef):
|
||||
pass
|
||||
|
||||
|
||||
class ClientRemoteFunc:
|
||||
class ClientStub:
|
||||
pass
|
||||
|
||||
|
||||
class ClientRemoteFunc(ClientStub):
|
||||
"""
|
||||
A stub created on the Ray Client to represent a remote
|
||||
function that can be executed on the cluster.
|
||||
|
||||
This class is allowed to be passed around between remote functions.
|
||||
|
||||
Args:
|
||||
_func: The actual function to execute remotely
|
||||
_name: The original name of the function
|
||||
_ref: The ClientObjectRef of the pickled code of the function, _func
|
||||
_raylet_remote: The Raylet-side ray.remote_function.RemoteFunction
|
||||
for this object
|
||||
"""
|
||||
|
||||
def __init__(self, f):
|
||||
self._func = f
|
||||
self._name = f.__name__
|
||||
self.id = None
|
||||
self._raylet_remote_func = None
|
||||
|
||||
# self._ref can be lazily instantiated. Rather than eagerly creating
|
||||
# function data objects in the server we can put them just before we
|
||||
# execute the function, especially in cases where many @ray.remote
|
||||
# functions exist in a library and only a handful are ever executed by
|
||||
# a user of the library.
|
||||
#
|
||||
# TODO(barakmich): This ref might actually be better as a serialized
|
||||
# ObjectRef. This requires being able to serialize the ref without
|
||||
# pinning it (as the lifetime of the ref is tied with the server, not
|
||||
# the client)
|
||||
self._ref = None
|
||||
self._raylet_remote = None
|
||||
|
||||
def __call__(self, *args, **kwargs):
    """Reject direct invocation of the remote function stub.

    The stub is only meant to be executed through ``.remote(...)``, so
    calling it like a plain function always fails.

    Raises:
        TypeError: always; the message names the ``.remote`` method to use.
    """
    # The ``f`` prefix has to sit on the literal that contains the
    # placeholder: adjacent literals are formatted independently, so an
    # un-prefixed second half would print "{self._name}" verbatim.
    raise TypeError("Remote function cannot be called directly. "
                    f"Use {self._name}.remote method instead")
|
||||
|
||||
def remote(self, *args, **kwargs):
|
||||
return ray.call_remote(self, ray_client_pb2.ClientTask.FUNCTION, *args,
|
||||
**kwargs)
|
||||
return ray.call_remote(self, *args, **kwargs)
|
||||
|
||||
def _get_ray_remote_impl(self):
|
||||
if self._raylet_remote is None:
|
||||
self._raylet_remote = ray.remote(self._func)
|
||||
return self._raylet_remote
|
||||
|
||||
def __repr__(self):
|
||||
return "ClientRemoteFunc(%s, %s)" % (self._name, self.id)
|
||||
return "ClientRemoteFunc(%s, %s)" % (self._name, self._ref)
|
||||
|
||||
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
|
||||
if self._ref is None:
|
||||
self._ref = ray.put(self._func)
|
||||
task = ray_client_pb2.ClientTask()
|
||||
task.type = ray_client_pb2.ClientTask.FUNCTION
|
||||
task.name = self._name
|
||||
task.payload_id = self._ref.handle
|
||||
return task
|
||||
|
||||
|
||||
class ClientActorClass:
|
||||
class ClientActorClass(ClientStub):
|
||||
""" A stub created on the Ray Client to represent an actor class.
|
||||
|
||||
It is wrapped by ray.remote and can be executed on the cluster.
|
||||
|
||||
Args:
|
||||
actor_cls: The actual class to execute remotely
|
||||
_name: The original name of the class
|
||||
_ref: The ClientObjectRef of the pickled `actor_cls`
|
||||
_raylet_remote: The Raylet-side ray.ActorClass for this object
|
||||
"""
|
||||
|
||||
def __init__(self, actor_cls):
|
||||
self.actor_cls = actor_cls
|
||||
self._name = actor_cls.__name__
|
||||
self._ref = None
|
||||
self._raylet_remote = None
|
||||
|
||||
def __call__(self, *args, **kwargs):
    """Reject direct instantiation of the actor class stub.

    Actors are created through ``.remote(...)``; calling the stub like a
    regular class always fails.

    Raises:
        TypeError: always; the message names the ``.remote()`` call to use.
    """
    # The ``f`` prefix has to sit on the literal that contains the
    # placeholder: adjacent literals are formatted independently, so an
    # un-prefixed second half would print "{self._name}" verbatim.
    raise TypeError("Remote actor cannot be instantiated directly. "
                    f"Use {self._name}.remote() instead")
|
||||
|
||||
def __getstate__(self) -> Dict:
|
||||
state = {
|
||||
"actor_cls": self.actor_cls,
|
||||
"_name": self._name,
|
||||
"_ref": self._ref,
|
||||
}
|
||||
return state
|
||||
|
||||
def __setstate__(self, state: Dict) -> None:
|
||||
self.actor_cls = state["actor_cls"]
|
||||
self._name = state["_name"]
|
||||
self._ref = state["_ref"]
|
||||
|
||||
def remote(self, *args, **kwargs):
|
||||
# Actually instantiate the actor
|
||||
ref = ray.call_remote(self, ray_client_pb2.ClientTask.ACTOR, *args,
|
||||
**kwargs)
|
||||
return ClientActorHandle(ref, self)
|
||||
ref = ray.call_remote(self, *args, **kwargs)
|
||||
return ClientActorHandle(ClientActorRef(ref.id, ref.handle), self)
|
||||
|
||||
def __repr__(self):
|
||||
return "ClientRemoteActor(%s, %s)" % (self._name, self.id)
|
||||
return "ClientRemoteActor(%s, %s)" % (self._name, self._ref)
|
||||
|
||||
def __getattr__(self, key):
|
||||
if key not in self.__dict__:
|
||||
raise AttributeError("Not a class attribute")
|
||||
raise NotImplementedError("static methods")
|
||||
|
||||
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
|
||||
if self._ref is None:
|
||||
self._ref = ray.put(self.actor_cls)
|
||||
task = ray_client_pb2.ClientTask()
|
||||
task.type = ray_client_pb2.ClientTask.ACTOR
|
||||
task.name = self._name
|
||||
task.payload_id = self._ref.handle
|
||||
return task
|
||||
|
||||
class ClientActorHandle:
|
||||
def __init__(self, actor_id: ClientActorRef,
|
||||
|
||||
class ClientActorHandle(ClientStub):
|
||||
"""Client-side stub for instantiated actor.
|
||||
|
||||
A stub created on the Ray Client to represent a remote actor that
|
||||
has been started on the cluster. This class is allowed to be passed
|
||||
around between remote functions.
|
||||
|
||||
Args:
|
||||
actor_ref: A reference to the running actor given to the client. This
|
||||
is a serialized version of the actual handle as an opaque token.
|
||||
actor_class: A reference to the ClientActorClass that this actor was
|
||||
instantiated from.
|
||||
_real_actor_handle: Cached copy of the Raylet-side
|
||||
ray.actor.ActorHandle contained in the actor_id ref.
|
||||
"""
|
||||
|
||||
def __init__(self, actor_ref: ClientActorRef,
|
||||
actor_class: ClientActorClass):
|
||||
self.actor_id = actor_id
|
||||
self.actor_ref = actor_ref
|
||||
self.actor_class = actor_class
|
||||
self._real_actor_handle = None
|
||||
|
||||
def _get_ray_remote_impl(self):
|
||||
if self._real_actor_handle is None:
|
||||
self._real_actor_handle = cloudpickle.loads(self.actor_ref.handle)
|
||||
return self._real_actor_handle
|
||||
|
||||
def __getstate__(self) -> Dict:
|
||||
state = {
|
||||
"actor_ref": self.actor_ref,
|
||||
"actor_class": self.actor_class,
|
||||
"_real_actor_handle": self._real_actor_handle,
|
||||
}
|
||||
return state
|
||||
|
||||
def __setstate__(self, state: Dict) -> None:
|
||||
self.actor_ref = state["actor_ref"]
|
||||
self.actor_class = state["actor_class"]
|
||||
self._real_actor_handle = state["_real_actor_handle"]
|
||||
|
||||
@property
|
||||
def _actor_id(self):
|
||||
return self.actor_ref.id
|
||||
|
||||
def __getattr__(self, key):
|
||||
return ClientRemoteMethod(self, key)
|
||||
|
||||
def __repr__(self):
|
||||
return "ClientActorHandle(%s)" % (self.actor_ref.id.hex())
|
||||
|
||||
|
||||
class ClientRemoteMethod(ClientStub):
|
||||
"""A stub for a method on a remote actor.
|
||||
|
||||
Can be annotated with execution options.
|
||||
|
||||
Args:
|
||||
actor_handle: A reference to the ClientActorHandle that generated
|
||||
this method and will have this method called upon it.
|
||||
method_name: The name of this method
|
||||
"""
|
||||
|
||||
class ClientRemoteMethod:
|
||||
def __init__(self, actor_handle: ClientActorHandle, method_name: str):
|
||||
self.actor_handle = actor_handle
|
||||
self.method_name = method_name
|
||||
self._name = "%s.%s" % (self.actor_handle.actor_class._name,
|
||||
self.method_name)
|
||||
|
||||
def __call__(self, *args, **kwargs):
    """Reject direct invocation of the remote method stub.

    Actor methods are executed through ``.remote(...)``; calling the stub
    like a bound method always fails.

    Raises:
        TypeError: always; the message names the ``.remote()`` call to use.
    """
    # The ``f`` prefix has to sit on the literal that contains the
    # placeholder: adjacent literals are formatted independently, so an
    # un-prefixed second half would print "{self._name}" verbatim.
    raise TypeError("Remote method cannot be called directly. "
                    f"Use {self._name}.remote() instead")
|
||||
|
||||
def _get_ray_remote_impl(self):
|
||||
return getattr(self.actor_handle._get_ray_remote_impl(),
|
||||
self.method_name)
|
||||
|
||||
def __getstate__(self) -> Dict:
|
||||
state = {
|
||||
"actor_handle": self.actor_handle,
|
||||
"method_name": self.method_name,
|
||||
}
|
||||
return state
|
||||
|
||||
def __setstate__(self, state: Dict) -> None:
    """Restore a method stub from the dict produced by ``__getstate__``.

    Args:
        state: Mapping with the ``actor_handle`` and ``method_name`` keys.
    """
    self.actor_handle = state["actor_handle"]
    self.method_name = state["method_name"]
    # ``_name`` is not part of the pickled state, so rebuild it exactly as
    # ``__init__`` does; otherwise ``__call__`` on an unpickled stub would
    # hit AttributeError instead of raising its intended TypeError.
    self._name = "%s.%s" % (self.actor_handle.actor_class._name,
                            self.method_name)
|
||||
|
||||
def remote(self, *args, **kwargs):
|
||||
return ray.call_remote(self, ray_client_pb2.ClientTask.METHOD, *args,
|
||||
**kwargs)
|
||||
return ray.call_remote(self, *args, **kwargs)
|
||||
|
||||
def __repr__(self):
    """Return ``ClientRemoteMethod(<ActorClass>.<method>, <actor id hex>)``."""
    name = "%s.%s" % (self.actor_handle.actor_class._name,
                      self.method_name)
    # Read the id through ``actor_ref``. The handle has no ``actor_id``
    # attribute, and its ``__getattr__`` turns every unknown name into a new
    # ClientRemoteMethod -- formatting that object would call this method
    # again and recurse without bound.
    # The id is rendered with ``.hex()`` as ClientActorHandle.__repr__ does
    # (assumes the id is ``bytes``).
    return "ClientRemoteMethod(%s, %s)" % (
        name, self.actor_handle.actor_ref.id.hex())
|
||||
|
||||
def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
|
||||
task = ray_client_pb2.ClientTask()
|
||||
task.type = ray_client_pb2.ClientTask.METHOD
|
||||
task.name = self.method_name
|
||||
task.payload_id = self.actor_handle.actor_ref.handle
|
||||
return task
|
||||
|
||||
|
||||
def convert_from_arg(pb) -> Any:
|
||||
@@ -114,3 +278,13 @@ def convert_to_arg(val):
|
||||
out.local = ray_client_pb2.Arg.Locality.INTERNED
|
||||
out.data = cloudpickle.dumps(val)
|
||||
return out
|
||||
|
||||
|
||||
def encode_exception(exception) -> str:
|
||||
data = cloudpickle.dumps(exception)
|
||||
return base64.standard_b64encode(data).decode()
|
||||
|
||||
|
||||
def decode_exception(data) -> Exception:
|
||||
data = base64.standard_b64decode(data)
|
||||
return cloudpickle.loads(data)
|
||||
|
||||
@@ -7,18 +7,36 @@
|
||||
# While the stub is trivial, it allows us to check that the calls we're
|
||||
# making into the core-ray module are contained and well-defined.
|
||||
|
||||
from typing import Any
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import ray
|
||||
|
||||
from ray.experimental.client.api import APIImpl
|
||||
from ray.experimental.client.common import ClientRemoteFunc
|
||||
from ray.experimental.client.common import ClientObjectRef
|
||||
from ray.experimental.client.common import ClientStub
|
||||
|
||||
|
||||
class CoreRayAPI(APIImpl):
|
||||
def get(self, *args, **kwargs):
|
||||
return ray.get(*args, **kwargs)
|
||||
"""
|
||||
Implements the equivalent client-side Ray API by simply passing along to
|
||||
the Core Ray API. Primarily used inside of Ray Workers as a trampoline back
|
||||
to core ray when passed client stubs.
|
||||
"""
|
||||
|
||||
def put(self, *args, **kwargs):
|
||||
return ray.put(*args, **kwargs)
|
||||
def get(self, vals, *, timeout: Optional[float] = None) -> Any:
    """Fetch object values by forwarding to ``ray.get``.

    Client-side references are unpacked into the refs they wrap before
    the call; anything else is handed to ``ray.get`` unchanged.

    Args:
        vals: A single ref or a list of refs. A list is treated as a list
            of ``ClientObjectRef`` when its first element is one.
        timeout: Passed through to ``ray.get`` unchanged.

    Returns:
        Whatever ``ray.get`` returns for the (unpacked) refs.
    """
    if isinstance(vals, ClientObjectRef):
        return ray.get(vals._unpack_ref(), timeout=timeout)
    # ``vals and ...`` guards the ``vals[0]`` probe: an empty list must be
    # passed straight to ray.get instead of raising IndexError here.
    if (isinstance(vals, list) and vals
            and isinstance(vals[0], ClientObjectRef)):
        return ray.get(
            [val._unpack_ref() for val in vals], timeout=timeout)
    return ray.get(vals, timeout=timeout)
|
||||
|
||||
def put(self, vals: Any, *args,
|
||||
**kwargs) -> Union[ClientObjectRef, ray._raylet.ObjectRef]:
|
||||
return ray.put(vals, *args, **kwargs)
|
||||
|
||||
def wait(self, *args, **kwargs):
|
||||
return ray.wait(*args, **kwargs)
|
||||
@@ -26,16 +44,58 @@ class CoreRayAPI(APIImpl):
|
||||
def remote(self, *args, **kwargs):
|
||||
return ray.remote(*args, **kwargs)
|
||||
|
||||
def call_remote(self, f: ClientRemoteFunc, kind: int, *args, **kwargs):
|
||||
if f._raylet_remote_func is None:
|
||||
f._raylet_remote_func = ray.remote(f._func)
|
||||
return f._raylet_remote_func.remote(*args, **kwargs)
|
||||
def call_remote(self, instance: ClientStub, *args, **kwargs):
|
||||
return instance._get_ray_remote_impl().remote(*args, **kwargs)
|
||||
|
||||
def close(self, *args, **kwargs):
|
||||
def close(self) -> None:
|
||||
return None
|
||||
|
||||
def kill(self, actor, *, no_restart=True):
|
||||
return ray.kill(actor, no_restart=no_restart)
|
||||
|
||||
def cancel(self, obj, *, force=False, recursive=True):
|
||||
return ray.cancel(obj, force=force, recursive=recursive)
|
||||
|
||||
def is_initialized(self) -> bool:
|
||||
return ray.is_initialized()
|
||||
|
||||
# Allow for generic fallback to ray.* in remote methods. This allows calls
|
||||
# like ray.nodes() to be run in remote functions even though the client
|
||||
# doesn't currently support them.
|
||||
def __getattr__(self, key: str):
|
||||
return getattr(ray, key)
|
||||
|
||||
|
||||
class RayServerAPI(CoreRayAPI):
|
||||
"""
|
||||
Ray Client server-side API shim. By default, simply calls the default Core
|
||||
Ray API calls, but also accepts scheduling calls from functions running
|
||||
inside of other remote functions that need to create more work.
|
||||
"""
|
||||
|
||||
def __init__(self, server_instance):
|
||||
self.server = server_instance
|
||||
|
||||
# Wrap single item into list if needed before calling server put.
|
||||
def put(self, vals: Any, *args, **kwargs) -> ClientObjectRef:
|
||||
to_put = []
|
||||
single = False
|
||||
if isinstance(vals, list):
|
||||
to_put = vals
|
||||
else:
|
||||
single = True
|
||||
to_put.append(vals)
|
||||
|
||||
out = [self._put(x) for x in to_put]
|
||||
if single:
|
||||
out = out[0]
|
||||
return out
|
||||
|
||||
def _put(self, val: Any):
|
||||
resp = self.server._put_and_retain_obj(val)
|
||||
return ClientObjectRef(resp.id)
|
||||
|
||||
def call_remote(self, instance: ClientStub, *args, **kwargs):
|
||||
task = instance._prepare_client_task()
|
||||
ticket = self.server.Schedule(task, prepared_args=args)
|
||||
return ClientObjectRef(ticket.return_id)
|
||||
|
||||
@@ -3,14 +3,17 @@ from concurrent import futures
|
||||
import grpc
|
||||
from ray import cloudpickle
|
||||
import ray
|
||||
import ray.state
|
||||
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
||||
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
|
||||
import time
|
||||
import inspect
|
||||
from ray.experimental.client import stash_api_for_tests
|
||||
import json
|
||||
from ray.experimental.client import stash_api_for_tests, _set_server_api
|
||||
from ray.experimental.client.common import convert_from_arg
|
||||
from ray.experimental.client.common import encode_exception
|
||||
from ray.experimental.client.common import ClientObjectRef
|
||||
from ray.experimental.client.common import ClientRemoteFunc
|
||||
from ray.experimental.client.server.core_ray_api import RayServerAPI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -23,31 +26,98 @@ class RayletServicer(ray_client_pb2_grpc.RayletDriverServicer):
|
||||
self.registered_actor_classes = {}
|
||||
self._test_mode = test_mode
|
||||
|
||||
def ClusterInfo(self, request,
|
||||
context=None) -> ray_client_pb2.ClusterInfoResponse:
|
||||
resp = ray_client_pb2.ClusterInfoResponse()
|
||||
resp.type = request.type
|
||||
if request.type == ray_client_pb2.ClusterInfoType.CLUSTER_RESOURCES:
|
||||
resources = ray.cluster_resources()
|
||||
# Normalize resources into floats
|
||||
# (the function may return values that are ints)
|
||||
float_resources = {k: float(v) for k, v in resources.items()}
|
||||
resp.resource_table.CopyFrom(
|
||||
ray_client_pb2.ClusterInfoResponse.ResourceTable(
|
||||
table=float_resources))
|
||||
elif request.type == \
|
||||
ray_client_pb2.ClusterInfoType.AVAILABLE_RESOURCES:
|
||||
resources = ray.available_resources()
|
||||
# Normalize resources into floats
|
||||
# (the function may return values that are ints)
|
||||
float_resources = {k: float(v) for k, v in resources.items()}
|
||||
resp.resource_table.CopyFrom(
|
||||
ray_client_pb2.ClusterInfoResponse.ResourceTable(
|
||||
table=float_resources))
|
||||
else:
|
||||
resp.json = self._return_debug_cluster_info(request, context)
|
||||
return resp
|
||||
|
||||
def _return_debug_cluster_info(self, request, context=None) -> str:
|
||||
data = None
|
||||
if request.type == ray_client_pb2.ClusterInfoType.NODES:
|
||||
data = ray.nodes()
|
||||
elif request.type == ray_client_pb2.ClusterInfoType.IS_INITIALIZED:
|
||||
data = ray.is_initialized()
|
||||
else:
|
||||
raise TypeError("Unsupported cluster info type")
|
||||
return json.dumps(data)
|
||||
|
||||
def Terminate(self, request, context=None):
|
||||
if request.WhichOneof("terminate_type") == "task_object":
|
||||
try:
|
||||
object_ref = cloudpickle.loads(request.task_object.handle)
|
||||
ray.cancel(
|
||||
object_ref,
|
||||
force=request.task_object.force,
|
||||
recursive=request.task_object.recursive)
|
||||
except Exception as e:
|
||||
return_exception_in_context(e, context)
|
||||
elif request.WhichOneof("terminate_type") == "actor":
|
||||
try:
|
||||
actor_ref = cloudpickle.loads(request.actor.handle)
|
||||
ray.kill(actor_ref, no_restart=request.actor.no_restart)
|
||||
except Exception as e:
|
||||
return_exception_in_context(e, context)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Client requested termination without providing a valid "
|
||||
"terminate_type")
|
||||
return ray_client_pb2.TerminateResponse(ok=True)
|
||||
|
||||
def GetObject(self, request, context=None):
|
||||
if request.id not in self.object_refs:
|
||||
request_ref = cloudpickle.loads(request.handle)
|
||||
if request_ref.binary() not in self.object_refs:
|
||||
return ray_client_pb2.GetResponse(valid=False)
|
||||
objectref = self.object_refs[request.id]
|
||||
objectref = self.object_refs[request_ref.binary()]
|
||||
logger.info("get: %s" % objectref)
|
||||
item = ray.get(objectref)
|
||||
try:
|
||||
item = ray.get(objectref, timeout=request.timeout)
|
||||
except Exception as e:
|
||||
return_exception_in_context(e, context)
|
||||
item_ser = cloudpickle.dumps(item)
|
||||
return ray_client_pb2.GetResponse(valid=True, data=item_ser)
|
||||
|
||||
def PutObject(self, request, context=None):
|
||||
def PutObject(self, request, context=None) -> ray_client_pb2.PutResponse:
|
||||
obj = cloudpickle.loads(request.data)
|
||||
objectref = self._put_and_retain_obj(obj)
|
||||
pickled_ref = cloudpickle.dumps(objectref)
|
||||
return ray_client_pb2.PutResponse(
|
||||
ref=make_remote_ref(objectref.binary(), pickled_ref))
|
||||
|
||||
def _put_and_retain_obj(self, obj) -> ray.ObjectRef:
|
||||
objectref = ray.put(obj)
|
||||
self.object_refs[objectref.binary()] = objectref
|
||||
logger.info("put: %s" % objectref)
|
||||
return ray_client_pb2.PutResponse(id=objectref.binary())
|
||||
return objectref
|
||||
|
||||
def WaitObject(self, request, context=None) -> ray_client_pb2.WaitResponse:
|
||||
object_refs = [cloudpickle.loads(o) for o in request.object_refs]
|
||||
object_refs = [cloudpickle.loads(o) for o in request.object_handles]
|
||||
num_returns = request.num_returns
|
||||
timeout = request.timeout
|
||||
object_refs_ids = []
|
||||
for object_ref in object_refs:
|
||||
if object_ref.id not in self.object_refs:
|
||||
if object_ref.binary() not in self.object_refs:
|
||||
return ray_client_pb2.WaitResponse(valid=False)
|
||||
object_refs_ids.append(self.object_refs[object_ref.id])
|
||||
object_refs_ids.append(self.object_refs[object_ref.binary()])
|
||||
try:
|
||||
ready_object_refs, remaining_object_refs = ray.wait(
|
||||
object_refs_ids,
|
||||
@@ -59,94 +129,133 @@ class RayletServicer(ray_client_pb2_grpc.RayletDriverServicer):
|
||||
logger.info("wait: %s %s" % (str(ready_object_refs),
|
||||
str(remaining_object_refs)))
|
||||
ready_object_ids = [
|
||||
ready_object_ref.binary() for ready_object_ref in ready_object_refs
|
||||
make_remote_ref(
|
||||
id=ready_object_ref.binary(),
|
||||
handle=cloudpickle.dumps(ready_object_ref),
|
||||
) for ready_object_ref in ready_object_refs
|
||||
]
|
||||
remaining_object_ids = [
|
||||
remaining_object_ref.binary()
|
||||
for remaining_object_ref in remaining_object_refs
|
||||
make_remote_ref(
|
||||
id=remaining_object_ref.binary(),
|
||||
handle=cloudpickle.dumps(remaining_object_ref),
|
||||
) for remaining_object_ref in remaining_object_refs
|
||||
]
|
||||
return ray_client_pb2.WaitResponse(
|
||||
valid=True,
|
||||
ready_object_ids=ready_object_ids,
|
||||
remaining_object_ids=remaining_object_ids)
|
||||
|
||||
def Schedule(self, task, context=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
def Schedule(self, task, context=None,
|
||||
prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
logger.info("schedule: %s %s" %
|
||||
(task.name,
|
||||
ray_client_pb2.ClientTask.RemoteExecType.Name(task.type)))
|
||||
if task.type == ray_client_pb2.ClientTask.FUNCTION:
|
||||
return self._schedule_function(task, context)
|
||||
return self._schedule_function(task, context, prepared_args)
|
||||
elif task.type == ray_client_pb2.ClientTask.ACTOR:
|
||||
return self._schedule_actor(task, context)
|
||||
return self._schedule_actor(task, context, prepared_args)
|
||||
elif task.type == ray_client_pb2.ClientTask.METHOD:
|
||||
return self._schedule_method(task, context)
|
||||
return self._schedule_method(task, context, prepared_args)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Unimplemented Schedule task type: %s" %
|
||||
ray_client_pb2.ClientTask.RemoteExecType.Name(task.type))
|
||||
|
||||
def _schedule_method(self, task: ray_client_pb2.ClientTask,
|
||||
context=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
def _schedule_method(
|
||||
self,
|
||||
task: ray_client_pb2.ClientTask,
|
||||
context=None,
|
||||
prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
actor_handle = self.actor_refs.get(task.payload_id)
|
||||
if actor_handle is None:
|
||||
raise Exception(
|
||||
"Can't run an actor the server doesn't have a handle for")
|
||||
arglist = _convert_args(task.args)
|
||||
arglist = _convert_args(task.args, prepared_args)
|
||||
with stash_api_for_tests(self._test_mode):
|
||||
output = getattr(actor_handle, task.name).remote(*arglist)
|
||||
self.object_refs[output.binary()] = output
|
||||
return ray_client_pb2.ClientTaskTicket(return_id=output.binary())
|
||||
pickled_ref = cloudpickle.dumps(output)
|
||||
return ray_client_pb2.ClientTaskTicket(
|
||||
return_ref=make_remote_ref(output.binary(), pickled_ref))
|
||||
|
||||
def _schedule_actor(self, task: ray_client_pb2.ClientTask,
|
||||
context=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
def _schedule_actor(self,
|
||||
task: ray_client_pb2.ClientTask,
|
||||
context=None,
|
||||
prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
with stash_api_for_tests(self._test_mode):
|
||||
if task.payload_id not in self.registered_actor_classes:
|
||||
actor_class_ref = self.object_refs[task.payload_id]
|
||||
payload_ref = cloudpickle.loads(task.payload_id)
|
||||
if payload_ref.binary() not in self.registered_actor_classes:
|
||||
actor_class_ref = self.object_refs[payload_ref.binary()]
|
||||
actor_class = ray.get(actor_class_ref)
|
||||
if not inspect.isclass(actor_class):
|
||||
raise Exception("Attempting to schedule actor that "
|
||||
"isn't a ClientActorClass.")
|
||||
"isn't a class.")
|
||||
reg_class = ray.remote(actor_class)
|
||||
self.registered_actor_classes[task.payload_id] = reg_class
|
||||
remote_class = self.registered_actor_classes[task.payload_id]
|
||||
arglist = _convert_args(task.args)
|
||||
self.registered_actor_classes[payload_ref.binary()] = reg_class
|
||||
remote_class = self.registered_actor_classes[payload_ref.binary()]
|
||||
arglist = _convert_args(task.args, prepared_args)
|
||||
actor = remote_class.remote(*arglist)
|
||||
actor_ref = actor._actor_id
|
||||
self.actor_refs[actor_ref.binary()] = actor
|
||||
return ray_client_pb2.ClientTaskTicket(return_id=actor_ref.binary())
|
||||
actorhandle = cloudpickle.dumps(actor)
|
||||
self.actor_refs[actorhandle] = actor
|
||||
return ray_client_pb2.ClientTaskTicket(
|
||||
return_ref=make_remote_ref(actor._actor_id.binary(), actorhandle))
|
||||
|
||||
def _schedule_function(self, task: ray_client_pb2.ClientTask,
|
||||
context=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
if task.payload_id not in self.function_refs:
|
||||
funcref = self.object_refs[task.payload_id]
|
||||
def _schedule_function(
|
||||
self,
|
||||
task: ray_client_pb2.ClientTask,
|
||||
context=None,
|
||||
prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
|
||||
payload_ref = cloudpickle.loads(task.payload_id)
|
||||
if payload_ref.binary() not in self.function_refs:
|
||||
funcref = self.object_refs[payload_ref.binary()]
|
||||
func = ray.get(funcref)
|
||||
if not isinstance(func, ClientRemoteFunc):
|
||||
if not inspect.isfunction(func):
|
||||
raise Exception("Attempting to schedule function that "
|
||||
"isn't a ClientRemoteFunc.")
|
||||
self.function_refs[task.payload_id] = func
|
||||
remote_func = self.function_refs[task.payload_id]
|
||||
arglist = _convert_args(task.args)
|
||||
"isn't a function.")
|
||||
self.function_refs[payload_ref.binary()] = ray.remote(func)
|
||||
remote_func = self.function_refs[payload_ref.binary()]
|
||||
arglist = _convert_args(task.args, prepared_args)
|
||||
# Prepare call if we're in a test
|
||||
with stash_api_for_tests(self._test_mode):
|
||||
output = remote_func.remote(*arglist)
|
||||
if output.binary() in self.object_refs:
|
||||
raise Exception("already found it")
|
||||
self.object_refs[output.binary()] = output
|
||||
return ray_client_pb2.ClientTaskTicket(return_id=output.binary())
|
||||
pickled_output = cloudpickle.dumps(output)
|
||||
return ray_client_pb2.ClientTaskTicket(
|
||||
return_ref=make_remote_ref(output.binary(), pickled_output))
|
||||
|
||||
|
||||
def _convert_args(arg_list):
|
||||
def _convert_args(arg_list, prepared_args=None):
|
||||
if prepared_args is not None:
|
||||
return prepared_args
|
||||
out = []
|
||||
for arg in arg_list:
|
||||
t = convert_from_arg(arg)
|
||||
if isinstance(t, ClientObjectRef):
|
||||
out.append(ray.ObjectRef(t.id))
|
||||
out.append(t._unpack_ref())
|
||||
else:
|
||||
out.append(t)
|
||||
return out
|
||||
|
||||
|
||||
def make_remote_ref(id: bytes, handle: bytes) -> ray_client_pb2.RemoteRef:
    """Build a ``RemoteRef`` protobuf message from an id/handle pair.

    Args:
        id: Bytes stored in the message's ``id`` field.
        handle: Bytes stored in the message's ``handle`` field.

    Returns:
        A ``ray_client_pb2.RemoteRef`` with both fields populated.
    """
    remote_ref = ray_client_pb2.RemoteRef(id=id, handle=handle)
    return remote_ref
|
||||
|
||||
|
||||
def return_exception_in_context(err, context):
    """Record ``err`` on a gRPC context as an INTERNAL failure.

    Args:
        err: The exception to report; it is passed through
            ``encode_exception`` and stored as the context details.
        context: The gRPC servicer context, or None. When None, nothing
            is recorded and the call is a no-op.
    """
    if context is None:
        return
    encoded = encode_exception(err)
    context.set_details(encoded)
    context.set_code(grpc.StatusCode.INTERNAL)
|
||||
|
||||
|
||||
def serve(connection_str, test_mode=False):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
|
||||
task_servicer = RayletServicer(test_mode=test_mode)
|
||||
_set_server_api(RayServerAPI(task_servicer))
|
||||
ray_client_pb2_grpc.add_RayletDriverServicer_to_server(
|
||||
task_servicer, server)
|
||||
server.add_insecure_port(connection_str)
|
||||
|
||||
@@ -3,22 +3,29 @@ It implements the Ray API functions that are forwarded through grpc calls
|
||||
to the server.
|
||||
"""
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
from typing import Optional
|
||||
|
||||
import ray.cloudpickle as cloudpickle
|
||||
from ray.util.inspect import is_cython
|
||||
import grpc
|
||||
|
||||
from ray.exceptions import TaskCancelledError
|
||||
import ray.core.generated.ray_client_pb2 as ray_client_pb2
|
||||
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
|
||||
from ray.experimental.client.common import convert_to_arg
|
||||
from ray.experimental.client.common import decode_exception
|
||||
from ray.experimental.client.common import ClientObjectRef
|
||||
from ray.experimental.client.common import ClientActorRef
|
||||
from ray.experimental.client.common import ClientActorClass
|
||||
from ray.experimental.client.common import ClientRemoteMethod
|
||||
from ray.experimental.client.common import ClientActorHandle
|
||||
from ray.experimental.client.common import ClientRemoteFunc
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Worker:
|
||||
def __init__(self,
|
||||
@@ -34,6 +41,7 @@ class Worker:
|
||||
metadata: additional metadata passed in the grpc request headers.
|
||||
"""
|
||||
self.metadata = metadata
|
||||
self.channel = None
|
||||
if stub is None:
|
||||
if secure:
|
||||
credentials = grpc.ssl_channel_credentials()
|
||||
@@ -44,28 +52,32 @@ class Worker:
|
||||
else:
|
||||
self.server = stub
|
||||
|
||||
def get(self, ids):
|
||||
def get(self, vals, *, timeout: Optional[float] = None) -> Any:
|
||||
to_get = []
|
||||
single = False
|
||||
if isinstance(ids, list):
|
||||
to_get = [x.id for x in ids]
|
||||
elif isinstance(ids, ClientObjectRef):
|
||||
to_get = [ids.id]
|
||||
if isinstance(vals, list):
|
||||
to_get = [x.handle for x in vals]
|
||||
elif isinstance(vals, ClientObjectRef):
|
||||
to_get = [vals.handle]
|
||||
single = True
|
||||
else:
|
||||
raise Exception("Can't get something that's not a "
|
||||
"list of IDs or just an ID: %s" % type(ids))
|
||||
out = [self._get(x) for x in to_get]
|
||||
"list of IDs or just an ID: %s" % type(vals))
|
||||
if timeout is None:
|
||||
timeout = 0
|
||||
out = [self._get(x, timeout) for x in to_get]
|
||||
if single:
|
||||
out = out[0]
|
||||
return out
|
||||
|
||||
def _get(self, id: bytes):
|
||||
req = ray_client_pb2.GetRequest(id=id)
|
||||
data = self.server.GetObject(req, metadata=self.metadata)
|
||||
def _get(self, handle: bytes, timeout: float):
|
||||
req = ray_client_pb2.GetRequest(handle=handle, timeout=timeout)
|
||||
try:
|
||||
data = self.server.GetObject(req, metadata=self.metadata)
|
||||
except grpc.RpcError as e:
|
||||
raise decode_exception(e.details())
|
||||
if not data.valid:
|
||||
raise Exception(
|
||||
"Client GetObject returned invalid data: id invalid?")
|
||||
raise TaskCancelledError(handle)
|
||||
return cloudpickle.loads(data.data)
|
||||
|
||||
def put(self, vals):
|
||||
@@ -86,7 +98,7 @@ class Worker:
|
||||
data = cloudpickle.dumps(val)
|
||||
req = ray_client_pb2.PutRequest(data=data)
|
||||
resp = self.server.PutObject(req, metadata=self.metadata)
|
||||
return ClientObjectRef(resp.id)
|
||||
return ClientObjectRef.from_remote_ref(resp.ref)
|
||||
|
||||
def wait(self,
|
||||
object_refs: List[ClientObjectRef],
|
||||
@@ -98,8 +110,8 @@ class Worker:
|
||||
for ref in object_refs:
|
||||
assert isinstance(ref, ClientObjectRef)
|
||||
data = {
|
||||
"object_refs": [
|
||||
cloudpickle.dumps(object_ref) for object_ref in object_refs
|
||||
"object_handles": [
|
||||
object_ref.handle for object_ref in object_refs
|
||||
],
|
||||
"num_returns": num_returns,
|
||||
"timeout": timeout if timeout else -1
|
||||
@@ -110,10 +122,12 @@ class Worker:
|
||||
# TODO(ameer): improve error/exceptions messages.
|
||||
raise Exception("Client Wait request failed. Reference invalid?")
|
||||
client_ready_object_ids = [
|
||||
ClientObjectRef(id) for id in resp.ready_object_ids
|
||||
ClientObjectRef.from_remote_ref(ref)
|
||||
for ref in resp.ready_object_ids
|
||||
]
|
||||
client_remaining_object_ids = [
|
||||
ClientObjectRef(id) for id in resp.remaining_object_ids
|
||||
ClientObjectRef.from_remote_ref(ref)
|
||||
for ref in resp.remaining_object_ids
|
||||
]
|
||||
|
||||
return (client_ready_object_ids, client_remaining_object_ids)
|
||||
@@ -130,50 +144,60 @@ class Worker:
|
||||
raise TypeError("The @ray.remote decorator must be applied to "
|
||||
"either a function or to a class.")
|
||||
|
||||
def call_remote(self, instance, kind, *args, **kwargs):
|
||||
ticket = None
|
||||
if kind == ray_client_pb2.ClientTask.FUNCTION:
|
||||
ticket = self._put_and_schedule(instance, kind, *args, **kwargs)
|
||||
elif kind == ray_client_pb2.ClientTask.ACTOR:
|
||||
ticket = self._put_and_schedule(instance, kind, *args, **kwargs)
|
||||
return ClientActorRef(ticket.return_id)
|
||||
elif kind == ray_client_pb2.ClientTask.METHOD:
|
||||
ticket = self._call_method(instance, *args, **kwargs)
|
||||
|
||||
if ticket is None:
|
||||
raise Exception(
|
||||
"Couldn't call_remote on %s for type %s" % (instance, kind))
|
||||
return ClientObjectRef(ticket.return_id)
|
||||
|
||||
def _call_method(self, instance: ClientRemoteMethod, *args, **kwargs):
|
||||
if not isinstance(instance, ClientRemoteMethod):
|
||||
raise TypeError("Client not passing a ClientRemoteMethod stub")
|
||||
task = ray_client_pb2.ClientTask()
|
||||
task.type = ray_client_pb2.ClientTask.METHOD
|
||||
task.name = instance.method_name
|
||||
task.payload_id = instance.actor_handle.actor_id.id
|
||||
def call_remote(self, instance, *args, **kwargs):
|
||||
task = instance._prepare_client_task()
|
||||
for arg in args:
|
||||
pb_arg = convert_to_arg(arg)
|
||||
task.args.append(pb_arg)
|
||||
logging.debug("Scheduling %s" % task)
|
||||
ticket = self.server.Schedule(task, metadata=self.metadata)
|
||||
return ticket
|
||||
|
||||
def _put_and_schedule(self, item, task_type, *args, **kwargs):
|
||||
if isinstance(item, ClientRemoteFunc):
|
||||
ref = self._put(item)
|
||||
elif isinstance(item, ClientActorClass):
|
||||
ref = self._put(item.actor_cls)
|
||||
else:
|
||||
raise TypeError("Client not passing a ClientRemoteFunc stub")
|
||||
task = ray_client_pb2.ClientTask()
|
||||
task.type = task_type
|
||||
task.name = item._name
|
||||
task.payload_id = ref.id
|
||||
for arg in args:
|
||||
pb_arg = convert_to_arg(arg)
|
||||
task.args.append(pb_arg)
|
||||
ticket = self.server.Schedule(task, metadata=self.metadata)
|
||||
return ticket
|
||||
return ClientObjectRef.from_remote_ref(ticket.return_ref)
|
||||
|
||||
def close(self):
|
||||
self.channel.close()
|
||||
self.server = None
|
||||
if self.channel:
|
||||
self.channel.close()
|
||||
|
||||
def terminate_actor(self, actor: ClientActorHandle,
                    no_restart: bool) -> None:
    """Send a request to the server to terminate ``actor``.

    Args:
        actor: Client-side handle of the actor to terminate; its
            ``actor_ref.handle`` is copied into the request.
        no_restart: Copied verbatim into the request's ``no_restart``
            field.

    Raises:
        ValueError: If ``actor`` is not a ``ClientActorHandle``.
        Exception: Whatever ``decode_exception`` builds from the details
            of a failed RPC.
    """
    if not isinstance(actor, ClientActorHandle):
        raise ValueError("ray.kill() only supported for actors. "
                         "Got: {}.".format(type(actor)))
    term_actor = ray_client_pb2.TerminateRequest.ActorTerminate()
    term_actor.handle = actor.actor_ref.handle
    term_actor.no_restart = no_restart
    term = ray_client_pb2.TerminateRequest(actor=term_actor)
    try:
        # Forward the connection metadata, as the GetObject/PutObject/
        # Schedule calls on this worker do; without it the request
        # headers configured for this client are dropped.
        self.server.Terminate(term, metadata=self.metadata)
    except grpc.RpcError as e:
        raise decode_exception(e.details())
|
||||
|
||||
def terminate_task(self, obj: ClientObjectRef, force: bool,
                   recursive: bool) -> None:
    """Send a request to the server to cancel the task behind ``obj``.

    Args:
        obj: Client-side object ref; its ``handle`` is copied into the
            request.
        force: Copied verbatim into the request's ``force`` field.
        recursive: Copied verbatim into the request's ``recursive``
            field.

    Raises:
        TypeError: If ``obj`` is not a ``ClientObjectRef``.
        Exception: Whatever ``decode_exception`` builds from the details
            of a failed RPC.
    """
    if not isinstance(obj, ClientObjectRef):
        raise TypeError(
            "ray.cancel() only supported for non-actor object refs. "
            f"Got: {type(obj)}.")
    term_object = ray_client_pb2.TerminateRequest.TaskObjectTerminate()
    term_object.handle = obj.handle
    term_object.force = force
    term_object.recursive = recursive
    term = ray_client_pb2.TerminateRequest(task_object=term_object)
    try:
        # Forward the connection metadata, as the GetObject/PutObject/
        # Schedule calls on this worker do; without it the request
        # headers configured for this client are dropped.
        self.server.Terminate(term, metadata=self.metadata)
    except grpc.RpcError as e:
        raise decode_exception(e.details())
|
||||
|
||||
def get_cluster_info(self, type: ray_client_pb2.ClusterInfoType.TypeEnum):
    """Query the server for a piece of cluster information.

    Args:
        type: The ``ClusterInfoType`` enum value identifying which
            information to request.

    Returns:
        The ``resource_table.table`` field when the response's
        ``response_type`` oneof is ``resource_table``; otherwise the
        response's ``json`` field decoded with ``json.loads``.
    """
    req = ray_client_pb2.ClusterInfoRequest()
    req.type = type
    # Forward the connection metadata, as the GetObject/PutObject/Schedule
    # calls on this worker do, so the request headers are not dropped.
    resp = self.server.ClusterInfo(req, metadata=self.metadata)
    if resp.WhichOneof("response_type") == "resource_table":
        return resp.resource_table.table
    return json.loads(resp.json)
|
||||
|
||||
def is_initialized(self) -> bool:
    """Report whether the server side considers Ray initialized.

    Returns:
        False when this worker has no server stub (``self.server`` is
        None); otherwise the result of the ``IS_INITIALIZED`` cluster
        info query.
    """
    if self.server is None:
        return False
    query = ray_client_pb2.ClusterInfoType.IS_INITIALIZED
    return self.get_cluster_info(query)
|
||||
|
||||
@@ -7,8 +7,8 @@ from ray.core.generated.gcs_pb2 import (
|
||||
JobConfig,
|
||||
ErrorTableData,
|
||||
GcsEntry,
|
||||
HeartbeatBatchTableData,
|
||||
HeartbeatTableData,
|
||||
ResourceUsageBatchData,
|
||||
ResourcesData,
|
||||
ObjectTableData,
|
||||
ProfileTableData,
|
||||
TablePrefix,
|
||||
@@ -33,8 +33,8 @@ __all__ = [
|
||||
"ErrorTableData",
|
||||
"ErrorType",
|
||||
"GcsEntry",
|
||||
"HeartbeatBatchTableData",
|
||||
"HeartbeatTableData",
|
||||
"ResourceUsageBatchData",
|
||||
"ResourcesData",
|
||||
"ObjectTableData",
|
||||
"ProfileTableData",
|
||||
"TablePrefix",
|
||||
@@ -55,8 +55,8 @@ FUNCTION_PREFIX = "RemoteFunction:"
|
||||
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
|
||||
REPORTER_CHANNEL = "RAY_REPORTER"
|
||||
|
||||
# xray heartbeats
|
||||
XRAY_HEARTBEAT_BATCH_PATTERN = "HEARTBEAT_BATCH:".encode("ascii")
|
||||
# xray resource usages
|
||||
XRAY_RESOURCES_BATCH_PATTERN = "RESOURCES_BATCH:".encode("ascii")
|
||||
|
||||
# xray job updates
|
||||
XRAY_JOB_PATTERN = "JOB:*".encode("ascii")
|
||||
|
||||
@@ -23,7 +23,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
c_vector[c_string] GetAllProfileInfo()
|
||||
c_vector[c_string] GetAllObjectInfo()
|
||||
unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
|
||||
unique_ptr[c_string] GetAllHeartbeat()
|
||||
unique_ptr[c_string] GetAllResourceUsage()
|
||||
c_vector[c_string] GetAllActorInfo()
|
||||
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
|
||||
c_string GetNodeResourceInfo(const CNodeID &node_id)
|
||||
|
||||
@@ -78,11 +78,11 @@ cdef class GlobalStateAccessor:
|
||||
return c_string(object_info.get().data(), object_info.get().size())
|
||||
return None
|
||||
|
||||
def get_all_heartbeat(self):
|
||||
"""Get newest heartbeat of all nodes from GCS service."""
|
||||
def get_all_resource_usage(self):
|
||||
"""Get newest resource usage of all nodes from GCS service."""
|
||||
cdef unique_ptr[c_string] result
|
||||
with nogil:
|
||||
result = self.inner.get().GetAllHeartbeat()
|
||||
result = self.inner.get().GetAllResourceUsage()
|
||||
if result:
|
||||
return c_string(result.get().data(), result.get().size())
|
||||
return None
|
||||
|
||||
@@ -90,7 +90,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const CTaskOptions &options, c_vector[CObjectID] *return_ids,
|
||||
int max_retries,
|
||||
c_pair[CPlacementGroupID, int64_t] placement_options,
|
||||
c_bool placement_group_capture_child_tasks)
|
||||
c_bool placement_group_capture_child_tasks,
|
||||
c_string debugger_breakpoint)
|
||||
CRayStatus CreateActor(
|
||||
const CRayFunction &function,
|
||||
const c_vector[unique_ptr[CTaskArg]] &args,
|
||||
@@ -101,6 +102,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CPlacementGroupID *placement_group_id)
|
||||
CRayStatus RemovePlacementGroup(
|
||||
const CPlacementGroupID &placement_group_id)
|
||||
CRayStatus WaitPlacementGroupReady(
|
||||
const CPlacementGroupID &placement_group_id, int timeout_ms)
|
||||
void SubmitActorTask(
|
||||
const CActorID &actor_id, const CRayFunction &function,
|
||||
const c_vector[unique_ptr[CTaskArg]] &args,
|
||||
@@ -222,6 +225,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const c_vector[shared_ptr[CRayObject]] &args,
|
||||
const c_vector[CObjectID] &arg_reference_ids,
|
||||
const c_vector[CObjectID] &return_ids,
|
||||
const c_string debugger_breakpoint,
|
||||
c_vector[shared_ptr[CRayObject]] *returns) nogil
|
||||
) task_execution_callback
|
||||
(void(const CWorkerID &) nogil) on_worker_shutdown
|
||||
|
||||
@@ -15,7 +15,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
int64_t raylet_heartbeat_timeout_milliseconds() const
|
||||
|
||||
c_bool light_heartbeat_enabled() const
|
||||
c_bool light_report_resource_usage_enabled() const
|
||||
|
||||
int64_t debug_dump_period_milliseconds() const
|
||||
|
||||
@@ -51,10 +51,6 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
uint64_t object_manager_default_chunk_size() const
|
||||
|
||||
int num_workers_per_process_python() const
|
||||
|
||||
int num_workers_per_process_java() const
|
||||
|
||||
uint32_t maximum_gcs_deletion_batch_size() const
|
||||
|
||||
int64_t max_direct_call_object_size() const
|
||||
@@ -68,3 +64,5 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
c_bool enable_timeline() const
|
||||
|
||||
c_bool automatic_object_deletion_enabled() const
|
||||
|
||||
uint32_t max_grpc_message_size() const
|
||||
|
||||
@@ -14,8 +14,8 @@ cdef class Config:
|
||||
return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def light_heartbeat_enabled():
|
||||
return RayConfig.instance().light_heartbeat_enabled()
|
||||
def light_report_resource_usage_enabled():
|
||||
return RayConfig.instance().light_report_resource_usage_enabled()
|
||||
|
||||
@staticmethod
|
||||
def debug_dump_period_milliseconds():
|
||||
@@ -88,14 +88,6 @@ cdef class Config:
|
||||
def object_manager_default_chunk_size():
|
||||
return RayConfig.instance().object_manager_default_chunk_size()
|
||||
|
||||
@staticmethod
|
||||
def num_workers_per_process_python():
|
||||
return RayConfig.instance().num_workers_per_process_python()
|
||||
|
||||
@staticmethod
|
||||
def num_workers_per_process_java():
|
||||
return RayConfig.instance().num_workers_per_process_java()
|
||||
|
||||
@staticmethod
|
||||
def maximum_gcs_deletion_batch_size():
|
||||
return RayConfig.instance().maximum_gcs_deletion_batch_size()
|
||||
@@ -119,3 +111,7 @@ cdef class Config:
|
||||
@staticmethod
|
||||
def automatic_object_deletion_enabled():
|
||||
return RayConfig.instance().automatic_object_deletion_enabled()
|
||||
|
||||
@staticmethod
|
||||
def max_grpc_message_size():
|
||||
return RayConfig.instance().max_grpc_message_size()
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import ray
|
||||
import ray.worker
|
||||
from ray import profiling
|
||||
|
||||
__all__ = ["free", "global_gc"]
|
||||
MAX_MESSAGE_LENGTH = ray._config.max_grpc_message_size()
|
||||
|
||||
|
||||
def global_gc():
|
||||
@@ -22,7 +24,13 @@ def memory_summary():
|
||||
raylet = ray.nodes()[0]
|
||||
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
|
||||
ray.nodes()[0]["NodeManagerPort"])
|
||||
channel = grpc.insecure_channel(raylet_address)
|
||||
channel = grpc.insecure_channel(
|
||||
raylet_address,
|
||||
options=[
|
||||
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
|
||||
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
|
||||
],
|
||||
)
|
||||
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
|
||||
reply = stub.FormatGlobalMemoryInfo(
|
||||
node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
|
||||
|
||||
@@ -133,7 +133,7 @@ class LogMonitor:
|
||||
job_match = JOB_LOG_PATTERN.match(file_path)
|
||||
if job_match:
|
||||
job_id = job_match.group(2)
|
||||
worker_pid = job_match.group(3)
|
||||
worker_pid = int(job_match.group(3))
|
||||
else:
|
||||
job_id = None
|
||||
worker_pid = None
|
||||
@@ -361,4 +361,5 @@ if __name__ == "__main__":
|
||||
f"failed with the following error:\n{traceback_str}")
|
||||
ray.utils.push_error_to_driver_through_redis(
|
||||
redis_client, ray_constants.LOG_MONITOR_DIED_ERROR, message)
|
||||
logger.error(message)
|
||||
raise e
|
||||
|
||||
@@ -91,7 +91,7 @@ class MemoryMonitor:
|
||||
if not psutil:
|
||||
logger.warn("WARNING: Not monitoring node memory since `psutil` "
|
||||
"is not installed. Install this with "
|
||||
"`pip install psutil` (or ray[debug]) to enable "
|
||||
"`pip install psutil` to enable "
|
||||
"debugging of memory-related crashes.")
|
||||
|
||||
def get_memory_usage(self):
|
||||
|
||||
+20
-14
@@ -85,7 +85,11 @@ class Monitor:
|
||||
This is used to receive notifications about failed components.
|
||||
"""
|
||||
|
||||
def __init__(self, redis_address, autoscaling_config, redis_password=None):
|
||||
def __init__(self,
|
||||
redis_address,
|
||||
autoscaling_config,
|
||||
redis_password=None,
|
||||
prefix_cluster_info=False):
|
||||
# Initialize the Redis clients.
|
||||
ray.state.state._initialize_global_state(
|
||||
redis_address, redis_password=redis_password)
|
||||
@@ -107,8 +111,10 @@ class Monitor:
|
||||
head_node_ip = redis_address.split(":")[0]
|
||||
self.load_metrics = LoadMetrics(local_ip=head_node_ip)
|
||||
if autoscaling_config:
|
||||
self.autoscaler = StandardAutoscaler(autoscaling_config,
|
||||
self.load_metrics)
|
||||
self.autoscaler = StandardAutoscaler(
|
||||
autoscaling_config,
|
||||
self.load_metrics,
|
||||
prefix_cluster_info=prefix_cluster_info)
|
||||
self.autoscaling_config = autoscaling_config
|
||||
else:
|
||||
self.autoscaler = None
|
||||
@@ -139,24 +145,24 @@ class Monitor:
|
||||
self.primary_subscribe_client.subscribe(channel)
|
||||
|
||||
def update_load_metrics(self):
|
||||
"""Fetches heartbeat data from GCS and updates load metrics."""
|
||||
"""Fetches resource usage data from GCS and updates load metrics."""
|
||||
|
||||
all_heartbeat = self.global_state_accessor.get_all_heartbeat()
|
||||
heartbeat_batch_data = \
|
||||
ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
|
||||
for heartbeat_message in heartbeat_batch_data.batch:
|
||||
resource_load = dict(heartbeat_message.resource_load)
|
||||
total_resources = dict(heartbeat_message.resources_total)
|
||||
available_resources = dict(heartbeat_message.resources_available)
|
||||
all_resources = self.global_state_accessor.get_all_resource_usage()
|
||||
resources_batch_data = \
|
||||
ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources)
|
||||
for resource_message in resources_batch_data.batch:
|
||||
resource_load = dict(resource_message.resource_load)
|
||||
total_resources = dict(resource_message.resources_total)
|
||||
available_resources = dict(resource_message.resources_available)
|
||||
|
||||
waiting_bundles, infeasible_bundles = parse_resource_demands(
|
||||
heartbeat_batch_data.resource_load_by_shape)
|
||||
resources_batch_data.resource_load_by_shape)
|
||||
|
||||
pending_placement_groups = list(
|
||||
heartbeat_batch_data.placement_group_load.placement_group_data)
|
||||
resources_batch_data.placement_group_load.placement_group_data)
|
||||
|
||||
# Update the load metrics for this raylet.
|
||||
node_id = ray.utils.binary_to_hex(heartbeat_message.node_id)
|
||||
node_id = ray.utils.binary_to_hex(resource_message.node_id)
|
||||
ip = self.raylet_id_to_ip_map.get(node_id)
|
||||
if ip:
|
||||
self.load_metrics.update(ip, total_resources,
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
linux:
|
||||
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
|
||||
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
|
||||
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
|
||||
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
|
||||
|
||||
darwin:
|
||||
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-macosx_10_13_x86_64.whl
|
||||
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
|
||||
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-macosx_10_13_intel.whl
|
||||
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-macosx_10_13_x86_64.whl
|
||||
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
|
||||
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-macosx_10_13_intel.whl
|
||||
|
||||
win32:
|
||||
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-win_amd64.whl
|
||||
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-win_amd64.whl
|
||||
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-win_amd64.whl
|
||||
"3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-win_amd64.whl
|
||||
"3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-win_amd64.whl
|
||||
"3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-win_amd64.whl
|
||||
|
||||
+11
-10
@@ -339,10 +339,6 @@ class Node:
|
||||
"""Get the cluster Redis password"""
|
||||
return self._ray_params.redis_password
|
||||
|
||||
@property
|
||||
def load_code_from_local(self):
|
||||
return self._ray_params.load_code_from_local
|
||||
|
||||
@property
|
||||
def object_ref_seed(self):
|
||||
"""Get the seed for deterministic generation of object refs"""
|
||||
@@ -723,14 +719,12 @@ class Node:
|
||||
stderr_file=stderr_file,
|
||||
config=self._config,
|
||||
java_worker_options=self._ray_params.java_worker_options,
|
||||
load_code_from_local=self._ray_params.load_code_from_local,
|
||||
huge_pages=self._ray_params.huge_pages,
|
||||
fate_share=self.kernel_fate_share,
|
||||
socket_to_use=self.socket,
|
||||
head_node=self.head,
|
||||
start_initial_python_workers_for_first_job=self._ray_params.
|
||||
start_initial_python_workers_for_first_job,
|
||||
code_search_path=self._ray_params.code_search_path)
|
||||
start_initial_python_workers_for_first_job)
|
||||
assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
|
||||
|
||||
@@ -739,12 +733,19 @@ class Node:
|
||||
raise NotImplementedError
|
||||
|
||||
def start_monitor(self):
|
||||
"""Start the monitor."""
|
||||
"""Start the monitor.
|
||||
|
||||
Autoscaling output goes to these monitor.err/out files, and
|
||||
any modification to these files may break existing
|
||||
cluster launching commands.
|
||||
"""
|
||||
stdout_file, stderr_file = self.get_log_file_handles(
|
||||
"monitor", unique=True)
|
||||
process_info = ray._private.services.start_monitor(
|
||||
self._redis_address,
|
||||
self._logs_dir,
|
||||
stdout_file=subprocess.DEVNULL,
|
||||
stderr_file=subprocess.DEVNULL,
|
||||
stdout_file=stdout_file,
|
||||
stderr_file=stderr_file,
|
||||
autoscaling_config=self._ray_params.autoscaling_config,
|
||||
redis_password=self._ray_params.redis_password,
|
||||
fate_share=self.kernel_fate_share)
|
||||
|
||||
@@ -1,108 +0,0 @@
|
||||
"""
|
||||
Ray operator for Kubernetes.
|
||||
|
||||
Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
|
||||
create_or_update_cluster(), then runs an autoscaling loop in the operator pod
|
||||
executing this script. Writes autoscaling logs to the directory
|
||||
/root/ray-operator-logs.
|
||||
|
||||
In this setup, the ray head node does not run an autoscaler. It is important
|
||||
NOT to supply an --autoscaling-config argument to head node's ray start command
|
||||
in the cluster config when using this operator.
|
||||
|
||||
To run, first create a ConfigMap named ray-operator-configmap from a ray
|
||||
cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
|
||||
|
||||
For example:
|
||||
kubectl create namespace raytest
|
||||
kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
|
||||
kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
|
||||
""" # noqa
|
||||
import os
|
||||
from typing import Any, Dict, IO, Tuple
|
||||
|
||||
import kubernetes
|
||||
import yaml
|
||||
|
||||
from ray._private import services
|
||||
from ray.autoscaler._private.commands import create_or_update_cluster
|
||||
from ray.autoscaler._private.kubernetes import core_api
|
||||
from ray.utils import open_log
|
||||
from ray import ray_constants
|
||||
|
||||
RAY_CLUSTER_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
|
||||
RAY_CONFIG_MAP = "ray-operator-configmap"
|
||||
RAY_CONFIG_DIR = "/root"
|
||||
|
||||
LOG_DIR = "/root/ray-operator-logs"
|
||||
ERR_NAME, OUT_NAME = "ray-operator.err", "ray-operator.out"
|
||||
|
||||
|
||||
def prepare_ray_cluster_config() -> str:
    """Write the cluster config held in the operator ConfigMap to disk.

    Reads the ConfigMap named ``RAY_CONFIG_MAP`` in
    ``RAY_CLUSTER_NAMESPACE``, parses each entry as YAML, overrides
    ``provider.namespace`` with ``RAY_CLUSTER_NAMESPACE`` and writes the
    result to ``RAY_CONFIG_DIR`` under the entry's key.

    Returns:
        Path of the written config file. The ConfigMap is expected to
        hold a single entry; if it holds several, the path of the last
        one written is returned.

    Raises:
        ValueError: If the ConfigMap contains no data entries.
    """
    config_map = core_api().read_namespaced_config_map(
        name=RAY_CONFIG_MAP, namespace=RAY_CLUSTER_NAMESPACE)

    # Fail with a clear message instead of an AttributeError (data is
    # None) or UnboundLocalError (data is empty) further down.
    entries = config_map.data or {}
    if not entries:
        raise ValueError(
            "ConfigMap {!r} in namespace {!r} contains no cluster "
            "config.".format(RAY_CONFIG_MAP, RAY_CLUSTER_NAMESPACE))

    cluster_config_path = None
    # config_map.data consists of a single key:value pair
    for config_file_name, config_string in entries.items():
        config = yaml.safe_load(config_string)
        config["provider"]["namespace"] = RAY_CLUSTER_NAMESPACE
        cluster_config_path = os.path.join(RAY_CONFIG_DIR, config_file_name)
        with open(cluster_config_path, "w") as file:
            yaml.dump(config, file)

    return cluster_config_path
|
||||
|
||||
|
||||
def get_ray_head_pod_ip(config: Dict[str, Any]) -> str:
    """Look up the pod IP of the cluster's single ray head pod.

    Args:
        config: Cluster config; only ``config["cluster_name"]`` is read.

    Returns:
        ``status.pod_ip`` of the one pod in ``RAY_CLUSTER_NAMESPACE``
        matching the head-node label selector for this cluster.
    """
    cluster_name = config["cluster_name"]
    pod_list = core_api().list_namespaced_pod(
        namespace=RAY_CLUSTER_NAMESPACE,
        label_selector=f"component=ray-head,ray-cluster-name={cluster_name}")
    pods = pod_list.items
    # Exactly one head pod is expected for a cluster.
    assert (len(pods)) == 1
    return pods[-1].status.pod_ip
|
||||
|
||||
|
||||
def get_logs() -> Tuple[IO, IO]:
    """Open the operator's stderr and stdout log files.

    Creates ``LOG_DIR`` if it does not exist yet.

    Returns:
        A ``(stderr_file, stdout_file)`` tuple, in that order, opened
        with ``open_log`` on ``ERR_NAME`` and ``OUT_NAME`` inside
        ``LOG_DIR``.

    Raises:
        OSError: If ``LOG_DIR`` cannot be created.
    """
    # exist_ok tolerates only the "directory already exists" case; other
    # failures (e.g. permissions) now surface here instead of being
    # swallowed and reappearing as a confusing error when opening the logs.
    os.makedirs(LOG_DIR, exist_ok=True)

    err_path = os.path.join(LOG_DIR, ERR_NAME)
    out_path = os.path.join(LOG_DIR, OUT_NAME)

    return open_log(err_path), open_log(out_path)
|
||||
|
||||
|
||||
def main():
    """Bring up the ray cluster and start the autoscaling monitor.

    Loads the in-cluster Kubernetes configuration, materializes the
    cluster config from the ConfigMap, creates or updates the cluster,
    writes the resulting config back to the same path, and starts the
    monitor against the head pod's redis address with its output sent
    to the operator log files.
    """
    kubernetes.config.load_incluster_config()
    config_path = prepare_ray_cluster_config()

    cluster_config = create_or_update_cluster(
        config_path,
        override_min_workers=None,
        override_max_workers=None,
        no_restart=False,
        restart_only=False,
        yes=True,
        no_config_cache=True)
    # Persist the config returned by create_or_update_cluster; the
    # monitor below is pointed at this same file.
    with open(config_path, "w") as config_file:
        yaml.dump(cluster_config, config_file)

    head_ip = get_ray_head_pod_ip(cluster_config)
    # TODO: Add support for user-specified redis port and password
    redis_address = services.address(head_ip, ray_constants.DEFAULT_PORT)
    err_log, out_log = get_logs()

    services.start_monitor(
        redis_address,
        stdout_file=out_log,
        stderr_file=err_log,
        autoscaling_config=config_path,
        redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
Ray operator for Kubernetes.
|
||||
|
||||
Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
|
||||
create_or_update_cluster(), then runs an autoscaling loop in the operator pod
|
||||
executing this script. Writes autoscaling logs to the directory
|
||||
/root/ray-operator-logs.
|
||||
|
||||
In this setup, the ray head node does not run an autoscaler. It is important
|
||||
NOT to supply an --autoscaling-config argument to head node's ray start command
|
||||
in the cluster config when using this operator.
|
||||
|
||||
To run, first create a ConfigMap named ray-operator-configmap from a ray
|
||||
cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
|
||||
|
||||
For example:
|
||||
kubectl create namespace raytest
|
||||
kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
|
||||
kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
|
||||
""" # noqa
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
from kubernetes.client.exceptions import ApiException
|
||||
import yaml
|
||||
|
||||
from ray._private import services
|
||||
from ray.autoscaler._private import commands
|
||||
from ray import monitor
|
||||
from ray.operator import operator_utils
|
||||
from ray import ray_constants
|
||||
|
||||
|
||||
class RayCluster:
    """Operator-side handle for a single Ray cluster.

    The work for a cluster (starting the head node, then running the
    monitor) is executed in a multiprocessing.Process named after the
    cluster. A stream handler that only passes log records whose
    processName equals the cluster name is attached to
    operator_utils.root_logger, and its format is prefixed with that name.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the config, derive name and config path, set up logging.

        Args:
            config: Ray cluster config; must contain "cluster_name".
        """
        self.config = config
        self.name = self.config["cluster_name"]
        self.config_path = operator_utils.config_path(self.name)

        self.setup_logging()

        self.subprocess = None  # type: Optional[mp.Process]

    def do_in_subprocess(self,
                         f: Callable[[], None],
                         wait_to_finish: bool = False) -> None:
        """Run f in a fresh subprocess, replacing any running one.

        Args:
            f: Callable executed as the target of the new process.
            wait_to_finish: If True, block until the new process exits.
        """
        # First stop the subprocess if it's alive
        self.clean_up_subprocess()
        # Reinstantiate process with f as target and start.
        self.subprocess = mp.Process(name=self.name, target=f)
        # Kill subprocess if monitor dies
        self.subprocess.daemon = True
        self.subprocess.start()
        if wait_to_finish:
            self.subprocess.join()

    def clean_up_subprocess(self):
        """Terminate and join the subprocess if one exists and is alive."""
        if self.subprocess and self.subprocess.is_alive():
            self.subprocess.terminate()
            self.subprocess.join()

    def create_or_update(self) -> None:
        """Start the head node and the monitor in a subprocess."""
        self.do_in_subprocess(self._create_or_update)

    def _create_or_update(self) -> None:
        """Subprocess target: start the head node, then run the monitor."""
        self.start_head()
        self.start_monitor()

    def start_head(self) -> None:
        """Write the config, create or update the cluster, write it again.

        The config returned by commands.create_or_update_cluster replaces
        self.config before the second write.
        """
        self.write_config()
        self.config = commands.create_or_update_cluster(
            self.config_path,
            override_min_workers=None,
            override_max_workers=None,
            no_restart=False,
            restart_only=False,
            yes=True,
            no_config_cache=True)
        self.write_config()

    def start_monitor(self) -> None:
        """Create a monitor for the head node's redis address and run it."""
        ray_head_pod_ip = commands.get_head_node_ip(self.config_path)
        # TODO: Add support for user-specified redis port and password
        redis_address = services.address(ray_head_pod_ip,
                                         ray_constants.DEFAULT_PORT)
        self.mtr = monitor.Monitor(
            redis_address=redis_address,
            autoscaling_config=self.config_path,
            redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
            prefix_cluster_info=True)
        self.mtr.run()

    def clean_up(self) -> None:
        """Stop the subprocess, detach the log handler, delete the config."""
        self.clean_up_subprocess()
        self.clean_up_logging()
        self.delete_config()

    def setup_logging(self) -> None:
        """Attach a handler for this cluster's records to the root logger."""
        self.handler = logging.StreamHandler()
        # Only records emitted by the process named after this cluster pass.
        self.handler.addFilter(lambda rec: rec.processName == self.name)
        logging_format = ":".join([self.name, ray_constants.LOGGER_FORMAT])
        self.handler.setFormatter(logging.Formatter(logging_format))
        operator_utils.root_logger.addHandler(self.handler)

    def clean_up_logging(self) -> None:
        """Detach this cluster's handler from the root logger."""
        operator_utils.root_logger.removeHandler(self.handler)

    def write_config(self) -> None:
        """Dump self.config as YAML to self.config_path."""
        with open(self.config_path, "w") as file:
            yaml.dump(self.config, file)

    def delete_config(self) -> None:
        """Remove the config file at self.config_path if it exists."""
        # The file is only written by start_head, which runs in the
        # subprocess, so it may not exist yet when clean_up is called.
        try:
            os.remove(self.config_path)
        except FileNotFoundError:
            pass
|
||||
|
||||
|
||||
# Clusters currently managed by this operator, keyed by cluster name.
ray_clusters = {}


def cluster_action(cluster_config: Dict[str, Any], event_type: str) -> None:
    """Apply one watch event to the matching entry of ray_clusters.

    Args:
        cluster_config: Ray cluster config; its "cluster_name" selects the
            entry in ray_clusters.
        event_type: "ADDED" creates and starts a RayCluster, "MODIFIED"
            re-applies the cluster with the new config, "DELETED" cleans
            the cluster up and drops it. Other values are ignored.

    Raises:
        KeyError: For "MODIFIED" or "DELETED" when the cluster name is not
            in ray_clusters.
    """
    cluster_name = cluster_config["cluster_name"]
    if event_type == "ADDED":
        ray_clusters[cluster_name] = RayCluster(cluster_config)
        ray_clusters[cluster_name].create_or_update()
    elif event_type == "MODIFIED":
        ray_cluster = ray_clusters[cluster_name]
        # Adopt the modified config first; otherwise the update would
        # re-apply the config the cluster was originally created with.
        ray_cluster.config = cluster_config
        ray_cluster.create_or_update()
    elif event_type == "DELETED":
        ray_clusters[cluster_name].clean_up()
        del ray_clusters[cluster_name]
|
||||
|
||||
|
||||
def main() -> None:
    """Run the operator control loop over RayCluster custom resources.

    Ensures the directory for cluster configs exists, then converts every
    event of the custom resource stream to a cluster config and passes it
    to cluster_action together with the event type.

    Raises:
        Exception: If an ApiException with status 404 is caught; the
            original exception is attached as the cause.
        ApiException: Re-raised unchanged for any other status.
    """
    # Make directory for ray cluster configs. exist_ok avoids the race
    # between checking for the directory and creating it.
    os.makedirs(operator_utils.RAY_CONFIG_DIR, exist_ok=True)
    # Control loop
    cluster_cr_stream = operator_utils.cluster_cr_stream()
    try:
        for event in cluster_cr_stream:
            cluster_cr = event["object"]
            event_type = event["type"]
            cluster_config = operator_utils.cr_to_config(cluster_cr)
            cluster_action(cluster_config, event_type)
    except ApiException as e:
        if e.status == 404:
            raise Exception(
                "Caught a 404 error. Has the RayCluster CRD been created?"
            ) from e
        raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,114 @@
|
||||
import copy
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict, Iterator, List
|
||||
|
||||
from kubernetes.watch import Watch
|
||||
|
||||
from ray.autoscaler._private.kubernetes import custom_objects_api
|
||||
|
||||
RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
|
||||
|
||||
RAY_CONFIG_DIR = os.path.expanduser("~/ray_cluster_configs")
|
||||
CONFIG_SUFFIX = "_config.yaml"
|
||||
|
||||
CONFIG_FIELDS = {
|
||||
"maxWorkers": "max_workers",
|
||||
"upscalingSpeed": "upscaling_speed",
|
||||
"idleTimeoutMinutes": "idle_timeout_minutes",
|
||||
"headPodType": "head_node_type",
|
||||
"workerDefaultPodType": "worker_default_node_type",
|
||||
"workerStartRayCommands": "worker_start_ray_commands",
|
||||
"headStartRayCommands": "head_start_ray_commands",
|
||||
"podTypes": "available_node_types"
|
||||
}
|
||||
|
||||
NODE_TYPE_FIELDS = {
|
||||
"minWorkers": "min_workers",
|
||||
"maxWorkers": "max_workers",
|
||||
"podConfig": "node_config",
|
||||
"rayResources": "resources",
|
||||
"setupCommands": "worker_setup_commands"
|
||||
}
|
||||
|
||||
PROVIDER_CONFIG = {
|
||||
"type": "kubernetes",
|
||||
"use_internal_ips": True,
|
||||
"namespace": RAY_NAMESPACE
|
||||
}
|
||||
|
||||
root_logger = logging.getLogger("ray")
|
||||
root_logger.setLevel(logging.getLevelName("DEBUG"))
|
||||
"""
|
||||
ownerReferences:
|
||||
- apiVersion: apps/v1
|
||||
controller: true
|
||||
blockOwnerDeletion: true
|
||||
kind: ReplicaSet
|
||||
name: my-repset
|
||||
uid: d9607e19-f88f-11e6-a518-42010a800195
|
||||
"""
|
||||
|
||||
|
||||
def config_path(cluster_name: str) -> str:
    """Return the config file path for the cluster named cluster_name.

    The file is located in RAY_CONFIG_DIR and is named after the cluster
    with CONFIG_SUFFIX appended.
    """
    return os.path.join(RAY_CONFIG_DIR, cluster_name + CONFIG_SUFFIX)
|
||||
|
||||
|
||||
def cluster_cr_stream() -> Iterator:
    """Return a watch stream over the RayCluster custom resources.

    The stream wraps list_namespaced_custom_object for the "rayclusters"
    plural of group "cluster.ray.io", version "v1", in RAY_NAMESPACE.
    """
    resource_selector = dict(
        namespace=RAY_NAMESPACE,
        group="cluster.ray.io",
        version="v1",
        plural="rayclusters")
    return Watch().stream(custom_objects_api().list_namespaced_custom_object,
                          **resource_selector)
|
||||
|
||||
|
||||
def cr_to_config(cluster_resource: Dict[str, Any]) -> Dict[str, Any]:
    """Convert RayCluster custom resource to a ray cluster config for use by the
    autoscaler.

    Args:
        cluster_resource: The custom resource; its "spec" is translated
            with CONFIG_FIELDS and its "metadata" supplies "name" and
            "uid".

    Returns:
        The translated config with "available_node_types", "cluster_name"
        and "provider" filled in.
    """
    cr_spec = cluster_resource["spec"]
    cr_meta = cluster_resource["metadata"]
    config = translate(cr_spec, dictionary=CONFIG_FIELDS)
    config["available_node_types"] = get_node_types(
        cr_spec["podTypes"],
        cluster_name=cr_meta["name"],
        cluster_uid=cr_meta["uid"])
    config["cluster_name"] = cr_meta["name"]
    # Give every config its own provider section so that modifying one
    # config can never leak into PROVIDER_CONFIG or into other configs.
    config["provider"] = copy.deepcopy(PROVIDER_CONFIG)
    return config
|
||||
|
||||
|
||||
def get_node_types(pod_types: List[Dict[str, Any]], cluster_name: str,
                   cluster_uid: str) -> Dict[str, Any]:
    """Translate the pod types of a RayCluster into autoscaler node types.

    Args:
        pod_types: Pod type entries; each has a "name" plus fields listed
            in NODE_TYPE_FIELDS. The input is not modified.
        cluster_name: Name of the owning RayCluster resource.
        cluster_uid: UID of the owning RayCluster resource.

    Returns:
        A dict mapping each pod type name to its translated node type.
        Every node type's node_config metadata carries an ownerReferences
        entry for the cluster.
    """
    node_types = {}
    for pod_type in pod_types:
        name = pod_type["name"]
        pod_type_copy = copy.deepcopy(pod_type)
        pod_type_copy.pop("name")
        node_type = translate(pod_type_copy, dictionary=NODE_TYPE_FIELDS)
        node_config = node_type["node_config"]
        # A pod config may omit metadata (or leave it empty); create the
        # section instead of failing with a KeyError.
        metadata = node_config.get("metadata") or {}
        node_config["metadata"] = metadata
        # Deleting a RayCluster CR will also delete the associated pods.
        # Each node type gets its own reference dict so no object is shared
        # between node types.
        metadata["ownerReferences"] = [
            get_cluster_owner_reference(cluster_name, cluster_uid)
        ]
        node_types[name] = node_type
    return node_types
|
||||
|
||||
|
||||
def get_cluster_owner_reference(cluster_name: str,
                                cluster_uid: str) -> Dict[str, Any]:
    """Build an ownerReferences entry pointing at a RayCluster resource.

    Args:
        cluster_name: Value stored under "name".
        cluster_uid: Value stored under "uid".

    Returns:
        A dict with kind "RayCluster" and both "controller" and
        "blockOwnerDeletion" set to True.
    """
    # NOTE(review): RayCluster resources are watched under group
    # "cluster.ray.io", version "v1" elsewhere in this module, while the
    # apiVersion here is "apps/v1" -- confirm this is intended.
    owner_reference = dict(
        apiVersion="apps/v1",
        controller=True,
        blockOwnerDeletion=True,
        kind="RayCluster")
    owner_reference["name"] = cluster_name
    owner_reference["uid"] = cluster_uid
    return owner_reference
|
||||
|
||||
|
||||
def translate(configuration: Dict[str, Any],
              dictionary: Dict[str, str]) -> Dict[str, Any]:
    """Return a copy of configuration with its keys renamed via dictionary.

    Args:
        configuration: Mapping whose keys are to be renamed.
        dictionary: Maps each key of configuration to its new name.

    Raises:
        KeyError: If a key of configuration is missing from dictionary.
    """
    translated = {}
    for field, value in configuration.items():
        translated[dictionary[field]] = value
    return translated
|
||||
@@ -89,7 +89,6 @@ class RayParams:
|
||||
contents to Redis.
|
||||
autoscaling_config: path to autoscaling config file.
|
||||
java_worker_options (list): The command options for Java worker.
|
||||
load_code_from_local: Whether load code from local file or from GCS.
|
||||
metrics_agent_port(int): The port to bind metrics agent.
|
||||
metrics_export_port(int): The port at which metrics are exposed
|
||||
through a Prometheus endpoint.
|
||||
@@ -142,14 +141,12 @@ class RayParams:
|
||||
include_log_monitor=None,
|
||||
autoscaling_config=None,
|
||||
java_worker_options=None,
|
||||
load_code_from_local=False,
|
||||
start_initial_python_workers_for_first_job=False,
|
||||
_system_config=None,
|
||||
enable_object_reconstruction=False,
|
||||
metrics_agent_port=None,
|
||||
metrics_export_port=None,
|
||||
lru_evict=False,
|
||||
code_search_path=None):
|
||||
lru_evict=False):
|
||||
self.object_ref_seed = object_ref_seed
|
||||
self.redis_address = redis_address
|
||||
self.num_cpus = num_cpus
|
||||
@@ -186,7 +183,6 @@ class RayParams:
|
||||
self.include_log_monitor = include_log_monitor
|
||||
self.autoscaling_config = autoscaling_config
|
||||
self.java_worker_options = java_worker_options
|
||||
self.load_code_from_local = load_code_from_local
|
||||
self.metrics_agent_port = metrics_agent_port
|
||||
self.metrics_export_port = metrics_export_port
|
||||
self.start_initial_python_workers_for_first_job = (
|
||||
@@ -195,9 +191,6 @@ class RayParams:
|
||||
self._lru_evict = lru_evict
|
||||
self._enable_object_reconstruction = enable_object_reconstruction
|
||||
self._check_usage()
|
||||
self.code_search_path = code_search_path
|
||||
if code_search_path is None:
|
||||
self.code_search_path = []
|
||||
|
||||
# Set the internal config options for LRU eviction.
|
||||
if lru_evict:
|
||||
|
||||
@@ -197,7 +197,8 @@ LOG_MONITOR_MAX_OPEN_FILES = 200
|
||||
# The object metadata field uses the following format: It is a comma
|
||||
# separated list of fields. The first field is mandatory and is the
|
||||
# type of the object (see types below) or an integer, which is interpreted
|
||||
# as an error value.
|
||||
# as an error value. The second part is optional and if present has the
|
||||
# form DEBUG:<breakpoint_id>, it is used for implementing the debugger.
|
||||
|
||||
# A constant used as object metadata to indicate the object is cross language.
|
||||
OBJECT_METADATA_TYPE_CROSS_LANGUAGE = b"XLANG"
|
||||
@@ -213,6 +214,9 @@ OBJECT_METADATA_TYPE_RAW = b"RAW"
|
||||
# of XLANG.
|
||||
OBJECT_METADATA_TYPE_ACTOR_HANDLE = b"ACTOR_HANDLE"
|
||||
|
||||
# A constant indicating the debugging part of the metadata (see above).
|
||||
OBJECT_METADATA_DEBUG_PREFIX = b"DEBUG:"
|
||||
|
||||
AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"
|
||||
|
||||
# The default password to prevent redis port scanning attack.
|
||||
|
||||
@@ -153,6 +153,46 @@ class StandardFdRedirectionRotatingFileHandler(RotatingFileHandler):
|
||||
os.dup2(self.stream.fileno(), self.get_original_stream().fileno())
|
||||
|
||||
|
||||
def get_worker_log_file_name(worker_type):
    """Return the log file name (without extension) for the current worker.

    The name has the form "<worker_name>-<worker id hex>-<job id>-<pid>".
    For worker_type "WORKER" the job id is read from the RAY_JOB_ID
    environment variable and the name prefix is "worker"; for any other
    type the nil job id is used and the prefix is "io_worker".
    """
    if worker_type == "WORKER":
        job_id = os.environ.get("RAY_JOB_ID")
        assert job_id is not None, (
            "RAY_JOB_ID should be set as an env "
            "variable within default_worker.py. If you see this error, "
            "please report it to Ray's Github issue.")
        worker_name = "worker"
    else:
        job_id = ray.JobID.nil()
        worker_name = "io_worker"

    # Make sure these values are set already.
    assert ray.worker._global_node is not None
    assert ray.worker.global_worker is not None
    worker_id_hex = binary_to_hex(ray.worker.global_worker.worker_id)
    return f"{worker_name}-{worker_id_hex}-{job_id}-{os.getpid()}"
|
||||
|
||||
|
||||
def configure_log_file(out_file, err_file):
    """Redirect this process's stdout and stderr to the given files.

    Args:
        out_file: Open file object that receives stdout.
        err_file: Open file object that receives stderr.
    """
    out_fd = sys.stdout.fileno()
    err_fd = sys.stderr.fileno()
    # C++ logging requires redirecting the stdout file descriptor. Note that
    # dup2 will automatically close the old file descriptor before overriding
    # it.
    for log_file, target_fd in ((out_file, out_fd), (err_file, err_fd)):
        os.dup2(log_file.fileno(), target_fd)
    # We also manually set sys.stdout and sys.stderr because that seems to
    # have an effect on the output buffering. Without doing this, stdout
    # and stderr are heavily buffered resulting in seemingly lost logging
    # statements. We never want to close the stdout file descriptor, dup2 will
    # close it when necessary and we don't want python's GC to close it.
    reopen_options = dict(unbuffered=True, closefd=False)
    sys.stdout = ray.utils.open_log(out_fd, **reopen_options)
    sys.stderr = ray.utils.open_log(err_fd, **reopen_options)
|
||||
|
||||
|
||||
def setup_and_get_worker_interceptor_logger(args,
|
||||
max_bytes=0,
|
||||
backup_count=0,
|
||||
|
||||
@@ -258,8 +258,12 @@ class RemoteFunction:
|
||||
placement_group.id,
|
||||
placement_group_bundle_index,
|
||||
placement_group_capture_child_tasks,
|
||||
worker.debugger_breakpoint,
|
||||
override_environment_variables=override_environment_variables
|
||||
or dict())
|
||||
# Reset worker's debug context from the last "remote" command
|
||||
# (which applies only to this .remote call).
|
||||
worker.debugger_breakpoint = b""
|
||||
if len(object_refs) == 1:
|
||||
return object_refs[0]
|
||||
elif len(object_refs) > 1:
|
||||
|
||||
@@ -6,6 +6,7 @@ import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from telnetlib import Telnet
|
||||
import time
|
||||
import urllib
|
||||
import urllib.parse
|
||||
@@ -150,6 +151,35 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
|
||||
from None
|
||||
|
||||
|
||||
def continue_debug_session():
    """Continue active debugging session.

    This function will connect 'ray debug' to the right debugger
    when a user is stepping between Ray tasks.
    """
    kv = ray.experimental.internal_kv

    for active_session in kv._internal_kv_list("RAY_PDB_"):
        if not active_session.startswith(b"RAY_PDB_CONTINUE"):
            continue
        print("Continuing pdb session in different process...")
        key = b"RAY_PDB_" + active_session[len("RAY_PDB_CONTINUE_"):]
        # Poll once per second until the session data for key is available.
        while True:
            data = kv._internal_kv_get(key)
            if not data:
                time.sleep(1.0)
                continue
            session = json.loads(data)
            if "exit_debugger" in session:
                kv._internal_kv_del(key)
                return
            host, port = session["pdb_address"].split(":")
            with Telnet(host, int(port)) as tn:
                tn.interact()
            kv._internal_kv_del(key)
            # Look for a further continuation before returning to the caller.
            continue_debug_session()
            return
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option(
|
||||
"--address",
|
||||
@@ -158,12 +188,13 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
|
||||
help="Override the address to connect to.")
|
||||
def debug(address):
|
||||
"""Show all active breakpoints and exceptions in the Ray debugger."""
|
||||
from telnetlib import Telnet
|
||||
if not address:
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
logger.info(f"Connecting to Ray instance at {address}.")
|
||||
ray.init(address=address)
|
||||
ray.init(address=address, log_to_driver=False)
|
||||
while True:
|
||||
continue_debug_session()
|
||||
|
||||
active_sessions = ray.experimental.internal_kv._internal_kv_list(
|
||||
"RAY_PDB_")
|
||||
print("Active breakpoints:")
|
||||
@@ -358,25 +389,12 @@ def debug(address):
|
||||
default=None,
|
||||
type=str,
|
||||
help="Overwrite the options to start Java workers.")
|
||||
@click.option(
|
||||
"--code-search-path",
|
||||
default=None,
|
||||
hidden=True,
|
||||
type=str,
|
||||
help="A list of directories or jar files separated by colon that specify "
|
||||
"the search path for user code. This will be used as `CLASSPATH` in "
|
||||
"Java and `PYTHONPATH` in Python.")
|
||||
@click.option(
|
||||
"--system-config",
|
||||
default=None,
|
||||
hidden=True,
|
||||
type=json.loads,
|
||||
help="Override system configuration defaults.")
|
||||
@click.option(
|
||||
"--load-code-from-local",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Specify whether load code from local file or GCS serialization.")
|
||||
@click.option(
|
||||
"--lru-evict",
|
||||
is_flag=True,
|
||||
@@ -405,8 +423,7 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
|
||||
head, include_dashboard, dashboard_host, dashboard_port, block,
|
||||
plasma_directory, autoscaling_config, no_redirect_worker_output,
|
||||
no_redirect_output, plasma_store_socket_name, raylet_socket_name,
|
||||
temp_dir, java_worker_options, load_code_from_local,
|
||||
code_search_path, system_config, lru_evict,
|
||||
temp_dir, java_worker_options, system_config, lru_evict,
|
||||
enable_object_reconstruction, metrics_export_port, log_style,
|
||||
log_color, verbose):
|
||||
"""Start Ray processes manually on the local machine."""
|
||||
@@ -465,8 +482,6 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
|
||||
dashboard_host=dashboard_host,
|
||||
dashboard_port=dashboard_port,
|
||||
java_worker_options=java_worker_options,
|
||||
load_code_from_local=load_code_from_local,
|
||||
code_search_path=code_search_path,
|
||||
_system_config=system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
@@ -537,6 +552,8 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
|
||||
with cli_logger.group("Next steps"):
|
||||
cli_logger.print(
|
||||
"To connect to this Ray runtime from another node, run")
|
||||
# NOTE(kfstorm): Java driver rely on this line to get the address
|
||||
# of the cluster. Please be careful when updating this line.
|
||||
cli_logger.print(
|
||||
cf.bold(" ray start --address='{}'{}"), redis_address,
|
||||
f" --redis-password='{redis_password}'"
|
||||
@@ -632,7 +649,7 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
|
||||
cli_logger.print(
|
||||
"This command will now block until terminated by a signal.")
|
||||
cli_logger.print(
|
||||
"Runing subprocesses are monitored and a message will be "
|
||||
"Running subprocesses are monitored and a message will be "
|
||||
"printed if any of them terminate unexpectedly.")
|
||||
|
||||
while True:
|
||||
@@ -1273,7 +1290,7 @@ def stack():
|
||||
COMMAND = """
|
||||
pyspy=`which py-spy`
|
||||
if [ ! -e "$pyspy" ]; then
|
||||
echo "ERROR: Please 'pip install py-spy' (or ray[debug]) first"
|
||||
echo "ERROR: Please 'pip install py-spy' first"
|
||||
exit 1
|
||||
fi
|
||||
# Set IFS to iterate over lines instead of over words.
|
||||
|
||||
+15
-8
@@ -17,6 +17,14 @@ py_test(
|
||||
deps = [":serve_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_controller",
|
||||
size = "small",
|
||||
srcs = serve_tests_srcs,
|
||||
tags = ["exclusive"],
|
||||
deps = [":serve_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_backend_worker",
|
||||
size = "small",
|
||||
@@ -35,14 +43,13 @@ py_test(
|
||||
)
|
||||
|
||||
|
||||
# TODO(simon): Test skipped until #11683 fixed.
|
||||
# py_test(
|
||||
# name = "test_failure",
|
||||
# size = "medium",
|
||||
# srcs = serve_tests_srcs,
|
||||
# tags = ["exclusive"],
|
||||
# deps = [":serve_lib"],
|
||||
# )
|
||||
py_test(
|
||||
name = "test_failure",
|
||||
size = "medium",
|
||||
srcs = serve_tests_srcs,
|
||||
tags = ["exclusive"],
|
||||
deps = [":serve_lib"],
|
||||
)
|
||||
|
||||
|
||||
py_test(
|
||||
|
||||
+81
-15
@@ -1,6 +1,9 @@
|
||||
import asyncio
|
||||
import atexit
|
||||
import time
|
||||
from functools import wraps
|
||||
import os
|
||||
from uuid import UUID
|
||||
|
||||
import ray
|
||||
from ray.serve.constants import (DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT,
|
||||
@@ -42,6 +45,8 @@ class Client:
|
||||
self._controller_name = controller_name
|
||||
self._detached = detached
|
||||
self._shutdown = False
|
||||
self._http_host, self._http_port = ray.get(
|
||||
controller.get_http_config.remote())
|
||||
|
||||
# NOTE(simon): Used to cache client.get_handle(endpoint) call. It will
|
||||
# mostly grow in size, it will only shrink when user calls the
|
||||
@@ -62,9 +67,9 @@ class Client:
|
||||
|
||||
def __del__(self):
|
||||
if not self._detached:
|
||||
logger.info("Shutting down Ray Serve because client went out of "
|
||||
"scope. To prevent this, either keep a reference to "
|
||||
"the client object or use serve.start(detached=True).")
|
||||
logger.debug("Shutting down Ray Serve because client went out of "
|
||||
"scope. To prevent this, either keep a reference to "
|
||||
"the client or use serve.start(detached=True).")
|
||||
self.shutdown()
|
||||
|
||||
def __reduce__(self):
|
||||
@@ -78,11 +83,34 @@ class Client:
|
||||
Shuts down all processes and deletes all state associated with the
|
||||
instance.
|
||||
"""
|
||||
if not self._shutdown:
|
||||
if (not self._shutdown) and ray.is_initialized():
|
||||
ray.get(self._controller.shutdown.remote())
|
||||
ray.kill(self._controller, no_restart=True)
|
||||
|
||||
# Wait for the named actor entry gets removed as well.
|
||||
started = time.time()
|
||||
while True:
|
||||
try:
|
||||
ray.get_actor(self._controller_name)
|
||||
if time.time() - started > 5:
|
||||
logger.warning(
|
||||
"Waited 5s for Serve to shutdown gracefully but "
|
||||
"the controller is still not cleaned up. "
|
||||
"You can ignore this warning if you are shutting "
|
||||
"down the Ray cluster.")
|
||||
break
|
||||
except ValueError: # actor name is removed
|
||||
break
|
||||
|
||||
self._shutdown = True
|
||||
|
||||
@_ensure_connected
|
||||
def _get_result(self, result_object_id: ray.ObjectRef) -> bool:
|
||||
result_id: UUID = ray.get(result_object_id)
|
||||
result = ray.get(self._controller.wait_for_event.remote(result_id))
|
||||
logger.debug(f"Getting result_id ({result_id}) with result: {result}")
|
||||
return result
|
||||
|
||||
@_ensure_connected
|
||||
def create_endpoint(self,
|
||||
endpoint_name: str,
|
||||
@@ -137,10 +165,33 @@ class Client:
|
||||
"an element of type {}".format(type(method)))
|
||||
upper_methods.append(method.upper())
|
||||
|
||||
ray.get(
|
||||
self._get_result(
|
||||
self._controller.create_endpoint.remote(
|
||||
endpoint_name, {backend: 1.0}, route, upper_methods))
|
||||
|
||||
# Block until the route table has been propagated to all HTTP proxies.
|
||||
if route is not None:
|
||||
|
||||
def check_ready(http_response):
|
||||
return route in http_response.json()
|
||||
|
||||
futures = []
|
||||
for node_id in ray.state.node_ids():
|
||||
future = block_until_http_ready.options(
|
||||
num_cpus=0, resources={
|
||||
node_id: 0.01
|
||||
}).remote(
|
||||
"http://{}:{}/-/routes".format(self._http_host,
|
||||
self._http_port),
|
||||
check_ready=check_ready,
|
||||
timeout=HTTP_PROXY_TIMEOUT)
|
||||
futures.append(future)
|
||||
try:
|
||||
ray.get(futures)
|
||||
except ray.exceptions.RayTaskError:
|
||||
raise TimeoutError("Route not available at HTTP proxies "
|
||||
"after {HTTP_PROXY_TIMEOUT}s.")
|
||||
|
||||
@_ensure_connected
|
||||
def delete_endpoint(self, endpoint: str) -> None:
|
||||
"""Delete the given endpoint.
|
||||
@@ -149,7 +200,7 @@ class Client:
|
||||
"""
|
||||
if endpoint in self._handle_cache:
|
||||
del self._handle_cache[endpoint]
|
||||
ray.get(self._controller.delete_endpoint.remote(endpoint))
|
||||
self._get_result(self._controller.delete_endpoint.remote(endpoint))
|
||||
|
||||
@_ensure_connected
|
||||
def list_endpoints(self) -> Dict[str, Dict[str, Any]]:
|
||||
@@ -193,7 +244,7 @@ class Client:
|
||||
"config_options must be a BackendConfig or dictionary.")
|
||||
if isinstance(config_options, dict):
|
||||
config_options = BackendConfig.parse_obj(config_options)
|
||||
ray.get(
|
||||
self._get_result(
|
||||
self._controller.update_backend_config.remote(
|
||||
backend_tag, config_options))
|
||||
|
||||
@@ -222,7 +273,8 @@ class Client:
|
||||
Args:
|
||||
backend_tag (str): a unique tag assign to identify this backend.
|
||||
func_or_class (callable, class): a function or a class implementing
|
||||
__call__.
|
||||
__call__, returning a JSON-serializable object or a
|
||||
Starlette Response object.
|
||||
actor_init_args (optional): the arguments to pass to the class.
|
||||
initialization method.
|
||||
ray_actor_options (optional): options to be passed into the
|
||||
@@ -290,7 +342,7 @@ class Client:
|
||||
raise TypeError("config must be a BackendConfig or a dictionary.")
|
||||
|
||||
backend_config._validate_complete()
|
||||
ray.get(
|
||||
self._get_result(
|
||||
self._controller.create_backend.remote(backend_tag, backend_config,
|
||||
replica_config))
|
||||
|
||||
@@ -308,7 +360,7 @@ class Client:
|
||||
|
||||
The backend must not currently be used by any endpoints.
|
||||
"""
|
||||
ray.get(self._controller.delete_backend.remote(backend_tag))
|
||||
self._get_result(self._controller.delete_backend.remote(backend_tag))
|
||||
|
||||
@_ensure_connected
|
||||
def set_traffic(self, endpoint_name: str,
|
||||
@@ -327,7 +379,7 @@ class Client:
|
||||
traffic_policy_dictionary (dict): a dictionary maps backend names
|
||||
to their traffic weights. The weights must sum to 1.
|
||||
"""
|
||||
ray.get(
|
||||
self._get_result(
|
||||
self._controller.set_traffic.remote(endpoint_name,
|
||||
traffic_policy_dictionary))
|
||||
|
||||
@@ -353,20 +405,24 @@ class Client:
|
||||
(float, int)) or not 0 <= proportion <= 1:
|
||||
raise TypeError("proportion must be a float from 0 to 1.")
|
||||
|
||||
ray.get(
|
||||
self._get_result(
|
||||
self._controller.shadow_traffic.remote(endpoint_name, backend_tag,
|
||||
proportion))
|
||||
|
||||
@_ensure_connected
|
||||
def get_handle(self,
|
||||
endpoint_name: str,
|
||||
missing_ok: Optional[bool] = False) -> RayServeHandle:
|
||||
missing_ok: Optional[bool] = False,
|
||||
sync: bool = True) -> RayServeHandle:
|
||||
"""Retrieve RayServeHandle for service endpoint to invoke it from Python.
|
||||
|
||||
Args:
|
||||
endpoint_name (str): A registered service endpoint.
|
||||
missing_ok (bool): If true, then Serve won't check the endpoint is
|
||||
registered. False by default.
|
||||
sync (bool): If true, then Serve will return a ServeHandle that
|
||||
works everywhere. Otherwise, Serve will return a ServeHandle
|
||||
that's only usable in asyncio loop.
|
||||
|
||||
Returns:
|
||||
RayServeHandle
|
||||
@@ -375,8 +431,14 @@ class Client:
|
||||
self._controller.get_all_endpoints.remote()):
|
||||
raise KeyError(f"Endpoint '{endpoint_name}' does not exist.")
|
||||
|
||||
if asyncio.get_event_loop().is_running() and sync:
|
||||
logger.warning(
|
||||
"You are retrieving a ServeHandle inside an asyncio loop. "
|
||||
"Try getting client.get_handle(.., sync=False) to get better "
|
||||
"performance.")
|
||||
|
||||
if endpoint_name not in self._handle_cache:
|
||||
handle = RayServeHandle(self._controller, endpoint_name, sync=True)
|
||||
handle = RayServeHandle(self._controller, endpoint_name, sync=sync)
|
||||
self._handle_cache[endpoint_name] = handle
|
||||
return self._handle_cache[endpoint_name]
|
||||
|
||||
@@ -445,7 +507,11 @@ def start(detached: bool = False,
|
||||
"http://{}:{}/-/routes".format(http_host, http_port),
|
||||
timeout=HTTP_PROXY_TIMEOUT)
|
||||
futures.append(future)
|
||||
ray.get(futures)
|
||||
try:
|
||||
ray.get(futures)
|
||||
except ray.exceptions.RayTaskError:
|
||||
raise TimeoutError(
|
||||
"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")
|
||||
|
||||
return Client(controller, controller_name, detached=detached)
|
||||
|
||||
|
||||
@@ -15,10 +15,13 @@ from ray.serve.utils import (parse_request_item, _get_logger, chain_future,
|
||||
from ray.serve.exceptions import RayServeException
|
||||
from ray.util import metrics
|
||||
from ray.serve.config import BackendConfig
|
||||
from ray.serve.long_poll import LongPollerAsyncClient
|
||||
from ray.serve.long_poll import LongPollAsyncClient
|
||||
from ray.serve.router import Query
|
||||
from ray.serve.constants import (DEFAULT_LATENCY_BUCKET_MS,
|
||||
BACKEND_RECONFIGURE_METHOD)
|
||||
from ray.serve.constants import (
|
||||
BACKEND_RECONFIGURE_METHOD,
|
||||
DEFAULT_LATENCY_BUCKET_MS,
|
||||
LongPollKey,
|
||||
)
|
||||
from ray.exceptions import RayTaskError
|
||||
|
||||
logger = _get_logger()
|
||||
@@ -168,8 +171,8 @@ class RayServeReplica:
|
||||
tag_keys=("backend", ))
|
||||
self.request_counter.set_default_tags({"backend": self.backend_tag})
|
||||
|
||||
self.long_poll_client = LongPollerAsyncClient(controller_handle, {
|
||||
"backend_configs": self._update_backend_configs,
|
||||
self.long_poll_client = LongPollAsyncClient(controller_handle, {
|
||||
LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
|
||||
})
|
||||
|
||||
self.error_counter = metrics.Count(
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
cluster_name: default
|
||||
min_workers: 22
|
||||
max_workers: 22
|
||||
initial_workers: 22
|
||||
min_workers: 5
|
||||
max_workers: 5
|
||||
initial_workers: 5
|
||||
autoscaling_mode: default
|
||||
docker:
|
||||
image: 'anyscale/ray-ml:latest'
|
||||
@@ -28,6 +28,7 @@ initialization_commands: []
|
||||
setup_commands:
|
||||
- apt-get install build-essential libssl-dev git -y
|
||||
- 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
|
||||
- ray install-nightly
|
||||
head_setup_commands: []
|
||||
worker_setup_commands: []
|
||||
head_start_ray_commands:
|
||||
|
||||
@@ -23,64 +23,88 @@
|
||||
# 2 forwarders and 5 worker replicas: 620 requests/s
|
||||
# 2 forwarders and 10 worker replicas: 609 requests/s
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray import serve
|
||||
from ray.serve import BackendConfig
|
||||
from ray.serve.utils import logger
|
||||
import time
|
||||
|
||||
num_queries = 2000
|
||||
num_queries = 10000
|
||||
max_concurrent_queries = 100000
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
client = serve.start()
|
||||
|
||||
|
||||
def hello_world(_):
|
||||
def worker(_):
|
||||
return b"Hello World"
|
||||
|
||||
|
||||
class ForwardActor:
|
||||
def __init__(self):
|
||||
def __init__(self, sync: bool):
|
||||
client = serve.connect()
|
||||
self.handle = client.get_handle("hello_world")
|
||||
self.sync = sync
|
||||
self.handle = client.get_handle("worker", sync=sync)
|
||||
|
||||
async def __call__(self, _):
|
||||
await self.handle.remote()
|
||||
if self.sync:
|
||||
await self.handle.remote()
|
||||
else:
|
||||
await (await self.handle.remote_async())
|
||||
|
||||
|
||||
client.create_backend("hello_world", hello_world)
|
||||
client.create_endpoint("hello_world", backend="hello_world")
|
||||
async def run_test(num_replicas, num_forwarders, sync):
|
||||
client = serve.start()
|
||||
client.create_backend(
|
||||
"worker",
|
||||
worker,
|
||||
config=BackendConfig(
|
||||
num_replicas=num_replicas,
|
||||
max_concurrent_queries=max_concurrent_queries,
|
||||
))
|
||||
client.create_endpoint("worker", backend="worker")
|
||||
endpoint_name = "worker"
|
||||
|
||||
client.create_backend("ForwardActor", ForwardActor)
|
||||
client.create_endpoint("ForwardActor", backend="ForwardActor")
|
||||
if num_forwarders > 0:
|
||||
client.create_backend(
|
||||
"ForwardActor",
|
||||
ForwardActor,
|
||||
sync,
|
||||
config=BackendConfig(
|
||||
num_replicas=num_forwarders,
|
||||
max_concurrent_queries=max_concurrent_queries))
|
||||
client.create_endpoint("ForwardActor", backend="ForwardActor")
|
||||
endpoint_name = "ForwardActor"
|
||||
|
||||
|
||||
def run_test(num_replicas, num_forwarders):
|
||||
replicas_config = BackendConfig(num_replicas=num_replicas)
|
||||
client.update_backend_config("hello_world", replicas_config)
|
||||
|
||||
if (num_forwarders == 0):
|
||||
handle = client.get_handle("hello_world")
|
||||
else:
|
||||
forwarders_config = BackendConfig(num_replicas=num_forwarders)
|
||||
client.update_backend_config("ForwardActor", forwarders_config)
|
||||
handle = client.get_handle("ForwardActor")
|
||||
handle = client.get_handle(endpoint_name, sync=sync)
|
||||
|
||||
# warmup - helpful to wait for gc.collect() and actors to start
|
||||
start = time.time()
|
||||
while time.time() - start < 1:
|
||||
ray.get(handle.remote())
|
||||
if sync:
|
||||
ray.get(handle.remote())
|
||||
else:
|
||||
ray.get(await handle.remote_async())
|
||||
|
||||
# real test
|
||||
start = time.time()
|
||||
ray.get([handle.remote() for _ in range(num_queries)])
|
||||
if sync:
|
||||
ray.get([handle.remote() for _ in range(num_queries)])
|
||||
else:
|
||||
ray.get([(await handle.remote_async()) for _ in range(num_queries)])
|
||||
qps = num_queries / (time.time() - start)
|
||||
|
||||
logger.info("{} forwarders and {} worker replicas: {} requests/s".format(
|
||||
num_forwarders, num_replicas, int(qps)))
|
||||
print(
|
||||
f"Sync: {sync}, {num_forwarders} forwarders and {num_replicas} worker "
|
||||
f"replicas: {int(qps)} requests/s")
|
||||
client.shutdown()
|
||||
|
||||
|
||||
for num_forwarders in [0, 1, 2]:
|
||||
for num_replicas in [1, 5, 10]:
|
||||
run_test(num_replicas, num_forwarders)
|
||||
async def main():
    """Run the benchmark once for every configuration.

    Iterates over sync/async handles, then the number of forwarders, then
    the number of worker replicas, awaiting ``run_test`` for each
    combination in turn.
    """
    for sync in [True, False]:
        for num_forwarders in [0, 1, 2]:
            for num_replicas in [1, 5, 10]:
                await run_test(num_replicas, num_forwarders, sync)
|
||||
|
||||
|
||||
asyncio.get_event_loop().run_until_complete(main())
|
||||
|
||||
@@ -86,13 +86,14 @@ async def main():
|
||||
client.create_backend("backend", backend)
|
||||
client.create_endpoint("endpoint", backend="backend", route="/api")
|
||||
for intermediate_handles in [False, True]:
|
||||
if (intermediate_handles):
|
||||
if intermediate_handles:
|
||||
|
||||
client.create_endpoint(
|
||||
"backend", backend="backend", route="/backend")
|
||||
|
||||
class forwardActor:
|
||||
def __init__(self):
|
||||
client = serve.connect()
|
||||
self.handle = client.get_handle("backend")
|
||||
|
||||
def __call__(self, _):
|
||||
|
||||
@@ -36,73 +36,76 @@ from ray import serve
|
||||
from ray.serve import BackendConfig
|
||||
from ray.serve.utils import logger
|
||||
|
||||
from ray.util.placement_group import (placement_group, remove_placement_group)
|
||||
from ray.util.placement_group import placement_group, remove_placement_group
|
||||
|
||||
ray.shutdown()
|
||||
ray.init(address="auto")
|
||||
client = serve.start()
|
||||
|
||||
# These numbers need to correspond with the autoscaler config file.
|
||||
# The number of remote nodes in the autoscaler should upper bound
|
||||
# these because sometimes nodes fail to update.
|
||||
num_workers = 20
|
||||
expected_num_nodes = num_workers + 1
|
||||
cpus_per_node = 4
|
||||
num_remote_cpus = expected_num_nodes * cpus_per_node
|
||||
# We ask for more workers but only need to run on a smaller subset.
|
||||
# This should account for worker nodes that failed to launch.
|
||||
expected_num_nodes = 6
|
||||
num_replicas = 11
|
||||
# wrk HTTP load testing config
|
||||
num_connections = 20
|
||||
num_threads = 2
|
||||
time_to_run = "20s"
|
||||
|
||||
# Wait until the expected number of nodes have joined the cluster.
|
||||
while True:
|
||||
num_nodes = len(ray.nodes())
|
||||
num_nodes = len(list(filter(lambda node: node["Alive"], ray.nodes())))
|
||||
logger.info("Waiting for nodes {}/{}".format(num_nodes,
|
||||
expected_num_nodes))
|
||||
if num_nodes >= expected_num_nodes:
|
||||
break
|
||||
time.sleep(5)
|
||||
|
||||
logger.info("Nodes have all joined. There are %s resources.",
|
||||
ray.cluster_resources())
|
||||
|
||||
client = serve.start()
|
||||
|
||||
|
||||
def hey(_):
    """Backend used by the load test: sleep briefly, then return b"hey".

    The single argument is ignored.
    """
    time.sleep(0.01)  # Sleep for 10ms
    return b"hey"
|
||||
|
||||
|
||||
num_connections = int(num_remote_cpus * 0.75)
|
||||
num_threads = 2
|
||||
time_to_run = "10s"
|
||||
|
||||
pg = placement_group(
|
||||
[{
|
||||
"CPU": 1
|
||||
} for _ in range(expected_num_nodes)], strategy="STRICT_SPREAD")
|
||||
ray.get(pg.ready())
|
||||
|
||||
# The number of replicas is the number of cores remaining after accounting
|
||||
# for the one HTTP proxy actor on each node, the "hey" requester task on each
|
||||
# node, and the serve controller.
|
||||
# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
|
||||
num_replicas = ray.available_resources()["CPU"]
|
||||
logger.info("Starting %i replicas", num_replicas)
|
||||
client.create_backend(
|
||||
"hey", hey, config=BackendConfig(num_replicas=num_replicas))
|
||||
client.create_endpoint("hey", backend="hey", route="/hey")
|
||||
|
||||
|
||||
@ray.remote
|
||||
@ray.remote(num_cpus=0)
|
||||
def run_wrk():
|
||||
logger.info("Warming up for ~3 seconds")
|
||||
for _ in range(5):
|
||||
resp = requests.get("http://127.0.0.1:8000/hey").text
|
||||
logger.info("Received response \'" + resp + "\'")
|
||||
time.sleep(0.5)
|
||||
logger.info("Warming up")
|
||||
for _ in range(10):
|
||||
try:
|
||||
resp = requests.get("http://127.0.0.1:8000/hey").text
|
||||
logger.info("Received response '" + resp + "'")
|
||||
time.sleep(0.5)
|
||||
except Exception as e:
|
||||
logger.info(f"Got exception {e}")
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
"wrk", "-c",
|
||||
str(num_connections), "-t",
|
||||
str(num_threads), "-d", time_to_run, "http://127.0.0.1:8000/hey"
|
||||
"wrk",
|
||||
"-c",
|
||||
str(num_connections),
|
||||
"-t",
|
||||
str(num_threads),
|
||||
"-d",
|
||||
time_to_run,
|
||||
"http://127.0.0.1:8000/hey",
|
||||
],
|
||||
stdout=subprocess.PIPE)
|
||||
stdout=subprocess.PIPE,
|
||||
)
|
||||
return result.stdout.decode()
|
||||
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ initialization_commands: []
|
||||
setup_commands:
|
||||
- apt-get install build-essential libssl-dev git -y
|
||||
- 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
|
||||
- ray install-nightly
|
||||
head_setup_commands: []
|
||||
worker_setup_commands: []
|
||||
head_start_ray_commands:
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from enum import auto, Enum
|
||||
|
||||
#: Actor name used to register controller
|
||||
SERVE_CONTROLLER_NAME = "SERVE_CONTROLLER_ACTOR"
|
||||
|
||||
@@ -37,3 +39,13 @@ DEFAULT_LATENCY_BUCKET_MS = [
|
||||
|
||||
#: Name of backend reconfiguration method implemented by user.
|
||||
BACKEND_RECONFIGURE_METHOD = "reconfigure"
|
||||
|
||||
|
||||
class LongPollKey(Enum):
    """Keys identifying the kinds of state published over long polling."""

    def __repr__(self):
        # Render as "LongPollKey.NAME" rather than the default
        # "<LongPollKey.NAME: value>".
        return f"{self.__class__.__name__}.{self.name}"

    REPLICA_HANDLES = auto()
    TRAFFIC_POLICIES = auto()
    BACKEND_CONFIGS = auto()
    ROUTE_TABLE = auto()
|
||||
|
||||
+191
-129
@@ -6,20 +6,22 @@ import random
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from uuid import uuid4, UUID
|
||||
from pydantic import BaseModel
|
||||
|
||||
import ray
|
||||
import ray.cloudpickle as pickle
|
||||
from ray.serve.autoscaling_policy import BasicAutoscalingPolicy
|
||||
from ray.serve.backend_worker import create_backend_replica
|
||||
from ray.serve.constants import ASYNC_CONCURRENCY, SERVE_PROXY_NAME
|
||||
from ray.serve.constants import (ASYNC_CONCURRENCY, SERVE_PROXY_NAME,
|
||||
LongPollKey)
|
||||
from ray.serve.http_proxy import HTTPProxyActor
|
||||
from ray.serve.kv_store import RayInternalKVStore
|
||||
from ray.serve.exceptions import RayServeException
|
||||
from ray.serve.utils import (format_actor_name, get_random_letters, logger,
|
||||
try_schedule_resources_on_nodes, get_all_node_ids)
|
||||
from ray.serve.config import BackendConfig, ReplicaConfig
|
||||
from ray.serve.long_poll import LongPollerHost
|
||||
from ray.serve.long_poll import LongPollHost
|
||||
from ray.actor import ActorHandle
|
||||
|
||||
import numpy as np
|
||||
@@ -144,7 +146,7 @@ class ActorStateReconciler:
|
||||
controller_name: str = field(init=True)
|
||||
detached: bool = field(init=True)
|
||||
|
||||
routers_cache: Dict[NodeId, ActorHandle] = field(default_factory=dict)
|
||||
http_proxy_cache: Dict[NodeId, ActorHandle] = field(default_factory=dict)
|
||||
backend_replicas: Dict[BackendTag, Dict[ReplicaTag, ActorHandle]] = field(
|
||||
default_factory=lambda: defaultdict(dict))
|
||||
backend_replicas_to_start: Dict[BackendTag, List[ReplicaTag]] = field(
|
||||
@@ -156,8 +158,8 @@ class ActorStateReconciler:
|
||||
|
||||
# TODO(edoakes): consider removing this and just using the names.
|
||||
|
||||
def router_handles(self) -> List[ActorHandle]:
|
||||
return list(self.routers_cache.values())
|
||||
def http_proxy_handles(self) -> List[ActorHandle]:
|
||||
return list(self.http_proxy_cache.values())
|
||||
|
||||
def get_replica_handles(self) -> List[ActorHandle]:
|
||||
return list(
|
||||
@@ -302,7 +304,7 @@ class ActorStateReconciler:
|
||||
async def _stop_pending_backend_replicas(self) -> None:
|
||||
"""Stops the pending backend replicas in self.backend_replicas_to_stop.
|
||||
|
||||
Removes backend_replicas from the router, kills them, and clears
|
||||
Removes backend_replicas from the http_proxy, kills them, and clears
|
||||
self.backend_replicas_to_stop.
|
||||
"""
|
||||
for backend_tag, replicas_list in self.backend_replicas_to_stop.items(
|
||||
@@ -326,26 +328,26 @@ class ActorStateReconciler:
|
||||
|
||||
self.backend_replicas_to_stop.clear()
|
||||
|
||||
def _start_routers_if_needed(self, http_host: str, http_port: str,
|
||||
http_middlewares: List[Any]) -> None:
|
||||
"""Start a router on every node if it doesn't already exist."""
|
||||
def _start_http_proxies_if_needed(self, http_host: str, http_port: str,
|
||||
http_middlewares: List[Any]) -> None:
|
||||
"""Start an HTTP proxy on every node if it doesn't already exist."""
|
||||
if http_host is None:
|
||||
return
|
||||
|
||||
for node_id, node_resource in get_all_node_ids():
|
||||
if node_id in self.routers_cache:
|
||||
if node_id in self.http_proxy_cache:
|
||||
continue
|
||||
|
||||
router_name = format_actor_name(SERVE_PROXY_NAME,
|
||||
self.controller_name, node_id)
|
||||
name = format_actor_name(SERVE_PROXY_NAME, self.controller_name,
|
||||
node_id)
|
||||
try:
|
||||
router = ray.get_actor(router_name)
|
||||
proxy = ray.get_actor(name)
|
||||
except ValueError:
|
||||
logger.info("Starting router with name '{}' on node '{}' "
|
||||
logger.info("Starting HTTP proxy with name '{}' on node '{}' "
|
||||
"listening on '{}:{}'".format(
|
||||
router_name, node_id, http_host, http_port))
|
||||
router = HTTPProxyActor.options(
|
||||
name=router_name,
|
||||
name, node_id, http_host, http_port))
|
||||
proxy = HTTPProxyActor.options(
|
||||
name=name,
|
||||
lifetime="detached" if self.detached else None,
|
||||
max_concurrency=ASYNC_CONCURRENCY,
|
||||
max_restarts=-1,
|
||||
@@ -359,10 +361,10 @@ class ActorStateReconciler:
|
||||
controller_name=self.controller_name,
|
||||
http_middlewares=http_middlewares)
|
||||
|
||||
self.routers_cache[node_id] = router
|
||||
self.http_proxy_cache[node_id] = proxy
|
||||
|
||||
def _stop_routers_if_needed(self) -> bool:
|
||||
"""Removes router actors from any nodes that no longer exist.
|
||||
def _stop_http_proxies_if_needed(self) -> bool:
|
||||
"""Removes HTTP proxy actors from any nodes that no longer exist.
|
||||
|
||||
Returns whether or not any actors were removed (a checkpoint should
|
||||
be taken).
|
||||
@@ -370,25 +372,25 @@ class ActorStateReconciler:
|
||||
actor_stopped = False
|
||||
all_node_ids = {node_id for node_id, _ in get_all_node_ids()}
|
||||
to_stop = []
|
||||
for node_id in self.routers_cache:
|
||||
for node_id in self.http_proxy_cache:
|
||||
if node_id not in all_node_ids:
|
||||
logger.info(
|
||||
"Removing router on removed node '{}'.".format(node_id))
|
||||
logger.info("Removing HTTP proxy on removed node '{}'.".format(
|
||||
node_id))
|
||||
to_stop.append(node_id)
|
||||
|
||||
for node_id in to_stop:
|
||||
router_handle = self.routers_cache.pop(node_id)
|
||||
ray.kill(router_handle, no_restart=True)
|
||||
proxy = self.http_proxy_cache.pop(node_id)
|
||||
ray.kill(proxy, no_restart=True)
|
||||
actor_stopped = True
|
||||
|
||||
return actor_stopped
|
||||
|
||||
def _recover_actor_handles(self) -> None:
|
||||
# Refresh the RouterCache
|
||||
for node_id in self.routers_cache.keys():
|
||||
router_name = format_actor_name(SERVE_PROXY_NAME,
|
||||
self.controller_name, node_id)
|
||||
self.routers_cache[node_id] = ray.get_actor(router_name)
|
||||
for node_id in self.http_proxy_cache.keys():
|
||||
name = format_actor_name(SERVE_PROXY_NAME, self.controller_name,
|
||||
node_id)
|
||||
self.http_proxy_cache[node_id] = ray.get_actor(name)
|
||||
|
||||
# Fetch actor handles for all of the backend replicas in the system.
|
||||
# All of these backend_replicas are guaranteed to already exist because
|
||||
@@ -420,12 +422,19 @@ class ActorStateReconciler:
|
||||
return autoscaling_policies
|
||||
|
||||
|
||||
@dataclass
class FutureResult:
    """Serializable record attached to an in-flight controller request."""

    # Goal requested when this future was created
    requested_goal: Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
class Checkpoint:
    """Controller state that is pickled and written to the KV store."""

    goal_state: SystemState
    current_state: SystemState
    reconciler: ActorStateReconciler
    # TODO(ilr) Rename reconciler to PendingState
    # Keyed by the UUID handed back to the caller of a state-changing call
    # (uuid4 is the generator function, not a type, so annotate with UUID).
    inflight_reqs: Dict[UUID, FutureResult]
|
||||
|
||||
|
||||
@ray.remote
|
||||
@@ -474,7 +483,7 @@ class ServeController:
|
||||
# backend -> AutoscalingPolicy
|
||||
self.autoscaling_policies = dict()
|
||||
|
||||
# Dictionary of backend_tag -> router_name -> most recent queue length.
|
||||
# Dictionary of backend_tag -> proxy_name -> most recent queue length.
|
||||
self.backend_stats = defaultdict(lambda: defaultdict(dict))
|
||||
|
||||
# Used to ensure that only a single state-changing operation happens
|
||||
@@ -487,56 +496,87 @@ class ServeController:
|
||||
|
||||
# If starting the actor for the first time, starts up the other system
|
||||
# components. If recovering, fetches their actor handles.
|
||||
self.actor_reconciler._start_routers_if_needed(
|
||||
self.actor_reconciler._start_http_proxies_if_needed(
|
||||
self.http_host, self.http_port, self.http_middlewares)
|
||||
|
||||
# NOTE(edoakes): unfortunately, we can't completely recover from a
|
||||
# checkpoint in the constructor because we block while waiting for
|
||||
# other actors to start up, and those actors fetch soft state from
|
||||
# this actor. Because no other tasks will start executing until after
|
||||
# the constructor finishes, if we were to run this logic in the
|
||||
# constructor it could lead to deadlock between this actor and a child.
|
||||
# However we do need to guarantee that we have fully recovered from a
|
||||
# checkpoint before any other state-changing calls run. We address this
|
||||
# by acquiring the write_lock and then posting the task to recover from
|
||||
# a checkpoint to the event loop. Other state-changing calls acquire
|
||||
# this lock and will be blocked until recovering from the checkpoint
|
||||
# finishes.
|
||||
# Map of awaiting results
|
||||
# TODO(ilr): Checkpoint this once this becomes asynchronous
|
||||
self.inflight_results: Dict[UUID, asyncio.Event] = dict()
|
||||
self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()
|
||||
|
||||
checkpoint = self.kv_store.get(CHECKPOINT_KEY)
|
||||
if checkpoint is None:
|
||||
logger.debug("No checkpoint found")
|
||||
else:
|
||||
await self.write_lock.acquire()
|
||||
asyncio.get_event_loop().create_task(
|
||||
self._recover_from_checkpoint(checkpoint))
|
||||
await self._recover_from_checkpoint(checkpoint)
|
||||
|
||||
# NOTE(simon): Currently we do all-to-all broadcast. This means
|
||||
# any listeners will receive notification for all changes. This
|
||||
# can be a problem at scale, e.g. updating a single backend config
|
||||
# will send over the entire configs. In the future, we should
|
||||
# optimize the logic to support subscription by key.
|
||||
self.long_poll_host = LongPollerHost()
|
||||
self.long_poll_host = LongPollHost()
|
||||
|
||||
# The configs pushed out here get updated by
|
||||
# self._recover_from_checkpoint in the failure scenario, so that must
|
||||
# be run before we notify the changes.
|
||||
self.notify_backend_configs_changed()
|
||||
self.notify_replica_handles_changed()
|
||||
self.notify_traffic_policies_changed()
|
||||
self.notify_route_table_changed()
|
||||
|
||||
asyncio.get_event_loop().create_task(self.run_control_loop())
|
||||
|
||||
async def wait_for_event(self, uuid: UUID) -> bool:
    """Block until the in-flight result registered under ``uuid`` is set.

    Returns True immediately when ``uuid`` is not tracked. Otherwise waits
    for the associated event, forgets the entry in both in-flight maps,
    writes a checkpoint while holding ``write_lock``, and returns True.
    """
    if uuid not in self.inflight_results:
        return True
    event = self.inflight_results[uuid]
    await event.wait()
    # Several callers may be waiting on the same uuid; all of them wake up
    # when the event is set, so the entries may already have been removed.
    self.inflight_results.pop(uuid, None)
    self._serializable_inflight_results.pop(uuid, None)
    async with self.write_lock:
        self._checkpoint()

    return True
|
||||
|
||||
def _create_event_with_result(
        self,
        goal_state: Dict[str, Any],
        recreation_uuid: Optional[UUID] = None) -> UUID:
    """Register an in-flight result for ``goal_state`` and return its UUID.

    Creates an already-set ``asyncio.Event`` carrying a ``FutureResult``,
    and records it in both ``inflight_results`` (the event) and
    ``_serializable_inflight_results`` (the ``FutureResult`` only).

    Args:
        goal_state: the goal the caller requested.
        recreation_uuid: reuse this UUID instead of generating a new one.
    """
    # NOTE(ilr) Must be called before checkpointing!
    event = asyncio.Event()
    event.result = FutureResult(goal_state)
    event.set()
    uuid_val = recreation_uuid or uuid4()
    self.inflight_results[uuid_val] = event
    self._serializable_inflight_results[uuid_val] = event.result
    return uuid_val
|
||||
|
||||
async def _num_inflight_results(self) -> int:
    """Return the number of entries currently in ``inflight_results``."""
    return len(self.inflight_results)
|
||||
|
||||
def notify_replica_handles_changed(self):
|
||||
self.long_poll_host.notify_changed(
|
||||
"worker_handles", {
|
||||
LongPollKey.REPLICA_HANDLES, {
|
||||
backend_tag: list(replica_dict.values())
|
||||
for backend_tag, replica_dict in
|
||||
self.actor_reconciler.backend_replicas.items()
|
||||
})
|
||||
|
||||
def notify_traffic_policies_changed(self):
|
||||
self.long_poll_host.notify_changed("traffic_policies",
|
||||
self.current_state.traffic_policies)
|
||||
self.long_poll_host.notify_changed(
|
||||
LongPollKey.TRAFFIC_POLICIES,
|
||||
self.current_state.traffic_policies,
|
||||
)
|
||||
|
||||
def notify_backend_configs_changed(self):
|
||||
self.long_poll_host.notify_changed(
|
||||
"backend_configs", self.current_state.get_backend_configs())
|
||||
LongPollKey.BACKEND_CONFIGS,
|
||||
self.current_state.get_backend_configs())
|
||||
|
||||
def notify_route_table_changed(self):
    """Publish the current route table under ``LongPollKey.ROUTE_TABLE``."""
    self.long_poll_host.notify_changed(LongPollKey.ROUTE_TABLE,
                                       self.current_state.routes)
|
||||
|
||||
async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
|
||||
"""Proxy long poll client's listen request.
|
||||
@@ -549,13 +589,9 @@ class ServeController:
|
||||
return await (
|
||||
self.long_poll_host.listen_for_change(keys_to_snapshot_ids))
|
||||
|
||||
def get_routers(self) -> Dict[str, ActorHandle]:
|
||||
"""Returns a dictionary of node ID to router actor handles."""
|
||||
return self.actor_reconciler.routers_cache
|
||||
|
||||
def get_router_config(self) -> Dict[str, Tuple[str, List[str]]]:
|
||||
"""Called by the router on startup to fetch required state."""
|
||||
return self.current_state.routes
|
||||
def get_http_proxies(self) -> Dict[str, ActorHandle]:
    """Returns a dictionary of node ID to http_proxy actor handles."""
    return self.actor_reconciler.http_proxy_cache
|
||||
|
||||
def _checkpoint(self) -> None:
|
||||
"""Checkpoint internal state and write it to the KV store."""
|
||||
@@ -565,7 +601,8 @@ class ServeController:
|
||||
|
||||
checkpoint = pickle.dumps(
|
||||
Checkpoint(self.goal_state, self.current_state,
|
||||
self.actor_reconciler))
|
||||
self.actor_reconciler,
|
||||
self._serializable_inflight_results))
|
||||
|
||||
self.kv_store.put(CHECKPOINT_KEY, checkpoint)
|
||||
logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start))
|
||||
@@ -578,35 +615,51 @@ class ServeController:
|
||||
async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
|
||||
"""Recover the instance state from the provided checkpoint.
|
||||
|
||||
This should be called in the constructor to ensure that the internal
|
||||
state is updated before any other operations run. After running this,
|
||||
internal state will be updated and long-poll clients may be notified.
|
||||
|
||||
Performs the following operations:
|
||||
1) Deserializes the internal state from the checkpoint.
|
||||
2) Pushes the latest configuration to the routers
|
||||
in case we crashed before updating them.
|
||||
3) Starts/stops any replicas that are pending creation or
|
||||
2) Starts/stops any replicas that are pending creation or
|
||||
deletion.
|
||||
|
||||
NOTE: this requires that self.write_lock is already acquired and will
|
||||
release it before returning.
|
||||
"""
|
||||
assert self.write_lock.locked()
|
||||
|
||||
start = time.time()
|
||||
logger.info("Recovering from checkpoint")
|
||||
|
||||
restored_checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
|
||||
# Restore SystemState
|
||||
self.current_state = restored_checkpoint.current_state
|
||||
|
||||
# Restore ActorStateReconciler
|
||||
self.actor_reconciler = restored_checkpoint.reconciler
|
||||
|
||||
self.autoscaling_policies = await self.actor_reconciler.\
|
||||
_recover_from_checkpoint(self.current_state, self)
|
||||
self._serializable_inflight_results = restored_checkpoint.inflight_reqs
|
||||
for uuid, fut_result in self._serializable_inflight_results.items():
|
||||
self._create_event_with_result(fut_result.requested_goal, uuid)
|
||||
|
||||
logger.info(
|
||||
"Recovered from checkpoint in {:.3f}s".format(time.time() - start))
|
||||
# NOTE(edoakes): unfortunately, we can't completely recover from a
|
||||
# checkpoint in the constructor because we block while waiting for
|
||||
# other actors to start up, and those actors fetch soft state from
|
||||
# this actor. Because no other tasks will start executing until after
|
||||
# the constructor finishes, if we were to run this logic in the
|
||||
# constructor it could lead to deadlock between this actor and a child.
|
||||
# However, we do need to guarantee that we have fully recovered from a
|
||||
# checkpoint before any other state-changing calls run. We address this
|
||||
# by acquiring the write_lock and then posting the task to recover from
|
||||
# a checkpoint to the event loop. Other state-changing calls acquire
|
||||
# this lock and will be blocked until recovering from the checkpoint
|
||||
# finishes. This can be removed once we move to the async control loop.
|
||||
|
||||
self.write_lock.release()
|
||||
async def finish_recover_from_checkpoint():
|
||||
assert self.write_lock.locked()
|
||||
self.autoscaling_policies = await self.actor_reconciler.\
|
||||
_recover_from_checkpoint(self.current_state, self)
|
||||
self.write_lock.release()
|
||||
logger.info(
|
||||
"Recovered from checkpoint in {:.3f}s".format(time.time() -
|
||||
start))
|
||||
|
||||
await self.write_lock.acquire()
|
||||
asyncio.get_event_loop().create_task(finish_recover_from_checkpoint())
|
||||
|
||||
async def do_autoscale(self) -> None:
|
||||
for backend, info in self.current_state.backends.items():
|
||||
@@ -623,44 +676,30 @@ class ServeController:
|
||||
while True:
|
||||
await self.do_autoscale()
|
||||
async with self.write_lock:
|
||||
self.actor_reconciler._start_routers_if_needed(
|
||||
self.actor_reconciler._start_http_proxies_if_needed(
|
||||
self.http_host, self.http_port, self.http_middlewares)
|
||||
checkpoint_required = self.actor_reconciler.\
|
||||
_stop_routers_if_needed()
|
||||
_stop_http_proxies_if_needed()
|
||||
if checkpoint_required:
|
||||
self._checkpoint()
|
||||
|
||||
await asyncio.sleep(CONTROL_LOOP_PERIOD_S)
|
||||
|
||||
def get_backend_configs(self) -> Dict[str, BackendConfig]:
|
||||
"""Fetched by the router on startup."""
|
||||
return self.current_state.get_backend_configs()
|
||||
|
||||
def get_traffic_policies(self) -> Dict[str, TrafficPolicy]:
|
||||
"""Fetched by the router on startup."""
|
||||
return self.current_state.traffic_policies
|
||||
|
||||
def _list_replicas(self, backend_tag: BackendTag) -> List[ReplicaTag]:
|
||||
"""Used only for testing."""
|
||||
return list(self.actor_reconciler.backend_replicas[backend_tag].keys())
|
||||
|
||||
def get_traffic_policy(self, endpoint: str) -> TrafficPolicy:
|
||||
"""Fetched by serve handles."""
|
||||
return self.current_state.traffic_policies[endpoint]
|
||||
|
||||
def get_all_replica_handles(self) -> Dict[str, Dict[str, ActorHandle]]:
|
||||
"""Fetched by the router on startup."""
|
||||
def _all_replica_handles(
|
||||
self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
|
||||
"""Used for testing."""
|
||||
return self.actor_reconciler.backend_replicas
|
||||
|
||||
def get_all_backends(self) -> Dict[str, BackendConfig]:
|
||||
def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
|
||||
"""Returns a dictionary of backend tag to backend config."""
|
||||
return self.current_state.get_backend_configs()
|
||||
|
||||
def get_all_endpoints(self) -> Dict[str, Dict[str, Any]]:
|
||||
def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
|
||||
"""Returns a dictionary keyed by endpoint tag describing each endpoint."""
|
||||
return self.current_state.get_endpoints()
|
||||
|
||||
async def _set_traffic(self, endpoint_name: str,
|
||||
traffic_dict: Dict[str, float]) -> None:
|
||||
traffic_dict: Dict[str, float]) -> UUID:
|
||||
if endpoint_name not in self.current_state.get_endpoints():
|
||||
raise ValueError("Attempted to assign traffic for an endpoint '{}'"
|
||||
" that is not registered.".format(endpoint_name))
|
||||
@@ -677,21 +716,25 @@ class ServeController:
|
||||
traffic_policy = TrafficPolicy(traffic_dict)
|
||||
self.current_state.traffic_policies[endpoint_name] = traffic_policy
|
||||
|
||||
return_uuid = self._create_event_with_result({
|
||||
endpoint_name: traffic_policy
|
||||
})
|
||||
# NOTE(edoakes): we must write a checkpoint before pushing the
|
||||
# update to avoid inconsistent state if we crash after pushing the
|
||||
# update.
|
||||
self._checkpoint()
|
||||
|
||||
self.notify_traffic_policies_changed()
|
||||
return return_uuid
|
||||
|
||||
async def set_traffic(self, endpoint_name: str,
|
||||
traffic_dict: Dict[str, float]) -> None:
|
||||
traffic_dict: Dict[str, float]) -> UUID:
|
||||
"""Sets the traffic policy for the specified endpoint."""
|
||||
async with self.write_lock:
|
||||
await self._set_traffic(endpoint_name, traffic_dict)
|
||||
return_uuid = await self._set_traffic(endpoint_name, traffic_dict)
|
||||
return return_uuid
|
||||
|
||||
async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
|
||||
proportion: float) -> None:
|
||||
proportion: float) -> UUID:
|
||||
"""Shadow traffic from the endpoint to the backend."""
|
||||
async with self.write_lock:
|
||||
if endpoint_name not in self.current_state.get_endpoints():
|
||||
@@ -707,16 +750,22 @@ class ServeController:
|
||||
self.current_state.traffic_policies[endpoint_name].set_shadow(
|
||||
backend_tag, proportion)
|
||||
|
||||
traffic_policy = self.current_state.traffic_policies[endpoint_name]
|
||||
|
||||
return_uuid = self._create_event_with_result({
|
||||
endpoint_name: traffic_policy
|
||||
})
|
||||
# NOTE(edoakes): we must write a checkpoint before pushing the
|
||||
# update to avoid inconsistent state if we crash after pushing the
|
||||
# update.
|
||||
self._checkpoint()
|
||||
self.notify_traffic_policies_changed()
|
||||
return return_uuid
|
||||
|
||||
# TODO(architkulkarni): add Optional for route after cloudpickle upgrade
|
||||
async def create_endpoint(self, endpoint: str,
|
||||
traffic_dict: Dict[str, float], route,
|
||||
methods) -> None:
|
||||
methods) -> UUID:
|
||||
"""Create a new endpoint with the specified route and methods.
|
||||
|
||||
If the route is None, this is a "headless" endpoint that will not
|
||||
@@ -755,13 +804,11 @@ class ServeController:
|
||||
self.current_state.routes[route] = (endpoint, methods)
|
||||
|
||||
# NOTE(edoakes): checkpoint is written in self._set_traffic.
|
||||
await self._set_traffic(endpoint, traffic_dict)
|
||||
await asyncio.gather(*[
|
||||
router.set_route_table.remote(self.current_state.routes)
|
||||
for router in self.actor_reconciler.router_handles()
|
||||
])
|
||||
return_uuid = await self._set_traffic(endpoint, traffic_dict)
|
||||
self.notify_route_table_changed()
|
||||
return return_uuid
|
||||
|
||||
async def delete_endpoint(self, endpoint: str) -> None:
|
||||
async def delete_endpoint(self, endpoint: str) -> UUID:
|
||||
"""Delete the specified endpoint.
|
||||
|
||||
Does not modify any corresponding backends.
|
||||
@@ -788,19 +835,20 @@ class ServeController:
|
||||
|
||||
self.actor_reconciler.endpoints_to_remove.append(endpoint)
|
||||
|
||||
return_uuid = self._create_event_with_result({
|
||||
route_to_delete: None,
|
||||
endpoint: None
|
||||
})
|
||||
# NOTE(edoakes): we must write a checkpoint before pushing the
|
||||
# updates to the routers to avoid inconsistent state if we crash
|
||||
# updates to the proxies to avoid inconsistent state if we crash
|
||||
# after pushing the update.
|
||||
self._checkpoint()
|
||||
|
||||
await asyncio.gather(*[
|
||||
router.set_route_table.remote(self.current_state.routes)
|
||||
for router in self.actor_reconciler.router_handles()
|
||||
])
|
||||
self.notify_route_table_changed()
|
||||
return return_uuid
|
||||
|
||||
async def create_backend(self, backend_tag: BackendTag,
|
||||
backend_config: BackendConfig,
|
||||
replica_config: ReplicaConfig) -> None:
|
||||
replica_config: ReplicaConfig) -> UUID:
|
||||
"""Register a new backend under the specified tag."""
|
||||
async with self.write_lock:
|
||||
# Ensures this method is idempotent.
|
||||
@@ -815,12 +863,11 @@ class ServeController:
|
||||
|
||||
# Save creator that starts replicas, the arguments to be passed in,
|
||||
# and the configuration for the backends.
|
||||
self.current_state.add_backend(
|
||||
backend_tag,
|
||||
BackendInfo(
|
||||
worker_class=backend_replica,
|
||||
backend_config=backend_config,
|
||||
replica_config=replica_config))
|
||||
backend_info = BackendInfo(
|
||||
worker_class=backend_replica,
|
||||
backend_config=backend_config,
|
||||
replica_config=replica_config)
|
||||
self.current_state.add_backend(backend_tag, backend_info)
|
||||
metadata = backend_config.internal_metadata
|
||||
if metadata.autoscaling_config is not None:
|
||||
self.autoscaling_policies[
|
||||
@@ -835,6 +882,9 @@ class ServeController:
|
||||
del self.current_state.backends[backend_tag]
|
||||
raise e
|
||||
|
||||
return_uuid = self._create_event_with_result({
|
||||
backend_tag: backend_info
|
||||
})
|
||||
# NOTE(edoakes): we must write a checkpoint before starting new
|
||||
# or pushing the updated config to avoid inconsistent state if we
|
||||
# crash while making the change.
|
||||
@@ -844,11 +894,12 @@ class ServeController:
|
||||
|
||||
self.notify_replica_handles_changed()
|
||||
|
||||
# Set the backend config inside the router
|
||||
# Set the backend config inside routers
|
||||
# (particularly for max_concurrent_queries).
|
||||
self.notify_backend_configs_changed()
|
||||
return return_uuid
|
||||
|
||||
async def delete_backend(self, backend_tag: BackendTag) -> None:
|
||||
async def delete_backend(self, backend_tag: BackendTag) -> UUID:
|
||||
async with self.write_lock:
|
||||
# This method must be idempotent. We should validate that the
|
||||
# specified backend exists on the client.
|
||||
@@ -876,19 +927,21 @@ class ServeController:
|
||||
if backend_tag in self.autoscaling_policies:
|
||||
del self.autoscaling_policies[backend_tag]
|
||||
|
||||
# Add the intention to remove the backend from the router.
|
||||
# Add the intention to remove the backend from the routers.
|
||||
self.actor_reconciler.backends_to_remove.append(backend_tag)
|
||||
|
||||
return_uuid = self._create_event_with_result({backend_tag: None})
|
||||
# NOTE(edoakes): we must write a checkpoint before removing the
|
||||
# backend from the router to avoid inconsistent state if we crash
|
||||
# backend from the routers to avoid inconsistent state if we crash
|
||||
# after pushing the update.
|
||||
self._checkpoint()
|
||||
await self.actor_reconciler._stop_pending_backend_replicas()
|
||||
|
||||
self.notify_replica_handles_changed()
|
||||
return return_uuid
|
||||
|
||||
async def update_backend_config(self, backend_tag: BackendTag,
|
||||
config_options: BackendConfig) -> None:
|
||||
config_options: BackendConfig) -> UUID:
|
||||
"""Set the config for the specified backend."""
|
||||
async with self.write_lock:
|
||||
assert (self.current_state.get_backend(backend_tag)
|
||||
@@ -902,18 +955,22 @@ class ServeController:
|
||||
backend_config._validate_complete()
|
||||
self.current_state.get_backend(
|
||||
backend_tag).backend_config = backend_config
|
||||
backend_info = self.current_state.get_backend(backend_tag)
|
||||
|
||||
# Scale the replicas with the new configuration.
|
||||
self.actor_reconciler._scale_backend_replicas(
|
||||
self.current_state.backends, backend_tag,
|
||||
backend_config.num_replicas)
|
||||
|
||||
return_uuid = self._create_event_with_result({
|
||||
backend_tag: backend_info
|
||||
})
|
||||
# NOTE(edoakes): we must write a checkpoint before pushing the
|
||||
# update to avoid inconsistent state if we crash after pushing the
|
||||
# update.
|
||||
self._checkpoint()
|
||||
|
||||
# Inform the router about change in configuration
|
||||
# Inform the routers about change in configuration
|
||||
# (particularly for setting max_batch_size).
|
||||
|
||||
await self.actor_reconciler._start_pending_backend_replicas(
|
||||
@@ -922,6 +979,7 @@ class ServeController:
|
||||
|
||||
self.notify_replica_handles_changed()
|
||||
self.notify_backend_configs_changed()
|
||||
return return_uuid
|
||||
|
||||
def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
|
||||
"""Get the current config for the specified backend."""
|
||||
@@ -929,11 +987,15 @@ class ServeController:
|
||||
), "Backend {} is not registered.".format(backend_tag)
|
||||
return self.current_state.get_backend(backend_tag).backend_config
|
||||
|
||||
def get_http_config(self):
|
||||
"""Return the HTTP proxy configuration."""
|
||||
return self.http_host, self.http_port
|
||||
|
||||
async def shutdown(self) -> None:
|
||||
"""Shuts down the serve instance completely."""
|
||||
async with self.write_lock:
|
||||
for router in self.actor_reconciler.router_handles():
|
||||
ray.kill(router, no_restart=True)
|
||||
for http_proxy in self.actor_reconciler.http_proxy_handles():
|
||||
ray.kill(http_proxy, no_restart=True)
|
||||
for replica in self.actor_reconciler.get_replica_handles():
|
||||
ray.kill(replica, no_restart=True)
|
||||
self.kv_store.delete(CHECKPOINT_KEY)
|
||||
|
||||
@@ -89,5 +89,6 @@ class RandomEndpointPolicy(EndpointPolicy):
|
||||
query.metadata.shard_key.encode("utf-8"))
|
||||
|
||||
chosen_backend, shadow_backends = self._select_backends(value)
|
||||
logger.debug(f"Chosen backend {chosen_backend} for query {query}")
|
||||
logger.debug(f"Assigning query {query.metadata.request_id} "
|
||||
f"to backend {chosen_backend}.")
|
||||
return [chosen_backend] + shadow_backends
|
||||
|
||||
@@ -7,6 +7,7 @@ import ray
|
||||
from ray.serve.context import TaskContext
|
||||
from ray.serve.router import RequestMetadata, Router
|
||||
from ray.serve.utils import get_random_letters
|
||||
from ray.serve.exceptions import RayServeException
|
||||
|
||||
global_async_loop = None
|
||||
|
||||
@@ -109,16 +110,25 @@ class RayServeHandle:
|
||||
``**kwargs``: All keyword arguments will be available in
|
||||
``request.args``.
|
||||
"""
|
||||
assert self.sync, "handle.remote() should be called from sync handle."
|
||||
if not self.sync:
|
||||
raise RayServeException(
|
||||
"You are trying to call handle.remote() with async handle. "
|
||||
"Please use `await handle.remote_async()` instead.")
|
||||
|
||||
coro = self._remote(request_data, kwargs)
|
||||
future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe(
|
||||
coro, self.async_loop)
|
||||
|
||||
# Block until the result is ready.
|
||||
return future.result()
|
||||
|
||||
async def _remote_async(self, request_data, **kwargs) -> ray.ObjectRef:
|
||||
async def remote_async(self,
|
||||
request_data: Optional[Union[Dict, Any]] = None,
|
||||
**kwargs) -> ray.ObjectRef:
|
||||
"""Experimental API for enqueue a request in async context."""
|
||||
assert not self.sync, "_remote_async must be called inside async loop."
|
||||
if not asyncio.get_event_loop().is_running():
|
||||
raise RayServeException(
|
||||
"remote_async must be called from a running event loop.")
|
||||
return await self._remote(request_data, kwargs)
|
||||
|
||||
def options(self,
|
||||
|
||||
@@ -3,46 +3,46 @@ import socket
|
||||
from typing import List
|
||||
|
||||
import uvicorn
|
||||
import starlette.responses
|
||||
|
||||
import ray
|
||||
from ray.exceptions import RayTaskError
|
||||
from ray.serve.constants import LongPollKey
|
||||
from ray.serve.context import TaskContext
|
||||
from ray.util import metrics
|
||||
from ray.serve.utils import _get_logger, get_random_letters
|
||||
from ray.serve.http_util import Response
|
||||
from ray.serve.long_poll import LongPollAsyncClient
|
||||
from ray.serve.router import Router, RequestMetadata
|
||||
|
||||
# The maximum number of times to retry a request due to actor failure.
|
||||
# TODO(edoakes): this should probably be configurable.
|
||||
MAX_ACTOR_DEAD_RETRIES = 10
|
||||
|
||||
logger = _get_logger()
|
||||
|
||||
|
||||
class HTTPProxy:
|
||||
"""
|
||||
This class should be instantiated and ran by ASGI server.
|
||||
"""This class is meant to be instantiated and run by an ASGI HTTP server.
|
||||
|
||||
>>> import uvicorn
|
||||
>>> uvicorn.run(HTTPProxy(kv_store_actor_handle, router_handle))
|
||||
# blocks forever
|
||||
"""
|
||||
|
||||
async def fetch_config_from_controller(self, controller_name):
|
||||
assert ray.is_initialized()
|
||||
def __init__(self, controller_name):
|
||||
controller = ray.get_actor(controller_name)
|
||||
|
||||
self.route_table = await controller.get_router_config.remote()
|
||||
self.route_table = {} # Should be updated via long polling.
|
||||
self.router = Router(controller)
|
||||
self.long_poll_client = LongPollAsyncClient(controller, {
|
||||
LongPollKey.ROUTE_TABLE: self._update_route_table,
|
||||
})
|
||||
|
||||
self.request_counter = metrics.Count(
|
||||
"num_http_requests",
|
||||
description="The number of HTTP requests processed",
|
||||
tag_keys=("route", ))
|
||||
|
||||
self.router = Router(controller)
|
||||
async def setup(self):
|
||||
await self.router.setup_in_async_loop()
|
||||
|
||||
def set_route_table(self, route_table):
|
||||
async def _update_route_table(self, route_table):
|
||||
logger.debug(f"HTTP Proxy: Get updated route table: {route_table}.")
|
||||
self.route_table = route_table
|
||||
|
||||
async def receive_http_body(self, scope, receive, send):
|
||||
@@ -74,8 +74,11 @@ class HTTPProxy:
|
||||
status_code=404).send(scope, receive, send)
|
||||
|
||||
async def __call__(self, scope, receive, send):
|
||||
# NOTE: This implements ASGI protocol specified in
|
||||
# https://asgi.readthedocs.io/en/latest/specs/index.html
|
||||
"""Implements the ASGI protocol.
|
||||
|
||||
See details at:
|
||||
https://asgi.readthedocs.io/en/latest/specs/index.html.
|
||||
"""
|
||||
|
||||
error_sender = self._make_error_sender(scope, receive, send)
|
||||
|
||||
@@ -126,6 +129,18 @@ class HTTPProxy:
|
||||
if isinstance(result, RayTaskError):
|
||||
error_message = "Task Error. Traceback: {}.".format(result)
|
||||
await error_sender(error_message, 500)
|
||||
elif isinstance(result, starlette.responses.Response):
|
||||
if isinstance(result, starlette.responses.StreamingResponse):
|
||||
raise TypeError("Starlette StreamingResponse returned by "
|
||||
f"backend for endpoint {endpoint_name}. "
|
||||
"StreamingResponse is unserializable and not "
|
||||
"supported by Ray Serve. Consider using "
|
||||
"another Starlette response type such as "
|
||||
"Response, HTMLResponse, PlainTextResponse, "
|
||||
"or JSONResponse. If support for "
|
||||
"StreamingResponse is desired, please let "
|
||||
"the Ray team know by making a Github issue!")
|
||||
await result(scope, receive, send)
|
||||
else:
|
||||
await Response(result).send(scope, receive, send)
|
||||
|
||||
@@ -137,12 +152,13 @@ class HTTPProxyActor:
|
||||
host,
|
||||
port,
|
||||
controller_name,
|
||||
http_middlewares: List["starlette.middleware.Middleware"] = []):
|
||||
http_middlewares: List[
|
||||
"starlette.middleware.Middleware"] = []): # noqa: F821
|
||||
self.host = host
|
||||
self.port = port
|
||||
|
||||
self.app = HTTPProxy()
|
||||
await self.app.fetch_config_from_controller(controller_name)
|
||||
self.app = HTTPProxy(controller_name)
|
||||
await self.app.setup()
|
||||
|
||||
self.wrapped_app = self.app
|
||||
for middleware in http_middlewares:
|
||||
@@ -180,12 +196,3 @@ class HTTPProxyActor:
|
||||
# the main thread and uvicorn doesn't expose a way to configure it.
|
||||
server.install_signal_handlers = lambda: None
|
||||
await server.serve(sockets=[sock])
|
||||
|
||||
async def set_route_table(self, route_table):
|
||||
self.app.set_route_table(route_table)
|
||||
|
||||
# ------ Proxy router logic ------ #
|
||||
async def assign_request(self, request_meta, *request_args,
|
||||
**request_kwargs):
|
||||
return await (await self.app.router.assign_request(
|
||||
request_meta, *request_args, **request_kwargs))
|
||||
|
||||
@@ -117,7 +117,7 @@ class Response:
|
||||
elif content_type == "json":
|
||||
self.raw_headers.append([b"content-type", b"application/json"])
|
||||
else:
|
||||
raise ValueError("Invalid content type {}".foramt(content_type))
|
||||
raise ValueError("Invalid content type {}".format(content_type))
|
||||
|
||||
async def send(self, scope, receive, send):
|
||||
await send({
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
from inspect import iscoroutinefunction
|
||||
import random
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
@@ -22,7 +23,7 @@ class UpdatedObject:
|
||||
UpdateStateAsyncCallable = Callable[[Any], Awaitable[None]]
|
||||
|
||||
|
||||
class LongPollerAsyncClient:
|
||||
class LongPollAsyncClient:
|
||||
"""The asynchronous long polling client.
|
||||
|
||||
Internally, it runs `await object_ref` in a `while True` loop. When a
|
||||
@@ -31,7 +32,7 @@ class LongPollerAsyncClient:
|
||||
the next poll.
|
||||
|
||||
Args:
|
||||
host_actor(ray.ActorHandle): handle to actor embedding LongPollerHost.
|
||||
host_actor(ray.ActorHandle): handle to actor embedding LongPollHost.
|
||||
key_listeners(Dict[str, AsyncCallable]): a dictionary mapping keys to
|
||||
callbacks to be called on state update for the corresponding keys.
|
||||
"""
|
||||
@@ -40,6 +41,10 @@ class LongPollerAsyncClient:
|
||||
key_listeners: Dict[str, UpdateStateAsyncCallable]) -> None:
|
||||
self.host_actor = host_actor
|
||||
self.key_listeners = key_listeners
|
||||
for callback in key_listeners.values():
|
||||
if not iscoroutinefunction(callback):
|
||||
raise ValueError(
|
||||
"Callbacks to async long poller must be 'async def'.")
|
||||
|
||||
self.snapshot_ids: Dict[str, int] = {
|
||||
key: -1
|
||||
@@ -56,34 +61,31 @@ class LongPollerAsyncClient:
|
||||
self.snapshot_ids)
|
||||
return object_ref
|
||||
|
||||
def _update(self, updates: Dict[str, UpdatedObject]):
|
||||
for key, update in updates.items():
|
||||
self.object_snapshots[key] = update.object_snapshot
|
||||
self.snapshot_ids[key] = update.snapshot_id
|
||||
|
||||
async def _do_long_poll(self):
|
||||
while True:
|
||||
try:
|
||||
updates: Dict[str, UpdatedObject] = await self._poll_once()
|
||||
self._update(updates)
|
||||
logger.debug(f"LongPollerClient received udpates: {updates}")
|
||||
for key, updated_object in updates.items():
|
||||
logger.debug("LongPollClient received updates for keys: "
|
||||
f"{list(updates.keys())}.")
|
||||
for key, update in updates.items():
|
||||
self.object_snapshots[key] = update.object_snapshot
|
||||
self.snapshot_ids[key] = update.snapshot_id
|
||||
# NOTE(simon):
|
||||
# This blocks the loop from doing another poll. Consider
|
||||
# use loop.create_task here or poll first then call the
|
||||
# callbacks.
|
||||
callback = self.key_listeners[key]
|
||||
await callback(updated_object.object_snapshot)
|
||||
await callback(update.object_snapshot)
|
||||
except ray.exceptions.RayActorError:
|
||||
# This can happen during shutdown where the controller is
|
||||
# intentionally killed, the client should just gracefully
|
||||
# exit.
|
||||
logger.debug("LongPollerClient failed to connect to host. "
|
||||
logger.debug("LongPollClient failed to connect to host. "
|
||||
"Shutting down.")
|
||||
break
|
||||
|
||||
|
||||
class LongPollerHost:
|
||||
class LongPollHost:
|
||||
"""The server side object that manages long pulling requests.
|
||||
|
||||
The desired use case is to embed this in an Ray actor. Client will be
|
||||
@@ -115,11 +117,10 @@ class LongPollerHost:
|
||||
immediately if the snapshot_ids are outdated, otherwise it will block
|
||||
until there's one updates.
|
||||
"""
|
||||
# 1. Figure out which keys do we care about
|
||||
watched_keys = set(self.snapshot_ids.keys()).intersection(
|
||||
keys_to_snapshot_ids.keys())
|
||||
if len(watched_keys) == 0:
|
||||
raise ValueError("Keys not found.")
|
||||
watched_keys = keys_to_snapshot_ids.keys()
|
||||
nonexistent_keys = set(watched_keys) - set(self.snapshot_ids.keys())
|
||||
if len(nonexistent_keys) > 0:
|
||||
raise ValueError(f"Keys not found: {nonexistent_keys}.")
|
||||
|
||||
# 2. If there are any outdated keys (by comparing snapshot ids)
|
||||
# return immediately.
|
||||
@@ -159,7 +160,7 @@ class LongPollerHost:
|
||||
def notify_changed(self, object_key: str, updated_object: Any):
|
||||
self.snapshot_ids[object_key] += 1
|
||||
self.object_snapshots[object_key] = updated_object
|
||||
logger.debug(f"LongPollerHost: {object_key} = {updated_object}")
|
||||
logger.debug(f"LongPollHost: Notify change for key {object_key}.")
|
||||
|
||||
if object_key in self.notifier_events:
|
||||
for event in self.notifier_events.pop(object_key):
|
||||
|
||||
+15
-12
@@ -6,9 +6,10 @@ from typing import Any, DefaultDict, Dict, Iterable, List, Optional
|
||||
|
||||
import ray
|
||||
from ray.actor import ActorHandle
|
||||
from ray.serve.constants import LongPollKey
|
||||
from ray.serve.context import TaskContext
|
||||
from ray.serve.endpoint_policy import EndpointPolicy, RandomEndpointPolicy
|
||||
from ray.serve.long_poll import LongPollerAsyncClient
|
||||
from ray.serve.long_poll import LongPollAsyncClient
|
||||
from ray.serve.utils import logger
|
||||
from ray.util import metrics
|
||||
|
||||
@@ -106,7 +107,8 @@ class ReplicaSet:
|
||||
) >= self.max_concurrent_queries:
|
||||
# This replica is overloaded, try next one
|
||||
continue
|
||||
logger.debug(f"Replica set assigned {query} to {replica}")
|
||||
logger.debug(f"Assigned query {query.metadata.request_id} "
|
||||
f"to replica {replica}.")
|
||||
ref = replica.handle_request.remote(query)
|
||||
self.in_flight_queries[replica].add(ref)
|
||||
return ref
|
||||
@@ -133,7 +135,8 @@ class ReplicaSet:
|
||||
"""
|
||||
assigned_ref = self._try_assign_replica(query)
|
||||
while assigned_ref is None: # Can't assign a replica right now.
|
||||
logger.debug(f"Failed to assign a replica for query {query}")
|
||||
logger.debug("Failed to assign a replica for "
|
||||
f"query {query.metadata.request_id}")
|
||||
# Maybe there exists a free replica, we just need to refresh our
|
||||
# query tracker.
|
||||
num_finished = self._drain_completed_object_refs()
|
||||
@@ -141,7 +144,7 @@ class ReplicaSet:
|
||||
# config to be updated.
|
||||
if num_finished == 0:
|
||||
logger.debug(
|
||||
f"All replicas are busy, waiting for a free replica.")
|
||||
"All replicas are busy, waiting for a free replica.")
|
||||
await asyncio.wait(
|
||||
self._all_query_refs + [self.config_updated_event.wait()],
|
||||
return_when=asyncio.FIRST_COMPLETED)
|
||||
@@ -176,14 +179,14 @@ class Router:
|
||||
|
||||
async def setup_in_async_loop(self):
|
||||
# NOTE(simon): Instead of performing initialization in __init__,
|
||||
# We separated the init of LongPollerAsyncClient to this method because
|
||||
# __init__ might be called in sync context. LongPollerAsyncClient
|
||||
# We separated the init of LongPollAsyncClient to this method because
|
||||
# __init__ might be called in sync context. LongPollAsyncClient
|
||||
# requires async context.
|
||||
self.long_pull_client = LongPollerAsyncClient(
|
||||
self.long_poll_client = LongPollAsyncClient(
|
||||
self.controller, {
|
||||
"traffic_policies": self._update_traffic_policies,
|
||||
"worker_handles": self._update_worker_handles,
|
||||
"backend_configs": self._update_backend_configs,
|
||||
LongPollKey.TRAFFIC_POLICIES: self._update_traffic_policies,
|
||||
LongPollKey.REPLICA_HANDLES: self._update_replica_handles,
|
||||
LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
|
||||
})
|
||||
|
||||
async def _update_traffic_policies(self, traffic_policies):
|
||||
@@ -194,8 +197,8 @@ class Router:
|
||||
event = self._pending_endpoints.pop(endpoint)
|
||||
event.set()
|
||||
|
||||
async def _update_worker_handles(self, worker_handles):
|
||||
for backend_tag, replica_handles in worker_handles.items():
|
||||
async def _update_replica_handles(self, replica_handles):
|
||||
for backend_tag, replica_handles in replica_handles.items():
|
||||
self.backend_replicas[backend_tag].update_worker_replicas(
|
||||
replica_handles)
|
||||
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import click
|
||||
|
||||
import ray
|
||||
from ray import serve
|
||||
from ray.serve.constants import DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT
|
||||
|
||||
|
||||
@click.group(
|
||||
help="[EXPERIMENTAL] CLI for managing Serve instances on a Ray cluster.")
|
||||
@click.option(
|
||||
"--address",
|
||||
"-a",
|
||||
default="auto",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Address of the running Ray cluster to connect to. "
|
||||
"Defaults to \"auto\".")
|
||||
def cli(address):
|
||||
ray.init(address=address)
|
||||
|
||||
|
||||
@cli.command(help="Start a detached Serve instance on the Ray cluster.")
|
||||
@click.option(
|
||||
"--http-host",
|
||||
default=DEFAULT_HTTP_HOST,
|
||||
required=False,
|
||||
type=str,
|
||||
help="Host for HTTP servers to listen on. "
|
||||
f"Defaults to {DEFAULT_HTTP_HOST}.")
|
||||
@click.option(
|
||||
"--http-port",
|
||||
default=DEFAULT_HTTP_PORT,
|
||||
required=False,
|
||||
type=int,
|
||||
help="Port for HTTP servers to listen on. "
|
||||
f"Defaults to {DEFAULT_HTTP_PORT}.")
|
||||
def start(http_host, http_port):
|
||||
serve.start(detached=True, http_host=http_host, http_port=http_port)
|
||||
|
||||
|
||||
@cli.command(help="Shutdown the running Serve instance on the Ray cluster.")
|
||||
def shutdown():
|
||||
serve.connect().shutdown()
|
||||
@@ -7,6 +7,7 @@ import pytest
|
||||
import ray
|
||||
from ray import serve
|
||||
from ray.serve.config import BackendConfig
|
||||
from ray.serve.constants import LongPollKey
|
||||
|
||||
if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False) == 1:
|
||||
serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5
|
||||
@@ -42,22 +43,22 @@ def mock_controller_with_name():
|
||||
@ray.remote(num_cpus=0)
|
||||
class MockControllerActor:
|
||||
def __init__(self):
|
||||
from ray.serve.long_poll import LongPollerHost
|
||||
self.host = LongPollerHost()
|
||||
from ray.serve.long_poll import LongPollHost
|
||||
self.host = LongPollHost()
|
||||
self.backend_replicas = defaultdict(list)
|
||||
self.backend_configs = dict()
|
||||
self.clear()
|
||||
|
||||
def clear(self):
|
||||
self.host.notify_changed("worker_handles", {})
|
||||
self.host.notify_changed("traffic_policies", {})
|
||||
self.host.notify_changed("backend_configs", {})
|
||||
self.host.notify_changed(LongPollKey.REPLICA_HANDLES, {})
|
||||
self.host.notify_changed(LongPollKey.TRAFFIC_POLICIES, {})
|
||||
self.host.notify_changed(LongPollKey.BACKEND_CONFIGS, {})
|
||||
|
||||
async def listen_for_change(self, snapshot_ids):
|
||||
return await self.host.listen_for_change(snapshot_ids)
|
||||
|
||||
def set_traffic(self, endpoint, traffic_policy):
|
||||
self.host.notify_changed("traffic_policies",
|
||||
self.host.notify_changed(LongPollKey.TRAFFIC_POLICIES,
|
||||
{endpoint: traffic_policy})
|
||||
|
||||
def add_new_replica(self,
|
||||
@@ -68,15 +69,17 @@ def mock_controller_with_name():
|
||||
self.backend_configs[backend_tag] = backend_config
|
||||
|
||||
self.host.notify_changed(
|
||||
"worker_handles",
|
||||
LongPollKey.REPLICA_HANDLES,
|
||||
self.backend_replicas,
|
||||
)
|
||||
self.host.notify_changed("backend_configs", self.backend_configs)
|
||||
self.host.notify_changed(LongPollKey.BACKEND_CONFIGS,
|
||||
self.backend_configs)
|
||||
|
||||
def update_backend(self, backend_tag: str,
|
||||
backend_config: BackendConfig):
|
||||
self.backend_configs[backend_tag] = backend_config
|
||||
self.host.notify_changed("backend_configs", self.backend_configs)
|
||||
self.host.notify_changed(LongPollKey.BACKEND_CONFIGS,
|
||||
self.backend_configs)
|
||||
|
||||
name = f"MockController{random.randint(0,10e4)}"
|
||||
yield name, MockControllerActor.options(name=name).remote()
|
||||
|
||||
@@ -4,6 +4,7 @@ import time
|
||||
import os
|
||||
import pytest
|
||||
import requests
|
||||
import starlette.responses
|
||||
|
||||
import ray
|
||||
from ray import serve
|
||||
@@ -25,22 +26,6 @@ def test_e2e(serve_instance):
|
||||
client.create_endpoint(
|
||||
"endpoint", backend="echo:v1", route="/api", methods=["GET", "POST"])
|
||||
|
||||
retry_count = 5
|
||||
timeout_sleep = 0.5
|
||||
while True:
|
||||
try:
|
||||
resp = requests.get(
|
||||
"http://127.0.0.1:8000/-/routes", timeout=0.5).json()
|
||||
assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
|
||||
break
|
||||
except Exception as e:
|
||||
time.sleep(timeout_sleep)
|
||||
timeout_sleep *= 2
|
||||
retry_count -= 1
|
||||
if retry_count == 0:
|
||||
assert False, ("Route table hasn't been updated after 3 tries."
|
||||
"The latest error was {}").format(e)
|
||||
|
||||
resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
|
||||
assert resp == "GET"
|
||||
|
||||
@@ -48,6 +33,63 @@ def test_e2e(serve_instance):
|
||||
assert resp == "POST"
|
||||
|
||||
|
||||
def test_starlette_response(serve_instance):
|
||||
client = serve_instance
|
||||
|
||||
def basic_response(_):
|
||||
return starlette.responses.Response(
|
||||
"Hello, world!", media_type="text/plain")
|
||||
|
||||
client.create_backend("basic_response", basic_response)
|
||||
client.create_endpoint(
|
||||
"basic_response", backend="basic_response", route="/basic_response")
|
||||
assert requests.get(
|
||||
"http://127.0.0.1:8000/basic_response").text == "Hello, world!"
|
||||
|
||||
def html_response(_):
|
||||
return starlette.responses.HTMLResponse(
|
||||
"<html><body><h1>Hello, world!</h1></body></html>")
|
||||
|
||||
client.create_backend("html_response", html_response)
|
||||
client.create_endpoint(
|
||||
"html_response", backend="html_response", route="/html_response")
|
||||
assert requests.get(
|
||||
"http://127.0.0.1:8000/html_response"
|
||||
).text == "<html><body><h1>Hello, world!</h1></body></html>"
|
||||
|
||||
def plain_text_response(_):
|
||||
return starlette.responses.PlainTextResponse("Hello, world!")
|
||||
|
||||
client.create_backend("plain_text_response", plain_text_response)
|
||||
client.create_endpoint(
|
||||
"plain_text_response",
|
||||
backend="plain_text_response",
|
||||
route="/plain_text_response")
|
||||
assert requests.get(
|
||||
"http://127.0.0.1:8000/plain_text_response").text == "Hello, world!"
|
||||
|
||||
def json_response(_):
|
||||
return starlette.responses.JSONResponse({"hello": "world"})
|
||||
|
||||
client.create_backend("json_response", json_response)
|
||||
client.create_endpoint(
|
||||
"json_response", backend="json_response", route="/json_response")
|
||||
assert requests.get("http://127.0.0.1:8000/json_response").json()[
|
||||
"hello"] == "world"
|
||||
|
||||
def redirect_response(_):
|
||||
return starlette.responses.RedirectResponse(
|
||||
url="http://127.0.0.1:8000/basic_response")
|
||||
|
||||
client.create_backend("redirect_response", redirect_response)
|
||||
client.create_endpoint(
|
||||
"redirect_response",
|
||||
backend="redirect_response",
|
||||
route="/redirect_response")
|
||||
assert requests.get(
|
||||
"http://127.0.0.1:8000/redirect_response").text == "Hello, world!"
|
||||
|
||||
|
||||
def test_backend_user_config(serve_instance):
|
||||
client = serve_instance
|
||||
|
||||
@@ -63,25 +105,26 @@ def test_backend_user_config(serve_instance):
|
||||
|
||||
config = BackendConfig(num_replicas=2, user_config={"count": 123, "b": 2})
|
||||
client.create_backend("counter", Counter, config=config)
|
||||
client.create_endpoint("counter", backend="counter", route="/counter")
|
||||
client.create_endpoint("counter", backend="counter")
|
||||
handle = client.get_handle("counter")
|
||||
|
||||
def check(val, num_replicas):
|
||||
pids_seen = set()
|
||||
for i in range(100):
|
||||
result = ray.get(handle.remote())
|
||||
assert (str(result[0]) == val), result[0]
|
||||
if str(result[0]) != val:
|
||||
return False
|
||||
pids_seen.add(result[1])
|
||||
assert (len(pids_seen) == num_replicas)
|
||||
return len(pids_seen) == num_replicas
|
||||
|
||||
check("123", 2)
|
||||
wait_for_condition(lambda: check("123", 2))
|
||||
|
||||
client.update_backend_config("counter", BackendConfig(num_replicas=3))
|
||||
check("123", 3)
|
||||
wait_for_condition(lambda: check("123", 3))
|
||||
|
||||
config = BackendConfig(user_config={"count": 456})
|
||||
client.update_backend_config("counter", config)
|
||||
check("456", 3)
|
||||
wait_for_condition(lambda: check("456", 3))
|
||||
|
||||
|
||||
def test_call_method(serve_instance):
|
||||
@@ -183,7 +226,7 @@ def test_reject_duplicate_endpoint_and_route(serve_instance):
|
||||
def test_no_http(serve_instance):
|
||||
client = serve.start(http_host=None)
|
||||
|
||||
assert len(ray.get(client._controller.get_routers.remote())) == 0
|
||||
assert len(ray.get(client._controller.get_http_proxies.remote())) == 0
|
||||
|
||||
def hello(*args):
|
||||
return "hello"
|
||||
@@ -223,11 +266,6 @@ def test_scaling_replicas(serve_instance):
|
||||
|
||||
client.create_endpoint("counter", backend="counter:v1", route="/increment")
|
||||
|
||||
# Keep checking the routing table until /increment is populated
|
||||
while "/increment" not in requests.get(
|
||||
"http://127.0.0.1:8000/-/routes").json():
|
||||
time.sleep(0.2)
|
||||
|
||||
counter_result = []
|
||||
for _ in range(10):
|
||||
resp = requests.get("http://127.0.0.1:8000/increment").json()
|
||||
@@ -267,11 +305,6 @@ def test_batching(serve_instance):
|
||||
client.create_endpoint(
|
||||
"counter1", backend="counter:v11", route="/increment2")
|
||||
|
||||
# Keep checking the routing table until /increment is populated
|
||||
while "/increment2" not in requests.get(
|
||||
"http://127.0.0.1:8000/-/routes").json():
|
||||
time.sleep(0.2)
|
||||
|
||||
future_list = []
|
||||
handle = client.get_handle("counter1")
|
||||
for _ in range(20):
|
||||
@@ -299,8 +332,7 @@ def test_batching_exception(serve_instance):
|
||||
# Set the max batch size.
|
||||
config = BackendConfig(max_batch_size=5)
|
||||
client.create_backend("exception:v1", NoListReturned, config=config)
|
||||
client.create_endpoint(
|
||||
"exception-test", backend="exception:v1", route="/noListReturned")
|
||||
client.create_endpoint("exception-test", backend="exception:v1")
|
||||
|
||||
handle = client.get_handle("exception-test")
|
||||
with pytest.raises(ray.exceptions.RayTaskError):
|
||||
@@ -323,16 +355,16 @@ def test_updating_config(serve_instance):
|
||||
client.create_endpoint("bsimple", backend="bsimple:v1", route="/bsimple")
|
||||
|
||||
controller = client._controller
|
||||
old_replica_tag_list = ray.get(
|
||||
controller._list_replicas.remote("bsimple:v1"))
|
||||
old_replica_tag_list = list(
|
||||
ray.get(controller._all_replica_handles.remote())["bsimple:v1"].keys())
|
||||
|
||||
update_config = BackendConfig(max_batch_size=5)
|
||||
client.update_backend_config("bsimple:v1", update_config)
|
||||
new_replica_tag_list = ray.get(
|
||||
controller._list_replicas.remote("bsimple:v1"))
|
||||
new_replica_tag_list = list(
|
||||
ray.get(controller._all_replica_handles.remote())["bsimple:v1"].keys())
|
||||
new_all_tag_list = []
|
||||
for worker_dict in ray.get(
|
||||
controller.get_all_replica_handles.remote()).values():
|
||||
controller._all_replica_handles.remote()).values():
|
||||
new_all_tag_list.extend(list(worker_dict.keys()))
|
||||
|
||||
# the old and new replica tag list should be identical
|
||||
@@ -648,7 +680,7 @@ def test_create_infeasible_error(serve_instance):
|
||||
"MagicMLResource": 100
|
||||
}})
|
||||
|
||||
# Even each replica might be feasible, the total might not be.
|
||||
# Even though each replica might be feasible, the total might not be.
|
||||
current_cpus = int(ray.nodes()[0]["Resources"]["CPU"])
|
||||
num_replicas = current_cpus + 20
|
||||
config = BackendConfig(num_replicas=num_replicas)
|
||||
@@ -661,10 +693,6 @@ def test_create_infeasible_error(serve_instance):
|
||||
}},
|
||||
config=config)
|
||||
|
||||
# No replica should be created!
|
||||
replicas = ray.get(client._controller._list_replicas.remote("f1"))
|
||||
assert len(replicas) == 0
|
||||
|
||||
|
||||
def test_shutdown():
|
||||
def f():
|
||||
@@ -797,6 +825,7 @@ def test_serve_metrics(serve_instance):
|
||||
|
||||
client.create_backend("metrics", batcher)
|
||||
client.create_endpoint("metrics", backend="metrics", route="/metrics")
|
||||
|
||||
# send 10 concurrent requests
|
||||
url = "http://127.0.0.1:8000/metrics"
|
||||
ray.get([block_until_http_ready.remote(url) for _ in range(10)])
|
||||
|
||||
@@ -48,7 +48,7 @@ def setup_worker(name,
|
||||
async def add_servable_to_router(servable, router, controller_name, **kwargs):
|
||||
worker = setup_worker(
|
||||
"backend", servable, controller_name=controller_name, **kwargs)
|
||||
await router._update_worker_handles.remote({"backend": [worker]})
|
||||
await router._update_replica_handles.remote({"backend": [worker]})
|
||||
await router._update_traffic_policies.remote({
|
||||
"endpoint": TrafficPolicy({
|
||||
"backend": 1.0
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
|
||||
|
||||
def test_controller_inflight_requests_clear(serve_instance):
|
||||
client = serve_instance
|
||||
initial_number_reqs = ray.get(
|
||||
client._controller._num_inflight_results.remote())
|
||||
|
||||
def function(_):
|
||||
return "hello"
|
||||
|
||||
client.create_backend("tst", function)
|
||||
client.create_endpoint("end_pt", backend="tst")
|
||||
|
||||
assert ray.get(client._controller._num_inflight_results.remote()
|
||||
) - initial_number_reqs == 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", "-s", __file__]))
|
||||
@@ -4,6 +4,7 @@ import tempfile
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.test_utils import wait_for_condition
|
||||
from ray import serve
|
||||
from ray.serve.config import BackendConfig, ReplicaConfig
|
||||
|
||||
@@ -53,9 +54,11 @@ def test_controller_failure(serve_instance):
|
||||
client.create_backend("controller_failure:v2", function)
|
||||
client.set_traffic("controller_failure", {"controller_failure:v2": 1.0})
|
||||
|
||||
for _ in range(10):
|
||||
def check_controller_failure():
|
||||
response = request_with_retries("/controller_failure", timeout=30)
|
||||
assert response.text == "hello2"
|
||||
return response.text == "hello2"
|
||||
|
||||
wait_for_condition(check_controller_failure)
|
||||
|
||||
def function(_):
|
||||
return "hello3"
|
||||
@@ -76,10 +79,10 @@ def test_controller_failure(serve_instance):
|
||||
assert response.text == "hello3"
|
||||
|
||||
|
||||
def _kill_routers(client):
|
||||
routers = ray.get(client._controller.get_routers.remote())
|
||||
for router in routers.values():
|
||||
ray.kill(router, no_restart=False)
|
||||
def _kill_http_proxies(client):
|
||||
http_proxies = ray.get(client._controller.get_http_proxies.remote())
|
||||
for http_proxy in http_proxies.values():
|
||||
ray.kill(http_proxy, no_restart=False)
|
||||
|
||||
|
||||
def test_http_proxy_failure(serve_instance):
|
||||
@@ -98,7 +101,7 @@ def test_http_proxy_failure(serve_instance):
|
||||
response = request_with_retries("/proxy_failure", timeout=30)
|
||||
assert response.text == "hello1"
|
||||
|
||||
_kill_routers(client)
|
||||
_kill_http_proxies(client)
|
||||
|
||||
def function(_):
|
||||
return "hello2"
|
||||
@@ -113,7 +116,7 @@ def test_http_proxy_failure(serve_instance):
|
||||
|
||||
def _get_worker_handles(client, backend):
|
||||
controller = client._controller
|
||||
backend_dict = ray.get(controller.get_all_replica_handles.remote())
|
||||
backend_dict = ray.get(controller._all_replica_handles.remote())
|
||||
|
||||
return list(backend_dict[backend].values())
|
||||
|
||||
@@ -124,7 +127,7 @@ def test_worker_restart(serve_instance):
|
||||
client = serve_instance
|
||||
|
||||
class Worker1:
|
||||
def __call__(self):
|
||||
def __call__(self, *args):
|
||||
return os.getpid()
|
||||
|
||||
client.create_backend("worker_failure:v1", Worker1)
|
||||
@@ -176,7 +179,7 @@ def test_worker_replica_failure(serve_instance):
|
||||
while True:
|
||||
pass
|
||||
|
||||
def __call__(self):
|
||||
def __call__(self, *args):
|
||||
pass
|
||||
|
||||
temp_path = os.path.join(tempfile.gettempdir(),
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import sys
|
||||
import functools
|
||||
import time
|
||||
import asyncio
|
||||
import os
|
||||
@@ -8,12 +7,12 @@ from typing import Dict
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
from ray.serve.long_poll import (LongPollerAsyncClient, LongPollerHost,
|
||||
from ray.serve.long_poll import (LongPollAsyncClient, LongPollHost,
|
||||
UpdatedObject)
|
||||
|
||||
|
||||
def test_host_standalone(serve_instance):
|
||||
host = ray.remote(LongPollerHost).remote()
|
||||
host = ray.remote(LongPollHost).remote()
|
||||
|
||||
# Write two values
|
||||
ray.get(host.notify_changed.remote("key_1", 999))
|
||||
@@ -44,10 +43,10 @@ def test_long_poll_restarts(serve_instance):
|
||||
max_restarts=-1,
|
||||
max_task_retries=-1,
|
||||
)
|
||||
class RestartableLongPollerHost:
|
||||
class RestartableLongPollHost:
|
||||
def __init__(self) -> None:
|
||||
print("actor started")
|
||||
self.host = LongPollerHost()
|
||||
self.host = LongPollHost()
|
||||
self.host.notify_changed("timer", time.time())
|
||||
self.should_exit = False
|
||||
|
||||
@@ -63,7 +62,7 @@ def test_long_poll_restarts(serve_instance):
|
||||
print("actor exit")
|
||||
os._exit(1)
|
||||
|
||||
host = RestartableLongPollerHost.remote()
|
||||
host = RestartableLongPollHost.remote()
|
||||
updated_values = ray.get(host.listen_for_change.remote({"timer": -1}))
|
||||
timer: UpdatedObject = updated_values["timer"]
|
||||
|
||||
@@ -81,22 +80,31 @@ def test_long_poll_restarts(serve_instance):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_client(serve_instance):
|
||||
host = ray.remote(LongPollerHost).remote()
|
||||
host = ray.remote(LongPollHost).remote()
|
||||
|
||||
# Write two values
|
||||
ray.get(host.notify_changed.remote("key_1", 100))
|
||||
ray.get(host.notify_changed.remote("key_2", 999))
|
||||
|
||||
# Check that construction fails with a sync callback.
|
||||
def callback(result, key):
|
||||
pass
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
client = LongPollAsyncClient(host, {"key": callback})
|
||||
|
||||
callback_results = dict()
|
||||
|
||||
async def callback(result, key):
|
||||
callback_results[key] = result
|
||||
async def key_1_callback(result):
|
||||
callback_results["key_1"] = result
|
||||
|
||||
client = LongPollerAsyncClient(
|
||||
host, {
|
||||
"key_1": functools.partial(callback, key="key_1"),
|
||||
"key_2": functools.partial(callback, key="key_2")
|
||||
})
|
||||
async def key_2_callback(result):
|
||||
callback_results["key_2"] = result
|
||||
|
||||
client = LongPollAsyncClient(host, {
|
||||
"key_1": key_1_callback,
|
||||
"key_2": key_2_callback,
|
||||
})
|
||||
|
||||
while len(client.object_snapshots) == 0:
|
||||
# Yield the loop for client to get the result
|
||||
|
||||
@@ -144,6 +144,7 @@ class ServeEncoder(json.JSONEncoder):
|
||||
@ray.remote(num_cpus=0)
|
||||
def block_until_http_ready(http_endpoint,
|
||||
backoff_time_s=1,
|
||||
check_ready=None,
|
||||
timeout=HTTP_PROXY_TIMEOUT):
|
||||
http_is_ready = False
|
||||
start_time = time.time()
|
||||
@@ -152,7 +153,10 @@ def block_until_http_ready(http_endpoint,
|
||||
try:
|
||||
resp = requests.get(http_endpoint)
|
||||
assert resp.status_code == 200
|
||||
http_is_ready = True
|
||||
if check_ready is None:
|
||||
http_is_ready = True
|
||||
else:
|
||||
http_is_ready = check_ready(resp)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@@ -95,6 +95,8 @@ py_test_module_list(
|
||||
"test_dask_callback.py",
|
||||
"test_debug_tools.py",
|
||||
"test_experimental_client.py",
|
||||
"test_experimental_client_metadata.py",
|
||||
"test_experimental_client_terminate.py",
|
||||
"test_job.py",
|
||||
"test_memstat.py",
|
||||
"test_metrics_agent.py",
|
||||
|
||||
+1
-1
@@ -1 +1 @@
|
||||
ray[debug]
|
||||
ray
|
||||
|
||||
+1
-1
@@ -1 +1 @@
|
||||
ray[debug]
|
||||
ray
|
||||
|
||||
+1
-1
@@ -1 +1 @@
|
||||
ray[debug]
|
||||
ray
|
||||
|
||||
@@ -8,6 +8,7 @@ try:
|
||||
except ImportError:
|
||||
pytest_timeout = None
|
||||
import sys
|
||||
import tempfile
|
||||
import datetime
|
||||
|
||||
import ray
|
||||
@@ -867,5 +868,61 @@ def test_actor_creation_latency(ray_start_regular_shared):
|
||||
actor_create_time - start, end - start))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"exit_condition",
|
||||
[
|
||||
# "out_of_scope", TODO(edoakes): enable this once fixed.
|
||||
"__ray_terminate__",
|
||||
"ray.actor.exit_actor",
|
||||
"ray.kill"
|
||||
])
|
||||
def test_atexit_handler(ray_start_regular_shared, exit_condition):
|
||||
@ray.remote
|
||||
class A():
|
||||
def __init__(self, tmpfile, data):
|
||||
import atexit
|
||||
|
||||
def f(*args, **kwargs):
|
||||
with open(tmpfile, "w") as f:
|
||||
f.write(data)
|
||||
f.flush()
|
||||
|
||||
atexit.register(f)
|
||||
|
||||
def ready(self):
|
||||
pass
|
||||
|
||||
def exit(self):
|
||||
ray.actor.exit_actor()
|
||||
|
||||
data = "hello"
|
||||
tmpfile = tempfile.NamedTemporaryFile()
|
||||
a = A.remote(tmpfile.name, data)
|
||||
ray.get(a.ready.remote())
|
||||
|
||||
if exit_condition == "out_of_scope":
|
||||
del a
|
||||
elif exit_condition == "__ray_terminate__":
|
||||
ray.wait([a.__ray_terminate__.remote()])
|
||||
elif exit_condition == "ray.actor.exit_actor":
|
||||
ray.wait([a.exit.remote()])
|
||||
elif exit_condition == "ray.kill":
|
||||
ray.kill(a)
|
||||
else:
|
||||
assert False, "Unrecognized condition"
|
||||
|
||||
def check_file_written():
|
||||
with open(tmpfile.name) as f:
|
||||
if f.read() == data:
|
||||
return True
|
||||
return False
|
||||
|
||||
# ray.kill() should not trigger atexit handlers, all other methods should.
|
||||
if exit_condition == "ray.kill":
|
||||
assert not check_file_written()
|
||||
else:
|
||||
ray.test_utils.wait_for_condition(check_file_written)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -1055,11 +1055,11 @@ def test_actor_resource_demand(shutdown_only):
|
||||
ray.get(a.foo.remote())
|
||||
time.sleep(1)
|
||||
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
|
||||
|
||||
# The actor is scheduled so there should be no more demands left.
|
||||
assert len(heartbeat.resource_load_by_shape.resource_demands) == 0
|
||||
assert len(resource_usages.resource_load_by_shape.resource_demands) == 0
|
||||
|
||||
@ray.remote(num_cpus=80)
|
||||
class Actor2:
|
||||
@@ -1070,23 +1070,24 @@ def test_actor_resource_demand(shutdown_only):
|
||||
time.sleep(1)
|
||||
|
||||
# This actor cannot be scheduled.
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
|
||||
assert (heartbeat.resource_load_by_shape.resource_demands[0].shape == {
|
||||
"CPU": 80.0
|
||||
})
|
||||
assert (heartbeat.resource_load_by_shape.resource_demands[0]
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
|
||||
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
|
||||
assert (
|
||||
resource_usages.resource_load_by_shape.resource_demands[0].shape == {
|
||||
"CPU": 80.0
|
||||
})
|
||||
assert (resource_usages.resource_load_by_shape.resource_demands[0]
|
||||
.num_infeasible_requests_queued == 1)
|
||||
|
||||
actors.append(Actor2.remote())
|
||||
time.sleep(1)
|
||||
|
||||
# Two actors cannot be scheduled.
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
|
||||
assert (heartbeat.resource_load_by_shape.resource_demands[0]
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
|
||||
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
|
||||
assert (resource_usages.resource_load_by_shape.resource_demands[0]
|
||||
.num_infeasible_requests_queued == 2)
|
||||
|
||||
global_state_accessor.disconnect()
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import collections
|
||||
import numpy as np
|
||||
import os
|
||||
@@ -211,6 +212,66 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
|
||||
def test_named_actor_max_task_retries(ray_init_with_task_retry_delay):
|
||||
@ray.remote(num_cpus=0)
|
||||
class Counter:
|
||||
def __init__(self):
|
||||
self.count = 0
|
||||
self.event = asyncio.Event()
|
||||
|
||||
def increment(self):
|
||||
self.count += 1
|
||||
self.event.set()
|
||||
|
||||
async def wait_for_count(self, count):
|
||||
while True:
|
||||
if self.count >= count:
|
||||
return
|
||||
await self.event.wait()
|
||||
self.event.clear()
|
||||
|
||||
@ray.remote
|
||||
class ActorToKill:
|
||||
def __init__(self, counter):
|
||||
counter.increment.remote()
|
||||
|
||||
def run(self, counter, signal):
|
||||
counter.increment.remote()
|
||||
ray.get(signal.wait.remote())
|
||||
|
||||
@ray.remote
|
||||
class CallingActor:
|
||||
def __init__(self):
|
||||
self.actor = ray.get_actor("a")
|
||||
|
||||
def call_other(self, counter, signal):
|
||||
return ray.get(self.actor.run.remote(counter, signal))
|
||||
|
||||
init_counter = Counter.remote()
|
||||
run_counter = Counter.remote()
|
||||
signal = SignalActor.remote()
|
||||
|
||||
# Start the two actors, wait for ActorToKill's constructor to run.
|
||||
a = ActorToKill.options(
|
||||
name="a", max_restarts=-1, max_task_retries=-1).remote(init_counter)
|
||||
c = CallingActor.remote()
|
||||
ray.get(init_counter.wait_for_count.remote(1), timeout=30)
|
||||
|
||||
# Signal the CallingActor to call ActorToKill, wait for it to be running,
|
||||
# then kill ActorToKill.
|
||||
# Verify that this causes ActorToKill's constructor to run a second time
|
||||
# and the run method to begin a second time.
|
||||
ref = c.call_other.remote(run_counter, signal)
|
||||
ray.get(run_counter.wait_for_count.remote(1), timeout=30)
|
||||
ray.kill(a, no_restart=False)
|
||||
ray.get(init_counter.wait_for_count.remote(2), timeout=30)
|
||||
ray.get(run_counter.wait_for_count.remote(2), timeout=30)
|
||||
|
||||
# Signal the run method to finish, verify that the CallingActor returns.
|
||||
signal.send.remote()
|
||||
ray.get(ref, timeout=30)
|
||||
|
||||
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
|
||||
@@ -94,8 +94,13 @@ def test_local_scheduling_first(ray_start_cluster):
|
||||
assert local()
|
||||
|
||||
|
||||
@pytest.mark.skipif(new_scheduler_enabled(), reason="flakes more often")
|
||||
def test_load_balancing_with_dependencies(ray_start_cluster):
|
||||
@pytest.mark.parametrize("fast", [True, False])
|
||||
def test_load_balancing_with_dependencies(ray_start_cluster, fast):
|
||||
if fast and new_scheduler_enabled:
|
||||
# Load-balancing on new scheduler can be inefficient if (task
|
||||
# duration:heartbeat interval) is small enough.
|
||||
pytest.skip()
|
||||
|
||||
# This test ensures that tasks are being assigned to all raylets in a
|
||||
# roughly equal manner even when the tasks have dependencies.
|
||||
cluster = ray_start_cluster
|
||||
@@ -106,7 +111,10 @@ def test_load_balancing_with_dependencies(ray_start_cluster):
|
||||
|
||||
@ray.remote
|
||||
def f(x):
|
||||
time.sleep(0.010)
|
||||
if fast:
|
||||
time.sleep(0.010)
|
||||
else:
|
||||
time.sleep(0.1)
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
# This object will be local to one of the raylets. Make sure
|
||||
|
||||
@@ -198,6 +198,32 @@ async def test_asyncio_double_await(ray_start_regular_shared):
|
||||
await waiting
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_exit_actor(ray_start_regular_shared):
|
||||
# https://github.com/ray-project/ray/issues/12649
|
||||
# The test should just hang without the fix.
|
||||
|
||||
@ray.remote
|
||||
class Actor:
|
||||
async def exit(self):
|
||||
ray.actor.exit_actor()
|
||||
|
||||
async def ping(self):
|
||||
return "pong"
|
||||
|
||||
async def loop_forever(self):
|
||||
while True:
|
||||
await asyncio.sleep(5)
|
||||
|
||||
a = Actor.options(max_task_retries=0).remote()
|
||||
a.loop_forever.remote()
|
||||
# Make sure exit_actor exits immediately, not once all tasks completed.
|
||||
ray.get(a.exit.remote())
|
||||
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(a.ping.remote())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -537,6 +537,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.provider = MockProvider()
|
||||
self.provider.create_node({}, {TAG_RAY_NODE_KIND: "worker"}, 10)
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
LoadMetrics(),
|
||||
@@ -558,6 +559,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(11)])
|
||||
lm = LoadMetrics()
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
@@ -613,6 +615,70 @@ class AutoscalingTest(unittest.TestCase):
|
||||
autoscaler.update()
|
||||
self.waitForNodes(0)
|
||||
|
||||
def testLegacyYamlWithRequestResources(self):
|
||||
"""Test when using legacy yamls request_resources() adds workers.
|
||||
|
||||
Makes sure that requested resources are added for legacy yamls when
|
||||
necessary. So if requested resources for instance fit on the headnode
|
||||
we don't add more nodes. But we add more nodes when they don't fit.
|
||||
"""
|
||||
config = SMALL_CLUSTER.copy()
|
||||
config["min_workers"] = 0
|
||||
config["max_workers"] = 100
|
||||
config["idle_timeout_minutes"] = 0
|
||||
config["upscaling_speed"] = 1
|
||||
config_path = self.write_config(config)
|
||||
|
||||
self.provider = MockProvider()
|
||||
self.provider.create_node({}, {
|
||||
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
|
||||
TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
|
||||
}, 1)
|
||||
head_ip = self.provider.non_terminated_node_ips(
|
||||
tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, )[0]
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
|
||||
|
||||
lm = LoadMetrics()
|
||||
lm.local_ip = head_ip
|
||||
lm.update(head_ip, {"CPU": 1}, {"CPU": 1}, {})
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
lm,
|
||||
max_launch_batch=5,
|
||||
max_concurrent_launches=5,
|
||||
max_failures=0,
|
||||
process_runner=runner,
|
||||
update_interval_s=0)
|
||||
autoscaler.update()
|
||||
# 1 head node.
|
||||
self.waitForNodes(1)
|
||||
autoscaler.request_resources([{"CPU": 1}])
|
||||
autoscaler.update()
|
||||
# still 1 head node because request_resources fits in the headnode.
|
||||
self.waitForNodes(1)
|
||||
autoscaler.request_resources([{"CPU": 1}] + [{"CPU": 2}] * 9)
|
||||
autoscaler.update()
|
||||
self.waitForNodes(2) # Adds a single worker to get its resources.
|
||||
autoscaler.update()
|
||||
self.waitForNodes(2) # Still 1 worker because its resources
|
||||
# aren't known.
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {})
|
||||
autoscaler.update()
|
||||
self.waitForNodes(10) # 9 workers and 1 head node, scaled immediately.
|
||||
lm.update(
|
||||
"172.0.0.1", {"CPU": 2}, {"CPU": 2}, {},
|
||||
waiting_bundles=[{
|
||||
"CPU": 2
|
||||
}] * 9,
|
||||
infeasible_bundles=[{
|
||||
"CPU": 1
|
||||
}] * 1)
|
||||
autoscaler.update()
|
||||
# Make sure that no more nodes are added when all the resources fit on
|
||||
# the existing nodes.
|
||||
self.waitForNodes(10)
|
||||
|
||||
def testAggressiveAutoscaling(self):
|
||||
config = SMALL_CLUSTER.copy()
|
||||
config["min_workers"] = 0
|
||||
@@ -629,7 +695,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
head_ip = self.provider.non_terminated_node_ips(
|
||||
tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, )[0]
|
||||
runner = MockProcessRunner()
|
||||
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(11)])
|
||||
lm = LoadMetrics()
|
||||
lm.local_ip = head_ip
|
||||
|
||||
@@ -782,6 +848,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
LoadMetrics(),
|
||||
@@ -817,6 +884,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(config)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
LoadMetrics(),
|
||||
@@ -896,6 +964,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
|
||||
lm = LoadMetrics()
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
@@ -949,6 +1018,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
LoadMetrics(),
|
||||
@@ -989,6 +1059,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(config)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner(fail_cmds=["setup_cmd"])
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
LoadMetrics(),
|
||||
@@ -1000,14 +1071,18 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.waitForNodes(2)
|
||||
self.provider.finish_starting_nodes()
|
||||
autoscaler.update()
|
||||
self.waitForNodes(
|
||||
2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
|
||||
try:
|
||||
self.waitForNodes(
|
||||
2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
|
||||
except AssertionError:
|
||||
# The failed nodes might have been already terminated by autoscaler
|
||||
assert len(self.provider.non_terminated_nodes({})) == 0
|
||||
|
||||
def testConfiguresOutdatedNodes(self):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
LoadMetrics(),
|
||||
@@ -1038,6 +1113,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.provider = MockProvider()
|
||||
lm = LoadMetrics()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(5)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
lm,
|
||||
@@ -1087,12 +1163,22 @@ class AutoscalingTest(unittest.TestCase):
|
||||
autoscaler.update()
|
||||
|
||||
assert autoscaler.pending_launches.value == 0
|
||||
assert len(self.provider.non_terminated_nodes({})) == 3
|
||||
# This actually remained 4 instead of 3, because the other 2 nodes
|
||||
# are not connected and hence we rely more on connected nodes for
|
||||
# min_workers. When the "pending" nodes show up as connected,
|
||||
# then we can terminate the ones connected before.
|
||||
assert len(self.provider.non_terminated_nodes({})) == 4
|
||||
lm.last_used_time_by_ip["172.0.0.2"] = 0
|
||||
lm.last_used_time_by_ip["172.0.0.3"] = 0
|
||||
autoscaler.update()
|
||||
assert autoscaler.pending_launches.value == 0
|
||||
assert len(self.provider.non_terminated_nodes({})) == 1
|
||||
# 2 nodes and not 1 because 1 is needed for min_workers and the other 1
|
||||
# is still not connected.
|
||||
self.waitForNodes(2)
|
||||
# when we connect it, we will see 1 node.
|
||||
lm.last_used_time_by_ip["172.0.0.4"] = 0
|
||||
autoscaler.update()
|
||||
self.waitForNodes(1)
|
||||
|
||||
def testTargetUtilizationFraction(self):
|
||||
config = SMALL_CLUSTER.copy()
|
||||
@@ -1103,6 +1189,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.provider = MockProvider()
|
||||
lm = LoadMetrics()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(12)])
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
lm,
|
||||
@@ -1161,7 +1248,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
runner = MockProcessRunner()
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
|
||||
runner.respond_to_call("json .Config.Env", ["[]" for i in range(3)])
|
||||
lm = LoadMetrics()
|
||||
autoscaler = StandardAutoscaler(
|
||||
config_path,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
import ray
|
||||
import ray.cluster_utils
|
||||
@@ -6,7 +7,7 @@ import ray.test_utils
|
||||
|
||||
|
||||
def test_cross_language_raise_kwargs(shutdown_only):
|
||||
ray.init(_load_code_from_local=True)
|
||||
ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
|
||||
|
||||
with pytest.raises(Exception, match="kwargs"):
|
||||
ray.java_function("a", "b").remote(x="arg1")
|
||||
@@ -16,7 +17,7 @@ def test_cross_language_raise_kwargs(shutdown_only):
|
||||
|
||||
|
||||
def test_cross_language_raise_exception(shutdown_only):
|
||||
ray.init(_load_code_from_local=True)
|
||||
ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
|
||||
|
||||
class PythonObject(object):
|
||||
pass
|
||||
|
||||
@@ -2,7 +2,7 @@ import pytest
|
||||
from contextlib import contextmanager
|
||||
|
||||
import ray.experimental.client.server.server as ray_client_server
|
||||
from ray.experimental.client import ray
|
||||
from ray.experimental.client import ray, reset_api
|
||||
from ray.experimental.client.common import ClientObjectRef
|
||||
|
||||
|
||||
@@ -10,9 +10,12 @@ from ray.experimental.client.common import ClientObjectRef
|
||||
def ray_start_client_server():
|
||||
server = ray_client_server.serve("localhost:50051", test_mode=True)
|
||||
ray.connect("localhost:50051")
|
||||
yield ray
|
||||
ray.disconnect()
|
||||
server.stop(0)
|
||||
try:
|
||||
yield ray
|
||||
finally:
|
||||
ray.disconnect()
|
||||
server.stop(0)
|
||||
reset_api()
|
||||
|
||||
|
||||
def test_real_ray_fallback(ray_start_regular_shared):
|
||||
@@ -34,9 +37,6 @@ def test_real_ray_fallback(ray_start_regular_shared):
|
||||
nodes = ray.get(get_nodes.remote())
|
||||
assert len(nodes) == 1, nodes
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
print(ray.nodes())
|
||||
|
||||
|
||||
def test_nested_function(ray_start_regular_shared):
|
||||
with ray_start_client_server() as ray:
|
||||
@@ -170,6 +170,70 @@ def test_basic_actor(ray_start_regular_shared):
|
||||
assert count == 2
|
||||
|
||||
|
||||
def test_pass_handles(ray_start_regular_shared):
|
||||
"""
|
||||
Test that passing client handles to actors and functions to remote actors
|
||||
in functions (on the server or raylet side) works transparently to the
|
||||
caller.
|
||||
"""
|
||||
with ray_start_client_server() as ray:
|
||||
|
||||
@ray.remote
|
||||
class ExecActor:
|
||||
def exec(self, f, x):
|
||||
return ray.get(f.remote(x))
|
||||
|
||||
def exec_exec(self, actor, f, x):
|
||||
return ray.get(actor.exec.remote(f, x))
|
||||
|
||||
@ray.remote
|
||||
def fact(x):
|
||||
out = 1
|
||||
while x > 0:
|
||||
out = out * x
|
||||
x -= 1
|
||||
return out
|
||||
|
||||
@ray.remote
|
||||
def func_exec(f, x):
|
||||
return ray.get(f.remote(x))
|
||||
|
||||
@ray.remote
|
||||
def func_actor_exec(actor, f, x):
|
||||
return ray.get(actor.exec.remote(f, x))
|
||||
|
||||
@ray.remote
|
||||
def sneaky_func_exec(obj, x):
|
||||
return ray.get(obj["f"].remote(x))
|
||||
|
||||
@ray.remote
|
||||
def sneaky_actor_exec(obj, x):
|
||||
return ray.get(obj["actor"].exec.remote(obj["f"], x))
|
||||
|
||||
def local_fact(x):
|
||||
if x <= 0:
|
||||
return 1
|
||||
return x * local_fact(x - 1)
|
||||
|
||||
assert ray.get(fact.remote(7)) == local_fact(7)
|
||||
assert ray.get(func_exec.remote(fact, 8)) == local_fact(8)
|
||||
test_obj = {}
|
||||
test_obj["f"] = fact
|
||||
assert ray.get(sneaky_func_exec.remote(test_obj, 5)) == local_fact(5)
|
||||
actor_handle = ExecActor.remote()
|
||||
assert ray.get(actor_handle.exec.remote(fact, 7)) == local_fact(7)
|
||||
assert ray.get(func_actor_exec.remote(actor_handle, fact,
|
||||
10)) == local_fact(10)
|
||||
second_actor = ExecActor.remote()
|
||||
assert ray.get(actor_handle.exec_exec.remote(second_actor, fact,
|
||||
9)) == local_fact(9)
|
||||
test_actor_obj = {}
|
||||
test_actor_obj["actor"] = second_actor
|
||||
test_actor_obj["f"] = fact
|
||||
assert ray.get(sneaky_actor_exec.remote(test_actor_obj,
|
||||
4)) == local_fact(4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
from ray.tests.test_experimental_client import ray_start_client_server
|
||||
|
||||
|
||||
def test_get_ray_metadata(ray_start_regular_shared):
|
||||
"""
|
||||
Test the ClusterInfo client data pathway and API surface
|
||||
"""
|
||||
with ray_start_client_server() as ray:
|
||||
ip_address = ray_start_regular_shared["node_ip_address"]
|
||||
|
||||
initialized = ray.is_initialized()
|
||||
assert initialized
|
||||
|
||||
nodes = ray.nodes()
|
||||
assert len(nodes) == 1, nodes
|
||||
assert nodes[0]["NodeManagerAddress"] == ip_address
|
||||
|
||||
current_node_id = "node:" + ip_address
|
||||
|
||||
cluster_resources = ray.cluster_resources()
|
||||
available_resources = ray.available_resources()
|
||||
|
||||
assert cluster_resources["CPU"] == 1.0
|
||||
assert current_node_id in cluster_resources
|
||||
assert current_node_id in available_resources
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user