mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 17:49:47 +08:00
[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)
* remove * internal config updates, remove Checkpointable * Lower object timeout default * remove json * Fix flaky test * Fix unit test
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
|
||||
@@ -42,9 +41,9 @@ cleanup_test_files()
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_with_dashboard", [{
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"agent_register_timeout_ms": 5000
|
||||
})
|
||||
}
|
||||
}],
|
||||
indirect=True)
|
||||
def test_basic(ray_start_with_dashboard):
|
||||
|
||||
@@ -84,7 +84,7 @@ public abstract class BaseMultiLanguageTest {
|
||||
"--load-code-from-local",
|
||||
"--include-java",
|
||||
"--java-worker-options=" + workerOptions,
|
||||
"--internal-config=" + new Gson().toJson(RayConfig.create().rayletConfigParameters)
|
||||
"--system-config=" + new Gson().toJson(RayConfig.create().rayletConfigParameters)
|
||||
);
|
||||
if (!executeCommand(startCommand, 10, getRayStartEnv())) {
|
||||
throw new RuntimeException("Couldn't start ray cluster.");
|
||||
|
||||
@@ -2,8 +2,6 @@ import inspect
|
||||
import logging
|
||||
import weakref
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from collections import namedtuple
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray._raylet
|
||||
import ray.signature as signature
|
||||
@@ -854,11 +852,6 @@ def modify_class(cls):
|
||||
"classes. In Python 2, you must declare the class with "
|
||||
"'class ClassName(object):' instead of 'class ClassName:'.")
|
||||
|
||||
if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
|
||||
raise TypeError(
|
||||
"A checkpointable actor class should implement all abstract "
|
||||
"methods in the `Checkpointable` interface.")
|
||||
|
||||
# Modify the class to have an additional method that will be used for
|
||||
# terminating the worker.
|
||||
class Class(cls):
|
||||
@@ -869,20 +862,6 @@ def modify_class(cls):
|
||||
if worker.mode != ray.LOCAL_MODE:
|
||||
ray.actor.exit_actor()
|
||||
|
||||
def __ray_checkpoint__(self):
|
||||
"""Save a checkpoint.
|
||||
|
||||
This task saves the current state of the actor, the current task
|
||||
frontier according to the raylet, and the checkpoint index
|
||||
(number of tasks executed so far).
|
||||
"""
|
||||
worker = ray.worker.global_worker
|
||||
if not isinstance(self, ray.actor.Checkpointable):
|
||||
raise TypeError(
|
||||
"__ray_checkpoint__.remote() may only be called on actors "
|
||||
"that implement ray.actor.Checkpointable")
|
||||
return worker._save_actor_checkpoint()
|
||||
|
||||
Class.__module__ = cls.__module__
|
||||
Class.__name__ = cls.__name__
|
||||
|
||||
@@ -951,128 +930,3 @@ def exit_actor():
|
||||
assert False, "This process should have terminated."
|
||||
else:
|
||||
raise TypeError("exit_actor called on a non-actor worker.")
|
||||
|
||||
|
||||
CheckpointContext = namedtuple(
|
||||
"CheckpointContext",
|
||||
[
|
||||
# Actor's ID.
|
||||
"actor_id",
|
||||
# Number of tasks executed since last checkpoint.
|
||||
"num_tasks_since_last_checkpoint",
|
||||
# Time elapsed since last checkpoint, in milliseconds.
|
||||
"time_elapsed_ms_since_last_checkpoint",
|
||||
],
|
||||
)
|
||||
"""A namedtuple that contains information about actor's last checkpoint."""
|
||||
|
||||
Checkpoint = namedtuple(
|
||||
"Checkpoint",
|
||||
[
|
||||
# ID of this checkpoint.
|
||||
"checkpoint_id",
|
||||
# The timestamp at which this checkpoint was saved,
|
||||
# represented as milliseconds elapsed since Unix epoch.
|
||||
"timestamp",
|
||||
],
|
||||
)
|
||||
"""A namedtuple that represents a checkpoint."""
|
||||
|
||||
|
||||
class Checkpointable(metaclass=ABCMeta):
|
||||
"""An interface that indicates an actor can be checkpointed."""
|
||||
|
||||
@abstractmethod
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
"""Whether this actor needs to be checkpointed.
|
||||
|
||||
This method will be called after every task. You should implement this
|
||||
callback to decide whether this actor needs to be checkpointed at this
|
||||
time, based on the checkpoint context, or any other factors.
|
||||
|
||||
Args:
|
||||
checkpoint_context: A namedtuple that contains info about last
|
||||
checkpoint.
|
||||
|
||||
Returns:
|
||||
A boolean value that indicates whether this actor needs to be
|
||||
checkpointed.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
"""Save a checkpoint to persistent storage.
|
||||
|
||||
If `should_checkpoint` returns true, this method will be called. You
|
||||
should implement this callback to save actor's checkpoint and the given
|
||||
checkpoint id to persistent storage.
|
||||
|
||||
Args:
|
||||
actor_id: Actor's ID.
|
||||
checkpoint_id: ID of this checkpoint. You should save it together
|
||||
with actor's checkpoint data. And it will be used by the
|
||||
`load_checkpoint` method.
|
||||
Returns:
|
||||
None.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
"""Load actor's previous checkpoint, and restore actor's state.
|
||||
|
||||
This method will be called when an actor is restarted, after
|
||||
actor's constructor.
|
||||
If the actor needs to restore from previous checkpoint, this function
|
||||
should restore actor's state and return the checkpoint ID. Otherwise,
|
||||
it should do nothing and return None.
|
||||
Note, this method must return one of the checkpoint IDs in the
|
||||
`available_checkpoints` list, or None. Otherwise, an exception will be
|
||||
raised.
|
||||
|
||||
Args:
|
||||
actor_id: Actor's ID.
|
||||
available_checkpoints: A list of `Checkpoint` namedtuples that
|
||||
contains all available checkpoint IDs and their timestamps,
|
||||
sorted by timestamp in descending order.
|
||||
Returns:
|
||||
The ID of the checkpoint from which the actor was resumed, or None
|
||||
if the actor should restart from the beginning.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
"""Delete an expired checkpoint.
|
||||
|
||||
This method will be called when an checkpoint is expired. You should
|
||||
implement this method to delete your application checkpoint data.
|
||||
Note, the maximum number of checkpoints kept in the backend can be
|
||||
configured at `RayConfig.num_actor_checkpoints_to_keep`.
|
||||
|
||||
Args:
|
||||
actor_id: ID of the actor.
|
||||
checkpoint_id: ID of the checkpoint that has expired.
|
||||
Returns:
|
||||
None.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def get_checkpoints_for_actor(actor_id):
|
||||
"""Get the available checkpoints for the given actor ID, return a list
|
||||
sorted by checkpoint timestamp in descending order.
|
||||
"""
|
||||
checkpoint_info = ray.state.state.actor_checkpoint_info(actor_id)
|
||||
if checkpoint_info is None:
|
||||
return []
|
||||
checkpoints = [
|
||||
Checkpoint(checkpoint_id, timestamp) for checkpoint_id, timestamp in
|
||||
zip(checkpoint_info["CheckpointIds"], checkpoint_info["Timestamps"])
|
||||
]
|
||||
return sorted(
|
||||
checkpoints,
|
||||
key=lambda checkpoint: checkpoint.timestamp,
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
|
||||
@@ -80,9 +79,6 @@ class Cluster:
|
||||
"min_worker_port": 0,
|
||||
"max_worker_port": 0,
|
||||
}
|
||||
if "_internal_config" in node_args:
|
||||
node_args["_internal_config"] = json.loads(
|
||||
node_args["_internal_config"])
|
||||
ray_params = ray.parameter.RayParams(**node_args)
|
||||
ray_params.update_if_absent(**default_kwargs)
|
||||
if self.head_node is None:
|
||||
|
||||
@@ -544,43 +544,13 @@ class FunctionActorManager:
|
||||
"""
|
||||
|
||||
def actor_method_executor(actor, *args, **kwargs):
|
||||
# Update the actor's task counter to reflect the task we're about
|
||||
# to execute.
|
||||
self._worker.actor_task_counter += 1
|
||||
|
||||
# Execute the assigned method and save a checkpoint if necessary.
|
||||
try:
|
||||
is_bound = (is_class_method(method)
|
||||
or is_static_method(type(actor), method_name))
|
||||
if is_bound:
|
||||
method_returns = method(*args, **kwargs)
|
||||
else:
|
||||
method_returns = method(actor, *args, **kwargs)
|
||||
except Exception as e:
|
||||
# Save the checkpoint before allowing the method exception
|
||||
# to be thrown, but don't save the checkpoint for actor
|
||||
# creation task.
|
||||
if (isinstance(actor, ray.actor.Checkpointable)
|
||||
and self._worker.actor_task_counter != 1):
|
||||
self._save_and_log_checkpoint(actor)
|
||||
raise e
|
||||
# Execute the assigned method.
|
||||
is_bound = (is_class_method(method)
|
||||
or is_static_method(type(actor), method_name))
|
||||
if is_bound:
|
||||
return method(*args, **kwargs)
|
||||
else:
|
||||
# Handle any checkpointing operations before storing the
|
||||
# method's return values.
|
||||
# NOTE(swang): If method_returns is a pointer to the actor's
|
||||
# state and the checkpointing operations can modify the return
|
||||
# values if they mutate the actor's state. Is this okay?
|
||||
if isinstance(actor, ray.actor.Checkpointable):
|
||||
# If this is the first task to execute on the actor, try to
|
||||
# resume from a checkpoint.
|
||||
if self._worker.actor_task_counter == 1:
|
||||
if actor_imported:
|
||||
self._restore_and_log_checkpoint(actor)
|
||||
else:
|
||||
# Save the checkpoint before returning the method's
|
||||
# return values.
|
||||
self._save_and_log_checkpoint(actor)
|
||||
return method_returns
|
||||
return method(actor, *args, **kwargs)
|
||||
|
||||
# Set method_name and method as attributes to the executor clusore
|
||||
# so we can make decision based on these attributes in task executor.
|
||||
@@ -591,86 +561,3 @@ class FunctionActorManager:
|
||||
actor_method_executor.method = method
|
||||
|
||||
return actor_method_executor
|
||||
|
||||
def _save_and_log_checkpoint(self, actor):
|
||||
"""Save an actor checkpoint if necessary and log any errors.
|
||||
|
||||
Args:
|
||||
actor: The actor to checkpoint.
|
||||
|
||||
Returns:
|
||||
The result of the actor's user-defined `save_checkpoint` method.
|
||||
"""
|
||||
actor_id = self._worker.actor_id
|
||||
checkpoint_info = self._worker.actor_checkpoint_info[actor_id]
|
||||
checkpoint_info.num_tasks_since_last_checkpoint += 1
|
||||
now = int(1000 * time.time())
|
||||
checkpoint_context = ray.actor.CheckpointContext(
|
||||
actor_id, checkpoint_info.num_tasks_since_last_checkpoint,
|
||||
now - checkpoint_info.last_checkpoint_timestamp)
|
||||
# If we should take a checkpoint, notify raylet to prepare a checkpoint
|
||||
# and then call `save_checkpoint`.
|
||||
if actor.should_checkpoint(checkpoint_context):
|
||||
try:
|
||||
now = int(1000 * time.time())
|
||||
checkpoint_id = (
|
||||
self._worker.core_worker.prepare_actor_checkpoint(actor_id)
|
||||
)
|
||||
checkpoint_info.checkpoint_ids.append(checkpoint_id)
|
||||
actor.save_checkpoint(actor_id, checkpoint_id)
|
||||
if (len(checkpoint_info.checkpoint_ids) >
|
||||
ray._config.num_actor_checkpoints_to_keep()):
|
||||
actor.checkpoint_expired(
|
||||
actor_id,
|
||||
checkpoint_info.checkpoint_ids.pop(0),
|
||||
)
|
||||
checkpoint_info.num_tasks_since_last_checkpoint = 0
|
||||
checkpoint_info.last_checkpoint_timestamp = now
|
||||
except Exception:
|
||||
# Checkpoint save or reload failed. Notify the driver.
|
||||
traceback_str = ray.utils.format_error_message(
|
||||
traceback.format_exc())
|
||||
ray.utils.push_error_to_driver(
|
||||
self._worker,
|
||||
ray_constants.CHECKPOINT_PUSH_ERROR,
|
||||
traceback_str,
|
||||
job_id=self._worker.current_job_id)
|
||||
|
||||
def _restore_and_log_checkpoint(self, actor):
|
||||
"""Restore an actor from a checkpoint if available and log any errors.
|
||||
|
||||
This should only be called on workers that have just executed an actor
|
||||
creation task.
|
||||
|
||||
Args:
|
||||
actor: The actor to restore from a checkpoint.
|
||||
"""
|
||||
actor_id = self._worker.actor_id
|
||||
try:
|
||||
checkpoints = ray.actor.get_checkpoints_for_actor(actor_id)
|
||||
if len(checkpoints) > 0:
|
||||
# If we found previously saved checkpoints for this actor,
|
||||
# call the `load_checkpoint` callback.
|
||||
checkpoint_id = actor.load_checkpoint(actor_id, checkpoints)
|
||||
if checkpoint_id is not None:
|
||||
# Check that the returned checkpoint id is in the
|
||||
# `available_checkpoints` list.
|
||||
msg = (
|
||||
"`load_checkpoint` must return a checkpoint id that " +
|
||||
"exists in the `available_checkpoints` list, or None.")
|
||||
assert any(checkpoint_id == checkpoint.checkpoint_id
|
||||
for checkpoint in checkpoints), msg
|
||||
# Notify raylet that this actor has been resumed from
|
||||
# a checkpoint.
|
||||
(self._worker.core_worker.
|
||||
notify_actor_resumed_from_checkpoint(
|
||||
actor_id, checkpoint_id))
|
||||
except Exception:
|
||||
# Checkpoint save or reload failed. Notify the driver.
|
||||
traceback_str = ray.utils.format_error_message(
|
||||
traceback.format_exc())
|
||||
ray.utils.push_error_to_driver(
|
||||
self._worker,
|
||||
ray_constants.CHECKPOINT_PUSH_ERROR,
|
||||
traceback_str,
|
||||
job_id=self._worker.current_job_id)
|
||||
|
||||
@@ -21,52 +21,28 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
uint64_t num_heartbeats_warning() const
|
||||
|
||||
int64_t initial_reconstruction_timeout_milliseconds() const
|
||||
int64_t object_timeout_milliseconds() const
|
||||
|
||||
int64_t get_timeout_milliseconds() const
|
||||
|
||||
uint64_t max_lineage_size() const
|
||||
|
||||
int64_t worker_get_request_size() const
|
||||
|
||||
int64_t worker_fetch_request_size() const
|
||||
|
||||
int64_t actor_max_dummy_objects() const
|
||||
|
||||
int64_t raylet_client_num_connect_attempts() const
|
||||
|
||||
int64_t raylet_client_connect_timeout_milliseconds() const
|
||||
|
||||
int64_t raylet_fetch_timeout_milliseconds() const
|
||||
|
||||
int64_t raylet_reconstruction_timeout_milliseconds() const
|
||||
|
||||
int64_t max_num_to_reconstruct() const
|
||||
|
||||
int64_t raylet_fetch_request_size() const
|
||||
|
||||
int64_t kill_worker_timeout_milliseconds() const
|
||||
|
||||
int64_t worker_register_timeout_seconds() const
|
||||
|
||||
int64_t max_time_for_handler_milliseconds() const
|
||||
|
||||
int64_t max_time_for_loop() const
|
||||
|
||||
int64_t redis_db_connect_retries()
|
||||
|
||||
int64_t redis_db_connect_wait_milliseconds() const
|
||||
|
||||
int64_t plasma_default_release_delay() const
|
||||
|
||||
int64_t L3_cache_size_bytes() const
|
||||
|
||||
int64_t max_tasks_to_spillback() const
|
||||
|
||||
int64_t actor_creation_num_spillbacks_warning() const
|
||||
|
||||
int node_manager_forward_task_retry_timeout_milliseconds() const
|
||||
|
||||
int object_manager_pull_timeout_ms() const
|
||||
|
||||
int object_manager_push_timeout_ms() const
|
||||
@@ -79,10 +55,6 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
int num_workers_per_process_java() const
|
||||
|
||||
int64_t max_task_lease_timeout_ms() const
|
||||
|
||||
uint32_t num_actor_checkpoints_to_keep() const
|
||||
|
||||
uint32_t maximum_gcs_deletion_batch_size() const
|
||||
|
||||
int64_t max_direct_call_object_size() const
|
||||
|
||||
@@ -26,18 +26,14 @@ cdef class Config:
|
||||
return RayConfig.instance().num_heartbeats_warning()
|
||||
|
||||
@staticmethod
|
||||
def initial_reconstruction_timeout_milliseconds():
|
||||
def object_timeout_milliseconds():
|
||||
return (RayConfig.instance()
|
||||
.initial_reconstruction_timeout_milliseconds())
|
||||
.object_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def get_timeout_milliseconds():
|
||||
return RayConfig.instance().get_timeout_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def max_lineage_size():
|
||||
return RayConfig.instance().max_lineage_size()
|
||||
|
||||
@staticmethod
|
||||
def worker_get_request_size():
|
||||
return RayConfig.instance().worker_get_request_size()
|
||||
@@ -46,10 +42,6 @@ cdef class Config:
|
||||
def worker_fetch_request_size():
|
||||
return RayConfig.instance().worker_fetch_request_size()
|
||||
|
||||
@staticmethod
|
||||
def actor_max_dummy_objects():
|
||||
return RayConfig.instance().actor_max_dummy_objects()
|
||||
|
||||
@staticmethod
|
||||
def raylet_client_num_connect_attempts():
|
||||
return RayConfig.instance().raylet_client_num_connect_attempts()
|
||||
@@ -64,19 +56,6 @@ cdef class Config:
|
||||
return (RayConfig.instance()
|
||||
.raylet_fetch_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def raylet_reconstruction_timeout_milliseconds():
|
||||
return (RayConfig.instance()
|
||||
.raylet_reconstruction_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def max_num_to_reconstruct():
|
||||
return RayConfig.instance().max_num_to_reconstruct()
|
||||
|
||||
@staticmethod
|
||||
def raylet_fetch_request_size():
|
||||
return RayConfig.instance().raylet_fetch_request_size()
|
||||
|
||||
@staticmethod
|
||||
def kill_worker_timeout_milliseconds():
|
||||
return RayConfig.instance().kill_worker_timeout_milliseconds()
|
||||
@@ -85,14 +64,6 @@ cdef class Config:
|
||||
def worker_register_timeout_seconds():
|
||||
return RayConfig.instance().worker_register_timeout_seconds()
|
||||
|
||||
@staticmethod
|
||||
def max_time_for_handler_milliseconds():
|
||||
return RayConfig.instance().max_time_for_handler_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def max_time_for_loop():
|
||||
return RayConfig.instance().max_time_for_loop()
|
||||
|
||||
@staticmethod
|
||||
def redis_db_connect_retries():
|
||||
return RayConfig.instance().redis_db_connect_retries()
|
||||
@@ -101,27 +72,6 @@ cdef class Config:
|
||||
def redis_db_connect_wait_milliseconds():
|
||||
return RayConfig.instance().redis_db_connect_wait_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def plasma_default_release_delay():
|
||||
return RayConfig.instance().plasma_default_release_delay()
|
||||
|
||||
@staticmethod
|
||||
def L3_cache_size_bytes():
|
||||
return RayConfig.instance().L3_cache_size_bytes()
|
||||
|
||||
@staticmethod
|
||||
def max_tasks_to_spillback():
|
||||
return RayConfig.instance().max_tasks_to_spillback()
|
||||
|
||||
@staticmethod
|
||||
def actor_creation_num_spillbacks_warning():
|
||||
return RayConfig.instance().actor_creation_num_spillbacks_warning()
|
||||
|
||||
@staticmethod
|
||||
def node_manager_forward_task_retry_timeout_milliseconds():
|
||||
return (RayConfig.instance()
|
||||
.node_manager_forward_task_retry_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def object_manager_pull_timeout_ms():
|
||||
return RayConfig.instance().object_manager_pull_timeout_ms()
|
||||
@@ -146,14 +96,6 @@ cdef class Config:
|
||||
def num_workers_per_process_java():
|
||||
return RayConfig.instance().num_workers_per_process_java()
|
||||
|
||||
@staticmethod
|
||||
def max_task_lease_timeout_ms():
|
||||
return RayConfig.instance().max_task_lease_timeout_ms()
|
||||
|
||||
@staticmethod
|
||||
def num_actor_checkpoints_to_keep():
|
||||
return RayConfig.instance().num_actor_checkpoints_to_keep()
|
||||
|
||||
@staticmethod
|
||||
def maximum_gcs_deletion_batch_size():
|
||||
return RayConfig.instance().maximum_gcs_deletion_batch_size()
|
||||
|
||||
+4
-4
@@ -93,9 +93,9 @@ class Node:
|
||||
"The raylet IP address should only be different than the node "
|
||||
"IP address when connecting to an existing raylet; i.e., when "
|
||||
"head=False and connect_only=True.")
|
||||
if ray_params._internal_config and len(
|
||||
ray_params._internal_config) > 0 and (not head
|
||||
and not connect_only):
|
||||
if ray_params._system_config and len(
|
||||
ray_params._system_config) > 0 and (not head
|
||||
and not connect_only):
|
||||
raise ValueError(
|
||||
"Internal config parameters can only be set on the head node.")
|
||||
|
||||
@@ -124,7 +124,7 @@ class Node:
|
||||
self._localhost = socket.gethostbyname("localhost")
|
||||
self._ray_params = ray_params
|
||||
self._redis_address = ray_params.redis_address
|
||||
self._config = ray_params._internal_config or {}
|
||||
self._config = ray_params._system_config or {}
|
||||
|
||||
# Enable Plasma Store as a thread by default.
|
||||
if "plasma_store_as_thread" not in self._config:
|
||||
|
||||
+16
-14
@@ -91,8 +91,9 @@ class RayParams:
|
||||
metrics_agent_port(int): The port to bind metrics agent.
|
||||
metrics_export_port(int): The port at which metrics are exposed
|
||||
through a Prometheus endpoint.
|
||||
_internal_config (str): JSON configuration for overriding
|
||||
RayConfig defaults. For testing purposes ONLY.
|
||||
_system_config (dict): Configuration for overriding RayConfig
|
||||
defaults. Used to set system configuration and for experimental Ray
|
||||
core feature flags.
|
||||
lru_evict (bool): Enable LRU eviction if space is needed.
|
||||
enable_object_reconstruction (bool): Enable plasma reconstruction on
|
||||
failure.
|
||||
@@ -141,7 +142,7 @@ class RayParams:
|
||||
java_worker_options=None,
|
||||
load_code_from_local=False,
|
||||
start_initial_python_workers_for_first_job=False,
|
||||
_internal_config=None,
|
||||
_system_config=None,
|
||||
enable_object_reconstruction=False,
|
||||
metrics_agent_port=None,
|
||||
metrics_export_port=None,
|
||||
@@ -188,7 +189,7 @@ class RayParams:
|
||||
self.metrics_export_port = metrics_export_port
|
||||
self.start_initial_python_workers_for_first_job = (
|
||||
start_initial_python_workers_for_first_job)
|
||||
self._internal_config = _internal_config
|
||||
self._system_config = _system_config
|
||||
self._lru_evict = lru_evict
|
||||
self._enable_object_reconstruction = enable_object_reconstruction
|
||||
self.object_spilling_config = object_spilling_config
|
||||
@@ -197,26 +198,27 @@ class RayParams:
|
||||
# Set the internal config options for LRU eviction.
|
||||
if lru_evict:
|
||||
# Turn off object pinning.
|
||||
if self._internal_config is None:
|
||||
self._internal_config = dict()
|
||||
if self._internal_config.get("object_pinning_enabled", False):
|
||||
if self._system_config is None:
|
||||
self._system_config = dict()
|
||||
if self._system_config.get("object_pinning_enabled", False):
|
||||
raise Exception(
|
||||
"Object pinning cannot be enabled if using LRU eviction.")
|
||||
self._internal_config["object_pinning_enabled"] = False
|
||||
self._internal_config["object_store_full_max_retries"] = -1
|
||||
self._internal_config["free_objects_period_milliseconds"] = 1000
|
||||
self._system_config["object_pinning_enabled"] = False
|
||||
self._system_config["object_store_full_max_retries"] = -1
|
||||
self._system_config["free_objects_period_milliseconds"] = 1000
|
||||
|
||||
# Set the internal config options for object reconstruction.
|
||||
if enable_object_reconstruction:
|
||||
# Turn off object pinning.
|
||||
if self._internal_config is None:
|
||||
self._internal_config = dict()
|
||||
if self._system_config is None:
|
||||
self._system_config = dict()
|
||||
if lru_evict:
|
||||
raise Exception(
|
||||
"Object reconstruction cannot be enabled if using LRU "
|
||||
"eviction.")
|
||||
self._internal_config["lineage_pinning_enabled"] = True
|
||||
self._internal_config["free_objects_period_milliseconds"] = -1
|
||||
print(self._system_config)
|
||||
self._system_config["lineage_pinning_enabled"] = True
|
||||
self._system_config["free_objects_period_milliseconds"] = -1
|
||||
|
||||
def update(self, **kwargs):
|
||||
"""Update the settings according to the keyword arguments.
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"""This is the script for `ray microbenchmark`."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
@@ -110,10 +109,7 @@ def main():
|
||||
|
||||
print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")
|
||||
|
||||
ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"put_small_object_in_memory_store": True
|
||||
}))
|
||||
ray.init(_system_config={"put_small_object_in_memory_store": True})
|
||||
|
||||
value = ray.put(0)
|
||||
|
||||
@@ -138,10 +134,7 @@ def main():
|
||||
timeit("multi client put calls", put_multi_small, 1000)
|
||||
|
||||
ray.shutdown()
|
||||
ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"put_small_object_in_memory_store": False
|
||||
}))
|
||||
ray.init(_system_config={"put_small_object_in_memory_store": False})
|
||||
|
||||
value = ray.put(0)
|
||||
arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
|
||||
|
||||
@@ -358,10 +358,10 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
|
||||
type=str,
|
||||
help="Overwrite the options to start Java workers.")
|
||||
@click.option(
|
||||
"--internal-config",
|
||||
"--system-config",
|
||||
default=None,
|
||||
type=json.loads,
|
||||
help="Do NOT use this. This is for debugging/development purposes ONLY.")
|
||||
help="Override system configuration defaults.")
|
||||
@click.option(
|
||||
"--load-code-from-local",
|
||||
is_flag=True,
|
||||
@@ -394,9 +394,9 @@ def start(node_ip_address, redis_address, address, redis_port, port,
|
||||
dashboard_port, block, plasma_directory, huge_pages,
|
||||
autoscaling_config, no_redirect_worker_output, no_redirect_output,
|
||||
plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
|
||||
java_worker_options, load_code_from_local, internal_config,
|
||||
lru_evict, enable_object_reconstruction, metrics_export_port,
|
||||
log_new_style, log_color, verbose):
|
||||
java_worker_options, load_code_from_local, system_config, lru_evict,
|
||||
enable_object_reconstruction, metrics_export_port, log_new_style,
|
||||
log_color, verbose):
|
||||
"""Start Ray processes manually on the local machine."""
|
||||
cli_logger.old_style = not log_new_style
|
||||
cli_logger.color_mode = log_color
|
||||
@@ -508,7 +508,8 @@ def start(node_ip_address, redis_address, address, redis_port, port,
|
||||
dashboard_port=dashboard_port,
|
||||
java_worker_options=java_worker_options,
|
||||
load_code_from_local=load_code_from_local,
|
||||
_internal_config=internal_config,
|
||||
_system_config=json.loads(system_config)
|
||||
if system_config else system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=metrics_export_port)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import asyncio
|
||||
import errno
|
||||
import io
|
||||
import json
|
||||
import fnmatch
|
||||
import os
|
||||
import subprocess
|
||||
@@ -282,10 +281,9 @@ def recursive_fnmatch(dirpath, pattern):
|
||||
return matches
|
||||
|
||||
|
||||
def generate_internal_config_map(**kwargs):
|
||||
internal_config = json.dumps(kwargs)
|
||||
def generate_system_config_map(**kwargs):
|
||||
ray_kwargs = {
|
||||
"_internal_config": internal_config,
|
||||
"_system_config": kwargs,
|
||||
}
|
||||
return ray_kwargs
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ This file defines the common pytest fixtures used in current directory.
|
||||
"""
|
||||
|
||||
from contextlib import contextmanager
|
||||
import json
|
||||
import pytest
|
||||
import subprocess
|
||||
|
||||
@@ -19,22 +18,22 @@ def shutdown_only():
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def get_default_fixure_internal_config():
|
||||
internal_config = json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
def get_default_fixure_system_config():
|
||||
system_config = {
|
||||
"object_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
"object_store_full_max_retries": 3,
|
||||
"object_store_full_initial_delay_ms": 100,
|
||||
})
|
||||
return internal_config
|
||||
}
|
||||
return system_config
|
||||
|
||||
|
||||
def get_default_fixture_ray_kwargs():
|
||||
internal_config = get_default_fixure_internal_config()
|
||||
system_config = get_default_fixure_system_config()
|
||||
ray_kwargs = {
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": 150 * 1024 * 1024,
|
||||
"_internal_config": internal_config,
|
||||
"_system_config": system_config,
|
||||
}
|
||||
return ray_kwargs
|
||||
|
||||
@@ -125,8 +124,8 @@ def _ray_start_cluster(**kwargs):
|
||||
cluster = Cluster()
|
||||
remote_nodes = []
|
||||
for i in range(num_nodes):
|
||||
if i > 0 and "_internal_config" in init_kwargs:
|
||||
del init_kwargs["_internal_config"]
|
||||
if i > 0 and "_system_config" in init_kwargs:
|
||||
del init_kwargs["_system_config"]
|
||||
remote_nodes.append(cluster.add_node(**init_kwargs))
|
||||
# We assume driver will connect to the head (first node),
|
||||
# so ray init will be invoked if do_init is true
|
||||
@@ -164,10 +163,10 @@ def ray_start_cluster_2_nodes(request):
|
||||
def ray_start_object_store_memory(request):
|
||||
# Start the Ray processes.
|
||||
store_size = request.param
|
||||
internal_config = get_default_fixure_internal_config()
|
||||
system_config = get_default_fixure_system_config()
|
||||
init_kwargs = {
|
||||
"num_cpus": 1,
|
||||
"_internal_config": internal_config,
|
||||
"_system_config": system_config,
|
||||
"object_store_memory": store_size,
|
||||
}
|
||||
ray.init(**init_kwargs)
|
||||
@@ -208,12 +207,12 @@ def call_ray_stop_only():
|
||||
|
||||
@pytest.fixture()
|
||||
def two_node_cluster():
|
||||
internal_config = json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
system_config = {
|
||||
"object_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
})
|
||||
}
|
||||
cluster = ray.cluster_utils.Cluster(
|
||||
head_node_args={"_internal_config": internal_config})
|
||||
head_node_args={"_system_config": system_config})
|
||||
for _ in range(2):
|
||||
remote_node = cluster.add_node(num_cpus=1)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
@@ -8,94 +7,22 @@ import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
from ray.test_utils import (
|
||||
wait_for_condition,
|
||||
wait_for_pid_to_exit,
|
||||
generate_internal_config_map,
|
||||
generate_system_config_map,
|
||||
get_other_nodes,
|
||||
SignalActor,
|
||||
get_error_message,
|
||||
)
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_checkpointable_actor_cls(request):
|
||||
checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(),
|
||||
"ray_temp_checkpoint_dir") + os.sep
|
||||
if not os.path.isdir(checkpoint_dir):
|
||||
os.mkdir(checkpoint_dir)
|
||||
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
self.resumed_from_checkpoint = False
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
|
||||
def node_id(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get(self):
|
||||
return self.value
|
||||
|
||||
def was_resumed_from_checkpoint(self):
|
||||
return self.resumed_from_checkpoint
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Checkpoint the actor when value is increased to 3.
|
||||
should_checkpoint = self.value == 3
|
||||
return should_checkpoint
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
|
||||
# Save checkpoint into a file.
|
||||
with open(self.checkpoint_dir + actor_id, "a+") as f:
|
||||
print(checkpoint_id, self.value, file=f)
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
actor_id = actor_id.hex()
|
||||
filename = self.checkpoint_dir + actor_id
|
||||
# Load checkpoint from the file.
|
||||
if not os.path.isfile(filename):
|
||||
return None
|
||||
|
||||
available_checkpoint_ids = [
|
||||
c.checkpoint_id for c in available_checkpoints
|
||||
]
|
||||
with open(filename, "r") as f:
|
||||
for line in f:
|
||||
checkpoint_id, value = line.strip().split(" ")
|
||||
checkpoint_id = ray.ActorCheckpointID(
|
||||
ray.utils.hex_to_binary(checkpoint_id))
|
||||
if checkpoint_id in available_checkpoint_ids:
|
||||
self.value = int(value)
|
||||
self.resumed_from_checkpoint = True
|
||||
return checkpoint_id
|
||||
return None
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
pass
|
||||
|
||||
return CheckpointableActor
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_init_with_task_retry_delay():
|
||||
address = ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"task_retry_delay_ms": 100
|
||||
}))
|
||||
address = ray.init(_system_config={"task_retry_delay_ms": 100})
|
||||
yield address
|
||||
ray.shutdown()
|
||||
|
||||
@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
|
||||
|
||||
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 1000,
|
||||
"object_timeout_milliseconds": 1000,
|
||||
"task_retry_delay_ms": 100,
|
||||
})
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
cluster.wait_for_nodes()
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
|
||||
assert ray.get(RetryableTask.remote(remote_actor)) == 3
|
||||
|
||||
|
||||
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
|
||||
# NOTE(hchen): we set object_timeout_milliseconds to 1s for
|
||||
# this test. Because if this value is too small, suprious task reconstruction
|
||||
# may happen and cause the test fauilure. If the value is too large, this test
|
||||
# could be very slow. We can remove this once we support dynamic timeout.
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_internal_config_map(
|
||||
initial_reconstruction_timeout_milliseconds=1000,
|
||||
num_heartbeats_timeout=10)
|
||||
generate_system_config_map(
|
||||
object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
|
||||
],
|
||||
indirect=True)
|
||||
def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
@@ -520,287 +446,6 @@ def kill_actor(actor):
|
||||
wait_for_pid_to_exit(pid)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing and restoring from a checkpoint."""
|
||||
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
|
||||
# Call increase 3 times, triggering a checkpoint.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test checkpointing of a remote actor through method invocation."""
|
||||
|
||||
# Define a class that exposes a method to save checkpoints.
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
super(RemoteCheckpointableActor, self).__init__()
|
||||
self._should_checkpoint = False
|
||||
|
||||
def checkpoint(self):
|
||||
self._should_checkpoint = True
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
should_checkpoint = self._should_checkpoint
|
||||
self._should_checkpoint = False
|
||||
return should_checkpoint
|
||||
|
||||
cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
|
||||
actor = cls.remote()
|
||||
# Call increase 3 times.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Call a checkpoint task.
|
||||
actor.checkpoint.remote()
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing on a remote node."""
|
||||
# Place the actor on the remote node.
|
||||
cluster = ray_start_cluster_2_nodes
|
||||
remote_node = list(cluster.worker_nodes)
|
||||
actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
|
||||
actor = actor_cls.remote()
|
||||
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
|
||||
actor = actor_cls.remote()
|
||||
|
||||
# Call increase several times.
|
||||
expected = 0
|
||||
for _ in range(6):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
cluster.remove_node(remote_node[0])
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to complete."""
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def save_checkpoint(self, actor_id, checkpoint_context):
|
||||
raise Exception("Intentional error saving checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will fail.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to load."""
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def load_checkpoint(self, actor_id, checkpoints):
|
||||
raise Exception("Intentional error loading checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will succeed.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint because loading
|
||||
# it failed.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
# This overwrite currently isn't effective,
|
||||
# see https://github.com/ray-project/ray/issues/3926.
|
||||
[generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
|
||||
indirect=True,
|
||||
)
|
||||
def test_deleting_actor_checkpoint(ray_start_regular):
|
||||
"""Test deleting old actor checkpoints."""
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.checkpoint_ids = []
|
||||
|
||||
def get_checkpoint_ids(self):
|
||||
return self.checkpoint_ids
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Save checkpoints after every task
|
||||
return True
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
self.checkpoint_ids.append(checkpoint_id)
|
||||
pass
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
pass
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
assert checkpoint_id == self.checkpoint_ids[0]
|
||||
del self.checkpoint_ids[0]
|
||||
|
||||
actor = CheckpointableActor.remote()
|
||||
for i in range(19):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
|
||||
for _ in range(20):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
|
||||
|
||||
|
||||
def test_bad_checkpointable_actor_class():
|
||||
"""Test error raised if an actor class doesn't implement all abstract
|
||||
methods in the Checkpointable interface."""
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
|
||||
@ray.remote
|
||||
class BadCheckpointableActor(ray.actor.Checkpointable):
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
|
||||
def test_init_exception_in_checkpointable_actor(
|
||||
ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
|
||||
# This test is similar to test_failure.py::test_failed_actor_init.
|
||||
# This test is used to guarantee that checkpointable actor does not
|
||||
# break the same logic.
|
||||
error_message1 = "actor constructor failed"
|
||||
error_message2 = "actor method failed"
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableFailedActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
raise Exception(error_message1)
|
||||
|
||||
def fail_method(self):
|
||||
raise Exception(error_message2)
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
a = CheckpointableFailedActor.remote()
|
||||
|
||||
# Make sure that we get errors from a failed constructor.
|
||||
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0].error_message
|
||||
|
||||
# Make sure that we get errors from a failed method.
|
||||
a.fail_method.remote()
|
||||
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0].error_message
|
||||
|
||||
|
||||
def test_decorated_method(ray_start_regular):
|
||||
def method_invocation_decorator(f):
|
||||
def new_f_invocation(args, kwargs):
|
||||
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
return self.dependency
|
||||
|
||||
# Make sure it is scheduled in the second node.
|
||||
@ray.remote(resources={"node": 1}, num_cpus=1)
|
||||
@ray.remote(resources={"node": 1})
|
||||
class Owner:
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
# Wait until the `Caller` start executing the remote `call` method.
|
||||
ray.get(signal_handle.wait.remote())
|
||||
|
||||
@ray.remote
|
||||
@ray.remote(resources={"caller": 1})
|
||||
class Caller:
|
||||
def call(self, owner_pid, signal_handle, actor_handle):
|
||||
# Notify the `Owner` that the `Caller` is executing the remote
|
||||
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
return True
|
||||
|
||||
cluster = ray_start_cluster
|
||||
node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
|
||||
node_to_be_broken = cluster.add_node(resources={"node": 1})
|
||||
cluster.add_node(resources={"caller": 1})
|
||||
|
||||
owner = Owner.remote()
|
||||
owner_pid = ray.get(owner.get_pid.remote())
|
||||
|
||||
caller = Caller.remote()
|
||||
owner.create_actor.remote(caller)
|
||||
ray.get(owner.create_actor.remote(caller))
|
||||
cluster.remove_node(node_to_be_broken)
|
||||
# Wait for the `Owner` to exit.
|
||||
wait_for_pid_to_exit(owner_pid)
|
||||
|
||||
# It will hang here if location is not properly resolved.
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
try:
|
||||
@@ -241,9 +240,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
|
||||
cluster.add_node(
|
||||
num_cpus=10 * num_gpus_per_raylet,
|
||||
num_gpus=num_gpus_per_raylet,
|
||||
_internal_config=json.dumps({
|
||||
"num_heartbeats_timeout": 1000
|
||||
} if i == 0 else {}))
|
||||
_system_config={"num_heartbeats_timeout": 1000} if i == 0 else {})
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
import socket
|
||||
import time
|
||||
@@ -69,9 +68,9 @@ def test_local_scheduling_first(ray_start_cluster):
|
||||
# Disable worker caching.
|
||||
cluster.add_node(
|
||||
num_cpus=num_cpus,
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"worker_lease_timeout_milliseconds": 0,
|
||||
}))
|
||||
})
|
||||
cluster.add_node(num_cpus=num_cpus)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -332,9 +331,7 @@ def test_wait_reconstruction(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=int(10**8),
|
||||
_internal_config=json.dumps({
|
||||
"object_pinning_enabled": 0
|
||||
}))
|
||||
_system_config={"object_pinning_enabled": 0})
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
@@ -607,11 +604,7 @@ def test_move_log_files_to_old(shutdown_only):
|
||||
|
||||
|
||||
def test_lease_request_leak(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
}))
|
||||
ray.init(num_cpus=1, _system_config={"object_timeout_milliseconds": 200})
|
||||
assert len(ray.objects()) == 0
|
||||
|
||||
@ray.remote
|
||||
|
||||
@@ -3,7 +3,6 @@ import numpy as np
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
import pytest
|
||||
import sys
|
||||
import json
|
||||
|
||||
import ray
|
||||
import ray.experimental.array.remote as ra
|
||||
@@ -59,13 +58,13 @@ def test_distributed_array_assemble(ray_start_2_cpus, reload_modules):
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_2_nodes",
|
||||
[{
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
# NOTE(swang): If plasma store notifications to the raylet for new
|
||||
# objects are delayed by long enough, then this causes concurrent
|
||||
# fetch calls to timeout and mistakenly mark the object as lost.
|
||||
# Set the timeout very high to prevent this.
|
||||
"initial_reconstruction_timeout_milliseconds": 60000,
|
||||
})
|
||||
"object_timeout_milliseconds": 60000,
|
||||
}
|
||||
}],
|
||||
indirect=True)
|
||||
def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# coding: utf-8
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
@@ -206,10 +205,7 @@ def test_background_tasks_with_max_calls(shutdown_only):
|
||||
|
||||
|
||||
def test_fair_queueing(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1, _internal_config=json.dumps({
|
||||
"fair_queueing_enabled": 1
|
||||
}))
|
||||
ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1})
|
||||
|
||||
@ray.remote
|
||||
def h():
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
@@ -333,19 +332,16 @@ def test_call_chain(ray_start_cluster):
|
||||
assert ray.get(x) == 100
|
||||
|
||||
|
||||
def test_internal_config_when_connecting(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
"object_pinning_enabled": 0,
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
})
|
||||
def test_system_config_when_connecting(ray_start_cluster):
|
||||
config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
|
||||
cluster = ray.cluster_utils.Cluster()
|
||||
cluster.add_node(
|
||||
_internal_config=config, object_store_memory=100 * 1024 * 1024)
|
||||
_system_config=config, object_store_memory=100 * 1024 * 1024)
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
# Specifying _internal_config when connecting to a cluster is disallowed.
|
||||
# Specifying _system_config when connecting to a cluster is disallowed.
|
||||
with pytest.raises(ValueError):
|
||||
ray.init(address=cluster.address, _internal_config=config)
|
||||
ray.init(address=cluster.address, _system_config=config)
|
||||
|
||||
# Check that the config was picked up (object pinning is disabled).
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
@@ -138,9 +137,9 @@ def check_components_alive(cluster, component_type, check_component_alive):
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 4,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 100
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_raylet_failed(ray_start_cluster):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -908,12 +907,12 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
|
||||
|
||||
def test_connect_with_disconnected_node(shutdown_only):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 50,
|
||||
"raylet_heartbeat_timeout_milliseconds": 10,
|
||||
})
|
||||
}
|
||||
cluster = Cluster()
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
ray.init(address=cluster.address)
|
||||
p = init_error_pubsub()
|
||||
errors = get_error_message(p, 1, timeout=5)
|
||||
@@ -943,9 +942,9 @@ def test_connect_with_disconnected_node(shutdown_only):
|
||||
"ray_start_cluster_head", [{
|
||||
"num_cpus": 5,
|
||||
"object_store_memory": 10**8,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"object_store_full_max_retries": 0
|
||||
})
|
||||
}
|
||||
}],
|
||||
indirect=True)
|
||||
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
|
||||
@@ -965,9 +964,7 @@ def test_fill_object_store_exception(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=2,
|
||||
object_store_memory=10**8,
|
||||
_internal_config=json.dumps({
|
||||
"object_store_full_max_retries": 0
|
||||
}))
|
||||
_system_config={"object_store_full_max_retries": 0})
|
||||
|
||||
@ray.remote
|
||||
def expensive_task():
|
||||
@@ -997,14 +994,14 @@ def test_fill_object_store_exception(shutdown_only):
|
||||
|
||||
|
||||
def test_fill_object_store_lru_fallback(shutdown_only):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"free_objects_batch_size": 1,
|
||||
})
|
||||
}
|
||||
ray.init(
|
||||
num_cpus=2,
|
||||
object_store_memory=10**8,
|
||||
lru_evict=True,
|
||||
_internal_config=config)
|
||||
_system_config=config)
|
||||
|
||||
@ray.remote
|
||||
def expensive_task():
|
||||
@@ -1125,13 +1122,13 @@ def test_serialized_id(ray_start_cluster):
|
||||
[(False, False), (False, True), (True, False),
|
||||
(True, True)])
|
||||
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
})
|
||||
}
|
||||
cluster = Cluster()
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the parent actor.
|
||||
node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
|
||||
|
||||
@@ -3,7 +3,7 @@ import sys
|
||||
import ray
|
||||
import pytest
|
||||
from ray.test_utils import (
|
||||
generate_internal_config_map,
|
||||
generate_system_config_map,
|
||||
wait_for_condition,
|
||||
wait_for_pid_to_exit,
|
||||
)
|
||||
@@ -22,7 +22,7 @@ def increase(x):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
[generate_internal_config_map(num_heartbeats_timeout=20)],
|
||||
[generate_system_config_map(num_heartbeats_timeout=20)],
|
||||
indirect=True)
|
||||
def test_gcs_server_restart(ray_start_regular):
|
||||
actor1 = Increase.remote()
|
||||
@@ -45,7 +45,7 @@ def test_gcs_server_restart(ray_start_regular):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
[generate_internal_config_map(num_heartbeats_timeout=20)],
|
||||
[generate_system_config_map(num_heartbeats_timeout=20)],
|
||||
indirect=True)
|
||||
def test_gcs_server_restart_during_actor_creation(ray_start_regular):
|
||||
ids = []
|
||||
@@ -64,7 +64,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head",
|
||||
[generate_internal_config_map(num_heartbeats_timeout=20)],
|
||||
[generate_system_config_map(num_heartbeats_timeout=20)],
|
||||
indirect=True)
|
||||
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
|
||||
"""Checks that the node failure detector is correct when gcs server restart.
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import pytest
|
||||
try:
|
||||
import pytest_timeout
|
||||
@@ -140,9 +139,9 @@ def test_load_report(shutdown_only, max_shapes):
|
||||
cluster = ray.init(
|
||||
num_cpus=1,
|
||||
resources={resource1: 1},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"max_resource_shapes_per_load_report": max_shapes,
|
||||
}))
|
||||
})
|
||||
redis = ray.services.create_redis_client(
|
||||
cluster["redis_address"],
|
||||
password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
|
||||
@@ -48,10 +48,7 @@ def _setup_cluster_for_test(ray_start_cluster):
|
||||
NUM_NODES = 2
|
||||
cluster = ray_start_cluster
|
||||
# Add a head node.
|
||||
cluster.add_node(
|
||||
_internal_config=json.dumps({
|
||||
"metrics_report_interval_ms": 1000
|
||||
}))
|
||||
cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
|
||||
# Add worker nodes.
|
||||
[cluster.add_node() for _ in range(NUM_NODES - 1)]
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
@@ -6,7 +6,7 @@ import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.monitor import Monitor
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import generate_internal_config_map, SignalActor
|
||||
from ray.test_utils import generate_system_config_map, SignalActor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -33,12 +33,11 @@ def test_shutdown():
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_internal_config_map(
|
||||
num_heartbeats_timeout=20,
|
||||
initial_reconstruction_timeout_milliseconds=12345)
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, object_timeout_milliseconds=12345)
|
||||
],
|
||||
indirect=True)
|
||||
def test_internal_config(ray_start_cluster_head):
|
||||
def test_system_config(ray_start_cluster_head):
|
||||
"""Checks that the internal configuration setting works.
|
||||
|
||||
We set the cluster to timeout nodes after 2 seconds of no timeouts. We
|
||||
@@ -52,8 +51,7 @@ def test_internal_config(ray_start_cluster_head):
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
assert ray._config.initial_reconstruction_timeout_milliseconds(
|
||||
) == 12345
|
||||
assert ray._config.object_timeout_milliseconds() == 12345
|
||||
assert ray._config.num_heartbeats_timeout() == 20
|
||||
|
||||
ray.get([f.remote() for _ in range(5)])
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
@@ -19,9 +18,7 @@ def test_initial_workers(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
include_dashboard=True,
|
||||
_internal_config=json.dumps({
|
||||
"enable_multi_tenancy": True
|
||||
}))
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
raylet = ray.nodes()[0]
|
||||
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
|
||||
raylet["NodeManagerPort"])
|
||||
@@ -43,11 +40,7 @@ def test_initial_workers(shutdown_only):
|
||||
# different drivers were scheduled to the same worker process, that is, tasks
|
||||
# of different jobs were not correctly isolated during execution.
|
||||
def test_multi_drivers(shutdown_only):
|
||||
info = ray.init(
|
||||
num_cpus=10,
|
||||
_internal_config=json.dumps({
|
||||
"enable_multi_tenancy": True
|
||||
}))
|
||||
info = ray.init(num_cpus=10, _system_config={"enable_multi_tenancy": True})
|
||||
|
||||
driver_code = """
|
||||
import os
|
||||
@@ -120,9 +113,7 @@ def test_worker_env(shutdown_only):
|
||||
"foo1": "bar1",
|
||||
"foo2": "bar2"
|
||||
}),
|
||||
_internal_config=json.dumps({
|
||||
"enable_multi_tenancy": True
|
||||
}))
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
|
||||
@ray.remote
|
||||
def get_env(key):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
@@ -145,10 +144,10 @@ def check_components_alive(cluster, component_type, check_component_alive):
|
||||
[{
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 4,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
# Raylet codepath is not stable with a shorter timeout.
|
||||
"num_heartbeats_timeout": 10
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_raylet_failed(ray_start_cluster):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
@@ -19,13 +18,13 @@ import ray.ray_constants as ray_constants
|
||||
"num_cpus": 1,
|
||||
"num_nodes": 4,
|
||||
"object_store_memory": 1000 * 1024 * 1024,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
# Raylet codepath is not stable with a shorter timeout.
|
||||
"num_heartbeats_timeout": 10,
|
||||
"object_manager_pull_timeout_ms": 1000,
|
||||
"object_manager_push_timeout_ms": 1000,
|
||||
"object_manager_repeated_push_delay_ms": 1000,
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_object_reconstruction(ray_start_cluster):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import multiprocessing
|
||||
import numpy as np
|
||||
import pytest
|
||||
@@ -207,14 +206,14 @@ def test_object_transfer_retry(ray_start_cluster):
|
||||
# Also, force the receiving object manager to retry the pull sooner. We
|
||||
# make the chunk size smaller in order to make it easier to test objects
|
||||
# with multiple chunks.
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"object_manager_repeated_push_delay_ms": repeated_push_delay * 1000,
|
||||
"object_manager_pull_timeout_ms": repeated_push_delay * 1000 / 4,
|
||||
"object_manager_default_chunk_size": 1000
|
||||
})
|
||||
}
|
||||
object_store_memory = 150 * 1024 * 1024
|
||||
cluster.add_node(
|
||||
object_store_memory=object_store_memory, _internal_config=config)
|
||||
object_store_memory=object_store_memory, _system_config=config)
|
||||
cluster.add_node(num_gpus=1, object_store_memory=object_store_memory)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
|
||||
@@ -17,10 +17,10 @@ def test_spill_objects_manually(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}))
|
||||
})
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
replay_buffer = []
|
||||
pinned_objects = set()
|
||||
@@ -64,10 +64,10 @@ def test_spill_objects_manually_from_workers(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}))
|
||||
})
|
||||
|
||||
@ray.remote
|
||||
def _worker():
|
||||
@@ -90,10 +90,10 @@ def test_spill_objects_manually_with_workers(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}))
|
||||
})
|
||||
arrays = [np.random.rand(100 * 1024) for _ in range(50)]
|
||||
objects = [ray.put(arr) for arr in arrays]
|
||||
|
||||
@@ -117,7 +117,7 @@ def test_spill_objects_manually_with_workers(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": json.dumps({
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}),
|
||||
@@ -159,7 +159,7 @@ def test_spill_objects_automatically(shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_internal_config=json.dumps({
|
||||
_system_config=json.dumps({
|
||||
"max_io_workers": 4,
|
||||
"object_store_full_max_retries": 2,
|
||||
"object_store_full_initial_delay_ms": 10,
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
@@ -16,14 +15,14 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
def test_cached_object(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
node_to_kill = cluster.add_node(
|
||||
@@ -61,18 +60,17 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -121,18 +119,17 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -171,18 +168,17 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -229,18 +225,17 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -303,18 +298,17 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -384,18 +378,17 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -445,18 +438,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
object_store_memory=10**8,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
@@ -493,17 +485,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
|
||||
|
||||
def test_reconstruction_stress(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"max_direct_call_object_size": 100,
|
||||
"task_retry_delay_ms": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0, _internal_config=config, enable_object_reconstruction=True)
|
||||
num_cpus=0, _system_config=config, enable_object_reconstruction=True)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
node_to_kill = cluster.add_node(
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# coding: utf-8
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
@@ -18,14 +17,14 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
@pytest.fixture
|
||||
def one_worker_100MiB(request):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"object_store_full_max_retries": 2,
|
||||
"task_retry_delay_ms": 0,
|
||||
})
|
||||
}
|
||||
yield ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_internal_config=config)
|
||||
_system_config=config)
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@@ -245,9 +244,7 @@ def test_pending_task_dependency_pinning(one_worker_100MiB):
|
||||
def test_feature_flag(shutdown_only):
|
||||
ray.init(
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_internal_config=json.dumps({
|
||||
"object_pinning_enabled": 0
|
||||
}))
|
||||
_system_config={"object_pinning_enabled": 0})
|
||||
|
||||
@ray.remote
|
||||
def f(array):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
@@ -20,15 +19,15 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
@pytest.fixture
|
||||
def one_worker_100MiB(request):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"object_store_full_max_retries": 2,
|
||||
"task_retry_delay_ms": 0,
|
||||
"initial_reconstruction_timeout_milliseconds": 1000,
|
||||
})
|
||||
"object_timeout_milliseconds": 1000,
|
||||
}
|
||||
yield ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_internal_config=config)
|
||||
_system_config=config)
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
@@ -23,9 +22,9 @@ def ray_start_reconstruction(request):
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": plasma_store_memory // num_nodes,
|
||||
"redis_max_memory": 10**7,
|
||||
"_internal_config": json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
})
|
||||
"_system_config": {
|
||||
"object_timeout_milliseconds": 200
|
||||
}
|
||||
})
|
||||
for i in range(num_nodes - 1):
|
||||
cluster.add_node(
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import inspect
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import pytest
|
||||
@@ -45,9 +44,9 @@ def _start_new_cluster():
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"num_cpus": 1,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
}
|
||||
})
|
||||
# Pytest doesn't play nicely with imports
|
||||
register_trainable("__fake_remote", MockRemoteTrainer)
|
||||
@@ -74,9 +73,9 @@ def start_connected_emptyhead_cluster():
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"num_cpus": 0,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
}
|
||||
})
|
||||
# Pytest doesn't play nicely with imports
|
||||
_register_all()
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
@@ -190,9 +189,9 @@ class RayExecutorQueueTest(unittest.TestCase):
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"num_cpus": 1,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
}
|
||||
})
|
||||
# Pytest doesn't play nicely with imports
|
||||
_register_all()
|
||||
|
||||
+11
-10
@@ -107,7 +107,6 @@ class Worker:
|
||||
self.actors = {}
|
||||
# Information used to maintain actor checkpoints.
|
||||
self.actor_checkpoint_info = {}
|
||||
self.actor_task_counter = 0
|
||||
# When the worker is constructed. Record the original value of the
|
||||
# CUDA_VISIBLE_DEVICES environment variable.
|
||||
self.original_gpu_ids = ray.utils.get_cuda_visible_devices()
|
||||
@@ -515,7 +514,7 @@ def init(address=None,
|
||||
load_code_from_local=False,
|
||||
java_worker_options=None,
|
||||
use_pickle=True,
|
||||
_internal_config=None,
|
||||
_system_config=None,
|
||||
lru_evict=False,
|
||||
enable_object_reconstruction=False,
|
||||
_metrics_export_port=None,
|
||||
@@ -631,8 +630,9 @@ def init(address=None,
|
||||
module or from the GCS.
|
||||
java_worker_options: Overwrite the options to start Java workers.
|
||||
use_pickle: Deprecated.
|
||||
_internal_config (str): JSON configuration for overriding
|
||||
RayConfig defaults. For testing purposes ONLY.
|
||||
_system_config (dict): Configuration for overriding RayConfig
|
||||
defaults. Used to set system configuration and for experimental Ray
|
||||
core feature flags.
|
||||
lru_evict (bool): If True, when an object store is full, it will evict
|
||||
objects in LRU order to make more space and when under memory
|
||||
pressure, ray.UnreconstructableError may be thrown. If False, then
|
||||
@@ -706,8 +706,9 @@ def init(address=None,
|
||||
|
||||
raylet_ip_address = node_ip_address
|
||||
|
||||
_internal_config = (json.loads(_internal_config)
|
||||
if _internal_config else {})
|
||||
_system_config = _system_config or {}
|
||||
if not isinstance(_system_config, dict):
|
||||
raise TypeError("The _system_config must be a dict.")
|
||||
|
||||
global _global_node
|
||||
if redis_address is None:
|
||||
@@ -742,7 +743,7 @@ def init(address=None,
|
||||
load_code_from_local=load_code_from_local,
|
||||
java_worker_options=java_worker_options,
|
||||
start_initial_python_workers_for_first_job=True,
|
||||
_internal_config=_internal_config,
|
||||
_system_config=_system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=_metrics_export_port,
|
||||
@@ -798,9 +799,9 @@ def init(address=None,
|
||||
if java_worker_options is not None:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"java_worker_options must not be provided.")
|
||||
if _internal_config is not None and len(_internal_config) != 0:
|
||||
if _system_config is not None and len(_system_config) != 0:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"_internal_config must not be provided.")
|
||||
"_system_config must not be provided.")
|
||||
if lru_evict:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"lru_evict must not be provided.")
|
||||
@@ -818,7 +819,7 @@ def init(address=None,
|
||||
object_ref_seed=object_ref_seed,
|
||||
temp_dir=temp_dir,
|
||||
load_code_from_local=load_code_from_local,
|
||||
_internal_config=_internal_config,
|
||||
_system_config=_system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=_metrics_export_port)
|
||||
|
||||
@@ -122,13 +122,13 @@ if __name__ == "__main__":
|
||||
object_spilling_config = {}
|
||||
external_storage.setup_external_storage(object_spilling_config)
|
||||
|
||||
internal_config = {}
|
||||
system_config = {}
|
||||
if args.config_list is not None:
|
||||
config_list = args.config_list.split(",")
|
||||
if len(config_list) > 1:
|
||||
i = 0
|
||||
while i < len(config_list):
|
||||
internal_config[config_list[i]] = config_list[i + 1]
|
||||
system_config[config_list[i]] = config_list[i + 1]
|
||||
i += 2
|
||||
|
||||
raylet_ip_address = args.raylet_ip_address
|
||||
@@ -146,7 +146,7 @@ if __name__ == "__main__":
|
||||
temp_dir=args.temp_dir,
|
||||
load_code_from_local=args.load_code_from_local,
|
||||
metrics_agent_port=args.metrics_agent_port,
|
||||
_internal_config=json.dumps(internal_config),
|
||||
_system_config=system_config,
|
||||
)
|
||||
|
||||
node = ray.node.Node(
|
||||
|
||||
@@ -131,11 +131,12 @@ RAY_CONFIG(int64_t, grpc_server_retry_timeout_milliseconds, 1000)
|
||||
// of creation retries will be MAX(actor_creation_min_retries, max_restarts).
|
||||
RAY_CONFIG(uint64_t, actor_creation_min_retries, 3)
|
||||
|
||||
/// The initial period for a task execution lease. The lease will expire this
|
||||
/// many milliseconds after the first acquisition of the lease. Nodes that
|
||||
/// require an object will not try to reconstruct the task until at least
|
||||
/// this many milliseconds.
|
||||
RAY_CONFIG(int64_t, initial_reconstruction_timeout_milliseconds, 10000)
|
||||
/// When trying to resolve an object, the initial period that the raylet will
|
||||
/// wait before contacting the object's owner to check if the object is still
|
||||
/// available. This is a lower bound on the time to report the loss of an
|
||||
/// object stored in the distributed object store in the case that the worker
|
||||
/// that created the original ObjectRef dies.
|
||||
RAY_CONFIG(int64_t, object_timeout_milliseconds, 100)
|
||||
|
||||
/// The maximum duration that workers can hold on to another worker's lease
|
||||
/// for direct task submission until it must be returned to the raylet.
|
||||
@@ -151,15 +152,6 @@ RAY_CONFIG(int64_t, get_timeout_milliseconds, 1000)
|
||||
RAY_CONFIG(int64_t, worker_get_request_size, 10000)
|
||||
RAY_CONFIG(int64_t, worker_fetch_request_size, 10000)
|
||||
|
||||
/// This is used to bound the size of the Raylet's lineage cache. This is
|
||||
/// the maximum uncommitted lineage size that any remote task in the cache
|
||||
/// can have before eviction will be attempted.
|
||||
RAY_CONFIG(uint64_t, max_lineage_size, 100)
|
||||
|
||||
/// This is a temporary constant used by actors to determine how many dummy
|
||||
/// objects to store.
|
||||
RAY_CONFIG(int64_t, actor_max_dummy_objects, 1000)
|
||||
|
||||
/// Number of times raylet client tries connecting to a raylet.
|
||||
RAY_CONFIG(int64_t, raylet_client_num_connect_attempts, 10)
|
||||
RAY_CONFIG(int64_t, raylet_client_connect_timeout_milliseconds, 1000)
|
||||
@@ -169,62 +161,18 @@ RAY_CONFIG(int64_t, raylet_client_connect_timeout_milliseconds, 1000)
|
||||
/// the number of missing task dependencies.
|
||||
RAY_CONFIG(int64_t, raylet_fetch_timeout_milliseconds, 1000)
|
||||
|
||||
/// The duration that the raylet will wait between initiating
|
||||
/// reconstruction calls for missing task dependencies. If there are many
|
||||
/// missing task dependencies, we will only iniate reconstruction calls for
|
||||
/// some of them each time.
|
||||
RAY_CONFIG(int64_t, raylet_reconstruction_timeout_milliseconds, 1000)
|
||||
|
||||
/// The maximum number of objects that the raylet will issue
|
||||
/// reconstruct calls for in a single pass through the reconstruct object
|
||||
/// timeout handler.
|
||||
RAY_CONFIG(int64_t, max_num_to_reconstruct, 10000)
|
||||
|
||||
/// The maximum number of objects to include in a single fetch request in the
|
||||
/// regular raylet fetch timeout handler.
|
||||
RAY_CONFIG(int64_t, raylet_fetch_request_size, 10000)
|
||||
|
||||
/// The maximum number of active object IDs to report in a heartbeat.
|
||||
/// # NOTE: currently disabled by default.
|
||||
RAY_CONFIG(size_t, raylet_max_active_object_ids, 0)
|
||||
|
||||
/// The duration that we wait after sending a worker SIGTERM before sending
|
||||
/// the worker SIGKILL.
|
||||
RAY_CONFIG(int64_t, kill_worker_timeout_milliseconds, 100)
|
||||
|
||||
/// The duration that we wait after the worekr is launched before the
|
||||
/// The duration that we wait after the worker is launched before the
|
||||
/// starting_worker_timeout_callback() is called.
|
||||
RAY_CONFIG(int64_t, worker_register_timeout_seconds, 30)
|
||||
|
||||
/// This is a timeout used to cause failures in the plasma manager and raylet
|
||||
/// when certain event loop handlers take too long.
|
||||
RAY_CONFIG(int64_t, max_time_for_handler_milliseconds, 1000)
|
||||
|
||||
/// This is used to cause failures when a certain loop in redis.cc which
|
||||
/// synchronously looks up object manager addresses in redis is slow.
|
||||
RAY_CONFIG(int64_t, max_time_for_loop, 1000)
|
||||
|
||||
/// Allow up to 5 seconds for connecting to Redis.
|
||||
RAY_CONFIG(int64_t, redis_db_connect_retries, 50)
|
||||
RAY_CONFIG(int64_t, redis_db_connect_wait_milliseconds, 100)
|
||||
|
||||
/// TODO(rkn): These constants are currently unused.
|
||||
RAY_CONFIG(int64_t, plasma_default_release_delay, 64)
|
||||
RAY_CONFIG(int64_t, L3_cache_size_bytes, 100000000)
|
||||
|
||||
/// Constants for the spillback scheduling policy.
|
||||
RAY_CONFIG(int64_t, max_tasks_to_spillback, 10)
|
||||
|
||||
/// Every time an actor creation task has been spilled back a number of times
|
||||
/// that is a multiple of this quantity, a warning will be pushed to the
|
||||
/// corresponding driver. Since spillback currently occurs on a 100ms timer,
|
||||
/// a value of 100 corresponds to a warning every 10 seconds.
|
||||
RAY_CONFIG(int64_t, actor_creation_num_spillbacks_warning, 100)
|
||||
|
||||
/// If a node manager attempts to forward a task to another node manager and
|
||||
/// the forward fails, then it will resubmit the task after this duration.
|
||||
RAY_CONFIG(int64_t, node_manager_forward_task_retry_timeout_milliseconds, 1000)
|
||||
|
||||
/// Timeout, in milliseconds, to wait before retrying a failed pull in the
|
||||
/// ObjectManager.
|
||||
RAY_CONFIG(int, object_manager_pull_timeout_ms, 10000)
|
||||
@@ -251,15 +199,6 @@ RAY_CONFIG(int, num_workers_per_process_python, 1)
|
||||
/// Number of workers per Java worker process
|
||||
RAY_CONFIG(int, num_workers_per_process_java, 10)
|
||||
|
||||
/// Maximum timeout in milliseconds within which a task lease must be renewed.
|
||||
RAY_CONFIG(int64_t, max_task_lease_timeout_ms, 60000)
|
||||
|
||||
/// Maximum number of checkpoints to keep in GCS for an actor.
|
||||
/// Note: this number should be set to at least 2. Because saving a application
|
||||
/// checkpoint isn't atomic with saving the backend checkpoint, and it will break
|
||||
/// if this number is set to 1 and users save application checkpoints in place.
|
||||
RAY_CONFIG(int32_t, num_actor_checkpoints_to_keep, 20)
|
||||
|
||||
/// Maximum number of ids in one batch to send to GCS to delete keys.
|
||||
RAY_CONFIG(uint32_t, maximum_gcs_deletion_batch_size, 1000)
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ std::string TestSetupUtil::StartGcsServer(const std::string &redis_address) {
|
||||
ray::JoinPaths(ray::GetUserTempDir(), "gcs_server" + ObjectID::FromRandom().Hex());
|
||||
std::vector<std::string> cmdargs(
|
||||
{TEST_GCS_SERVER_EXEC_PATH, "--redis_address=" + redis_address, "--redis_port=6379",
|
||||
"--config_list=initial_reconstruction_timeout_milliseconds,2000"});
|
||||
"--config_list=object_timeout_milliseconds,2000"});
|
||||
RAY_LOG(INFO) << "Start gcs server command: " << CreateCommandLine(cmdargs);
|
||||
RAY_CHECK(!Process::Spawn(cmdargs, true, gcs_server_socket_name + ".pid").second);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
||||
@@ -153,7 +153,7 @@ std::string TestSetupUtil::StartRaylet(const std::string &store_socket_name,
|
||||
"--python_worker_command=" +
|
||||
CreateCommandLine({TEST_MOCK_WORKER_EXEC_PATH, store_socket_name,
|
||||
raylet_socket_name, std::to_string(port)}),
|
||||
"--config_list=initial_reconstruction_timeout_milliseconds,2000"});
|
||||
"--config_list=object_timeout_milliseconds,2000"});
|
||||
RAY_LOG(DEBUG) << "Raylet Start command: " << CreateCommandLine(cmdargs);
|
||||
RAY_CHECK(!Process::Spawn(cmdargs, true, raylet_socket_name + ".pid").second);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
||||
|
||||
@@ -302,11 +302,11 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
|
||||
options_.raylet_ip_address, options_.node_manager_port, *client_call_manager_);
|
||||
ClientID local_raylet_id;
|
||||
int assigned_port;
|
||||
std::unordered_map<std::string, std::string> internal_config;
|
||||
std::unordered_map<std::string, std::string> system_config;
|
||||
local_raylet_client_ = std::shared_ptr<raylet::RayletClient>(new raylet::RayletClient(
|
||||
io_service_, std::move(grpc_client), options_.raylet_socket, GetWorkerID(),
|
||||
options_.worker_type, worker_context_.GetCurrentJobID(), options_.language,
|
||||
options_.node_ip_address, &local_raylet_id, &assigned_port, &internal_config,
|
||||
options_.node_ip_address, &local_raylet_id, &assigned_port, &system_config,
|
||||
options_.serialized_job_config));
|
||||
connected_ = true;
|
||||
|
||||
@@ -316,7 +316,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
|
||||
"start'.";
|
||||
|
||||
// NOTE(edoakes): any initialization depending on RayConfig must happen after this line.
|
||||
RayConfig::instance().initialize(internal_config);
|
||||
RayConfig::instance().initialize(system_config);
|
||||
// Start RPC server after all the task receivers are properly initialized and we have
|
||||
// our assigned port from the raylet.
|
||||
core_worker_server_ = std::unique_ptr<rpc::GrpcServer>(
|
||||
|
||||
@@ -86,9 +86,8 @@ Java_io_ray_runtime_gcs_GlobalStateAccessor_nativeGetInternalConfig(
|
||||
JNIEnv *env, jobject o, jlong gcs_accessor_ptr) {
|
||||
auto *gcs_accessor =
|
||||
reinterpret_cast<ray::gcs::GlobalStateAccessor *>(gcs_accessor_ptr);
|
||||
auto internal_config_string = gcs_accessor->GetInternalConfig();
|
||||
return static_cast<jbyteArray>(
|
||||
NativeStringToJavaByteArray(env, internal_config_string));
|
||||
auto system_config_string = gcs_accessor->GetInternalConfig();
|
||||
return static_cast<jbyteArray>(NativeStringToJavaByteArray(env, system_config_string));
|
||||
}
|
||||
|
||||
JNIEXPORT jobject JNICALL
|
||||
|
||||
@@ -342,16 +342,16 @@ void GcsNodeManager::HandleSetInternalConfig(const rpc::SetInternalConfigRequest
|
||||
void GcsNodeManager::HandleGetInternalConfig(const rpc::GetInternalConfigRequest &request,
|
||||
rpc::GetInternalConfigReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) {
|
||||
auto get_internal_config = [reply, send_reply_callback](
|
||||
ray::Status status,
|
||||
const boost::optional<rpc::StoredConfig> &config) {
|
||||
auto get_system_config = [reply, send_reply_callback](
|
||||
ray::Status status,
|
||||
const boost::optional<rpc::StoredConfig> &config) {
|
||||
if (config.has_value()) {
|
||||
reply->mutable_config()->CopyFrom(config.get());
|
||||
}
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
||||
};
|
||||
RAY_CHECK_OK(gcs_table_storage_->InternalConfigTable().Get(UniqueID::Nil(),
|
||||
get_internal_config));
|
||||
RAY_CHECK_OK(
|
||||
gcs_table_storage_->InternalConfigTable().Get(UniqueID::Nil(), get_system_config));
|
||||
}
|
||||
|
||||
std::shared_ptr<rpc::GcsNodeInfo> GcsNodeManager::GetNode(
|
||||
|
||||
@@ -396,8 +396,8 @@ class GcsTableStorage {
|
||||
}
|
||||
|
||||
GcsInternalConfigTable &InternalConfigTable() {
|
||||
RAY_CHECK(internal_config_table_ != nullptr);
|
||||
return *internal_config_table_;
|
||||
RAY_CHECK(system_config_table_ != nullptr);
|
||||
return *system_config_table_;
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -418,7 +418,7 @@ class GcsTableStorage {
|
||||
std::unique_ptr<GcsHeartbeatBatchTable> heartbeat_batch_table_;
|
||||
std::unique_ptr<GcsProfileTable> profile_table_;
|
||||
std::unique_ptr<GcsWorkerTable> worker_table_;
|
||||
std::unique_ptr<GcsInternalConfigTable> internal_config_table_;
|
||||
std::unique_ptr<GcsInternalConfigTable> system_config_table_;
|
||||
};
|
||||
|
||||
/// \class RedisGcsTableStorage
|
||||
@@ -447,7 +447,7 @@ class RedisGcsTableStorage : public GcsTableStorage {
|
||||
heartbeat_batch_table_.reset(new GcsHeartbeatBatchTable(store_client_));
|
||||
profile_table_.reset(new GcsProfileTable(store_client_));
|
||||
worker_table_.reset(new GcsWorkerTable(store_client_));
|
||||
internal_config_table_.reset(new GcsInternalConfigTable(store_client_));
|
||||
system_config_table_.reset(new GcsInternalConfigTable(store_client_));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -475,7 +475,7 @@ class InMemoryGcsTableStorage : public GcsTableStorage {
|
||||
heartbeat_batch_table_.reset(new GcsHeartbeatBatchTable(store_client_));
|
||||
profile_table_.reset(new GcsProfileTable(store_client_));
|
||||
worker_table_.reset(new GcsWorkerTable(store_client_));
|
||||
internal_config_table_.reset(new GcsInternalConfigTable(store_client_));
|
||||
system_config_table_.reset(new GcsInternalConfigTable(store_client_));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -834,7 +834,9 @@ Status ActorCheckpointIdTable::AddCheckpointId(const JobID &job_id,
|
||||
std::make_shared<ActorCheckpointIdData>(data);
|
||||
copy->add_timestamps(absl::GetCurrentTimeNanos() / 1000000);
|
||||
copy->add_checkpoint_ids(checkpoint_id.Binary());
|
||||
auto num_to_keep = RayConfig::instance().num_actor_checkpoints_to_keep();
|
||||
// TODO(swang): This is a temporary value while we deprecate the actor
|
||||
// checkpoint table.
|
||||
auto num_to_keep = 20;
|
||||
while (copy->timestamps().size() > num_to_keep) {
|
||||
// Delete the checkpoint from actor checkpoint table.
|
||||
const auto &to_delete = ActorCheckpointID::FromBinary(copy->checkpoint_ids(0));
|
||||
|
||||
@@ -155,9 +155,9 @@ table RegisterClientReply {
|
||||
// Port that this worker should listen on.
|
||||
port: int;
|
||||
// Keys for internal config options.
|
||||
internal_config_keys: [string];
|
||||
system_config_keys: [string];
|
||||
// Values for internal config options corresponding to keys above.
|
||||
internal_config_values: [string];
|
||||
system_config_values: [string];
|
||||
}
|
||||
|
||||
table AnnounceWorkerPort {
|
||||
|
||||
@@ -114,7 +114,7 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
RAY_CHECK_OK(gcs_client->Connect(main_service));
|
||||
|
||||
// The internal_config is only set on the head node--other nodes get it from GCS.
|
||||
// The system_config is only set on the head node--other nodes get it from GCS.
|
||||
if (head_node) {
|
||||
// Parse the configuration list.
|
||||
std::istringstream config_string(config_list);
|
||||
@@ -202,7 +202,6 @@ int main(int argc, char *argv[]) {
|
||||
RayConfig::instance().fair_queueing_enabled();
|
||||
node_manager_config.object_pinning_enabled =
|
||||
RayConfig::instance().object_pinning_enabled();
|
||||
node_manager_config.max_lineage_size = RayConfig::instance().max_lineage_size();
|
||||
node_manager_config.store_socket_name = store_socket_name;
|
||||
node_manager_config.temp_dir = temp_dir;
|
||||
node_manager_config.session_dir = session_dir;
|
||||
|
||||
@@ -152,12 +152,9 @@ NodeManager::NodeManager(boost::asio::io_service &io_service,
|
||||
[this](const TaskID &task_id, const ObjectID &required_object_id) {
|
||||
HandleTaskReconstruction(task_id, required_object_id);
|
||||
},
|
||||
RayConfig::instance().initial_reconstruction_timeout_milliseconds(),
|
||||
self_node_id_, gcs_client_, object_directory_),
|
||||
task_dependency_manager_(
|
||||
object_manager, reconstruction_policy_, io_service, self_node_id_,
|
||||
RayConfig::instance().initial_reconstruction_timeout_milliseconds(),
|
||||
gcs_client_),
|
||||
RayConfig::instance().object_timeout_milliseconds(), self_node_id_, gcs_client_,
|
||||
object_directory_),
|
||||
task_dependency_manager_(object_manager, reconstruction_policy_),
|
||||
actor_registry_(),
|
||||
node_manager_server_("NodeManager", config.node_manager_port),
|
||||
node_manager_service_(io_service, *this),
|
||||
@@ -1262,16 +1259,16 @@ void NodeManager::ProcessRegisterClientRequestMessage(
|
||||
|
||||
auto send_reply_callback = [this, client](int assigned_port) {
|
||||
flatbuffers::FlatBufferBuilder fbb;
|
||||
std::vector<std::string> internal_config_keys;
|
||||
std::vector<std::string> internal_config_values;
|
||||
std::vector<std::string> system_config_keys;
|
||||
std::vector<std::string> system_config_values;
|
||||
for (auto kv : initial_config_.raylet_config) {
|
||||
internal_config_keys.push_back(kv.first);
|
||||
internal_config_values.push_back(kv.second);
|
||||
system_config_keys.push_back(kv.first);
|
||||
system_config_values.push_back(kv.second);
|
||||
}
|
||||
auto reply = ray::protocol::CreateRegisterClientReply(
|
||||
fbb, to_flatbuf(fbb, self_node_id_), assigned_port,
|
||||
string_vec_to_flatbuf(fbb, internal_config_keys),
|
||||
string_vec_to_flatbuf(fbb, internal_config_values));
|
||||
string_vec_to_flatbuf(fbb, system_config_keys),
|
||||
string_vec_to_flatbuf(fbb, system_config_values));
|
||||
fbb.Finish(reply);
|
||||
client->WriteMessageAsync(
|
||||
static_cast<int64_t>(protocol::MessageType::RegisterClientReply), fbb.GetSize(),
|
||||
@@ -2114,18 +2111,6 @@ void NodeManager::ScheduleTasks(
|
||||
RAY_CHECK(local_queues_.GetTasks(TaskState::PLACEABLE).size() == 0);
|
||||
}
|
||||
|
||||
bool NodeManager::CheckDependencyManagerInvariant() const {
|
||||
std::vector<TaskID> pending_task_ids = task_dependency_manager_.GetPendingTasks();
|
||||
// Assert that each pending task in the task dependency manager is in one of the queues.
|
||||
for (const auto &task_id : pending_task_ids) {
|
||||
if (!local_queues_.HasTask(task_id)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// TODO(atumanov): perform the check in the opposite direction.
|
||||
return true;
|
||||
}
|
||||
|
||||
void NodeManager::TreatTaskAsFailed(const Task &task, const ErrorType &error_type) {
|
||||
const TaskSpecification &spec = task.GetTaskSpecification();
|
||||
RAY_LOG(DEBUG) << "Treating task " << spec.TaskId() << " as failed because of error "
|
||||
|
||||
@@ -87,8 +87,6 @@ struct NodeManagerConfig {
|
||||
bool fair_queueing_enabled;
|
||||
/// Whether to enable pinning for plasma objects.
|
||||
bool object_pinning_enabled;
|
||||
/// the maximum lineage size.
|
||||
uint64_t max_lineage_size;
|
||||
/// The store socket name.
|
||||
std::string store_socket_name;
|
||||
/// The path to the ray temp dir.
|
||||
@@ -438,13 +436,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler {
|
||||
/// \return Void.
|
||||
void HandleJobFinished(const JobID &job_id, const JobTableData &job_data);
|
||||
|
||||
/// Check if certain invariants associated with the task dependency manager
|
||||
/// and the local queues are satisfied. This is only used for debugging
|
||||
/// purposes.
|
||||
///
|
||||
/// \return True if the invariants are satisfied and false otherwise.
|
||||
bool CheckDependencyManagerInvariant() const;
|
||||
|
||||
/// Process client message of SubmitTask
|
||||
///
|
||||
/// \param message_data A pointer to the message data.
|
||||
|
||||
@@ -23,15 +23,8 @@ namespace raylet {
|
||||
|
||||
TaskDependencyManager::TaskDependencyManager(
|
||||
ObjectManagerInterface &object_manager,
|
||||
ReconstructionPolicyInterface &reconstruction_policy,
|
||||
boost::asio::io_service &io_service, const ClientID &client_id,
|
||||
int64_t initial_lease_period_ms, std::shared_ptr<gcs::GcsClient> gcs_client)
|
||||
: object_manager_(object_manager),
|
||||
reconstruction_policy_(reconstruction_policy),
|
||||
io_service_(io_service),
|
||||
client_id_(client_id),
|
||||
initial_lease_period_ms_(initial_lease_period_ms),
|
||||
gcs_client_(gcs_client) {}
|
||||
ReconstructionPolicyInterface &reconstruction_policy)
|
||||
: object_manager_(object_manager), reconstruction_policy_(reconstruction_policy) {}
|
||||
|
||||
bool TaskDependencyManager::CheckObjectLocal(const ObjectID &object_id) const {
|
||||
return local_objects_.count(object_id) == 1;
|
||||
@@ -334,15 +327,6 @@ void TaskDependencyManager::UnsubscribeWaitDependencies(const WorkerID &worker_i
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TaskID> TaskDependencyManager::GetPendingTasks() const {
|
||||
std::vector<TaskID> keys;
|
||||
keys.reserve(pending_tasks_.size());
|
||||
for (const auto &id_task_pair : pending_tasks_) {
|
||||
keys.push_back(id_task_pair.first);
|
||||
}
|
||||
return keys;
|
||||
}
|
||||
|
||||
void TaskDependencyManager::TaskPending(const Task &task) {
|
||||
// Direct tasks are not tracked by the raylet.
|
||||
// NOTE(zhijunfu): Direct tasks are not tracked by the raylet,
|
||||
@@ -380,8 +364,7 @@ void TaskDependencyManager::TaskPending(const Task &task) {
|
||||
RAY_LOG(DEBUG) << "Task execution " << task_id << " pending";
|
||||
|
||||
// Record that the task is pending execution.
|
||||
auto inserted =
|
||||
pending_tasks_.emplace(task_id, PendingTask(initial_lease_period_ms_, io_service_));
|
||||
auto inserted = pending_tasks_.insert(task_id);
|
||||
if (inserted.second) {
|
||||
// This is the first time we've heard that this task is pending. Find any
|
||||
// subscribed tasks that are dependent on objects created by the pending
|
||||
@@ -395,50 +378,9 @@ void TaskDependencyManager::TaskPending(const Task &task) {
|
||||
HandleRemoteDependencyCanceled(object_entry.first);
|
||||
}
|
||||
}
|
||||
|
||||
// Acquire the lease for the task's execution in the global lease table.
|
||||
AcquireTaskLease(task_id);
|
||||
}
|
||||
}
|
||||
|
||||
void TaskDependencyManager::AcquireTaskLease(const TaskID &task_id) {
|
||||
auto it = pending_tasks_.find(task_id);
|
||||
int64_t now_ms = current_time_ms();
|
||||
if (it == pending_tasks_.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check that we were able to renew the task lease before the previous one
|
||||
// expired.
|
||||
if (now_ms > it->second.expires_at) {
|
||||
RAY_LOG(WARNING) << "Task " << task_id << " lease to renew has already expired by "
|
||||
<< (it->second.expires_at - now_ms) << "ms";
|
||||
}
|
||||
|
||||
auto task_lease_data = std::make_shared<TaskLeaseData>();
|
||||
task_lease_data->set_task_id(task_id.Binary());
|
||||
task_lease_data->set_node_manager_id(client_id_.Binary());
|
||||
task_lease_data->set_acquired_at(absl::GetCurrentTimeNanos() / 1000000);
|
||||
task_lease_data->set_timeout(it->second.lease_period);
|
||||
RAY_CHECK_OK(gcs_client_->Tasks().AsyncAddTaskLease(task_lease_data, nullptr));
|
||||
|
||||
auto period = boost::posix_time::milliseconds(it->second.lease_period / 2);
|
||||
it->second.lease_timer->expires_from_now(period);
|
||||
it->second.lease_timer->async_wait(
|
||||
[this, task_id](const boost::system::error_code &error) {
|
||||
if (!error) {
|
||||
AcquireTaskLease(task_id);
|
||||
} else {
|
||||
// Check that the error was due to the timer being canceled.
|
||||
RAY_CHECK(error == boost::asio::error::operation_aborted);
|
||||
}
|
||||
});
|
||||
|
||||
it->second.expires_at = now_ms + it->second.lease_period;
|
||||
it->second.lease_period = std::min(it->second.lease_period * 2,
|
||||
RayConfig::instance().max_task_lease_timeout_ms());
|
||||
}
|
||||
|
||||
void TaskDependencyManager::TaskCanceled(const TaskID &task_id) {
|
||||
RAY_LOG(DEBUG) << "Task execution " << task_id << " canceled";
|
||||
// Record that the task is no longer pending execution.
|
||||
|
||||
@@ -44,10 +44,7 @@ class TaskDependencyManager {
|
||||
public:
|
||||
/// Create a task dependency manager.
|
||||
TaskDependencyManager(ObjectManagerInterface &object_manager,
|
||||
ReconstructionPolicyInterface &reconstruction_policy,
|
||||
boost::asio::io_service &io_service, const ClientID &client_id,
|
||||
int64_t initial_lease_period_ms,
|
||||
std::shared_ptr<gcs::GcsClient> gcs_client);
|
||||
ReconstructionPolicyInterface &reconstruction_policy);
|
||||
|
||||
/// Check whether an object is locally available.
|
||||
///
|
||||
@@ -142,12 +139,6 @@ class TaskDependencyManager {
|
||||
/// this object dependency.
|
||||
std::vector<TaskID> HandleObjectMissing(const ray::ObjectID &object_id);
|
||||
|
||||
/// Get a list of all Tasks currently marked as pending object dependencies in the task
|
||||
/// dependency manager.
|
||||
///
|
||||
/// \return Return a vector of TaskIDs for tasks registered as pending.
|
||||
std::vector<TaskID> GetPendingTasks() const;
|
||||
|
||||
/// Remove all of the tasks specified. These tasks will no longer be
|
||||
/// considered pending and the objects they depend on will no longer be
|
||||
/// required.
|
||||
@@ -208,21 +199,6 @@ class TaskDependencyManager {
|
||||
/// will be automatically removed from this set once it becomes local.
|
||||
using WorkerDependencies = std::unordered_set<ObjectID>;
|
||||
|
||||
struct PendingTask {
|
||||
PendingTask(int64_t initial_lease_period_ms, boost::asio::io_service &io_service)
|
||||
: lease_period(initial_lease_period_ms),
|
||||
expires_at(INT64_MAX),
|
||||
lease_timer(new boost::asio::deadline_timer(io_service)) {}
|
||||
|
||||
/// The timeout within which the lease should be renewed.
|
||||
int64_t lease_period;
|
||||
/// The time at which the current lease will expire, according to this
|
||||
/// node's steady clock.
|
||||
int64_t expires_at;
|
||||
/// A timer used to determine when to next renew the lease.
|
||||
std::unique_ptr<boost::asio::deadline_timer> lease_timer;
|
||||
};
|
||||
|
||||
/// Check whether the given object needs to be made available through object
|
||||
/// transfer or reconstruction. These are objects for which: (1) there is a
|
||||
/// subscribed task dependent on it, (2) the object is not local, and (3) the
|
||||
@@ -235,29 +211,12 @@ class TaskDependencyManager {
|
||||
/// operations to make the object available through object transfer or
|
||||
/// reconstruction.
|
||||
void HandleRemoteDependencyCanceled(const ObjectID &object_id);
|
||||
/// Acquire the task lease in the GCS for the given task. This is used to
|
||||
/// indicate to other nodes that the task is currently pending on this node.
|
||||
/// The task lease has an expiration time. If we do not renew the lease
|
||||
/// before that time, then other nodes may choose to execute the task.
|
||||
void AcquireTaskLease(const TaskID &task_id);
|
||||
|
||||
/// The object manager, used to fetch required objects from remote nodes.
|
||||
ObjectManagerInterface &object_manager_;
|
||||
/// The reconstruction policy, used to reconstruct required objects that no
|
||||
/// longer exist on any live nodes.
|
||||
ReconstructionPolicyInterface &reconstruction_policy_;
|
||||
/// The event loop, used to set timers for renewing task leases. The task
|
||||
/// leases are used to indicate which tasks are pending execution on this
|
||||
/// node and must be periodically renewed.
|
||||
boost::asio::io_service &io_service_;
|
||||
/// This node's GCS client ID, used in the task lease information.
|
||||
const ClientID client_id_;
|
||||
/// For a given task, the expiration period of the initial task lease that is
|
||||
/// added to the GCS. The lease expiration period is doubled every time the
|
||||
/// lease is renewed.
|
||||
const int64_t initial_lease_period_ms_;
|
||||
/// A client connection to the GCS.
|
||||
std::shared_ptr<gcs::GcsClient> gcs_client_;
|
||||
/// A mapping from task ID of each subscribed task to its list of object
|
||||
/// dependencies, either task arguments or objects passed into `ray.get`.
|
||||
std::unordered_map<ray::TaskID, TaskDependencies> task_dependencies_;
|
||||
@@ -277,7 +236,7 @@ class TaskDependencyManager {
|
||||
std::unordered_set<ray::ObjectID> local_objects_;
|
||||
/// The set of tasks that are pending execution. Any objects created by these
|
||||
/// tasks that are not already local are pending creation.
|
||||
std::unordered_map<ray::TaskID, PendingTask> pending_tasks_;
|
||||
std::unordered_set<ray::TaskID> pending_tasks_;
|
||||
};
|
||||
|
||||
} // namespace raylet
|
||||
|
||||
@@ -48,60 +48,16 @@ class MockReconstructionPolicy : public ReconstructionPolicyInterface {
|
||||
MOCK_METHOD1(Cancel, void(const ObjectID &object_id));
|
||||
};
|
||||
|
||||
class MockTaskInfoAccessor : public gcs::RedisTaskInfoAccessor {
|
||||
public:
|
||||
MockTaskInfoAccessor(gcs::RedisGcsClient *client)
|
||||
: gcs::RedisTaskInfoAccessor(client) {}
|
||||
|
||||
MOCK_METHOD2(AsyncAddTaskLease,
|
||||
ray::Status(const std::shared_ptr<TaskLeaseData> &data_ptr,
|
||||
const gcs::StatusCallback &callback));
|
||||
};
|
||||
|
||||
class MockGcsClient : public gcs::RedisGcsClient {
|
||||
public:
|
||||
MockGcsClient(const gcs::GcsClientOptions &options) : gcs::RedisGcsClient(options) {}
|
||||
|
||||
void Init(MockTaskInfoAccessor *task_accessor_mock) {
|
||||
task_accessor_.reset(task_accessor_mock);
|
||||
}
|
||||
};
|
||||
|
||||
class TaskDependencyManagerTest : public ::testing::Test {
|
||||
public:
|
||||
TaskDependencyManagerTest()
|
||||
: object_manager_mock_(),
|
||||
reconstruction_policy_mock_(),
|
||||
io_service_(),
|
||||
options_("", 1, ""),
|
||||
gcs_client_mock_(new MockGcsClient(options_)),
|
||||
task_accessor_mock_(new MockTaskInfoAccessor(gcs_client_mock_.get())),
|
||||
initial_lease_period_ms_(100),
|
||||
task_dependency_manager_(object_manager_mock_, reconstruction_policy_mock_,
|
||||
io_service_, ClientID::Nil(), initial_lease_period_ms_,
|
||||
gcs_client_mock_) {
|
||||
gcs_client_mock_->Init(task_accessor_mock_);
|
||||
}
|
||||
|
||||
void Run(uint64_t timeout_ms) {
|
||||
auto timer_period = boost::posix_time::milliseconds(timeout_ms);
|
||||
auto timer = std::make_shared<boost::asio::deadline_timer>(io_service_, timer_period);
|
||||
timer->async_wait([this](const boost::system::error_code &error) {
|
||||
ASSERT_FALSE(error);
|
||||
io_service_.stop();
|
||||
});
|
||||
io_service_.run();
|
||||
io_service_.reset();
|
||||
}
|
||||
task_dependency_manager_(object_manager_mock_, reconstruction_policy_mock_) {}
|
||||
|
||||
protected:
|
||||
MockObjectManager object_manager_mock_;
|
||||
MockReconstructionPolicy reconstruction_policy_mock_;
|
||||
boost::asio::io_service io_service_;
|
||||
gcs::GcsClientOptions options_;
|
||||
std::shared_ptr<MockGcsClient> gcs_client_mock_;
|
||||
MockTaskInfoAccessor *task_accessor_mock_;
|
||||
int64_t initial_lease_period_ms_;
|
||||
TaskDependencyManager task_dependency_manager_;
|
||||
};
|
||||
|
||||
@@ -270,9 +226,7 @@ TEST_F(TaskDependencyManagerTest, TestTaskChain) {
|
||||
ASSERT_FALSE(ready);
|
||||
}
|
||||
|
||||
// Mark each task as pending. A lease entry should be added to the GCS for
|
||||
// each task.
|
||||
EXPECT_CALL(*task_accessor_mock_, AsyncAddTaskLease(_, _));
|
||||
// Mark each task as pending.
|
||||
task_dependency_manager_.TaskPending(task);
|
||||
|
||||
i++;
|
||||
@@ -323,7 +277,6 @@ TEST_F(TaskDependencyManagerTest, TestDependentPut) {
|
||||
// it is pending execution.
|
||||
EXPECT_CALL(object_manager_mock_, CancelPull(put_id));
|
||||
EXPECT_CALL(reconstruction_policy_mock_, Cancel(put_id));
|
||||
EXPECT_CALL(*task_accessor_mock_, AsyncAddTaskLease(_, _));
|
||||
task_dependency_manager_.TaskPending(task1);
|
||||
}
|
||||
|
||||
@@ -336,7 +289,6 @@ TEST_F(TaskDependencyManagerTest, TestTaskForwarding) {
|
||||
const auto &arguments = task.GetDependencies();
|
||||
static_cast<void>(task_dependency_manager_.SubscribeGetDependencies(
|
||||
task.GetTaskSpecification().TaskId(), arguments));
|
||||
EXPECT_CALL(*task_accessor_mock_, AsyncAddTaskLease(_, _));
|
||||
task_dependency_manager_.TaskPending(task);
|
||||
}
|
||||
|
||||
@@ -437,31 +389,6 @@ TEST_F(TaskDependencyManagerTest, TestEviction) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TaskDependencyManagerTest, TestTaskLeaseRenewal) {
|
||||
// Mark a task as pending.
|
||||
auto task = ExampleTask({}, 0);
|
||||
// We expect an initial call to acquire the lease.
|
||||
EXPECT_CALL(*task_accessor_mock_, AsyncAddTaskLease(_, _));
|
||||
|
||||
task_dependency_manager_.TaskPending(task);
|
||||
|
||||
// Check that while the task is still pending, there is one call to renew the
|
||||
// lease for each lease period that passes. The lease period doubles with
|
||||
// each renewal.
|
||||
int num_expected_calls = 4;
|
||||
int64_t sleep_time = 0;
|
||||
for (int i = 1; i <= num_expected_calls; i++) {
|
||||
sleep_time += i * initial_lease_period_ms_;
|
||||
}
|
||||
// When sleep_time = 10 * initial_lease_period_ms_, test case fails, because the
|
||||
// AsyncAddTaskLease function is expected to be called four times, but only three times.
|
||||
// It's hard to determine the sleep_time value, so let's double it for now.
|
||||
sleep_time = sleep_time * 2;
|
||||
EXPECT_CALL(*task_accessor_mock_, AsyncAddTaskLease(_, _))
|
||||
.Times(testing::AtLeast(num_expected_calls));
|
||||
Run(sleep_time);
|
||||
}
|
||||
|
||||
TEST_F(TaskDependencyManagerTest, TestRemoveTasksAndRelatedObjects) {
|
||||
// Create 3 tasks, each dependent on the previous. The first task has no
|
||||
// arguments.
|
||||
@@ -478,9 +405,7 @@ TEST_F(TaskDependencyManagerTest, TestRemoveTasksAndRelatedObjects) {
|
||||
const auto &arguments = task.GetDependencies();
|
||||
task_dependency_manager_.SubscribeGetDependencies(
|
||||
task.GetTaskSpecification().TaskId(), arguments);
|
||||
// Mark each task as pending. A lease entry should be added to the GCS for
|
||||
// each task.
|
||||
EXPECT_CALL(*task_accessor_mock_, AsyncAddTaskLease(_, _));
|
||||
// Mark each task as pending.
|
||||
task_dependency_manager_.TaskPending(task);
|
||||
}
|
||||
|
||||
|
||||
@@ -83,7 +83,7 @@ raylet::RayletClient::RayletClient(
|
||||
const std::string &raylet_socket, const WorkerID &worker_id,
|
||||
rpc::WorkerType worker_type, const JobID &job_id, const Language &language,
|
||||
const std::string &ip_address, ClientID *raylet_id, int *port,
|
||||
std::unordered_map<std::string, std::string> *internal_config,
|
||||
std::unordered_map<std::string, std::string> *system_config,
|
||||
const std::string &job_config)
|
||||
: grpc_client_(std::move(grpc_client)),
|
||||
worker_id_(worker_id),
|
||||
@@ -110,12 +110,12 @@ raylet::RayletClient::RayletClient(
|
||||
*raylet_id = ClientID::FromBinary(reply_message->raylet_id()->str());
|
||||
*port = reply_message->port();
|
||||
|
||||
RAY_CHECK(internal_config);
|
||||
auto keys = reply_message->internal_config_keys();
|
||||
auto values = reply_message->internal_config_values();
|
||||
RAY_CHECK(system_config);
|
||||
auto keys = reply_message->system_config_keys();
|
||||
auto values = reply_message->system_config_values();
|
||||
RAY_CHECK(keys->size() == values->size());
|
||||
for (size_t i = 0; i < keys->size(); i++) {
|
||||
internal_config->emplace(keys->Get(i)->str(), values->Get(i)->str());
|
||||
system_config->emplace(keys->Get(i)->str(), values->Get(i)->str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -169,7 +169,7 @@ class RayletClient : public PinObjectsInterface,
|
||||
/// \param language Language of the worker.
|
||||
/// \param ip_address The IP address of the worker.
|
||||
/// \param raylet_id This will be populated with the local raylet's ClientID.
|
||||
/// \param internal_config This will be populated with internal config parameters
|
||||
/// \param system_config This will be populated with internal config parameters
|
||||
/// provided by the raylet.
|
||||
/// \param port The port that the worker should listen on for gRPC requests. If
|
||||
/// 0, the worker should choose a random port.
|
||||
@@ -178,7 +178,7 @@ class RayletClient : public PinObjectsInterface,
|
||||
const std::string &raylet_socket, const WorkerID &worker_id,
|
||||
rpc::WorkerType worker_type, const JobID &job_id, const Language &language,
|
||||
const std::string &ip_address, ClientID *raylet_id, int *port,
|
||||
std::unordered_map<std::string, std::string> *internal_config,
|
||||
std::unordered_map<std::string, std::string> *system_config,
|
||||
const std::string &job_config);
|
||||
|
||||
/// Connect to the raylet via grpc only.
|
||||
|
||||
+1
-1
@@ -69,7 +69,7 @@ class ClusterStarter {
|
||||
"--load-code-from-local",
|
||||
"--include-java",
|
||||
"--java-worker-options=" + workerOptions,
|
||||
"--internal-config=" + new Gson().toJson(config)
|
||||
"--system-config=" + new Gson().toJson(config)
|
||||
);
|
||||
if (!executeCommand(startCommand, 10)) {
|
||||
throw new RuntimeException("Couldn't start ray cluster.");
|
||||
|
||||
@@ -35,9 +35,7 @@ def test_hybrid_stream():
|
||||
load_code_from_local=True,
|
||||
include_java=True,
|
||||
java_worker_options=java_worker_options,
|
||||
_internal_config=json.dumps({
|
||||
"num_workers_per_process_java": 1
|
||||
}))
|
||||
_system_config={"num_workers_per_process_java": 1})
|
||||
|
||||
sink_file = "/tmp/ray_streaming_test_hybrid_stream.txt"
|
||||
if os.path.exists(sink_file):
|
||||
|
||||
Reference in New Issue
Block a user