mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 00:44:50 +08:00
[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)
* remove * internal config updates, remove Checkpointable * Lower object timeout default * remove json * Fix flaky test * Fix unit test
This commit is contained in:
@@ -2,8 +2,6 @@ import inspect
|
||||
import logging
|
||||
import weakref
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from collections import namedtuple
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray._raylet
|
||||
import ray.signature as signature
|
||||
@@ -854,11 +852,6 @@ def modify_class(cls):
|
||||
"classes. In Python 2, you must declare the class with "
|
||||
"'class ClassName(object):' instead of 'class ClassName:'.")
|
||||
|
||||
if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
|
||||
raise TypeError(
|
||||
"A checkpointable actor class should implement all abstract "
|
||||
"methods in the `Checkpointable` interface.")
|
||||
|
||||
# Modify the class to have an additional method that will be used for
|
||||
# terminating the worker.
|
||||
class Class(cls):
|
||||
@@ -869,20 +862,6 @@ def modify_class(cls):
|
||||
if worker.mode != ray.LOCAL_MODE:
|
||||
ray.actor.exit_actor()
|
||||
|
||||
def __ray_checkpoint__(self):
|
||||
"""Save a checkpoint.
|
||||
|
||||
This task saves the current state of the actor, the current task
|
||||
frontier according to the raylet, and the checkpoint index
|
||||
(number of tasks executed so far).
|
||||
"""
|
||||
worker = ray.worker.global_worker
|
||||
if not isinstance(self, ray.actor.Checkpointable):
|
||||
raise TypeError(
|
||||
"__ray_checkpoint__.remote() may only be called on actors "
|
||||
"that implement ray.actor.Checkpointable")
|
||||
return worker._save_actor_checkpoint()
|
||||
|
||||
Class.__module__ = cls.__module__
|
||||
Class.__name__ = cls.__name__
|
||||
|
||||
@@ -951,128 +930,3 @@ def exit_actor():
|
||||
assert False, "This process should have terminated."
|
||||
else:
|
||||
raise TypeError("exit_actor called on a non-actor worker.")
|
||||
|
||||
|
||||
CheckpointContext = namedtuple(
|
||||
"CheckpointContext",
|
||||
[
|
||||
# Actor's ID.
|
||||
"actor_id",
|
||||
# Number of tasks executed since last checkpoint.
|
||||
"num_tasks_since_last_checkpoint",
|
||||
# Time elapsed since last checkpoint, in milliseconds.
|
||||
"time_elapsed_ms_since_last_checkpoint",
|
||||
],
|
||||
)
|
||||
"""A namedtuple that contains information about actor's last checkpoint."""
|
||||
|
||||
Checkpoint = namedtuple(
|
||||
"Checkpoint",
|
||||
[
|
||||
# ID of this checkpoint.
|
||||
"checkpoint_id",
|
||||
# The timestamp at which this checkpoint was saved,
|
||||
# represented as milliseconds elapsed since Unix epoch.
|
||||
"timestamp",
|
||||
],
|
||||
)
|
||||
"""A namedtuple that represents a checkpoint."""
|
||||
|
||||
|
||||
class Checkpointable(metaclass=ABCMeta):
|
||||
"""An interface that indicates an actor can be checkpointed."""
|
||||
|
||||
@abstractmethod
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
"""Whether this actor needs to be checkpointed.
|
||||
|
||||
This method will be called after every task. You should implement this
|
||||
callback to decide whether this actor needs to be checkpointed at this
|
||||
time, based on the checkpoint context, or any other factors.
|
||||
|
||||
Args:
|
||||
checkpoint_context: A namedtuple that contains info about last
|
||||
checkpoint.
|
||||
|
||||
Returns:
|
||||
A boolean value that indicates whether this actor needs to be
|
||||
checkpointed.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
"""Save a checkpoint to persistent storage.
|
||||
|
||||
If `should_checkpoint` returns true, this method will be called. You
|
||||
should implement this callback to save actor's checkpoint and the given
|
||||
checkpoint id to persistent storage.
|
||||
|
||||
Args:
|
||||
actor_id: Actor's ID.
|
||||
checkpoint_id: ID of this checkpoint. You should save it together
|
||||
with actor's checkpoint data. And it will be used by the
|
||||
`load_checkpoint` method.
|
||||
Returns:
|
||||
None.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
"""Load actor's previous checkpoint, and restore actor's state.
|
||||
|
||||
This method will be called when an actor is restarted, after
|
||||
actor's constructor.
|
||||
If the actor needs to restore from previous checkpoint, this function
|
||||
should restore actor's state and return the checkpoint ID. Otherwise,
|
||||
it should do nothing and return None.
|
||||
Note, this method must return one of the checkpoint IDs in the
|
||||
`available_checkpoints` list, or None. Otherwise, an exception will be
|
||||
raised.
|
||||
|
||||
Args:
|
||||
actor_id: Actor's ID.
|
||||
available_checkpoints: A list of `Checkpoint` namedtuples that
|
||||
contains all available checkpoint IDs and their timestamps,
|
||||
sorted by timestamp in descending order.
|
||||
Returns:
|
||||
The ID of the checkpoint from which the actor was resumed, or None
|
||||
if the actor should restart from the beginning.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
"""Delete an expired checkpoint.
|
||||
|
||||
This method will be called when an checkpoint is expired. You should
|
||||
implement this method to delete your application checkpoint data.
|
||||
Note, the maximum number of checkpoints kept in the backend can be
|
||||
configured at `RayConfig.num_actor_checkpoints_to_keep`.
|
||||
|
||||
Args:
|
||||
actor_id: ID of the actor.
|
||||
checkpoint_id: ID of the checkpoint that has expired.
|
||||
Returns:
|
||||
None.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def get_checkpoints_for_actor(actor_id):
|
||||
"""Get the available checkpoints for the given actor ID, return a list
|
||||
sorted by checkpoint timestamp in descending order.
|
||||
"""
|
||||
checkpoint_info = ray.state.state.actor_checkpoint_info(actor_id)
|
||||
if checkpoint_info is None:
|
||||
return []
|
||||
checkpoints = [
|
||||
Checkpoint(checkpoint_id, timestamp) for checkpoint_id, timestamp in
|
||||
zip(checkpoint_info["CheckpointIds"], checkpoint_info["Timestamps"])
|
||||
]
|
||||
return sorted(
|
||||
checkpoints,
|
||||
key=lambda checkpoint: checkpoint.timestamp,
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
|
||||
@@ -80,9 +79,6 @@ class Cluster:
|
||||
"min_worker_port": 0,
|
||||
"max_worker_port": 0,
|
||||
}
|
||||
if "_internal_config" in node_args:
|
||||
node_args["_internal_config"] = json.loads(
|
||||
node_args["_internal_config"])
|
||||
ray_params = ray.parameter.RayParams(**node_args)
|
||||
ray_params.update_if_absent(**default_kwargs)
|
||||
if self.head_node is None:
|
||||
|
||||
@@ -544,43 +544,13 @@ class FunctionActorManager:
|
||||
"""
|
||||
|
||||
def actor_method_executor(actor, *args, **kwargs):
|
||||
# Update the actor's task counter to reflect the task we're about
|
||||
# to execute.
|
||||
self._worker.actor_task_counter += 1
|
||||
|
||||
# Execute the assigned method and save a checkpoint if necessary.
|
||||
try:
|
||||
is_bound = (is_class_method(method)
|
||||
or is_static_method(type(actor), method_name))
|
||||
if is_bound:
|
||||
method_returns = method(*args, **kwargs)
|
||||
else:
|
||||
method_returns = method(actor, *args, **kwargs)
|
||||
except Exception as e:
|
||||
# Save the checkpoint before allowing the method exception
|
||||
# to be thrown, but don't save the checkpoint for actor
|
||||
# creation task.
|
||||
if (isinstance(actor, ray.actor.Checkpointable)
|
||||
and self._worker.actor_task_counter != 1):
|
||||
self._save_and_log_checkpoint(actor)
|
||||
raise e
|
||||
# Execute the assigned method.
|
||||
is_bound = (is_class_method(method)
|
||||
or is_static_method(type(actor), method_name))
|
||||
if is_bound:
|
||||
return method(*args, **kwargs)
|
||||
else:
|
||||
# Handle any checkpointing operations before storing the
|
||||
# method's return values.
|
||||
# NOTE(swang): If method_returns is a pointer to the actor's
|
||||
# state and the checkpointing operations can modify the return
|
||||
# values if they mutate the actor's state. Is this okay?
|
||||
if isinstance(actor, ray.actor.Checkpointable):
|
||||
# If this is the first task to execute on the actor, try to
|
||||
# resume from a checkpoint.
|
||||
if self._worker.actor_task_counter == 1:
|
||||
if actor_imported:
|
||||
self._restore_and_log_checkpoint(actor)
|
||||
else:
|
||||
# Save the checkpoint before returning the method's
|
||||
# return values.
|
||||
self._save_and_log_checkpoint(actor)
|
||||
return method_returns
|
||||
return method(actor, *args, **kwargs)
|
||||
|
||||
# Set method_name and method as attributes to the executor closure
|
||||
# so we can make decision based on these attributes in task executor.
|
||||
@@ -591,86 +561,3 @@ class FunctionActorManager:
|
||||
actor_method_executor.method = method
|
||||
|
||||
return actor_method_executor
|
||||
|
||||
def _save_and_log_checkpoint(self, actor):
|
||||
"""Save an actor checkpoint if necessary and log any errors.
|
||||
|
||||
Args:
|
||||
actor: The actor to checkpoint.
|
||||
|
||||
Returns:
|
||||
The result of the actor's user-defined `save_checkpoint` method.
|
||||
"""
|
||||
actor_id = self._worker.actor_id
|
||||
checkpoint_info = self._worker.actor_checkpoint_info[actor_id]
|
||||
checkpoint_info.num_tasks_since_last_checkpoint += 1
|
||||
now = int(1000 * time.time())
|
||||
checkpoint_context = ray.actor.CheckpointContext(
|
||||
actor_id, checkpoint_info.num_tasks_since_last_checkpoint,
|
||||
now - checkpoint_info.last_checkpoint_timestamp)
|
||||
# If we should take a checkpoint, notify raylet to prepare a checkpoint
|
||||
# and then call `save_checkpoint`.
|
||||
if actor.should_checkpoint(checkpoint_context):
|
||||
try:
|
||||
now = int(1000 * time.time())
|
||||
checkpoint_id = (
|
||||
self._worker.core_worker.prepare_actor_checkpoint(actor_id)
|
||||
)
|
||||
checkpoint_info.checkpoint_ids.append(checkpoint_id)
|
||||
actor.save_checkpoint(actor_id, checkpoint_id)
|
||||
if (len(checkpoint_info.checkpoint_ids) >
|
||||
ray._config.num_actor_checkpoints_to_keep()):
|
||||
actor.checkpoint_expired(
|
||||
actor_id,
|
||||
checkpoint_info.checkpoint_ids.pop(0),
|
||||
)
|
||||
checkpoint_info.num_tasks_since_last_checkpoint = 0
|
||||
checkpoint_info.last_checkpoint_timestamp = now
|
||||
except Exception:
|
||||
# Checkpoint save or reload failed. Notify the driver.
|
||||
traceback_str = ray.utils.format_error_message(
|
||||
traceback.format_exc())
|
||||
ray.utils.push_error_to_driver(
|
||||
self._worker,
|
||||
ray_constants.CHECKPOINT_PUSH_ERROR,
|
||||
traceback_str,
|
||||
job_id=self._worker.current_job_id)
|
||||
|
||||
def _restore_and_log_checkpoint(self, actor):
|
||||
"""Restore an actor from a checkpoint if available and log any errors.
|
||||
|
||||
This should only be called on workers that have just executed an actor
|
||||
creation task.
|
||||
|
||||
Args:
|
||||
actor: The actor to restore from a checkpoint.
|
||||
"""
|
||||
actor_id = self._worker.actor_id
|
||||
try:
|
||||
checkpoints = ray.actor.get_checkpoints_for_actor(actor_id)
|
||||
if len(checkpoints) > 0:
|
||||
# If we found previously saved checkpoints for this actor,
|
||||
# call the `load_checkpoint` callback.
|
||||
checkpoint_id = actor.load_checkpoint(actor_id, checkpoints)
|
||||
if checkpoint_id is not None:
|
||||
# Check that the returned checkpoint id is in the
|
||||
# `available_checkpoints` list.
|
||||
msg = (
|
||||
"`load_checkpoint` must return a checkpoint id that " +
|
||||
"exists in the `available_checkpoints` list, or None.")
|
||||
assert any(checkpoint_id == checkpoint.checkpoint_id
|
||||
for checkpoint in checkpoints), msg
|
||||
# Notify raylet that this actor has been resumed from
|
||||
# a checkpoint.
|
||||
(self._worker.core_worker.
|
||||
notify_actor_resumed_from_checkpoint(
|
||||
actor_id, checkpoint_id))
|
||||
except Exception:
|
||||
# Checkpoint save or reload failed. Notify the driver.
|
||||
traceback_str = ray.utils.format_error_message(
|
||||
traceback.format_exc())
|
||||
ray.utils.push_error_to_driver(
|
||||
self._worker,
|
||||
ray_constants.CHECKPOINT_PUSH_ERROR,
|
||||
traceback_str,
|
||||
job_id=self._worker.current_job_id)
|
||||
|
||||
@@ -21,52 +21,28 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
uint64_t num_heartbeats_warning() const
|
||||
|
||||
int64_t initial_reconstruction_timeout_milliseconds() const
|
||||
int64_t object_timeout_milliseconds() const
|
||||
|
||||
int64_t get_timeout_milliseconds() const
|
||||
|
||||
uint64_t max_lineage_size() const
|
||||
|
||||
int64_t worker_get_request_size() const
|
||||
|
||||
int64_t worker_fetch_request_size() const
|
||||
|
||||
int64_t actor_max_dummy_objects() const
|
||||
|
||||
int64_t raylet_client_num_connect_attempts() const
|
||||
|
||||
int64_t raylet_client_connect_timeout_milliseconds() const
|
||||
|
||||
int64_t raylet_fetch_timeout_milliseconds() const
|
||||
|
||||
int64_t raylet_reconstruction_timeout_milliseconds() const
|
||||
|
||||
int64_t max_num_to_reconstruct() const
|
||||
|
||||
int64_t raylet_fetch_request_size() const
|
||||
|
||||
int64_t kill_worker_timeout_milliseconds() const
|
||||
|
||||
int64_t worker_register_timeout_seconds() const
|
||||
|
||||
int64_t max_time_for_handler_milliseconds() const
|
||||
|
||||
int64_t max_time_for_loop() const
|
||||
|
||||
int64_t redis_db_connect_retries()
|
||||
|
||||
int64_t redis_db_connect_wait_milliseconds() const
|
||||
|
||||
int64_t plasma_default_release_delay() const
|
||||
|
||||
int64_t L3_cache_size_bytes() const
|
||||
|
||||
int64_t max_tasks_to_spillback() const
|
||||
|
||||
int64_t actor_creation_num_spillbacks_warning() const
|
||||
|
||||
int node_manager_forward_task_retry_timeout_milliseconds() const
|
||||
|
||||
int object_manager_pull_timeout_ms() const
|
||||
|
||||
int object_manager_push_timeout_ms() const
|
||||
@@ -79,10 +55,6 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
int num_workers_per_process_java() const
|
||||
|
||||
int64_t max_task_lease_timeout_ms() const
|
||||
|
||||
uint32_t num_actor_checkpoints_to_keep() const
|
||||
|
||||
uint32_t maximum_gcs_deletion_batch_size() const
|
||||
|
||||
int64_t max_direct_call_object_size() const
|
||||
|
||||
@@ -26,18 +26,14 @@ cdef class Config:
|
||||
return RayConfig.instance().num_heartbeats_warning()
|
||||
|
||||
@staticmethod
|
||||
def initial_reconstruction_timeout_milliseconds():
|
||||
def object_timeout_milliseconds():
|
||||
return (RayConfig.instance()
|
||||
.initial_reconstruction_timeout_milliseconds())
|
||||
.object_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def get_timeout_milliseconds():
|
||||
return RayConfig.instance().get_timeout_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def max_lineage_size():
|
||||
return RayConfig.instance().max_lineage_size()
|
||||
|
||||
@staticmethod
|
||||
def worker_get_request_size():
|
||||
return RayConfig.instance().worker_get_request_size()
|
||||
@@ -46,10 +42,6 @@ cdef class Config:
|
||||
def worker_fetch_request_size():
|
||||
return RayConfig.instance().worker_fetch_request_size()
|
||||
|
||||
@staticmethod
|
||||
def actor_max_dummy_objects():
|
||||
return RayConfig.instance().actor_max_dummy_objects()
|
||||
|
||||
@staticmethod
|
||||
def raylet_client_num_connect_attempts():
|
||||
return RayConfig.instance().raylet_client_num_connect_attempts()
|
||||
@@ -64,19 +56,6 @@ cdef class Config:
|
||||
return (RayConfig.instance()
|
||||
.raylet_fetch_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def raylet_reconstruction_timeout_milliseconds():
|
||||
return (RayConfig.instance()
|
||||
.raylet_reconstruction_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def max_num_to_reconstruct():
|
||||
return RayConfig.instance().max_num_to_reconstruct()
|
||||
|
||||
@staticmethod
|
||||
def raylet_fetch_request_size():
|
||||
return RayConfig.instance().raylet_fetch_request_size()
|
||||
|
||||
@staticmethod
|
||||
def kill_worker_timeout_milliseconds():
|
||||
return RayConfig.instance().kill_worker_timeout_milliseconds()
|
||||
@@ -85,14 +64,6 @@ cdef class Config:
|
||||
def worker_register_timeout_seconds():
|
||||
return RayConfig.instance().worker_register_timeout_seconds()
|
||||
|
||||
@staticmethod
|
||||
def max_time_for_handler_milliseconds():
|
||||
return RayConfig.instance().max_time_for_handler_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def max_time_for_loop():
|
||||
return RayConfig.instance().max_time_for_loop()
|
||||
|
||||
@staticmethod
|
||||
def redis_db_connect_retries():
|
||||
return RayConfig.instance().redis_db_connect_retries()
|
||||
@@ -101,27 +72,6 @@ cdef class Config:
|
||||
def redis_db_connect_wait_milliseconds():
|
||||
return RayConfig.instance().redis_db_connect_wait_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def plasma_default_release_delay():
|
||||
return RayConfig.instance().plasma_default_release_delay()
|
||||
|
||||
@staticmethod
|
||||
def L3_cache_size_bytes():
|
||||
return RayConfig.instance().L3_cache_size_bytes()
|
||||
|
||||
@staticmethod
|
||||
def max_tasks_to_spillback():
|
||||
return RayConfig.instance().max_tasks_to_spillback()
|
||||
|
||||
@staticmethod
|
||||
def actor_creation_num_spillbacks_warning():
|
||||
return RayConfig.instance().actor_creation_num_spillbacks_warning()
|
||||
|
||||
@staticmethod
|
||||
def node_manager_forward_task_retry_timeout_milliseconds():
|
||||
return (RayConfig.instance()
|
||||
.node_manager_forward_task_retry_timeout_milliseconds())
|
||||
|
||||
@staticmethod
|
||||
def object_manager_pull_timeout_ms():
|
||||
return RayConfig.instance().object_manager_pull_timeout_ms()
|
||||
@@ -146,14 +96,6 @@ cdef class Config:
|
||||
def num_workers_per_process_java():
|
||||
return RayConfig.instance().num_workers_per_process_java()
|
||||
|
||||
@staticmethod
|
||||
def max_task_lease_timeout_ms():
|
||||
return RayConfig.instance().max_task_lease_timeout_ms()
|
||||
|
||||
@staticmethod
|
||||
def num_actor_checkpoints_to_keep():
|
||||
return RayConfig.instance().num_actor_checkpoints_to_keep()
|
||||
|
||||
@staticmethod
|
||||
def maximum_gcs_deletion_batch_size():
|
||||
return RayConfig.instance().maximum_gcs_deletion_batch_size()
|
||||
|
||||
+4
-4
@@ -93,9 +93,9 @@ class Node:
|
||||
"The raylet IP address should only be different than the node "
|
||||
"IP address when connecting to an existing raylet; i.e., when "
|
||||
"head=False and connect_only=True.")
|
||||
if ray_params._internal_config and len(
|
||||
ray_params._internal_config) > 0 and (not head
|
||||
and not connect_only):
|
||||
if ray_params._system_config and len(
|
||||
ray_params._system_config) > 0 and (not head
|
||||
and not connect_only):
|
||||
raise ValueError(
|
||||
"Internal config parameters can only be set on the head node.")
|
||||
|
||||
@@ -124,7 +124,7 @@ class Node:
|
||||
self._localhost = socket.gethostbyname("localhost")
|
||||
self._ray_params = ray_params
|
||||
self._redis_address = ray_params.redis_address
|
||||
self._config = ray_params._internal_config or {}
|
||||
self._config = ray_params._system_config or {}
|
||||
|
||||
# Enable Plasma Store as a thread by default.
|
||||
if "plasma_store_as_thread" not in self._config:
|
||||
|
||||
+16
-14
@@ -91,8 +91,9 @@ class RayParams:
|
||||
metrics_agent_port(int): The port to bind metrics agent.
|
||||
metrics_export_port(int): The port at which metrics are exposed
|
||||
through a Prometheus endpoint.
|
||||
_internal_config (str): JSON configuration for overriding
|
||||
RayConfig defaults. For testing purposes ONLY.
|
||||
_system_config (dict): Configuration for overriding RayConfig
|
||||
defaults. Used to set system configuration and for experimental Ray
|
||||
core feature flags.
|
||||
lru_evict (bool): Enable LRU eviction if space is needed.
|
||||
enable_object_reconstruction (bool): Enable plasma reconstruction on
|
||||
failure.
|
||||
@@ -141,7 +142,7 @@ class RayParams:
|
||||
java_worker_options=None,
|
||||
load_code_from_local=False,
|
||||
start_initial_python_workers_for_first_job=False,
|
||||
_internal_config=None,
|
||||
_system_config=None,
|
||||
enable_object_reconstruction=False,
|
||||
metrics_agent_port=None,
|
||||
metrics_export_port=None,
|
||||
@@ -188,7 +189,7 @@ class RayParams:
|
||||
self.metrics_export_port = metrics_export_port
|
||||
self.start_initial_python_workers_for_first_job = (
|
||||
start_initial_python_workers_for_first_job)
|
||||
self._internal_config = _internal_config
|
||||
self._system_config = _system_config
|
||||
self._lru_evict = lru_evict
|
||||
self._enable_object_reconstruction = enable_object_reconstruction
|
||||
self.object_spilling_config = object_spilling_config
|
||||
@@ -197,26 +198,27 @@ class RayParams:
|
||||
# Set the system config options for LRU eviction.
|
||||
if lru_evict:
|
||||
# Turn off object pinning.
|
||||
if self._internal_config is None:
|
||||
self._internal_config = dict()
|
||||
if self._internal_config.get("object_pinning_enabled", False):
|
||||
if self._system_config is None:
|
||||
self._system_config = dict()
|
||||
if self._system_config.get("object_pinning_enabled", False):
|
||||
raise Exception(
|
||||
"Object pinning cannot be enabled if using LRU eviction.")
|
||||
self._internal_config["object_pinning_enabled"] = False
|
||||
self._internal_config["object_store_full_max_retries"] = -1
|
||||
self._internal_config["free_objects_period_milliseconds"] = 1000
|
||||
self._system_config["object_pinning_enabled"] = False
|
||||
self._system_config["object_store_full_max_retries"] = -1
|
||||
self._system_config["free_objects_period_milliseconds"] = 1000
|
||||
|
||||
# Set the system config options for object reconstruction.
|
||||
if enable_object_reconstruction:
|
||||
# Turn off object pinning.
|
||||
if self._internal_config is None:
|
||||
self._internal_config = dict()
|
||||
if self._system_config is None:
|
||||
self._system_config = dict()
|
||||
if lru_evict:
|
||||
raise Exception(
|
||||
"Object reconstruction cannot be enabled if using LRU "
|
||||
"eviction.")
|
||||
self._internal_config["lineage_pinning_enabled"] = True
|
||||
self._internal_config["free_objects_period_milliseconds"] = -1
|
||||
print(self._system_config)
|
||||
self._system_config["lineage_pinning_enabled"] = True
|
||||
self._system_config["free_objects_period_milliseconds"] = -1
|
||||
|
||||
def update(self, **kwargs):
|
||||
"""Update the settings according to the keyword arguments.
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"""This is the script for `ray microbenchmark`."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
@@ -110,10 +109,7 @@ def main():
|
||||
|
||||
print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")
|
||||
|
||||
ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"put_small_object_in_memory_store": True
|
||||
}))
|
||||
ray.init(_system_config={"put_small_object_in_memory_store": True})
|
||||
|
||||
value = ray.put(0)
|
||||
|
||||
@@ -138,10 +134,7 @@ def main():
|
||||
timeit("multi client put calls", put_multi_small, 1000)
|
||||
|
||||
ray.shutdown()
|
||||
ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"put_small_object_in_memory_store": False
|
||||
}))
|
||||
ray.init(_system_config={"put_small_object_in_memory_store": False})
|
||||
|
||||
value = ray.put(0)
|
||||
arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
|
||||
|
||||
@@ -358,10 +358,10 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
|
||||
type=str,
|
||||
help="Overwrite the options to start Java workers.")
|
||||
@click.option(
|
||||
"--internal-config",
|
||||
"--system-config",
|
||||
default=None,
|
||||
type=json.loads,
|
||||
help="Do NOT use this. This is for debugging/development purposes ONLY.")
|
||||
help="Override system configuration defaults.")
|
||||
@click.option(
|
||||
"--load-code-from-local",
|
||||
is_flag=True,
|
||||
@@ -394,9 +394,9 @@ def start(node_ip_address, redis_address, address, redis_port, port,
|
||||
dashboard_port, block, plasma_directory, huge_pages,
|
||||
autoscaling_config, no_redirect_worker_output, no_redirect_output,
|
||||
plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
|
||||
java_worker_options, load_code_from_local, internal_config,
|
||||
lru_evict, enable_object_reconstruction, metrics_export_port,
|
||||
log_new_style, log_color, verbose):
|
||||
java_worker_options, load_code_from_local, system_config, lru_evict,
|
||||
enable_object_reconstruction, metrics_export_port, log_new_style,
|
||||
log_color, verbose):
|
||||
"""Start Ray processes manually on the local machine."""
|
||||
cli_logger.old_style = not log_new_style
|
||||
cli_logger.color_mode = log_color
|
||||
@@ -508,7 +508,8 @@ def start(node_ip_address, redis_address, address, redis_port, port,
|
||||
dashboard_port=dashboard_port,
|
||||
java_worker_options=java_worker_options,
|
||||
load_code_from_local=load_code_from_local,
|
||||
_internal_config=internal_config,
|
||||
_system_config=json.loads(system_config)
|
||||
if system_config else system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=metrics_export_port)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import asyncio
|
||||
import errno
|
||||
import io
|
||||
import json
|
||||
import fnmatch
|
||||
import os
|
||||
import subprocess
|
||||
@@ -282,10 +281,9 @@ def recursive_fnmatch(dirpath, pattern):
|
||||
return matches
|
||||
|
||||
|
||||
def generate_internal_config_map(**kwargs):
|
||||
internal_config = json.dumps(kwargs)
|
||||
def generate_system_config_map(**kwargs):
|
||||
ray_kwargs = {
|
||||
"_internal_config": internal_config,
|
||||
"_system_config": kwargs,
|
||||
}
|
||||
return ray_kwargs
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ This file defines the common pytest fixtures used in current directory.
|
||||
"""
|
||||
|
||||
from contextlib import contextmanager
|
||||
import json
|
||||
import pytest
|
||||
import subprocess
|
||||
|
||||
@@ -19,22 +18,22 @@ def shutdown_only():
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def get_default_fixure_internal_config():
|
||||
internal_config = json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
def get_default_fixure_system_config():
|
||||
system_config = {
|
||||
"object_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
"object_store_full_max_retries": 3,
|
||||
"object_store_full_initial_delay_ms": 100,
|
||||
})
|
||||
return internal_config
|
||||
}
|
||||
return system_config
|
||||
|
||||
|
||||
def get_default_fixture_ray_kwargs():
|
||||
internal_config = get_default_fixure_internal_config()
|
||||
system_config = get_default_fixure_system_config()
|
||||
ray_kwargs = {
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": 150 * 1024 * 1024,
|
||||
"_internal_config": internal_config,
|
||||
"_system_config": system_config,
|
||||
}
|
||||
return ray_kwargs
|
||||
|
||||
@@ -125,8 +124,8 @@ def _ray_start_cluster(**kwargs):
|
||||
cluster = Cluster()
|
||||
remote_nodes = []
|
||||
for i in range(num_nodes):
|
||||
if i > 0 and "_internal_config" in init_kwargs:
|
||||
del init_kwargs["_internal_config"]
|
||||
if i > 0 and "_system_config" in init_kwargs:
|
||||
del init_kwargs["_system_config"]
|
||||
remote_nodes.append(cluster.add_node(**init_kwargs))
|
||||
# We assume driver will connect to the head (first node),
|
||||
# so ray init will be invoked if do_init is true
|
||||
@@ -164,10 +163,10 @@ def ray_start_cluster_2_nodes(request):
|
||||
def ray_start_object_store_memory(request):
|
||||
# Start the Ray processes.
|
||||
store_size = request.param
|
||||
internal_config = get_default_fixure_internal_config()
|
||||
system_config = get_default_fixure_system_config()
|
||||
init_kwargs = {
|
||||
"num_cpus": 1,
|
||||
"_internal_config": internal_config,
|
||||
"_system_config": system_config,
|
||||
"object_store_memory": store_size,
|
||||
}
|
||||
ray.init(**init_kwargs)
|
||||
@@ -208,12 +207,12 @@ def call_ray_stop_only():
|
||||
|
||||
@pytest.fixture()
|
||||
def two_node_cluster():
|
||||
internal_config = json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
system_config = {
|
||||
"object_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
})
|
||||
}
|
||||
cluster = ray.cluster_utils.Cluster(
|
||||
head_node_args={"_internal_config": internal_config})
|
||||
head_node_args={"_system_config": system_config})
|
||||
for _ in range(2):
|
||||
remote_node = cluster.add_node(num_cpus=1)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
@@ -8,94 +7,22 @@ import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
from ray.test_utils import (
|
||||
wait_for_condition,
|
||||
wait_for_pid_to_exit,
|
||||
generate_internal_config_map,
|
||||
generate_system_config_map,
|
||||
get_other_nodes,
|
||||
SignalActor,
|
||||
get_error_message,
|
||||
)
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_checkpointable_actor_cls(request):
|
||||
checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(),
|
||||
"ray_temp_checkpoint_dir") + os.sep
|
||||
if not os.path.isdir(checkpoint_dir):
|
||||
os.mkdir(checkpoint_dir)
|
||||
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
self.resumed_from_checkpoint = False
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
|
||||
def node_id(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get(self):
|
||||
return self.value
|
||||
|
||||
def was_resumed_from_checkpoint(self):
|
||||
return self.resumed_from_checkpoint
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Checkpoint the actor when value is increased to 3.
|
||||
should_checkpoint = self.value == 3
|
||||
return should_checkpoint
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
|
||||
# Save checkpoint into a file.
|
||||
with open(self.checkpoint_dir + actor_id, "a+") as f:
|
||||
print(checkpoint_id, self.value, file=f)
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
actor_id = actor_id.hex()
|
||||
filename = self.checkpoint_dir + actor_id
|
||||
# Load checkpoint from the file.
|
||||
if not os.path.isfile(filename):
|
||||
return None
|
||||
|
||||
available_checkpoint_ids = [
|
||||
c.checkpoint_id for c in available_checkpoints
|
||||
]
|
||||
with open(filename, "r") as f:
|
||||
for line in f:
|
||||
checkpoint_id, value = line.strip().split(" ")
|
||||
checkpoint_id = ray.ActorCheckpointID(
|
||||
ray.utils.hex_to_binary(checkpoint_id))
|
||||
if checkpoint_id in available_checkpoint_ids:
|
||||
self.value = int(value)
|
||||
self.resumed_from_checkpoint = True
|
||||
return checkpoint_id
|
||||
return None
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
pass
|
||||
|
||||
return CheckpointableActor
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_init_with_task_retry_delay():
|
||||
address = ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"task_retry_delay_ms": 100
|
||||
}))
|
||||
address = ray.init(_system_config={"task_retry_delay_ms": 100})
|
||||
yield address
|
||||
ray.shutdown()
|
||||
|
||||
@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
|
||||
|
||||
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 1000,
|
||||
"object_timeout_milliseconds": 1000,
|
||||
"task_retry_delay_ms": 100,
|
||||
})
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
cluster.wait_for_nodes()
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
|
||||
assert ray.get(RetryableTask.remote(remote_actor)) == 3
|
||||
|
||||
|
||||
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
|
||||
# NOTE(hchen): we set object_timeout_milliseconds to 1s for
|
||||
# this test. Because if this value is too small, spurious task reconstruction
|
||||
# may happen and cause the test failure. If the value is too large, this test
|
||||
# could be very slow. We can remove this once we support dynamic timeout.
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_internal_config_map(
|
||||
initial_reconstruction_timeout_milliseconds=1000,
|
||||
num_heartbeats_timeout=10)
|
||||
generate_system_config_map(
|
||||
object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
|
||||
],
|
||||
indirect=True)
|
||||
def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
@@ -520,287 +446,6 @@ def kill_actor(actor):
|
||||
wait_for_pid_to_exit(pid)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing and restoring from a checkpoint."""
|
||||
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
|
||||
# Call increase 3 times, triggering a checkpoint.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test checkpointing of a remote actor through method invocation."""
|
||||
|
||||
# Define a class that exposes a method to save checkpoints.
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
super(RemoteCheckpointableActor, self).__init__()
|
||||
self._should_checkpoint = False
|
||||
|
||||
def checkpoint(self):
|
||||
self._should_checkpoint = True
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
should_checkpoint = self._should_checkpoint
|
||||
self._should_checkpoint = False
|
||||
return should_checkpoint
|
||||
|
||||
cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
|
||||
actor = cls.remote()
|
||||
# Call increase 3 times.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Call a checkpoint task.
|
||||
actor.checkpoint.remote()
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing on a remote node."""
|
||||
# Place the actor on the remote node.
|
||||
cluster = ray_start_cluster_2_nodes
|
||||
remote_node = list(cluster.worker_nodes)
|
||||
actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
|
||||
actor = actor_cls.remote()
|
||||
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
|
||||
actor = actor_cls.remote()
|
||||
|
||||
# Call increase several times.
|
||||
expected = 0
|
||||
for _ in range(6):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
cluster.remove_node(remote_node[0])
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to complete."""
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def save_checkpoint(self, actor_id, checkpoint_context):
|
||||
raise Exception("Intentional error saving checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will fail.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to load."""
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def load_checkpoint(self, actor_id, checkpoints):
|
||||
raise Exception("Intentional error loading checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will succeed.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint because loading
|
||||
# it failed.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
# This overwrite currently isn't effective,
|
||||
# see https://github.com/ray-project/ray/issues/3926.
|
||||
[generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
|
||||
indirect=True,
|
||||
)
|
||||
def test_deleting_actor_checkpoint(ray_start_regular):
|
||||
"""Test deleting old actor checkpoints."""
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.checkpoint_ids = []
|
||||
|
||||
def get_checkpoint_ids(self):
|
||||
return self.checkpoint_ids
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Save checkpoints after every task
|
||||
return True
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
self.checkpoint_ids.append(checkpoint_id)
|
||||
pass
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
pass
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
assert checkpoint_id == self.checkpoint_ids[0]
|
||||
del self.checkpoint_ids[0]
|
||||
|
||||
actor = CheckpointableActor.remote()
|
||||
for i in range(19):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
|
||||
for _ in range(20):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
|
||||
|
||||
|
||||
def test_bad_checkpointable_actor_class():
|
||||
"""Test error raised if an actor class doesn't implement all abstract
|
||||
methods in the Checkpointable interface."""
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
|
||||
@ray.remote
|
||||
class BadCheckpointableActor(ray.actor.Checkpointable):
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
|
||||
def test_init_exception_in_checkpointable_actor(
|
||||
ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
|
||||
# This test is similar to test_failure.py::test_failed_actor_init.
|
||||
# This test is used to guarantee that checkpointable actor does not
|
||||
# break the same logic.
|
||||
error_message1 = "actor constructor failed"
|
||||
error_message2 = "actor method failed"
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableFailedActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
raise Exception(error_message1)
|
||||
|
||||
def fail_method(self):
|
||||
raise Exception(error_message2)
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
a = CheckpointableFailedActor.remote()
|
||||
|
||||
# Make sure that we get errors from a failed constructor.
|
||||
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0].error_message
|
||||
|
||||
# Make sure that we get errors from a failed method.
|
||||
a.fail_method.remote()
|
||||
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0].error_message
|
||||
|
||||
|
||||
def test_decorated_method(ray_start_regular):
|
||||
def method_invocation_decorator(f):
|
||||
def new_f_invocation(args, kwargs):
|
||||
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
return self.dependency
|
||||
|
||||
# Make sure it is scheduled in the second node.
|
||||
@ray.remote(resources={"node": 1}, num_cpus=1)
|
||||
@ray.remote(resources={"node": 1})
|
||||
class Owner:
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
# Wait until the `Caller` start executing the remote `call` method.
|
||||
ray.get(signal_handle.wait.remote())
|
||||
|
||||
@ray.remote
|
||||
@ray.remote(resources={"caller": 1})
|
||||
class Caller:
|
||||
def call(self, owner_pid, signal_handle, actor_handle):
|
||||
# Notify the `Owner` that the `Caller` is executing the remote
|
||||
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
return True
|
||||
|
||||
cluster = ray_start_cluster
|
||||
node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
|
||||
node_to_be_broken = cluster.add_node(resources={"node": 1})
|
||||
cluster.add_node(resources={"caller": 1})
|
||||
|
||||
owner = Owner.remote()
|
||||
owner_pid = ray.get(owner.get_pid.remote())
|
||||
|
||||
caller = Caller.remote()
|
||||
owner.create_actor.remote(caller)
|
||||
ray.get(owner.create_actor.remote(caller))
|
||||
cluster.remove_node(node_to_be_broken)
|
||||
# Wait for the `Owner` to exit.
|
||||
wait_for_pid_to_exit(owner_pid)
|
||||
|
||||
# It will hang here if location is not properly resolved.
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
try:
|
||||
@@ -241,9 +240,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
|
||||
cluster.add_node(
|
||||
num_cpus=10 * num_gpus_per_raylet,
|
||||
num_gpus=num_gpus_per_raylet,
|
||||
_internal_config=json.dumps({
|
||||
"num_heartbeats_timeout": 1000
|
||||
} if i == 0 else {}))
|
||||
_system_config={"num_heartbeats_timeout": 1000} if i == 0 else {})
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
import socket
|
||||
import time
|
||||
@@ -69,9 +68,9 @@ def test_local_scheduling_first(ray_start_cluster):
|
||||
# Disable worker caching.
|
||||
cluster.add_node(
|
||||
num_cpus=num_cpus,
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"worker_lease_timeout_milliseconds": 0,
|
||||
}))
|
||||
})
|
||||
cluster.add_node(num_cpus=num_cpus)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -332,9 +331,7 @@ def test_wait_reconstruction(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=int(10**8),
|
||||
_internal_config=json.dumps({
|
||||
"object_pinning_enabled": 0
|
||||
}))
|
||||
_system_config={"object_pinning_enabled": 0})
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
@@ -607,11 +604,7 @@ def test_move_log_files_to_old(shutdown_only):
|
||||
|
||||
|
||||
def test_lease_request_leak(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
}))
|
||||
ray.init(num_cpus=1, _system_config={"object_timeout_milliseconds": 200})
|
||||
assert len(ray.objects()) == 0
|
||||
|
||||
@ray.remote
|
||||
|
||||
@@ -3,7 +3,6 @@ import numpy as np
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
import pytest
|
||||
import sys
|
||||
import json
|
||||
|
||||
import ray
|
||||
import ray.experimental.array.remote as ra
|
||||
@@ -59,13 +58,13 @@ def test_distributed_array_assemble(ray_start_2_cpus, reload_modules):
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_2_nodes",
|
||||
[{
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
# NOTE(swang): If plasma store notifications to the raylet for new
|
||||
# objects are delayed by long enough, then this causes concurrent
|
||||
# fetch calls to timeout and mistakenly mark the object as lost.
|
||||
# Set the timeout very high to prevent this.
|
||||
"initial_reconstruction_timeout_milliseconds": 60000,
|
||||
})
|
||||
"object_timeout_milliseconds": 60000,
|
||||
}
|
||||
}],
|
||||
indirect=True)
|
||||
def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# coding: utf-8
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
@@ -206,10 +205,7 @@ def test_background_tasks_with_max_calls(shutdown_only):
|
||||
|
||||
|
||||
def test_fair_queueing(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1, _internal_config=json.dumps({
|
||||
"fair_queueing_enabled": 1
|
||||
}))
|
||||
ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1})
|
||||
|
||||
@ray.remote
|
||||
def h():
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
@@ -333,19 +332,16 @@ def test_call_chain(ray_start_cluster):
|
||||
assert ray.get(x) == 100
|
||||
|
||||
|
||||
def test_internal_config_when_connecting(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
"object_pinning_enabled": 0,
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
})
|
||||
def test_system_config_when_connecting(ray_start_cluster):
|
||||
config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
|
||||
cluster = ray.cluster_utils.Cluster()
|
||||
cluster.add_node(
|
||||
_internal_config=config, object_store_memory=100 * 1024 * 1024)
|
||||
_system_config=config, object_store_memory=100 * 1024 * 1024)
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
# Specifying _internal_config when connecting to a cluster is disallowed.
|
||||
# Specifying _system_config when connecting to a cluster is disallowed.
|
||||
with pytest.raises(ValueError):
|
||||
ray.init(address=cluster.address, _internal_config=config)
|
||||
ray.init(address=cluster.address, _system_config=config)
|
||||
|
||||
# Check that the config was picked up (object pinning is disabled).
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
@@ -138,9 +137,9 @@ def check_components_alive(cluster, component_type, check_component_alive):
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 4,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 100
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_raylet_failed(ray_start_cluster):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -908,12 +907,12 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
|
||||
|
||||
def test_connect_with_disconnected_node(shutdown_only):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 50,
|
||||
"raylet_heartbeat_timeout_milliseconds": 10,
|
||||
})
|
||||
}
|
||||
cluster = Cluster()
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
ray.init(address=cluster.address)
|
||||
p = init_error_pubsub()
|
||||
errors = get_error_message(p, 1, timeout=5)
|
||||
@@ -943,9 +942,9 @@ def test_connect_with_disconnected_node(shutdown_only):
|
||||
"ray_start_cluster_head", [{
|
||||
"num_cpus": 5,
|
||||
"object_store_memory": 10**8,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"object_store_full_max_retries": 0
|
||||
})
|
||||
}
|
||||
}],
|
||||
indirect=True)
|
||||
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
|
||||
@@ -965,9 +964,7 @@ def test_fill_object_store_exception(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=2,
|
||||
object_store_memory=10**8,
|
||||
_internal_config=json.dumps({
|
||||
"object_store_full_max_retries": 0
|
||||
}))
|
||||
_system_config={"object_store_full_max_retries": 0})
|
||||
|
||||
@ray.remote
|
||||
def expensive_task():
|
||||
@@ -997,14 +994,14 @@ def test_fill_object_store_exception(shutdown_only):
|
||||
|
||||
|
||||
def test_fill_object_store_lru_fallback(shutdown_only):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"free_objects_batch_size": 1,
|
||||
})
|
||||
}
|
||||
ray.init(
|
||||
num_cpus=2,
|
||||
object_store_memory=10**8,
|
||||
lru_evict=True,
|
||||
_internal_config=config)
|
||||
_system_config=config)
|
||||
|
||||
@ray.remote
|
||||
def expensive_task():
|
||||
@@ -1125,13 +1122,13 @@ def test_serialized_id(ray_start_cluster):
|
||||
[(False, False), (False, True), (True, False),
|
||||
(True, True)])
|
||||
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
})
|
||||
}
|
||||
cluster = Cluster()
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the parent actor.
|
||||
node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
|
||||
|
||||
@@ -3,7 +3,7 @@ import sys
|
||||
import ray
|
||||
import pytest
|
||||
from ray.test_utils import (
|
||||
generate_internal_config_map,
|
||||
generate_system_config_map,
|
||||
wait_for_condition,
|
||||
wait_for_pid_to_exit,
|
||||
)
|
||||
@@ -22,7 +22,7 @@ def increase(x):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
[generate_internal_config_map(num_heartbeats_timeout=20)],
|
||||
[generate_system_config_map(num_heartbeats_timeout=20)],
|
||||
indirect=True)
|
||||
def test_gcs_server_restart(ray_start_regular):
|
||||
actor1 = Increase.remote()
|
||||
@@ -45,7 +45,7 @@ def test_gcs_server_restart(ray_start_regular):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
[generate_internal_config_map(num_heartbeats_timeout=20)],
|
||||
[generate_system_config_map(num_heartbeats_timeout=20)],
|
||||
indirect=True)
|
||||
def test_gcs_server_restart_during_actor_creation(ray_start_regular):
|
||||
ids = []
|
||||
@@ -64,7 +64,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head",
|
||||
[generate_internal_config_map(num_heartbeats_timeout=20)],
|
||||
[generate_system_config_map(num_heartbeats_timeout=20)],
|
||||
indirect=True)
|
||||
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
|
||||
"""Checks that the node failure detector is correct when gcs server restart.
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import pytest
|
||||
try:
|
||||
import pytest_timeout
|
||||
@@ -140,9 +139,9 @@ def test_load_report(shutdown_only, max_shapes):
|
||||
cluster = ray.init(
|
||||
num_cpus=1,
|
||||
resources={resource1: 1},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"max_resource_shapes_per_load_report": max_shapes,
|
||||
}))
|
||||
})
|
||||
redis = ray.services.create_redis_client(
|
||||
cluster["redis_address"],
|
||||
password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
|
||||
@@ -48,10 +48,7 @@ def _setup_cluster_for_test(ray_start_cluster):
|
||||
NUM_NODES = 2
|
||||
cluster = ray_start_cluster
|
||||
# Add a head node.
|
||||
cluster.add_node(
|
||||
_internal_config=json.dumps({
|
||||
"metrics_report_interval_ms": 1000
|
||||
}))
|
||||
cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
|
||||
# Add worker nodes.
|
||||
[cluster.add_node() for _ in range(NUM_NODES - 1)]
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
@@ -6,7 +6,7 @@ import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.monitor import Monitor
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import generate_internal_config_map, SignalActor
|
||||
from ray.test_utils import generate_system_config_map, SignalActor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -33,12 +33,11 @@ def test_shutdown():
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_internal_config_map(
|
||||
num_heartbeats_timeout=20,
|
||||
initial_reconstruction_timeout_milliseconds=12345)
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, object_timeout_milliseconds=12345)
|
||||
],
|
||||
indirect=True)
|
||||
def test_internal_config(ray_start_cluster_head):
|
||||
def test_system_config(ray_start_cluster_head):
|
||||
"""Checks that the internal configuration setting works.
|
||||
|
||||
We set the cluster to timeout nodes after 2 seconds of no timeouts. We
|
||||
@@ -52,8 +51,7 @@ def test_internal_config(ray_start_cluster_head):
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
assert ray._config.initial_reconstruction_timeout_milliseconds(
|
||||
) == 12345
|
||||
assert ray._config.object_timeout_milliseconds() == 12345
|
||||
assert ray._config.num_heartbeats_timeout() == 20
|
||||
|
||||
ray.get([f.remote() for _ in range(5)])
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
@@ -19,9 +18,7 @@ def test_initial_workers(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
include_dashboard=True,
|
||||
_internal_config=json.dumps({
|
||||
"enable_multi_tenancy": True
|
||||
}))
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
raylet = ray.nodes()[0]
|
||||
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
|
||||
raylet["NodeManagerPort"])
|
||||
@@ -43,11 +40,7 @@ def test_initial_workers(shutdown_only):
|
||||
# different drivers were scheduled to the same worker process, that is, tasks
|
||||
# of different jobs were not correctly isolated during execution.
|
||||
def test_multi_drivers(shutdown_only):
|
||||
info = ray.init(
|
||||
num_cpus=10,
|
||||
_internal_config=json.dumps({
|
||||
"enable_multi_tenancy": True
|
||||
}))
|
||||
info = ray.init(num_cpus=10, _system_config={"enable_multi_tenancy": True})
|
||||
|
||||
driver_code = """
|
||||
import os
|
||||
@@ -120,9 +113,7 @@ def test_worker_env(shutdown_only):
|
||||
"foo1": "bar1",
|
||||
"foo2": "bar2"
|
||||
}),
|
||||
_internal_config=json.dumps({
|
||||
"enable_multi_tenancy": True
|
||||
}))
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
|
||||
@ray.remote
|
||||
def get_env(key):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
@@ -145,10 +144,10 @@ def check_components_alive(cluster, component_type, check_component_alive):
|
||||
[{
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 4,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
# Raylet codepath is not stable with a shorter timeout.
|
||||
"num_heartbeats_timeout": 10
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_raylet_failed(ray_start_cluster):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
@@ -19,13 +18,13 @@ import ray.ray_constants as ray_constants
|
||||
"num_cpus": 1,
|
||||
"num_nodes": 4,
|
||||
"object_store_memory": 1000 * 1024 * 1024,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
# Raylet codepath is not stable with a shorter timeout.
|
||||
"num_heartbeats_timeout": 10,
|
||||
"object_manager_pull_timeout_ms": 1000,
|
||||
"object_manager_push_timeout_ms": 1000,
|
||||
"object_manager_repeated_push_delay_ms": 1000,
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_object_reconstruction(ray_start_cluster):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import multiprocessing
|
||||
import numpy as np
|
||||
import pytest
|
||||
@@ -207,14 +206,14 @@ def test_object_transfer_retry(ray_start_cluster):
|
||||
# Also, force the receiving object manager to retry the pull sooner. We
|
||||
# make the chunk size smaller in order to make it easier to test objects
|
||||
# with multiple chunks.
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"object_manager_repeated_push_delay_ms": repeated_push_delay * 1000,
|
||||
"object_manager_pull_timeout_ms": repeated_push_delay * 1000 / 4,
|
||||
"object_manager_default_chunk_size": 1000
|
||||
})
|
||||
}
|
||||
object_store_memory = 150 * 1024 * 1024
|
||||
cluster.add_node(
|
||||
object_store_memory=object_store_memory, _internal_config=config)
|
||||
object_store_memory=object_store_memory, _system_config=config)
|
||||
cluster.add_node(num_gpus=1, object_store_memory=object_store_memory)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
|
||||
@@ -17,10 +17,10 @@ def test_spill_objects_manually(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}))
|
||||
})
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
replay_buffer = []
|
||||
pinned_objects = set()
|
||||
@@ -64,10 +64,10 @@ def test_spill_objects_manually_from_workers(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}))
|
||||
})
|
||||
|
||||
@ray.remote
|
||||
def _worker():
|
||||
@@ -90,10 +90,10 @@ def test_spill_objects_manually_with_workers(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
_internal_config=json.dumps({
|
||||
_system_config={
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}))
|
||||
})
|
||||
arrays = [np.random.rand(100 * 1024) for _ in range(50)]
|
||||
objects = [ray.put(arr) for arr in arrays]
|
||||
|
||||
@@ -117,7 +117,7 @@ def test_spill_objects_manually_with_workers(shutdown_only):
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": json.dumps({
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}),
|
||||
@@ -159,7 +159,7 @@ def test_spill_objects_automatically(shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_internal_config=json.dumps({
|
||||
_system_config=json.dumps({
|
||||
"max_io_workers": 4,
|
||||
"object_store_full_max_retries": 2,
|
||||
"object_store_full_initial_delay_ms": 10,
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
@@ -16,14 +15,14 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
def test_cached_object(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
node_to_kill = cluster.add_node(
|
||||
@@ -61,18 +60,17 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -121,18 +119,17 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -171,18 +168,17 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -229,18 +225,17 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -303,18 +298,17 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -384,18 +378,17 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
@@ -445,18 +438,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
if not reconstruction_enabled:
|
||||
config["lineage_pinning_enabled"] = 0
|
||||
config = json.dumps(config)
|
||||
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
_internal_config=config,
|
||||
_system_config=config,
|
||||
object_store_memory=10**8,
|
||||
enable_object_reconstruction=reconstruction_enabled)
|
||||
ray.init(address=cluster.address)
|
||||
@@ -493,17 +485,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
|
||||
|
||||
def test_reconstruction_stress(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"max_direct_call_object_size": 100,
|
||||
"task_retry_delay_ms": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(
|
||||
num_cpus=0, _internal_config=config, enable_object_reconstruction=True)
|
||||
num_cpus=0, _system_config=config, enable_object_reconstruction=True)
|
||||
ray.init(address=cluster.address)
|
||||
# Node to place the initial object.
|
||||
node_to_kill = cluster.add_node(
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# coding: utf-8
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
@@ -18,14 +17,14 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
@pytest.fixture
|
||||
def one_worker_100MiB(request):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"object_store_full_max_retries": 2,
|
||||
"task_retry_delay_ms": 0,
|
||||
})
|
||||
}
|
||||
yield ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_internal_config=config)
|
||||
_system_config=config)
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@@ -245,9 +244,7 @@ def test_pending_task_dependency_pinning(one_worker_100MiB):
|
||||
def test_feature_flag(shutdown_only):
|
||||
ray.init(
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_internal_config=json.dumps({
|
||||
"object_pinning_enabled": 0
|
||||
}))
|
||||
_system_config={"object_pinning_enabled": 0})
|
||||
|
||||
@ray.remote
|
||||
def f(array):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
@@ -20,15 +19,15 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
@pytest.fixture
|
||||
def one_worker_100MiB(request):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"object_store_full_max_retries": 2,
|
||||
"task_retry_delay_ms": 0,
|
||||
"initial_reconstruction_timeout_milliseconds": 1000,
|
||||
})
|
||||
"object_timeout_milliseconds": 1000,
|
||||
}
|
||||
yield ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_internal_config=config)
|
||||
_system_config=config)
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
@@ -23,9 +22,9 @@ def ray_start_reconstruction(request):
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": plasma_store_memory // num_nodes,
|
||||
"redis_max_memory": 10**7,
|
||||
"_internal_config": json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
})
|
||||
"_system_config": {
|
||||
"object_timeout_milliseconds": 200
|
||||
}
|
||||
})
|
||||
for i in range(num_nodes - 1):
|
||||
cluster.add_node(
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import inspect
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import pytest
|
||||
@@ -45,9 +44,9 @@ def _start_new_cluster():
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"num_cpus": 1,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
}
|
||||
})
|
||||
# Pytest doesn't play nicely with imports
|
||||
register_trainable("__fake_remote", MockRemoteTrainer)
|
||||
@@ -74,9 +73,9 @@ def start_connected_emptyhead_cluster():
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"num_cpus": 0,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
}
|
||||
})
|
||||
# Pytest doesn't play nicely with imports
|
||||
_register_all()
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# coding: utf-8
|
||||
import json
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
@@ -190,9 +189,9 @@ class RayExecutorQueueTest(unittest.TestCase):
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"num_cpus": 1,
|
||||
"_internal_config": json.dumps({
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
}
|
||||
})
|
||||
# Pytest doesn't play nicely with imports
|
||||
_register_all()
|
||||
|
||||
+11
-10
@@ -107,7 +107,6 @@ class Worker:
|
||||
self.actors = {}
|
||||
# Information used to maintain actor checkpoints.
|
||||
self.actor_checkpoint_info = {}
|
||||
self.actor_task_counter = 0
|
||||
# When the worker is constructed. Record the original value of the
|
||||
# CUDA_VISIBLE_DEVICES environment variable.
|
||||
self.original_gpu_ids = ray.utils.get_cuda_visible_devices()
|
||||
@@ -515,7 +514,7 @@ def init(address=None,
|
||||
load_code_from_local=False,
|
||||
java_worker_options=None,
|
||||
use_pickle=True,
|
||||
_internal_config=None,
|
||||
_system_config=None,
|
||||
lru_evict=False,
|
||||
enable_object_reconstruction=False,
|
||||
_metrics_export_port=None,
|
||||
@@ -631,8 +630,9 @@ def init(address=None,
|
||||
module or from the GCS.
|
||||
java_worker_options: Overwrite the options to start Java workers.
|
||||
use_pickle: Deprecated.
|
||||
_internal_config (str): JSON configuration for overriding
|
||||
RayConfig defaults. For testing purposes ONLY.
|
||||
_system_config (dict): Configuration for overriding RayConfig
|
||||
defaults. Used to set system configuration and for experimental Ray
|
||||
core feature flags.
|
||||
lru_evict (bool): If True, when an object store is full, it will evict
|
||||
objects in LRU order to make more space and when under memory
|
||||
pressure, ray.UnreconstructableError may be thrown. If False, then
|
||||
@@ -706,8 +706,9 @@ def init(address=None,
|
||||
|
||||
raylet_ip_address = node_ip_address
|
||||
|
||||
_internal_config = (json.loads(_internal_config)
|
||||
if _internal_config else {})
|
||||
_system_config = _system_config or {}
|
||||
if not isinstance(_system_config, dict):
|
||||
raise TypeError("The _system_config must be a dict.")
|
||||
|
||||
global _global_node
|
||||
if redis_address is None:
|
||||
@@ -742,7 +743,7 @@ def init(address=None,
|
||||
load_code_from_local=load_code_from_local,
|
||||
java_worker_options=java_worker_options,
|
||||
start_initial_python_workers_for_first_job=True,
|
||||
_internal_config=_internal_config,
|
||||
_system_config=_system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=_metrics_export_port,
|
||||
@@ -798,9 +799,9 @@ def init(address=None,
|
||||
if java_worker_options is not None:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"java_worker_options must not be provided.")
|
||||
if _internal_config is not None and len(_internal_config) != 0:
|
||||
if _system_config is not None and len(_system_config) != 0:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"_internal_config must not be provided.")
|
||||
"_system_config must not be provided.")
|
||||
if lru_evict:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"lru_evict must not be provided.")
|
||||
@@ -818,7 +819,7 @@ def init(address=None,
|
||||
object_ref_seed=object_ref_seed,
|
||||
temp_dir=temp_dir,
|
||||
load_code_from_local=load_code_from_local,
|
||||
_internal_config=_internal_config,
|
||||
_system_config=_system_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=_metrics_export_port)
|
||||
|
||||
@@ -122,13 +122,13 @@ if __name__ == "__main__":
|
||||
object_spilling_config = {}
|
||||
external_storage.setup_external_storage(object_spilling_config)
|
||||
|
||||
internal_config = {}
|
||||
system_config = {}
|
||||
if args.config_list is not None:
|
||||
config_list = args.config_list.split(",")
|
||||
if len(config_list) > 1:
|
||||
i = 0
|
||||
while i < len(config_list):
|
||||
internal_config[config_list[i]] = config_list[i + 1]
|
||||
system_config[config_list[i]] = config_list[i + 1]
|
||||
i += 2
|
||||
|
||||
raylet_ip_address = args.raylet_ip_address
|
||||
@@ -146,7 +146,7 @@ if __name__ == "__main__":
|
||||
temp_dir=args.temp_dir,
|
||||
load_code_from_local=args.load_code_from_local,
|
||||
metrics_agent_port=args.metrics_agent_port,
|
||||
_internal_config=json.dumps(internal_config),
|
||||
_system_config=system_config,
|
||||
)
|
||||
|
||||
node = ray.node.Node(
|
||||
|
||||
Reference in New Issue
Block a user