mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 17:34:51 +08:00
Rename max_reconstructions to max_restarts and use -1 for infinite (#8274)
Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
@@ -902,7 +902,7 @@ cdef class CoreWorker:
|
||||
Language language,
|
||||
FunctionDescriptor function_descriptor,
|
||||
args,
|
||||
uint64_t max_reconstructions,
|
||||
int64_t max_restarts,
|
||||
resources,
|
||||
placement_resources,
|
||||
int32_t max_concurrency,
|
||||
@@ -929,7 +929,7 @@ cdef class CoreWorker:
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().CreateActor(
|
||||
ray_function, args_vector,
|
||||
CActorCreationOptions(
|
||||
max_reconstructions, max_concurrency,
|
||||
max_restarts, max_concurrency,
|
||||
c_resources, c_placement_resources,
|
||||
dynamic_worker_options, is_detached, name, is_asyncio),
|
||||
extension_data,
|
||||
@@ -970,13 +970,13 @@ cdef class CoreWorker:
|
||||
|
||||
return VectorToObjectIDs(return_ids)
|
||||
|
||||
def kill_actor(self, ActorID actor_id, c_bool no_reconstruction):
|
||||
def kill_actor(self, ActorID actor_id, c_bool no_restart):
|
||||
cdef:
|
||||
CActorID c_actor_id = actor_id.native()
|
||||
|
||||
with nogil:
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().KillActor(
|
||||
c_actor_id, True, no_reconstruction))
|
||||
c_actor_id, True, no_restart))
|
||||
|
||||
def cancel_task(self, ObjectID object_id, c_bool force_kill):
|
||||
cdef:
|
||||
|
||||
+29
-27
@@ -243,9 +243,8 @@ class ActorClassMetadata:
|
||||
"""
|
||||
|
||||
def __init__(self, language, modified_class,
|
||||
actor_creation_function_descriptor, class_id,
|
||||
max_reconstructions, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources):
|
||||
actor_creation_function_descriptor, class_id, max_restarts,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources):
|
||||
self.language = language
|
||||
self.modified_class = modified_class
|
||||
self.actor_creation_function_descriptor = \
|
||||
@@ -253,7 +252,7 @@ class ActorClassMetadata:
|
||||
self.class_name = actor_creation_function_descriptor.class_name
|
||||
self.is_cross_language = language != Language.PYTHON
|
||||
self.class_id = class_id
|
||||
self.max_reconstructions = max_reconstructions
|
||||
self.max_restarts = max_restarts
|
||||
self.num_cpus = num_cpus
|
||||
self.num_gpus = num_gpus
|
||||
self.memory = memory
|
||||
@@ -314,9 +313,9 @@ class ActorClass:
|
||||
self.__ray_metadata__.class_name))
|
||||
|
||||
@classmethod
|
||||
def _ray_from_modified_class(cls, modified_class, class_id,
|
||||
max_reconstructions, num_cpus, num_gpus,
|
||||
memory, object_store_memory, resources):
|
||||
def _ray_from_modified_class(cls, modified_class, class_id, max_restarts,
|
||||
num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources):
|
||||
for attribute in [
|
||||
"remote", "_remote", "_ray_from_modified_class",
|
||||
"_ray_from_function_descriptor"
|
||||
@@ -344,22 +343,21 @@ class ActorClass:
|
||||
|
||||
self.__ray_metadata__ = ActorClassMetadata(
|
||||
Language.PYTHON, modified_class,
|
||||
actor_creation_function_descriptor, class_id, max_reconstructions,
|
||||
actor_creation_function_descriptor, class_id, max_restarts,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources)
|
||||
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def _ray_from_function_descriptor(cls, language,
|
||||
actor_creation_function_descriptor,
|
||||
max_reconstructions, num_cpus, num_gpus,
|
||||
memory, object_store_memory, resources):
|
||||
def _ray_from_function_descriptor(
|
||||
cls, language, actor_creation_function_descriptor, max_restarts,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources):
|
||||
self = ActorClass.__new__(ActorClass)
|
||||
|
||||
self.__ray_metadata__ = ActorClassMetadata(
|
||||
language, None, actor_creation_function_descriptor, None,
|
||||
max_reconstructions, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources)
|
||||
max_restarts, num_cpus, num_gpus, memory, object_store_memory,
|
||||
resources)
|
||||
|
||||
return self
|
||||
|
||||
@@ -407,7 +405,7 @@ class ActorClass:
|
||||
resources=None,
|
||||
is_direct_call=None,
|
||||
max_concurrency=None,
|
||||
max_reconstructions=None,
|
||||
max_restarts=None,
|
||||
name=None,
|
||||
detached=False):
|
||||
"""Create an actor.
|
||||
@@ -558,7 +556,7 @@ class ActorClass:
|
||||
meta.language,
|
||||
meta.actor_creation_function_descriptor,
|
||||
creation_args,
|
||||
max_reconstructions or meta.max_reconstructions,
|
||||
max_restarts or meta.max_restarts,
|
||||
resources,
|
||||
actor_placement_resources,
|
||||
max_concurrency,
|
||||
@@ -893,21 +891,25 @@ def modify_class(cls):
|
||||
|
||||
|
||||
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
|
||||
max_reconstructions):
|
||||
max_restarts):
|
||||
Class = modify_class(cls)
|
||||
|
||||
if max_reconstructions is None:
|
||||
max_reconstructions = 0
|
||||
if max_restarts is None:
|
||||
max_restarts = 0
|
||||
|
||||
if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
|
||||
ray_constants.INFINITE_RECONSTRUCTION):
|
||||
raise ValueError("max_reconstructions must be in range [%d, %d]." %
|
||||
(ray_constants.NO_RECONSTRUCTION,
|
||||
ray_constants.INFINITE_RECONSTRUCTION))
|
||||
infinite_restart = max_restarts == -1
|
||||
if not infinite_restart:
|
||||
if max_restarts < 0:
|
||||
raise ValueError("max_restarts must be an integer >= -1 "
|
||||
"-1 indicates infinite restarts")
|
||||
else:
|
||||
# Make sure we don't pass too big of an int to C++, causing
|
||||
# an overflow.
|
||||
max_restarts = min(max_restarts, ray_constants.MAX_INT64_VALUE)
|
||||
|
||||
return ActorClass._ray_from_modified_class(
|
||||
Class, ActorClassID.from_random(), max_reconstructions, num_cpus,
|
||||
num_gpus, memory, object_store_memory, resources)
|
||||
Class, ActorClassID.from_random(), max_restarts, num_cpus, num_gpus,
|
||||
memory, object_store_memory, resources)
|
||||
|
||||
|
||||
def exit_actor():
|
||||
@@ -1005,7 +1007,7 @@ class Checkpointable(metaclass=ABCMeta):
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
"""Load actor's previous checkpoint, and restore actor's state.
|
||||
|
||||
This method will be called when an actor is reconstructed, after
|
||||
This method will be called when an actor is restarted, after
|
||||
actor's constructor.
|
||||
If the actor needs to restore from previous checkpoint, this function
|
||||
should restore actor's state and return the checkpoint ID. Otherwise,
|
||||
|
||||
@@ -76,7 +76,7 @@ def java_actor_class(class_name):
|
||||
return ActorClass._ray_from_function_descriptor(
|
||||
Language.JAVA,
|
||||
JavaFunctionDescriptor(class_name, "<init>", ""),
|
||||
0, # max_reconstructions,
|
||||
0, # max_restarts,
|
||||
None, # num_cpus,
|
||||
None, # num_gpus,
|
||||
None, # memory,
|
||||
|
||||
@@ -230,7 +230,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
|
||||
cdef cppclass CActorCreationOptions "ray::ActorCreationOptions":
|
||||
CActorCreationOptions()
|
||||
CActorCreationOptions(
|
||||
uint64_t max_reconstructions,
|
||||
int64_t max_restarts,
|
||||
int32_t max_concurrency,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const unordered_map[c_string, double] &placement_resources,
|
||||
|
||||
@@ -96,7 +96,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
c_vector[CObjectID] *return_ids)
|
||||
CRayStatus KillActor(
|
||||
const CActorID &actor_id, c_bool force_kill,
|
||||
c_bool no_reconstruction)
|
||||
c_bool no_restart)
|
||||
CRayStatus CancelTask(const CObjectID &object_id, c_bool force_kill)
|
||||
|
||||
unique_ptr[CProfileEvent] CreateProfileEvent(
|
||||
|
||||
@@ -159,11 +159,6 @@ LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
|
||||
LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info',"
|
||||
" 'warning', 'error', 'critical'], default='info'")
|
||||
|
||||
# A constant indicating that an actor doesn't need reconstructions.
|
||||
NO_RECONSTRUCTION = 0
|
||||
# A constant indicating that an actor should be reconstructed infinite times.
|
||||
INFINITE_RECONSTRUCTION = 2**30
|
||||
|
||||
# Constants used to define the different process types.
|
||||
PROCESS_TYPE_REAPER = "reaper"
|
||||
PROCESS_TYPE_MONITOR = "monitor"
|
||||
@@ -203,3 +198,7 @@ MACH_PAGE_SIZE_BYTES = 4096
|
||||
# TODO(ffbin): Once we entirely migrate to service-based GCS, we should
|
||||
# remove it.
|
||||
GCS_SERVICE_ENABLED = env_bool("RAY_GCS_SERVICE_ENABLED", True)
|
||||
|
||||
# Max signed 64-bit integer value, which is needed to guard against overflow
|
||||
# in C++ when passing integer values cross-language.
|
||||
MAX_INT64_VALUE = 9223372036854775807
|
||||
|
||||
@@ -127,7 +127,7 @@ def init(blocking=False,
|
||||
master_actor = ServeMaster.options(
|
||||
detached=True,
|
||||
name=SERVE_MASTER_NAME,
|
||||
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
|
||||
max_restarts=-1,
|
||||
).remote(queueing_policy.value, policy_kwargs, start_server, http_host,
|
||||
http_port, metric_exporter)
|
||||
|
||||
|
||||
@@ -91,8 +91,8 @@ class ReplicaConfig:
|
||||
elif "name" in self.ray_actor_options:
|
||||
raise ValueError(
|
||||
"Specifying name in actor_init_args is not allowed.")
|
||||
elif "max_reconstructions" in self.ray_actor_options:
|
||||
raise ValueError("Specifying max_reconstructions in "
|
||||
elif "max_restarts" in self.ray_actor_options:
|
||||
raise ValueError("Specifying max_restarts in "
|
||||
"actor_init_args is not allowed.")
|
||||
else:
|
||||
num_cpus = self.ray_actor_options.get("num_cpus", 0)
|
||||
|
||||
@@ -127,8 +127,7 @@ class ServeMaster:
|
||||
detached=True,
|
||||
name=SERVE_ROUTER_NAME,
|
||||
max_concurrency=ASYNC_CONCURRENCY,
|
||||
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
|
||||
).remote(policy, policy_kwargs)
|
||||
max_restarts=-1).remote(policy, policy_kwargs)
|
||||
|
||||
def get_router(self):
|
||||
"""Returns a handle to the router managed by this actor."""
|
||||
@@ -148,7 +147,7 @@ class ServeMaster:
|
||||
detached=True,
|
||||
name=SERVE_PROXY_NAME,
|
||||
max_concurrency=ASYNC_CONCURRENCY,
|
||||
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
|
||||
max_restarts=-1,
|
||||
).remote(host, port)
|
||||
|
||||
def get_http_proxy(self):
|
||||
@@ -295,7 +294,7 @@ class ServeMaster:
|
||||
worker_handle = async_retryable(ray.remote(backend_worker)).options(
|
||||
detached=True,
|
||||
name=replica_tag,
|
||||
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
|
||||
max_restarts=-1,
|
||||
**replica_config.ray_actor_options).remote(
|
||||
backend_tag, replica_tag, replica_config.actor_init_args)
|
||||
# TODO(edoakes): we should probably have a timeout here.
|
||||
|
||||
@@ -131,4 +131,4 @@ def test_replica_config_validation():
|
||||
with pytest.raises(ValueError):
|
||||
ReplicaConfig(Class, ray_actor_options={"detached": None})
|
||||
with pytest.raises(ValueError):
|
||||
ReplicaConfig(Class, ray_actor_options={"max_reconstructions": None})
|
||||
ReplicaConfig(Class, ray_actor_options={"max_restarts": None})
|
||||
|
||||
@@ -120,7 +120,7 @@ def async_retryable(cls):
|
||||
be invoked in an async context.
|
||||
|
||||
Usage:
|
||||
@ray.remote(max_reconstructions=10000)
|
||||
@ray.remote(max_restarts=10000)
|
||||
@async_retryable
|
||||
class A:
|
||||
pass
|
||||
|
||||
@@ -119,7 +119,7 @@ def test_actor_lifetime_load_balancing(ray_start_cluster):
|
||||
}],
|
||||
indirect=True)
|
||||
def test_deleted_actor_no_restart(ray_start_regular):
|
||||
@ray.remote(resources={"actor": 1}, max_reconstructions=3)
|
||||
@ray.remote(resources={"actor": 1}, max_restarts=3)
|
||||
class Actor:
|
||||
def method(self):
|
||||
return 1
|
||||
@@ -155,7 +155,7 @@ def test_exception_raised_when_actor_node_dies(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
remote_node = cluster.add_node()
|
||||
|
||||
@ray.remote(max_reconstructions=0)
|
||||
@ray.remote(max_restarts=0)
|
||||
class Counter:
|
||||
def __init__(self):
|
||||
self.x = 0
|
||||
@@ -195,7 +195,7 @@ def test_actor_init_fails(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
remote_node = cluster.add_node()
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
@ray.remote(max_restarts=1)
|
||||
class Counter:
|
||||
def __init__(self):
|
||||
self.x = 0
|
||||
@@ -221,7 +221,7 @@ def test_reconstruction_suppression(ray_start_cluster_head):
|
||||
num_nodes = 5
|
||||
worker_nodes = [cluster.add_node() for _ in range(num_nodes)]
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
@ray.remote(max_restarts=1)
|
||||
class Counter:
|
||||
def __init__(self):
|
||||
self.x = 0
|
||||
@@ -247,7 +247,7 @@ def test_reconstruction_suppression(ray_start_cluster_head):
|
||||
results = []
|
||||
for _ in range(10):
|
||||
results += [inc.remote(actor) for actor in actors]
|
||||
# Make sure that we can get the results from the reconstructed actor.
|
||||
# Make sure that we can get the results from the restarted actor.
|
||||
results = ray.get(results)
|
||||
|
||||
|
||||
@@ -767,7 +767,7 @@ def test_kill(ray_start_regular, deprecated_codepath):
|
||||
# hang the caller.
|
||||
def test_actor_creation_task_crash(ray_start_regular):
|
||||
# Test actor death in constructor.
|
||||
@ray.remote(max_reconstructions=0)
|
||||
@ray.remote(max_restarts=0)
|
||||
class Actor:
|
||||
def __init__(self):
|
||||
print("crash")
|
||||
@@ -781,10 +781,10 @@ def test_actor_creation_task_crash(ray_start_regular):
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(a.f.remote())
|
||||
|
||||
# Test an actor can be reconstructed successfully
|
||||
# Test an actor can be restarted successfully
|
||||
# after it dies in its constructor.
|
||||
@ray.remote(max_reconstructions=3)
|
||||
class ReconstructableActor:
|
||||
@ray.remote(max_restarts=3)
|
||||
class RestartableActor:
|
||||
def __init__(self):
|
||||
count = self.get_count()
|
||||
count += 1
|
||||
@@ -811,7 +811,7 @@ def test_actor_creation_task_crash(ray_start_regular):
|
||||
_internal_kv_put("count", count, True)
|
||||
|
||||
# Verify we can get the object successfully.
|
||||
ra = ReconstructableActor.remote()
|
||||
ra = RestartableActor.remote()
|
||||
ray.get(ra.f.remote())
|
||||
|
||||
|
||||
|
||||
@@ -127,12 +127,12 @@ def test_actor_eviction(ray_start_object_store_memory):
|
||||
assert num_success > 0
|
||||
|
||||
|
||||
def test_actor_reconstruction(ray_start_regular):
|
||||
def test_actor_restart(ray_start_regular):
|
||||
"""Test actor restart when actor process is killed."""
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
class ReconstructableActor:
|
||||
"""An actor that will be reconstructed at most once."""
|
||||
@ray.remote(max_restarts=1)
|
||||
class RestartableActor:
|
||||
"""An actor that will be restarted at most once."""
|
||||
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
@@ -145,7 +145,7 @@ def test_actor_reconstruction(ray_start_regular):
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
actor = ReconstructableActor.remote()
|
||||
actor = RestartableActor.remote()
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
# Call increase 3 times
|
||||
for _ in range(3):
|
||||
@@ -156,31 +156,31 @@ def test_actor_reconstruction(ray_start_regular):
|
||||
time.sleep(0.2)
|
||||
# Kill actor process, while the above task is still being executed.
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# Check that the above task didn't fail and the actor is reconstructed.
|
||||
# Check that the above task didn't fail and the actor is restarted.
|
||||
assert ray.get(result) == 4
|
||||
# Check that we can still call the actor.
|
||||
assert ray.get(actor.increase.remote()) == 5
|
||||
# kill actor process one more time.
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# The actor has exceeded max reconstructions, and this task should fail.
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
# Create another actor.
|
||||
actor = ReconstructableActor.remote()
|
||||
actor = RestartableActor.remote()
|
||||
# Intentionally exit the actor
|
||||
actor.__ray_terminate__.remote()
|
||||
# Check that the actor won't be reconstructed.
|
||||
# Check that the actor won't be restarted.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
|
||||
def test_actor_reconstruction_without_task(ray_start_regular):
|
||||
"""Test a dead actor can be reconstructed without sending task to it."""
|
||||
def test_actor_restart_without_task(ray_start_regular):
|
||||
"""Test a dead actor can be restarted without sending task to it."""
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
class ReconstructableActor:
|
||||
@ray.remote(max_restarts=1)
|
||||
class RestartableActor:
|
||||
def __init__(self, obj_ids):
|
||||
for obj_id in obj_ids:
|
||||
# Every time the actor gets constructed,
|
||||
@@ -194,26 +194,26 @@ def test_actor_reconstruction_without_task(ray_start_regular):
|
||||
return os.getpid()
|
||||
|
||||
obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
|
||||
actor = ReconstructableActor.remote(obj_ids)
|
||||
actor = RestartableActor.remote(obj_ids)
|
||||
# Kill the actor.
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
|
||||
# Wait until the actor is restarted.
|
||||
def check_reconstructed():
|
||||
def check_restarted():
|
||||
worker = ray.worker.global_worker
|
||||
return worker.core_worker.object_exists(obj_ids[1])
|
||||
|
||||
assert wait_for_condition(check_reconstructed)
|
||||
assert wait_for_condition(check_restarted)
|
||||
|
||||
|
||||
def test_caller_actor_reconstruction(ray_start_regular):
|
||||
"""Test tasks from a reconstructed actor can be correctly processed
|
||||
def test_caller_actor_restart(ray_start_regular):
|
||||
"""Test tasks from a restarted actor can be correctly processed
|
||||
by the receiving actor."""
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
class ReconstructableActor:
|
||||
"""An actor that will be reconstructed at most once."""
|
||||
@ray.remote(max_restarts=1)
|
||||
class RestartableActor:
|
||||
"""An actor that will be restarted at most once."""
|
||||
|
||||
def __init__(self, actor):
|
||||
self.actor = actor
|
||||
@@ -224,9 +224,9 @@ def test_caller_actor_reconstruction(ray_start_regular):
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
@ray.remote(max_restarts=1)
|
||||
class Actor:
|
||||
"""An actor that will be reconstructed at most once."""
|
||||
"""An actor that will be restarted at most once."""
|
||||
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
@@ -236,7 +236,7 @@ def test_caller_actor_reconstruction(ray_start_regular):
|
||||
return self.value
|
||||
|
||||
remote_actor = Actor.remote()
|
||||
actor = ReconstructableActor.remote(remote_actor)
|
||||
actor = RestartableActor.remote(remote_actor)
|
||||
# Call increase 3 times
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
@@ -261,9 +261,9 @@ def test_caller_task_reconstruction(ray_start_regular):
|
||||
else:
|
||||
os._exit(0)
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
@ray.remote(max_restarts=1)
|
||||
class Actor:
|
||||
"""An actor that will be reconstructed at most once."""
|
||||
"""An actor that will be restarted at most once."""
|
||||
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
@@ -277,14 +277,14 @@ def test_caller_task_reconstruction(ray_start_regular):
|
||||
assert ray.get(RetryableTask.remote(remote_actor)) == 3
|
||||
|
||||
|
||||
def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster_head):
|
||||
"""Test actor restart when node dies unexpectedly."""
|
||||
cluster = ray_start_cluster_head
|
||||
max_reconstructions = 3
|
||||
max_restarts = 3
|
||||
# Add a few nodes to the cluster.
|
||||
# Use custom resource to make sure the actor is only created on worker
|
||||
# nodes, not on the head node.
|
||||
for _ in range(max_reconstructions + 2):
|
||||
for _ in range(max_restarts + 2):
|
||||
cluster.add_node(
|
||||
resources={"a": 1},
|
||||
_internal_config=json.dumps({
|
||||
@@ -300,7 +300,7 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
node_to_remove = node
|
||||
cluster.remove_node(node_to_remove)
|
||||
|
||||
@ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1})
|
||||
@ray.remote(max_restarts=max_restarts, resources={"a": 1})
|
||||
class MyActor:
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
@@ -317,13 +317,13 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
for i in range(max_reconstructions):
|
||||
for i in range(max_restarts):
|
||||
object_store_socket = ray.get(actor.get_object_store_socket.remote())
|
||||
# Kill actor's node and the actor should be reconstructed
|
||||
# Kill actor's node and the actor should be restarted
|
||||
# on a different node.
|
||||
kill_node(object_store_socket)
|
||||
# Call increase again.
|
||||
# Check that the actor is reconstructed and value is correct.
|
||||
# Check that the actor is restarted and value is correct.
|
||||
assert ray.get(actor.increase.remote()) == 4 + i
|
||||
# Check that the actor is now on a different node.
|
||||
assert object_store_socket != ray.get(
|
||||
@@ -332,7 +332,7 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
# kill the node again.
|
||||
object_store_socket = ray.get(actor.get_object_store_socket.remote())
|
||||
kill_node(object_store_socket)
|
||||
# The actor has exceeded max reconstructions, and this task should fail.
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
@@ -347,7 +347,7 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
initial_reconstruction_timeout_milliseconds=1000)
|
||||
],
|
||||
indirect=True)
|
||||
def test_multiple_actor_reconstruction(ray_start_cluster_head):
|
||||
def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
# This test can be made more stressful by increasing the numbers below.
|
||||
# The total number of actors created will be
|
||||
@@ -365,7 +365,7 @@ def test_multiple_actor_reconstruction(ray_start_cluster_head):
|
||||
})) for _ in range(num_nodes)
|
||||
]
|
||||
|
||||
@ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
|
||||
@ray.remote(max_restarts=-1)
|
||||
class SlowCounter:
|
||||
def __init__(self):
|
||||
self.x = 0
|
||||
@@ -420,8 +420,7 @@ def kill_actor(actor):
|
||||
|
||||
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing and restoring from a checkpoint."""
|
||||
actor = ray.remote(
|
||||
max_reconstructions=2)(ray_checkpointable_actor_cls).remote()
|
||||
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
|
||||
# Call increase 3 times, triggering a checkpoint.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
@@ -465,7 +464,7 @@ def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
self._should_checkpoint = False
|
||||
return should_checkpoint
|
||||
|
||||
cls = ray.remote(max_reconstructions=2)(RemoteCheckpointableActor)
|
||||
cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
|
||||
actor = cls.remote()
|
||||
# Call increase 3 times.
|
||||
expected = 0
|
||||
@@ -501,7 +500,7 @@ def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
# Place the actor on the remote node.
|
||||
cluster = ray_start_cluster_2_nodes
|
||||
remote_node = list(cluster.worker_nodes)
|
||||
actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls)
|
||||
actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
|
||||
actor = actor_cls.remote()
|
||||
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
|
||||
actor = actor_cls.remote()
|
||||
@@ -525,7 +524,7 @@ def test_checkpointing_save_exception(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to complete."""
|
||||
|
||||
@ray.remote(max_reconstructions=2)
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def save_checkpoint(self, actor_id, checkpoint_context):
|
||||
raise Exception("Intentional error saving checkpoint.")
|
||||
@@ -564,7 +563,7 @@ def test_checkpointing_load_exception(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to load."""
|
||||
|
||||
@ray.remote(max_reconstructions=2)
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def load_checkpoint(self, actor_id, checkpoints):
|
||||
raise Exception("Intentional error loading checkpoint.")
|
||||
|
||||
@@ -70,7 +70,7 @@ def test_actor_creation_node_failure(ray_start_cluster):
|
||||
assert len(ready) == len(children_out)
|
||||
|
||||
# Remove a node. Any actor creation tasks that were forwarded to this
|
||||
# node must be reconstructed.
|
||||
# node must be restarted.
|
||||
cluster.remove_node(cluster.list_all_nodes()[-1])
|
||||
|
||||
|
||||
|
||||
@@ -379,7 +379,7 @@ def test_actor_worker_dying(ray_start_regular):
|
||||
|
||||
|
||||
def test_actor_worker_dying_future_tasks(ray_start_regular):
|
||||
@ray.remote(max_reconstructions=0)
|
||||
@ray.remote(max_restarts=0)
|
||||
class Actor:
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
@@ -401,7 +401,7 @@ def test_actor_worker_dying_future_tasks(ray_start_regular):
|
||||
|
||||
|
||||
def test_actor_worker_dying_nothing_in_progress(ray_start_regular):
|
||||
@ray.remote(max_reconstructions=0)
|
||||
@ray.remote(max_restarts=0)
|
||||
class Actor:
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
@@ -1077,7 +1077,7 @@ def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
|
||||
def probe():
|
||||
return
|
||||
|
||||
# TODO(swang): This test does not pass if max_reconstructions > 0 for the
|
||||
# TODO(swang): This test does not pass if max_restarts > 0 for the
|
||||
# raylet codepath. Add this parameter once the GCS actor service is enabled
|
||||
# by default.
|
||||
@ray.remote
|
||||
|
||||
@@ -120,7 +120,7 @@ def test_actor_creation_node_failure(ray_start_cluster):
|
||||
except ray.exceptions.RayActorError:
|
||||
children[i] = Child.remote(death_probability)
|
||||
# Remove a node. Any actor creation tasks that were forwarded to this
|
||||
# node must be reconstructed.
|
||||
# node must be resubmitted.
|
||||
cluster.remove_node(cluster.list_all_nodes()[-1])
|
||||
|
||||
|
||||
|
||||
@@ -274,7 +274,7 @@ def test_nondeterministic_task(ray_start_reconstruction):
|
||||
def error_check(errors):
|
||||
if num_nodes == 1:
|
||||
# In a single-node setting, each object is evicted and
|
||||
# reconstructed exactly once, so exactly half the objects will
|
||||
# reconstructed exactly once, so exactly half the objects will
|
||||
# produce an error during reconstruction.
|
||||
min_errors = num_objects // 2
|
||||
else:
|
||||
|
||||
+13
-14
@@ -1729,14 +1729,14 @@ def make_decorator(num_return_vals=None,
|
||||
resources=None,
|
||||
max_calls=None,
|
||||
max_retries=None,
|
||||
max_reconstructions=None,
|
||||
max_restarts=None,
|
||||
worker=None):
|
||||
def decorator(function_or_class):
|
||||
if (inspect.isfunction(function_or_class)
|
||||
or is_cython(function_or_class)):
|
||||
# Set the remote function default resources.
|
||||
if max_reconstructions is not None:
|
||||
raise ValueError("The keyword 'max_reconstructions' is not "
|
||||
if max_restarts is not None:
|
||||
raise ValueError("The keyword 'max_restarts' is not "
|
||||
"allowed for remote functions.")
|
||||
|
||||
return ray.remote_function.RemoteFunction(
|
||||
@@ -1754,7 +1754,7 @@ def make_decorator(num_return_vals=None,
|
||||
|
||||
return ray.actor.make_actor(function_or_class, num_cpus, num_gpus,
|
||||
memory, object_store_memory, resources,
|
||||
max_reconstructions)
|
||||
max_restarts)
|
||||
|
||||
raise TypeError("The @ray.remote decorator must be applied to "
|
||||
"either a function or to a class.")
|
||||
@@ -1796,16 +1796,15 @@ def remote(*args, **kwargs):
|
||||
third-party libraries or to reclaim resources that cannot easily be
|
||||
released, e.g., GPU memory that was acquired by TensorFlow). By
|
||||
default this is infinite.
|
||||
* **max_reconstructions**: Only for *actors*. This specifies the maximum
|
||||
number of times that the actor should be reconstructed when it dies
|
||||
* **max_restarts**: Only for *actors*. This specifies the maximum
|
||||
number of times that the actor should be restarted when it dies
|
||||
unexpectedly. The minimum valid value is 0 (default), which indicates
|
||||
that the actor doesn't need to be reconstructed. And the maximum valid
|
||||
value is ray.ray_constants.INFINITE_RECONSTRUCTION.
|
||||
that the actor doesn't need to be restarted. A value of -1
|
||||
indicates that an actor should be restarted indefinitely.
|
||||
* **max_retries**: Only for *remote functions*. This specifies the maximum
|
||||
number of times that the remote function should be rerun when the worker
|
||||
process executing it crashes unexpectedly. The minimum valid value is 0,
|
||||
the default is 4 (default), and the maximum valid value is
|
||||
ray.ray_constants.INFINITE_RECONSTRUCTION.
|
||||
the default is 4, and a value of -1 indicates infinite retries.
|
||||
|
||||
This can be done as follows:
|
||||
|
||||
@@ -1854,7 +1853,7 @@ def remote(*args, **kwargs):
|
||||
"'@ray.remote', or it must be applied using some of "
|
||||
"the arguments 'num_return_vals', 'num_cpus', 'num_gpus', "
|
||||
"'memory', 'object_store_memory', 'resources', "
|
||||
"'max_calls', or 'max_reconstructions', like "
|
||||
"'max_calls', or 'max_restarts', like "
|
||||
"'@ray.remote(num_return_vals=2, "
|
||||
"resources={\"CustomResource\": 1})'.")
|
||||
assert len(args) == 0 and len(kwargs) > 0, error_string
|
||||
@@ -1867,7 +1866,7 @@ def remote(*args, **kwargs):
|
||||
"object_store_memory",
|
||||
"resources",
|
||||
"max_calls",
|
||||
"max_reconstructions",
|
||||
"max_restarts",
|
||||
"max_retries",
|
||||
], error_string
|
||||
|
||||
@@ -1885,7 +1884,7 @@ def remote(*args, **kwargs):
|
||||
# Handle other arguments.
|
||||
num_return_vals = kwargs.get("num_return_vals")
|
||||
max_calls = kwargs.get("max_calls")
|
||||
max_reconstructions = kwargs.get("max_reconstructions")
|
||||
max_restarts = kwargs.get("max_restarts")
|
||||
memory = kwargs.get("memory")
|
||||
object_store_memory = kwargs.get("object_store_memory")
|
||||
max_retries = kwargs.get("max_retries")
|
||||
@@ -1898,6 +1897,6 @@ def remote(*args, **kwargs):
|
||||
object_store_memory=object_store_memory,
|
||||
resources=resources,
|
||||
max_calls=max_calls,
|
||||
max_reconstructions=max_reconstructions,
|
||||
max_restarts=max_restarts,
|
||||
max_retries=max_retries,
|
||||
worker=worker)
|
||||
|
||||
Reference in New Issue
Block a user