Rename max_reconstructions to max_restarts and use -1 for infinite (#8274)

Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
Max Fitton
2020-05-14 08:30:29 -07:00
committed by GitHub
parent 5f4c196fed
commit 00325eb2b2
71 changed files with 403 additions and 393 deletions
+4 -4
View File
@@ -902,7 +902,7 @@ cdef class CoreWorker:
Language language,
FunctionDescriptor function_descriptor,
args,
uint64_t max_reconstructions,
int64_t max_restarts,
resources,
placement_resources,
int32_t max_concurrency,
@@ -929,7 +929,7 @@ cdef class CoreWorker:
check_status(CCoreWorkerProcess.GetCoreWorker().CreateActor(
ray_function, args_vector,
CActorCreationOptions(
max_reconstructions, max_concurrency,
max_restarts, max_concurrency,
c_resources, c_placement_resources,
dynamic_worker_options, is_detached, name, is_asyncio),
extension_data,
@@ -970,13 +970,13 @@ cdef class CoreWorker:
return VectorToObjectIDs(return_ids)
def kill_actor(self, ActorID actor_id, c_bool no_reconstruction):
def kill_actor(self, ActorID actor_id, c_bool no_restart):
cdef:
CActorID c_actor_id = actor_id.native()
with nogil:
check_status(CCoreWorkerProcess.GetCoreWorker().KillActor(
c_actor_id, True, no_reconstruction))
c_actor_id, True, no_restart))
def cancel_task(self, ObjectID object_id, c_bool force_kill):
cdef:
+29 -27
View File
@@ -243,9 +243,8 @@ class ActorClassMetadata:
"""
def __init__(self, language, modified_class,
actor_creation_function_descriptor, class_id,
max_reconstructions, num_cpus, num_gpus, memory,
object_store_memory, resources):
actor_creation_function_descriptor, class_id, max_restarts,
num_cpus, num_gpus, memory, object_store_memory, resources):
self.language = language
self.modified_class = modified_class
self.actor_creation_function_descriptor = \
@@ -253,7 +252,7 @@ class ActorClassMetadata:
self.class_name = actor_creation_function_descriptor.class_name
self.is_cross_language = language != Language.PYTHON
self.class_id = class_id
self.max_reconstructions = max_reconstructions
self.max_restarts = max_restarts
self.num_cpus = num_cpus
self.num_gpus = num_gpus
self.memory = memory
@@ -314,9 +313,9 @@ class ActorClass:
self.__ray_metadata__.class_name))
@classmethod
def _ray_from_modified_class(cls, modified_class, class_id,
max_reconstructions, num_cpus, num_gpus,
memory, object_store_memory, resources):
def _ray_from_modified_class(cls, modified_class, class_id, max_restarts,
num_cpus, num_gpus, memory,
object_store_memory, resources):
for attribute in [
"remote", "_remote", "_ray_from_modified_class",
"_ray_from_function_descriptor"
@@ -344,22 +343,21 @@ class ActorClass:
self.__ray_metadata__ = ActorClassMetadata(
Language.PYTHON, modified_class,
actor_creation_function_descriptor, class_id, max_reconstructions,
actor_creation_function_descriptor, class_id, max_restarts,
num_cpus, num_gpus, memory, object_store_memory, resources)
return self
@classmethod
def _ray_from_function_descriptor(cls, language,
actor_creation_function_descriptor,
max_reconstructions, num_cpus, num_gpus,
memory, object_store_memory, resources):
def _ray_from_function_descriptor(
cls, language, actor_creation_function_descriptor, max_restarts,
num_cpus, num_gpus, memory, object_store_memory, resources):
self = ActorClass.__new__(ActorClass)
self.__ray_metadata__ = ActorClassMetadata(
language, None, actor_creation_function_descriptor, None,
max_reconstructions, num_cpus, num_gpus, memory,
object_store_memory, resources)
max_restarts, num_cpus, num_gpus, memory, object_store_memory,
resources)
return self
@@ -407,7 +405,7 @@ class ActorClass:
resources=None,
is_direct_call=None,
max_concurrency=None,
max_reconstructions=None,
max_restarts=None,
name=None,
detached=False):
"""Create an actor.
@@ -558,7 +556,7 @@ class ActorClass:
meta.language,
meta.actor_creation_function_descriptor,
creation_args,
max_reconstructions or meta.max_reconstructions,
max_restarts or meta.max_restarts,
resources,
actor_placement_resources,
max_concurrency,
@@ -893,21 +891,25 @@ def modify_class(cls):
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
max_reconstructions):
max_restarts):
Class = modify_class(cls)
if max_reconstructions is None:
max_reconstructions = 0
if max_restarts is None:
max_restarts = 0
if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
ray_constants.INFINITE_RECONSTRUCTION):
raise ValueError("max_reconstructions must be in range [%d, %d]." %
(ray_constants.NO_RECONSTRUCTION,
ray_constants.INFINITE_RECONSTRUCTION))
infinite_restart = max_restarts == -1
if not infinite_restart:
if max_restarts < 0:
raise ValueError("max_restarts must be an integer >= -1 "
"-1 indicates infinite restarts")
else:
# Make sure we don't pass too big of an int to C++, causing
# an overflow.
max_restarts = min(max_restarts, ray_constants.MAX_INT64_VALUE)
return ActorClass._ray_from_modified_class(
Class, ActorClassID.from_random(), max_reconstructions, num_cpus,
num_gpus, memory, object_store_memory, resources)
Class, ActorClassID.from_random(), max_restarts, num_cpus, num_gpus,
memory, object_store_memory, resources)
def exit_actor():
@@ -1005,7 +1007,7 @@ class Checkpointable(metaclass=ABCMeta):
def load_checkpoint(self, actor_id, available_checkpoints):
"""Load actor's previous checkpoint, and restore actor's state.
This method will be called when an actor is reconstructed, after
This method will be called when an actor is restarted, after
actor's constructor.
If the actor needs to restore from previous checkpoint, this function
should restore actor's state and return the checkpoint ID. Otherwise,
+1 -1
View File
@@ -76,7 +76,7 @@ def java_actor_class(class_name):
return ActorClass._ray_from_function_descriptor(
Language.JAVA,
JavaFunctionDescriptor(class_name, "<init>", ""),
0, # max_reconstructions,
0, # max_restarts,
None, # num_cpus,
None, # num_gpus,
None, # memory,
+1 -1
View File
@@ -230,7 +230,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
cdef cppclass CActorCreationOptions "ray::ActorCreationOptions":
CActorCreationOptions()
CActorCreationOptions(
uint64_t max_reconstructions,
int64_t max_restarts,
int32_t max_concurrency,
const unordered_map[c_string, double] &resources,
const unordered_map[c_string, double] &placement_resources,
+1 -1
View File
@@ -96,7 +96,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
c_vector[CObjectID] *return_ids)
CRayStatus KillActor(
const CActorID &actor_id, c_bool force_kill,
c_bool no_reconstruction)
c_bool no_restart)
CRayStatus CancelTask(const CObjectID &object_id, c_bool force_kill)
unique_ptr[CProfileEvent] CreateProfileEvent(
+4 -5
View File
@@ -159,11 +159,6 @@ LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info',"
" 'warning', 'error', 'critical'], default='info'")
# A constant indicating that an actor doesn't need reconstructions.
NO_RECONSTRUCTION = 0
# A constant indicating that an actor should be reconstructed infinite times.
INFINITE_RECONSTRUCTION = 2**30
# Constants used to define the different process types.
PROCESS_TYPE_REAPER = "reaper"
PROCESS_TYPE_MONITOR = "monitor"
@@ -203,3 +198,7 @@ MACH_PAGE_SIZE_BYTES = 4096
# TODO(ffbin): Once we entirely migrate to service-based GCS, we should
# remove it.
GCS_SERVICE_ENABLED = env_bool("RAY_GCS_SERVICE_ENABLED", True)
# Max 64 bit integer value, which is needed to ensure against overflow
# in C++ when passing integer values cross-language.
MAX_INT64_VALUE = 9223372036854775807
+1 -1
View File
@@ -127,7 +127,7 @@ def init(blocking=False,
master_actor = ServeMaster.options(
detached=True,
name=SERVE_MASTER_NAME,
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
max_restarts=-1,
).remote(queueing_policy.value, policy_kwargs, start_server, http_host,
http_port, metric_exporter)
+2 -2
View File
@@ -91,8 +91,8 @@ class ReplicaConfig:
elif "name" in self.ray_actor_options:
raise ValueError(
"Specifying name in actor_init_args is not allowed.")
elif "max_reconstructions" in self.ray_actor_options:
raise ValueError("Specifying max_reconstructions in "
elif "max_restarts" in self.ray_actor_options:
raise ValueError("Specifying max_restarts in "
"actor_init_args is not allowed.")
else:
num_cpus = self.ray_actor_options.get("num_cpus", 0)
+3 -4
View File
@@ -127,8 +127,7 @@ class ServeMaster:
detached=True,
name=SERVE_ROUTER_NAME,
max_concurrency=ASYNC_CONCURRENCY,
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
).remote(policy, policy_kwargs)
max_restarts=-1).remote(policy, policy_kwargs)
def get_router(self):
"""Returns a handle to the router managed by this actor."""
@@ -148,7 +147,7 @@ class ServeMaster:
detached=True,
name=SERVE_PROXY_NAME,
max_concurrency=ASYNC_CONCURRENCY,
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
max_restarts=-1,
).remote(host, port)
def get_http_proxy(self):
@@ -295,7 +294,7 @@ class ServeMaster:
worker_handle = async_retryable(ray.remote(backend_worker)).options(
detached=True,
name=replica_tag,
max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
max_restarts=-1,
**replica_config.ray_actor_options).remote(
backend_tag, replica_tag, replica_config.actor_init_args)
# TODO(edoakes): we should probably have a timeout here.
+1 -1
View File
@@ -131,4 +131,4 @@ def test_replica_config_validation():
with pytest.raises(ValueError):
ReplicaConfig(Class, ray_actor_options={"detached": None})
with pytest.raises(ValueError):
ReplicaConfig(Class, ray_actor_options={"max_reconstructions": None})
ReplicaConfig(Class, ray_actor_options={"max_restarts": None})
+1 -1
View File
@@ -120,7 +120,7 @@ def async_retryable(cls):
be invoked in an async context.
Usage:
@ray.remote(max_reconstructions=10000)
@ray.remote(max_restarts=10000)
@async_retryable
class A:
pass
+10 -10
View File
@@ -119,7 +119,7 @@ def test_actor_lifetime_load_balancing(ray_start_cluster):
}],
indirect=True)
def test_deleted_actor_no_restart(ray_start_regular):
@ray.remote(resources={"actor": 1}, max_reconstructions=3)
@ray.remote(resources={"actor": 1}, max_restarts=3)
class Actor:
def method(self):
return 1
@@ -155,7 +155,7 @@ def test_exception_raised_when_actor_node_dies(ray_start_cluster_head):
cluster = ray_start_cluster_head
remote_node = cluster.add_node()
@ray.remote(max_reconstructions=0)
@ray.remote(max_restarts=0)
class Counter:
def __init__(self):
self.x = 0
@@ -195,7 +195,7 @@ def test_actor_init_fails(ray_start_cluster_head):
cluster = ray_start_cluster_head
remote_node = cluster.add_node()
@ray.remote(max_reconstructions=1)
@ray.remote(max_restarts=1)
class Counter:
def __init__(self):
self.x = 0
@@ -221,7 +221,7 @@ def test_reconstruction_suppression(ray_start_cluster_head):
num_nodes = 5
worker_nodes = [cluster.add_node() for _ in range(num_nodes)]
@ray.remote(max_reconstructions=1)
@ray.remote(max_restarts=1)
class Counter:
def __init__(self):
self.x = 0
@@ -247,7 +247,7 @@ def test_reconstruction_suppression(ray_start_cluster_head):
results = []
for _ in range(10):
results += [inc.remote(actor) for actor in actors]
# Make sure that we can get the results from the reconstructed actor.
# Make sure that we can get the results from the restarted actor.
results = ray.get(results)
@@ -767,7 +767,7 @@ def test_kill(ray_start_regular, deprecated_codepath):
# hang the caller.
def test_actor_creation_task_crash(ray_start_regular):
# Test actor death in constructor.
@ray.remote(max_reconstructions=0)
@ray.remote(max_restarts=0)
class Actor:
def __init__(self):
print("crash")
@@ -781,10 +781,10 @@ def test_actor_creation_task_crash(ray_start_regular):
with pytest.raises(ray.exceptions.RayActorError):
ray.get(a.f.remote())
# Test an actor can be reconstructed successfully
# Test an actor can be restarted successfully
# after it dies in its constructor.
@ray.remote(max_reconstructions=3)
class ReconstructableActor:
@ray.remote(max_restarts=3)
class RestartableActor:
def __init__(self):
count = self.get_count()
count += 1
@@ -811,7 +811,7 @@ def test_actor_creation_task_crash(ray_start_regular):
_internal_kv_put("count", count, True)
# Verify we can get the object successfully.
ra = ReconstructableActor.remote()
ra = RestartableActor.remote()
ray.get(ra.f.remote())
+41 -42
View File
@@ -127,12 +127,12 @@ def test_actor_eviction(ray_start_object_store_memory):
assert num_success > 0
def test_actor_reconstruction(ray_start_regular):
def test_actor_restart(ray_start_regular):
"""Test actor restart when actor process is killed."""
@ray.remote(max_reconstructions=1)
class ReconstructableActor:
"""An actor that will be reconstructed at most once."""
@ray.remote(max_restarts=1)
class RestartableActor:
"""An actor that will be restarted at most once."""
def __init__(self):
self.value = 0
@@ -145,7 +145,7 @@ def test_actor_reconstruction(ray_start_regular):
def get_pid(self):
return os.getpid()
actor = ReconstructableActor.remote()
actor = RestartableActor.remote()
pid = ray.get(actor.get_pid.remote())
# Call increase 3 times
for _ in range(3):
@@ -156,31 +156,31 @@ def test_actor_reconstruction(ray_start_regular):
time.sleep(0.2)
# Kill actor process, while the above task is still being executed.
os.kill(pid, signal.SIGKILL)
# Check that the above task didn't fail and the actor is reconstructed.
# Check that the above task didn't fail and the actor is restarted.
assert ray.get(result) == 4
# Check that we can still call the actor.
assert ray.get(actor.increase.remote()) == 5
# kill actor process one more time.
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
# The actor has exceeded max reconstructions, and this task should fail.
# The actor has exceeded max restarts, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
# Create another actor.
actor = ReconstructableActor.remote()
actor = RestartableActor.remote()
# Intentionally exit the actor
actor.__ray_terminate__.remote()
# Check that the actor won't be reconstructed.
# Check that the actor won't be restarted.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
def test_actor_reconstruction_without_task(ray_start_regular):
"""Test a dead actor can be reconstructed without sending task to it."""
def test_actor_restart_without_task(ray_start_regular):
"""Test a dead actor can be restarted without sending task to it."""
@ray.remote(max_reconstructions=1)
class ReconstructableActor:
@ray.remote(max_restarts=1)
class RestartableActor:
def __init__(self, obj_ids):
for obj_id in obj_ids:
# Every time the actor gets constructed,
@@ -194,26 +194,26 @@ def test_actor_reconstruction_without_task(ray_start_regular):
return os.getpid()
obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
actor = ReconstructableActor.remote(obj_ids)
actor = RestartableActor.remote(obj_ids)
# Kill the actor.
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
# Wait until the actor is restarted.
def check_reconstructed():
def check_restarted():
worker = ray.worker.global_worker
return worker.core_worker.object_exists(obj_ids[1])
assert wait_for_condition(check_reconstructed)
assert wait_for_condition(check_restarted)
def test_caller_actor_reconstruction(ray_start_regular):
"""Test tasks from a reconstructed actor can be correctly processed
def test_caller_actor_restart(ray_start_regular):
"""Test tasks from a restarted actor can be correctly processed
by the receiving actor."""
@ray.remote(max_reconstructions=1)
class ReconstructableActor:
"""An actor that will be reconstructed at most once."""
@ray.remote(max_restarts=1)
class RestartableActor:
"""An actor that will be restarted at most once."""
def __init__(self, actor):
self.actor = actor
@@ -224,9 +224,9 @@ def test_caller_actor_reconstruction(ray_start_regular):
def get_pid(self):
return os.getpid()
@ray.remote(max_reconstructions=1)
@ray.remote(max_restarts=1)
class Actor:
"""An actor that will be reconstructed at most once."""
"""An actor that will be restarted at most once."""
def __init__(self):
self.value = 0
@@ -236,7 +236,7 @@ def test_caller_actor_reconstruction(ray_start_regular):
return self.value
remote_actor = Actor.remote()
actor = ReconstructableActor.remote(remote_actor)
actor = RestartableActor.remote(remote_actor)
# Call increase 3 times
for _ in range(3):
ray.get(actor.increase.remote())
@@ -261,9 +261,9 @@ def test_caller_task_reconstruction(ray_start_regular):
else:
os._exit(0)
@ray.remote(max_reconstructions=1)
@ray.remote(max_restarts=1)
class Actor:
"""An actor that will be reconstructed at most once."""
"""An actor that will be restarted at most once."""
def __init__(self):
self.value = 0
@@ -277,14 +277,14 @@ def test_caller_task_reconstruction(ray_start_regular):
assert ray.get(RetryableTask.remote(remote_actor)) == 3
def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
def test_actor_restart_on_node_failure(ray_start_cluster_head):
"""Test actor restart when node dies unexpectedly."""
cluster = ray_start_cluster_head
max_reconstructions = 3
max_restarts = 3
# Add a few nodes to the cluster.
# Use custom resource to make sure the actor is only created on worker
# nodes, not on the head node.
for _ in range(max_reconstructions + 2):
for _ in range(max_restarts + 2):
cluster.add_node(
resources={"a": 1},
_internal_config=json.dumps({
@@ -300,7 +300,7 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
node_to_remove = node
cluster.remove_node(node_to_remove)
@ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1})
@ray.remote(max_restarts=max_restarts, resources={"a": 1})
class MyActor:
def __init__(self):
self.value = 0
@@ -317,13 +317,13 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
for _ in range(3):
ray.get(actor.increase.remote())
for i in range(max_reconstructions):
for i in range(max_restarts):
object_store_socket = ray.get(actor.get_object_store_socket.remote())
# Kill actor's node and the actor should be reconstructed
# Kill actor's node and the actor should be restarted
# on a different node.
kill_node(object_store_socket)
# Call increase again.
# Check that the actor is reconstructed and value is correct.
# Check that the actor is restarted and value is correct.
assert ray.get(actor.increase.remote()) == 4 + i
# Check that the actor is now on a different node.
assert object_store_socket != ray.get(
@@ -332,7 +332,7 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
# kill the node again.
object_store_socket = ray.get(actor.get_object_store_socket.remote())
kill_node(object_store_socket)
# The actor has exceeded max reconstructions, and this task should fail.
# The actor has exceeded max restarts, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
@@ -347,7 +347,7 @@ def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
initial_reconstruction_timeout_milliseconds=1000)
],
indirect=True)
def test_multiple_actor_reconstruction(ray_start_cluster_head):
def test_multiple_actor_restart(ray_start_cluster_head):
cluster = ray_start_cluster_head
# This test can be made more stressful by increasing the numbers below.
# The total number of actors created will be
@@ -365,7 +365,7 @@ def test_multiple_actor_reconstruction(ray_start_cluster_head):
})) for _ in range(num_nodes)
]
@ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
@ray.remote(max_restarts=-1)
class SlowCounter:
def __init__(self):
self.x = 0
@@ -420,8 +420,7 @@ def kill_actor(actor):
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test actor checkpointing and restoring from a checkpoint."""
actor = ray.remote(
max_reconstructions=2)(ray_checkpointable_actor_cls).remote()
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
# Call increase 3 times, triggering a checkpoint.
expected = 0
for _ in range(3):
@@ -465,7 +464,7 @@ def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
self._should_checkpoint = False
return should_checkpoint
cls = ray.remote(max_reconstructions=2)(RemoteCheckpointableActor)
cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
actor = cls.remote()
# Call increase 3 times.
expected = 0
@@ -501,7 +500,7 @@ def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
# Place the actor on the remote node.
cluster = ray_start_cluster_2_nodes
remote_node = list(cluster.worker_nodes)
actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls)
actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
actor = actor_cls.remote()
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
actor = actor_cls.remote()
@@ -525,7 +524,7 @@ def test_checkpointing_save_exception(ray_start_regular,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to complete."""
@ray.remote(max_reconstructions=2)
@ray.remote(max_restarts=2)
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def save_checkpoint(self, actor_id, checkpoint_context):
raise Exception("Intentional error saving checkpoint.")
@@ -564,7 +563,7 @@ def test_checkpointing_load_exception(ray_start_regular,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to load."""
@ray.remote(max_reconstructions=2)
@ray.remote(max_restarts=2)
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def load_checkpoint(self, actor_id, checkpoints):
raise Exception("Intentional error loading checkpoint.")
@@ -70,7 +70,7 @@ def test_actor_creation_node_failure(ray_start_cluster):
assert len(ready) == len(children_out)
# Remove a node. Any actor creation tasks that were forwarded to this
# node must be reconstructed.
# node must be resubmitted.
cluster.remove_node(cluster.list_all_nodes()[-1])
+3 -3
View File
@@ -379,7 +379,7 @@ def test_actor_worker_dying(ray_start_regular):
def test_actor_worker_dying_future_tasks(ray_start_regular):
@ray.remote(max_reconstructions=0)
@ray.remote(max_restarts=0)
class Actor:
def getpid(self):
return os.getpid()
@@ -401,7 +401,7 @@ def test_actor_worker_dying_future_tasks(ray_start_regular):
def test_actor_worker_dying_nothing_in_progress(ray_start_regular):
@ray.remote(max_reconstructions=0)
@ray.remote(max_restarts=0)
class Actor:
def getpid(self):
return os.getpid()
@@ -1077,7 +1077,7 @@ def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
def probe():
return
# TODO(swang): This test does not pass if max_reconstructions > 0 for the
# TODO(swang): This test does not pass if max_restarts > 0 for the
# raylet codepath. Add this parameter once the GCS actor service is enabled
# by default.
@ray.remote
@@ -120,7 +120,7 @@ def test_actor_creation_node_failure(ray_start_cluster):
except ray.exceptions.RayActorError:
children[i] = Child.remote(death_probability)
# Remove a node. Any actor creation tasks that were forwarded to this
# node must be reconstructed.
# node must be resubmitted.
cluster.remove_node(cluster.list_all_nodes()[-1])
+1 -1
View File
@@ -274,7 +274,7 @@ def test_nondeterministic_task(ray_start_reconstruction):
def error_check(errors):
if num_nodes == 1:
# In a single-node setting, each object is evicted and
# reconstructed exactly once, so exactly half the objects will
# reconstructed exactly once, so exactly half the objects will
# produce an error during reconstruction.
min_errors = num_objects // 2
else:
+13 -14
View File
@@ -1729,14 +1729,14 @@ def make_decorator(num_return_vals=None,
resources=None,
max_calls=None,
max_retries=None,
max_reconstructions=None,
max_restarts=None,
worker=None):
def decorator(function_or_class):
if (inspect.isfunction(function_or_class)
or is_cython(function_or_class)):
# Set the remote function default resources.
if max_reconstructions is not None:
raise ValueError("The keyword 'max_reconstructions' is not "
if max_restarts is not None:
raise ValueError("The keyword 'max_restarts' is not "
"allowed for remote functions.")
return ray.remote_function.RemoteFunction(
@@ -1754,7 +1754,7 @@ def make_decorator(num_return_vals=None,
return ray.actor.make_actor(function_or_class, num_cpus, num_gpus,
memory, object_store_memory, resources,
max_reconstructions)
max_restarts)
raise TypeError("The @ray.remote decorator must be applied to "
"either a function or to a class.")
@@ -1796,16 +1796,15 @@ def remote(*args, **kwargs):
third-party libraries or to reclaim resources that cannot easily be
released, e.g., GPU memory that was acquired by TensorFlow). By
default this is infinite.
* **max_reconstructions**: Only for *actors*. This specifies the maximum
number of times that the actor should be reconstructed when it dies
* **max_restarts**: Only for *actors*. This specifies the maximum
number of times that the actor should be restarted when it dies
unexpectedly. The minimum valid value is 0 (default), which indicates
that the actor doesn't need to be reconstructed. And the maximum valid
value is ray.ray_constants.INFINITE_RECONSTRUCTION.
that the actor doesn't need to be restarted. A value of -1
indicates that an actor should be restarted indefinitely.
* **max_retries**: Only for *remote functions*. This specifies the maximum
number of times that the remote function should be rerun when the worker
process executing it crashes unexpectedly. The minimum valid value is 0,
the default is 4 (default), and the maximum valid value is
ray.ray_constants.INFINITE_RECONSTRUCTION.
the default is 4, and a value of -1 indicates infinite retries.
This can be done as follows:
@@ -1854,7 +1853,7 @@ def remote(*args, **kwargs):
"'@ray.remote', or it must be applied using some of "
"the arguments 'num_return_vals', 'num_cpus', 'num_gpus', "
"'memory', 'object_store_memory', 'resources', "
"'max_calls', or 'max_reconstructions', like "
"'max_calls', or 'max_restarts', like "
"'@ray.remote(num_return_vals=2, "
"resources={\"CustomResource\": 1})'.")
assert len(args) == 0 and len(kwargs) > 0, error_string
@@ -1867,7 +1866,7 @@ def remote(*args, **kwargs):
"object_store_memory",
"resources",
"max_calls",
"max_reconstructions",
"max_restarts",
"max_retries",
], error_string
@@ -1885,7 +1884,7 @@ def remote(*args, **kwargs):
# Handle other arguments.
num_return_vals = kwargs.get("num_return_vals")
max_calls = kwargs.get("max_calls")
max_reconstructions = kwargs.get("max_reconstructions")
max_restarts = kwargs.get("max_restarts")
memory = kwargs.get("memory")
object_store_memory = kwargs.get("object_store_memory")
max_retries = kwargs.get("max_retries")
@@ -1898,6 +1897,6 @@ def remote(*args, **kwargs):
object_store_memory=object_store_memory,
resources=resources,
max_calls=max_calls,
max_reconstructions=max_reconstructions,
max_restarts=max_restarts,
max_retries=max_retries,
worker=worker)