mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 11:01:06 +08:00
[core] Enable object reconstruction for retryable actor tasks (#9557)
* Test actor plasma reconstruction
* Allow resubmission of actor tasks
* doc
* Test for actor constructor
* Kill PID before removing node
* Kill pid before node
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
@@ -6,7 +8,11 @@ import pytest
|
||||
|
||||
import ray
|
||||
from ray.test_utils import (
|
||||
wait_for_condition, )
|
||||
wait_for_condition,
|
||||
wait_for_pid_to_exit,
|
||||
)
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
def test_cached_object(ray_start_cluster):
|
||||
@@ -217,6 +223,161 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_basic_reconstruction_actor_task(ray_start_cluster,
                                         reconstruction_enabled):
    """Fetch an actor task's return value after its node has been replaced.

    An actor pinned to the "node1" resource produces a large object. The
    actor process is killed, the node is removed without a graceful
    shutdown, and a fresh node offering the same resources is added. With
    reconstruction enabled (unlimited actor task retries), a task that
    depends on the object must succeed; with it disabled (no retries,
    lineage pinning off), fetching the dependent task's result must raise
    ``RayTaskError``.

    Args:
        ray_start_cluster: Cluster fixture; nodes are added to it here.
        reconstruction_enabled: Whether object reconstruction is turned on
            for the cluster and task retries are allowed for the actor.
    """
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "initial_reconstruction_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _internal_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(
        max_restarts=-1,
        max_task_retries=-1 if reconstruction_enabled else 0,
        resources={"node1": 1},
        num_cpus=0)
    class Actor:
        def __init__(self):
            pass

        def large_object(self):
            return np.zeros(10**7, dtype=np.uint8)

        def pid(self):
            return os.getpid()

    @ray.remote
    def dependent_task(x):
        return

    a = Actor.remote()
    pid = ray.get(a.pid.remote())
    obj = a.large_object.remote()
    # Run one dependent task on the "node1" resource and wait for it, so
    # the object has been created before the node is taken down.
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    # Replacement node offering the same "node1" resources.
    cluster.add_node(
        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
            # NOTE(review): the nested check below is never executed, because
            # the ray.get() call above raises and leaves the outer `with`
            # block first. `e` is also a pytest ExceptionInfo, so the raised
            # error would be `e.value`. Only the RayTaskError check is
            # effective; TODO confirm the expected cause type before making
            # this check live.
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()

    # Make sure the actor handle is still usable.
    ray.get(a.pid.remote())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_basic_reconstruction_actor_constructor(ray_start_cluster,
                                                reconstruction_enabled):
    """Use a lost task output as an actor constructor and method argument.

    A large object is produced by a normal task on the "node1" resource and
    passed both to an actor's constructor and to one of its methods. The
    actor process is killed, the node is removed without a graceful
    shutdown, and a fresh node offering the same resource is added. Once
    the actor responds again, calling the method with the object must
    succeed when reconstruction is enabled and raise ``RayTaskError`` when
    it is disabled.

    Args:
        ray_start_cluster: Cluster fixture; nodes are added to it here.
        reconstruction_enabled: Whether object reconstruction is turned on
            for the cluster and the producing task may be retried.
    """
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "initial_reconstruction_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _internal_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    # Both the constructor and a method depend on the large object.
    @ray.remote(max_restarts=-1)
    class Actor:
        def __init__(self, x):
            pass

        def dependent_task(self, x):
            return

        def pid(self):
            return os.getpid()

    obj = large_object.options(resources={"node1": 1}).remote()
    a = Actor.options(resources={"node1": 1}).remote(obj)
    ray.get(a.dependent_task.remote(obj))
    pid = ray.get(a.pid.remote())

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    # Replacement node offering the same "node1" resource.
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    # Wait for the actor to restart.
    def probe():
        # RayActorError means "not ready yet, poll again". A task-level
        # failure also ends the wait, since the actor did respond; the
        # branches below decide whether that outcome is acceptable.
        try:
            ray.get(a.dependent_task.remote(obj))
            return True
        except ray.exceptions.RayActorError:
            return False
        except (ray.exceptions.RayTaskError,
                ray.exceptions.UnreconstructableError):
            return True

    wait_for_condition(probe)

    if reconstruction_enabled:
        ray.get(a.dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(a.dependent_task.remote(obj))
            # NOTE(review): the nested check below is never executed, because
            # the ray.get() call above raises and leaves the outer `with`
            # block first. `e` is also a pytest ExceptionInfo, so the raised
            # error would be `e.value`. Only the RayTaskError check is
            # effective; TODO confirm the expected cause type before making
            # this check live.
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
|
||||
def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
|
||||
Reference in New Issue
Block a user