[core] Enable object reconstruction for retryable actor tasks (#9557)

* Test actor plasma reconstruction

* Allow resubmission of actor tasks

* doc

* Test for actor constructor

* Kill PID before removing node

* Kill pid before node
This commit is contained in:
Stephanie Wang
2020-07-23 21:15:12 -07:00
committed by GitHub
parent 239196fffc
commit f2705e2c73
8 changed files with 224 additions and 24 deletions
+162 -1
View File
@@ -1,4 +1,6 @@
import json
import os
import signal
import sys
import numpy as np
@@ -6,7 +8,11 @@ import pytest
import ray
from ray.test_utils import (
wait_for_condition, )
wait_for_condition,
wait_for_pid_to_exit,
)
# Signal used by the tests below to force-kill a worker process.
# Windows builds of Python do not provide signal.SIGKILL, so SIGTERM is used
# there instead.
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
def test_cached_object(ray_start_cluster):
@@ -217,6 +223,161 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
pass
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_basic_reconstruction_actor_task(ray_start_cluster,
                                         reconstruction_enabled):
    """Lose an actor task's return value and check how a dependent task fares.

    The actor is pinned to the "node1" resource and produces a 10**7-byte
    array. The actor process is killed and its node removed, then a
    replacement node offering the same resources is added. With
    ``reconstruction_enabled`` the actor is created with unlimited task
    retries and a task depending on the lost object is expected to succeed;
    otherwise retries are disabled and the dependent task is expected to
    raise ``RayTaskError``. In both cases the actor handle must still answer
    a method call afterwards.

    Args:
        ray_start_cluster: Fixture providing the cluster object that nodes
            are added to and removed from.
        reconstruction_enabled: Whether object reconstruction (and actor
            task retries) are switched on for this run.
    """
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "initial_reconstruction_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _internal_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(
        max_restarts=-1,
        max_task_retries=-1 if reconstruction_enabled else 0,
        resources={"node1": 1},
        num_cpus=0)
    class Actor:
        def __init__(self):
            pass

        def large_object(self):
            return np.zeros(10**7, dtype=np.uint8)

        def pid(self):
            return os.getpid()

    @ray.remote
    def dependent_task(x):
        return

    a = Actor.remote()
    pid = ray.get(a.pid.remote())
    obj = a.large_object.remote()
    # Run one consumer on the actor's node before anything is killed.
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    # Replacement node with the same custom resources as the removed one.
    cluster.add_node(
        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
            # NOTE(review): this inner check only runs if the ray.get above
            # does NOT raise, so it is effectively never exercised. It is
            # kept as committed; hoisting it out of the outer block would
            # also need `e.value`, because `e` is pytest's ExceptionInfo
            # wrapper rather than the exception. Confirm the intended
            # assertion before changing it.
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()

    # Make sure the actor handle is still usable.
    ray.get(a.pid.remote())
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_basic_reconstruction_actor_constructor(ray_start_cluster,
                                                reconstruction_enabled):
    """Lose an object that an actor's constructor and a method depend on.

    A 10**7-byte array is produced by a plain task on the "node1" resource
    and passed both to the actor constructor and to one of its methods. The
    actor process is killed and its node removed, then a replacement node
    with the same resources is added. Once the actor answers again (or
    fails with a task-level error), a method call depending on the object
    is expected to succeed when ``reconstruction_enabled`` is true and to
    raise ``RayTaskError`` otherwise.

    Args:
        ray_start_cluster: Fixture providing the cluster object that nodes
            are added to and removed from.
        reconstruction_enabled: Whether object reconstruction (and one retry
            of the producing task) are switched on for this run.
    """
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "initial_reconstruction_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _internal_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    # Both the constructor and a method depend on the large object.
    @ray.remote(max_restarts=-1)
    class Actor:
        def __init__(self, x):
            pass

        def dependent_task(self, x):
            return

        def pid(self):
            return os.getpid()

    obj = large_object.options(resources={"node1": 1}).remote()
    a = Actor.options(resources={"node1": 1}).remote(obj)
    ray.get(a.dependent_task.remote(obj))
    pid = ray.get(a.pid.remote())

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    # Replacement node with the same custom resources as the removed one.
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    # Wait for the actor to restart.
    def probe():
        """Return True once a method call no longer raises RayActorError."""
        try:
            ray.get(a.dependent_task.remote(obj))
            return True
        except ray.exceptions.RayActorError:
            # Actor not reachable yet; keep polling.
            return False
        except (ray.exceptions.RayTaskError,
                ray.exceptions.UnreconstructableError):
            # The call reached the actor and failed at task level, which
            # is enough to stop waiting.
            return True

    wait_for_condition(probe)

    if reconstruction_enabled:
        ray.get(a.dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(a.dependent_task.remote(obj))
            # NOTE(review): this inner check only runs if the ray.get above
            # does NOT raise, so it is effectively never exercised. It is
            # kept as committed; hoisting it out of the outer block would
            # also need `e.value`, because `e` is pytest's ExceptionInfo
            # wrapper rather than the exception. Confirm the intended
            # assertion before changing it.
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
config = {