Task and actor fate sharing with the owner process (#6818)

* Add test

* Kill workers leased by failed workers

* merge

* shorten test

* Add node failure test case

* Fix FromBinary for nil IDs, add assertions

* Test

* Fate sharing on node removal, fix owner address bug

* lint

* Update src/ray/raylet/node_manager.cc

Co-Authored-By: Zhijun Fu <37800433+zhijunfu@users.noreply.github.com>

* fix

* Remove unneeded test

* fix IDs

Co-authored-by: Zhijun Fu <37800433+zhijunfu@users.noreply.github.com>
This commit is contained in:
Stephanie Wang
2020-01-20 16:44:04 -08:00
committed by GitHub
parent 14016535a5
commit 815cd0e39a
10 changed files with 191 additions and 24 deletions
+2 -1
View File
@@ -149,10 +149,11 @@ def wait_for_condition(condition_predictor,
Whether the condition is met within the timeout.
"""
time_elapsed = 0
start = time.time()
while time_elapsed <= timeout_ms:
if condition_predictor():
return True
time_elapsed += retry_interval_ms
time_elapsed = (time.time() - start) * 1000
time.sleep(retry_interval_ms / 1000.0)
return False
+79
View File
@@ -15,6 +15,7 @@ import ray.ray_constants as ray_constants
from ray.cluster_utils import Cluster
from ray.test_utils import (
relevant_errors,
wait_for_condition,
wait_for_errors,
RayTestTimeoutException,
)
@@ -991,6 +992,84 @@ def test_serialized_id(ray_start_cluster):
ray.get(get.remote([obj], True))
def _check_fate_sharing(cluster, node_to_kill):
    """Run the process-failure and node-failure fate sharing scenarios.

    Each scenario starts a parent actor on the node holding the "parent"
    resource, has it launch a child (an actor or a task) that occupies the
    "child" resource, kills the parent, and then asserts that the "child"
    resource becomes available again.

    Args:
        cluster: The Cluster the driver is connected to. It is used to
            remove and re-add the node holding the "parent" resource.
        node_to_kill: Handle of the node that currently holds the
            "parent" resource.
    """

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            # Block on a long-running child so that the "child" resource
            # stays occupied for as long as the child is alive.
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available, i.e. whether a
    # probe task requiring it finishes within the ray.wait timeout.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent process (signal 9).
        os.kill(pid, 9)
        assert wait_for_condition(
            child_resource_available, timeout_ms=10000)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent node, then add a replacement so that the next
        # scenario has a node with the "parent" resource again.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(
            num_cpus=1, resources={"parent": 1})
        assert wait_for_condition(
            child_resource_available, timeout_ms=10000)
        return node_to_kill

    test_process_failure(use_actors=True)
    test_process_failure(use_actors=False)
    node_to_kill = test_node_failure(node_to_kill, use_actors=True)
    node_to_kill = test_node_failure(node_to_kill, use_actors=False)


def test_fate_sharing(ray_start_cluster):
    """Test that child tasks and actors share fate with their owner.

    Builds a three-node cluster (a head node with no CPUs, a node with the
    "parent" resource and a node with the "child" resource), connects the
    driver to it and runs the scenarios in `_check_fate_sharing`.

    Args:
        ray_start_cluster: Pytest fixture. Its value is not used here; this
            test builds its own Cluster. NOTE(review): presumably requested
            only for its setup/teardown side effects -- confirm.
    """
    # NOTE(review): presumably shortens node failure detection so that the
    # node-failure scenarios finish within their timeouts -- confirm.
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    try:
        # Head node with no resources.
        cluster.add_node(num_cpus=0, _internal_config=config)
        # Node to place the parent actor.
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        # Node to place the child actor.
        cluster.add_node(num_cpus=1, resources={"child": 1})
        cluster.wait_for_nodes()
        ray.init(address=cluster.address)

        _check_fate_sharing(cluster, node_to_kill)
    finally:
        # This cluster is created locally rather than taken from the
        # fixture, so it must be torn down here; otherwise its processes
        # outlive the test, in particular when an assertion fails.
        ray.shutdown()
        cluster.shutdown()
if __name__ == "__main__":
    # Allow running this test file directly: hand it to pytest in verbose
    # mode and propagate pytest's result as the process exit status.
    import pytest

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)