mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 17:34:51 +08:00
Task and actor fate sharing with the owner process (#6818)
* Add test
* Kill workers leased by failed workers
* merge
* shorten test
* Add node failure test case
* Fix FromBinary for nil IDs, add assertions
* Test
* Fate sharing on node removal, fix owner address bug
* lint
* Update src/ray/raylet/node_manager.cc
  Co-Authored-By: Zhijun Fu <37800433+zhijunfu@users.noreply.github.com>
* fix
* Remove unneeded test
* fix IDs

Co-authored-by: Zhijun Fu <37800433+zhijunfu@users.noreply.github.com>
This commit is contained in:
@@ -149,10 +149,11 @@ def wait_for_condition(condition_predictor,
|
||||
Whether the condition is met within the timeout.
|
||||
"""
|
||||
time_elapsed = 0
|
||||
start = time.time()
|
||||
while time_elapsed <= timeout_ms:
|
||||
if condition_predictor():
|
||||
return True
|
||||
time_elapsed += retry_interval_ms
|
||||
time_elapsed = (time.time() - start) * 1000
|
||||
time.sleep(retry_interval_ms / 1000.0)
|
||||
return False
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ import ray.ray_constants as ray_constants
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import (
|
||||
relevant_errors,
|
||||
wait_for_condition,
|
||||
wait_for_errors,
|
||||
RayTestTimeoutException,
|
||||
)
|
||||
@@ -991,6 +992,84 @@ def test_serialized_id(ray_start_cluster):
|
||||
ray.get(get.remote([obj], True))
|
||||
|
||||
|
||||
def test_fate_sharing(ray_start_cluster):
    """Check that a child task or actor is cleaned up when its parent dies.

    A three-node cluster is built: a head node with no CPUs, a node
    carrying the custom "parent" resource, and a node carrying the custom
    "child" resource. A parent actor on the "parent" node starts a
    long-sleeping child (a task or an actor) that occupies the single
    "child" resource. The parent is then killed, either by killing its
    process or by removing its node, and the test asserts that the "child"
    resource becomes available again.

    NOTE(review): the ``ray_start_cluster`` fixture argument is not
    referenced in the body; a fresh ``Cluster()`` is created instead and
    is not shut down inside this function. Presumably teardown is left to
    the fixture / process exit -- confirm.
    """
    # Presumably shortens node-failure detection (10 missed heartbeats at
    # 100 ms each) so the node-removal cases finish quickly -- confirm
    # against the raylet configuration docs.
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def sleep():
        """Child task body: sleep long enough to outlive the test."""
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        """No-op task that can only run while the "child" resource is free."""
        return

    @ray.remote
    class Actor(object):
        """Actor used both as the parent and (optionally) as the child."""

        def __init__(self):
            return

        def start_child(self, use_actors):
            """Start a child on the "child" resource and block on it.

            With ``use_actors`` true the child is another ``Actor`` whose
            ``sleep`` method is called; otherwise it is the ``sleep`` task.
            Either way this call blocks in ``ray.get`` on the child.
            """
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            """Child actor body: sleep long enough to outlive the test."""
            time.sleep(1000)

        def get_pid(self):
            """Return the PID of the worker process hosting this actor."""
            return os.getpid()

    # Returns whether the "child" resource is available, judged by whether
    # a probe task requiring that resource finishes within the ray.wait
    # timeout.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        # Fetch the PID first so the parent's worker process can be killed.
        pid = ray.get(a.get_pid.remote())
        # Fire and forget: start_child blocks on the sleeping child.
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled, i.e. until the "child"
        # resource is no longer available to the probe.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent process (signal 9).
        os.kill(pid, 9)
        # The child must release the "child" resource once its parent dies.
        assert wait_for_condition(child_resource_available, timeout_ms=10000)

    # Test fate sharing if the parent node dies. Returns the replacement
    # "parent" node so that the next run has a node to kill.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        # Fire and forget: start_child blocks on the sleeping child.
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled, i.e. until the "child"
        # resource is no longer available to the probe.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent node (non-gracefully), then add a fresh "parent"
        # node to take its place.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        # The child must release the "child" resource once its parent's
        # node is gone.
        assert wait_for_condition(child_resource_available, timeout_ms=10000)
        return node_to_kill

    # Run every combination: process/node failure x actor/task child.
    test_process_failure(use_actors=True)
    test_process_failure(use_actors=False)
    node_to_kill = test_node_failure(node_to_kill, use_actors=True)
    node_to_kill = test_node_failure(node_to_kill, use_actors=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # When executed directly, hand this file to pytest in verbose mode and
    # use pytest's return value as the process exit status.
    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|
||||
|
||||
Reference in New Issue
Block a user