GCS server error handling for actor creation (#8899)

This commit is contained in:
fangfengbin
2020-07-02 16:27:32 +08:00
committed by GitHub
parent a7a7bef622
commit 8fcfcc4100
14 changed files with 298 additions and 104 deletions
+1 -1
View File
@@ -308,7 +308,7 @@ def test_actor_restart_on_node_failure(ray_start_cluster):
def ready(self):
return
actor = RestartableActor.remote()
actor = RestartableActor.options(detached=True).remote()
ray.get(actor.ready.remote())
results = [actor.increase.remote() for _ in range(100)]
# Kill actor node, while the above task is still being executed.
+24 -10
View File
@@ -3,18 +3,18 @@ import sys
import ray
def test_gcs_server_restart():
ray.init()
@ray.remote
class Increase:
    """Ray remote class exposing a single add-two method."""

    def method(self, x):
        """Return ``x`` plus 2."""
        bumped = x + 2
        return bumped
@ray.remote
class Increase:
    """Ray remote class exposing a single add-two method."""

    def method(self, x):
        """Return ``x`` plus 2."""
        bumped = x + 2
        return bumped
@ray.remote
def increase(x):
    """Return ``x`` plus 1 (declared as a Ray remote function)."""
    incremented = x + 1
    return incremented
@ray.remote
def increase(x):
    """Return ``x`` plus 1 (declared as a Ray remote function)."""
    incremented = x + 1
    return incremented
def test_gcs_server_restart(ray_start_regular):
actor1 = Increase.remote()
result = ray.get(actor1.method.remote(1))
assert result == 3
@@ -31,7 +31,21 @@ def test_gcs_server_restart():
result = ray.get(increase.remote(1))
assert result == 2
ray.shutdown()
def test_gcs_server_restart_during_actor_creation(ray_start_regular):
    """Check that actor calls complete when the GCS server is restarted.

    Creates 100 ``Increase`` actors and submits one ``method`` call on each,
    then kills and restarts the GCS server and asserts that every submitted
    call becomes ready before the wait times out.

    Args:
        ray_start_regular: Not used in the body; presumably a pytest fixture
            that starts Ray for the test -- confirm against conftest.
    """
    num_actors = 100

    ids = []
    for _ in range(num_actors):
        actor = Increase.remote()
        ids.append(actor.method.remote(1))

    # Restart the GCS server right after the creation requests were issued.
    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    # Keyword arguments keep this call valid on Ray versions where
    # num_returns/timeout are keyword-only, and len(ids) keeps the number of
    # awaited results in sync with the number of submitted calls.
    ready, unready = ray.wait(ids, num_returns=len(ids), timeout=240)
    print("Ready objects is {}.".format(ready))
    print("Unready objects is {}.".format(unready))
    assert len(unready) == 0
if __name__ == "__main__":