mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 22:38:16 +08:00
[Placement Group]Placement Group supports gcs failover (Part1) (#11933)
This commit is contained in:
@@ -53,14 +53,16 @@ def test_gcs_server_restart(ray_start_regular):
|
||||
indirect=True)
|
||||
def test_gcs_server_restart_during_actor_creation(ray_start_regular):
    """Check that pending actor work completes across a GCS server restart.

    Actors are created and one method call is submitted on each; the GCS
    server is then killed and started again. Every submitted call must be
    reported as ready by `ray.wait` within the timeout.
    """
    # We reduce the number of actors because there are too many actors created
    # and `Too many open files` error will be thrown.
    num_actors = 20

    ids = []
    for _ in range(num_actors):
        actor = Increase.remote()
        ids.append(actor.method.remote(1))

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    # Wait for exactly as many results as were submitted so this count can
    # never drift out of sync with the number of actors created above.
    ready, unready = ray.wait(ids, num_returns=len(ids), timeout=240)
    print("Ready objects is {}.".format(ready))
    print("Unready objects is {}.".format(unready))
    assert len(unready) == 0
|
||||
|
||||
@@ -8,15 +8,21 @@ except ImportError:
|
||||
pytest_timeout = None
|
||||
|
||||
import ray
|
||||
from ray.test_utils import (get_other_nodes, wait_for_condition,
|
||||
from ray.test_utils import (generate_system_config_map, get_other_nodes,
|
||||
run_string_as_driver, wait_for_condition,
|
||||
get_error_message)
|
||||
import ray.cluster_utils
|
||||
from ray._raylet import PlacementGroupID
|
||||
from ray.test_utils import run_string_as_driver
|
||||
from ray.util.placement_group import (PlacementGroup,
|
||||
get_current_placement_group)
|
||||
|
||||
|
||||
@ray.remote
class Increase:
    """Minimal Ray actor used by the tests in this file."""

    def method(self, x):
        """Return `x` increased by two."""
        result = x + 2
        return result
|
||||
|
||||
|
||||
def test_placement_group_pack(ray_start_cluster):
|
||||
@ray.remote(num_cpus=2)
|
||||
class Actor(object):
|
||||
@@ -1156,5 +1162,96 @@ ray.shutdown()
|
||||
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_create_placement_group_after_gcs_server_restarts(
        ray_start_cluster_head):
    """Create placement groups before and after a GCS server restart.

    Two 2-CPU nodes are added to the cluster. One group is created before the
    restart and one after it; both must reach the `CREATED` state. A third
    group is then requested and is expected to stay `PENDING`.
    """

    def make_group():
        # Build a fresh bundle list for every request.
        return ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])

    def state_of(group):
        return ray.util.placement_group_table(group)["state"]

    cluster = ray_start_cluster_head
    for _ in range(2):
        cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # The first group is created while the original GCS server is running.
    group_before_restart = make_group()
    ray.get(group_before_restart.ready(), timeout=2)
    assert state_of(group_before_restart) == "CREATED"

    # Bounce the GCS server on the head node.
    head = cluster.head_node
    head.kill_gcs_server()
    head.start_gcs_server()

    # The second group is created against the restarted GCS server.
    group_after_restart = make_group()
    ray.get(group_after_restart.ready(), timeout=2)
    assert state_of(group_after_restart) == "CREATED"

    # The third group cannot be scheduled: its status is `PENDING` because
    # the cluster resource is insufficient.
    pending_group = make_group()
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(pending_group.ready(), timeout=2)
    assert state_of(pending_group) == "PENDING"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_create_actor_with_placement_group_after_gcs_server_restart(
        ray_start_cluster_head):
    """Schedule an actor into a placement group after a GCS server restart.

    The placement group is requested before the restart; the actor is created
    afterwards in bundle index 1 of that group, and a call to its `method`
    with argument 1 must return 3.
    """
    cluster = ray_start_cluster_head
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Request the placement group before the GCS server goes down.
    bundles = [{"CPU": 1}, {"CPU": 1}]
    group = ray.util.placement_group(bundles)

    # Restart the GCS server on the head node.
    head = cluster.head_node
    head.kill_gcs_server()
    head.start_gcs_server()

    # Create an actor that occupies resources of the group's second bundle.
    actor_options = Increase.options(
        placement_group=group, placement_group_bundle_index=1)
    actor = actor_options.remote()
    result = ray.get(actor.method.remote(1))
    assert result == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_create_placement_group_during_gcs_server_restart(
        ray_start_cluster_head):
    """Request many placement groups, then restart the GCS server.

    100 placement groups of two 0.1-CPU bundles each are requested on a
    cluster with one added 20-CPU node. The GCS server is then restarted and
    the first 10 groups must become ready within 2 seconds each.
    """
    cluster = ray_start_cluster_head
    cluster.add_node(num_cpus=20)
    cluster.wait_for_nodes()

    # Create placement groups during gcs server restart.
    placement_groups = [
        ray.util.placement_group([{"CPU": 0.1}, {"CPU": 0.1}])
        for _ in range(100)
    ]

    head = cluster.head_node
    head.kill_gcs_server()
    head.start_gcs_server()

    # NOTE(review): only the first 10 of the 100 groups are awaited here;
    # confirm whether checking a subset is intentional.
    for group in placement_groups[:10]:
        ray.get(group.ready(), timeout=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this file's tests verbosely and propagate pytest's exit status.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|
||||
|
||||
Reference in New Issue
Block a user