mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:16:06 +08:00
f51c26bae6
This reverts commit 2092b097ea.
1510 lines
46 KiB
Python
1510 lines
46 KiB
Python
import pytest
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
try:
|
|
import pytest_timeout
|
|
except ImportError:
|
|
pytest_timeout = None
|
|
|
|
import ray
|
|
from ray.test_utils import (generate_system_config_map, get_other_nodes,
|
|
run_string_as_driver, wait_for_condition,
|
|
get_error_message)
|
|
import ray.cluster_utils
|
|
from ray._raylet import PlacementGroupID
|
|
from ray.util.placement_group import (PlacementGroup, placement_group,
|
|
remove_placement_group,
|
|
get_current_placement_group)
|
|
|
|
|
|
@ray.remote
|
|
class Increase:
|
|
def method(self, x):
|
|
return x + 2
|
|
|
|
|
|
def test_placement_group_pack(ray_start_cluster):
|
|
@ray.remote(num_cpus=2)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 2
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
placement_group = ray.util.placement_group(
|
|
name="name",
|
|
strategy="PACK",
|
|
bundles=[
|
|
{
|
|
"CPU": 2,
|
|
"GPU": 0 # Test 0 resource spec doesn't break tests.
|
|
},
|
|
{
|
|
"CPU": 2
|
|
}
|
|
])
|
|
ray.get(placement_group.ready())
|
|
actor_1 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0).remote()
|
|
actor_2 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1).remote()
|
|
|
|
ray.get(actor_1.value.remote())
|
|
ray.get(actor_2.value.remote())
|
|
|
|
# Get all actors.
|
|
actor_infos = ray.actors()
|
|
|
|
# Make sure all actors in counter_list are collocated in one node.
|
|
actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
|
|
actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
|
|
|
|
assert actor_info_1 and actor_info_2
|
|
|
|
node_of_actor_1 = actor_info_1["Address"]["NodeID"]
|
|
node_of_actor_2 = actor_info_2["Address"]["NodeID"]
|
|
assert node_of_actor_1 == node_of_actor_2
|
|
|
|
|
|
def test_placement_group_strict_pack(ray_start_cluster):
|
|
@ray.remote(num_cpus=2)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 2
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
placement_group = ray.util.placement_group(
|
|
name="name", strategy="STRICT_PACK", bundles=[{
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}])
|
|
ray.get(placement_group.ready())
|
|
actor_1 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0).remote()
|
|
actor_2 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1).remote()
|
|
|
|
ray.get(actor_1.value.remote())
|
|
ray.get(actor_2.value.remote())
|
|
|
|
# Get all actors.
|
|
actor_infos = ray.actors()
|
|
|
|
# Make sure all actors in counter_list are collocated in one node.
|
|
actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
|
|
actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
|
|
|
|
assert actor_info_1 and actor_info_2
|
|
|
|
node_of_actor_1 = actor_info_1["Address"]["NodeID"]
|
|
node_of_actor_2 = actor_info_2["Address"]["NodeID"]
|
|
assert node_of_actor_1 == node_of_actor_2
|
|
|
|
|
|
def test_placement_group_spread(ray_start_cluster):
|
|
@ray.remote(num_cpus=2)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 2
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
placement_group = ray.util.placement_group(
|
|
name="name", strategy="SPREAD", bundles=[{
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}])
|
|
ray.get(placement_group.ready())
|
|
actor_1 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0).remote()
|
|
actor_2 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1).remote()
|
|
|
|
ray.get(actor_1.value.remote())
|
|
ray.get(actor_2.value.remote())
|
|
|
|
# Get all actors.
|
|
actor_infos = ray.actors()
|
|
|
|
# Make sure all actors in counter_list are located in separate nodes.
|
|
actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
|
|
actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
|
|
|
|
assert actor_info_1 and actor_info_2
|
|
|
|
node_of_actor_1 = actor_info_1["Address"]["NodeID"]
|
|
node_of_actor_2 = actor_info_2["Address"]["NodeID"]
|
|
assert node_of_actor_1 != node_of_actor_2
|
|
|
|
|
|
def test_placement_group_strict_spread(ray_start_cluster):
|
|
@ray.remote(num_cpus=2)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 3
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
placement_group = ray.util.placement_group(
|
|
name="name",
|
|
strategy="STRICT_SPREAD",
|
|
bundles=[{
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}])
|
|
ray.get(placement_group.ready())
|
|
actor_1 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0).remote()
|
|
actor_2 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1).remote()
|
|
actor_3 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=2).remote()
|
|
|
|
ray.get(actor_1.value.remote())
|
|
ray.get(actor_2.value.remote())
|
|
ray.get(actor_3.value.remote())
|
|
|
|
# Get all actors.
|
|
actor_infos = ray.actors()
|
|
|
|
# Make sure all actors in counter_list are located in separate nodes.
|
|
actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
|
|
actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
|
|
actor_info_3 = actor_infos.get(actor_3._actor_id.hex())
|
|
|
|
assert actor_info_1 and actor_info_2 and actor_info_3
|
|
|
|
node_of_actor_1 = actor_info_1["Address"]["NodeID"]
|
|
node_of_actor_2 = actor_info_2["Address"]["NodeID"]
|
|
node_of_actor_3 = actor_info_3["Address"]["NodeID"]
|
|
assert node_of_actor_1 != node_of_actor_2
|
|
assert node_of_actor_1 != node_of_actor_3
|
|
assert node_of_actor_2 != node_of_actor_3
|
|
|
|
|
|
def test_placement_group_actor_resource_ids(ray_start_cluster):
|
|
@ray.remote(num_cpus=1)
|
|
class F:
|
|
def f(self):
|
|
return ray.get_resource_ids()
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 1
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
g1 = ray.util.placement_group([{"CPU": 2}])
|
|
a1 = F.options(placement_group=g1).remote()
|
|
resources = ray.get(a1.f.remote())
|
|
assert len(resources) == 1, resources
|
|
assert "CPU_group_" in list(resources.keys())[0], resources
|
|
|
|
|
|
def test_placement_group_task_resource_ids(ray_start_cluster):
|
|
@ray.remote(num_cpus=1)
|
|
def f():
|
|
return ray.get_resource_ids()
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 1
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
g1 = ray.util.placement_group([{"CPU": 2}])
|
|
o1 = f.options(placement_group=g1).remote()
|
|
resources = ray.get(o1)
|
|
assert len(resources) == 1, resources
|
|
assert "CPU_group_" in list(resources.keys())[0], resources
|
|
assert "CPU_group_0_" not in list(resources.keys())[0], resources
|
|
|
|
# Now retry with a bundle index constraint.
|
|
o1 = f.options(placement_group=g1, placement_group_bundle_index=0).remote()
|
|
resources = ray.get(o1)
|
|
assert len(resources) == 2, resources
|
|
keys = list(resources.keys())
|
|
assert "CPU_group_" in keys[0], resources
|
|
assert "CPU_group_" in keys[1], resources
|
|
assert "CPU_group_0_" in keys[0] or "CPU_group_0_" in keys[1], resources
|
|
|
|
|
|
def test_placement_group_hang(ray_start_cluster):
|
|
@ray.remote(num_cpus=1)
|
|
def f():
|
|
return ray.get_resource_ids()
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 1
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
# Warm workers up, so that this triggers the hang rice.
|
|
ray.get(f.remote())
|
|
|
|
g1 = ray.util.placement_group([{"CPU": 2}])
|
|
# This will start out infeasible. The placement group will then be created
|
|
# and it transitions to feasible.
|
|
o1 = f.options(placement_group=g1).remote()
|
|
|
|
resources = ray.get(o1)
|
|
assert len(resources) == 1, resources
|
|
assert "CPU_group_" in list(resources.keys())[0], resources
|
|
|
|
|
|
def test_remove_placement_group(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
# First try to remove a placement group that doesn't
|
|
# exist. This should not do anything.
|
|
random_group_id = PlacementGroupID.from_random()
|
|
random_placement_group = PlacementGroup(random_group_id)
|
|
for _ in range(3):
|
|
ray.util.remove_placement_group(random_placement_group)
|
|
|
|
# Creating a placement group as soon as it is
|
|
# created should work.
|
|
placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
|
|
ray.util.remove_placement_group(placement_group)
|
|
|
|
def is_placement_group_removed():
|
|
table = ray.util.placement_group_table(placement_group)
|
|
if "state" not in table:
|
|
return False
|
|
return table["state"] == "REMOVED"
|
|
|
|
wait_for_condition(is_placement_group_removed)
|
|
|
|
# # Now let's create a placement group.
|
|
placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
|
|
|
|
# Create an actor that occupies resources.
|
|
@ray.remote(num_cpus=2)
|
|
class A:
|
|
def f(self):
|
|
return 3
|
|
|
|
# Currently, there's no way to prevent
|
|
# tasks to be retried for removed placement group.
|
|
# Set max_retrie=0 for testing.
|
|
# TODO(sang): Handle this edge case.
|
|
@ray.remote(num_cpus=2, max_retries=0)
|
|
def long_running_task():
|
|
print(os.getpid())
|
|
import time
|
|
time.sleep(50)
|
|
|
|
# Schedule a long running task and actor.
|
|
task_ref = long_running_task.options(
|
|
placement_group=placement_group).remote()
|
|
a = A.options(placement_group=placement_group).remote()
|
|
assert ray.get(a.f.remote()) == 3
|
|
|
|
ray.util.remove_placement_group(placement_group)
|
|
# Subsequent remove request shouldn't do anything.
|
|
for _ in range(3):
|
|
ray.util.remove_placement_group(placement_group)
|
|
|
|
# Make sure placement group resources are
|
|
# released and we can schedule this task.
|
|
@ray.remote(num_cpus=4)
|
|
def f():
|
|
return 3
|
|
|
|
assert ray.get(f.remote()) == 3
|
|
# Since the placement group is removed,
|
|
# the actor should've been killed.
|
|
# That means this request should fail.
|
|
with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
|
|
ray.get(a.f.remote(), timeout=3.0)
|
|
with pytest.raises(ray.exceptions.WorkerCrashedError):
|
|
ray.get(task_ref)
|
|
|
|
|
|
def test_remove_pending_placement_group(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
# Create a placement group that cannot be scheduled now.
|
|
placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])
|
|
ray.util.remove_placement_group(placement_group)
|
|
|
|
# TODO(sang): Add state check here.
|
|
@ray.remote(num_cpus=4)
|
|
def f():
|
|
return 3
|
|
|
|
# Make sure this task is still schedulable.
|
|
assert ray.get(f.remote()) == 3
|
|
|
|
|
|
def test_placement_group_table(ray_start_cluster):
|
|
@ray.remote(num_cpus=2)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 2
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
# Originally placement group creation should be pending because
|
|
# there are no resources.
|
|
name = "name"
|
|
strategy = "PACK"
|
|
bundles = [{"CPU": 2, "GPU": 1}, {"CPU": 2}]
|
|
placement_group = ray.util.placement_group(
|
|
name=name, strategy=strategy, bundles=bundles)
|
|
result = ray.util.placement_group_table(placement_group)
|
|
assert result["name"] == name
|
|
assert result["strategy"] == strategy
|
|
for i in range(len(bundles)):
|
|
assert bundles[i] == result["bundles"][i]
|
|
assert result["state"] == "PENDING"
|
|
|
|
# Now the placement group should be scheduled.
|
|
cluster.add_node(num_cpus=5, num_gpus=1)
|
|
|
|
cluster.wait_for_nodes()
|
|
actor_1 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0).remote()
|
|
ray.get(actor_1.value.remote())
|
|
|
|
result = ray.util.placement_group_table(placement_group)
|
|
assert result["state"] == "CREATED"
|
|
|
|
# Add tow more placement group for placement group table test.
|
|
second_strategy = "SPREAD"
|
|
ray.util.placement_group(
|
|
name="second_placement_group",
|
|
strategy=second_strategy,
|
|
bundles=bundles)
|
|
ray.util.placement_group(
|
|
name="third_placement_group",
|
|
strategy=second_strategy,
|
|
bundles=bundles)
|
|
|
|
placement_group_table = ray.util.placement_group_table()
|
|
assert len(placement_group_table) == 3
|
|
|
|
true_name_set = {"name", "second_placement_group", "third_placement_group"}
|
|
get_name_set = set()
|
|
|
|
for _, placement_group_data in placement_group_table.items():
|
|
get_name_set.add(placement_group_data["name"])
|
|
|
|
assert true_name_set == get_name_set
|
|
|
|
|
|
def test_cuda_visible_devices(ray_start_cluster):
|
|
@ray.remote(num_gpus=1)
|
|
def f():
|
|
return os.environ["CUDA_VISIBLE_DEVICES"]
|
|
|
|
cluster = ray_start_cluster
|
|
num_nodes = 1
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_gpus=1)
|
|
ray.init(address=cluster.address)
|
|
|
|
g1 = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
|
|
o1 = f.options(placement_group=g1).remote()
|
|
|
|
devices = ray.get(o1)
|
|
assert devices == "0", devices
|
|
|
|
|
|
def test_placement_group_reschedule_when_node_dead(ray_start_cluster):
|
|
@ray.remote(num_cpus=1)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
cluster.add_node(num_cpus=4)
|
|
cluster.add_node(num_cpus=4)
|
|
cluster.add_node(num_cpus=4)
|
|
cluster.wait_for_nodes()
|
|
ray.init(address=cluster.address)
|
|
|
|
# Make sure both head and worker node are alive.
|
|
nodes = ray.nodes()
|
|
assert len(nodes) == 3
|
|
assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]
|
|
|
|
placement_group = ray.util.placement_group(
|
|
name="name",
|
|
strategy="SPREAD",
|
|
bundles=[{
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}])
|
|
actor_1 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0,
|
|
lifetime="detached").remote()
|
|
actor_2 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1,
|
|
lifetime="detached").remote()
|
|
actor_3 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=2,
|
|
lifetime="detached").remote()
|
|
ray.get(actor_1.value.remote())
|
|
ray.get(actor_2.value.remote())
|
|
ray.get(actor_3.value.remote())
|
|
|
|
cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
|
|
cluster.wait_for_nodes()
|
|
|
|
actor_4 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0,
|
|
lifetime="detached").remote()
|
|
actor_5 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1,
|
|
lifetime="detached").remote()
|
|
actor_6 = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=2,
|
|
lifetime="detached").remote()
|
|
ray.get(actor_4.value.remote())
|
|
ray.get(actor_5.value.remote())
|
|
ray.get(actor_6.value.remote())
|
|
ray.shutdown()
|
|
|
|
|
|
def test_check_bundle_index(ray_start_cluster):
|
|
@ray.remote(num_cpus=2)
|
|
class Actor(object):
|
|
def __init__(self):
|
|
self.n = 0
|
|
|
|
def value(self):
|
|
return self.n
|
|
|
|
cluster = ray_start_cluster
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
placement_group = ray.util.placement_group(
|
|
name="name", strategy="SPREAD", bundles=[{
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}])
|
|
|
|
error_count = 0
|
|
try:
|
|
Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=3).remote()
|
|
except ValueError:
|
|
error_count = error_count + 1
|
|
assert error_count == 1
|
|
|
|
try:
|
|
Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=-2).remote()
|
|
except ValueError:
|
|
error_count = error_count + 1
|
|
assert error_count == 2
|
|
|
|
try:
|
|
Actor.options(placement_group_bundle_index=0).remote()
|
|
except ValueError:
|
|
error_count = error_count + 1
|
|
assert error_count == 3
|
|
|
|
|
|
def test_pending_placement_group_wait(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
[cluster.add_node(num_cpus=2) for _ in range(1)]
|
|
ray.init(address=cluster.address)
|
|
cluster.wait_for_nodes()
|
|
|
|
# Wait on placement group that cannot be created.
|
|
placement_group = ray.util.placement_group(
|
|
name="name",
|
|
strategy="SPREAD",
|
|
bundles=[
|
|
{
|
|
"CPU": 2
|
|
},
|
|
{
|
|
"CPU": 2
|
|
},
|
|
{
|
|
"GPU": 2
|
|
},
|
|
])
|
|
ready, unready = ray.wait([placement_group.ready()], timeout=0.1)
|
|
assert len(unready) == 1
|
|
assert len(ready) == 0
|
|
table = ray.util.placement_group_table(placement_group)
|
|
assert table["state"] == "PENDING"
|
|
with pytest.raises(ray.exceptions.GetTimeoutError):
|
|
ray.get(placement_group.ready(), timeout=0.1)
|
|
|
|
|
|
def test_placement_group_wait(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
[cluster.add_node(num_cpus=2) for _ in range(2)]
|
|
ray.init(address=cluster.address)
|
|
cluster.wait_for_nodes()
|
|
|
|
# Wait on placement group that cannot be created.
|
|
placement_group = ray.util.placement_group(
|
|
name="name", strategy="SPREAD", bundles=[
|
|
{
|
|
"CPU": 2
|
|
},
|
|
{
|
|
"CPU": 2
|
|
},
|
|
])
|
|
ready, unready = ray.wait([placement_group.ready()])
|
|
assert len(unready) == 0
|
|
assert len(ready) == 1
|
|
table = ray.util.placement_group_table(placement_group)
|
|
assert table["state"] == "CREATED"
|
|
|
|
pg = ray.get(placement_group.ready())
|
|
assert pg.bundle_specs == placement_group.bundle_specs
|
|
assert pg.id.binary() == placement_group.id.binary()
|
|
|
|
|
|
def test_schedule_placement_group_when_node_add(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
cluster.add_node(num_cpus=4)
|
|
ray.init(address=cluster.address)
|
|
|
|
# Creating a placement group that cannot be satisfied yet.
|
|
placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])
|
|
|
|
def is_placement_group_created():
|
|
table = ray.util.placement_group_table(placement_group)
|
|
if "state" not in table:
|
|
return False
|
|
return table["state"] == "CREATED"
|
|
|
|
# Add a node that has GPU.
|
|
cluster.add_node(num_cpus=4, num_gpus=4)
|
|
|
|
# Make sure the placement group is created.
|
|
wait_for_condition(is_placement_group_created)
|
|
|
|
|
|
def test_atomic_creation(ray_start_cluster):
|
|
# Setup cluster.
|
|
cluster = ray_start_cluster
|
|
bundle_cpu_size = 2
|
|
bundle_per_node = 2
|
|
num_nodes = 2
|
|
|
|
[
|
|
cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
|
|
for _ in range(num_nodes)
|
|
]
|
|
ray.init(address=cluster.address)
|
|
|
|
@ray.remote(num_cpus=1)
|
|
class NormalActor:
|
|
def ping(self):
|
|
pass
|
|
|
|
@ray.remote(num_cpus=3)
|
|
def bothering_task():
|
|
time.sleep(6)
|
|
return True
|
|
|
|
# Schedule tasks to fail initial placement group creation.
|
|
tasks = [bothering_task.remote() for _ in range(2)]
|
|
|
|
# Make sure the two common task has scheduled.
|
|
def tasks_scheduled():
|
|
return ray.available_resources()["CPU"] == 2.0
|
|
|
|
wait_for_condition(tasks_scheduled)
|
|
|
|
# Create an actor that will fail bundle scheduling.
|
|
# It is important to use pack strategy to make test less flaky.
|
|
pg = ray.util.placement_group(
|
|
name="name",
|
|
strategy="SPREAD",
|
|
bundles=[{
|
|
"CPU": bundle_cpu_size
|
|
} for _ in range(num_nodes * bundle_per_node)])
|
|
|
|
# Create a placement group actor.
|
|
# This shouldn't be scheduled because atomic
|
|
# placement group creation should've failed.
|
|
pg_actor = NormalActor.options(
|
|
placement_group=pg,
|
|
placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()
|
|
|
|
# Wait on the placement group now. It should be unready
|
|
# because normal actor takes resources that are required
|
|
# for one of bundle creation.
|
|
ready, unready = ray.wait([pg.ready()], timeout=0.5)
|
|
assert len(ready) == 0
|
|
assert len(unready) == 1
|
|
# Wait until all tasks are done.
|
|
assert all(ray.get(tasks))
|
|
|
|
# Wait on the placement group creation. Since resources are now available,
|
|
# it should be ready soon.
|
|
ready, unready = ray.wait([pg.ready()])
|
|
assert len(ready) == 1
|
|
assert len(unready) == 0
|
|
|
|
# Confirm that the placement group actor is created. It will
|
|
# raise an exception if actor was scheduled before placement
|
|
# group was created thus it checks atomicity.
|
|
ray.get(pg_actor.ping.remote(), timeout=3.0)
|
|
ray.kill(pg_actor)
|
|
|
|
# Make sure atomic creation failure didn't impact resources.
|
|
@ray.remote(num_cpus=bundle_cpu_size)
|
|
def resource_check():
|
|
return True
|
|
|
|
# This should hang because every resources
|
|
# are claimed by placement group.
|
|
check_without_pg = [
|
|
resource_check.remote() for _ in range(bundle_per_node * num_nodes)
|
|
]
|
|
|
|
# This all should scheduled on each bundle.
|
|
check_with_pg = [
|
|
resource_check.options(
|
|
placement_group=pg, placement_group_bundle_index=i).remote()
|
|
for i in range(bundle_per_node * num_nodes)
|
|
]
|
|
|
|
# Make sure these are hanging.
|
|
ready, unready = ray.wait(check_without_pg, timeout=0)
|
|
assert len(ready) == 0
|
|
assert len(unready) == bundle_per_node * num_nodes
|
|
|
|
# Make sure these are all scheduled.
|
|
assert all(ray.get(check_with_pg))
|
|
|
|
ray.util.remove_placement_group(pg)
|
|
|
|
def pg_removed():
|
|
return ray.util.placement_group_table(pg)["state"] == "REMOVED"
|
|
|
|
wait_for_condition(pg_removed)
|
|
|
|
# Make sure check without pgs are all
|
|
# scheduled properly because resources are cleaned up.
|
|
assert all(ray.get(check_without_pg))
|
|
|
|
|
|
def test_mini_integration(ray_start_cluster):
|
|
# Create bundles as many as number of gpus in the cluster.
|
|
# Do some random work and make sure all resources are properly recovered.
|
|
|
|
cluster = ray_start_cluster
|
|
|
|
num_nodes = 5
|
|
per_bundle_gpus = 2
|
|
gpu_per_node = 4
|
|
total_gpus = num_nodes * per_bundle_gpus * gpu_per_node
|
|
per_node_gpus = per_bundle_gpus * gpu_per_node
|
|
|
|
bundles_per_pg = 2
|
|
total_num_pg = total_gpus // (bundles_per_pg * per_bundle_gpus)
|
|
|
|
[
|
|
cluster.add_node(num_cpus=2, num_gpus=per_bundle_gpus * gpu_per_node)
|
|
for _ in range(num_nodes)
|
|
]
|
|
cluster.wait_for_nodes()
|
|
ray.init(address=cluster.address)
|
|
|
|
@ray.remote(num_cpus=0, num_gpus=1)
|
|
def random_tasks():
|
|
import time
|
|
import random
|
|
sleep_time = random.uniform(0.1, 0.2)
|
|
time.sleep(sleep_time)
|
|
return True
|
|
|
|
pgs = []
|
|
pg_tasks = []
|
|
# total bundle gpu usage = bundles_per_pg * total_num_pg * per_bundle_gpus
|
|
# Note this is half of total
|
|
for index in range(total_num_pg):
|
|
pgs.append(
|
|
ray.util.placement_group(
|
|
name=f"name{index}",
|
|
strategy="PACK",
|
|
bundles=[{
|
|
"GPU": per_bundle_gpus
|
|
} for _ in range(bundles_per_pg)]))
|
|
|
|
# Schedule tasks.
|
|
for i in range(total_num_pg):
|
|
pg = pgs[i]
|
|
pg_tasks.append([
|
|
random_tasks.options(
|
|
placement_group=pg,
|
|
placement_group_bundle_index=bundle_index).remote()
|
|
for bundle_index in range(bundles_per_pg)
|
|
])
|
|
|
|
# Make sure tasks are done and we remove placement groups.
|
|
num_removed_pg = 0
|
|
pg_indexes = [2, 3, 1, 7, 8, 9, 0, 6, 4, 5]
|
|
while num_removed_pg < total_num_pg:
|
|
index = pg_indexes[num_removed_pg]
|
|
pg = pgs[index]
|
|
assert all(ray.get(pg_tasks[index]))
|
|
ray.util.remove_placement_group(pg)
|
|
num_removed_pg += 1
|
|
|
|
@ray.remote(num_cpus=2, num_gpus=per_node_gpus)
|
|
class A:
|
|
def ping(self):
|
|
return True
|
|
|
|
# Make sure all resources are properly returned by scheduling
|
|
# actors that take up all existing resources.
|
|
actors = [A.remote() for _ in range(num_nodes)]
|
|
assert all(ray.get([a.ping.remote() for a in actors]))
|
|
|
|
|
|
def test_capture_child_actors(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
total_num_actors = 4
|
|
for _ in range(2):
|
|
cluster.add_node(num_cpus=total_num_actors)
|
|
ray.init(address=cluster.address)
|
|
|
|
pg = ray.util.placement_group(
|
|
[{
|
|
"CPU": 2
|
|
}, {
|
|
"CPU": 2
|
|
}], strategy="STRICT_PACK")
|
|
ray.get(pg.ready())
|
|
|
|
# If get_current_placement_group is used when the current worker/driver
|
|
# doesn't belong to any of placement group, it should return None.
|
|
assert get_current_placement_group() is None
|
|
|
|
# Test actors first.
|
|
@ray.remote(num_cpus=1)
|
|
class NestedActor:
|
|
def ready(self):
|
|
return True
|
|
|
|
@ray.remote(num_cpus=1)
|
|
class Actor:
|
|
def __init__(self):
|
|
self.actors = []
|
|
|
|
def ready(self):
|
|
return True
|
|
|
|
def schedule_nested_actor(self):
|
|
# Make sure we can capture the current placement group.
|
|
assert get_current_placement_group() is not None
|
|
# Actors should be implicitly captured.
|
|
actor = NestedActor.remote()
|
|
ray.get(actor.ready.remote())
|
|
self.actors.append(actor)
|
|
|
|
def schedule_nested_actor_outside_pg(self):
|
|
# Don't use placement group.
|
|
actor = NestedActor.options(placement_group=None).remote()
|
|
ray.get(actor.ready.remote())
|
|
self.actors.append(actor)
|
|
|
|
a = Actor.options(placement_group=pg).remote()
|
|
ray.get(a.ready.remote())
|
|
# 1 top level actor + 3 children.
|
|
for _ in range(total_num_actors - 1):
|
|
ray.get(a.schedule_nested_actor.remote())
|
|
# Make sure all the actors are scheduled on the same node.
|
|
# (why? The placement group has STRICT_PACK strategy).
|
|
node_id_set = set()
|
|
for actor_info in ray.actors().values():
|
|
node_id = actor_info["Address"]["NodeID"]
|
|
node_id_set.add(node_id)
|
|
|
|
# Since all node id should be identical, set should be equal to 1.
|
|
assert len(node_id_set) == 1
|
|
|
|
# Kill an actor and wait until it is killed.
|
|
ray.kill(a)
|
|
with pytest.raises(ray.exceptions.RayActorError):
|
|
ray.get(a.ready.remote())
|
|
|
|
# Now create an actor, but do not capture the current tasks
|
|
a = Actor.options(
|
|
placement_group=pg,
|
|
placement_group_capture_child_tasks=False).remote()
|
|
ray.get(a.ready.remote())
|
|
# 1 top level actor + 3 children.
|
|
for _ in range(total_num_actors - 1):
|
|
ray.get(a.schedule_nested_actor.remote())
|
|
# Make sure all the actors are not scheduled on the same node.
|
|
# It is because the child tasks are not scheduled on the same
|
|
# placement group.
|
|
node_id_set = set()
|
|
for actor_info in ray.actors().values():
|
|
node_id = actor_info["Address"]["NodeID"]
|
|
node_id_set.add(node_id)
|
|
|
|
assert len(node_id_set) == 2
|
|
|
|
# Kill an actor and wait until it is killed.
|
|
ray.kill(a)
|
|
with pytest.raises(ray.exceptions.RayActorError):
|
|
ray.get(a.ready.remote())
|
|
|
|
# Lastly, make sure when None is specified, actors are not scheduled
|
|
# on the same placement group.
|
|
a = Actor.options(placement_group=pg).remote()
|
|
ray.get(a.ready.remote())
|
|
# 1 top level actor + 3 children.
|
|
for _ in range(total_num_actors - 1):
|
|
ray.get(a.schedule_nested_actor_outside_pg.remote())
|
|
# Make sure all the actors are not scheduled on the same node.
|
|
# It is because the child tasks are not scheduled on the same
|
|
# placement group.
|
|
node_id_set = set()
|
|
for actor_info in ray.actors().values():
|
|
node_id = actor_info["Address"]["NodeID"]
|
|
node_id_set.add(node_id)
|
|
|
|
assert len(node_id_set) == 2
|
|
|
|
|
|
def test_capture_child_tasks(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
total_num_tasks = 4
|
|
for _ in range(2):
|
|
cluster.add_node(num_cpus=total_num_tasks, num_gpus=total_num_tasks)
|
|
ray.init(address=cluster.address)
|
|
|
|
pg = ray.util.placement_group(
|
|
[{
|
|
"CPU": 2,
|
|
"GPU": 2,
|
|
}, {
|
|
"CPU": 2,
|
|
"GPU": 2,
|
|
}],
|
|
strategy="STRICT_PACK")
|
|
ray.get(pg.ready())
|
|
|
|
# If get_current_placement_group is used when the current worker/driver
|
|
# doesn't belong to any of placement group, it should return None.
|
|
assert get_current_placement_group() is None
|
|
|
|
# Test if tasks capture child tasks.
|
|
@ray.remote
|
|
def task():
|
|
return get_current_placement_group()
|
|
|
|
@ray.remote
|
|
def create_nested_task(child_cpu, child_gpu):
|
|
assert get_current_placement_group() is not None
|
|
return ray.get([
|
|
task.options(num_cpus=child_cpu, num_gpus=child_gpu).remote()
|
|
for _ in range(3)
|
|
])
|
|
|
|
t = create_nested_task.options(
|
|
num_cpus=1, num_gpus=0, placement_group=pg).remote(1, 0)
|
|
pgs = ray.get(t)
|
|
# Every task should have current placement group because they
|
|
# should be implicitly captured by default.
|
|
assert None not in pgs
|
|
|
|
# Test if tasks don't capture child tasks when the option is off.
|
|
t2 = create_nested_task.options(
|
|
num_cpus=0,
|
|
num_gpus=1,
|
|
placement_group=pg,
|
|
placement_group_capture_child_tasks=False).remote(0, 1)
|
|
pgs = ray.get(t2)
|
|
# All placement group should be None because we don't capture child tasks.
|
|
assert not all(pgs)
|
|
|
|
|
|
def test_ready_warning_suppressed(ray_start_regular, error_pubsub):
|
|
p = error_pubsub
|
|
# Create an infeasible pg.
|
|
pg = ray.util.placement_group([{"CPU": 2}] * 2, strategy="STRICT_PACK")
|
|
with pytest.raises(ray.exceptions.GetTimeoutError):
|
|
ray.get(pg.ready(), timeout=0.5)
|
|
|
|
errors = get_error_message(
|
|
p, 1, ray.ray_constants.INFEASIBLE_TASK_ERROR, timeout=0.1)
|
|
assert len(errors) == 0
|
|
|
|
|
|
def test_automatic_cleanup_job(ray_start_cluster):
|
|
# Make sure the placement groups created by a
|
|
# job, actor, and task are cleaned when the job is done.
|
|
cluster = ray_start_cluster
|
|
num_nodes = 3
|
|
num_cpu_per_node = 4
|
|
# Create 3 nodes cluster.
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=num_cpu_per_node)
|
|
|
|
info = ray.init(address=cluster.address)
|
|
available_cpus = ray.available_resources()["CPU"]
|
|
assert available_cpus == num_nodes * num_cpu_per_node
|
|
|
|
driver_code = f"""
|
|
import ray
|
|
|
|
ray.init(address="{info["redis_address"]}")
|
|
|
|
def create_pg():
|
|
pg = ray.util.placement_group(
|
|
[{{"CPU": 1}} for _ in range(3)],
|
|
strategy="STRICT_SPREAD")
|
|
ray.get(pg.ready())
|
|
return pg
|
|
|
|
@ray.remote(num_cpus=0)
|
|
def f():
|
|
create_pg()
|
|
|
|
@ray.remote(num_cpus=0)
|
|
class A:
|
|
def create_pg(self):
|
|
create_pg()
|
|
|
|
ray.get(f.remote())
|
|
a = A.remote()
|
|
ray.get(a.create_pg.remote())
|
|
# Create 2 pgs to make sure multiple placement groups that belong
|
|
# to a single job will be properly cleaned.
|
|
create_pg()
|
|
create_pg()
|
|
|
|
ray.shutdown()
|
|
"""
|
|
|
|
run_string_as_driver(driver_code)
|
|
|
|
# Wait until the driver is reported as dead by GCS.
|
|
def is_job_done():
|
|
jobs = ray.jobs()
|
|
for job in jobs:
|
|
if "StopTime" in job:
|
|
return True
|
|
return False
|
|
|
|
def assert_num_cpus(expected_num_cpus):
|
|
if expected_num_cpus == 0:
|
|
return "CPU" not in ray.available_resources()
|
|
return ray.available_resources()["CPU"] == expected_num_cpus
|
|
|
|
wait_for_condition(is_job_done)
|
|
available_cpus = ray.available_resources()["CPU"]
|
|
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
|
|
|
|
|
def test_automatic_cleanup_detached_actors(ray_start_cluster):
|
|
# Make sure the placement groups created by a
|
|
# detached actors are cleaned properly.
|
|
cluster = ray_start_cluster
|
|
num_nodes = 3
|
|
num_cpu_per_node = 2
|
|
# Create 3 nodes cluster.
|
|
for _ in range(num_nodes):
|
|
cluster.add_node(num_cpus=num_cpu_per_node)
|
|
|
|
info = ray.init(address=cluster.address)
|
|
available_cpus = ray.available_resources()["CPU"]
|
|
assert available_cpus == num_nodes * num_cpu_per_node
|
|
|
|
driver_code = f"""
|
|
import ray
|
|
|
|
ray.init(address="{info["redis_address"]}")
|
|
|
|
def create_pg():
|
|
pg = ray.util.placement_group(
|
|
[{{"CPU": 1}} for _ in range(3)],
|
|
strategy="STRICT_SPREAD")
|
|
ray.get(pg.ready())
|
|
return pg
|
|
|
|
# TODO(sang): Placement groups created by tasks launched by detached actor
|
|
# is not cleaned with the current protocol.
|
|
# @ray.remote(num_cpus=0)
|
|
# def f():
|
|
# create_pg()
|
|
|
|
@ray.remote(num_cpus=0, max_restarts=1)
|
|
class A:
|
|
def create_pg(self):
|
|
create_pg()
|
|
def create_child_pg(self):
|
|
self.a = A.options(name="B").remote()
|
|
ray.get(self.a.create_pg.remote())
|
|
def kill_child_actor(self):
|
|
ray.kill(self.a)
|
|
try:
|
|
ray.get(self.a.create_pg.remote())
|
|
except Exception:
|
|
pass
|
|
|
|
a = A.options(lifetime="detached", name="A").remote()
|
|
ray.get(a.create_pg.remote())
|
|
# TODO(sang): Currently, child tasks are cleaned when a detached actor
|
|
# is dead. We cannot test this scenario until it is fixed.
|
|
# ray.get(a.create_child_pg.remote())
|
|
|
|
ray.shutdown()
|
|
"""
|
|
|
|
run_string_as_driver(driver_code)
|
|
|
|
# Wait until the driver is reported as dead by GCS.
|
|
def is_job_done():
|
|
jobs = ray.jobs()
|
|
for job in jobs:
|
|
if "StopTime" in job:
|
|
return True
|
|
return False
|
|
|
|
def assert_num_cpus(expected_num_cpus):
|
|
if expected_num_cpus == 0:
|
|
return "CPU" not in ray.available_resources()
|
|
return ray.available_resources()["CPU"] == expected_num_cpus
|
|
|
|
wait_for_condition(is_job_done)
|
|
assert assert_num_cpus(num_nodes)
|
|
# Make sure when a child actor spawned by a detached actor
|
|
# is killed, the placement group is removed.
|
|
a = ray.get_actor("A")
|
|
# TODO(sang): child of detached actors
|
|
# seem to be killed when jobs are done. We should fix this before
|
|
# testing this scenario.
|
|
# ray.get(a.kill_child_actor.remote())
|
|
# assert assert_num_cpus(num_nodes)
|
|
|
|
# Make sure placement groups are cleaned when detached actors are killed.
|
|
ray.kill(a, no_restart=False)
|
|
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
|
# The detached actor a should've been restarted.
|
|
# Recreate a placement group.
|
|
ray.get(a.create_pg.remote())
|
|
wait_for_condition(lambda: assert_num_cpus(num_nodes))
|
|
# Kill it again and make sure the placement group
|
|
# that is created is deleted again.
|
|
ray.kill(a, no_restart=False)
|
|
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"ray_start_cluster_head", [
|
|
generate_system_config_map(
|
|
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
|
],
|
|
indirect=True)
|
|
def test_create_placement_group_after_gcs_server_restart(
|
|
ray_start_cluster_head):
|
|
cluster = ray_start_cluster_head
|
|
cluster.add_node(num_cpus=2)
|
|
cluster.add_node(num_cpus=2)
|
|
cluster.wait_for_nodes()
|
|
|
|
# Create placement group 1 successfully.
|
|
placement_group1 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
ray.get(placement_group1.ready(), timeout=10)
|
|
table = ray.util.placement_group_table(placement_group1)
|
|
assert table["state"] == "CREATED"
|
|
|
|
# Restart gcs server.
|
|
cluster.head_node.kill_gcs_server()
|
|
cluster.head_node.start_gcs_server()
|
|
|
|
# Create placement group 2 successfully.
|
|
placement_group2 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
ray.get(placement_group2.ready(), timeout=10)
|
|
table = ray.util.placement_group_table(placement_group2)
|
|
assert table["state"] == "CREATED"
|
|
|
|
# Create placement group 3.
|
|
# Status is `PENDING` because the cluster resource is insufficient.
|
|
placement_group3 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
with pytest.raises(ray.exceptions.GetTimeoutError):
|
|
ray.get(placement_group3.ready(), timeout=2)
|
|
table = ray.util.placement_group_table(placement_group3)
|
|
assert table["state"] == "PENDING"
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"ray_start_cluster_head", [
|
|
generate_system_config_map(
|
|
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
|
],
|
|
indirect=True)
|
|
def test_create_actor_with_placement_group_after_gcs_server_restart(
|
|
ray_start_cluster_head):
|
|
cluster = ray_start_cluster_head
|
|
cluster.add_node(num_cpus=2)
|
|
cluster.wait_for_nodes()
|
|
|
|
# Create a placement group.
|
|
placement_group = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
|
|
# Create an actor that occupies resources after gcs server restart.
|
|
cluster.head_node.kill_gcs_server()
|
|
cluster.head_node.start_gcs_server()
|
|
actor_2 = Increase.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=1).remote()
|
|
assert ray.get(actor_2.method.remote(1)) == 3
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"ray_start_cluster_head", [
|
|
generate_system_config_map(
|
|
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
|
],
|
|
indirect=True)
|
|
def test_create_placement_group_during_gcs_server_restart(
|
|
ray_start_cluster_head):
|
|
cluster = ray_start_cluster_head
|
|
cluster.add_node(num_cpus=200)
|
|
cluster.wait_for_nodes()
|
|
|
|
# Create placement groups during gcs server restart.
|
|
placement_groups = []
|
|
for i in range(0, 100):
|
|
placement_group = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
placement_groups.append(placement_group)
|
|
|
|
cluster.head_node.kill_gcs_server()
|
|
cluster.head_node.start_gcs_server()
|
|
|
|
for i in range(0, 100):
|
|
ray.get(placement_groups[i].ready())
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"ray_start_cluster_head", [
|
|
generate_system_config_map(
|
|
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
|
],
|
|
indirect=True)
|
|
def test_placement_group_wait_api(ray_start_cluster_head):
|
|
cluster = ray_start_cluster_head
|
|
cluster.add_node(num_cpus=2)
|
|
cluster.add_node(num_cpus=2)
|
|
cluster.wait_for_nodes()
|
|
|
|
# Create placement group 1 successfully.
|
|
placement_group1 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
assert placement_group1.wait(10)
|
|
|
|
# Restart gcs server.
|
|
cluster.head_node.kill_gcs_server()
|
|
cluster.head_node.start_gcs_server()
|
|
|
|
# Create placement group 2 successfully.
|
|
placement_group2 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
|
assert placement_group2.wait(10)
|
|
|
|
# Remove placement group 1.
|
|
ray.util.remove_placement_group(placement_group1)
|
|
|
|
# Wait for placement group 1 after it is removed.
|
|
with pytest.raises(Exception):
|
|
placement_group1.wait(10)
|
|
|
|
|
|
def test_schedule_placement_groups_at_the_same_time():
|
|
ray.init(num_cpus=4)
|
|
|
|
pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]
|
|
|
|
wait_pgs = {pg.ready(): pg for pg in pgs}
|
|
|
|
def is_all_placement_group_removed():
|
|
ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
|
|
if ready:
|
|
ready_pg = wait_pgs[ready[0]]
|
|
remove_placement_group(ready_pg)
|
|
del wait_pgs[ready[0]]
|
|
|
|
if len(wait_pgs) == 0:
|
|
return True
|
|
return False
|
|
|
|
wait_for_condition(is_all_placement_group_removed)
|
|
|
|
ray.shutdown()
|
|
|
|
|
|
def test_detached_placement_group(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
for _ in range(2):
|
|
cluster.add_node(num_cpus=3)
|
|
cluster.wait_for_nodes()
|
|
info = ray.init(address=cluster.address)
|
|
|
|
# Make sure detached placement group will alive when job dead.
|
|
driver_code = f"""
|
|
import ray
|
|
|
|
ray.init(address="{info["redis_address"]}")
|
|
|
|
pg = ray.util.placement_group(
|
|
[{{"CPU": 1}} for _ in range(2)],
|
|
strategy="STRICT_SPREAD", lifetime="detached")
|
|
ray.get(pg.ready())
|
|
|
|
@ray.remote(num_cpus=1)
|
|
class Actor:
|
|
def ready(self):
|
|
return True
|
|
|
|
for bundle_index in range(2):
|
|
actor = Actor.options(lifetime="detached", placement_group=pg,
|
|
placement_group_bundle_index=bundle_index).remote()
|
|
ray.get(actor.ready.remote())
|
|
|
|
ray.shutdown()
|
|
"""
|
|
|
|
run_string_as_driver(driver_code)
|
|
|
|
# Wait until the driver is reported as dead by GCS.
|
|
def is_job_done():
|
|
jobs = ray.jobs()
|
|
for job in jobs:
|
|
if "StopTime" in job:
|
|
return True
|
|
return False
|
|
|
|
def assert_alive_num_pg(expected_num_pg):
|
|
alive_num_pg = 0
|
|
for _, placement_group_info in ray.util.placement_group_table().items(
|
|
):
|
|
if placement_group_info["state"] == "CREATED":
|
|
alive_num_pg += 1
|
|
return alive_num_pg == expected_num_pg
|
|
|
|
def assert_alive_num_actor(expected_num_actor):
|
|
alive_num_actor = 0
|
|
for actor_info in ray.actors().values():
|
|
if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE:
|
|
alive_num_actor += 1
|
|
return alive_num_actor == expected_num_actor
|
|
|
|
wait_for_condition(is_job_done)
|
|
|
|
assert assert_alive_num_pg(1)
|
|
assert assert_alive_num_actor(2)
|
|
|
|
# Make sure detached placement group will alive when its creator which
|
|
# is detached actor dead.
|
|
# Test actors first.
|
|
@ray.remote(num_cpus=1)
|
|
class NestedActor:
|
|
def ready(self):
|
|
return True
|
|
|
|
@ray.remote(num_cpus=1)
|
|
class Actor:
|
|
def __init__(self):
|
|
self.actors = []
|
|
|
|
def ready(self):
|
|
return True
|
|
|
|
def schedule_nested_actor_with_detached_pg(self):
|
|
# Create placement group which is detached.
|
|
pg = ray.util.placement_group(
|
|
[{
|
|
"CPU": 1
|
|
} for _ in range(2)],
|
|
strategy="STRICT_SPREAD",
|
|
lifetime="detached",
|
|
name="detached_pg")
|
|
ray.get(pg.ready())
|
|
# Schedule nested actor with the placement group.
|
|
for bundle_index in range(2):
|
|
actor = NestedActor.options(
|
|
placement_group=pg,
|
|
placement_group_bundle_index=bundle_index,
|
|
lifetime="detached").remote()
|
|
ray.get(actor.ready.remote())
|
|
self.actors.append(actor)
|
|
|
|
a = Actor.options(lifetime="detached").remote()
|
|
ray.get(a.ready.remote())
|
|
# 1 parent actor and 2 children actor.
|
|
ray.get(a.schedule_nested_actor_with_detached_pg.remote())
|
|
|
|
# Kill an actor and wait until it is killed.
|
|
ray.kill(a)
|
|
with pytest.raises(ray.exceptions.RayActorError):
|
|
ray.get(a.ready.remote())
|
|
|
|
# We should have 2 alive pgs and 4 alive actors.
|
|
assert assert_alive_num_pg(2)
|
|
assert assert_alive_num_actor(4)
|
|
|
|
|
|
def test_named_placement_group(ray_start_cluster):
|
|
cluster = ray_start_cluster
|
|
for _ in range(2):
|
|
cluster.add_node(num_cpus=3)
|
|
cluster.wait_for_nodes()
|
|
info = ray.init(address=cluster.address)
|
|
global_placement_group_name = "named_placement_group"
|
|
|
|
# Create a detached placement group with name.
|
|
driver_code = f"""
|
|
import ray
|
|
|
|
ray.init(address="{info["redis_address"]}")
|
|
|
|
pg = ray.util.placement_group(
|
|
[{{"CPU": 1}} for _ in range(2)],
|
|
strategy="STRICT_SPREAD",
|
|
name="{global_placement_group_name}",
|
|
lifetime="detached")
|
|
ray.get(pg.ready())
|
|
|
|
ray.shutdown()
|
|
"""
|
|
|
|
run_string_as_driver(driver_code)
|
|
|
|
# Wait until the driver is reported as dead by GCS.
|
|
def is_job_done():
|
|
jobs = ray.jobs()
|
|
for job in jobs:
|
|
if "StopTime" in job:
|
|
return True
|
|
return False
|
|
|
|
wait_for_condition(is_job_done)
|
|
|
|
@ray.remote(num_cpus=1)
|
|
class Actor:
|
|
def ping(self):
|
|
return "pong"
|
|
|
|
# Get the named placement group and schedule a actor.
|
|
placement_group = ray.util.get_placement_group(global_placement_group_name)
|
|
assert placement_group is not None
|
|
assert placement_group.wait(5)
|
|
actor = Actor.options(
|
|
placement_group=placement_group,
|
|
placement_group_bundle_index=0).remote()
|
|
|
|
ray.get(actor.ping.remote())
|
|
|
|
# Create another placement group and make sure its creation will failed.
|
|
same_name_pg = ray.util.placement_group(
|
|
[{
|
|
"CPU": 1
|
|
} for _ in range(2)],
|
|
strategy="STRICT_SPREAD",
|
|
name=global_placement_group_name)
|
|
assert not same_name_pg.wait(10)
|
|
|
|
# Remove a named placement group and make sure the second creation
|
|
# will successful.
|
|
ray.util.remove_placement_group(placement_group)
|
|
same_name_pg = ray.util.placement_group(
|
|
[{
|
|
"CPU": 1
|
|
} for _ in range(2)],
|
|
strategy="STRICT_SPREAD",
|
|
name=global_placement_group_name)
|
|
assert same_name_pg.wait(10)
|
|
|
|
# Get a named placement group with a name that doesn't exist
|
|
# and make sure it will raise ValueError correctly.
|
|
error_count = 0
|
|
try:
|
|
ray.util.get_placement_group("inexistent_pg")
|
|
except ValueError:
|
|
error_count = error_count + 1
|
|
assert error_count == 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(pytest.main(["-v", __file__]))
|