mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 04:55:04 +08:00
339 lines
11 KiB
Python
339 lines
11 KiB
Python
import pytest
|
|
try:
|
|
import pytest_timeout
|
|
except ImportError:
|
|
pytest_timeout = None
|
|
import time
|
|
|
|
import ray
|
|
import ray.ray_constants
|
|
import ray.test_utils
|
|
|
|
from ray._raylet import GlobalStateAccessor
|
|
|
|
|
|
# TODO(rliaw): The proper way to do this is to have the pytest config setup.
|
|
@pytest.mark.skipif(
    pytest_timeout is None,
    reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(10)
def test_replenish_resources(ray_start_regular):
    """Available resources must equal the cluster total again after a task.

    The polling loop below only exits on success, so the test depends on
    the ``pytest.mark.timeout`` marker (hence the skip when the plugin is
    missing) to fail instead of hanging.
    """
    total = ray.cluster_resources()
    # Before any task is submitted both views are expected to agree.
    assert total == ray.available_resources()

    @ray.remote
    def cpu_task():
        pass

    # Run one task to completion.
    ray.get(cpu_task.remote())

    # Poll until the available resources match the cluster total again.
    replenished = False
    while not replenished:
        replenished = (ray.available_resources() == total)
    assert replenished
|
|
|
|
|
|
@pytest.mark.skipif(
    pytest_timeout is None,
    reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(10)
def test_uses_resources(ray_start_regular):
    """A submitted task must show up as one CPU fewer being available.

    The polling loop only exits once that state is observed; the
    ``pytest.mark.timeout`` marker is what turns a miss into a failure.
    """
    total = ray.cluster_resources()

    @ray.remote
    def cpu_task():
        time.sleep(1)

    # Fire and forget: the task only needs to be holding its CPU while
    # the loop below is polling.
    cpu_task.remote()

    # The cluster total was read once above, so the target is a constant.
    expected_free_cpus = total.get("CPU", 0) - 1

    cpu_in_use = False
    while not cpu_in_use:
        free_cpus = ray.available_resources().get("CPU", 0)
        cpu_in_use = (free_cpus == expected_free_cpus)

    assert cpu_in_use
|
|
|
|
|
|
@pytest.mark.skipif(
    pytest_timeout is None,
    reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(120)
def test_add_remove_cluster_resources(ray_start_cluster_head):
    """Tests that Global State API is consistent with actual cluster."""
    cluster = ray_start_cluster_head

    def reported_cpus():
        # CPU count as currently reported by the global state API.
        return ray.cluster_resources()["CPU"]

    # Only the head node is up at this point.
    assert reported_cpus() == 1

    # Adding one single-CPU node must raise the total to 2 ...
    nodes = [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert reported_cpus() == 2

    # ... and removing it again must bring the total back to 1.
    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert reported_cpus() == 1

    # Five further single-CPU nodes on top of the head node give 6 CPUs.
    nodes.extend(cluster.add_node(num_cpus=1) for _ in range(5))
    cluster.wait_for_nodes()
    assert reported_cpus() == 6
|
|
|
|
|
|
def test_global_state_actor_table(ray_start_regular):
    """The actor table gains one entry per actor and later marks it DEAD."""

    @ray.remote
    class Actor:
        def ready(self):
            pass

    # actor table should be empty at first
    assert len(ray.actors()) == 0

    # actor table should contain only one entry
    handle = Actor.remote()
    ray.get(handle.ready.remote())
    assert len(ray.actors()) == 1

    # actor table should contain only this entry
    # even when the actor goes out of scope
    del handle

    def current_state():
        # State of the first (and, per the assertion above, only) entry.
        return list(ray.actors().values())[0]["State"]

    dead_state = ray.gcs_utils.ActorTableData.DEAD

    # Up to 10 checks, sleeping 0.5s after each one that is not yet DEAD.
    attempts_left = 10
    while attempts_left and current_state() != dead_state:
        time.sleep(0.5)
        attempts_left -= 1

    assert current_state() == dead_state
|
|
|
|
|
|
def test_global_state_worker_table(ray_start_regular):
    """The worker table must hold exactly one entry after start-up."""
    # Get worker table from gcs.
    worker_table = ray.state.workers()

    assert len(worker_table) == 1
|
|
|
|
|
|
def test_global_state_actor_entry(ray_start_regular):
    """Looking up a single actor by id returns its own ALIVE entry."""

    @ray.remote
    class Actor:
        def ready(self):
            pass

    # actor table should be empty at first
    assert len(ray.actors()) == 0

    a = Actor.remote()
    b = Actor.remote()
    # Block until both actors have answered a method call.
    ray.get(a.ready.remote())
    ray.get(b.ready.remote())
    assert len(ray.actors()) == 2

    alive_state = ray.gcs_utils.ActorTableData.ALIVE
    # The same two checks apply to each actor: the entry fetched by id
    # carries that id, and its state is ALIVE.
    for handle in (a, b):
        actor_id = handle._actor_id.hex()
        assert ray.actors(actor_id=actor_id)["ActorID"] == actor_id
        assert ray.actors(actor_id=actor_id)["State"] == alive_state
|
|
|
|
|
|
@pytest.mark.parametrize("max_shapes", [0, 2, -1])
def test_load_report(shutdown_only, max_shapes):
    """Check the per-shape resource demand report read via the accessor.

    The cluster is started with ``max_resource_shapes_per_load_report`` set
    to ``max_shapes``. The test then verifies that the number of reported
    shapes respects that limit (``-1`` is treated here as "no limit"), that
    the ``{"CPU": 1}`` shape is present whenever the limit is positive, and
    that demands are split into ready vs. infeasible.

    The accessor is disconnected in a ``finally`` block so that a failing
    assertion does not leave the connection open.
    """
    resource1 = "A"
    # "B" is never added to the cluster, so tasks asking for it cannot run.
    resource2 = "B"
    cluster = ray.init(
        num_cpus=1,
        resources={resource1: 1},
        _system_config={
            "max_resource_shapes_per_load_report": max_shapes,
        })
    global_state_accessor = GlobalStateAccessor(
        cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    @ray.remote
    def sleep():
        time.sleep(1000)

    class Checker:
        """Polls the resource usage and remembers the latest demand list."""

        def __init__(self):
            self.report = None

        def check_load_report(self):
            """Return True once enough shapes for this parametrization
            have been reported; usable with ``wait_for_condition``."""
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return False

            resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
                message)
            self.report = \
                resource_usage.resource_load_by_shape.resource_demands
            if max_shapes == 0:
                return True
            elif max_shapes == 2:
                return len(self.report) >= 2
            else:
                return len(self.report) >= 3

    try:
        # Submit long-sleeping tasks for three distinct resource shapes.
        sleep.remote()
        for _ in range(3):
            sleep.remote()
            sleep.options(resources={resource1: 1}).remote()
            sleep.options(resources={resource2: 1}).remote()

        # Wait for load information to arrive.
        checker = Checker()
        ray.test_utils.wait_for_condition(checker.check_load_report)

        # Check that we respect the max shapes limit.
        if max_shapes != -1:
            assert len(checker.report) <= max_shapes

        print(checker.report)

        if max_shapes > 0:
            # Check that we always include the 1-CPU resource shape.
            one_cpu_shape = {"CPU": 1}
            assert any(
                demand.shape == one_cpu_shape for demand in checker.report)

        # Check that we differentiate between infeasible and ready tasks.
        for demand in checker.report:
            if resource2 in demand.shape:
                assert demand.num_infeasible_requests_queued > 0
                assert demand.num_ready_requests_queued == 0
            else:
                assert demand.num_ready_requests_queued > 0
                assert demand.num_infeasible_requests_queued == 0
    finally:
        # Always release the accessor, including when an assertion fails.
        global_state_accessor.disconnect()
|
|
|
|
|
|
def test_placement_group_load_report(ray_start_cluster):
    """Check that the placement group load follows placement group changes.

    The number of entries in ``placement_group_load.placement_group_data``
    is expected to be 2 while two groups are unplaced, 1 after one of them
    becomes ready, and 2 again after another unplaceable group is created.

    The accessor is disconnected in a ``finally`` block so that a failing
    assertion or wait does not leave the connection open.
    """
    cluster = ray_start_cluster
    # Add a head node without any of the custom resources requested below.
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    global_state_accessor = GlobalStateAccessor(
        cluster.address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    class PgLoadChecker:
        """Predicates over the reported placement group load, for use with
        ``wait_for_condition``."""

        def nothing_is_ready(self):
            return self._reported_pg_count_is(2)

        def only_first_one_ready(self):
            return self._reported_pg_count_is(1)

        def two_infeasible_pg(self):
            return self._reported_pg_count_is(2)

        def _reported_pg_count_is(self, expected):
            """True iff a placement group load is present in the report and
            lists exactly ``expected`` entries."""
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return False
            if not resource_usage.HasField("placement_group_load"):
                return False
            pg_load = resource_usage.placement_group_load
            return len(pg_load.placement_group_data) == expected

        def _read_resource_usage(self):
            """Return the parsed resource usage, or None if none is
            available yet."""
            message = global_state_accessor.get_all_resource_usage()
            if message is None:
                return None
            return ray.gcs_utils.ResourceUsageBatchData.FromString(message)

    try:
        checker = PgLoadChecker()

        # Create 2 placement groups. Neither is ready yet: the only node so
        # far was added with num_cpus=4 and no "A" or "B" resource.
        pg_feasible = ray.util.placement_group([{"A": 1}])
        pg_infeasible = ray.util.placement_group([{"B": 1}])
        _, unready = ray.wait(
            [pg_feasible.ready(), pg_infeasible.ready()], timeout=0)
        assert len(unready) == 2
        ray.test_utils.wait_for_condition(checker.nothing_is_ready)

        # Add a node that makes pg feasible. Make sure load include this
        # change.
        cluster.add_node(resources={"A": 1})
        ray.get(pg_feasible.ready())
        ray.test_utils.wait_for_condition(checker.only_first_one_ready)

        # Create one more infeasible pg and make sure load is properly
        # updated.
        pg_infeasible_second = ray.util.placement_group([{"C": 1}])
        _, unready = ray.wait([pg_infeasible_second.ready()], timeout=0)
        assert len(unready) == 1
        ray.test_utils.wait_for_condition(checker.two_infeasible_pg)
    finally:
        # Always release the accessor, including when a check above fails.
        global_state_accessor.disconnect()
|
|
|
|
|
|
def test_backlog_report(shutdown_only):
    """Check that a non-zero worker backlog size reaches the load report.

    Ten 1-CPU tasks are submitted to a 1-CPU cluster started with
    ``report_worker_backlog`` enabled. Once the first task has finished,
    the test waits for a report with exactly one resource shape whose
    ``backlog_size`` is positive.

    The accessor is disconnected in a ``finally`` block so that a timeout
    in ``wait_for_condition`` does not leave the connection open.
    """
    cluster = ray.init(
        num_cpus=1, _system_config={
            "report_worker_backlog": True,
        })
    global_state_accessor = GlobalStateAccessor(
        cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    @ray.remote(num_cpus=1)
    def foo(x):
        print(".")
        time.sleep(x)

    def backlog_size_set():
        """Return True once the single reported shape has a backlog > 0."""
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
            message)
        aggregate_resource_load = \
            resource_usage.resource_load_by_shape.resource_demands
        if len(aggregate_resource_load) == 1:
            backlog_size = aggregate_resource_load[0].backlog_size
            print(backlog_size)
            # Ideally we'd want to assert backlog_size == 8, but
            # guaranteeing the order that submissions will occur is too
            # hard/flaky.
            return backlog_size > 0
        return False

    try:
        # We want this first task to finish
        refs = [foo.remote(0.5)]
        # These tasks should all start _before_ the first one finishes.
        refs.extend([foo.remote(1000) for _ in range(9)])
        # Now there's 1 request running, 1 queued in the raylet, and 8
        # queued in the worker backlog.

        ray.get(refs[0])
        # First request finishes, second request is now running, third lease
        # request is sent to the raylet with backlog=7

        ray.test_utils.wait_for_condition(backlog_size_set, timeout=2)
    finally:
        # Always release the accessor, including when the wait times out.
        global_state_accessor.disconnect()
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this file directly: hand it to pytest in verbose mode
    # and use pytest's return value as the process exit status.
    import pytest
    import sys

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|