mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:00:36 +08:00
f75dfd60a3
* remove * internal config updates, remove Checkpointable * Lower object timeout default * remove json * Fix flaky test * Fix unit test
221 lines
6.5 KiB
Python
221 lines
6.5 KiB
Python
import pytest
|
|
try:
|
|
import pytest_timeout
|
|
except ImportError:
|
|
pytest_timeout = None
|
|
import time
|
|
|
|
import ray
|
|
import ray.ray_constants
|
|
import ray.test_utils
|
|
|
|
|
|
# TODO(rliaw): The proper way to do this is to have the pytest config setup.
|
|
@pytest.mark.skipif(
    pytest_timeout is None,
    reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(10)
def test_replenish_resources(ray_start_regular):
    """Check that availability returns to the cluster total after a task.

    The polling loop below has no exit condition of its own; it relies on
    the pytest timeout marker, which is why the test is skipped when the
    timeout plugin is not installed.
    """
    total = ray.cluster_resources()
    free = ray.available_resources()
    # Nothing is running yet, so both views must agree.
    assert total == free

    @ray.remote
    def cpu_task():
        pass

    # Run one task to completion.
    ray.get(cpu_task.remote())

    # Spin until the available resources match the cluster total again.
    while True:
        free = ray.available_resources()
        if total == free:
            replenished = True
            break
    assert replenished
|
|
|
|
|
|
@pytest.mark.skipif(
    pytest_timeout is None,
    reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(10)
def test_uses_resources(ray_start_regular):
    """Check that a running task shows up as one fewer available CPU.

    The polling loop is bounded only by the pytest timeout marker.
    """
    total = ray.cluster_resources()

    @ray.remote
    def cpu_task():
        time.sleep(1)

    # Fire and forget: the task only needs to be running while we poll.
    cpu_task.remote()

    # Spin until exactly one CPU is reported as in use.
    while True:
        free = ray.available_resources()
        if free.get("CPU", 0) == total.get("CPU", 0) - 1:
            cpu_claimed = True
            break

    assert cpu_claimed
|
|
|
|
|
|
@pytest.mark.skipif(
    pytest_timeout is None,
    reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(120)
def test_add_remove_cluster_resources(ray_start_cluster_head):
    """Check that the reported CPU total tracks node additions and removals."""
    cluster = ray_start_cluster_head

    def total_cpus():
        # Read the CPU count the global state API currently reports.
        return ray.cluster_resources()["CPU"]

    # Only the head node is present at the start.
    assert total_cpus() == 1

    # Add one node.
    workers = [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert total_cpus() == 2

    # Remove it again.
    cluster.remove_node(workers.pop())
    cluster.wait_for_nodes()
    assert total_cpus() == 1

    # Add five nodes, then wait once for all of them.
    for _ in range(5):
        workers.append(cluster.add_node(num_cpus=1))
    cluster.wait_for_nodes()
    assert total_cpus() == 6
|
|
|
|
|
|
def test_global_state_actor_table(ray_start_regular):
    """Check the actor table across an actor's creation and deletion.

    The table must start empty, hold one entry once the actor is ready, and
    report that entry's state as DEAD after the handle is deleted.
    """

    @ray.remote
    class Actor:
        def ready(self):
            pass

    # No actors have been created yet.
    assert len(ray.actors()) == 0

    # Creating one actor must add exactly one entry.
    handle = Actor.remote()
    ray.get(handle.ready.remote())
    assert len(ray.actors()) == 1

    # Drop the only handle; the entry should remain but eventually be DEAD.
    del handle

    def current_state():
        entries = list(ray.actors().values())
        return entries[0]["State"]

    dead_state = ray.gcs_utils.ActorTableData.DEAD
    # Poll up to 10 times, sleeping half a second between attempts.
    for _ in range(10):
        if current_state() == dead_state:
            break
        time.sleep(0.5)
    assert current_state() == dead_state
|
|
|
|
|
|
def test_global_state_actor_entry(ray_start_regular):
    """Check that per-actor lookups return the right ID and an ALIVE state."""

    @ray.remote
    class Actor:
        def ready(self):
            pass

    # No actors have been created yet.
    assert len(ray.actors()) == 0

    first = Actor.remote()
    second = Actor.remote()
    # Wait until both actors have answered a call.
    ray.get(first.ready.remote())
    ray.get(second.ready.remote())
    assert len(ray.actors()) == 2

    alive_state = ray.gcs_utils.ActorTableData.ALIVE
    hex_ids = [first._actor_id.hex(), second._actor_id.hex()]
    # Look each actor up by its hex ID and check both fields of the entry.
    for hex_id in hex_ids:
        assert ray.actors(actor_id=hex_id)["ActorID"] == hex_id
        assert ray.actors(actor_id=hex_id)["State"] == alive_state
|
|
|
|
|
|
@pytest.mark.parametrize("max_shapes", [0, 2, -1])
def test_load_report(shutdown_only, max_shapes):
    """Check the resource-demand report carried by heartbeat batch messages.

    A one-CPU cluster with custom resource "A" is started, with
    ``max_resource_shapes_per_load_report`` set to ``max_shapes``. Long
    sleeping tasks are submitted with three shapes: plain, requiring "A",
    and requiring "B". "B" is never passed to ``ray.init``, and the test
    expects demands for it to be reported as infeasible.

    Args:
        shutdown_only: pytest fixture (defined outside this file).
        max_shapes: Value passed as the load-report shape limit; -1 skips
            the upper-bound assertion below.
    """
    resource1 = "A"
    resource2 = "B"
    cluster = ray.init(
        num_cpus=1,
        resources={resource1: 1},
        _system_config={
            "max_resource_shapes_per_load_report": max_shapes,
        })
    redis = ray.services.create_redis_client(
        cluster["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    client = redis.pubsub(ignore_subscribe_messages=True)
    client.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)

    @ray.remote
    def sleep():
        time.sleep(1000)

    # Submit more work than the single CPU can run so that demand queues up.
    sleep.remote()
    for _ in range(3):
        sleep.remote()
        sleep.options(resources={resource1: 1}).remote()
    sleep.options(resources={resource2: 1}).remote()

    class Checker:
        """Polls the pubsub client and keeps the latest demand report."""

        def __init__(self):
            self.report = None

        def check_load_report(self):
            """Return True once a report with enough shapes has arrived.

            Returns False when no usable message is available yet, so the
            caller can simply poll again.
            """
            try:
                message = client.get_message()
            except redis.exceptions.ConnectionError:
                # Treat a failed read as "nothing yet". Falling through
                # would hit the check below with `message` unbound.
                # NOTE(review): `redis` here is the client object created
                # above, not the redis module, so this attribute lookup may
                # itself fail when an exception is raised -- confirm.
                return False
            if message is None:
                return False

            pattern = message["pattern"]
            data = message["data"]
            if pattern != ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN:
                return False

            pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
            heartbeat_data = pub_message.data
            heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
                heartbeat_data)
            self.report = heartbeat.resource_load_by_shape.resource_demands
            # The number of shapes to wait for depends on the limit.
            if max_shapes == 0:
                return True
            elif max_shapes == 2:
                return len(self.report) >= 2
            else:
                return len(self.report) >= 3

    # Wait for load information to arrive.
    checker = Checker()
    ray.test_utils.wait_for_condition(checker.check_load_report)

    # Check that we respect the max shapes limit.
    if max_shapes != -1:
        assert len(checker.report) <= max_shapes

    if max_shapes > 0:
        # Check that we always include the 1-CPU resource shape.
        one_cpu_shape = {"CPU": 1}
        one_cpu_found = any(
            demand.shape == one_cpu_shape for demand in checker.report)
        assert one_cpu_found

    # Check that we differentiate between infeasible and ready tasks.
    for demand in checker.report:
        if resource2 in demand.shape:
            assert demand.num_infeasible_requests_queued > 0
            assert demand.num_ready_requests_queued == 0
        else:
            assert demand.num_ready_requests_queued > 0
            assert demand.num_infeasible_requests_queued == 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Running this file directly runs its tests verbosely through pytest
    # and exits with pytest's return code.
    import sys

    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|