Add heartbeat test + Fix monitor.py (#5191)

This commit is contained in:
Richard Liaw
2019-07-16 21:59:48 -07:00
committed by GitHub
parent 4fa2a6006c
commit 3e0ad11ae0
5 changed files with 153 additions and 23 deletions
+117
View File
@@ -8,6 +8,7 @@ import time
import ray
import ray.ray_constants as ray_constants
from ray.monitor import Monitor
from ray.tests.cluster_utils import Cluster
from ray.tests.conftest import generate_internal_config_map
@@ -58,6 +59,122 @@ def test_internal_config(ray_start_cluster_head):
assert ray.cluster_resources()["CPU"] == 1
def setup_monitor(redis_address):
monitor = Monitor(redis_address, None)
monitor.subscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_CHANNEL)
monitor.subscribe(ray.gcs_utils.XRAY_JOB_CHANNEL) # TODO: Remove?
monitor.update_raylet_map(_append_port=True)
monitor._maybe_flush_gcs()
return monitor
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=10):
while True:
monitor.process_messages()
resource_usage = monitor.load_metrics.get_resource_usage()
if expected_resource_usage is None:
if all(x for x in resource_usage[1:]):
break
elif all(x == y
for x, y in zip(resource_usage, expected_resource_usage)):
break
else:
timeout -= 1
time.sleep(1)
if timeout <= 0:
raise ValueError("Timeout. {} != {}".format(
resource_usage, expected_resource_usage))
return resource_usage
@pytest.mark.parametrize(
"ray_start_cluster_head", [{
"num_cpus": 1,
}, {
"num_cpus": 2,
}],
indirect=True)
def test_heartbeats_single(ray_start_cluster_head):
"""Unit test for `Cluster.wait_for_nodes`.
Test proper metrics.
"""
cluster = ray_start_cluster_head
timeout = 5
monitor = setup_monitor(cluster.redis_address)
total_cpus = ray.state.cluster_resources()["CPU"]
verify_load_metrics(monitor, (0.0, {"CPU": 0.0}, {"CPU": total_cpus}))
@ray.remote
def work(timeout):
time.sleep(timeout)
return True
work_handle = work.remote(timeout * 2)
verify_load_metrics(monitor, (1.0 / total_cpus, {
"CPU": 1.0
}, {
"CPU": total_cpus
}))
ray.get(work_handle)
@ray.remote
class Actor(object):
def work(self, timeout):
time.sleep(timeout)
return True
test_actor = Actor.remote()
work_handle = test_actor.work.remote(timeout * 2)
verify_load_metrics(monitor, (1.0 / total_cpus, {
"CPU": 1.0
}, {
"CPU": total_cpus
}))
ray.get(work_handle)
def test_heartbeats_cluster(ray_start_cluster_head):
"""Unit test for `Cluster.wait_for_nodes`.
Test proper metrics.
"""
cluster = ray_start_cluster_head
timeout = 5
num_workers_nodes = 4
num_nodes_total = int(num_workers_nodes + 1)
[cluster.add_node() for i in range(num_workers_nodes)]
cluster.wait_for_nodes()
monitor = setup_monitor(cluster.redis_address)
verify_load_metrics(monitor, (0.0, {"CPU": 0.0}, {"CPU": num_nodes_total}))
@ray.remote
class Actor(object):
def work(self, timeout):
time.sleep(timeout)
return True
test_actors = [Actor.remote() for i in range(num_nodes_total)]
work_handles = [actor.work.remote(timeout * 2) for actor in test_actors]
verify_load_metrics(monitor, (num_nodes_total, {
"CPU": num_nodes_total
}, {
"CPU": num_nodes_total
}))
ray.get(work_handles)
verify_load_metrics(monitor, (0.0, {"CPU": 0.0}, {"CPU": num_nodes_total}))
ray.shutdown()
def test_wait_for_nodes(ray_start_cluster_head):
"""Unit test for `Cluster.wait_for_nodes`.