Add gRPC endpoint to raylet to expose metrics (#6005)

This commit is contained in:
Stephanie Wang
2019-10-26 16:37:39 -07:00
committed by Philipp Moritz
parent 010270b3dc
commit eb41c945a1
11 changed files with 174 additions and 12 deletions
+53
View File
@@ -0,0 +1,53 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import grpc
import psutil
import time
import ray
from ray.core.generated import node_manager_pb2
from ray.core.generated import node_manager_pb2_grpc
from ray.tests.utils import RayTestTimeoutException
def test_worker_stats(ray_start_regular):
    """Check the worker statistics returned by the raylet's GetNodeStats RPC.

    The test connects to the first node's node manager over gRPC and
    verifies that:

    * exactly one driver is reported and its pid is this process's pid,
    * the number of reported processes eventually equals one per CPU plus
      the driver,
    * every reported process has "python" or "ray" in its process name.

    Args:
        ray_start_regular: pytest fixture defined outside this file;
            presumably starts a local Ray instance -- confirm in conftest.

    Raises:
        RayTestTimeoutException: if the expected number of processes has
            not been reported within the timeout.
    """
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    # Reuse the node entry fetched above instead of querying ray.nodes()
    # a second time just for the port.
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    try:
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())

        # Check that there is one connected driver.
        drivers = [
            worker for worker in reply.workers_stats if worker.is_driver
        ]
        assert len(drivers) == 1
        assert os.getpid() == drivers[0].pid

        # Wait for the workers to start: one per CPU, plus the driver.
        timeout_seconds = 20
        expected_num_processes = num_cpus + 1
        deadline = time.time() + timeout_seconds
        while len(reply.workers_stats) < expected_num_processes:
            if time.time() > deadline:
                raise RayTestTimeoutException(
                    "Timed out while waiting for worker processes")
            time.sleep(1)
            reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == expected_num_processes

        # Check that all processes are Python. A set keeps the membership
        # test cheap while scanning every process on the machine.
        pids = {worker.pid for worker in reply.workers_stats}
        process_names = [
            p.info["name"]
            for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process_name in process_names:
            assert "python" in process_name or "ray" in process_name
    finally:
        # Release the gRPC channel even when an assertion or timeout fires.
        channel.close()