[Dashboard] Collecting worker stats in node manager and implement webui display in the backend (#6574)

This commit is contained in:
Yunzhi Zhang
2019-12-22 17:50:23 -08:00
committed by Philipp Moritz
parent c5f141013b
commit bac6f3b61e
13 changed files with 163 additions and 10 deletions
+2
View File
@@ -121,6 +121,7 @@ from ray.worker import (
register_custom_serializer,
remote,
shutdown,
show_in_webui,
wait,
) # noqa: E402
import ray.internal # noqa: E402
@@ -169,6 +170,7 @@ __all__ = [
"register_custom_serializer",
"remote",
"shutdown",
"show_in_webui",
"wait",
]
+3
View File
@@ -805,6 +805,9 @@ cdef class CoreWorker:
def get_actor_id(self):
    """Return the core worker's actor id wrapped in an ``ActorID``.

    The id is read from the underlying core worker and converted through
    its binary representation.
    """
    return ActorID(self.core_worker.get().GetActorId().Binary())
def set_webui_display(self, message):
    """Forward ``message`` to the underlying core worker's SetWebuiDisplay.

    Args:
        message: Text to display; handed to the C++ core worker unchanged.
            NOTE(review): presumably expected to be ``bytes`` so it can be
            coerced to a C++ string -- confirm against the callers.
    """
    self.core_worker.get().SetWebuiDisplay(message)
def get_objects(self, object_ids, TaskID current_task_id,
int64_t timeout_ms=-1):
cdef:
+2 -1
View File
@@ -388,7 +388,8 @@ class RayletStats(threading.Thread):
for node in self.nodes:
node_id = node["NodeID"]
stub = self.stubs[node_id]
reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
reply = stub.GetNodeStats(
node_manager_pb2.NodeStatsRequest(), timeout=2)
replies[node["NodeManagerAddress"]] = reply
with self._raylet_stats_lock:
for address, reply in replies.items():
+1
View File
@@ -113,6 +113,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CJobID GetCurrentJobId()
CTaskID GetCurrentTaskId()
const CActorID &GetActorId()
void SetWebuiDisplay(const c_string &message)
CTaskID GetCallerId()
const ResourceMappingType &GetResourceIDs() const
CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
+54 -2
View File
@@ -21,12 +21,64 @@ def test_worker_stats(ray_start_regular):
channel = grpc.insecure_channel(raylet_address)
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
def try_get_node_stats(num_retry=5, timeout=2, retry_delay=0.2):
    """Fetch node stats from the raylet, retrying on gRPC failures.

    Args:
        num_retry (int): Maximum number of GetNodeStats attempts.
        timeout (float): Per-attempt RPC deadline, in seconds.
        retry_delay (float): Seconds to wait after a failed attempt before
            trying again.

    Returns:
        The reply of the first GetNodeStats call that succeeds.

    Raises:
        AssertionError: If no attempt produced a reply. The message carries
            the last gRPC error to make the failure diagnosable.
    """
    reply = None
    last_error = None
    for attempt in range(num_retry):
        try:
            reply = stub.GetNodeStats(
                node_manager_pb2.NodeStatsRequest(), timeout=timeout)
            break
        except grpc.RpcError as error:
            # Keep the error around: it is the only hint about why the
            # raylet could not be reached.
            last_error = error
            # Pause before the next attempt so that fast failures do not
            # burn through every retry at once; no point in sleeping after
            # the final attempt.
            if attempt < num_retry - 1:
                time.sleep(retry_delay)
    assert reply is not None, (
        "GetNodeStats failed after {} attempts; last error: {!r}".format(
            num_retry, last_error))
    return reply
reply = try_get_node_stats()
# Check that there is one connected driver.
drivers = [worker for worker in reply.workers_stats if worker.is_driver]
assert len(drivers) == 1
assert os.getpid() == drivers[0].pid
@ray.remote
def f():
    """Publish "test" as the web UI message, then return this process's pid."""
    ray.show_in_webui("test")
    current_pid = os.getpid()
    return current_pid
@ray.remote
class Actor(object):
    """Stateless actor whose only method publishes a web UI message."""

    def __init__(self):
        """Nothing to initialise."""

    def f(self):
        """Publish "test" as the web UI message and return this pid."""
        ray.show_in_webui("test")
        current_pid = os.getpid()
        return current_pid
# Test show_in_webui for remote functions.
worker_pid = ray.get(f.remote())
reply = try_get_node_stats()
target_worker_present = False
for worker in reply.workers_stats:
if worker.webui_display == "test":
target_worker_present = True
assert worker.pid == worker_pid
else:
assert worker.webui_display == ""
assert target_worker_present
# Test show_in_webui for remote actors.
a = Actor.remote()
worker_pid = ray.get(a.f.remote())
reply = try_get_node_stats()
target_worker_present = False
for worker in reply.workers_stats:
if worker.webui_display == "test":
target_worker_present = True
assert worker.pid == worker_pid
else:
assert worker.webui_display == ""
assert target_worker_present
timeout_seconds = 20
start_time = time.time()
while True:
@@ -37,7 +89,7 @@ def test_worker_stats(ray_start_regular):
# Wait for the workers to start.
if len(reply.workers_stats) < num_cpus + 1:
time.sleep(1)
reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
reply = try_get_node_stats()
continue
# Check that the rest of the processes are workers, 1 for each CPU.
+15
View File
@@ -1412,6 +1412,21 @@ def register_custom_serializer(cls,
class_id=class_id)
def show_in_webui(message):
    """Show a message in the dashboard for the current task or actor.

    A typical use is reporting the progress of a long-running computation.

    Args:
        message (str): Message to be displayed.
    """
    global_worker.check_connected()
    # The core worker receives the encoded form of the message.
    set_display = global_worker.core_worker.set_webui_display
    set_display(message.encode())
def get(object_ids, timeout=None):
"""Get a remote object or a list of remote objects from the object store.