mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 20:56:34 +08:00
[Dashboard] Collecting worker stats in node manager and implement webui display in the backend (#6574)
This commit is contained in:
committed by
Philipp Moritz
parent
c5f141013b
commit
bac6f3b61e
@@ -121,6 +121,7 @@ from ray.worker import (
|
||||
register_custom_serializer,
|
||||
remote,
|
||||
shutdown,
|
||||
show_in_webui,
|
||||
wait,
|
||||
) # noqa: E402
|
||||
import ray.internal # noqa: E402
|
||||
@@ -169,6 +170,7 @@ __all__ = [
|
||||
"register_custom_serializer",
|
||||
"remote",
|
||||
"shutdown",
|
||||
"show_in_webui",
|
||||
"wait",
|
||||
]
|
||||
|
||||
|
||||
@@ -805,6 +805,9 @@ cdef class CoreWorker:
|
||||
def get_actor_id(self):
|
||||
return ActorID(self.core_worker.get().GetActorId().Binary())
|
||||
|
||||
def set_webui_display(self, message):
    """Forward ``message`` to the wrapped core worker's SetWebuiDisplay.

    Args:
        message: Value handed to the C++ ``SetWebuiDisplay`` call, which is
            declared as taking a ``const c_string &``; the Python caller
            visible in this change passes an encoded (bytes) message.
    """
    self.core_worker.get().SetWebuiDisplay(message)
|
||||
|
||||
def get_objects(self, object_ids, TaskID current_task_id,
|
||||
int64_t timeout_ms=-1):
|
||||
cdef:
|
||||
|
||||
@@ -388,7 +388,8 @@ class RayletStats(threading.Thread):
|
||||
for node in self.nodes:
|
||||
node_id = node["NodeID"]
|
||||
stub = self.stubs[node_id]
|
||||
reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
|
||||
reply = stub.GetNodeStats(
|
||||
node_manager_pb2.NodeStatsRequest(), timeout=2)
|
||||
replies[node["NodeManagerAddress"]] = reply
|
||||
with self._raylet_stats_lock:
|
||||
for address, reply in replies.items():
|
||||
|
||||
@@ -113,6 +113,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CJobID GetCurrentJobId()
|
||||
CTaskID GetCurrentTaskId()
|
||||
const CActorID &GetActorId()
|
||||
void SetWebuiDisplay(const c_string &message)
|
||||
CTaskID GetCallerId()
|
||||
const ResourceMappingType &GetResourceIDs() const
|
||||
CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
|
||||
|
||||
@@ -21,12 +21,64 @@ def test_worker_stats(ray_start_regular):
|
||||
|
||||
channel = grpc.insecure_channel(raylet_address)
|
||||
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
|
||||
reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
|
||||
|
||||
def try_get_node_stats(num_retry=5, timeout=2):
    """Fetch node stats from the raylet, retrying on gRPC errors.

    Calls ``stub.GetNodeStats`` up to ``num_retry`` times, each with the
    given per-call ``timeout``, and returns the first successful reply.

    Args:
        num_retry (int): Maximum number of attempts.
        timeout: Timeout passed to each ``GetNodeStats`` call.

    Returns:
        The reply of the first call that did not raise ``grpc.RpcError``.

    Raises:
        AssertionError: If no attempt produced a reply. The message
            includes the last ``grpc.RpcError`` seen, so that a failing
            test shows why the raylet could not be reached.
    """
    reply = None
    # Remember the most recent failure instead of discarding it, so the
    # final assertion can explain what went wrong.
    last_error = None
    for _ in range(num_retry):
        try:
            reply = stub.GetNodeStats(
                node_manager_pb2.NodeStatsRequest(), timeout=timeout)
            break
        except grpc.RpcError as e:
            last_error = e
    assert reply is not None, (
        "GetNodeStats returned no reply after {} attempt(s); "
        "last error: {!r}".format(num_retry, last_error))
    return reply
|
||||
|
||||
reply = try_get_node_stats()
|
||||
# Check that there is one connected driver.
|
||||
drivers = [worker for worker in reply.workers_stats if worker.is_driver]
|
||||
assert len(drivers) == 1
|
||||
assert os.getpid() == drivers[0].pid
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.show_in_webui("test")
|
||||
return os.getpid()
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def f(self):
|
||||
ray.show_in_webui("test")
|
||||
return os.getpid()
|
||||
|
||||
# Test show_in_webui for remote functions.
|
||||
worker_pid = ray.get(f.remote())
|
||||
reply = try_get_node_stats()
|
||||
target_worker_present = False
|
||||
for worker in reply.workers_stats:
|
||||
if worker.webui_display == "test":
|
||||
target_worker_present = True
|
||||
assert worker.pid == worker_pid
|
||||
else:
|
||||
assert worker.webui_display == ""
|
||||
assert target_worker_present
|
||||
|
||||
# Test show_in_webui for remote actors.
|
||||
a = Actor.remote()
|
||||
worker_pid = ray.get(a.f.remote())
|
||||
reply = try_get_node_stats()
|
||||
target_worker_present = False
|
||||
for worker in reply.workers_stats:
|
||||
if worker.webui_display == "test":
|
||||
target_worker_present = True
|
||||
assert worker.pid == worker_pid
|
||||
else:
|
||||
assert worker.webui_display == ""
|
||||
assert target_worker_present
|
||||
|
||||
timeout_seconds = 20
|
||||
start_time = time.time()
|
||||
while True:
|
||||
@@ -37,7 +89,7 @@ def test_worker_stats(ray_start_regular):
|
||||
# Wait for the workers to start.
|
||||
if len(reply.workers_stats) < num_cpus + 1:
|
||||
time.sleep(1)
|
||||
reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
|
||||
reply = try_get_node_stats()
|
||||
continue
|
||||
|
||||
# Check that the rest of the processes are workers, 1 for each CPU.
|
||||
|
||||
@@ -1412,6 +1412,21 @@ def register_custom_serializer(cls,
|
||||
class_id=class_id)
|
||||
|
||||
|
||||
def show_in_webui(message):
    """Show a message in the dashboard for the current task or actor.

    A typical use is reporting the progress of a long-running computation.

    The global worker must be connected: ``check_connected()`` is invoked
    on it before the message is encoded and passed to the core worker.

    Args:
        message (str): Message to be displayed.
    """
    global_worker.check_connected()
    push_to_dashboard = global_worker.core_worker.set_webui_display
    push_to_dashboard(message.encode())
|
||||
|
||||
|
||||
def get(object_ids, timeout=None):
|
||||
"""Get a remote object or a list of remote objects from the object store.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user