mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 02:46:49 +08:00
[Dashboard] Logical view backend for dashboard (#6590)
This commit is contained in:
committed by
Philipp Moritz
parent
8b16847c02
commit
65acb54553
@@ -38,7 +38,7 @@ const styles = (theme: Theme) =>
|
||||
},
|
||||
extraInfo: {
|
||||
fontFamily: "SFMono-Regular,Consolas,Liberation Mono,Menlo,monospace",
|
||||
whiteSpace: "pre-wrap"
|
||||
whiteSpace: "pre"
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@ import ray
|
||||
from ray.core.generated import node_manager_pb2
|
||||
from ray.core.generated import node_manager_pb2_grpc
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.utils
|
||||
|
||||
# Logger for this module. It should be configured at the entry point
|
||||
# into the program using Ray. Ray provides a default configuration at
|
||||
@@ -159,6 +158,9 @@ class Dashboard(object):
|
||||
|
||||
async def raylet_info(req) -> aiohttp.web.Response:
|
||||
D = self.raylet_stats.get_raylet_stats()
|
||||
workers_info = sum((data["workersStats"] for data in D.values()),
|
||||
[])
|
||||
actor_tree = self.node_stats.get_actor_tree(workers_info)
|
||||
for address, data in D.items():
|
||||
available_resources = data["availableResources"]
|
||||
total_resources = data["totalResources"]
|
||||
@@ -171,6 +173,17 @@ class Dashboard(object):
|
||||
extra_info.append("{}: {} / {}".format(
|
||||
resource_name, occupied, total))
|
||||
data["extraInfo"] = ", ".join(extra_info)
|
||||
if os.environ.get("RAY_DASHBOARD_DEBUG"):
|
||||
actor_tree_str = json.dumps(actor_tree, indent=2)
|
||||
actor_tree_lines = actor_tree_str.split("\n")
|
||||
max_line_length = max(map(len, actor_tree_lines))
|
||||
actor_tree_print = []
|
||||
for line in actor_tree_lines:
|
||||
actor_tree_print.append(
|
||||
line + (max_line_length - len(line)) * " ")
|
||||
actor_tree_print = "\n".join(actor_tree_print)
|
||||
data["extraInfo"] += "\n" + actor_tree_print
|
||||
D["actorInfo"] = actor_tree
|
||||
return await json_response(result=D)
|
||||
|
||||
async def logs(req) -> aiohttp.web.Response:
|
||||
@@ -226,6 +239,9 @@ class NodeStats(threading.Thread):
|
||||
redis_address, password=redis_password)
|
||||
|
||||
self._node_stats = {}
|
||||
self._addr_to_owner_addr = {}
|
||||
self._addr_to_actor_id = {}
|
||||
self._addr_to_extra_info_dict = {}
|
||||
self._node_stats_lock = threading.Lock()
|
||||
|
||||
# Mapping from IP address to PID to list of log lines
|
||||
@@ -281,6 +297,35 @@ class NodeStats(threading.Thread):
|
||||
"error_counts": self.calculate_error_counts(),
|
||||
}
|
||||
|
||||
def get_actor_tree(self, workers_info) -> Dict:
|
||||
# construct flattened actor tree
|
||||
flattened_tree = {"root": {"children": {}}}
|
||||
child_to_parent = {}
|
||||
with self._node_stats_lock:
|
||||
for addr, actor_id in self._addr_to_actor_id.items():
|
||||
flattened_tree[actor_id] = self._addr_to_extra_info_dict[addr]
|
||||
flattened_tree[actor_id]["children"] = {}
|
||||
parent_id = self._addr_to_actor_id.get(
|
||||
self._addr_to_owner_addr[addr], "root")
|
||||
child_to_parent[actor_id] = parent_id
|
||||
|
||||
for worker_info in workers_info:
|
||||
if "coreWorkerStats" in worker_info:
|
||||
core_worker_stats = worker_info["coreWorkerStats"]
|
||||
addr = (core_worker_stats["ipAddress"],
|
||||
core_worker_stats["port"])
|
||||
if addr in self._addr_to_actor_id:
|
||||
actor_id = self._addr_to_actor_id[addr]
|
||||
if "currentTaskDesc" in core_worker_stats:
|
||||
core_worker_stats.pop("currentTaskDesc")
|
||||
flattened_tree[actor_id].update(core_worker_stats)
|
||||
|
||||
# construct actor tree
|
||||
actor_tree = flattened_tree
|
||||
for actor_id, parent_id in child_to_parent.items():
|
||||
actor_tree[parent_id]["children"][actor_id] = actor_tree[actor_id]
|
||||
return actor_tree["root"]["children"]
|
||||
|
||||
def get_logs(self, hostname, pid):
|
||||
ip = self._node_stats.get(hostname, {"ip": None})["ip"]
|
||||
logs = self._logs.get(ip, {})
|
||||
@@ -309,6 +354,10 @@ class NodeStats(threading.Thread):
|
||||
p.subscribe(error_channel)
|
||||
logger.info("NodeStats: subscribed to {}".format(error_channel))
|
||||
|
||||
actor_channel = ray.gcs_utils.TablePubsub.Value("ACTOR_PUBSUB")
|
||||
p.subscribe(actor_channel)
|
||||
logger.info("NodeStats: subscribed to {}".format(actor_channel))
|
||||
|
||||
for x in p.listen():
|
||||
try:
|
||||
with self._node_stats_lock:
|
||||
@@ -334,6 +383,21 @@ class NodeStats(threading.Thread):
|
||||
"timestamp": error_data.timestamp,
|
||||
"type": error_data.type
|
||||
})
|
||||
elif channel == str(actor_channel):
|
||||
gcs_entry = ray.gcs_utils.GcsEntry.FromString(data)
|
||||
actor_data = ray.gcs_utils.ActorTableData.FromString(
|
||||
gcs_entry.entries[0])
|
||||
addr = (str(actor_data.address.ip_address),
|
||||
str(actor_data.address.port))
|
||||
owner_addr = (str(actor_data.owner_address.ip_address),
|
||||
str(actor_data.owner_address.port))
|
||||
self._addr_to_owner_addr[addr] = owner_addr
|
||||
self._addr_to_actor_id[addr] = ray.utils.binary_to_hex(
|
||||
actor_data.actor_id)
|
||||
self._addr_to_extra_info_dict[addr] = {
|
||||
"job_id": ray.utils.binary_to_hex(
|
||||
actor_data.job_id)
|
||||
}
|
||||
else:
|
||||
data = json.loads(ray.utils.decode(data))
|
||||
self._node_stats[data["hostname"]] = data
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import print_function
|
||||
|
||||
from ray.core.generated.gcs_pb2 import (
|
||||
ActorCheckpointIdData,
|
||||
ActorTableData,
|
||||
GcsNodeInfo,
|
||||
JobTableData,
|
||||
ErrorTableData,
|
||||
@@ -21,6 +22,7 @@ from ray.core.generated.gcs_pb2 import (
|
||||
|
||||
__all__ = [
|
||||
"ActorCheckpointIdData",
|
||||
"ActorTableData",
|
||||
"GcsNodeInfo",
|
||||
"JobTableData",
|
||||
"ErrorTableData",
|
||||
|
||||
@@ -5,6 +5,7 @@ from __future__ import print_function
|
||||
import os
|
||||
import grpc
|
||||
import psutil
|
||||
import requests
|
||||
import time
|
||||
|
||||
import ray
|
||||
@@ -60,11 +61,12 @@ def test_worker_stats(shutdown_only):
|
||||
reply = try_get_node_stats()
|
||||
target_worker_present = False
|
||||
for worker in reply.workers_stats:
|
||||
if worker.webui_display == "test":
|
||||
stats = worker.core_worker_stats
|
||||
if stats.webui_display == "test":
|
||||
target_worker_present = True
|
||||
assert worker.pid == worker_pid
|
||||
else:
|
||||
assert worker.webui_display == ""
|
||||
assert stats.webui_display == ""
|
||||
assert target_worker_present
|
||||
|
||||
# Test show_in_webui for remote actors.
|
||||
@@ -73,11 +75,12 @@ def test_worker_stats(shutdown_only):
|
||||
reply = try_get_node_stats()
|
||||
target_worker_present = False
|
||||
for worker in reply.workers_stats:
|
||||
if worker.webui_display == "test":
|
||||
stats = worker.core_worker_stats
|
||||
if stats.webui_display == "test":
|
||||
target_worker_present = True
|
||||
assert worker.pid == worker_pid
|
||||
else:
|
||||
assert worker.webui_display == ""
|
||||
assert stats.webui_display == ""
|
||||
assert target_worker_present
|
||||
|
||||
timeout_seconds = 20
|
||||
@@ -94,7 +97,6 @@ def test_worker_stats(shutdown_only):
|
||||
continue
|
||||
|
||||
# Check that the rest of the processes are workers, 1 for each CPU.
|
||||
print(reply)
|
||||
assert len(reply.workers_stats) == num_cpus + 1
|
||||
views = [view.view_name for view in reply.view_data]
|
||||
assert "redis_latency" in views
|
||||
@@ -112,6 +114,64 @@ def test_worker_stats(shutdown_only):
|
||||
break
|
||||
|
||||
|
||||
def test_raylet_info_endpoint(shutdown_only):
|
||||
addresses = ray.init(include_webui=True, num_cpus=6)
|
||||
|
||||
@ray.remote(num_cpus=1)
|
||||
class A(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def f(self):
|
||||
return os.getpid()
|
||||
|
||||
@ray.remote(num_cpus=2)
|
||||
class B(object):
|
||||
def __init__(self):
|
||||
self.children = [A.remote(), A.remote()]
|
||||
|
||||
def f(self):
|
||||
return os.getpid(), ray.get(
|
||||
[child.f.remote() for child in self.children])
|
||||
|
||||
# TODO: Currently there is a race condition of Dashboard subscription
|
||||
# and actor initialization. This will be fixed after #6629 is merged.
|
||||
time.sleep(10)
|
||||
|
||||
b = B.remote()
|
||||
pids = ray.get(b.f.remote())
|
||||
assert len(pids) == 2 and len(pids[1]) == 2
|
||||
|
||||
start_time = time.time()
|
||||
while True:
|
||||
time.sleep(1)
|
||||
try:
|
||||
webui_url = addresses["webui_url"]
|
||||
webui_url = webui_url.replace("localhost", "http://127.0.0.1")
|
||||
raylet_info = requests.get(webui_url + "/api/raylet_info").json()
|
||||
actor_info = raylet_info["result"]["actorInfo"]
|
||||
try:
|
||||
assert len(actor_info) == 1
|
||||
print("actor_info", actor_info)
|
||||
_, parent_actor_info = actor_info.popitem()
|
||||
children = parent_actor_info["children"]
|
||||
assert len(children) == 2
|
||||
break
|
||||
except AssertionError:
|
||||
if time.time() > start_time + 30:
|
||||
raise Exception(
|
||||
"Timed out while waiting for actorInfo to show up.")
|
||||
except requests.exceptions.ConnectionError:
|
||||
if time.time() > start_time + 30:
|
||||
raise Exception(
|
||||
"Timed out while waiting for dashboard to start.")
|
||||
|
||||
assert parent_actor_info["usedResources"]["CPU"] == 2
|
||||
for _, child_actor_info in children.items():
|
||||
assert len(child_actor_info["children"]) == 0
|
||||
assert child_actor_info["usedResources"]["CPU"] == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
Reference in New Issue
Block a user