diff --git a/dashboard/client/src/api.ts b/dashboard/client/src/api.ts index 7e0b45b69..c222f102c 100644 --- a/dashboard/client/src/api.ts +++ b/dashboard/client/src/api.ts @@ -39,10 +39,8 @@ export type RayConfigResponse = { export const getRayConfig = () => get("/api/ray_config", {}); -export type Worker = { +type ProcessStats = { pid: number; - workerId: string; - createTime: number; memoryInfo: { rss: number; vms: number; @@ -52,6 +50,7 @@ export type Worker = { data: number; dirty: Number; }; + createTime: number; cmdline: string[]; cpuTimes: { user: number; @@ -61,12 +60,17 @@ export type Worker = { iowait: number; }; cpuPercent: number; +} + +export type Worker = { + pid: number; + workerId: string; logCount: number; errorCount: number; language: string; jobId: string; coreWorkerStats: CoreWorkerStats[]; -}; +} & ProcessStats; export type CoreWorkerStats = { ipAddress: string; @@ -220,12 +224,14 @@ export type FullActorInfo = { | ActorState.DependenciesUnready | ActorState.PendingCreation; taskQueueLength?: number; + gpus: GPUStats[]; // Contains info about any GPUs the actor is using timestamp: number; usedObjectStoreMemory?: number; usedResources: { [key: string]: ResourceAllocations }; currentTaskDesc?: string; numPendingTasks?: number; webuiDisplay?: Record; + processStats?: ProcessStats; }; export type ActorTaskInfo = { diff --git a/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx b/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx index d59426418..fba0f3557 100644 --- a/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx +++ b/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx @@ -10,12 +10,8 @@ import { launchKillActor, launchProfiling, } from "../../../api"; -import { sum } from "../../../common/util"; import ActorDetailsPane from "./ActorDetailsPane"; -const memoryDebuggingDocLink = - "https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory"; - const useActorStyles = makeStyles((theme: Theme) => createStyles({ root: { @@ -103,80 +99,6 @@ const Actor: React.FC = ({ actor }) => { } }; - const information = isFullActorInfo(actor) - ? [ - { - label: "Resources", - value: - actor.usedResources && - Object.entries(actor.usedResources).length > 0 && - Object.entries(actor.usedResources) - .sort((a, b) => a[0].localeCompare(b[0])) - .map( - ([key, value]) => - `${sum( - value.resourceSlots.map((slot) => slot.allocation), - )} ${key}`, - ) - .join(", "), - }, - { - label: "Number of pending tasks", - value: actor.taskQueueLength?.toLocaleString() ?? "0", - tooltip: - "The number of tasks that are currently pending to execute on this actor. If this number " + - "remains consistently high, it may indicate that this actor is a bottleneck in your application.", - }, - { - label: "Number of executed tasks", - value: actor.numExecutedTasks?.toLocaleString() ?? "0", - tooltip: - "The number of tasks this actor has executed throughout its lifetimes.", - }, - { - label: "Number of ObjectRefs in scope", - value: actor.numObjectRefsInScope?.toLocaleString() ?? "0", - tooltip: - "The number of ObjectRefs that this actor is keeping in scope via its internal state. " + - "This does not imply that the objects are in active use or colocated on the node with the actor " + - `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` + - "for more information.", - }, - { - label: "Number of local objects", - value: actor.numLocalObjects?.toLocaleString() ?? "0", - tooltip: - "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " + - `debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`, - }, - { - label: "Object store memory used (MiB)", - value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0", - tooltip: - "The total amount of memory that this actor is occupying in the Ray object store. " + - "If this number is increasing without bounds, you might have a memory leak. See " + - `the docs at: ${memoryDebuggingDocLink} for more information.`, - }, - ] - : [ - { - label: "Actor ID", - value: actor.actorId, - tooltip: "", - }, - { - label: "Required resources", - value: - actor.requiredResources && - Object.entries(actor.requiredResources).length > 0 && - Object.entries(actor.requiredResources) - .sort((a, b) => a[0].localeCompare(b[0])) - .map(([key, value]) => `${value.toLocaleString()} ${key}`) - .join(", "), - tooltip: "", - }, - ]; - // Construct the custom message from the actor. let actorCustomDisplay: JSX.Element[] = []; if (isFullActorInfo(actor) && actor.webuiDisplay) { @@ -274,9 +196,7 @@ const Actor: React.FC = ({ actor }) => { )} {isFullActorInfo(actor) && ( diff --git a/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx b/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx index 8593023f8..bca2ec1d4 100644 --- a/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx +++ b/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx @@ -1,17 +1,98 @@ -import { Divider, Grid, makeStyles, Theme } from "@material-ui/core"; +import { Divider, Grid, makeStyles, Theme, Typography } from "@material-ui/core"; import React from "react"; -import { ActorState } from "../../../api"; +import { ActorInfo, isFullActorInfo } from "../../../api"; +import { sum } from "../../../common/util"; import LabeledDatum from "../../../common/LabeledDatum"; import ActorStateRepr from "./ActorStateRepr"; +import UsageBar from '../../../common/UsageBar'; + +const memoryDebuggingDocLink = + "https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory"; + +type ActorDatum = { + label: string; + value: any; + tooltip?: string; +} + +const labeledActorData = (actor: ActorInfo) => ( + isFullActorInfo(actor) + ? [ + { + label: "Resources", + value: + actor.usedResources && + Object.entries(actor.usedResources).length > 0 && + Object.entries(actor.usedResources) + .sort((a, b) => a[0].localeCompare(b[0])) + .map( + ([key, value]) => + `${sum( + value.resourceSlots.map((slot) => slot.allocation), + )} ${key}`, + ) + .join(", "), + }, + { + label: "Number of pending tasks", + value: actor.taskQueueLength?.toLocaleString() ?? "0", + tooltip: + "The number of tasks that are currently pending to execute on this actor. If this number " + + "remains consistently high, it may indicate that this actor is a bottleneck in your application.", + }, + { + label: "Number of executed tasks", + value: actor.numExecutedTasks?.toLocaleString() ?? "0", + tooltip: + "The number of tasks this actor has executed throughout its lifetimes.", + }, + { + label: "Number of ObjectRefs in scope", + value: actor.numObjectRefsInScope?.toLocaleString() ?? "0", + tooltip: + "The number of ObjectRefs that this actor is keeping in scope via its internal state. " + + "This does not imply that the objects are in active use or colocated on the node with the actor " + + `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` + + "for more information.", + }, + { + label: "Number of local objects", + value: actor.numLocalObjects?.toLocaleString() ?? "0", + tooltip: + "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " + + `debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`, + }, + { + label: "Object store memory used (MiB)", + value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0", + tooltip: + "The total amount of memory that this actor is occupying in the Ray object store. " + + "If this number is increasing without bounds, you might have a memory leak. See " + + `the docs at: ${memoryDebuggingDocLink} for more information.`, + }, + ] + : [ + { + label: "Actor ID", + value: actor.actorId, + tooltip: "", + }, + { + label: "Required resources", + value: + actor.requiredResources && + Object.entries(actor.requiredResources).length > 0 && + Object.entries(actor.requiredResources) + .sort((a, b) => a[0].localeCompare(b[0])) + .map(([key, value]) => `${value.toLocaleString()} ${key}`) + .join(", "), + tooltip: "", + }, + ]); + type ActorDetailsPaneProps = { - actorClass: string; - actorState: ActorState; - actorDetails: { - label: string; - value: any; - tooltip?: string; - }[]; + actor: ActorInfo; }; const useStyles = makeStyles((theme: Theme) => ({ @@ -31,20 +112,55 @@ const useStyles = makeStyles((theme: Theme) => ({ })); const ActorDetailsPane: React.FC = ({ - actorDetails, - actorClass, - actorState, + actor }) => { const classes = useStyles(); + const actorData: ActorDatum[] = labeledActorData(actor); return (
-
{actorClass}
- +
{actor.actorClass}
+
+ {isFullActorInfo(actor) && + + + + CPU Usage + + + + + + + { actor.gpus.length > 0 && + + + GPU Usage + + {actor.gpus.map(gpu => ( + + + {`[${gpu.name}]`} + + + + + + + ))} + + } + } - {actorDetails.map( + {actorData.map( ({ label, value, tooltip }) => value && value.length > 0 && ( diff --git a/dashboard/datacenter.py b/dashboard/datacenter.py index bb1624db4..23a239f29 100644 --- a/dashboard/datacenter.py +++ b/dashboard/datacenter.py @@ -1,6 +1,7 @@ import logging import ray.new_dashboard.consts as dashboard_consts import ray.new_dashboard.memory_utils as memory_utils +from collections import defaultdict from ray.new_dashboard.actor_utils import actor_classname_from_task_spec from ray.new_dashboard.utils import Dict, Signal @@ -61,18 +62,35 @@ class DataOrganizer: @classmethod async def get_node_actors(cls, node_id): node_stats = DataSource.node_stats.get(node_id, {}) - worker_id_to_info = {} + node_physical_stats = DataSource.node_physical_stats.get(node_id, {}) + worker_id_to_raylet_info = {} + pid_to_worker_id = {} + for worker_stats in node_stats.get("workersStats", []): - worker_id_to_info[worker_stats["workerId"]] = worker_stats + worker_id_to_raylet_info[worker_stats["workerId"]] = worker_stats + pid_to_worker_id[worker_stats["pid"]] = worker_stats["workerId"] + worker_id_to_process_info = {} + + for process_stats in node_physical_stats.get("workers"): + if process_stats["pid"] in pid_to_worker_id: + worker_id = pid_to_worker_id[process_stats["pid"]] + worker_id_to_process_info[worker_id] = process_stats + + worker_id_to_gpu_stats = defaultdict(list) + for gpu_stats in node_physical_stats.get("gpus"): + for process in gpu_stats.get("processes", []): + if process["pid"] in pid_to_worker_id: + worker_id = pid_to_worker_id[process["pid"]] + worker_id_to_gpu_stats[worker_id].append(gpu_stats) node_actors = {} for actor_id, actor_table_data in DataSource.actors.items(): - if actor_table_data["address"]["workerId"] in worker_id_to_info: - worker_stats = worker_id_to_info[actor_table_data["address"][ - "workerId"]] - - actor_constructor = worker_stats.get("coreWorkerStats", {})\ - .get("actorTitle", "Unknown actor constructor") + worker_id = actor_table_data["address"]["workerId"] + if worker_id in worker_id_to_raylet_info: + worker_raylet_stats = worker_id_to_raylet_info[worker_id] + core_worker = worker_raylet_stats.get("coreWorkerStats", {}) + actor_constructor = core_worker.get( + "actorTitle", "Unknown actor constructor") actor_table_data["actorConstructor"] = actor_constructor @@ -80,8 +98,12 @@ class DataOrganizer: actor_table_data.get("taskSpec", {})) actor_table_data["actorClass"] = actor_class - actor_table_data.update(worker_stats["coreWorkerStats"]) + actor_table_data.update(core_worker) node_actors[actor_id] = actor_table_data + actor_table_data["gpus"] = worker_id_to_gpu_stats.get( + worker_id, []) + actor_table_data["processStats"] = worker_id_to_process_info.get( + worker_id, {}) return node_actors @classmethod diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index b9ce287bb..49d577bca 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -24,6 +24,7 @@ class ReportHead(dashboard_utils.DashboardHeadModule): def __init__(self, dashboard_head): super().__init__(dashboard_head) self._stubs = {} + self._ray_config = None DataSource.agents.signal.append(self._update_stubs) async def _update_stubs(self, change): diff --git a/python/ray/setup-dev.py b/python/ray/setup-dev.py index 104eca465..b83c922b8 100755 --- a/python/ray/setup-dev.py +++ b/python/ray/setup-dev.py @@ -60,6 +60,7 @@ if __name__ == "__main__": do_link("experimental", force=args.yes) do_link("util", force=args.yes) do_link("dashboard", force=args.yes) + do_link("new_dashboard", force=args.yes) print("Created links.\n\nIf you run into issues initializing Ray, please " "ensure that your local repo and the installed Ray are in sync " "(pip install -U the latest wheels at "