mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
[Dashboard] CPU/GPU usage details in actor pane (#11269)
This commit is contained in:
@@ -39,10 +39,8 @@ export type RayConfigResponse = {
|
||||
|
||||
/**
 * Requests the Ray configuration from the dashboard backend.
 *
 * Delegates to `get`, typed as `RayConfigResponse`, against the
 * `/api/ray_config` endpoint with an empty parameter object, and hands back
 * whatever `get` returns.
 */
export const getRayConfig = () => {
  return get<RayConfigResponse>("/api/ray_config", {});
};
|
||||
|
||||
export type Worker = {
|
||||
type ProcessStats = {
|
||||
pid: number;
|
||||
workerId: string;
|
||||
createTime: number;
|
||||
memoryInfo: {
|
||||
rss: number;
|
||||
vms: number;
|
||||
@@ -52,6 +50,7 @@ export type Worker = {
|
||||
data: number;
|
||||
dirty: Number;
|
||||
};
|
||||
createTime: number;
|
||||
cmdline: string[];
|
||||
cpuTimes: {
|
||||
user: number;
|
||||
@@ -61,12 +60,17 @@ export type Worker = {
|
||||
iowait: number;
|
||||
};
|
||||
cpuPercent: number;
|
||||
}
|
||||
|
||||
export type Worker = {
|
||||
pid: number;
|
||||
workerId: string;
|
||||
logCount: number;
|
||||
errorCount: number;
|
||||
language: string;
|
||||
jobId: string;
|
||||
coreWorkerStats: CoreWorkerStats[];
|
||||
};
|
||||
} & ProcessStats;
|
||||
|
||||
export type CoreWorkerStats = {
|
||||
ipAddress: string;
|
||||
@@ -220,12 +224,14 @@ export type FullActorInfo = {
|
||||
| ActorState.DependenciesUnready
|
||||
| ActorState.PendingCreation;
|
||||
taskQueueLength?: number;
|
||||
gpus: GPUStats[]; // Contains info about any GPUs the actor is using
|
||||
timestamp: number;
|
||||
usedObjectStoreMemory?: number;
|
||||
usedResources: { [key: string]: ResourceAllocations };
|
||||
currentTaskDesc?: string;
|
||||
numPendingTasks?: number;
|
||||
webuiDisplay?: Record<string, string>;
|
||||
processStats?: ProcessStats;
|
||||
};
|
||||
|
||||
export type ActorTaskInfo = {
|
||||
|
||||
@@ -10,12 +10,8 @@ import {
|
||||
launchKillActor,
|
||||
launchProfiling,
|
||||
} from "../../../api";
|
||||
import { sum } from "../../../common/util";
|
||||
import ActorDetailsPane from "./ActorDetailsPane";
|
||||
|
||||
const memoryDebuggingDocLink =
|
||||
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";
|
||||
|
||||
const useActorStyles = makeStyles((theme: Theme) =>
|
||||
createStyles({
|
||||
root: {
|
||||
@@ -103,80 +99,6 @@ const Actor: React.FC<ActorProps> = ({ actor }) => {
|
||||
}
|
||||
};
|
||||
|
||||
const information = isFullActorInfo(actor)
|
||||
? [
|
||||
{
|
||||
label: "Resources",
|
||||
value:
|
||||
actor.usedResources &&
|
||||
Object.entries(actor.usedResources).length > 0 &&
|
||||
Object.entries(actor.usedResources)
|
||||
.sort((a, b) => a[0].localeCompare(b[0]))
|
||||
.map(
|
||||
([key, value]) =>
|
||||
`${sum(
|
||||
value.resourceSlots.map((slot) => slot.allocation),
|
||||
)} ${key}`,
|
||||
)
|
||||
.join(", "),
|
||||
},
|
||||
{
|
||||
label: "Number of pending tasks",
|
||||
value: actor.taskQueueLength?.toLocaleString() ?? "0",
|
||||
tooltip:
|
||||
"The number of tasks that are currently pending to execute on this actor. If this number " +
|
||||
"remains consistently high, it may indicate that this actor is a bottleneck in your application.",
|
||||
},
|
||||
{
|
||||
label: "Number of executed tasks",
|
||||
value: actor.numExecutedTasks?.toLocaleString() ?? "0",
|
||||
tooltip:
|
||||
"The number of tasks this actor has executed throughout its lifetimes.",
|
||||
},
|
||||
{
|
||||
label: "Number of ObjectRefs in scope",
|
||||
value: actor.numObjectRefsInScope?.toLocaleString() ?? "0",
|
||||
tooltip:
|
||||
"The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
|
||||
"This does not imply that the objects are in active use or colocated on the node with the actor " +
|
||||
`currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
|
||||
"for more information.",
|
||||
},
|
||||
{
|
||||
label: "Number of local objects",
|
||||
value: actor.numLocalObjects?.toLocaleString() ?? "0",
|
||||
tooltip:
|
||||
"The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
|
||||
`debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
|
||||
},
|
||||
{
|
||||
label: "Object store memory used (MiB)",
|
||||
value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0",
|
||||
tooltip:
|
||||
"The total amount of memory that this actor is occupying in the Ray object store. " +
|
||||
"If this number is increasing without bounds, you might have a memory leak. See " +
|
||||
`the docs at: ${memoryDebuggingDocLink} for more information.`,
|
||||
},
|
||||
]
|
||||
: [
|
||||
{
|
||||
label: "Actor ID",
|
||||
value: actor.actorId,
|
||||
tooltip: "",
|
||||
},
|
||||
{
|
||||
label: "Required resources",
|
||||
value:
|
||||
actor.requiredResources &&
|
||||
Object.entries(actor.requiredResources).length > 0 &&
|
||||
Object.entries(actor.requiredResources)
|
||||
.sort((a, b) => a[0].localeCompare(b[0]))
|
||||
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
|
||||
.join(", "),
|
||||
tooltip: "",
|
||||
},
|
||||
];
|
||||
|
||||
// Construct the custom message from the actor.
|
||||
let actorCustomDisplay: JSX.Element[] = [];
|
||||
if (isFullActorInfo(actor) && actor.webuiDisplay) {
|
||||
@@ -274,9 +196,7 @@ const Actor: React.FC<ActorProps> = ({ actor }) => {
|
||||
)}
|
||||
</Typography>
|
||||
<ActorDetailsPane
|
||||
actorDetails={information}
|
||||
actorClass={actor.actorClass}
|
||||
actorState={actor.state}
|
||||
actor={actor}
|
||||
/>
|
||||
{isFullActorInfo(actor) && (
|
||||
<React.Fragment>
|
||||
|
||||
@@ -1,17 +1,98 @@
|
||||
import { Divider, Grid, makeStyles, Theme } from "@material-ui/core";
|
||||
import { Divider, Grid, makeStyles, Theme, Typography } from "@material-ui/core";
|
||||
import React from "react";
|
||||
import { ActorState } from "../../../api";
|
||||
import { ActorInfo, isFullActorInfo } from "../../../api";
|
||||
import { sum } from "../../../common/util";
|
||||
import LabeledDatum from "../../../common/LabeledDatum";
|
||||
import ActorStateRepr from "./ActorStateRepr";
|
||||
import UsageBar from '../../../common/UsageBar';
|
||||
|
||||
const memoryDebuggingDocLink =
|
||||
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";
|
||||
|
||||
/**
 * One labelled entry of the actor details pane.
 */
type ActorDatum = {
  /** Caption for the entry. */
  label: string;
  /**
   * Content of the entry. The builders in this file supply either a string
   * or a falsy placeholder, and the pane reads `value.length` before
   * rendering, so the type is deliberately left open.
   */
  value: any;
  /** Optional explanatory text attached to the entry. */
  tooltip?: string;
};
|
||||
|
||||
/**
 * Builds the list of labelled entries describing an actor.
 *
 * For an actor that passes `isFullActorInfo`, the list covers its used
 * resources, task counters, object counters and object store usage. For any
 * other actor it holds only the actor ID and the required resources.
 *
 * A resource entry's `value` is a comma-separated, key-sorted summary string
 * when the resource map has at least one entry; otherwise it is the falsy
 * result of the guard (`false` for an empty map, or the missing map itself).
 *
 * @param actor The actor to describe.
 * @returns The labelled entries, in display order.
 */
const labeledActorData = (actor: ActorInfo) => {
  if (!isFullActorInfo(actor)) {
    // Actor without full info: only its ID and its resource demand are known.
    const requiredEntries =
      actor.requiredResources && Object.entries(actor.requiredResources);
    const requiredSummary =
      requiredEntries &&
      requiredEntries.length > 0 &&
      requiredEntries
        .sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey))
        .map(([name, amount]) => `${amount.toLocaleString()} ${name}`)
        .join(", ");

    return [
      {
        label: "Actor ID",
        value: actor.actorId,
        tooltip: "",
      },
      {
        label: "Required resources",
        value: requiredSummary,
        tooltip: "",
      },
    ];
  }

  // Full actor info: total the allocation of every slot for each resource.
  const usedEntries =
    actor.usedResources && Object.entries(actor.usedResources);
  const usedSummary =
    usedEntries &&
    usedEntries.length > 0 &&
    usedEntries
      .sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey))
      .map(([name, allocations]) => {
        const total = sum(
          allocations.resourceSlots.map((slot) => slot.allocation),
        );
        return `${total} ${name}`;
      })
      .join(", ");

  const pendingTasksTooltip =
    "The number of tasks that are currently pending to execute on this actor. If this number " +
    "remains consistently high, it may indicate that this actor is a bottleneck in your application.";
  const executedTasksTooltip =
    "The number of tasks this actor has executed throughout its lifetimes.";
  const objectRefsTooltip =
    "The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
    "This does not imply that the objects are in active use or colocated on the node with the actor " +
    `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
    "for more information.";
  const localObjectsTooltip =
    "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
    `debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`;
  const objectStoreTooltip =
    "The total amount of memory that this actor is occupying in the Ray object store. " +
    "If this number is increasing without bounds, you might have a memory leak. See " +
    `the docs at: ${memoryDebuggingDocLink} for more information.`;

  return [
    {
      label: "Resources",
      value: usedSummary,
    },
    {
      label: "Number of pending tasks",
      value: actor.taskQueueLength?.toLocaleString() ?? "0",
      tooltip: pendingTasksTooltip,
    },
    {
      label: "Number of executed tasks",
      value: actor.numExecutedTasks?.toLocaleString() ?? "0",
      tooltip: executedTasksTooltip,
    },
    {
      label: "Number of ObjectRefs in scope",
      value: actor.numObjectRefsInScope?.toLocaleString() ?? "0",
      tooltip: objectRefsTooltip,
    },
    {
      label: "Number of local objects",
      value: actor.numLocalObjects?.toLocaleString() ?? "0",
      tooltip: localObjectsTooltip,
    },
    {
      label: "Object store memory used (MiB)",
      value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0",
      tooltip: objectStoreTooltip,
    },
  ];
};
|
||||
|
||||
|
||||
type ActorDetailsPaneProps = {
|
||||
actorClass: string;
|
||||
actorState: ActorState;
|
||||
actorDetails: {
|
||||
label: string;
|
||||
value: any;
|
||||
tooltip?: string;
|
||||
}[];
|
||||
actor: ActorInfo;
|
||||
};
|
||||
|
||||
const useStyles = makeStyles((theme: Theme) => ({
|
||||
@@ -31,20 +112,55 @@ const useStyles = makeStyles((theme: Theme) => ({
|
||||
}));
|
||||
|
||||
const ActorDetailsPane: React.FC<ActorDetailsPaneProps> = ({
|
||||
actorDetails,
|
||||
actorClass,
|
||||
actorState,
|
||||
actor
|
||||
}) => {
|
||||
const classes = useStyles();
|
||||
const actorData: ActorDatum[] = labeledActorData(actor);
|
||||
return (
|
||||
<React.Fragment>
|
||||
<div className={classes.actorTitleWrapper}>
|
||||
<div>{actorClass}</div>
|
||||
<ActorStateRepr state={actorState} />
|
||||
<div>{actor.actorClass}</div>
|
||||
<ActorStateRepr state={actor.state} />
|
||||
</div>
|
||||
{isFullActorInfo(actor) &&
|
||||
<Grid container className={classes.detailsPane}>
|
||||
<Grid container item xs={6}>
|
||||
<Grid item xs={4}>
|
||||
<Typography>CPU Usage</Typography>
|
||||
</Grid>
|
||||
<Grid item xs={4}>
|
||||
<UsageBar
|
||||
percent={actor.processStats?.cpuPercent ?? 0}
|
||||
text={`${actor.processStats?.cpuPercent ?? 0}%`}
|
||||
/>
|
||||
</Grid>
|
||||
<Grid item xs={4} />
|
||||
</Grid>
|
||||
{ actor.gpus.length > 0 &&
|
||||
<Grid container item xs={6}>
|
||||
<Grid item xs={12}>
|
||||
<Typography>GPU Usage</Typography>
|
||||
</Grid>
|
||||
{actor.gpus.map(gpu => (
|
||||
<React.Fragment key={gpu.uuid}>
|
||||
<Grid item xs={4}>
|
||||
{`[${gpu.name}]`}
|
||||
</Grid>
|
||||
<Grid item xs={4}>
|
||||
<UsageBar
|
||||
percent={gpu.utilizationGpu * 100}
|
||||
text={`${gpu.utilizationGpu * 100}%`}
|
||||
/>
|
||||
</Grid>
|
||||
<Grid item xs={4} />
|
||||
</React.Fragment>
|
||||
))}
|
||||
</Grid>
|
||||
}
|
||||
</Grid>}
|
||||
<Divider className={classes.divider} />
|
||||
<Grid container className={classes.detailsPane}>
|
||||
{actorDetails.map(
|
||||
{actorData.map(
|
||||
({ label, value, tooltip }) =>
|
||||
value &&
|
||||
value.length > 0 && (
|
||||
|
||||
+31
-9
@@ -1,6 +1,7 @@
|
||||
import logging
|
||||
import ray.new_dashboard.consts as dashboard_consts
|
||||
import ray.new_dashboard.memory_utils as memory_utils
|
||||
from collections import defaultdict
|
||||
from ray.new_dashboard.actor_utils import actor_classname_from_task_spec
|
||||
from ray.new_dashboard.utils import Dict, Signal
|
||||
|
||||
@@ -61,18 +62,35 @@ class DataOrganizer:
|
||||
@classmethod
|
||||
async def get_node_actors(cls, node_id):
|
||||
node_stats = DataSource.node_stats.get(node_id, {})
|
||||
worker_id_to_info = {}
|
||||
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
|
||||
worker_id_to_raylet_info = {}
|
||||
pid_to_worker_id = {}
|
||||
|
||||
for worker_stats in node_stats.get("workersStats", []):
|
||||
worker_id_to_info[worker_stats["workerId"]] = worker_stats
|
||||
worker_id_to_raylet_info[worker_stats["workerId"]] = worker_stats
|
||||
pid_to_worker_id[worker_stats["pid"]] = worker_stats["workerId"]
|
||||
worker_id_to_process_info = {}
|
||||
|
||||
for process_stats in node_physical_stats.get("workers"):
|
||||
if process_stats["pid"] in pid_to_worker_id:
|
||||
worker_id = pid_to_worker_id[process_stats["pid"]]
|
||||
worker_id_to_process_info[worker_id] = process_stats
|
||||
|
||||
worker_id_to_gpu_stats = defaultdict(list)
|
||||
for gpu_stats in node_physical_stats.get("gpus"):
|
||||
for process in gpu_stats.get("processes", []):
|
||||
if process["pid"] in pid_to_worker_id:
|
||||
worker_id = pid_to_worker_id[process["pid"]]
|
||||
worker_id_to_gpu_stats[worker_id].append(gpu_stats)
|
||||
|
||||
node_actors = {}
|
||||
for actor_id, actor_table_data in DataSource.actors.items():
|
||||
if actor_table_data["address"]["workerId"] in worker_id_to_info:
|
||||
worker_stats = worker_id_to_info[actor_table_data["address"][
|
||||
"workerId"]]
|
||||
|
||||
actor_constructor = worker_stats.get("coreWorkerStats", {})\
|
||||
.get("actorTitle", "Unknown actor constructor")
|
||||
worker_id = actor_table_data["address"]["workerId"]
|
||||
if worker_id in worker_id_to_raylet_info:
|
||||
worker_raylet_stats = worker_id_to_raylet_info[worker_id]
|
||||
core_worker = worker_raylet_stats.get("coreWorkerStats", {})
|
||||
actor_constructor = core_worker.get(
|
||||
"actorTitle", "Unknown actor constructor")
|
||||
|
||||
actor_table_data["actorConstructor"] = actor_constructor
|
||||
|
||||
@@ -80,8 +98,12 @@ class DataOrganizer:
|
||||
actor_table_data.get("taskSpec", {}))
|
||||
|
||||
actor_table_data["actorClass"] = actor_class
|
||||
actor_table_data.update(worker_stats["coreWorkerStats"])
|
||||
actor_table_data.update(core_worker)
|
||||
node_actors[actor_id] = actor_table_data
|
||||
actor_table_data["gpus"] = worker_id_to_gpu_stats.get(
|
||||
worker_id, [])
|
||||
actor_table_data["processStats"] = worker_id_to_process_info.get(
|
||||
worker_id, {})
|
||||
return node_actors
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -24,6 +24,7 @@ class ReportHead(dashboard_utils.DashboardHeadModule):
|
||||
def __init__(self, dashboard_head):
|
||||
super().__init__(dashboard_head)
|
||||
self._stubs = {}
|
||||
self._ray_config = None
|
||||
DataSource.agents.signal.append(self._update_stubs)
|
||||
|
||||
async def _update_stubs(self, change):
|
||||
|
||||
@@ -60,6 +60,7 @@ if __name__ == "__main__":
|
||||
do_link("experimental", force=args.yes)
|
||||
do_link("util", force=args.yes)
|
||||
do_link("dashboard", force=args.yes)
|
||||
do_link("new_dashboard", force=args.yes)
|
||||
print("Created links.\n\nIf you run into issues initializing Ray, please "
|
||||
"ensure that your local repo and the installed Ray are in sync "
|
||||
"(pip install -U the latest wheels at "
|
||||
|
||||
Reference in New Issue
Block a user