[Dashboard] CPU/GPU usage details in actor pane (#11269)

This commit is contained in:
Max Fitton
2020-10-13 21:23:23 -04:00
committed by GitHub
parent 933cf6675c
commit cd9dcfca0d
6 changed files with 175 additions and 109 deletions
+10 -4
View File
@@ -39,10 +39,8 @@ export type RayConfigResponse = {
export const getRayConfig = () => get<RayConfigResponse>("/api/ray_config", {});
export type Worker = {
type ProcessStats = {
pid: number;
workerId: string;
createTime: number;
memoryInfo: {
rss: number;
vms: number;
@@ -52,6 +50,7 @@ export type Worker = {
data: number;
dirty: Number;
};
createTime: number;
cmdline: string[];
cpuTimes: {
user: number;
@@ -61,12 +60,17 @@ export type Worker = {
iowait: number;
};
cpuPercent: number;
}
export type Worker = {
pid: number;
workerId: string;
logCount: number;
errorCount: number;
language: string;
jobId: string;
coreWorkerStats: CoreWorkerStats[];
};
} & ProcessStats;
export type CoreWorkerStats = {
ipAddress: string;
@@ -220,12 +224,14 @@ export type FullActorInfo = {
| ActorState.DependenciesUnready
| ActorState.PendingCreation;
taskQueueLength?: number;
gpus: GPUStats[]; // Contains info about any GPUs the actor is using
timestamp: number;
usedObjectStoreMemory?: number;
usedResources: { [key: string]: ResourceAllocations };
currentTaskDesc?: string;
numPendingTasks?: number;
webuiDisplay?: Record<string, string>;
processStats?: ProcessStats;
};
export type ActorTaskInfo = {
@@ -10,12 +10,8 @@ import {
launchKillActor,
launchProfiling,
} from "../../../api";
import { sum } from "../../../common/util";
import ActorDetailsPane from "./ActorDetailsPane";
const memoryDebuggingDocLink =
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";
const useActorStyles = makeStyles((theme: Theme) =>
createStyles({
root: {
@@ -103,80 +99,6 @@ const Actor: React.FC<ActorProps> = ({ actor }) => {
}
};
const information = isFullActorInfo(actor)
? [
{
label: "Resources",
value:
actor.usedResources &&
Object.entries(actor.usedResources).length > 0 &&
Object.entries(actor.usedResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(
([key, value]) =>
`${sum(
value.resourceSlots.map((slot) => slot.allocation),
)} ${key}`,
)
.join(", "),
},
{
label: "Number of pending tasks",
value: actor.taskQueueLength?.toLocaleString() ?? "0",
tooltip:
"The number of tasks that are currently pending to execute on this actor. If this number " +
"remains consistently high, it may indicate that this actor is a bottleneck in your application.",
},
{
label: "Number of executed tasks",
value: actor.numExecutedTasks?.toLocaleString() ?? "0",
tooltip:
"The number of tasks this actor has executed throughout its lifetimes.",
},
{
label: "Number of ObjectRefs in scope",
value: actor.numObjectRefsInScope?.toLocaleString() ?? "0",
tooltip:
"The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
"This does not imply that the objects are in active use or colocated on the node with the actor " +
`currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
"for more information.",
},
{
label: "Number of local objects",
value: actor.numLocalObjects?.toLocaleString() ?? "0",
tooltip:
"The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
`debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
},
{
label: "Object store memory used (MiB)",
value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0",
tooltip:
"The total amount of memory that this actor is occupying in the Ray object store. " +
"If this number is increasing without bounds, you might have a memory leak. See " +
`the docs at: ${memoryDebuggingDocLink} for more information.`,
},
]
: [
{
label: "Actor ID",
value: actor.actorId,
tooltip: "",
},
{
label: "Required resources",
value:
actor.requiredResources &&
Object.entries(actor.requiredResources).length > 0 &&
Object.entries(actor.requiredResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
.join(", "),
tooltip: "",
},
];
// Construct the custom message from the actor.
let actorCustomDisplay: JSX.Element[] = [];
if (isFullActorInfo(actor) && actor.webuiDisplay) {
@@ -274,9 +196,7 @@ const Actor: React.FC<ActorProps> = ({ actor }) => {
)}
</Typography>
<ActorDetailsPane
actorDetails={information}
actorClass={actor.actorClass}
actorState={actor.state}
actor={actor}
/>
{isFullActorInfo(actor) && (
<React.Fragment>
@@ -1,17 +1,98 @@
import { Divider, Grid, makeStyles, Theme } from "@material-ui/core";
import { Divider, Grid, makeStyles, Theme, Typography } from "@material-ui/core";
import React from "react";
import { ActorState } from "../../../api";
import { ActorInfo, isFullActorInfo } from "../../../api";
import { sum } from "../../../common/util";
import LabeledDatum from "../../../common/LabeledDatum";
import ActorStateRepr from "./ActorStateRepr";
import UsageBar from '../../../common/UsageBar';
const memoryDebuggingDocLink =
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";
/**
 * One labeled row of actor information shown in the actor details pane.
 */
type ActorDatum = {
// Human-readable caption for the row.
label: string;
// Content of the row. The pane renders a row only when this is truthy and
// has a non-zero `length`, so falsy or empty values hide the row.
value: any;
// Optional explanatory text for the row; may be the empty string.
tooltip?: string;
}
/**
 * Builds the label/value/tooltip rows describing an actor.
 *
 * When `isFullActorInfo(actor)` holds, the rows cover the resources in use,
 * task counters, object counts and object store memory usage. Otherwise only
 * the actor ID and the required resources are listed.
 *
 * A resource row's `value` is a comma-separated string ordered by resource
 * name, or a falsy value when there are no resource entries to show.
 */
const labeledActorData = (actor: ActorInfo) => {
  if (!isFullActorInfo(actor)) {
    const requiredEntries =
      actor.requiredResources && Object.entries(actor.requiredResources);
    const requiredSummary =
      requiredEntries &&
      requiredEntries.length > 0 &&
      requiredEntries
        .sort(([nameA], [nameB]) => nameA.localeCompare(nameB))
        .map(([name, amount]) => `${amount.toLocaleString()} ${name}`)
        .join(", ");

    return [
      {
        label: "Actor ID",
        value: actor.actorId,
        tooltip: "",
      },
      {
        label: "Required resources",
        value: requiredSummary,
        tooltip: "",
      },
    ];
  }

  const usedEntries =
    actor.usedResources && Object.entries(actor.usedResources);
  // Each resource is reported as the total allocation across its slots.
  const usedSummary =
    usedEntries &&
    usedEntries.length > 0 &&
    usedEntries
      .sort(([nameA], [nameB]) => nameA.localeCompare(nameB))
      .map(([name, allocations]) => {
        const total = sum(
          allocations.resourceSlots.map((slot) => slot.allocation),
        );
        return `${total} ${name}`;
      })
      .join(", ");

  return [
    {
      label: "Resources",
      value: usedSummary,
    },
    {
      label: "Number of pending tasks",
      value: actor.taskQueueLength?.toLocaleString() ?? "0",
      tooltip:
        "The number of tasks that are currently pending to execute on this actor. If this number " +
        "remains consistently high, it may indicate that this actor is a bottleneck in your application.",
    },
    {
      label: "Number of executed tasks",
      value: actor.numExecutedTasks?.toLocaleString() ?? "0",
      tooltip:
        "The number of tasks this actor has executed throughout its lifetimes.",
    },
    {
      label: "Number of ObjectRefs in scope",
      value: actor.numObjectRefsInScope?.toLocaleString() ?? "0",
      tooltip:
        "The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
        "This does not imply that the objects are in active use or colocated on the node with the actor " +
        `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
        "for more information.",
    },
    {
      label: "Number of local objects",
      value: actor.numLocalObjects?.toLocaleString() ?? "0",
      tooltip:
        "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
        `debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
    },
    {
      label: "Object store memory used (MiB)",
      value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0",
      tooltip:
        "The total amount of memory that this actor is occupying in the Ray object store. " +
        "If this number is increasing without bounds, you might have a memory leak. See " +
        `the docs at: ${memoryDebuggingDocLink} for more information.`,
    },
  ];
};
type ActorDetailsPaneProps = {
actorClass: string;
actorState: ActorState;
actorDetails: {
label: string;
value: any;
tooltip?: string;
}[];
actor: ActorInfo;
};
const useStyles = makeStyles((theme: Theme) => ({
@@ -31,20 +112,55 @@ const useStyles = makeStyles((theme: Theme) => ({
}));
const ActorDetailsPane: React.FC<ActorDetailsPaneProps> = ({
actorDetails,
actorClass,
actorState,
actor
}) => {
const classes = useStyles();
const actorData: ActorDatum[] = labeledActorData(actor);
return (
<React.Fragment>
<div className={classes.actorTitleWrapper}>
<div>{actorClass}</div>
<ActorStateRepr state={actorState} />
<div>{actor.actorClass}</div>
<ActorStateRepr state={actor.state} />
</div>
{isFullActorInfo(actor) &&
<Grid container className={classes.detailsPane}>
<Grid container item xs={6}>
<Grid item xs={4}>
<Typography>CPU Usage</Typography>
</Grid>
<Grid item xs={4}>
<UsageBar
percent={actor.processStats?.cpuPercent ?? 0}
text={`${actor.processStats?.cpuPercent ?? 0}%`}
/>
</Grid>
<Grid item xs={4} />
</Grid>
{ actor.gpus.length > 0 &&
<Grid container item xs={6}>
<Grid item xs={12}>
<Typography>GPU Usage</Typography>
</Grid>
{actor.gpus.map(gpu => (
<React.Fragment key={gpu.uuid}>
<Grid item xs={4}>
{`[${gpu.name}]`}
</Grid>
<Grid item xs={4}>
<UsageBar
percent={gpu.utilizationGpu * 100}
text={`${gpu.utilizationGpu * 100}%`}
/>
</Grid>
<Grid item xs={4} />
</React.Fragment>
))}
</Grid>
}
</Grid>}
<Divider className={classes.divider} />
<Grid container className={classes.detailsPane}>
{actorDetails.map(
{actorData.map(
({ label, value, tooltip }) =>
value &&
value.length > 0 && (
+31 -9
View File
@@ -1,6 +1,7 @@
import logging
import ray.new_dashboard.consts as dashboard_consts
import ray.new_dashboard.memory_utils as memory_utils
from collections import defaultdict
from ray.new_dashboard.actor_utils import actor_classname_from_task_spec
from ray.new_dashboard.utils import Dict, Signal
@@ -61,18 +62,35 @@ class DataOrganizer:
@classmethod
async def get_node_actors(cls, node_id):
node_stats = DataSource.node_stats.get(node_id, {})
worker_id_to_info = {}
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
worker_id_to_raylet_info = {}
pid_to_worker_id = {}
for worker_stats in node_stats.get("workersStats", []):
worker_id_to_info[worker_stats["workerId"]] = worker_stats
worker_id_to_raylet_info[worker_stats["workerId"]] = worker_stats
pid_to_worker_id[worker_stats["pid"]] = worker_stats["workerId"]
worker_id_to_process_info = {}
for process_stats in node_physical_stats.get("workers"):
if process_stats["pid"] in pid_to_worker_id:
worker_id = pid_to_worker_id[process_stats["pid"]]
worker_id_to_process_info[worker_id] = process_stats
worker_id_to_gpu_stats = defaultdict(list)
for gpu_stats in node_physical_stats.get("gpus"):
for process in gpu_stats.get("processes", []):
if process["pid"] in pid_to_worker_id:
worker_id = pid_to_worker_id[process["pid"]]
worker_id_to_gpu_stats[worker_id].append(gpu_stats)
node_actors = {}
for actor_id, actor_table_data in DataSource.actors.items():
if actor_table_data["address"]["workerId"] in worker_id_to_info:
worker_stats = worker_id_to_info[actor_table_data["address"][
"workerId"]]
actor_constructor = worker_stats.get("coreWorkerStats", {})\
.get("actorTitle", "Unknown actor constructor")
worker_id = actor_table_data["address"]["workerId"]
if worker_id in worker_id_to_raylet_info:
worker_raylet_stats = worker_id_to_raylet_info[worker_id]
core_worker = worker_raylet_stats.get("coreWorkerStats", {})
actor_constructor = core_worker.get(
"actorTitle", "Unknown actor constructor")
actor_table_data["actorConstructor"] = actor_constructor
@@ -80,8 +98,12 @@ class DataOrganizer:
actor_table_data.get("taskSpec", {}))
actor_table_data["actorClass"] = actor_class
actor_table_data.update(worker_stats["coreWorkerStats"])
actor_table_data.update(core_worker)
node_actors[actor_id] = actor_table_data
actor_table_data["gpus"] = worker_id_to_gpu_stats.get(
worker_id, [])
actor_table_data["processStats"] = worker_id_to_process_info.get(
worker_id, {})
return node_actors
@classmethod
@@ -24,6 +24,7 @@ class ReportHead(dashboard_utils.DashboardHeadModule):
# Set up the module's initial state and subscribe to agent changes.
def __init__(self, dashboard_head):
super().__init__(dashboard_head)
# Starts empty; presumably filled in by _update_stubs -- confirm there.
self._stubs = {}
# No ray config is held until something assigns one.
self._ray_config = None
# Register _update_stubs as a handler on the agents data signal.
DataSource.agents.signal.append(self._update_stubs)
async def _update_stubs(self, change):
+1
View File
@@ -60,6 +60,7 @@ if __name__ == "__main__":
do_link("experimental", force=args.yes)
do_link("util", force=args.yes)
do_link("dashboard", force=args.yes)
do_link("new_dashboard", force=args.yes)
print("Created links.\n\nIf you run into issues initializing Ray, please "
"ensure that your local repo and the installed Ray are in sync "
"(pip install -U the latest wheels at "