[Dashboard] Logical View Actor Class Grouping Details (#10453)

* wip

* wip

* wip

* wip

* The dashboard needs to track the timestamp at which each actor is created. This adds that functionality back in and deletes unused code.

* Add the Material-UI lab package (`@material-ui/lab`) to get access to the Alert component, and fix some vulnerabilities reported by `npm audit`.

* Finish supporting information on a per-actor-class basis in the logical view, add bug fixes around timestamps and infeasible task names, and add a new warning popup that shows if there are infeasible actors around.

* lint and add seconds annotation to actor lifetime values

* real lint

* remove typo

* Somehow missed something last lint

* Add new comments for actor states

* Add underscores to some private functions

* Add tooltips to the actor states on the logical view

* change test metrics to be aligned with new changes.

* lint

* Remove some unnecessary log lines and catch error that happens when we try to decode data from an unexpected source

* Re-add a function I had removed. It is used in the Java codebase.

Co-authored-by: Max Fitton <max@semprehealth.com>
This commit is contained in:
Max Fitton
2020-09-09 10:34:54 -07:00
committed by GitHub
parent 799318d7d7
commit 3e8164ff8a
16 changed files with 2121 additions and 3293 deletions
File diff suppressed because it is too large Load Diff
+4 -2
View File
@@ -5,6 +5,7 @@
"dependencies": {
"@material-ui/core": "4.11.0",
"@material-ui/icons": "^4.9.1",
"@material-ui/lab": "^4.0.0-alpha.56",
"@reduxjs/toolkit": "^1.3.1",
"@types/classnames": "^2.2.10",
"@types/jest": "25.1.4",
@@ -18,9 +19,10 @@
"react-dom": "^16.13.1",
"react-redux": "^7.2.0",
"react-router-dom": "^5.1.2",
"react-scripts": "3.4.1",
"react-scripts": "^3.4.3",
"typeface-roboto": "0.0.75",
"typescript": "3.8.3"
"typescript": "3.8.3",
"use-debounce": "^3.4.3"
},
"devDependencies": {
"eslint-plugin-import": "^2.20.1",
+30 -14
View File
@@ -147,15 +147,19 @@ export type RayletWorkerStats = {
};
export enum ActorState {
Invalid = -1,
DependenciesUnready = 0,
PendingCreation = 1,
Alive = 2,
Restarting = 3,
Dead = 4,
// These two are virtual states that we report because there is
// an existing task to create an actor
Infeasible = -2, // Actor task is waiting on resources (e.g. RAM, CPUs or GPUs) that the cluster does not have
PendingResources = -1, // Actor task is waiting on resources the cluster has but are in-use
// The rest below are "official" GCS actor states
DependenciesUnready = 0, // Actor is pending on an argument to be ready
PendingCreation = 1, // Actor creation is running
Alive = 2, // Actor is alive and handling tasks
Restarting = 3, // Actor died and is being restarted
Dead = 4, // Actor died and is not being restarted
}
export type ActorInfo = FullActorInfo | PartialActorInfo;
export type ActorInfo = FullActorInfo | ActorTaskInfo;
export type FullActorInfo = {
actorId: string;
@@ -185,12 +189,11 @@ export type FullActorInfo = {
webuiDisplay?: Record<string, string>;
};
export type PartialActorInfo = {
export type ActorTaskInfo = {
actorId?: string;
actorTitle?: string;
requiredResources?: { [key: string]: number };
state: ActorState.Invalid;
invalidStateType?: InvalidStateType;
state: ActorState.Infeasible | ActorState.PendingResources;
};
// eslint-disable-next-line
@@ -200,10 +203,23 @@ export function isFullActorInfo(
// Lint disabled because arrow functions don't play well with type guards.
// This function is used to determine what kind of information we have about
// a given actor in a response based on its state.
return actorInfo.state !== ActorState.Invalid;
return (
actorInfo.state !== ActorState.Infeasible &&
actorInfo.state !== ActorState.PendingResources
);
}
export type InvalidStateType = "infeasibleActor" | "pendingActor";
// Aggregate statistics describing one group of actors (see `ActorGroup.summary`).
export type ActorGroupSummary = {
// Number of actors in the group, keyed by actor state.
// NOTE(review): the logical view guards reads of this map with `in` checks
// and `?? 0`, which suggests states with no actors may be absent from the
// payload — confirm against the backend and consider making keys optional.
stateToCount: { [state in ActorState]: number };
// Mean actor lifetime. The UI renders this value with an "s" suffix, so it is
// presumably expressed in seconds — confirm.
avgLifetime: number;
// Longest actor lifetime; rendered with the same "s" suffix as `avgLifetime`.
maxLifetime: number;
// NOTE(review): presumably the total number of tasks executed by the actors
// in this group — confirm against the backend.
numExecutedTasks: number;
};
// A group of actors as delivered under `RayletInfoResponse.actorGroups`,
// together with summary statistics for the group.
export type ActorGroup = {
// The individual actors (full records or pending actor-creation tasks).
entries: ActorInfo[];
// Aggregate statistics for `entries`.
summary: ActorGroupSummary;
};
export type RayletInfoResponse = {
nodes: {
@@ -212,8 +228,8 @@ export type RayletInfoResponse = {
workersStats: Array<RayletWorkerStats>;
};
};
actors: {
[actorId: string]: ActorInfo;
actorGroups: {
[groupKey: string]: ActorGroup;
};
plasmaStats: {
[ip: string]: PlasmaStats;
@@ -1,8 +1,8 @@
import { Grid, makeStyles, Tooltip } from "@material-ui/core";
import React from "react";
import { Box, Grid, makeStyles, Tooltip } from "@material-ui/core";
import React, { ReactChild } from "react";
type LabeledDatumProps = {
label: string;
label: ReactChild;
datum: any;
tooltip?: string;
};
@@ -26,7 +26,7 @@ const LabeledDatum: React.FC<LabeledDatumProps> = ({
const innerHtml = (
<Grid container item xs={6}>
<Grid item xs={6}>
<span className={tooltip && classes.tooltipLabel}>{label}</span>
<Box className={tooltip && classes.tooltipLabel}>{label}</Box>
</Grid>
<Grid item xs={6}>
<span>{datum}</span>
@@ -1,5 +1,3 @@
import { useEffect, useRef } from "react";
export const getWeightedAverage = (
input: {
weight: number;
@@ -26,24 +24,3 @@ export const filterObj = (obj: Object, filterFn: any) =>
/**
 * Builds a new object by passing every `[key, value]` entry of `obj` through
 * `filterFn`, which must return the `[key, value]` pair to store in the result.
 */
export const mapObj = (obj: Object, filterFn: any) => {
  const transformedEntries = Object.entries(obj).map(filterFn);
  return Object.fromEntries(transformedEntries);
};
/**
 * React hook that invokes `callback` once immediately and then every
 * `delayMs` milliseconds for as long as the calling component stays mounted.
 *
 * The most recent `callback` is kept in a ref so that every tick calls the
 * latest function. The timer is (re)started whenever `callback` or `delayMs`
 * changes and is cleared when the component unmounts.
 *
 * @param callback Function to run on every tick.
 * @param delayMs Delay between ticks, in milliseconds.
 * @returns A function that stops the interval that is running at the time it
 *   is called (a no-op if none is running).
 */
export const useInterval = (callback: Function, delayMs: number) => {
  const savedCallback = useRef<any>();
  const intervalId = useRef<any>();

  useEffect(() => {
    savedCallback.current = callback;
  }, [callback]);

  useEffect(() => {
    // Guard the stored callback itself: the ref object is never nullish, so
    // `savedCallback?.current()` protected nothing.
    const tick = () => savedCallback.current?.();
    intervalId.current = setInterval(tick, delayMs);
    tick();
    return () => {
      if (intervalId.current) {
        clearInterval(intervalId.current);
        intervalId.current = undefined;
      }
    };
  }, [callback, delayMs]);

  // Read the ref when the canceller is *called* rather than when it is
  // created. During the first render the effect has not run yet, so a
  // canceller built from the render-time value would silently do nothing.
  return () => {
    if (intervalId.current) {
      clearInterval(intervalId.current);
      intervalId.current = undefined;
    }
  };
};
@@ -1,11 +1,5 @@
import {
createStyles,
Theme,
Typography,
withStyles,
WithStyles,
} from "@material-ui/core";
import React from "react";
import { createStyles, makeStyles, Theme, Typography } from "@material-ui/core";
import React, { useState } from "react";
import {
ActorInfo,
ActorState,
@@ -21,7 +15,8 @@ import ActorDetailsPane from "./ActorDetailsPane";
const memoryDebuggingDocLink =
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";
const styles = (theme: Theme) =>
const useActorStyles = makeStyles((theme: Theme) =>
createStyles({
root: {
borderColor: theme.palette.divider,
@@ -42,10 +37,10 @@ const styles = (theme: Theme) =>
cursor: "pointer",
},
},
invalidStateTypeInfeasible: {
infeasible: {
color: theme.palette.error.main,
},
invalidStateTypePendingActor: {
pendingResources: {
color: theme.palette.secondary.main,
},
@@ -56,51 +51,44 @@ const styles = (theme: Theme) =>
fontSize: "0.875rem",
display: "inline",
},
});
}),
);
type Props = {
type ActorProps = {
actor: ActorInfo;
};
type State = {
profiling: {
[profilingId: string]: {
startTime: number;
latestResponse: CheckProfilingStatusResponse | null;
};
type profilingInfo = {
[profilingId: string]: {
startTime: number;
latestResponse: CheckProfilingStatusResponse | null;
};
};
class Actor extends React.Component<Props & WithStyles<typeof styles>, State> {
state: State = {
profiling: {},
};
const Actor: React.FC<ActorProps> = ({ actor }) => {
const [profiling, setProfiling] = useState<profilingInfo>({});
const classes = useActorStyles();
handleProfilingClick = (duration: number) => async () => {
const actor = this.props.actor;
const handleProfilingClick = (duration: number) => async () => {
if (actor.state === ActorState.Alive) {
const profilingId = await launchProfiling(
actor.nodeId,
actor.pid,
duration,
);
this.setState((state) => ({
profiling: {
...state.profiling,
[profilingId]: { startTime: Date.now(), latestResponse: null },
},
}));
setProfiling({
...profiling,
[profilingId]: { startTime: Date.now(), latestResponse: null },
});
const checkProfilingStatusLoop = async () => {
const response = await checkProfilingStatus(profilingId);
this.setState((state) => ({
profiling: {
...state.profiling,
[profilingId]: {
...state.profiling[profilingId],
latestResponse: response,
},
setProfiling({
...profiling,
[profilingId]: {
...profiling[profilingId],
latestResponse: response,
},
}));
});
if (response.status === "pending") {
setTimeout(checkProfilingStatusLoop, 1000);
}
@@ -109,204 +97,195 @@ class Actor extends React.Component<Props & WithStyles<typeof styles>, State> {
}
};
killActor = () => {
const actor = this.props.actor;
const killActor = () => {
if (actor.state === ActorState.Alive) {
launchKillActor(actor.actorId, actor.ipAddress, actor.port);
}
};
render() {
const { classes, actor } = this.props;
const { profiling } = this.state;
const invalidStateType = isFullActorInfo(actor)
? undefined
: actor.invalidStateType;
const information = isFullActorInfo(actor)
? [
{
label: "Resources",
value:
Object.entries(actor.usedResources).length > 0 &&
Object.entries(actor.usedResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(
([key, value]) =>
`${sum(
value.resourceSlots.map((slot) => slot.allocation),
)} ${key}`,
)
.join(", "),
},
{
label: "Number of pending tasks",
value: actor.taskQueueLength.toLocaleString(),
tooltip:
"The number of tasks that are currently pending to execute on this actor. If this number " +
"remains consistently high, it may indicate that this actor is a bottleneck in your application.",
},
{
label: "Number of executed tasks",
value: actor.numExecutedTasks.toLocaleString(),
tooltip:
"The number of tasks this actor has executed throughout its lifetimes.",
},
{
label: "Number of ObjectRefs in scope",
value: actor.numObjectRefsInScope.toLocaleString(),
tooltip:
"The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
"This does not imply that the objects are in active use or colocated on the node with the actor " +
`currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
"for more information.",
},
{
label: "Number of local objects",
value: actor.numLocalObjects.toLocaleString(),
tooltip:
"The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
`debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
},
{
label: "Object store memory used (MiB)",
value: actor.usedObjectStoreMemory.toLocaleString(),
tooltip:
"The total amount of memory that this actor is occupying in the Ray object store. " +
"If this number is increasing without bounds, you might have a memory leak. See " +
`the docs at: ${memoryDebuggingDocLink} for more information.`,
},
]
: [
{
label: "Actor ID",
value: actor.actorId,
tooltip: "",
},
{
label: "Required resources",
value:
actor.requiredResources &&
Object.entries(actor.requiredResources).length > 0 &&
Object.entries(actor.requiredResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
.join(", "),
tooltip: "",
},
];
const information = isFullActorInfo(actor)
? [
{
label: "Resources",
value:
Object.entries(actor.usedResources).length > 0 &&
Object.entries(actor.usedResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(
([key, value]) =>
`${sum(
value.resourceSlots.map((slot) => slot.allocation),
)} ${key}`,
)
.join(", "),
},
{
label: "Number of pending tasks",
value: actor.taskQueueLength.toLocaleString(),
tooltip:
"The number of tasks that are currently pending to execute on this actor. If this number " +
"remains consistently high, it may indicate that this actor is a bottleneck in your application.",
},
{
label: "Number of executed tasks",
value: actor.numExecutedTasks.toLocaleString(),
tooltip:
"The number of tasks this actor has executed throughout its lifetimes.",
},
{
label: "Number of ObjectRefs in scope",
value: actor.numObjectRefsInScope.toLocaleString(),
tooltip:
"The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
"This does not imply that the objects are in active use or colocated on the node with the actor " +
`currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
"for more information.",
},
{
label: "Number of local objects",
value: actor.numLocalObjects.toLocaleString(),
tooltip:
"The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
`debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
},
{
label: "Object store memory used (MiB)",
value: actor.usedObjectStoreMemory.toLocaleString(),
tooltip:
"The total amount of memory that this actor is occupying in the Ray object store. " +
"If this number is increasing without bounds, you might have a memory leak. See " +
`the docs at: ${memoryDebuggingDocLink} for more information.`,
},
]
: [
{
label: "Actor ID",
value: actor.actorId,
tooltip: "",
},
{
label: "Required resources",
value:
actor.requiredResources &&
Object.entries(actor.requiredResources).length > 0 &&
Object.entries(actor.requiredResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
.join(", "),
tooltip: "",
},
];
// Construct the custom message from the actor.
let actorCustomDisplay: JSX.Element[] = [];
if (isFullActorInfo(actor) && actor.webuiDisplay) {
actorCustomDisplay = Object.keys(actor.webuiDisplay)
.sort()
.map((key, _, __) => {
// Construct the value from actor.
// Please refer to worker.py::show_in_dashboard for schema.
const valueEncoded = actor.webuiDisplay![key];
const valueParsed = JSON.parse(valueEncoded);
let valueRendered = valueParsed["message"];
if (valueParsed["dtype"] === "html") {
valueRendered = (
<div
className={classes.inlineHTML}
dangerouslySetInnerHTML={{ __html: valueRendered }}
></div>
);
}
// Construct the custom message from the actor.
let actorCustomDisplay: JSX.Element[] = [];
if (isFullActorInfo(actor) && actor.webuiDisplay) {
actorCustomDisplay = Object.keys(actor.webuiDisplay)
.sort()
.map((key, _, __) => {
// Construct the value from actor.
// Please refer to worker.py::show_in_webui for schema.
const valueEncoded = actor.webuiDisplay![key];
const valueParsed = JSON.parse(valueEncoded);
let valueRendered = valueParsed["message"];
if (valueParsed["dtype"] === "html") {
valueRendered = (
<div
className={classes.inlineHTML}
dangerouslySetInnerHTML={{ __html: valueRendered }}
></div>
);
}
if (key === "") {
return (
<Typography className={classes.webuiDisplay}>
&nbsp; &nbsp; {valueRendered}
</Typography>
);
} else {
return (
<Typography className={classes.webuiDisplay}>
&nbsp; &nbsp; {key}: {valueRendered}
</Typography>
);
}
});
}
if (key === "") {
return (
<Typography className={classes.webuiDisplay}>
&nbsp; &nbsp; {valueRendered}
</Typography>
);
} else {
return (
<Typography className={classes.webuiDisplay}>
&nbsp; &nbsp; {key}: {valueRendered}
</Typography>
);
}
});
}
return (
<div className={classes.root}>
<Typography className={classes.title}>
{isFullActorInfo(actor) ? (
<React.Fragment>
Actor {actor.actorId} (Profile for
{[10, 30, 60].map((duration) => (
<React.Fragment>
{" "}
<span
className={classes.action}
onClick={this.handleProfilingClick(duration)}
>
{duration}s
</span>
</React.Fragment>
))}
){" "}
{actor.state === ActorState.Alive && (
<span className={classes.action} onClick={this.killActor}>
Kill Actor
</span>
)}
{Object.entries(profiling).map(
([profilingId, { startTime, latestResponse }]) =>
latestResponse !== null && (
<React.Fragment>
(
{latestResponse.status === "pending" ? (
`Profiling for ${Math.round(
(Date.now() - startTime) / 1000,
)}s...`
) : latestResponse.status === "finished" ? (
<a
className={classes.action}
href={getProfilingResultURL(profilingId)}
rel="noopener noreferrer"
target="_blank"
>
Profiling result
</a>
) : latestResponse.status === "error" ? (
`Profiling error: ${latestResponse.error.trim()}`
) : undefined}
){" "}
</React.Fragment>
),
)}
</React.Fragment>
) : actor.invalidStateType === "infeasibleActor" ? (
<span className={classes.invalidStateTypeInfeasible}>
{actor.actorTitle} cannot be created because the Ray cluster
cannot satisfy its resource requirements.
</span>
) : (
<span className={classes.invalidStateTypePendingActor}>
{actor.actorTitle} is pending until resources are available.
</span>
)}
</Typography>
<ActorDetailsPane
actorDetails={information}
actorTitle={actor.actorTitle ?? ""}
actorState={actor.state}
invalidStateType={invalidStateType}
/>
{isFullActorInfo(actor) && (
return (
<div className={classes.root}>
<Typography className={classes.title}>
{isFullActorInfo(actor) ? (
<React.Fragment>
{actorCustomDisplay.length > 0 && (
<React.Fragment>{actorCustomDisplay}</React.Fragment>
Actor {actor.actorId} (Profile for
{[10, 30, 60].map((duration) => (
<React.Fragment>
{" "}
<span
className={classes.action}
onClick={handleProfilingClick(duration)}
>
{duration}s
</span>
</React.Fragment>
))}
){" "}
{actor.state === ActorState.Alive && (
<span className={classes.action} onClick={killActor}>
Kill Actor
</span>
)}
{Object.entries(profiling).map(
([profilingId, { startTime, latestResponse }]) =>
latestResponse !== null && (
<React.Fragment>
(
{latestResponse.status === "pending" ? (
`Profiling for ${Math.round(
(Date.now() - startTime) / 1000,
)}s...`
) : latestResponse.status === "finished" ? (
<a
className={classes.action}
href={getProfilingResultURL(profilingId)}
rel="noopener noreferrer"
target="_blank"
>
Profiling result
</a>
) : latestResponse.status === "error" ? (
`Profiling error: ${latestResponse.error.trim()}`
) : undefined}
){" "}
</React.Fragment>
),
)}
</React.Fragment>
) : actor.state === ActorState.Infeasible ? (
<span className={classes.infeasible}>
{actor.actorTitle} cannot be created because the Ray cluster cannot
satisfy its resource requirements.
</span>
) : (
<span className={classes.pendingResources}>
{actor.actorTitle} is pending until resources are available.
</span>
)}
</div>
);
}
}
</Typography>
<ActorDetailsPane
actorDetails={information}
actorTitle={actor.actorTitle ?? ""}
actorState={actor.state}
/>
{isFullActorInfo(actor) && (
<React.Fragment>
{actorCustomDisplay.length > 0 && (
<React.Fragment>{actorCustomDisplay}</React.Fragment>
)}
</React.Fragment>
)}
</div>
);
};
export default withStyles(styles)(Actor);
export default Actor;
@@ -1,22 +1,33 @@
import {
Accordion,
AccordionDetails,
AccordionSummary,
Box,
createStyles,
Grid,
makeStyles,
Paper,
styled,
Typography,
} from "@material-ui/core";
import ExpandMoreIcon from "@material-ui/icons/ExpandMore";
import React from "react";
import { ActorInfo } from "../../../api";
import React, { useState } from "react";
import { ActorGroup, ActorState } from "../../../api";
import { Expander, Minimizer } from "../../../common/ExpandControls";
import LabeledDatum from "../../../common/LabeledDatum";
import Actor from "./Actor";
import ActorStateRepr from "./ActorStateRepr";
// Formats a numeric duration for display by appending the "s" unit suffix.
const asSeconds = (n: number) => String(n) + "s";
// A `Box` whose content is centre-aligned; used in this file to wrap the
// expand / minimize controls.
const CenteredBox = styled(Box)({
textAlign: "center",
});
const useActorClassGroupStyles = makeStyles((theme) =>
createStyles({
container: {
margin: theme.spacing(1),
padding: theme.spacing(1),
marginLeft: theme.spacing(2),
},
title: {
margin: theme.spacing(1),
},
actorEntry: {
width: "100%",
@@ -26,30 +37,90 @@ const useActorClassGroupStyles = makeStyles((theme) =>
type ActorClassGroupProps = {
title: string;
actors: ActorInfo[];
actorGroup: ActorGroup;
};
const ActorClassGroup: React.FC<ActorClassGroupProps> = ({ actors, title }) => {
const ActorClassGroup: React.FC<ActorClassGroupProps> = ({
actorGroup,
title,
}) => {
const classes = useActorClassGroupStyles();
const entries = actors.map((actor, i) => (
const [expanded, setExpanded] = useState(false);
const toggleExpanded = () => setExpanded(!expanded);
const entries = actorGroup.entries.map((actor, i) => (
<Box component="div" className={classes.actorEntry}>
<Actor actor={actor} key={actor.actorId ?? i} />
</Box>
));
const { Alive, PendingResources, Infeasible } = ActorState;
const summary = actorGroup.summary;
return (
<Paper className={classes.container}>
<Accordion defaultExpanded={true}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
>
<Typography variant="h5">{title}</Typography>
</AccordionSummary>
<AccordionDetails>
<Box display="block" className={classes.title}>
<Typography variant="h5">{title}</Typography>
</Box>
<Grid container className={classes.title}>
<LabeledDatum
label={
<ActorStateRepr state={Alive} variant="body1" showTooltip={true} />
}
datum={
Alive in summary.stateToCount ? summary.stateToCount[Alive] : 0
}
/>
<LabeledDatum
label={
<ActorStateRepr
state={Infeasible}
variant="body1"
showTooltip={true}
/>
}
datum={
Infeasible in summary.stateToCount
? summary.stateToCount[Infeasible]
: 0
}
/>
<LabeledDatum
label={
<ActorStateRepr
state={PendingResources}
variant="body1"
showTooltip={true}
/>
}
datum={
PendingResources in summary.stateToCount
? summary.stateToCount[PendingResources]
: 0
}
/>
<LabeledDatum
label={"Mean Lifetime"}
datum={asSeconds(summary.avgLifetime)}
/>
<LabeledDatum
label={"Max Lifetime"}
datum={asSeconds(summary.maxLifetime)}
/>
<LabeledDatum
label={"Executed Tasks"}
datum={summary.numExecutedTasks}
/>
</Grid>
{expanded ? (
<React.Fragment>
<Box>{entries}</Box>
</AccordionDetails>
</Accordion>
<CenteredBox>
<Minimizer onClick={toggleExpanded} />
</CenteredBox>
</React.Fragment>
) : (
<CenteredBox>
<Expander onClick={toggleExpanded} />
</CenteredBox>
)}
</Paper>
);
};
@@ -1,39 +1,56 @@
import React from "react";
import { ActorInfo } from "../../../api";
import { Snackbar } from "@material-ui/core";
import { Alert } from "@material-ui/lab";
import React, { useState } from "react";
import { ActorGroup, ActorState } from "../../../api";
import { stableSort } from "../../../common/tableUtils";
import { sum } from "../../../common/util";
import ActorClassGroup from "./ActorClassGroup";
type ActorClassGroupsProps = {
actors: ActorInfo[];
actorGroups: { [groupKey: string]: ActorGroup };
};
/**
 * Extracts the class name from an actor title.
 *
 * Given a Python-style title such as `Foo(arg1, arg2)` this returns `"Foo"`.
 * Only the text before the FIRST opening parenthesis is used, so titles whose
 * arguments themselves contain parentheses (e.g. `Foo(Bar(1), 2)`) still
 * resolve to `"Foo"`.
 *
 * @param actor Actor whose `actorTitle` is inspected.
 * @returns The class name, or `undefined` when the title is missing or does
 *   not contain any text followed by an opening parenthesis.
 */
const extractClassName = (actor: ActorInfo) => {
  // `[^(]+` stops at the first "(", unlike a greedy `.+` which would run to
  // the last one and swallow part of the argument list.
  const re = /^([^(]+)\(/;
  const matches = actor.actorTitle?.match(re);
  return matches ? matches[1] : undefined;
};
const ActorClassGroups: React.FC<ActorClassGroupsProps> = ({ actors }) => {
const groups = new Map();
actors.forEach((actor) => {
const className = extractClassName(actor) ?? "Unknown Class";
const existingGroup = groups.get(className);
if (existingGroup) {
existingGroup.push(actor);
} else {
groups.set(className, [actor]);
const ActorClassGroups: React.FC<ActorClassGroupsProps> = ({ actorGroups }) => {
const numInfeasible = (group: ActorGroup) =>
group.summary.stateToCount[ActorState.Infeasible] ?? 0;
const totalInfeasible = sum(Object.values(actorGroups).map(numInfeasible));
const [warningOpen, setWarningOpen] = useState(totalInfeasible > 0);
const groupComparator = (
[title1, group1]: [string, ActorGroup],
[title2, group2]: [string, ActorGroup],
) => {
const infeasible1 = numInfeasible(group1);
const infeasible2 = numInfeasible(group2);
if (infeasible1 !== infeasible2) {
return infeasible1 > infeasible2 ? -1 : 1;
}
});
return title1 > title2 ? 1 : -1;
};
const children = stableSort(
Object.entries(actorGroups),
groupComparator,
).map(([title, actorGroup]) => (
<ActorClassGroup
actorGroup={actorGroup}
title={title}
key={`acg-${title}`}
/>
));
const children = Array.from(groups)
.sort(([title], [title2]) => (title > title2 ? 1 : -1))
.map(([title, actorGroup]) => (
<ActorClassGroup title={title} actors={actorGroup} key={`acg-${title}`} />
));
return <React.Fragment>{children}</React.Fragment>;
return (
<React.Fragment>
<Snackbar open={warningOpen}>
<Alert severity="warning" onClose={() => setWarningOpen(false)}>
There are one or more actors that cannot currently be created due to
insufficient cluster resources. These have been sorted to the top of
the list. If you are using autoscaling functionality, you may ignore
this message.
</Alert>
</Snackbar>
{children}
</React.Fragment>
);
};
export default ActorClassGroups;
@@ -1,81 +1,11 @@
import {
createStyles,
Divider,
Grid,
makeStyles,
Theme,
} from "@material-ui/core";
import { Divider, Grid, makeStyles, Theme } from "@material-ui/core";
import React from "react";
import { ActorState, InvalidStateType } from "../../../api";
import { ActorState } from "../../../api";
import LabeledDatum from "../../../common/LabeledDatum";
type ActorStateReprProps = {
state: ActorState;
ist?: InvalidStateType;
};
// Text colours for the actor-state labels rendered by `ActorStateRepr`.
// Each key is a CSS class applied to the label for the corresponding state.
const actorStateReprStyles = makeStyles((theme: Theme) =>
createStyles({
// Invalid state whose type is "infeasibleActor".
infeasible: {
color: theme.palette.error.light,
},
// Invalid state whose type is "pendingActor".
pending: {
color: theme.palette.warning.light,
},
// Invalid state with no recognised type.
unknown: {
color: theme.palette.warning.light,
},
// Shared by the PendingCreation and DependenciesUnready states.
creating: {
color: theme.palette.success.light,
},
alive: {
color: theme.palette.success.dark,
},
restarting: {
color: theme.palette.warning.light,
},
// Fixed light grey rather than a theme colour.
dead: {
color: "#cccccc",
},
}),
);
/**
 * Renders a coloured, human-readable label for an actor's state.
 *
 * For `ActorState.Invalid` the label is refined by `ist`: "infeasibleActor"
 * renders "Infeasible", "pendingActor" renders "Pending", and anything else
 * renders "Unknown". A state value that is not one of the known enum members
 * also renders "Unknown" instead of returning nothing from the component.
 *
 * @param state The actor state to display.
 * @param ist Optional sub-type used only when `state` is `Invalid`.
 */
const ActorStateRepr: React.FC<ActorStateReprProps> = ({ state, ist }) => {
  const classes = actorStateReprStyles();
  const {
    Alive,
    Dead,
    PendingCreation,
    Restarting,
    DependenciesUnready,
    Invalid,
  } = ActorState;
  switch (state) {
    case Invalid:
      if (ist === "infeasibleActor") {
        return <div className={classes.infeasible}>Infeasible</div>;
      }
      if (ist === "pendingActor") {
        return <div className={classes.pending}>Pending</div>;
      }
      return <div className={classes.unknown}>Unknown</div>;
    case PendingCreation:
      return <div className={classes.creating}>Creating</div>;
    case DependenciesUnready:
      return <div className={classes.creating}>Dependencies Unready</div>;
    case Alive:
      return <div className={classes.alive}>Alive</div>;
    case Restarting:
      return <div className={classes.restarting}>Restarting</div>;
    case Dead:
      return <div className={classes.dead}>Dead</div>;
    default:
      // The state comes from a server response, so a value outside the enum
      // is possible at runtime; a component must still return an element.
      return <div className={classes.unknown}>Unknown</div>;
  }
};
import ActorStateRepr from "./ActorStateRepr";
type ActorDetailsPaneProps = {
actorTitle: string;
invalidStateType?: InvalidStateType;
actorState: ActorState;
actorDetails: {
label: string;
@@ -104,14 +34,13 @@ const ActorDetailsPane: React.FC<ActorDetailsPaneProps> = ({
actorTitle,
actorDetails,
actorState,
invalidStateType,
}) => {
const classes = useStyles();
return (
<React.Fragment>
<div className={classes.actorTitleWrapper}>
<div>{actorTitle}</div>
<ActorStateRepr ist={invalidStateType} state={actorState} />
<ActorStateRepr state={actorState} />
</div>
<Divider className={classes.divider} />
<Grid container className={classes.detailsPane}>
@@ -0,0 +1,148 @@
import {
createStyles,
makeStyles,
Theme,
Tooltip,
Typography,
} from "@material-ui/core";
import React from "react";
import { ActorState } from "../../../api";
// Props accepted by `ActorStateRepr`.
type ActorStateReprProps = {
// The actor state to render as a label.
state: ActorState;
// When truthy, the label is wrapped in a tooltip explaining the state.
showTooltip?: boolean;
// Forwarded to the `variant` prop of the Typography label; "body1" is used
// when omitted.
variant?: any;
};
// Pull the enum members into module scope so they can be used below as bare
// `case` labels and as computed keys of the state-to-tooltip map.
const {
Alive,
Dead,
PendingCreation,
Restarting,
DependenciesUnready,
Infeasible,
PendingResources,
} = ActorState;
// CSS classes for the actor-state labels: one text colour per kind of state,
// plus the cursor style applied when a tooltip is shown.
const useActorStateReprStyles = makeStyles((theme: Theme) =>
createStyles({
infeasible: {
color: theme.palette.error.light,
},
pendingResources: {
color: theme.palette.warning.light,
},
unknown: {
color: theme.palette.warning.light,
},
// Shared by the PendingCreation and DependenciesUnready labels.
creating: {
color: theme.palette.success.light,
},
alive: {
color: theme.palette.success.dark,
},
restarting: {
color: theme.palette.warning.light,
},
// Fixed light grey rather than a theme colour.
dead: {
color: "#cccccc",
},
// "help" cursor signalling that hovering reveals an explanation.
tooltip: {
cursor: "help",
},
}),
);
// User-facing explanations of each actor state, shown as tooltip text.
const infeasibleTooltip =
"The actor cannot be created because of insufficient resources in the cluster. Please examine its resource constraints to make sure they are correct or add additional compute to your cluster.";
const pendingResourcesTooltip =
"The actor is pending resources, such as GPU, Memory, or CPU. It will be created when they become available.";
const aliveTooltip = "The actor is alive and handling remote calls.";
const deadTooltip = "The actor is dead and will not be restarted anymore.";
const restartingTooltip = "The actor died and is restarting.";
const pendingCreationTooltip =
"The actor's resources and other dependencies are ready, and the Ray backend is processing its creation.";
const dependenciesUnreadyTooltip =
"The actor is pending creation because it is waiting for one or more of its initialization arguments to be ready.";
// Lookup from actor state to its tooltip text, keyed by the enum value.
const stateToTooltip = {
[Alive]: aliveTooltip,
[Dead]: deadTooltip,
[Infeasible]: infeasibleTooltip,
[Restarting]: restartingTooltip,
[PendingCreation]: pendingCreationTooltip,
[DependenciesUnready]: dependenciesUnreadyTooltip,
[PendingResources]: pendingResourcesTooltip,
};
/**
 * Renders a coloured, human-readable label for an actor state, optionally
 * wrapped in a tooltip that explains what the state means.
 *
 * A state value that is not one of the known enum members is rendered as
 * "Unknown" rather than leaving the component with nothing to return.
 *
 * @param state The actor state to display.
 * @param variant Typography variant for the label; defaults to "body1".
 * @param showTooltip When truthy, wrap the label in an explanatory tooltip.
 */
const ActorStateRepr: React.FC<ActorStateReprProps> = ({
  state,
  variant,
  showTooltip,
}) => {
  const classes = useActorStateReprStyles();
  const variantOrDefault = variant ?? "body1";

  // Every state differs only in its colour class and label text, so resolve
  // those two values and build the Typography element once.
  let className: string;
  let label: string;
  switch (state) {
    case Infeasible:
      className = classes.infeasible;
      label = "Infeasible";
      break;
    case PendingResources:
      className = classes.pendingResources;
      label = "Pending Resources";
      break;
    case PendingCreation:
      className = classes.creating;
      label = "Creating";
      break;
    case DependenciesUnready:
      className = classes.creating;
      label = "Dependencies Unready";
      break;
    case Alive:
      className = classes.alive;
      label = "Alive";
      break;
    case Restarting:
      className = classes.restarting;
      label = "Restarting";
      break;
    case Dead:
      className = classes.dead;
      label = "Dead";
      break;
    default:
      // The state comes from a server response, so a value outside the enum
      // is possible at runtime. Without this branch the body would be
      // undefined and neither React nor Tooltip accepts a missing element.
      className = classes.unknown;
      label = "Unknown";
  }

  const body = (
    <Typography variant={variantOrDefault} className={className}>
      {label}
    </Typography>
  );

  if (!showTooltip) {
    return body;
  }
  return (
    // An empty title makes the Tooltip render nothing for unrecognised states.
    <Tooltip className={classes.tooltip} title={stateToTooltip[state] ?? ""}>
      {body}
    </Tooltip>
  );
};
export default ActorStateRepr;
@@ -1,28 +0,0 @@
import React, { Fragment } from "react";
import { ActorState, RayletInfoResponse } from "../../../api";
import Actor from "./Actor";
// Props for `Actors`: the `actors` map taken from the raylet info response.
type ActorProps = {
actors: RayletInfoResponse["actors"];
};
/**
 * Renders one `Actor` per entry of `props.actors`, listing actors that are
 * not dead before dead ones.
 */
const Actors = (props: ActorProps) => {
  const { actors } = props;
  // Rank dead actors after all others. Subtracting ranks yields 0 for ties,
  // which keeps the comparator consistent: returning 1 for two live actors
  // (in either argument order) gives an implementation-defined ordering.
  const deadRank = (state: ActorState) => (state === ActorState.Dead ? 1 : 0);
  const actorChildren = Object.entries(actors)
    .sort(
      ([, actor1], [, actor2]) =>
        deadRank(actor1.state) - deadRank(actor2.state),
    )
    .map(([aid, actor]) => <Actor actor={actor} key={aid} />);
  return <Fragment>{actorChildren}</Fragment>;
};
export default Actors;
@@ -1,71 +1,58 @@
import {
Box,
createStyles,
FormControl,
FormHelperText,
Input,
InputLabel,
makeStyles,
Theme,
Typography,
} from "@material-ui/core";
import React, { useState } from "react";
import { connect } from "react-redux";
import { ActorInfo, isFullActorInfo, RayletInfoResponse } from "../../../api";
import { filterObj } from "../../../common/util";
import { useSelector } from "react-redux";
import { useDebounce } from "use-debounce";
import { StoreState } from "../../../store";
import ActorClassGroups from "./ActorClassGroups";
const actorMatchesSearch = (actor: ActorInfo, nameFilter: string): boolean => {
// Performs a case insensitive search for the name filter string within the
// actor and all of its nested subactors.
const actorTitles = getNestedActorTitles(actor);
const useLogicalViewStyles = makeStyles((theme: Theme) =>
createStyles({
container: {
marginBottom: theme.spacing(1),
},
}),
);
const actorClassMatchesSearch = (
actorClass: string,
nameFilter: string,
): boolean => {
const loweredNameFilter = nameFilter.toLowerCase();
const match = actorTitles.find(
(actorTitle) => actorTitle.toLowerCase().search(loweredNameFilter) !== -1,
);
return match !== undefined;
return actorClass.toLowerCase().search(loweredNameFilter) !== -1;
};
/**
 * Collects the title of `actor` followed by the titles of every actor nested
 * beneath it. An actor without a title contributes nothing itself, and only
 * full actor records are searched for children.
 */
const getNestedActorTitles = (actor: ActorInfo): string[] => {
  const collected: string[] = [];
  const ownTitle = actor.actorTitle;
  if (ownTitle) {
    collected.push(ownTitle);
  }
  if (isFullActorInfo(actor)) {
    const children = actor["children"];
    if (children !== undefined && Object.keys(children).length > 0) {
      // Depth-first: each child's own title precedes its descendants' titles.
      for (const child of Object.values(children)) {
        for (const nestedTitle of getNestedActorTitles(child)) {
          collected.push(nestedTitle);
        }
      }
    }
  }
  return collected;
};
// Selector returning the `dashboard.rayletInfo` slice of the store state.
const rayletInfoSelector = (state: StoreState) => state.dashboard.rayletInfo;
// Maps the store's `dashboard.rayletInfo` onto a `rayletInfo` prop.
const mapStateToProps = (state: StoreState) => ({
rayletInfo: state.dashboard.rayletInfo,
});
// Props of LogicalView: a nullable `rayletInfo` intersected with whatever
// `mapStateToProps` returns.
type LogicalViewProps = {
rayletInfo: RayletInfoResponse | null;
} & ReturnType<typeof mapStateToProps>;
const LogicalView: React.FC<LogicalViewProps> = ({ rayletInfo }) => {
const LogicalView: React.FC = () => {
const [nameFilter, setNameFilter] = useState("");
if (rayletInfo === null) {
const [debouncedNameFilter] = useDebounce(nameFilter, 500);
const classes = useLogicalViewStyles();
const rayletInfo = useSelector(rayletInfoSelector);
if (rayletInfo === null || !rayletInfo.actorGroups) {
return <Typography color="textSecondary">Loading...</Typography>;
}
let filteredActors = rayletInfo.actors;
if (nameFilter !== "") {
filteredActors = filterObj(filteredActors, ([_, actor]: [any, ActorInfo]) =>
actorMatchesSearch(actor, nameFilter),
);
}
const actorGroups =
debouncedNameFilter === ""
? Object.entries(rayletInfo.actorGroups)
: Object.entries(rayletInfo.actorGroups).filter(([key, _]) =>
actorClassMatchesSearch(key, debouncedNameFilter),
);
return (
<div>
{Object.entries(rayletInfo.actors).length === 0 ? (
<Box className={classes.container}>
{actorGroups.length === 0 ? (
<Typography color="textSecondary">No actors found.</Typography>
) : (
<div>
<React.Fragment>
<FormControl>
<InputLabel htmlFor="actor-name-filter">Actor Search</InputLabel>
<Input
@@ -78,11 +65,11 @@ const LogicalView: React.FC<LogicalViewProps> = ({ rayletInfo }) => {
Search for an actor by name
</FormHelperText>
</FormControl>
<ActorClassGroups actors={Object.values(filteredActors)} />
</div>
<ActorClassGroups actorGroups={Object.fromEntries(actorGroups)} />
</React.Fragment>
)}
</div>
</Box>
);
};
export default connect(mapStateToProps)(LogicalView);
export default LogicalView;
+7 -3
View File
@@ -92,8 +92,8 @@ class DashboardController(BaseDashboardController):
# (e.g., Actor requires 2 GPUs but there is only 1 gpu available).
ready_tasks = sum((data.get("readyTasks", []) for data in D.values()),
[])
actors = self.node_stats.get_actors(workers_info_by_node,
infeasible_tasks, ready_tasks)
actor_groups = self.node_stats.get_actors(
workers_info_by_node, infeasible_tasks, ready_tasks)
plasma_stats = {}
# HTTP call to metrics port for each node in nodes/
used_views = ("object_store_num_local_objects",
@@ -116,7 +116,11 @@ class DashboardController(BaseDashboardController):
node_plasma_stats[view_name] = view_data
plasma_stats[address] = node_plasma_stats
return {"nodes": D, "actors": actors, "plasmaStats": plasma_stats}
return {
"nodes": D,
"actorGroups": actor_groups,
"plasmaStats": plasma_stats
}
def get_ray_config(self):
try:
+88 -83
View File
@@ -7,7 +7,7 @@ import json
import traceback
import copy
import logging
import datetime
from datetime import datetime
import time
from typing import Dict
import re
@@ -16,6 +16,58 @@ from operator import itemgetter
logger = logging.getLogger(__name__)
PYCLASSNAME_RE = re.compile(r"(.+?)\(")
def _group_actors_by_python_class(actors):
groups = defaultdict(list)
for actor in actors.values():
actor_title = actor.get("actorTitle")
if not actor_title:
groups["Unknown Class"].append(actor)
else:
match = PYCLASSNAME_RE.search(actor_title)
if match:
# Catches case of actorTitle like
# Foo(bar, baz, [1,2,3]) -> Foo
class_name = match.groups()[0]
groups[class_name].append(actor)
else:
# Catches case of e.g. just Foo
# in case of actor task
groups[actor_title].append(actor)
return groups
def _get_actor_group_stats(group):
state_to_count = defaultdict(lambda: 0)
executed_tasks = 0
min_timestamp = None
num_timestamps = 0
sum_timestamps = 0
now = time.time() * 1000 # convert S -> MS
for actor in group:
state_to_count[actor["state"]] += 1
if "timestamp" in actor:
if not min_timestamp or actor["timestamp"] < min_timestamp:
min_timestamp = actor["timestamp"]
num_timestamps += 1
sum_timestamps += now - actor["timestamp"]
if "numExecutedTasks" in actor:
executed_tasks += actor["numExecutedTasks"]
if num_timestamps > 0:
avg_lifetime = int((sum_timestamps / num_timestamps) / 1000)
max_lifetime = int((now - min_timestamp) / 1000)
else:
avg_lifetime = 0
max_lifetime = 0
return {
"stateToCount": state_to_count,
"avgLifetime": avg_lifetime,
"maxLifetime": max_lifetime,
"numExecutedTasks": executed_tasks,
}
class NodeStats(threading.Thread):
def __init__(self, redis_address, redis_password=None):
@@ -59,23 +111,19 @@ class NodeStats(threading.Thread):
def _insert_log_counts(self):
for ip, logs_by_pid in self._logs.items():
hostname = self._ip_to_hostname[ip]
if hostname in self._node_stats:
logs_by_pid = {
pid: len(logs)
for pid, logs in logs_by_pid.items()
}
self._node_stats[hostname]["log_count"] = logs_by_pid
hostname = self._ip_to_hostname.get(ip)
if not hostname or hostname not in self._node_stats:
continue
logs_by_pid = {pid: len(logs) for pid, logs in logs_by_pid.items()}
self._node_stats[hostname]["log_count"] = logs_by_pid
def _insert_error_counts(self):
for ip, errs_by_pid in self._errors.items():
hostname = self._ip_to_hostname[ip]
if hostname in self._node_stats:
errs_by_pid = {
pid: len(errs)
for pid, errs in errs_by_pid.items()
}
self._node_stats[hostname]["error_count"] = errs_by_pid
hostname = self._ip_to_hostname.get(ip)
if not hostname or hostname not in self._node_stats:
continue
errs_by_pid = {pid: len(errs) for pid, errs in errs_by_pid.items()}
self._node_stats[hostname]["error_count"] = errs_by_pid
def _purge_outdated_stats(self):
def current(then, now):
@@ -84,7 +132,7 @@ class NodeStats(threading.Thread):
return True
now = to_unix_time(datetime.datetime.utcnow())
now = to_unix_time(datetime.utcnow())
self._node_stats = {
k: v
for k, v in self._node_stats.items() if current(v["now"], now)
@@ -130,8 +178,13 @@ class NodeStats(threading.Thread):
invalid_state_type):
actor_id = ray.utils.binary_to_hex(
b64decode(task[task_spec_type]["actorId"]))
task["state"] = -1
task["invalidStateType"] = invalid_state_type
if invalid_state_type == "pendingActor":
task["state"] = -1
elif invalid_state_type == "infeasibleActor":
task["state"] = -2
else:
raise ValueError(f"Invalid argument"
"invalid_state_type={invalid_state_type}")
task["actorTitle"] = task["functionDescriptor"][
"pythonFunctionDescriptor"]["className"]
format_reply_id(task)
@@ -145,69 +198,19 @@ class NodeStats(threading.Thread):
for ready_task in ready_tasks:
_update_from_actor_tasks(ready_task, "actorCreationTaskSpec",
"pendingActor")
actor_groups = _group_actors_by_python_class(actors)
stats_by_group = {
name: _get_actor_group_stats(group)
for name, group in actor_groups.items()
}
return actors
# Gets actors in a nested structure showing parent child relationships
def get_actor_tree(self, workers_info_by_node, infeasible_tasks,
ready_tasks):
# Builds a flat dict of actor_id -> info first, records each actor's parent
# in child_to_parent, then links every entry into its parent's "children"
# dict and returns the children of the synthetic "root" entry.
# NOTE(review): linking assumes every parent entry has a "children" key;
# presumably self._default_info supplies it - confirm. Task dicts stored by
# _update_flatten_tree are not given one here.
# Current time in seconds; compared below against timestamp / 1000.
now = time.time()
# construct flattened actor tree
flattened_tree = {"root": {"children": {}}}
child_to_parent = {}
with self._node_stats_lock:
# Seed one entry per known actor address: defaults overlaid with the extra
# info recorded for that address. The parent is the actor found at the
# owner's address, or "root" when there is none.
for addr, actor_id in self._addr_to_actor_id.items():
flattened_tree[actor_id] = copy.deepcopy(self._default_info)
flattened_tree[actor_id].update(
self._addr_to_extra_info_dict[addr])
parent_id = self._addr_to_actor_id.get(
self._addr_to_owner_addr[addr], "root")
child_to_parent[actor_id] = parent_id
# Merge live core-worker stats into the entry of the actor at that
# (ipAddress, port) address.
for node_id, workers_info in workers_info_by_node.items():
for worker_info in workers_info:
if "coreWorkerStats" in worker_info:
core_worker_stats = worker_info["coreWorkerStats"]
addr = (core_worker_stats["ipAddress"],
str(core_worker_stats["port"]))
if addr in self._addr_to_actor_id:
actor_info = flattened_tree[self._addr_to_actor_id[
addr]]
format_reply_id(core_worker_stats)
actor_info.update(core_worker_stats)
# Executed tasks divided by seconds since the actor's timestamp, rounded to
# 2 decimals. The timestamp is divided by 1000, so it is presumably in
# milliseconds.
actor_info["averageTaskExecutionSpeed"] = round(
actor_info["numExecutedTasks"] /
(now - actor_info["timestamp"] / 1000), 2)
actor_info["nodeId"] = node_id
actor_info["pid"] = worker_info["pid"]
# Registers an actor-creation task as an entry of the flat tree: the actor
# id is the hex form of the base64 "actorId", the parent is the actor at
# the caller's address (or "root"), and the task dict itself, tagged with
# state -1, the given invalidStateType and the class name as actorTitle,
# replaces any existing entry for that id.
def _update_flatten_tree(task, task_spec_type, invalid_state_type):
actor_id = ray.utils.binary_to_hex(
b64decode(task[task_spec_type]["actorId"]))
caller_addr = (task["callerAddress"]["ipAddress"],
str(task["callerAddress"]["port"]))
caller_id = self._addr_to_actor_id.get(caller_addr, "root")
child_to_parent[actor_id] = caller_id
task["state"] = -1
task["invalidStateType"] = invalid_state_type
task["actorTitle"] = task["functionDescriptor"][
"pythonFunctionDescriptor"]["className"]
format_reply_id(task)
flattened_tree[actor_id] = task
# Infeasible and ready creation tasks are both added as state -1 entries,
# distinguished only by invalidStateType.
for infeasible_task in infeasible_tasks:
_update_flatten_tree(infeasible_task, "actorCreationTaskSpec",
"infeasibleActor")
for ready_task in ready_tasks:
_update_flatten_tree(ready_task, "actorCreationTaskSpec",
"pendingActor")
# construct actor tree
# actor_tree aliases flattened_tree (no copy); children are attached in
# place by reference.
actor_tree = flattened_tree
for actor_id, parent_id in child_to_parent.items():
actor_tree[parent_id]["children"][actor_id] = actor_tree[actor_id]
return actor_tree["root"]["children"]
response_data = {}
for name, group in actor_groups.items():
response_data[name] = {
"entries": group,
"summary": stats_by_group[name]
}
return response_data
def get_logs(self, hostname, pid):
ip = self._node_stats.get(hostname, {"ip": None})["ip"]
@@ -307,10 +310,12 @@ class NodeStats(threading.Thread):
self._ip_to_hostname[data["ip"]] = data["hostname"]
self._node_stats[data["hostname"]] = data
else:
try:
data = json.loads(ray.utils.decode(data))
except Exception as e:
data = f"Failed to load data because of {e}"
logger.warning("Unexpected channel data received, "
"channel: {}, data: {}".format(
channel,
json.loads(ray.utils.decode(data))))
f"channel: {channel}, data: {data}")
except Exception:
logger.exception(traceback.format_exc())
+5 -9
View File
@@ -206,13 +206,10 @@ def test_raylet_info_endpoint(shutdown_only):
except Exception as ex:
print("failed response: {}".format(response.text))
raise ex
actors_info = raylet_info["result"]["actors"]
actor_groups = raylet_info["result"]["actorGroups"]
try:
assert len(actors_info) == 3
c_actor_info = [
actor for actor in actors_info.values()
if "ActorC" in actor["actorTitle"]
][0]
assert len(actor_groups.keys()) == 3
c_actor_info = actor_groups["ActorC"]["entries"][0]
assert c_actor_info["numObjectRefsInScope"] == 13
assert c_actor_info["numLocalObjects"] == 10
break
@@ -279,12 +276,11 @@ def test_raylet_infeasible_tasks(shutdown_only):
webui_url = ray_addresses["webui_url"].replace("127.0.0.1",
"http://127.0.0.1")
raylet_info = requests.get(webui_url + "/api/raylet_info").json()
actor_info = raylet_info["result"]["actors"]
actor_info = raylet_info["result"]["actorGroups"]
assert len(actor_info) == 1
_, infeasible_actor_info = actor_info.popitem()
assert infeasible_actor_info["state"] == -1
assert infeasible_actor_info["invalidStateType"] == "infeasibleActor"
assert infeasible_actor_info["entries"][0]["state"] == -2
assert (wait_until_succeeded_without_exception(
test_infeasible_actor,