diff --git a/doc/source/ray-dashboard.rst b/doc/source/ray-dashboard.rst index bf9705ee0..8d0e081b5 100644 --- a/doc/source/ray-dashboard.rst +++ b/doc/source/ray-dashboard.rst @@ -52,7 +52,8 @@ Logical View The logical view shows you: - Created and killed actors. -- Actor statistics such as actor status, number of executed tasks, pending tasks, and memory usage. +- State of actors (e.g. Alive, Dead, Pending Creation). Learn more about actor states in the "Logical View (Experimental)" reference section below. +- Actor statistics such as number of executed tasks, pending tasks, and memory usage. - Actor hierarchy. .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/dashboard/Logical-view-basic.png @@ -246,31 +247,33 @@ Logical View (Experimental) **State**: State of an actor. -- 0: Alive -- 1: Restarting -- 2: Dead +- Alive +- Restarting +- Dead +- Infeasible (cannot be created due to not enough available resources (e.g. CPUs, GPUs, memory) in the cluster, even at full capacity) +- Pending Creation +- Dependencies Unready (waiting for one or more of its arguments to be ready) -**Pending**: A number of pending tasks for this actor. +**Number of Pending Tasks**: The number of method calls for this actor that are still awaiting execution. -**Excuted**: A number of executed tasks for this actor. +**Number of Executed Tasks**: The number of completed method calls for this actor. -**NumObjectRefsInScope**: Number of object refs in scope for this actor. object refs +**Number of ObjectRefs In Scope**: The number of object refs in scope for this actor, which correspond to objects in the Ray object store. Object refs in scope will not be evicted unless object stores are full. -**NumLocalObjects**: Number of object refs that are in this actor's local memory. -Only big objects (>100KB) are residing in plasma object stores, and other small +**Number of Local Objects**: Number of object refs that are in this actor's local memory. 
+Only big objects (>100KB) reside in plasma object stores, and other small objects are staying in local memory. -**UsedLocalObjectMemory**: Used memory used by local objects. +**Used Local Object Memory**: Memory used by local objects. -**kill actor**: A button to kill an actor in a cluster. It is corresponding to ``ray.kill``. +**kill actor**: A button to kill an actor in a cluster. It has the same effect as calling ``ray.kill`` on an actor handle. -**profile for**: A button to run profiling. We currently support profiling for 10s, -30s and 60s. It requires passwordless ``sudo``. +**profile**: A button to run profiling. We currently support profiling for 10s, +30s and 60s. It requires passwordless ``sudo``. The result of profiling is a py-spy HTML output displaying how much CPU time the actor spent in various methods. **Infeasible Actor Creation**: Actor creation is infeasible when an actor -requires more resources than a Ray cluster can provide. This is depicted -as a red colored actor. +requires more resources than a Ray cluster can provide, for example an actor that requires a GPU on a cluster that has none. The actor's state is marked "Infeasible" and highlighted in red. 
**Pending Actor Creation**: Actor creation is pending when there are no available resources for this actor because they are already taken by other diff --git a/python/ray/dashboard/client/src/api.ts b/python/ray/dashboard/client/src/api.ts index 23be7b98d..c708bb918 100644 --- a/python/ray/dashboard/client/src/api.ts +++ b/python/ray/dashboard/client/src/api.ts @@ -146,52 +146,66 @@ export type RayletWorkerStats = { coreWorkerStats: RayletCoreWorkerStats; }; -export type RayletActorInfo = - | { - actorId: string; - actorTitle: string; - averageTaskExecutionSpeed: number; - children: RayletInfoResponse["actors"]; - // currentTaskFuncDesc: string[]; - ipAddress: string; - jobId: string; - nodeId: string; - numExecutedTasks: number; - numLocalObjects: number; - numObjectRefsInScope: number; - pid: number; - port: number; - state: - | ActorState.Creating - | ActorState.Alive - | ActorState.Restarting - | ActorState.Dead; - taskQueueLength: number; - timestamp: number; - usedObjectStoreMemory: number; - usedResources: { [key: string]: ResourceAllocations }; - currentTaskDesc?: string; - numPendingTasks?: number; - webuiDisplay?: Record; - } - | { - actorId: string; - actorTitle: string; - requiredResources: { [key: string]: number }; - state: ActorState.Invalid; - invalidStateType?: InvalidStateType; - }; - -export type InvalidStateType = "infeasibleActor" | "pendingActor"; - export enum ActorState { Invalid = -1, - Creating = 0, - Alive = 1, - Restarting = 2, - Dead = 3, + DependenciesUnready = 0, + PendingCreation = 1, + Alive = 2, + Restarting = 3, + Dead = 4, } +export type RayletActorInfo = FullActorInfo | PartialActorInfo; + +export type FullActorInfo = { + actorId: string; + actorTitle: string; + averageTaskExecutionSpeed: number; + children: RayletInfoResponse["actors"]; + // currentTaskFuncDesc: string[]; + ipAddress: string; + jobId: string; + nodeId: string; + numExecutedTasks: number; + numLocalObjects: number; + numObjectRefsInScope: number; + pid: number; 
+ port: number; + state: + | ActorState.Alive + | ActorState.Restarting + | ActorState.Dead + | ActorState.DependenciesUnready + | ActorState.PendingCreation; + taskQueueLength: number; + timestamp: number; + usedObjectStoreMemory: number; + usedResources: { [key: string]: ResourceAllocations }; + currentTaskDesc?: string; + numPendingTasks?: number; + webuiDisplay?: Record; +}; + +export type PartialActorInfo = { + actorId: string; + actorTitle: string; + requiredResources: { [key: string]: number }; + state: ActorState.Invalid; + invalidStateType?: InvalidStateType; +}; + +// eslint-disable-next-line +export function isFullActorInfo( + rayletInfo: RayletActorInfo, +): rayletInfo is FullActorInfo { + // Lint disabled because arrow functions don't play well with type guards. + // This function is used to determine what kind of information we have about + // a given actor in a response based on its state. + return rayletInfo.state !== ActorState.Invalid; +} + +export type InvalidStateType = "infeasibleActor" | "pendingActor"; + export type RayletInfoResponse = { nodes: { [ip: string]: { diff --git a/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx b/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx index 710505054..c0f8f266d 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx @@ -12,6 +12,7 @@ import { checkProfilingStatus, CheckProfilingStatusResponse, getProfilingResultURL, + isFullActorInfo, launchKillActor, launchProfiling, RayletActorInfo, @@ -84,7 +85,7 @@ class Actor extends React.Component, State> { handleProfilingClick = (duration: number) => async () => { const actor = this.props.actor; - if (actor.state !== ActorState.Invalid) { + if (actor.state === ActorState.Alive) { const profilingId = await launchProfiling( actor.nodeId, actor.pid, @@ -117,10 +118,7 @@ class Actor extends React.Component, State> { 
killActor = () => { const actor = this.props.actor; - if ( - actor.state === ActorState.Creating || - actor.state === ActorState.Alive - ) { + if (actor.state === ActorState.Alive) { launchKillActor(actor.actorId, actor.ipAddress, actor.port); } }; @@ -128,83 +126,84 @@ class Actor extends React.Component, State> { render() { const { classes, actor } = this.props; const { expanded, profiling } = this.state; - - const information = - actor.state !== ActorState.Invalid - ? [ - { - label: "Resources", - value: - Object.entries(actor.usedResources).length > 0 && - Object.entries(actor.usedResources) - .sort((a, b) => a[0].localeCompare(b[0])) - .map( - ([key, value]) => - `${sum( - value.resourceSlots.map((slot) => slot.allocation), - )} ${key}`, - ) - .join(", "), - }, - { - label: "Number of pending tasks", - value: actor.taskQueueLength.toLocaleString(), - tooltip: - "The number of tasks that are currently pending to execute on this actor. If this number " + - "remains consistently high, it may indicate that this actor is a bottleneck in your application.", - }, - { - label: "Number of executed tasks", - value: actor.numExecutedTasks.toLocaleString(), - tooltip: - "The number of tasks this actor has executed throughout its lifetimes.", - }, - { - label: "Number of ObjectRefs in scope", - value: actor.numObjectRefsInScope.toLocaleString(), - tooltip: - "The number of ObjectRefs that this actor is keeping in scope via its internal state. " + - "This does not imply that the objects are in active use or colocated on the node with the actor " + - `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` + - "for more information.", - }, - { - label: "Number of local objects", - value: actor.numLocalObjects.toLocaleString(), - tooltip: - "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " + - `debugging memory leaks. 
See the docs at ${memoryDebuggingDocLink} for more information`, - }, - { - label: "Object store memory used (MiB)", - value: actor.usedObjectStoreMemory.toLocaleString(), - tooltip: - "The total amount of memory that this actor is occupying in the Ray object store. " + - "If this number is increasing without bounds, you might have a memory leak. See " + - `the docs at: ${memoryDebuggingDocLink} for more information.`, - }, - ] - : [ - { - label: "Actor ID", - value: actor.actorId, - tooltip: "", - }, - { - label: "Required resources", - value: - Object.entries(actor.requiredResources).length > 0 && - Object.entries(actor.requiredResources) - .sort((a, b) => a[0].localeCompare(b[0])) - .map(([key, value]) => `${value.toLocaleString()} ${key}`) - .join(", "), - tooltip: "", - }, - ]; + const invalidStateType = isFullActorInfo(actor) + ? undefined + : actor.invalidStateType; + const information = isFullActorInfo(actor) + ? [ + { + label: "Resources", + value: + Object.entries(actor.usedResources).length > 0 && + Object.entries(actor.usedResources) + .sort((a, b) => a[0].localeCompare(b[0])) + .map( + ([key, value]) => + `${sum( + value.resourceSlots.map((slot) => slot.allocation), + )} ${key}`, + ) + .join(", "), + }, + { + label: "Number of pending tasks", + value: actor.taskQueueLength.toLocaleString(), + tooltip: + "The number of tasks that are currently pending to execute on this actor. If this number " + + "remains consistently high, it may indicate that this actor is a bottleneck in your application.", + }, + { + label: "Number of executed tasks", + value: actor.numExecutedTasks.toLocaleString(), + tooltip: + "The number of tasks this actor has executed throughout its lifetimes.", + }, + { + label: "Number of ObjectRefs in scope", + value: actor.numObjectRefsInScope.toLocaleString(), + tooltip: + "The number of ObjectRefs that this actor is keeping in scope via its internal state. 
" + + "This does not imply that the objects are in active use or colocated on the node with the actor " + + `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` + + "for more information.", + }, + { + label: "Number of local objects", + value: actor.numLocalObjects.toLocaleString(), + tooltip: + "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " + + `debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`, + }, + { + label: "Object store memory used (MiB)", + value: actor.usedObjectStoreMemory.toLocaleString(), + tooltip: + "The total amount of memory that this actor is occupying in the Ray object store. " + + "If this number is increasing without bounds, you might have a memory leak. See " + + `the docs at: ${memoryDebuggingDocLink} for more information.`, + }, + ] + : [ + { + label: "Actor ID", + value: actor.actorId, + tooltip: "", + }, + { + label: "Required resources", + value: + Object.entries(actor.requiredResources).length > 0 && + Object.entries(actor.requiredResources) + .sort((a, b) => a[0].localeCompare(b[0])) + .map(([key, value]) => `${value.toLocaleString()} ${key}`) + .join(", "), + tooltip: "", + }, + ]; // Construct the custom message from the actor. let actorCustomDisplay: JSX.Element[] = []; - if (actor.state !== ActorState.Invalid && actor.webuiDisplay) { + if (isFullActorInfo(actor) && actor.webuiDisplay) { actorCustomDisplay = Object.keys(actor.webuiDisplay) .sort() .map((key, _, __) => { @@ -241,7 +240,7 @@ class Actor extends React.Component, State> { return (
- {actor.state !== ActorState.Invalid ? ( + {isFullActorInfo(actor) ? ( Actor {actor.actorId}{" "} {Object.entries(actor.children).length > 0 && ( @@ -269,7 +268,7 @@ class Actor extends React.Component, State> { ))} ){" "} - {actor.state === 0 && ( + {actor.state === ActorState.Alive && ( Kill Actor @@ -303,7 +302,7 @@ class Actor extends React.Component, State> { ) : actor.invalidStateType === "infeasibleActor" ? ( {actor.actorTitle} cannot be created because the Ray cluster - cannot satisfy its resource requirements.) + cannot satisfy its resource requirements. ) : ( @@ -315,8 +314,9 @@ class Actor extends React.Component, State> { actorDetails={information} actorTitle={actor.actorTitle} actorState={actor.state} + invalidStateType={invalidStateType} /> - {actor.state !== ActorState.Invalid && ( + {isFullActorInfo(actor) && ( {actorCustomDisplay.length > 0 && ( {actorCustomDisplay} diff --git a/python/ray/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx b/python/ray/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx index 1ed47d5ef..0b9c72aec 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx @@ -77,18 +77,28 @@ const actorStateReprStyles = makeStyles((theme: Theme) => const ActorStateRepr: React.FC = ({ state, ist }) => { const classes = actorStateReprStyles(); - const { Alive, Dead, Creating, Restarting, Invalid } = ActorState; + const { + Alive, + Dead, + PendingCreation, + Restarting, + DependenciesUnready, + Invalid, + } = ActorState; switch (state) { case Invalid: + console.log(ist); if (ist === "infeasibleActor") { return
Infeasible
; } if (ist === "pendingActor") { - return
Pending Resources
; + return
Pending
; } return
Unknown
; - case Creating: + case PendingCreation: return
Creating
; + case DependenciesUnready: + return
Dependencies Unready
; case Alive: return
Alive
; case Restarting: diff --git a/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx b/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx index 914d7e214..56cef7f06 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx @@ -7,7 +7,11 @@ import { } from "@material-ui/core"; import React, { useState } from "react"; import { connect } from "react-redux"; -import { ActorState, RayletActorInfo, RayletInfoResponse } from "../../../api"; +import { + isFullActorInfo, + RayletActorInfo, + RayletInfoResponse, +} from "../../../api"; import { filterObj } from "../../../common/util"; import { StoreState } from "../../../store"; import Actors from "./Actors"; @@ -29,8 +33,7 @@ const actorMatchesSearch = ( const getNestedActorTitles = (actor: RayletActorInfo): string[] => { const actorTitle = actor.actorTitle; const titles: string[] = actorTitle ? [actorTitle] : []; - // state of -1 indicates an actor data record that does not have children. - if (actor.state === ActorState.Invalid) { + if (!isFullActorInfo(actor)) { return titles; } const children = actor["children"];