diff --git a/python/ray/dashboard/client/src/api.ts b/python/ray/dashboard/client/src/api.ts index bf71a6f94..23be7b98d 100644 --- a/python/ray/dashboard/client/src/api.ts +++ b/python/ray/dashboard/client/src/api.ts @@ -117,18 +117,10 @@ export type NodeInfoResponse = { }; load_avg: [[number, number, number], [number, number, number]]; net: [number, number]; // Sent and received network traffic in bytes / second + log_count?: { [pid: string]: number }; + error_count?: { [pid: string]: number }; workers: Array; }>; - log_counts: { - [ip: string]: { - [pid: string]: number; - }; - }; - error_counts: { - [ip: string]: { - [pid: string]: number; - }; - }; }; export const getNodeInfo = () => get("/api/node_info", {}); diff --git a/python/ray/dashboard/client/src/common/SortableTableHead.tsx b/python/ray/dashboard/client/src/common/SortableTableHead.tsx index aa4149b08..751e5bd3e 100644 --- a/python/ray/dashboard/client/src/common/SortableTableHead.tsx +++ b/python/ray/dashboard/client/src/common/SortableTableHead.tsx @@ -27,49 +27,65 @@ const useSortableTableHeadStyles = makeStyles((theme: Theme) => ); export type HeaderInfo = { - id: keyof T; + sortable: boolean; + id: T; label: string; numeric: boolean; }; type SortableTableHeadProps = { - onRequestSort: (event: React.MouseEvent, property: keyof T) => void; + onRequestSort: (event: React.MouseEvent, id: T) => void; order: Order; - orderBy: string | null; + orderBy: T | null; headerInfo: HeaderInfo[]; + firstColumnEmpty: boolean; }; const SortableTableHead = (props: SortableTableHeadProps) => { - const { order, orderBy, onRequestSort, headerInfo } = props; + const { order, orderBy, onRequestSort, headerInfo, firstColumnEmpty } = props; const classes = useSortableTableHeadStyles(); - const createSortHandler = (property: keyof T) => ( - event: React.MouseEvent, - ) => { - onRequestSort(event, property); + const createSortHandler = (id: T) => (event: React.MouseEvent) => { + onRequestSort(event, id); }; return ( - {headerInfo.map((headerInfo) => ( - - - {headerInfo.label} - {orderBy === headerInfo.id ? ( - - {order === "desc" ? "sorted descending" : "sorted ascending"} - - ) : null} - - - ))} + {firstColumnEmpty && } + {headerInfo.map((headerInfo) => { + if (headerInfo.sortable) { + return ( + + + {headerInfo.label} + {orderBy === headerInfo.id ? ( + + {order === "desc" + ? "sorted descending" + : "sorted ascending"} + + ) : null} + + + ); + } else { + return ( + + {headerInfo.label} + + ); + } + })} ); diff --git a/python/ray/dashboard/client/src/common/tableUtils.ts b/python/ray/dashboard/client/src/common/tableUtils.ts index a3432dc10..7161c4c22 100644 --- a/python/ray/dashboard/client/src/common/tableUtils.ts +++ b/python/ray/dashboard/client/src/common/tableUtils.ts @@ -8,7 +8,25 @@ export const descendingComparator = (a: T, b: T, orderBy: keyof T) => { return 0; }; +const descendingComparatorFnAccessor = ( + a: T, + b: T, + orderByFn: Accessor, +) => { + const aVal = orderByFn(a); + const bVal = orderByFn(b); + if (bVal < aVal) { + return -1; + } + if (bVal > aVal) { + return 1; + } + return 0; +}; + export type Order = "asc" | "desc"; +export type Comparator = (a: T, b: T) => number; +export type Accessor = (a: T) => number | string; export const getComparator = ( order: Order, @@ -22,10 +40,16 @@ export const getComparator = ( : (a, b) => -descendingComparator(a, b, orderBy); }; -export const stableSort = ( - array: T[], - comparator: (a: T, b: T) => number, -) => { +export const getFnComparator = (order: Order, orderByFn: Accessor) => ( + a: T, + b: T, +): number => { + return order === "desc" + ? descendingComparatorFnAccessor(a, b, orderByFn) + : -descendingComparatorFnAccessor(a, b, orderByFn); +}; + +export const stableSort = (array: T[], comparator: Comparator) => { const stabilizedThis = array.map((el, index) => [el, index] as [T, number]); stabilizedThis.sort((a, b) => { const order = comparator(a[0], b[0]); diff --git a/python/ray/dashboard/client/src/common/util.ts b/python/ray/dashboard/client/src/common/util.ts index aa75e9d1c..ce26c11cf 100644 --- a/python/ray/dashboard/client/src/common/util.ts +++ b/python/ray/dashboard/client/src/common/util.ts @@ -18,3 +18,9 @@ export const getWeightedAverage = ( }; export const sum = (vals: number[]) => vals.reduce((acc, val) => acc + val, 0); + +export const filterObj = (obj: Object, filterFn: any) => + Object.fromEntries(Object.entries(obj).filter(filterFn)); + +export const mapObj = (obj: Object, filterFn: any) => + Object.fromEntries(Object.entries(obj).map(filterFn)); diff --git a/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx b/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx index 0e5e0a44c..914d7e214 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/logical-view/LogicalView.tsx @@ -8,6 +8,7 @@ import { import React, { useState } from "react"; import { connect } from "react-redux"; import { ActorState, RayletActorInfo, RayletInfoResponse } from "../../../api"; +import { filterObj } from "../../../common/util"; import { StoreState } from "../../../store"; import Actors from "./Actors"; @@ -46,9 +47,6 @@ const mapStateToProps = (state: StoreState) => ({ rayletInfo: state.dashboard.rayletInfo, }); -const filterObj = (obj: Object, filterFn: any) => - Object.fromEntries(Object.entries(obj).filter(filterFn)); - type LogicalViewProps = { rayletInfo: RayletInfoResponse | null; } & ReturnType; diff --git a/python/ray/dashboard/client/src/pages/dashboard/memory/Memory.tsx b/python/ray/dashboard/client/src/pages/dashboard/memory/Memory.tsx index 5593ca9b4..b533afff0 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/memory/Memory.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/memory/Memory.tsx @@ -22,6 +22,7 @@ import SortableTableHead, { } from "../../../common/SortableTableHead"; import { getComparator, Order, stableSort } from "../../../common/tableUtils"; import { StoreState } from "../../../store"; +import { dashboardActions } from "../state"; import MemoryRowGroup from "./MemoryRowGroup"; import { MemoryTableRow } from "./MemoryTableRow"; @@ -50,7 +51,7 @@ const makeGroupedEntries = ( const makeUngroupedEntries = ( memoryTableGroups: MemoryTableGroups, order: Order, - orderBy: keyof MemoryTableEntry | null, + orderBy: memoryColumnId | null, ) => { const allEntries = Object.values(memoryTableGroups).reduce( (allEntries: Array, memoryTableGroup) => { @@ -71,14 +72,33 @@ const makeUngroupedEntries = ( )); }; -const memoryHeaderInfo: HeaderInfo[] = [ - { id: "node_ip_address", label: "IP Address", numeric: true }, - { id: "pid", label: "pid", numeric: true }, - { id: "type", label: "Type", numeric: false }, - { id: "object_ref", label: "Object Ref", numeric: false }, - { id: "object_size", label: "Object Size (B)", numeric: true }, - { id: "reference_type", label: "Reference Type", numeric: false }, - { id: "call_site", label: "Call Site", numeric: false }, +type memoryColumnId = + | "node_ip_address" + | "pid" + | "type" + | "object_ref" + | "object_size" + | "reference_type" + | "call_site"; + +const memoryHeaderInfo: HeaderInfo[] = [ + { id: "node_ip_address", label: "IP Address", numeric: true, sortable: true }, + { id: "pid", label: "pid", numeric: true, sortable: true }, + { id: "type", label: "Type", numeric: false, sortable: true }, + { id: "object_ref", label: "Object Ref", numeric: false, sortable: true }, + { + id: "object_size", + label: "Object Size (B)", + numeric: true, + sortable: true, + }, + { + id: "reference_type", + label: "Reference Type", + numeric: false, + sortable: true, + }, + { id: "call_site", label: "Call Site", numeric: false, sortable: true }, ]; const useMemoryInfoStyles = makeStyles((theme: Theme) => @@ -103,9 +123,11 @@ const MemoryInfo: React.FC<{}> = () => { const { memoryTable, shouldObtainMemoryTable } = useSelector( memoryInfoSelector, ); - const { setShouldObtainMemoryTable } = useDispatch(); + const dispatch = useDispatch(); const toggleMemoryCollection = async () => { - setShouldObtainMemoryTable(!shouldObtainMemoryTable); + dispatch( + dashboardActions.setShouldObtainMemoryTable(!shouldObtainMemoryTable), + ); if (shouldObtainMemoryTable) { await stopMemoryTableCollection(); } @@ -120,9 +142,7 @@ const MemoryInfo: React.FC<{}> = () => { const [isGrouped, setIsGrouped] = useState(true); const [order, setOrder] = React.useState("asc"); const toggleOrder = () => setOrder(order === "asc" ? "desc" : "asc"); - const [orderBy, setOrderBy] = React.useState( - null, - ); + const [orderBy, setOrderBy] = React.useState(null); return ( {memoryTable !== null ? ( @@ -143,9 +163,9 @@ const MemoryInfo: React.FC<{}> = () => { /> { + onRequestSort={(_, property) => { if (property === orderBy) { toggleOrder(); } else { @@ -154,6 +174,7 @@ const MemoryInfo: React.FC<{}> = () => { } }} headerInfo={memoryHeaderInfo} + firstColumnEmpty={false} /> {isGrouped diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx index e987929ec..01766cc78 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx @@ -1,40 +1,137 @@ import { + Checkbox, createStyles, + FormControlLabel, makeStyles, Table, TableBody, - TableCell, - TableHead, - TableRow, Theme, Typography, } from "@material-ui/core"; import React, { useState } from "react"; import { useSelector } from "react-redux"; import { RayletInfoResponse } from "../../../api"; +import SortableTableHead, { + HeaderInfo, +} from "../../../common/SortableTableHead"; +import { getFnComparator, Order, stableSort } from "../../../common/tableUtils"; import { sum } from "../../../common/util"; import { StoreState } from "../../../store"; import Errors from "./dialogs/errors/Errors"; import Logs from "./dialogs/logs/Logs"; +import cpuFeature from "./features/CPU"; +import diskFeature from "./features/Disk"; +import makeErrorsFeature from "./features/Errors"; +import gpuFeature from "./features/GPU"; +import gramFeature from "./features/GRAM"; +import hostFeature from "./features/Host"; +import makeLogsFeature from "./features/Logs"; +import ramFeature from "./features/RAM"; +import receivedFeature from "./features/Received"; +import sentFeature from "./features/Sent"; +import { + Node, + nodeInfoColumnId, + NodeInfoFeature, + WorkerFeatureData, +} from "./features/types"; +import uptimeFeature from "./features/Uptime"; +import workersFeature from "./features/Workers"; import NodeRowGroup from "./NodeRowGroup"; +import { NodeWorkerRow } from "./NodeWorkerRow"; import TotalRow from "./TotalRow"; -const clusterWorkerPids = ( - rayletInfo: RayletInfoResponse, -): Map> => { - // Groups PIDs registered with the raylet by node IP address - // This is used to filter out processes belonging to other ray clusters. - const nodeMap = new Map(); - const workerPids = new Set(); - for (const [nodeIp, { workersStats }] of Object.entries(rayletInfo.nodes)) { - for (const worker of workersStats) { - if (!worker.isDriver) { - workerPids.add(worker.pid.toString()); - } +const sortWorkers = ( + workerFeatureData: WorkerFeatureData[], + sortWorkerComparator: any, +) => { + // Sorts idle workers to end, applies the worker comparator function to sort + // then returns a new list of worker feature data. + const idleSortedClusterWorkers = workerFeatureData.sort((wfd1, wfd2) => { + const w1 = wfd1.worker; + const w2 = wfd2.worker; + if (w2.cmdline[0] === "ray::IDLE") { + return -1; } - nodeMap.set(nodeIp, workerPids); - } - return nodeMap; + if (w1.cmdline[0] === "ray::IDLE") { + return 1; + } + return w1.pid < w2.pid ? -1 : 1; + }); + return sortWorkerComparator + ? stableSort(idleSortedClusterWorkers, sortWorkerComparator) + : idleSortedClusterWorkers; +}; + +const makeGroupedTableContents = ( + nodes: Node[], + sortWorkerComparator: any, + sortGroupComparator: any, + rayletInfo: RayletInfoResponse | null, + nodeInfoFeatures: NodeInfoFeature[], +) => { + const sortedGroups = stableSort(nodes, sortGroupComparator); + return sortedGroups.map((node) => { + const workerFeatureData: WorkerFeatureData[] = node.workers.map( + (worker) => { + const rayletWorker = + rayletInfo?.nodes?.[node.ip]?.workersStats?.find( + (workerStats) => workerStats.pid === worker.pid, + ) || null; + return { + node: node, + worker, + rayletWorker, + }; + }, + ); + + const sortedClusterWorkers = sortWorkers( + workerFeatureData, + sortWorkerComparator, + ); + return ( + + ); + }); +}; + +const makeUngroupedTableContents = ( + nodes: Node[], + sortWorkerComparator: any, + rayletInfo: RayletInfoResponse | null, + nodeInfoFeatures: NodeInfoFeature[], +) => { + const workerInfoFeatures = nodeInfoFeatures.map( + (feature) => feature.WorkerFeatureRenderFn, + ); + const allWorkerFeatures: WorkerFeatureData[] = nodes.flatMap((node) => { + return node.workers.map((worker) => { + const rayletWorker = + rayletInfo?.nodes?.[node.ip]?.workersStats?.find( + (workerStats) => workerStats.pid === worker.pid, + ) || null; + return { + node: node, + worker, + rayletWorker, + }; + }); + }); + const sortedWorkers = sortWorkers(allWorkerFeatures, sortWorkerComparator); + return sortedWorkers.map((workerFeatureDatum, i) => ( + + )); }; const useNodeInfoStyles = makeStyles((theme: Theme) => @@ -57,138 +154,115 @@ const nodeInfoSelector = (state: StoreState) => ({ rayletInfo: state.dashboard.rayletInfo, }); -type dialogState = { +type DialogState = { hostname: string; pid: number | null; } | null; +const nodeInfoHeaders: HeaderInfo[] = [ + { id: "host", label: "Host", numeric: true, sortable: true }, + { id: "workers", label: "PID", numeric: true, sortable: false }, + { id: "uptime", label: "Uptime (s)", numeric: true, sortable: true }, + { id: "cpu", label: "CPU", numeric: false, sortable: true }, + { id: "ram", label: "RAM", numeric: true, sortable: true }, + { id: "gpu", label: "GPU", numeric: true, sortable: true }, + { id: "gram", label: "GRAM", numeric: true, sortable: true }, + { id: "disk", label: "Disk", numeric: true, sortable: true }, + { id: "sent", label: "Sent", numeric: true, sortable: true }, + { id: "received", label: "Received", numeric: false, sortable: true }, + { id: "logs", label: "Logs", numeric: false, sortable: true }, + { id: "errors", label: "Errors", numeric: false, sortable: true }, +]; + const NodeInfo: React.FC<{}> = () => { - const [logDialog, setLogDialog] = useState(null); - const [errorDialog, setErrorDialog] = useState(null); + const [logDialog, setLogDialog] = useState(null); + const [errorDialog, setErrorDialog] = useState(null); + const [isGrouped, setIsGrouped] = useState(true); + const [order, setOrder] = React.useState("asc"); + const toggleOrder = () => setOrder(order === "asc" ? "desc" : "asc"); + const [orderBy, setOrderBy] = React.useState(null); const classes = useNodeInfoStyles(); const { nodeInfo, rayletInfo } = useSelector(nodeInfoSelector); if (nodeInfo === null || rayletInfo === null) { return Loading...; } - - const logCounts: { - [ip: string]: { - perWorker: { - [pid: string]: number; - }; - total: number; - }; - } = {}; - - const errorCounts: { - [ip: string]: { - perWorker: { - [pid: string]: number; - }; - total: number; - }; - } = {}; - - // We fetch data about which process IDs are registered with - // the cluster's raylet for each node. We use this to filter - // the worker data contained in the node info data because - // the node info can contain data from more than one cluster - // if more than one cluster is running on a machine. - const clusterWorkerPidsByIp = clusterWorkerPids(rayletInfo); const clusterTotalWorkers = sum( - Array.from(clusterWorkerPidsByIp.values()).map( - (workerSet) => workerSet.size, - ), + nodeInfo.clients.map((c) => c.workers.length), ); - // Initialize inner structure of the count objects - for (const client of nodeInfo.clients) { - const clusterWorkerPids = clusterWorkerPidsByIp.get(client.ip); - if (!clusterWorkerPids) { - continue; - } - const filteredLogEntries = Object.entries( - nodeInfo.log_counts[client.ip] || {}, - ).filter(([pid, _]) => clusterWorkerPids.has(pid)); - const totalLogEntries = sum(filteredLogEntries.map(([_, count]) => count)); - logCounts[client.ip] = { - perWorker: Object.fromEntries(filteredLogEntries), - total: totalLogEntries, - }; - - const filteredErrEntries = Object.entries( - nodeInfo.error_counts[client.ip] || {}, - ).filter(([pid, _]) => clusterWorkerPids.has(pid)); - const totalErrEntries = sum(filteredErrEntries.map(([_, count]) => count)); - errorCounts[client.ip] = { - perWorker: Object.fromEntries(filteredErrEntries), - total: totalErrEntries, - }; - } - + const nodeInfoFeatures: NodeInfoFeature[] = [ + hostFeature, + workersFeature, + uptimeFeature, + cpuFeature, + ramFeature, + gpuFeature, + gramFeature, + diskFeature, + sentFeature, + receivedFeature, + makeLogsFeature((hostname, pid) => setLogDialog({ hostname, pid })), + makeErrorsFeature((hostname, pid) => setErrorDialog({ hostname, pid })), + ]; + const sortNodeAccessor = nodeInfoFeatures.find( + (feature) => feature.id === orderBy, + )?.nodeAccessor; + const sortNodeComparator = + sortNodeAccessor && getFnComparator(order, sortNodeAccessor); + const sortWorkerAccessor = nodeInfoFeatures.find( + (feature) => feature.id === orderBy, + )?.workerAccessor; + const sortWorkerComparator = + sortWorkerAccessor && getFnComparator(order, sortWorkerAccessor); + const tableContents = isGrouped + ? makeGroupedTableContents( + nodeInfo.clients, + sortWorkerComparator, + sortNodeComparator, + rayletInfo, + nodeInfoFeatures, + ) + : makeUngroupedTableContents( + nodeInfo.clients, + sortWorkerComparator, + rayletInfo, + nodeInfoFeatures, + ); return ( + setIsGrouped(!isGrouped)} + color="primary" + /> + } + label="Group by host" + />
- - - - Host - Workers - Uptime - CPU - RAM - GPU - GRAM - Disk - Sent - Received - Logs - Errors - - + { + if (property === orderBy) { + toggleOrder(); + } else { + setOrderBy(property); + setOrder("asc"); + } + }} + headerInfo={nodeInfoHeaders} + order={order} + orderBy={orderBy} + firstColumnEmpty={true} + /> - {nodeInfo.clients.map((client) => { - const clusterWorkerPids = - clusterWorkerPidsByIp.get(client.ip) || new Set(); - return ( - - clusterWorkerPids.has(worker.pid.toString()), - ) - .sort((w1, w2) => { - if (w2.cmdline[0] === "ray::IDLE") { - return -1; - } - if (w1.cmdline[0] === "ray::IDLE") { - return 1; - } - return w1.pid < w2.pid ? -1 : 1; - })} - node={client} - raylet={ - client.ip in rayletInfo.nodes - ? rayletInfo.nodes[client.ip] - : null - } - logCounts={logCounts[client.ip]} - errorCounts={errorCounts[client.ip]} - setLogDialog={(hostname, pid) => - setLogDialog({ hostname, pid }) - } - setErrorDialog={(hostname, pid) => - setErrorDialog({ hostname, pid }) - } - initialExpanded={nodeInfo.clients.length <= 1} - /> - ); - })} + {tableContents} feature.ClusterFeatureRenderFn, + )} />
diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx index 58303c933..57b556cad 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx @@ -9,23 +9,10 @@ import AddIcon from "@material-ui/icons/Add"; import RemoveIcon from "@material-ui/icons/Remove"; import classNames from "classnames"; import React, { useState } from "react"; -import { - NodeInfoResponse, - NodeInfoResponseWorker, - RayletInfoResponse, -} from "../../../api"; -import { NodeCPU, WorkerCPU } from "./features/CPU"; -import { NodeDisk, WorkerDisk } from "./features/Disk"; -import { makeNodeErrors, makeWorkerErrors } from "./features/Errors"; -import { NodeGPU, WorkerGPU } from "./features/GPU"; -import { NodeGRAM, WorkerGRAM } from "./features/GRAM"; -import { NodeHost, WorkerHost } from "./features/Host"; -import { makeNodeLogs, makeWorkerLogs } from "./features/Logs"; -import { NodeRAM, WorkerRAM } from "./features/RAM"; -import { NodeReceived, WorkerReceived } from "./features/Received"; -import { NodeSent, WorkerSent } from "./features/Sent"; -import { NodeUptime, WorkerUptime } from "./features/Uptime"; -import { NodeWorkers, WorkerWorkers } from "./features/Workers"; +import { NodeInfoResponse } from "../../../api"; +import { StyledTableCell } from "../../../common/TableCell"; +import { NodeInfoFeature, WorkerFeatureData } from "./features/types"; +import { NodeWorkerRow } from "./NodeWorkerRow"; const useNodeRowGroupStyles = makeStyles((theme: Theme) => createStyles({ @@ -55,59 +42,31 @@ type ArrayType = T extends Array ? U : never; type Node = ArrayType; type NodeRowGroupProps = { + features: NodeInfoFeature[]; node: Node; - clusterWorkers: Array; - raylet: RayletInfoResponse["nodes"][keyof RayletInfoResponse["nodes"]] | null; - logCounts: { - perWorker: { [pid: string]: number }; - total: number; - }; - errorCounts: { - perWorker: { [pid: string]: number }; - total: number; - }; - setLogDialog: (hostname: string, pid: number | null) => void; - setErrorDialog: (hostname: string, pid: number | null) => void; + rayletInfo?: string; + workerFeatureData: WorkerFeatureData[]; initialExpanded: boolean; }; const NodeRowGroup: React.FC = ({ + features, node, - raylet, - clusterWorkers, - logCounts, - errorCounts, - setLogDialog, - setErrorDialog, initialExpanded, + rayletInfo, + workerFeatureData, }) => { const [expanded, setExpanded] = useState(initialExpanded); const toggleExpand = () => setExpanded(!expanded); const classes = useNodeRowGroupStyles(); - const features = [ - { NodeFeature: NodeHost, WorkerFeature: WorkerHost }, - { - NodeFeature: NodeWorkers(clusterWorkers.length), - WorkerFeature: WorkerWorkers, - }, - { NodeFeature: NodeUptime, WorkerFeature: WorkerUptime }, - { NodeFeature: NodeCPU, WorkerFeature: WorkerCPU }, - { NodeFeature: NodeRAM, WorkerFeature: WorkerRAM }, - { NodeFeature: NodeGPU, WorkerFeature: WorkerGPU }, - { NodeFeature: NodeGRAM, WorkerFeature: WorkerGRAM }, - { NodeFeature: NodeDisk, WorkerFeature: WorkerDisk }, - { NodeFeature: NodeSent, WorkerFeature: WorkerSent }, - { NodeFeature: NodeReceived, WorkerFeature: WorkerReceived }, - { - NodeFeature: makeNodeLogs(logCounts, setLogDialog), - WorkerFeature: makeWorkerLogs(logCounts, setLogDialog), - }, - { - NodeFeature: makeNodeErrors(errorCounts, setErrorDialog), - WorkerFeature: makeWorkerErrors(errorCounts, setErrorDialog), - }, - ]; - + const renderedNodeFeatures = features.map((nodeInfoFeature, i) => { + const FeatureComponent = nodeInfoFeature.NodeFeatureRenderFn; + return ( + + + + ); + }); return ( @@ -121,44 +80,30 @@ const NodeRowGroup: React.FC = ({ )} - {features.map(({ NodeFeature }, index) => ( - - - - ))} + {renderedNodeFeatures} {expanded && ( - {raylet !== null && raylet.extraInfo !== undefined && ( + {rayletInfo !== undefined && ( - {raylet.extraInfo} + {rayletInfo} )} - {clusterWorkers.map((worker, index: number) => { - const rayletWorker = - raylet?.workersStats.find( - (rayletWorker) => worker.pid === rayletWorker.pid, - ) || null; - + {workerFeatureData.map((featureData, index: number) => { return ( - - - {features.map(({ WorkerFeature }, index) => ( - - - - ))} - + feature.WorkerFeatureRenderFn, + )} + data={featureData} + /> ); })} diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeWorkerRow.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeWorkerRow.tsx new file mode 100644 index 000000000..631362f4a --- /dev/null +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeWorkerRow.tsx @@ -0,0 +1,32 @@ +import { TableRow } from "@material-ui/core"; +import React from "react"; +import { StyledTableCell } from "../../../common/TableCell"; +import { WorkerFeatureData, WorkerFeatureRenderFn } from "./features/types"; + +type NodeWorkerRowProps = { + key: string | number; + features: WorkerFeatureRenderFn[]; + data: WorkerFeatureData; +}; + +export const NodeWorkerRow: React.FC = ({ + features, + data, + key, +}) => { + const { node, worker, rayletWorker } = data; + return ( + + + {features.map((WorkerFeature, index) => ( + + + + ))} + + ); +}; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx index 36a95617a..0929a4933 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx @@ -8,18 +8,8 @@ import { import LayersIcon from "@material-ui/icons/Layers"; import React from "react"; import { NodeInfoResponse } from "../../../api"; -import { ClusterCPU } from "./features/CPU"; -import { ClusterDisk } from "./features/Disk"; -import { makeClusterErrors } from "./features/Errors"; -import { ClusterGPU } from "./features/GPU"; -import { ClusterGRAM } from "./features/GRAM"; -import { ClusterHost } from "./features/Host"; -import { makeClusterLogs } from "./features/Logs"; -import { ClusterRAM } from "./features/RAM"; -import { ClusterReceived } from "./features/Received"; -import { ClusterSent } from "./features/Sent"; -import { ClusterUptime } from "./features/Uptime"; -import { ClusterWorkers } from "./features/Workers"; +import { StyledTableCell } from "../../../common/TableCell"; +import { ClusterFeatureRenderFn } from "./features/types"; const useTotalRowStyles = makeStyles((theme: Theme) => createStyles({ @@ -44,52 +34,25 @@ const useTotalRowStyles = makeStyles((theme: Theme) => type TotalRowProps = { nodes: NodeInfoResponse["clients"]; clusterTotalWorkers: number; - logCounts: { - [ip: string]: { - perWorker: { [pid: string]: number }; - total: number; - }; - }; - errorCounts: { - [ip: string]: { - perWorker: { [pid: string]: number }; - total: number; - }; - }; + features: (ClusterFeatureRenderFn | undefined)[]; }; -const TotalRow: React.FC = ({ - nodes, - clusterTotalWorkers, - logCounts, - errorCounts, -}) => { +const TotalRow: React.FC = ({ nodes, features }) => { const classes = useTotalRowStyles(); - const features = [ - { ClusterFeature: ClusterHost }, - { ClusterFeature: ClusterWorkers(clusterTotalWorkers) }, - { ClusterFeature: ClusterUptime }, - { ClusterFeature: ClusterCPU }, - { ClusterFeature: ClusterRAM }, - { ClusterFeature: ClusterGPU }, - { ClusterFeature: ClusterGRAM }, - { ClusterFeature: ClusterDisk }, - { ClusterFeature: ClusterSent }, - { ClusterFeature: ClusterReceived }, - { ClusterFeature: makeClusterLogs(logCounts) }, - { ClusterFeature: makeClusterErrors(errorCounts) }, - ]; - return ( - {features.map(({ ClusterFeature }, index) => ( - - - - ))} + {features.map((ClusterFeature, index) => + ClusterFeature ? ( + + + + ) : ( + + ), + )} ); }; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx index 94bbad44d..b76a1ca7c 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx @@ -1,13 +1,17 @@ import React from "react"; +import { Accessor } from "../../../../common/tableUtils"; import UsageBar from "../../../../common/UsageBar"; import { getWeightedAverage } from "../../../../common/util"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterCPU: ClusterFeatureComponent = ({ nodes }) => { +export const ClusterCPU: ClusterFeatureRenderFn = ({ nodes }) => { const cpuWeightedAverage = getWeightedAverage( nodes.map((node) => ({ weight: node.cpus[0], value: node.cpu })), ); @@ -21,13 +25,16 @@ export const ClusterCPU: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeCPU: NodeFeatureComponent = ({ node }) => ( +export const NodeCPU: NodeFeatureRenderFn = ({ node }) => (
); +export const nodeCPUAccessor: Accessor = ({ node }) => { + return node.cpu; +}; -export const WorkerCPU: WorkerFeatureComponent = ({ worker }) => ( +export const WorkerCPU: WorkerFeatureRenderFn = ({ worker }) => (
( />
); + +export const workerCPUAccessor: Accessor = ({ worker }) => { + return worker.cpu_percent; +}; + +const cpuFeature: NodeInfoFeature = { + id: "cpu", + ClusterFeatureRenderFn: ClusterCPU, + NodeFeatureRenderFn: NodeCPU, + WorkerFeatureRenderFn: WorkerCPU, + nodeAccessor: nodeCPUAccessor, + workerAccessor: workerCPUAccessor, +}; + +export default cpuFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Disk.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Disk.tsx index dfd8cb721..f3823df04 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Disk.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Disk.tsx @@ -1,14 +1,17 @@ import { Typography } from "@material-ui/core"; import React from "react"; import { formatUsage } from "../../../../common/formatUtils"; +import { Accessor } from "../../../../common/tableUtils"; import UsageBar from "../../../../common/UsageBar"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterDisk: ClusterFeatureComponent = ({ nodes }) => { +export const ClusterDisk: ClusterFeatureRenderFn = ({ nodes }) => { let used = 0; let total = 0; for (const node of nodes) { @@ -23,15 +26,28 @@ export const ClusterDisk: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeDisk: NodeFeatureComponent = ({ node }) => ( +export const NodeDisk: NodeFeatureRenderFn = ({ node }) => ( ); -export const WorkerDisk: WorkerFeatureComponent = () => ( +export const nodeDiskAccessor: Accessor = ({ node }) => + node.disk["/"].used; + +export const WorkerDisk: WorkerFeatureRenderFn = () => ( N/A ); + +const diskFeature: NodeInfoFeature = { + id: "disk", + ClusterFeatureRenderFn: ClusterDisk, + NodeFeatureRenderFn: NodeDisk, + WorkerFeatureRenderFn: WorkerDisk, + nodeAccessor: nodeDiskAccessor, +}; + +export default diskFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Errors.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Errors.tsx index cd99115ea..e711e2563 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Errors.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Errors.tsx @@ -1,68 +1,80 @@ import { Typography } from "@material-ui/core"; import React from "react"; import SpanButton from "../../../../common/SpanButton"; +import { Accessor } from "../../../../common/tableUtils"; +import { sum } from "../../../../common/util"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + Node, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; -export const makeClusterErrors = (errorCounts: { - [ip: string]: { - perWorker: { - [pid: string]: number; - }; - total: number; - }; -}): ClusterFeatureComponent => ({ nodes }) => { - let totalErrorCount = 0; - for (const node of nodes) { - if (node.ip in errorCounts) { - totalErrorCount += errorCounts[node.ip].total; - } - } - return totalErrorCount === 0 ? ( +const nodeErrCount = (node: Node) => + node.error_count ? sum(Object.values(node.error_count)) : 0; + +const ClusterErrors: ClusterFeatureRenderFn = ({ nodes }) => { + const totalErrCount = sum(nodes.map(nodeErrCount)); + return totalErrCount === 0 ? ( No errors ) : ( - {totalErrorCount.toLocaleString()}{" "} - {totalErrorCount === 1 ? "error" : "errors"} + {totalErrCount.toLocaleString()}{" "} + {totalErrCount === 1 ? "error" : "errors"} ); }; -export const makeNodeErrors = ( - errorCounts: { - perWorker: { [pid: string]: number }; - total: number; - }, +const makeNodeErrors = ( setErrorDialog: (hostname: string, pid: number | null) => void, -): NodeFeatureComponent => ({ node }) => - errorCounts.total === 0 ? ( +): NodeFeatureRenderFn => ({ node }) => { + const nodeErrorCount = nodeErrCount(node); + return nodeErrorCount === 0 ? ( No errors ) : ( setErrorDialog(node.hostname, null)}> - View all errors ({errorCounts.total.toLocaleString()}) + View all errors ({nodeErrorCount.toLocaleString()}) ); +}; -export const makeWorkerErrors = ( - errorCounts: { - perWorker: { [pid: string]: number }; - total: number; - }, +const nodeErrorsAccessor: Accessor = ({ node }) => + nodeErrCount(node); + +const makeWorkerErrors = ( setErrorDialog: (hostname: string, pid: number | null) => void, -): WorkerFeatureComponent => ({ node, worker }) => - errorCounts.perWorker[worker.pid] ? ( +): WorkerFeatureRenderFn => ({ node, worker }) => { + const workerErrorCount = node.error_count?.[worker.pid] || 0; + return workerErrorCount !== 0 ? ( setErrorDialog(node.hostname, worker.pid)}> - View errors ({errorCounts.perWorker[worker.pid].toLocaleString()}) + View errors ({workerErrorCount.toLocaleString()}) ) : ( No errors ); +}; + +const workerErrorsAccessor: Accessor = ({ node, worker }) => + node.error_count?.[worker.pid] || 0; + +const makeErrorsFeature = ( + setErrorDialog: (hostname: string, pid: number | null) => void, +): NodeInfoFeature => ({ + id: "errors", + ClusterFeatureRenderFn: ClusterErrors, + WorkerFeatureRenderFn: makeWorkerErrors(setErrorDialog), + NodeFeatureRenderFn: makeNodeErrors(setErrorDialog), + nodeAccessor: nodeErrorsAccessor, + workerAccessor: workerErrorsAccessor, +}); + +export default makeErrorsFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx index 8c83f9359..6a6615c45 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx @@ -1,23 +1,28 @@ import { Box, Tooltip, Typography } from "@material-ui/core"; import React from "react"; -import { GPUStats, ResourceSlot } from "../../../../api"; +import { GPUStats, RayletWorkerStats, ResourceSlot } from "../../../../api"; import { RightPaddedTypography } from "../../../../common/CustomTypography"; +import { Accessor } from "../../../../common/tableUtils"; + import UsageBar from "../../../../common/UsageBar"; import { getWeightedAverage, sum } from "../../../../common/util"; import { - ClusterFeatureComponent, + ClusterFeatureRenderFn, Node, - NodeFeatureComponent, - WorkerFeatureComponent, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; const GPU_COL_WIDTH = 120; -const clusterUtilization = (nodes: Array): number => { +const clusterGPUUtilization = (nodes: Array): number => { const utils = nodes .map((node) => ({ weight: node.gpus.length, - value: nodeAverageUtilization(node), + value: nodeGPUUtilization(node), })) .filter((util) => !isNaN(util.value)); if (utils.length === 0) { @@ -26,7 +31,7 @@ const clusterUtilization = (nodes: Array): number => { return getWeightedAverage(utils); }; -const nodeAverageUtilization = (node: Node): number => { +const nodeGPUUtilization = (node: Node): number => { if (!node.gpus || node.gpus.length === 0) { return NaN; } @@ -35,8 +40,11 @@ const nodeAverageUtilization = (node: Node): number => { return avgUtilization; }; -export const ClusterGPU: ClusterFeatureComponent = ({ nodes }) => { - const clusterAverageUtilization = clusterUtilization(nodes); +const nodeGPUAccessor: Accessor = ({ node }) => + nodeGPUUtilization(node); + +const ClusterGPU: ClusterFeatureRenderFn = ({ nodes }) => { + const clusterAverageUtilization = clusterGPUUtilization(nodes); return (
{isNaN(clusterAverageUtilization) ? ( @@ -53,7 +61,7 @@ export const ClusterGPU: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeGPU: NodeFeatureComponent = ({ node }) => { +const NodeGPU: NodeFeatureRenderFn = ({ node }) => { const hasGPU = node.gpus !== undefined && node.gpus.length !== 0; return (
@@ -111,7 +119,7 @@ const WorkerGPUEntry: React.FC = ({ resourceSlot }) => { ); }; -export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => { +const WorkerGPU: WorkerFeatureRenderFn = ({ rayletWorker }) => { const workerRes = rayletWorker?.coreWorkerStats.usedResources; const workerUsedGPUResources = workerRes?.["GPU"]; let message; @@ -138,3 +146,31 @@ export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => { } return
{message}
; }; + +const workerGPUUtilization = (rayletWorker: RayletWorkerStats | null) => { + const workerRes = rayletWorker?.coreWorkerStats.usedResources; + const workerUsedGPUResources = workerRes?.["GPU"]; + return ( + workerUsedGPUResources && + sum( + workerUsedGPUResources.resourceSlots.map( + (resourceSlot) => resourceSlot.allocation, + ), + ) + ); +}; + +const workerGPUAccessor: Accessor = ({ rayletWorker }) => { + return workerGPUUtilization(rayletWorker) ?? 0; +}; + +const gpuFeature: NodeInfoFeature = { + id: "gpu", + ClusterFeatureRenderFn: ClusterGPU, + NodeFeatureRenderFn: NodeGPU, + WorkerFeatureRenderFn: WorkerGPU, + nodeAccessor: nodeGPUAccessor, + workerAccessor: workerGPUAccessor, +}; + +export default gpuFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GRAM.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GRAM.tsx index 15e57457b..a900e4596 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GRAM.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GRAM.tsx @@ -3,13 +3,17 @@ import React from "react"; import { GPUStats } from "../../../../api"; import { RightPaddedTypography } from "../../../../common/CustomTypography"; import { MiBRatioNoPercent } from "../../../../common/formatUtils"; +import { Accessor } from "../../../../common/tableUtils"; import UsageBar from "../../../../common/UsageBar"; import { getWeightedAverage, sum } from "../../../../common/util"; import { - ClusterFeatureComponent, + ClusterFeatureRenderFn, Node, - NodeFeatureComponent, - WorkerFeatureComponent, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; const GRAM_COL_WIDTH = 120; @@ -25,6 +29,11 @@ const nodeGRAMUtilization = (node: Node) => { return avgUtilization * 100; }; +const nodeGRAMAccessor: Accessor = ({ node }) => { + const nodeGRAMUtil = nodeGRAMUtilization(node); + return isNaN(nodeGRAMUtil) ? -1 : nodeGRAMUtil; +}; + const clusterGRAMUtilization = (nodes: Array) => { const utils = nodes .map((node) => ({ @@ -38,7 +47,7 @@ const clusterGRAMUtilization = (nodes: Array) => { return getWeightedAverage(utils); }; -export const ClusterGRAM: ClusterFeatureComponent = ({ nodes }) => { +export const ClusterGRAM: ClusterFeatureRenderFn = ({ nodes }) => { const clusterAverageUtilization = clusterGRAMUtilization(nodes); return (
@@ -56,7 +65,7 @@ export const ClusterGRAM: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeGRAM: NodeFeatureComponent = ({ node }) => { +export const NodeGRAM: NodeFeatureRenderFn = ({ node }) => { const nodeGRAMEntries = node.gpus.map((gpu, i) => { const props = { gpuName: gpu.name, @@ -104,7 +113,7 @@ const GRAMEntry: React.FC = ({ ); }; -export const WorkerGRAM: WorkerFeatureComponent = ({ worker, node }) => { +export const WorkerGRAM: WorkerFeatureRenderFn = ({ worker, node }) => { const workerGRAMEntries = node.gpus .map((gpu, i) => { const process = gpu.processes.find( @@ -131,3 +140,33 @@ export const WorkerGRAM: WorkerFeatureComponent = ({ worker, node }) => {
{workerGRAMEntries}
); }; + +const workerGRAMUtilization = (worker: any, node: Node) => { + const workerProcessPerGPU = node.gpus + .map((gpu) => gpu.processes) + .map((processes) => + processes.find((process) => process.pid === worker.pid), + ); + const workerUtilPerGPU = workerProcessPerGPU.map( + (proc) => proc?.gpu_memory_usage || 0, + ); + return sum(workerUtilPerGPU); +}; + +const workerGRAMAccessor: Accessor = ({ worker, node }) => { + if (node.gpus.length === 0) { + return -1; + } + return workerGRAMUtilization(worker, node); +}; + +const gramFeature: NodeInfoFeature = { + id: "gram", + ClusterFeatureRenderFn: ClusterGRAM, + NodeFeatureRenderFn: NodeGRAM, + WorkerFeatureRenderFn: WorkerGRAM, + nodeAccessor: nodeGRAMAccessor, + workerAccessor: workerGRAMAccessor, +}; + +export default gramFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Host.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Host.tsx index c1eaf6cce..d35454832 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Host.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Host.tsx @@ -1,29 +1,45 @@ import React from "react"; +import { Accessor } from "../../../../common/tableUtils"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterHost: ClusterFeatureComponent = ({ nodes }) => ( +export const ClusterHost: ClusterFeatureRenderFn = ({ nodes }) => ( Totals ({nodes.length.toLocaleString()}{" "} {nodes.length === 1 ? "host" : "hosts"}) ); -export const NodeHost: NodeFeatureComponent = ({ node }) => ( +export const NodeHost: NodeFeatureRenderFn = ({ node }) => ( {node.hostname} ({node.ip}) ); +export const nodeHostAccessor: Accessor = ({ node }) => + node.hostname; + // Ray worker process titles have one of the following forms: `ray::IDLE`, // `ray::function()`, `ray::Class`, or `ray::Class.method()`. We extract the // first portion here for display in the "Host" column. Note that this will // always be `ray` under the current setup, but it may vary in the future. -export const WorkerHost: WorkerFeatureComponent = ({ worker }) => ( +export const WorkerHost: WorkerFeatureRenderFn = ({ worker }) => ( {worker.cmdline[0].split("::", 2)[0]} (PID: {worker.pid}) ); + +const hostFeature: NodeInfoFeature = { + id: "host", + ClusterFeatureRenderFn: ClusterHost, + NodeFeatureRenderFn: NodeHost, + WorkerFeatureRenderFn: WorkerHost, + nodeAccessor: nodeHostAccessor, +}; + +export default hostFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Logs.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Logs.tsx index b6a099d18..aae7b7c9c 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Logs.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Logs.tsx @@ -1,26 +1,23 @@ import { Typography } from "@material-ui/core"; import React from "react"; import SpanButton from "../../../../common/SpanButton"; +import { Accessor } from "../../../../common/tableUtils"; +import { sum } from "../../../../common/util"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + Node, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; -export const makeClusterLogs = (logCounts: { - [ip: string]: { - perWorker: { - [pid: string]: number; - }; - total: number; - }; -}): ClusterFeatureComponent => ({ nodes }) => { - let totalLogCount = 0; - for (const node of nodes) { - if (node.ip in logCounts) { - totalLogCount += logCounts[node.ip].total; - } - } +const nodeLogCount = (node: Node) => + node.log_count ? sum(Object.values(node.log_count)) : 0; + +const ClusterLogs: ClusterFeatureRenderFn = ({ nodes }) => { + const totalLogCount = sum(nodes.map(nodeLogCount)); return totalLogCount === 0 ? ( No logs @@ -32,38 +29,55 @@ export const makeClusterLogs = (logCounts: { ); }; -export const makeNodeLogs = ( - logCounts: { - perWorker: { [pid: string]: number }; - total: number; - }, +const makeNodeLogs = ( setLogDialog: (hostname: string, pid: number | null) => void, -): NodeFeatureComponent => ({ node }) => - logCounts.total === 0 ? ( +): NodeFeatureRenderFn => ({ node }) => { + const logCount = nodeLogCount(node); + return logCount === 0 ? ( No logs ) : ( setLogDialog(node.hostname, null)}> - View all logs ({logCounts.total.toLocaleString()}{" "} - {logCounts.total === 1 ? "line" : "lines"}) + View all logs ({logCount.toLocaleString()}{" "} + {logCount === 1 ? "line" : "lines"}) ); +}; -export const makeWorkerLogs = ( - logCounts: { - perWorker: { [pid: string]: number }; - total: number; - }, +const nodeLogsAccessor: Accessor = ({ node }) => + node.log_count ? sum(Object.values(node.log_count)) : 0; + +const makeWorkerLogs = ( setLogDialog: (hostname: string, pid: number | null) => void, -): WorkerFeatureComponent => ({ node, worker }) => - logCounts.perWorker[worker.pid] ? ( +): WorkerFeatureRenderFn => ({ node, worker }) => { + const workerLogCount = node.log_count?.[worker.pid] || 0; + return workerLogCount !== 0 ? ( setLogDialog(node.hostname, worker.pid)}> - View log ({logCounts.perWorker[worker.pid].toLocaleString()}{" "} - {logCounts.perWorker[worker.pid] === 1 ? "line" : "lines"}) + View log ({workerLogCount.toLocaleString()}{" "} + {workerLogCount === 1 ? "line" : "lines"}) ) : ( No logs ); +}; + +const workerLogsAccessor: Accessor = ({ worker, node }) => { + const workerLogCount = node.log_count?.[worker.pid] || 0; + return workerLogCount; +}; + +const makeLogsFeature = ( + setLogDialog: (hostname: string, pid: number | null) => void, +): NodeInfoFeature => ({ + id: "logs", + ClusterFeatureRenderFn: ClusterLogs, + WorkerFeatureRenderFn: makeWorkerLogs(setLogDialog), + NodeFeatureRenderFn: makeNodeLogs(setLogDialog), + workerAccessor: workerLogsAccessor, + nodeAccessor: nodeLogsAccessor, +}); + +export default makeLogsFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/RAM.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/RAM.tsx index 434d0b540..eaf67d111 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/RAM.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/RAM.tsx @@ -1,13 +1,17 @@ import React from "react"; import { formatByteAmount, formatUsage } from "../../../../common/formatUtils"; +import { Accessor } from "../../../../common/tableUtils"; import UsageBar from "../../../../common/UsageBar"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterRAM: ClusterFeatureComponent = ({ nodes }) => { +export const ClusterRAM: ClusterFeatureRenderFn = ({ nodes }) => { let used = 0; let total = 0; for (const node of nodes) { @@ -22,16 +26,33 @@ export const ClusterRAM: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeRAM: NodeFeatureComponent = ({ node }) => ( +export const NodeRAM: NodeFeatureRenderFn = ({ node }) => ( ); -export const WorkerRAM: WorkerFeatureComponent = ({ node, worker }) => ( +export const nodeRAMAccessor: Accessor = ({ node }) => + 100 * (node.mem[0] - node.mem[1]); + +export const WorkerRAM: WorkerFeatureRenderFn = ({ node, worker }) => ( ); + +export const workerRAMAccessor: Accessor = ({ worker }) => + worker.memory_info.rss; + +const ramFeature: NodeInfoFeature = { + id: "ram", + ClusterFeatureRenderFn: ClusterRAM, + NodeFeatureRenderFn: NodeRAM, + WorkerFeatureRenderFn: WorkerRAM, + nodeAccessor: nodeRAMAccessor, + workerAccessor: workerRAMAccessor, +}; + +export default ramFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Received.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Received.tsx index c5cdd3955..9f53eeafe 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Received.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Received.tsx @@ -1,13 +1,16 @@ import { Typography } from "@material-ui/core"; import React from "react"; import { formatByteAmount } from "../../../../common/formatUtils"; +import { Accessor } from "../../../../common/tableUtils"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterReceived: ClusterFeatureComponent = ({ nodes }) => { +export const ClusterReceived: ClusterFeatureRenderFn = ({ nodes }) => { let totalReceived = 0; for (const node of nodes) { totalReceived += node.net[1]; @@ -19,12 +22,25 @@ export const ClusterReceived: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeReceived: NodeFeatureComponent = ({ node }) => ( +export const NodeReceived: NodeFeatureRenderFn = ({ node }) => ( {formatByteAmount(node.net[1], "mebibyte")}/s ); -export const WorkerReceived: WorkerFeatureComponent = () => ( +export const nodeReceivedAccessor: Accessor = ({ node }) => + node.net[1]; + +export const WorkerReceived: WorkerFeatureRenderFn = () => ( N/A ); + +const receivedFeature: NodeInfoFeature = { + id: "received", + ClusterFeatureRenderFn: ClusterReceived, + NodeFeatureRenderFn: NodeReceived, + WorkerFeatureRenderFn: WorkerReceived, + nodeAccessor: nodeReceivedAccessor, +}; + +export default receivedFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Sent.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Sent.tsx index e8c19d7c4..7b9e3408b 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Sent.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Sent.tsx @@ -1,13 +1,16 @@ import { Typography } from "@material-ui/core"; import React from "react"; import { formatByteAmount } from "../../../../common/formatUtils"; +import { Accessor } from "../../../../common/tableUtils"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterSent: ClusterFeatureComponent = ({ nodes }) => { +export const ClusterSent: ClusterFeatureRenderFn = ({ nodes }) => { let totalSent = 0; for (const node of nodes) { totalSent += node.net[0]; @@ -17,12 +20,25 @@ export const ClusterSent: ClusterFeatureComponent = ({ nodes }) => { ); }; -export const NodeSent: NodeFeatureComponent = ({ node }) => ( +export const NodeSent: NodeFeatureRenderFn = ({ node }) => ( {formatByteAmount(node.net[0], "mebibyte")}/s ); -export const WorkerSent: WorkerFeatureComponent = () => ( +export const nodeSentAccessor: Accessor = ({ node }) => + node.net[0]; + +export const WorkerSent: WorkerFeatureRenderFn = () => ( N/A ); + +const sentFeature: NodeInfoFeature = { + id: "sent", + ClusterFeatureRenderFn: ClusterSent, + NodeFeatureRenderFn: NodeSent, + WorkerFeatureRenderFn: WorkerSent, + nodeAccessor: nodeSentAccessor, +}; + +export default sentFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Uptime.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Uptime.tsx index af936c687..5e6a11a6c 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Uptime.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Uptime.tsx @@ -1,26 +1,46 @@ import { Typography } from "@material-ui/core"; import React from "react"; import { formatDuration } from "../../../../common/formatUtils"; +import { Accessor } from "../../../../common/tableUtils"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureData, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureData, + WorkerFeatureRenderFn, } from "./types"; const getUptime = (bootTime: number) => Date.now() / 1000 - bootTime; -export const ClusterUptime: ClusterFeatureComponent = ({ nodes }) => ( +export const ClusterUptime: ClusterFeatureRenderFn = ({ nodes }) => ( N/A ); -export const NodeUptime: NodeFeatureComponent = ({ node }) => ( +export const NodeUptime: NodeFeatureRenderFn = ({ node }) => ( {formatDuration(getUptime(node.boot_time))} ); -export const WorkerUptime: WorkerFeatureComponent = ({ worker }) => ( +export const nodeUptimeAccessor: Accessor = ({ node }) => + getUptime(node.boot_time); + +export const WorkerUptime: WorkerFeatureRenderFn = ({ worker }) => ( {formatDuration(getUptime(worker.create_time))} ); + +const workerUptimeAccessor: Accessor = ({ worker }) => + getUptime(worker.create_time); + +const uptimeFeature: NodeInfoFeature = { + id: "uptime", + NodeFeatureRenderFn: NodeUptime, + WorkerFeatureRenderFn: WorkerUptime, + nodeAccessor: nodeUptimeAccessor, + workerAccessor: workerUptimeAccessor, +}; + +export default uptimeFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Workers.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Workers.tsx index 5b56dc46a..1fa1ab628 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Workers.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/Workers.tsx @@ -1,16 +1,17 @@ import React from "react"; import { - ClusterFeatureComponent, - NodeFeatureComponent, - WorkerFeatureComponent, + ClusterFeatureRenderFn, + NodeFeatureRenderFn, + NodeInfoFeature, + WorkerFeatureRenderFn, } from "./types"; -export const ClusterWorkers = ( - totalWorkers: number, -): ClusterFeatureComponent => ({ nodes }) => { +export const ClusterWorkers: ClusterFeatureRenderFn = ({ nodes }) => { let totalCpus = 0; + let totalWorkers = 0; for (const node of nodes) { totalCpus += node.cpus[0]; + totalWorkers += node.workers.length; } return ( @@ -21,10 +22,9 @@ export const ClusterWorkers = ( ); }; -export const NodeWorkers = (totalWorkers: number): NodeFeatureComponent => ({ - node, -}) => { +export const NodeWorkers: NodeFeatureRenderFn = ({ node }) => { const cpus = node.cpus[0]; + const totalWorkers = node.workers.length; return ( {totalWorkers.toLocaleString()}{" "} @@ -37,6 +37,15 @@ export const NodeWorkers = (totalWorkers: number): NodeFeatureComponent => ({ // Ray worker process titles have one of the following forms: `ray::IDLE`, // `ray::function()`, `ray::Class`, or `ray::Class.method()`. We extract the // second portion here for display in the "Workers" column. -export const WorkerWorkers: WorkerFeatureComponent = ({ worker }) => ( +export const WorkerWorkers: WorkerFeatureRenderFn = ({ worker }) => ( {worker.cmdline[0].split("::", 2)[1]} ); + +const workersFeature: NodeInfoFeature = { + id: "workers", + ClusterFeatureRenderFn: ClusterWorkers, + NodeFeatureRenderFn: NodeWorkers, + WorkerFeatureRenderFn: WorkerWorkers, +}; + +export default workersFeature; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx index bcbf7ff09..32c87e66d 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx @@ -1,24 +1,46 @@ import React from "react"; import { NodeInfoResponse, RayletWorkerStats } from "../../../../api"; +import { Accessor } from "../../../../common/tableUtils"; type ArrayType = T extends Array ? U : never; export type Node = ArrayType; export type Worker = ArrayType; type ClusterFeatureData = { nodes: Node[] }; -type NodeFeatureData = { node: Node }; -type WorkerFeatureData = { +export type NodeFeatureData = { node: Node }; +export type WorkerFeatureData = { node: Node; worker: Worker; rayletWorker: RayletWorkerStats | null; }; -export type ClusterFeatureComponent = ( +export type ClusterFeatureRenderFn = ( data: ClusterFeatureData, ) => React.ReactElement; -export type NodeFeatureComponent = ( - data: NodeFeatureData, -) => React.ReactElement; -export type WorkerFeatureComponent = ( +export type NodeFeatureRenderFn = (data: NodeFeatureData) => React.ReactElement; +export type WorkerFeatureRenderFn = ( data: WorkerFeatureData, ) => React.ReactElement; + +export type NodeInfoFeature = { + id: nodeInfoColumnId; + WorkerFeatureRenderFn: WorkerFeatureRenderFn; + NodeFeatureRenderFn: NodeFeatureRenderFn; + ClusterFeatureRenderFn?: ClusterFeatureRenderFn; + workerAccessor?: Accessor; + nodeAccessor?: Accessor; +}; + +export type nodeInfoColumnId = + | "host" + | "workers" + | "uptime" + | "cpu" + | "ram" + | "gpu" + | "gram" + | "disk" + | "sent" + | "received" + | "logs" + | "errors"; diff --git a/python/ray/dashboard/client/src/pages/dashboard/state.ts b/python/ray/dashboard/client/src/pages/dashboard/state.ts index fd8d3ed17..92410ea38 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/state.ts +++ b/python/ray/dashboard/client/src/pages/dashboard/state.ts @@ -7,6 +7,7 @@ import { TuneAvailabilityResponse, TuneJobResponse, } from "../../api"; +import { filterObj } from "../../common/util"; const name = "dashboard"; @@ -53,8 +54,11 @@ const slice = createSlice({ rayletInfo: RayletInfoResponse; }>, ) => { - state.nodeInfo = action.payload.nodeInfo; state.rayletInfo = action.payload.rayletInfo; + state.nodeInfo = filterNonClusterWorkerInfo( + action.payload.rayletInfo, + action.payload.nodeInfo, + ); state.lastUpdatedAt = Date.now(); }, setTuneInfo: (state, action: PayloadAction) => { @@ -83,5 +87,55 @@ const slice = createSlice({ }, }); +const clusterWorkerPids = ( + rayletInfo: RayletInfoResponse, +): Map> => { + // Groups PIDs registered with the raylet by node IP address + // This is used to filter out processes belonging to other ray clusters. + const nodeMap = new Map(); + const workerPids = new Set(); + for (const [nodeIp, { workersStats }] of Object.entries(rayletInfo.nodes)) { + for (const worker of workersStats) { + if (!worker.isDriver) { + workerPids.add(worker.pid); + } + } + nodeMap.set(nodeIp, workerPids); + } + return nodeMap; +}; + +const filterNonClusterWorkerInfo = ( + rayletInfo: RayletInfoResponse, + nodeInfo: NodeInfoResponse, +) => { + // The back-end that generates the NodeInfoResponse does not remove worker + // information of workers that belong to other clusters, so we do it here. + const workerPidsByIP = clusterWorkerPids(rayletInfo); + const filteredClients = nodeInfo.clients.map((client) => { + const workerPids = workerPidsByIP.get(client.ip); + const workers = client.workers.filter((worker) => + workerPids?.has(worker.pid), + ); + const logs = client.log_count + ? filterObj(client.log_count, ([pid, _]: [string, any]) => + workerPids?.has(parseInt(pid)), + ) + : {}; + const errors = client.error_count + ? filterObj(client.error_count, ([pid, _]: [string, any]) => + workerPids?.has(parseInt(pid)), + ) + : {}; + client.workers = workers; + client.log_count = logs; + client.error_count = errors; + return client; + }); + return { + clients: filteredClients, + }; +}; + export const dashboardActions = slice.actions; export const dashboardReducer = slice.reducer; diff --git a/python/ray/dashboard/node_stats.py b/python/ray/dashboard/node_stats.py index 3edc80707..f8c401292 100644 --- a/python/ray/dashboard/node_stats.py +++ b/python/ray/dashboard/node_stats.py @@ -23,6 +23,7 @@ class NodeStats(threading.Thread): redis_address, password=redis_password) self._node_stats = {} + self._ip_to_hostname = {} self._addr_to_owner_addr = {} self._addr_to_actor_id = {} self._addr_to_extra_info_dict = {} @@ -55,23 +56,17 @@ class NodeStats(threading.Thread): super().__init__() - def _calculate_log_counts(self): - return { - ip: { - pid: len(logs_for_pid) - for pid, logs_for_pid in logs_for_ip.items() - } - for ip, logs_for_ip in self._logs.items() - } + def _insert_log_counts(self): + for ip, logs_by_pid in self._logs.items(): + hostname = self._ip_to_hostname[ip] + logs_by_pid = {pid: len(logs) for pid, logs in logs_by_pid.items()} + self._node_stats[hostname]["log_count"] = logs_by_pid - def _calculate_error_counts(self): - return { - ip: { - pid: len(errors_for_pid) - for pid, errors_for_pid in errors_for_ip.items() - } - for ip, errors_for_ip in self._errors.items() - } + def _insert_error_counts(self): + for ip, errs_by_pid in self._errors.items(): + hostname = self._ip_to_hostname[ip] + errs_by_pid = {pid: len(errs) for pid, errs in errs_by_pid.items()} + self._node_stats[hostname]["error_count"] = errs_by_pid def _purge_outdated_stats(self): def current(then, now): @@ -89,14 +84,12 @@ class NodeStats(threading.Thread): def get_node_stats(self): with self._node_stats_lock: self._purge_outdated_stats() + self._insert_error_counts() + self._insert_log_counts() node_stats = sorted( (v for v in self._node_stats.values()), key=itemgetter("boot_time")) - return { - "clients": node_stats, - "log_counts": self._calculate_log_counts(), - "error_counts": self._calculate_error_counts(), - } + return {"clients": node_stats} def get_actor_tree(self, workers_info_by_node, infeasible_tasks, ready_tasks): @@ -252,6 +245,7 @@ class NodeStats(threading.Thread): } elif channel == ray.gcs_utils.RAY_REPORTER_PUBSUB_PATTERN: data = json.loads(ray.utils.decode(data)) + self._ip_to_hostname[data["ip"]] = data["hostname"] self._node_stats[data["hostname"]] = data else: logger.warning("Unexpected channel data received, "