/**
 * Sub stat of GPU stats: the GPU utilization of a single process on a
 * single GPU.
 */
export type GPUProcessStats = {
  username: string;
  command: string;
  gpu_memory_usage: number;
  pid: number;
};

/**
 * Stats fetched from a node about a single GPU.
 *
 * `processes` holds one entry per process reported on this GPU.
 */
export type GPUStats = {
  uuid: string;
  name: string;
  temperature_gpu: number;
  fan_speed: number;
  utilization_gpu: number;
  power_draw: number;
  enforced_power_limit: number;
  memory_used: number;
  memory_total: number;
  // A bare `Array` is not a valid type; the element type must be spelled out.
  processes: Array<GPUProcessStats>;
};
/**
 * Computes the weighted mean of `input`, i.e.
 * sum(weight * value) / sum(weight).
 *
 * @param input - Entries pairing each value with its weight.
 * @returns The weighted mean, or 0 when `input` is empty or the weights add
 *   up to zero (so the result is never the NaN of a 0 / 0 division).
 */
export const getWeightedAverage = (
  input: {
    weight: number;
    value: number;
  }[],
) => {
  let totalWeightTimesValue = 0;
  let totalWeight = 0;
  for (const { weight, value } of input) {
    totalWeightTimesValue += weight * value;
    totalWeight += weight;
  }

  // Covers the empty list as well as entries whose weights sum to zero.
  if (totalWeight === 0) {
    return 0;
  }
  return totalWeightTimesValue / totalWeight;
};

/**
 * Adds up all numbers in `vals`.
 *
 * @returns The total; 0 for an empty array.
 */
export const sum = (vals: number[]) => vals.reduce((acc, val) => acc + val, 0);
a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx index 7c8d88ecd..30f182b94 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeInfo.tsx @@ -13,6 +13,7 @@ import { import React from "react"; import { connect } from "react-redux"; import { RayletInfoResponse } from "../../../api"; +import { sum } from "../../../common/util"; import { StoreState } from "../../../store"; import Errors from "./dialogs/errors/Errors"; import Logs from "./dialogs/logs/Logs"; @@ -117,9 +118,11 @@ class NodeInfo extends React.Component< // the node info can contain data from more than one cluster // if more than one cluster is running on a machine. const clusterWorkerPidsByIp = clusterWorkerPids(rayletInfo); - const clusterTotalWorkers = Array.from( - clusterWorkerPidsByIp.values(), - ).reduce((acc, workerSet) => acc + workerSet.size, 0); + const clusterTotalWorkers = sum( + Array.from(clusterWorkerPidsByIp.values()).map( + (workerSet) => workerSet.size, + ), + ); // Initialize inner structure of the count objects for (const client of nodeInfo.clients) { const clusterWorkerPids = clusterWorkerPidsByIp.get(client.ip); @@ -129,9 +132,8 @@ class NodeInfo extends React.Component< const filteredLogEntries = Object.entries( nodeInfo.log_counts[client.ip] || {}, ).filter(([pid, _]) => clusterWorkerPids.has(pid)); - const totalLogEntries = filteredLogEntries.reduce( - (acc, [_, count]) => acc + count, - 0, + const totalLogEntries = sum( + filteredLogEntries.map(([_, count]) => count), ); logCounts[client.ip] = { perWorker: Object.fromEntries(filteredLogEntries), @@ -141,9 +143,8 @@ class NodeInfo extends React.Component< const filteredErrEntries = Object.entries( nodeInfo.error_counts[client.ip] || {}, ).filter(([pid, _]) => clusterWorkerPids.has(pid)); - const totalErrEntries = 
filteredErrEntries.reduce( - (acc, [_, count]) => acc + count, - 0, + const totalErrEntries = sum( + filteredErrEntries.map(([_, count]) => count), ); errorCounts[client.ip] = { perWorker: Object.fromEntries(filteredErrEntries), @@ -162,6 +163,8 @@ class NodeInfo extends React.Component< Uptime CPU RAM + GPU + GRAM Disk Sent Received diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx index 8f2179c93..08bec78d1 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/NodeRowGroup.tsx @@ -18,6 +18,8 @@ import { import { NodeCPU, WorkerCPU } from "./features/CPU"; import { NodeDisk, WorkerDisk } from "./features/Disk"; import { makeNodeErrors, makeWorkerErrors } from "./features/Errors"; +import { NodeGPU, WorkerGPU } from "./features/GPU"; +import { NodeGRAM, WorkerGRAM } from "./features/GRAM"; import { NodeHost, WorkerHost } from "./features/Host"; import { makeNodeLogs, makeWorkerLogs } from "./features/Logs"; import { NodeRAM, WorkerRAM } from "./features/RAM"; @@ -108,6 +110,8 @@ class NodeRowGroup extends React.Component< { NodeFeature: NodeUptime, WorkerFeature: WorkerUptime }, { NodeFeature: NodeCPU, WorkerFeature: WorkerCPU }, { NodeFeature: NodeRAM, WorkerFeature: WorkerRAM }, + { NodeFeature: NodeGPU, WorkerFeature: WorkerGPU }, + { NodeFeature: NodeGRAM, WorkerFeature: WorkerGRAM }, { NodeFeature: NodeDisk, WorkerFeature: WorkerDisk }, { NodeFeature: NodeSent, WorkerFeature: WorkerSent }, { NodeFeature: NodeReceived, WorkerFeature: WorkerReceived }, @@ -153,16 +157,27 @@ class NodeRowGroup extends React.Component< )} - {clusterWorkers.map((worker, index: number) => ( - - - {features.map(({ WorkerFeature }, index) => ( - - - - ))} - - ))} + {clusterWorkers.map((worker, index: number) => { + const rayletWorker = + raylet?.workersStats.find( + 
(rayletWorker) => worker.pid === rayletWorker.pid, + ) || null; + + return ( + + + {features.map(({ WorkerFeature }, index) => ( + + + + ))} + + ); + })} )} diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx index d5b7a9b72..1c4297c56 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/TotalRow.tsx @@ -12,6 +12,8 @@ import { NodeInfoResponse } from "../../../api"; import { ClusterCPU } from "./features/CPU"; import { ClusterDisk } from "./features/Disk"; import { makeClusterErrors } from "./features/Errors"; +import { ClusterGPU } from "./features/GPU"; +import { ClusterGRAM } from "./features/GRAM"; import { ClusterHost } from "./features/Host"; import { makeClusterLogs } from "./features/Logs"; import { ClusterRAM } from "./features/RAM"; @@ -72,6 +74,8 @@ class TotalRow extends React.Component> { { ClusterFeature: ClusterUptime }, { ClusterFeature: ClusterCPU }, { ClusterFeature: ClusterRAM }, + { ClusterFeature: ClusterGPU }, + { ClusterFeature: ClusterGRAM }, { ClusterFeature: ClusterDisk }, { ClusterFeature: ClusterSent }, { ClusterFeature: ClusterReceived }, diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx index 1ce9b285a..94bbad44d 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/CPU.tsx @@ -1,30 +1,12 @@ import React from "react"; import UsageBar from "../../../../common/UsageBar"; +import { getWeightedAverage } from "../../../../common/util"; import { ClusterFeatureComponent, NodeFeatureComponent, WorkerFeatureComponent, } from "./types"; -const getWeightedAverage = ( - input: { - weight: number; - value: number; - }[], -) => { - if 
/**
 * Average GPU utilization of a single node: the plain mean of
 * `utilization_gpu` over the node's GPUs.
 *
 * @returns The mean, or NaN when the node reports no GPUs (`gpus` missing or
 *   empty). The components in this file check for NaN and show "N/A".
 */
const nodeUtilization = (node: Node): number => {
  if (!node.gpus || node.gpus.length === 0) {
    return NaN;
  }
  const utilizationSum = sum(node.gpus.map((gpu) => gpu.utilization_gpu));
  return utilizationSum / node.gpus.length;
};

/**
 * Average GPU utilization across the cluster: each node's average weighted
 * by that node's GPU count. Nodes without GPUs are left out.
 *
 * @returns The weighted average, or NaN when no node reports any GPU.
 */
const clusterUtilization = (nodes: Array<Node>): number => {
  const utils = nodes
    .map((node) => ({
      // Same missing-`gpus` tolerance as nodeUtilization; reading `.length`
      // off an absent field would throw before the NaN filter below runs.
      weight: node.gpus?.length ?? 0,
      value: nodeUtilization(node),
    }))
    .filter((util) => !isNaN(util.value));
  if (utils.length === 0) {
    return NaN;
  }
  return getWeightedAverage(utils);
};
+ {isNaN(clusterAverageUtilization) ? ( + + N/A + + ) : ( + + )} +
+ ); +}; + +export const NodeGPU: NodeFeatureComponent = ({ node }) => { + const nodeUtil = nodeUtilization(node); + return ( +
+ {isNaN(nodeUtil) ? ( + + N/A + + ) : ( + + )} +
+ ); +}; + +export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => { + const workerRes = rayletWorker?.coreWorkerStats.usedResources; + const workerUsedGPUResources = workerRes?.["GPU"] || NaN; + const message = isNaN(workerUsedGPUResources) ? ( + + N/A + + ) : ( + `${workerUsedGPUResources} GPUs in use` + ); + return
{message}
/**
 * Average GPU memory utilization of a single node, as a percentage: the
 * mean of `memory_used / memory_total` over the node's GPUs, times 100.
 *
 * @returns The percentage, or NaN when the node reports no GPUs (`gpus`
 *   missing or empty). The components in this file check for NaN and show
 *   "N/A".
 */
const nodeGRAMUtilization = (node: Node) => {
  // Tolerate a node entry without a `gpus` field instead of throwing on
  // `.length`.
  if (!node.gpus || node.gpus.length === 0) {
    return NaN;
  }
  const utilization = (gpu: GPUStats) => gpu.memory_used / gpu.memory_total;
  const utilizationSum = sum(node.gpus.map((gpu) => utilization(gpu)));
  const avgUtilization = utilizationSum / node.gpus.length;
  // Convert to a percent before returning
  return avgUtilization * 100;
};

/**
 * Average GPU memory utilization across the cluster: each node's percentage
 * weighted by that node's GPU count. Nodes without GPUs are left out.
 *
 * @returns The weighted average, or NaN when no node reports any GPU.
 */
const clusterGRAMUtilization = (nodes: Array<Node>) => {
  const utils = nodes
    .map((node) => ({
      // Guarded for the same reason as in nodeGRAMUtilization.
      weight: node.gpus?.length ?? 0,
      value: nodeGRAMUtilization(node),
    }))
    .filter((util) => !isNaN(util.value));
  if (utils.length === 0) {
    return NaN;
  }
  return getWeightedAverage(utils);
};
+ {isNaN(clusterAverageUtilization) ? ( + + N/A + + ) : ( + + )} +
+ ); +}; + +export const NodeGRAM: NodeFeatureComponent = ({ node }) => { + const gramUtil = nodeGRAMUtilization(node); + return ( +
+ {isNaN(gramUtil) ? ( + + N/A + + ) : ( + + )} +
+ ); +}; + +export const WorkerGRAM: WorkerFeatureComponent = ({ worker, node }) => { + const workerProcessPerGPU = node.gpus + .map((gpu) => gpu.processes) + .map((processes) => + processes.find((process) => process.pid === worker.pid), + ); + const workerUtilPerGPU = workerProcessPerGPU.map( + (proc) => proc?.gpu_memory_usage || 0, + ); + const totalNodeGRAM = sum(node.gpus.map((gpu) => gpu.memory_total)); + const usedGRAM = sum(workerUtilPerGPU); + return ( +
+ {node.gpus.length === 0 ? ( + + N/A + + ) : ( + + )} +
+ ); +}; diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx index c0c692370..bcbf7ff09 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/types.tsx @@ -1,13 +1,17 @@ import React from "react"; -import { NodeInfoResponse } from "../../../../api"; +import { NodeInfoResponse, RayletWorkerStats } from "../../../../api"; type ArrayType = T extends Array ? U : never; -type Node = ArrayType; -type Worker = ArrayType; +export type Node = ArrayType; +export type Worker = ArrayType; type ClusterFeatureData = { nodes: Node[] }; type NodeFeatureData = { node: Node }; -type WorkerFeatureData = { node: Node; worker: Worker }; +type WorkerFeatureData = { + node: Node; + worker: Worker; + rayletWorker: RayletWorkerStats | null; +}; export type ClusterFeatureComponent = ( data: ClusterFeatureData, diff --git a/python/ray/reporter.py b/python/ray/reporter.py index 4fc396318..437bae723 100644 --- a/python/ray/reporter.py +++ b/python/ray/reporter.py @@ -10,7 +10,6 @@ import platform import subprocess import sys from concurrent import futures - import ray import psutil import ray.ray_constants as ray_constants @@ -24,6 +23,13 @@ from ray.core.generated import reporter_pb2_grpc # entry/init points. 
@staticmethod
def get_gpu_usage():
    """Return one dict of stats per GPU, or an empty list.

    The list is empty when gpustat could not be imported or when the
    gpustat query raises; a failed query is logged at debug level.
    """
    if gpustat is None:
        return []

    try:
        queried_gpus = gpustat.new_query().gpus
    except Exception as e:
        logger.debug(
            "gpustat failed to retrieve GPU information: {}".format(e))
        queried_gpus = []

    # Note the keys of each entry have periods which throws off
    # javascript, so every "." in a key is changed to "_".
    return [{
        stat_name.replace(".", "_"): stat_value
        for stat_name, stat_value in gpu.entry.items()
    } for gpu in queried_gpus]