diff --git a/python/ray/dashboard/client/src/common/requestUtils.ts b/python/ray/dashboard/client/src/common/requestUtils.ts
new file mode 100644
index 000000000..918703dfb
--- /dev/null
+++ b/python/ray/dashboard/client/src/common/requestUtils.ts
@@ -0,0 +1,55 @@
+const base =
+  process.env.NODE_ENV === "development"
+    ? "http://localhost:8265"
+    : window.location.origin;
+
+// TODO(mitchellstern): Add JSON schema validation for the responses.
+/**
+ * Reads a dashboard v2 reply and returns its JSON envelope
+ * (`{ result, msg, data }`) typed as T.
+ *
+ * @throws Error carrying the server's `msg` when the HTTP status is not 2xx
+ *   or the envelope has `result: false`.
+ */
+const parseResponse = async <T>(response: Response): Promise<T> => {
+  const json = await response.json();
+
+  if (!response.ok || json.result === false) {
+    const msg =
+      typeof json.msg === "string"
+        ? json.msg
+        : `Request failed with status ${response.status}`;
+    throw new Error(msg);
+  }
+
+  return json as T;
+};
+
+/** Sends a GET to `path` with `params` as query-string parameters. */
+export const get = async <T>(
+  path: string,
+  params: { [key: string]: any }
+): Promise<T> => {
+  const url = new URL(path, base);
+  for (const [key, value] of Object.entries(params)) {
+    url.searchParams.set(key, value);
+  }
+
+  return parseResponse<T>(await fetch(url.toString()));
+};
+
+/** Sends a POST to `path` with `params` as the JSON request body. */
+export const post = async <T>(
+  path: string,
+  params: { [key: string]: any }
+): Promise<T> => {
+  const requestOptions = {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(params),
+  };
+
+  const url = new URL(path, base);
+
+  return parseResponse<T>(await fetch(url.toString(), requestOptions));
+};
diff --git a/python/ray/dashboard/client/src/newApi.ts b/python/ray/dashboard/client/src/newApi.ts
new file mode 100644
index 000000000..a579d4611
--- /dev/null
+++ b/python/ray/dashboard/client/src/newApi.ts
@@ -0,0 +1,149 @@
+import { get } from "./common/requestUtils";
+
+type HostnamesResponse = APIResponse<HostnamesResponseData>;
+type NodeSummaryResponse = APIResponse<NodeSummaryResponseData>;
+type NodeDetailsResponse = APIResponse<NodeDetailsResponseData>;
+
+export type GPUProcessStats = {
+  // Sub stat of GPU stats, this type represents the GPU
+  // utilization of a single process of a single GPU.
+  username: string;
+  command: string;
+  gpu_memory_usage: number;
+  pid: number;
+};
+
+export type GPUStats = {
+  // This represents stats fetched from a node about a single GPU
+  uuid: string;
+  name: string;
+  temperature_gpu: number;
+  fan_speed: number;
+  utilization_gpu: number;
+  power_draw: number;
+  enforced_power_limit: number;
+  memory_used: number;
+  memory_total: number;
+  processes: GPUProcessStats[];
+};
+
+/** Fetches the stats entry of every node (GET /api/v2/nodes). */
+export const getNodeSummaries = () =>
+  get<NodeSummaryResponse>("/api/v2/nodes", {});
+
+/** Fetches the hostname of every node (GET /api/v2/hostnames). */
+export const getHostnames = () =>
+  get<HostnamesResponse>("/api/v2/hostnames", {});
+
+/** Fetches the stats entry of one node (GET /api/v2/nodes/{hostname}). */
+export const getNodeDetails = (hostname: string) =>
+  get<NodeDetailsResponse>(`/api/v2/nodes/${encodeURIComponent(hostname)}`, {});
+
+type NodeSummaryResponseData = {
+  summaries: NodeSummary[];
+};
+
+type NodeDetailsResponseData = {
+  details: NodeDetails;
+};
+
+type RayletAddressInformation = {
+  rayletId: string;
+  ipAddress: string;
+  port: number;
+  workerId: string;
+};
+type ActorState = "ALIVE" | string; // todo flesh out once ant provides other values
+
+type NodeSummary = BaseNodeInfo;
+
+type NodeDetails = {
+  workers: Worker[];
+} & BaseNodeInfo;
+
+type BaseNodeInfo = {
+  now: number;
+  hostname: string;
+  ip: string;
+  cpu: number;
+  cpus: number[];
+  gpus: GPUStats[]; // GPU stats fetched from node, 1 entry per GPU
+  mem: number[];
+  bootTime: number;
+  loadAvg: number[][]; // todo figure out what this is
+  disk: {
+    [dir: string]: {
+      total: number;
+      user: number;
+      free: number;
+      percent: number;
+    };
+  };
+  net: number[];
+  logCounts: number;
+  errorCounts: number;
+  actors: { [actorId: string]: Actor };
+  raylet: {
+    numWorkers: number;
+    pid: number;
+  };
+};
+
+type Actor = {
+  actorId: string;
+  parentId: string;
+  actorCreationDummyObjectId: string;
+  jobId: string;
+  address: RayletAddressInformation;
+  ownerAddress: RayletAddressInformation;
+  timestamp: number;
+  workerId: string;
+  pid: number;
+  functionDescriptor: string;
+  state: ActorState;
+  maxRestarts: number;
+  remainingRestarts: number;
+  isDetached: boolean;
+};
+
+type Worker = {
+  pid: number;
+  createTime: number;
+  memoryInfo: {
+    rss: number;
+    vms: number;
+    shared: number;
+    text: number;
+    lib: number;
+    data: number;
+    dirty: number;
+  };
+  cmdLine: string[];
+  cpuTimes: {
+    user: number;
+    system: number;
+    childrenUser: number;
+    childrenSystem: number;
+    iowait: number;
+  };
+  coreWorkerStats: CoreWorkerStats[];
+};
+
+type CoreWorkerStats = {
+  ipAddress: string;
+  port: number;
+  usedResources: { [resource: string]: number };
+  numExecutedTasks: number;
+  workerId: string;
+  // We need the below but Ant's API does not yet support it.
+};
+
+type HostnamesResponseData = {
+  hostnames: string[];
+};
+
+type APIResponse<T> = {
+  result: boolean;
+  msg: string;
+  data: T;
+};
diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py
index c9a22bd6e..c77b71187 100644
--- a/python/ray/dashboard/dashboard.py
+++ b/python/ray/dashboard/dashboard.py
@@ -75,6 +75,8 @@ class DashboardController(BaseDashboardController):
         if Analysis is not None:
             self.tune_stats = TuneCollector(2.0)
         self.memory_table = MemoryTable([])
+        self.v2_api_handler = Dashboardv2APIHandler(self.node_stats,
+                                                    self.raylet_stats)
 
     def _construct_raylet_info(self):
         D = self.raylet_stats.get_raylet_stats()
@@ -240,6 +242,64 @@ class DashboardController(BaseDashboardController):
             self.tune_stats.start()
 
 
+class Dashboardv2APIHandler:
+    """Serves the v2 dashboard endpoints from the collected node stats.
+
+    Replies are JSON envelopes: {"result", "msg", "data"} on success and
+    {"result", "msg"} on failure.
+    """
+
+    def __init__(self, node_stats, raylet_stats):
+        self.raylet_stats = raylet_stats
+        self.node_stats = node_stats
+
+    @staticmethod
+    def api_response(data):
+        """Wraps `data` in a success envelope."""
+        return aiohttp.web.json_response({
+            "result": True,
+            "msg": "Success",
+            "data": data,
+        })
+
+    @staticmethod
+    def api_error(msg, status):
+        """Builds a failure envelope with `msg` and HTTP `status`."""
+        return aiohttp.web.json_response(
+            {
+                "result": False,
+                "msg": msg
+            }, status=status)
+
+    async def hostnames(self, req):
+        """Returns the hostname of every node in the node stats."""
+        node_stats = self.node_stats.get_node_stats()
+        return self.api_response({
+            "hostnames": [
+                client["hostname"] for client in node_stats["clients"]
+            ]
+        })
+
+    async def node_summaries(self, req):
+        """Returns the stats entry of every node."""
+        node_stats = self.node_stats.get_node_stats()
+        return self.api_response({"summaries": list(node_stats["clients"])})
+
+    async def node_details(self, req):
+        """Returns the stats entry of the node named in the URL."""
+        hostname = req.match_info.get("hostname")
+        if hostname is None:
+            # api_error takes (msg, status): message first.
+            return self.api_error("Missing hostname", 400)
+        node_stats = self.node_stats.get_node_stats()
+        for node in node_stats["clients"]:
+            if node["hostname"] == hostname:
+                node_obj = {"details": node}
+                return self.api_response(node_obj)
+        return self.api_error(
+            "Host not found for hostname {}".format(hostname), 404)
+
+
 class DashboardRouteHandler(BaseDashboardRouteHandler):
     def __init__(self, dashboard_controller: DashboardController,
                  is_dev=False):
@@ -529,7 +589,19 @@ class Dashboard:
             logs="/api/logs",
             errors="/api/errors",
             memory_table="/api/memory_table",
-            stop_memory_table="/api/stop_memory_table")
+            stop_memory_table="/api/stop_memory_table",
+        )
+        # Add v2 routes
+        self.app.router.add_get(
+            "/api/v2/hostnames",
+            self.dashboard_controller.v2_api_handler.hostnames)
+        self.app.router.add_get(
+            "/api/v2/nodes/{hostname}",
+            self.dashboard_controller.v2_api_handler.node_details)
+        self.app.router.add_get(
+            "/api/v2/nodes",
+            self.dashboard_controller.v2_api_handler.node_summaries)
+
         self.app.router.add_get("/{_}", route_handler.get_forbidden)
         self.app.router.add_post("/api/set_tune_experiment",
                                  route_handler.set_tune_experiment)