mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 23:54:34 +08:00
Dashboard next-version API support in backend (#9345)
This commit is contained in:
@@ -0,0 +1,44 @@
|
||||
/**
 * Origin that request paths in this module are resolved against: the fixed
 * address "http://localhost:8265" when NODE_ENV is "development", otherwise
 * the origin of the page that loaded this script.
 */
const base =
  process.env.NODE_ENV === "development"
    ? "http://localhost:8265"
    : window.location.origin;
|
||||
|
||||
// TODO(mitchellstern): Add JSON schema validation for the responses.
|
||||
/**
 * Issues a GET request and unwraps the JSON reply.
 *
 * `path` is resolved against `base`, and every entry of `params` is written
 * to the query string.
 *
 * @param path - Request path, resolved relative to `base`.
 * @param params - Query-string parameters; one value per key.
 * @returns The `result` field of the JSON body, asserted (not validated) as `T`.
 * @throws Error when the body carries a non-null `error` field, or when the
 *   HTTP status is outside the 2xx range.
 */
export const get = async <T>(
  path: string,
  params: { [key: string]: any },
): Promise<T> => {
  const url = new URL(path, base);
  for (const [key, value] of Object.entries(params)) {
    url.searchParams.set(key, value);
  }

  const response = await fetch(url.toString());
  const json = await response.json();

  const { result, error } = json;

  // `!= null` matches both null and undefined, so a body that simply has no
  // `error` field is not mistaken for a failure.
  if (error != null) {
    throw new Error(error);
  }

  // A failed request may report its problem without an `error` field; do not
  // hand its `result` back as if the call had succeeded.
  if (!response.ok) {
    const detail =
      typeof json.msg === "string" ? json.msg : response.statusText;
    throw new Error(
      `Request to ${url.pathname} failed (${response.status}): ${detail}`,
    );
  }

  return result as T;
};
|
||||
|
||||
/**
 * Issues a POST request with a JSON body and unwraps the JSON reply.
 *
 * @param path - Request path, resolved relative to `base`.
 * @param params - Object serialized with `JSON.stringify` as the request body.
 * @returns The `result` field of the JSON body, asserted (not validated) as `T`.
 * @throws Error when the body carries a non-null `error` field, or when the
 *   HTTP status is outside the 2xx range.
 */
export const post = async <T>(
  path: string,
  params: { [key: string]: any },
): Promise<T> => {
  const requestOptions = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(params),
  };

  const url = new URL(path, base);

  const response = await fetch(url.toString(), requestOptions);
  const json = await response.json();

  const { result, error } = json;

  // `!= null` matches both null and undefined, so a body that simply has no
  // `error` field is not mistaken for a failure.
  if (error != null) {
    throw new Error(error);
  }

  // A failed request may report its problem without an `error` field; do not
  // hand its `result` back as if the call had succeeded.
  if (!response.ok) {
    const detail =
      typeof json.msg === "string" ? json.msg : response.statusText;
    throw new Error(
      `Request to ${url.pathname} failed (${response.status}): ${detail}`,
    );
  }

  return result as T;
};
|
||||
@@ -0,0 +1,146 @@
|
||||
import { get } from "./common/requestUtils";
|
||||
|
||||
// Response types for the host requests in this module: each one wraps its
// payload type in the shared APIResponse shape.
type HostnamesResponse = APIResponse<HostnamesResponseData>;
type NodeSummaryResponse = APIResponse<NodeSummaryResponseData>;
type NodeDetailsResponse = APIResponse<NodeDetailsResponseData>;
|
||||
|
||||
/**
 * Sub-stat of GPU stats: the GPU utilization of a single process on a
 * single GPU.
 */
export type GPUProcessStats = {
  username: string;
  command: string;
  gpu_memory_usage: number;
  pid: number;
};
|
||||
|
||||
/**
 * Stats fetched from a node about a single GPU, including one
 * GPUProcessStats entry per listed process.
 */
export type GPUStats = {
  uuid: string;
  name: string;
  temperature_gpu: number;
  fan_speed: number;
  utilization_gpu: number;
  power_draw: number;
  enforced_power_limit: number;
  memory_used: number;
  memory_total: number;
  processes: GPUProcessStats[];
};
|
||||
|
||||
/**
 * Requests "/api/v2/hosts" with the query parameter `view=summary`.
 *
 * @returns Whatever `get` resolves to, typed as NodeSummaryResponse.
 */
export const getNodeSummaries = (): Promise<NodeSummaryResponse> =>
  get<NodeSummaryResponse>("/api/v2/hosts", { view: "summary" });
|
||||
|
||||
/**
 * Requests "/api/v2/hosts" with the query parameter `view=hostnamelist`.
 *
 * @returns Whatever `get` resolves to, typed as HostnamesResponse.
 */
export const getHostnames = (): Promise<HostnamesResponse> =>
  get<HostnamesResponse>("/api/v2/hosts", { view: "hostnamelist" });
|
||||
|
||||
/**
 * Requests "/api/v2/hosts/<hostname>" with no query parameters.
 *
 * The hostname is percent-encoded so that characters such as "/", "?" or "#"
 * stay inside the single path segment instead of changing the request URL.
 *
 * @param hostname - Host to look up.
 * @returns Whatever `get` resolves to, typed as NodeDetailsResponse.
 */
export const getNodeDetails = (hostname: string): Promise<NodeDetailsResponse> =>
  get<NodeDetailsResponse>(`/api/v2/hosts/${encodeURIComponent(hostname)}`, {});
|
||||
|
||||
/** Payload of a node-summary response: a list of NodeSummary entries. */
type NodeSummaryResponseData = {
  summaries: NodeSummary[];
};
|
||||
|
||||
/** Payload of a node-details response: a single NodeDetails object. */
type NodeDetailsResponseData = {
  details: NodeDetails;
};
|
||||
|
||||
/** Address record used for the `address` and `ownerAddress` fields of Actor. */
type RayletAddressInformation = {
  rayletId: string;
  ipAddress: string;
  port: number;
  workerId: string;
};
|
||||
// State label of an actor. Only "ALIVE" is named so far; the union with
// `string` means the type checker accepts any string here.
type ActorState = "ALIVE" | string; // TODO: flesh out once Ant provides other values
|
||||
|
||||
// Summary view of a node; currently exactly the fields of BaseNodeInfo.
type NodeSummary = BaseNodeInfo;
|
||||
|
||||
/** Detailed view of a node: every BaseNodeInfo field plus its worker list. */
type NodeDetails = {
  workers: Worker[];
} & BaseNodeInfo;
|
||||
|
||||
/** Fields shared by the summary and detail views of a node. */
type BaseNodeInfo = {
  now: number;
  hostname: string;
  ip: string;
  cpu: number;
  cpus: number[];
  gpus: GPUStats[]; // GPU stats fetched from node, 1 entry per GPU
  mem: number[];
  bootTime: number;
  loadAvg: number[][]; // TODO: figure out what this is
  // Disk usage figures keyed by directory.
  disk: {
    [dir: string]: {
      total: number;
      user: number;
      free: number;
      percent: number;
    };
  };
  net: number[];
  logCounts: number;
  errorCounts: number;
  // Actors keyed by actor id.
  actors: { [actorId: string]: Actor };
  raylet: {
    numWorkers: number;
    pid: number;
  };
};
|
||||
|
||||
/** A single actor record, as stored in the `actors` map of BaseNodeInfo. */
type Actor = {
  actorId: string;
  parentId: string;
  actorCreationDummyObjectId: string;
  jobId: string;
  address: RayletAddressInformation;
  ownerAddress: RayletAddressInformation;
  timestamp: number;
  workerId: string;
  pid: number;
  functionDescriptor: string;
  state: ActorState;
  maxRestarts: number;
  remainingRestarts: number;
  isDetached: boolean;
};
|
||||
|
||||
/** A single worker process entry in the `workers` list of NodeDetails. */
type Worker = {
  pid: number;
  createTime: number;
  memoryInfo: {
    rss: number;
    vms: number;
    shared: number;
    text: number;
    lib: number;
    data: number;
    // Primitive `number`, consistent with the sibling fields (the boxed
    // `Number` wrapper type is not meant for annotations).
    dirty: number;
  };
  cmdLine: string[];
  cpuTimes: {
    user: number;
    system: number;
    childrenUser: number;
    childrenSystem: number;
    iowait: number;
  };
  coreWorkerStats: CoreWorkerStats[];
};
|
||||
|
||||
/** One entry of the `coreWorkerStats` list of a Worker. */
type CoreWorkerStats = {
  ipAddress: string;
  port: number;
  // Amounts keyed by resource name.
  usedResources: { [resource: string]: number };
  numExecutedTasks: number;
  workerId: string;
  // TODO: further fields are needed here, but Ant's API does not yet
  // support them.
};
|
||||
|
||||
/** Payload of a hostnames response: a list of hostname strings. */
type HostnamesResponseData = {
  hostnames: string[];
};
|
||||
|
||||
/**
 * Generic response envelope: a boolean `result`, a message string, and a
 * payload of type `T`.
 */
type APIResponse<T> = {
  result: boolean;
  msg: string;
  data: T;
};
|
||||
@@ -75,6 +75,8 @@ class DashboardController(BaseDashboardController):
|
||||
if Analysis is not None:
|
||||
self.tune_stats = TuneCollector(2.0)
|
||||
self.memory_table = MemoryTable([])
|
||||
self.v2_api_handler = Dashboardv2APIHandler(self.node_stats,
|
||||
self.raylet_stats)
|
||||
|
||||
def _construct_raylet_info(self):
|
||||
D = self.raylet_stats.get_raylet_stats()
|
||||
@@ -240,6 +242,52 @@ class DashboardController(BaseDashboardController):
|
||||
self.tune_stats.start()
|
||||
|
||||
|
||||
class Dashboardv2APIHandler:
    """Request handlers for the dashboard's v2 API.

    Every handler answers with a JSON envelope built by ``api_response``
    (success) or ``api_error`` (failure).
    """

    def __init__(self, node_stats, raylet_stats):
        """Store the two stats sources the handlers read from.

        Args:
            node_stats: Object whose ``get_node_stats()`` returns a mapping
                with a ``"clients"`` iterable of per-node dicts.
            raylet_stats: Stored on the instance; not read by the handlers
                in this class.
        """
        self.raylet_stats = raylet_stats
        self.node_stats = node_stats

    @staticmethod
    def api_response(data):
        """Wrap ``data`` in a success envelope and return it as JSON."""
        return aiohttp.web.json_response({
            "result": True,
            "msg": "Success",
            "data": data,
        })

    @staticmethod
    def api_error(msg, status):
        """Return a failure envelope carrying ``msg`` with HTTP ``status``.

        Args:
            msg: Message placed in the ``"msg"`` field.
            status: HTTP status code of the response.
        """
        return aiohttp.web.json_response(
            {
                "result": False,
                "msg": msg
            }, status=status)

    def hostnames(self, req):
        """Respond with the ``"hostname"`` of every client node."""
        node_stats = self.node_stats.get_node_stats()
        return self.api_response({
            "hostnames": [
                client["hostname"] for client in node_stats["clients"]
            ]
        })

    def node_summaries(self, req):
        """Respond with the list of all client node entries."""
        node_stats = self.node_stats.get_node_stats()
        return self.api_response({"summaries": list(node_stats["clients"])})

    def node_details(self, req):
        """Respond with the node whose hostname matches the route parameter.

        Returns a 400 error envelope when the ``hostname`` route parameter
        is absent or when no client node has that hostname.
        """
        hostname = req.match_info.get("hostname")
        if hostname is None:
            # api_error takes (msg, status): message first, status second.
            return self.api_error("Missing hostname", 400)
        node_stats = self.node_stats.get_node_stats()
        for node in node_stats["clients"]:
            if node["hostname"] == hostname:
                node_obj = {"details": node}
                return self.api_response(node_obj)
        return self.api_error(
            "Host not found for hostname {}".format(hostname), 400)
|
||||
|
||||
|
||||
class DashboardRouteHandler(BaseDashboardRouteHandler):
|
||||
def __init__(self, dashboard_controller: DashboardController,
|
||||
is_dev=False):
|
||||
@@ -529,7 +577,19 @@ class Dashboard:
|
||||
logs="/api/logs",
|
||||
errors="/api/errors",
|
||||
memory_table="/api/memory_table",
|
||||
stop_memory_table="/api/stop_memory_table")
|
||||
stop_memory_table="/api/stop_memory_table",
|
||||
)
|
||||
# Add v2 routes
|
||||
self.app.router.add_get(
|
||||
"/api/v2/hostnames",
|
||||
self.dashboard_controller.v2_api_handler.hostnames)
|
||||
self.app.router.add_get(
|
||||
"/api/v2/nodes/{hostname}",
|
||||
self.dashboard_controller.v2_api_handler.node_details)
|
||||
self.app.router.add_get(
|
||||
"/api/v2/nodes",
|
||||
self.dashboard_controller.v2_api_handler.node_summaries)
|
||||
|
||||
self.app.router.add_get("/{_}", route_handler.get_forbidden)
|
||||
self.app.router.add_post("/api/set_tune_experiment",
|
||||
route_handler.set_tune_experiment)
|
||||
|
||||
Reference in New Issue
Block a user