Add a web dashboard for monitoring node resource usage (#4066)

This commit is contained in:
Daniel Edgecumbe
2019-02-21 08:10:04 +00:00
committed by Robert Nishihara
parent 3ac8fd7ee8
commit 2e30f7ba38
13 changed files with 1239 additions and 44 deletions
+348
View File
@@ -0,0 +1,348 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
import aiohttp.web
except ModuleNotFoundError:
print("The reporter requires aiohttp to run.")
import sys
sys.exit(1)
import argparse
import datetime
import json
import logging
import os
import threading
import traceback
import yaml
from pathlib import Path
from collections import Counter
from operator import itemgetter
from typing import Dict
import ray.ray_constants as ray_constants
import ray.utils
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray provides a default configuration at
# entry/init points. Every class and function in this module logs through
# this single logger.
logger = logging.getLogger(__name__)
def to_unix_time(dt):
    """Convert a datetime to seconds since the Unix epoch.

    Args:
        dt: A ``datetime.datetime``. Naive values are interpreted as UTC
            (the callers in this module pass ``utcnow()``); timezone-aware
            values are converted to UTC first.

    Returns:
        Seconds elapsed since 1970-01-01 00:00:00 UTC, as a float.
    """
    if dt.utcoffset() is not None:
        # Subtracting the naive epoch from an aware datetime raises
        # TypeError, so normalise aware values to naive UTC first.
        dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return (dt - datetime.datetime(1970, 1, 1)).total_seconds()
class Dashboard(object):
    """A dashboard process for monitoring Ray nodes.

    This dashboard is made up of a REST API which collates data published by
    Reporter processes on nodes into a json structure, and a webserver
    which polls said API for display purposes.

    Attributes:
        ip: The node IP address the HTTP server binds to.
        port: The port the HTTP server listens on.
        token: The secret every request must present, either as a ``token``
            cookie or as a ``token`` query parameter.
        temp_dir: Directory the ``dashboard_url`` file is written to, or
            None to skip writing that file.
        node_stats: The NodeStats thread caching the latest node reports.
        app: The aiohttp application serving the API and static files.
    """

    def __init__(self,
                 redis_address,
                 http_port,
                 token,
                 temp_dir,
                 redis_password=None):
        """Initialize the dashboard object.

        Args:
            redis_address: Address of the Redis server to subscribe to.
            http_port: Port for the HTTP server.
            token: Access token required on every request.
            temp_dir: Directory for the ``dashboard_url`` file, or None.
            redis_password: Optional password for the Redis server.
        """
        self.ip = ray.services.get_node_ip_address()
        self.port = http_port
        self.token = token
        self.temp_dir = temp_dir
        self.node_stats = NodeStats(redis_address, redis_password)

        self.app = aiohttp.web.Application(middlewares=[self.auth_middleware])
        self.setup_routes()

    @aiohttp.web.middleware
    async def auth_middleware(self, req, handler):
        """Answer 401 unless the request carries the dashboard token.

        Authorized responses get the token set as a cookie, so that follow-up
        requests (static resources, API polling) do not need the query
        parameter.
        """

        def valid_token(req):
            # If the cookie token is correct, accept that.
            if req.cookies.get("token") == self.token:
                return True

            # If the query token is correct, accept that.
            if req.query.get("token") == self.token:
                return True

            # Reject.
            logger.warning("Dashboard: rejected an invalid token")
            return False

        # Check that the token is present, either in query or as cookie.
        if not valid_token(req):
            return aiohttp.web.Response(status=401, text="401 Unauthorized")

        resp = await handler(req)
        resp.cookies["token"] = self.token
        return resp

    def setup_routes(self):
        """Register the static-file and JSON API handlers on ``self.app``."""
        static_root = os.path.dirname(os.path.abspath(__file__))

        def forbidden() -> aiohttp.web.Response:
            return aiohttp.web.Response(status=403, text="403 Forbidden")

        async def get_forbidden(_) -> aiohttp.web.Response:
            # Declared as a coroutine like every other handler.
            return forbidden()

        async def get_index(req) -> aiohttp.web.StreamResponse:
            return aiohttp.web.FileResponse(
                os.path.join(static_root, "index.html"))

        async def get_resource(req) -> aiohttp.web.StreamResponse:
            try:
                path = req.match_info["x"]
            except KeyError:
                return forbidden()

            # Only an explicit whitelist of files is served, so the request
            # path can never escape the res/ directory.
            if path not in ["main.css", "main.js"]:
                return forbidden()

            return aiohttp.web.FileResponse(
                os.path.join(static_root, "res/{}".format(path)))

        async def json_response(result=None, error=None,
                                ts=None) -> aiohttp.web.Response:
            # Common envelope for all API replies: result, timestamp, error.
            if ts is None:
                ts = datetime.datetime.utcnow()

            return aiohttp.web.json_response({
                "result": result,
                "timestamp": to_unix_time(ts),
                "error": error,
            })

        async def ray_config(_) -> aiohttp.web.Response:
            try:
                with open(
                        Path("~/ray_bootstrap_config.yaml").expanduser()) as f:
                    # safe_load only builds plain data; the bare yaml.load
                    # can construct arbitrary Python objects.
                    cfg = yaml.safe_load(f)
            except Exception:
                return await json_response(error="No config")

            # An empty or non-mapping YAML document carries no settings.
            if not isinstance(cfg, dict):
                return await json_response(error="No config")

            def instance_type(section):
                node_cfg = cfg.get(section)
                if isinstance(node_cfg, dict):
                    return node_cfg.get("InstanceType", "unknown")
                return "unknown"

            # Keys absent from the config are reported as None rather than
            # failing the whole request.
            D = {
                "min_workers": cfg.get("min_workers"),
                "max_workers": cfg.get("max_workers"),
                "initial_workers": cfg.get("initial_workers"),
                "idle_timeout_minutes": cfg.get("idle_timeout_minutes"),
                "head_type": instance_type("head_node"),
                "worker_type": instance_type("worker_nodes"),
            }

            return await json_response(result=D)

        async def node_info(req) -> aiohttp.web.Response:
            # Take the timestamp before collecting, so it never post-dates
            # the data it describes.
            now = datetime.datetime.utcnow()
            D = self.node_stats.get_node_stats()
            return await json_response(result=D, ts=now)

        self.app.router.add_get("/", get_index)
        self.app.router.add_get("/index.html", get_index)
        self.app.router.add_get("/index.htm", get_index)
        self.app.router.add_get("/res/{x}", get_resource)

        self.app.router.add_get("/api/node_info", node_info)
        self.app.router.add_get("/api/super_client_table", node_info)
        self.app.router.add_get("/api/ray_config", ray_config)

        # Any other single-segment path is answered with 403.
        self.app.router.add_get("/{_}", get_forbidden)

    def log_dashboard_url(self):
        """Log the dashboard URL and, if possible, persist it to temp_dir.

        The URL embeds the access token. The ``dashboard_url`` file is only
        written when a temp directory was configured; ``temp_dir`` is None
        when the ``--temp-dir`` flag is omitted.
        """
        url = "http://{}:{}?token={}".format(self.ip, self.port, self.token)
        if self.temp_dir is not None:
            with open(os.path.join(self.temp_dir, "dashboard_url"), "w") as f:
                f.write(url)
        logger.info("Dashboard running on {}".format(url))

    def run(self):
        """Start the stats listener thread, then serve HTTP until stopped."""
        self.log_dashboard_url()
        self.node_stats.start()
        aiohttp.web.run_app(self.app, host=self.ip, port=self.port)
class NodeStats(threading.Thread):
    """Background thread caching the latest stats report of every node.

    The thread pattern-subscribes to the reporter channel in Redis and keeps
    the most recent JSON report per hostname. Readers obtain an aggregated
    snapshot through ``get_node_stats``.
    """

    def __init__(self, redis_address, redis_password=None):
        """Create the Redis client; the thread is not started here.

        Args:
            redis_address: Address of the Redis server.
            redis_password: Optional password for the Redis server.
        """
        self.redis_key = "{}.*".format(ray.gcs_utils.REPORTER_CHANNEL)
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)

        # Maps hostname -> latest decoded report; guarded by the lock below.
        self._node_stats = {}
        self._node_stats_lock = threading.Lock()

        # run() blocks forever in listen(). As a daemon thread it cannot
        # keep the process alive once the web server has stopped or failed.
        super().__init__(daemon=True)

    def calculate_totals(self) -> Dict:
        """Sum the per-node figures of all cached reports.

        Returns:
            A dict with the summed boot times, worker and core counts, RAM,
            root-disk and network figures, and the element-wise sum of the
            load values.
        """
        total_boot_time = 0
        total_cpus = 0
        total_workers = 0
        total_load = [0.0, 0.0, 0.0]
        total_storage_avail = 0
        total_storage_total = 0
        total_ram_avail = 0
        total_ram_total = 0
        total_sent = 0
        total_recv = 0

        for v in self._node_stats.values():
            total_boot_time += v["boot_time"]
            total_cpus += v["cpus"][0]
            total_workers += len(v["workers"])
            # Only the first entry of load_avg is aggregated.
            for i, load in enumerate(v["load_avg"][0][:3]):
                total_load[i] += load
            total_storage_avail += v["disk"]["/"]["free"]
            total_storage_total += v["disk"]["/"]["total"]
            total_ram_avail += v["mem"][1]
            total_ram_total += v["mem"][0]
            total_sent += v["net"][0]
            total_recv += v["net"][1]

        return {
            "boot_time": total_boot_time,
            "n_workers": total_workers,
            "n_cores": total_cpus,
            "m_avail": total_ram_avail,
            "m_total": total_ram_total,
            "d_avail": total_storage_avail,
            "d_total": total_storage_total,
            "load": total_load,
            "n_sent": total_sent,
            "n_recv": total_recv,
        }

    def calculate_tasks(self) -> Counter:
        """Count workers by name across all cached reports."""
        return Counter(
            worker["name"] for report in self._node_stats.values()
            for worker in report["workers"])

    def purge_outdated_stats(self):
        """Drop reports whose "now" stamp is more than 5 seconds old.

        Reports lacking a "now" field are dropped as well. Without this they
        could never expire, and every later snapshot would fail on them.
        """
        now = to_unix_time(datetime.datetime.utcnow())

        def current(report):
            then = report.get("now")
            if then is None:
                return False
            return not (now - then) > 5

        self._node_stats = {
            k: v
            for k, v in self._node_stats.items() if current(v)
        }

    def get_node_stats(self) -> Dict:
        """Return a snapshot of the totals, task counts and per-node reports.

        Returns:
            A dict with keys "totals", "tasks" and "clients"; the clients
            are sorted by their "boot_time".
        """
        with self._node_stats_lock:
            self.purge_outdated_stats()
            node_stats = sorted(
                self._node_stats.values(), key=itemgetter("boot_time"))
            return {
                "totals": self.calculate_totals(),
                "tasks": self.calculate_tasks(),
                "clients": node_stats,
            }

    def run(self):
        """Consume reporter messages forever, caching the newest per host."""
        p = self.redis_client.pubsub(ignore_subscribe_messages=True)
        p.psubscribe(self.redis_key)
        logger.info("NodeStats: subscribed to {}".format(self.redis_key))

        for x in p.listen():
            try:
                D = json.loads(x["data"])
                with self._node_stats_lock:
                    self._node_stats[D["hostname"]] = D
            except Exception:
                # A malformed message must not kill the listener.
                # logger.exception already appends the traceback.
                logger.exception("NodeStats: failed to process a message")
                continue
if __name__ == "__main__":
    # Command-line flags, listed in the order they are registered.
    cli_options = [
        ("--http-port",
         dict(
             required=True,
             type=int,
             help="The port to use for the HTTP server.")),
        ("--token",
         dict(
             required=True,
             type=str,
             help="The token to use for the HTTP server.")),
        ("--redis-address",
         dict(required=True, type=str, help="The address to use for Redis.")),
        ("--redis-password",
         dict(
             required=False,
             type=str,
             default=None,
             help="the password to use for Redis")),
        ("--logging-level",
         dict(
             required=False,
             type=str,
             default=ray_constants.LOGGER_LEVEL,
             choices=ray_constants.LOGGER_LEVEL_CHOICES,
             help=ray_constants.LOGGER_LEVEL_HELP)),
        ("--logging-format",
         dict(
             required=False,
             type=str,
             default=ray_constants.LOGGER_FORMAT,
             help=ray_constants.LOGGER_FORMAT_HELP)),
        ("--temp-dir",
         dict(
             required=False,
             type=str,
             default=None,
             help="Specify the path of the temporary directory "
             "use by Ray process.")),
    ]

    parser = argparse.ArgumentParser(
        description="Parse Redis server for the dashboard to connect to.")
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    ray.utils.setup_logger(args.logging_level, args.logging_format)

    dashboard = Dashboard(
        args.redis_address,
        args.http_port,
        args.token,
        args.temp_dir,
        redis_password=args.redis_password)

    try:
        dashboard.run()
    except Exception as err:
        # The dashboard died: report the failure to all drivers through
        # Redis, then let the exception propagate.
        error_client = ray.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        formatted_tb = ray.utils.format_error_message(traceback.format_exc())
        hostname = os.uname()[1]
        message = ("The dashboard on node {} failed with the following "
                   "error:\n{}".format(hostname, formatted_tb))
        ray.utils.push_error_to_driver_through_redis(
            error_client, ray_constants.DASHBOARD_DIED_ERROR, message)
        raise err
+99
View File
@@ -0,0 +1,99 @@
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>ray dashboard</title>
  <!-- meta is a void element: it takes no closing tag. -->
  <meta name="description" content="ray dashboard">
  <link rel="stylesheet" href="res/main.css">
  <!-- Restrict the Referer header to same-origin requests. -->
  <meta name="referrer" content="same-origin">
  <!-- Minified Vue build, currently disabled in favour of the full build below. -->
  <!--
  <script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.4/vue.min.js"
          integrity="sha384-rldcjlIPDkF0mEihgyEOIFhd2NW5YL717okjKC5YF2LrqoiBeMk4tpcgbRrlDHj5"
          crossorigin="anonymous"></script>
  -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.4/vue.js"
          integrity="sha384-94H2I+MU5hfBDUinQG+/Y9JbHALTPlQmHO26R3Jv60MT6WWkOD5hlYNyT9ciiLsR"
          crossorigin="anonymous"></script>
</head>
<body>
  <!-- Root element the Vue instance in res/main.js mounts on. -->
  <div id="dashboard">
    <table v-if="clients && !error" class="ray_node_grid">
      <thead>
        <tr>
          <th class="hostname">Hostname</th>
          <th class="uptime">Uptime</th>
          <th class="workers">Workers</th>
          <th class="mem">RAM</th>
          <th class="storage">Disk</th>
          <th class="load">Load (1m, 5m, 15m)</th>
          <th class="netsent">Sent (M/s)</th>
          <th class="netrecv">Recv (M/s)</th>
        </tr>
      </thead>
      <!-- One row group per reporting node. -->
      <tbody is="node"
        v-for="v in clients"
        :key="v.hostname"
        :now="now"
        :hostname="v.hostname"
        :boot_time="v.boot_time"
        :n_workers="v.workers.length"
        :n_cores="v.cpus[0]"
        :m_avail="v.mem[1]"
        :m_total="v.mem[0]"
        :d_avail="v.disk['/'].free"
        :d_total="v.disk['/'].total"
        :load="v.load_avg[0]"
        :n_sent="v.net[0]"
        :n_recv="v.net[1]"
        :workers="v.workers"
      ></tbody>
      <!-- Totals row: the node count is passed in place of a hostname. -->
      <tbody is="node"
        class="totals"
        v-if="totals"
        :now="now"
        :hostname="Object.keys(clients).length"
        :boot_time="totals.boot_time"
        :n_workers="totals.n_workers"
        :n_cores="totals.n_cores"
        :m_avail="totals.m_avail"
        :m_total="totals.m_total"
        :d_avail="totals.d_avail"
        :d_total="totals.d_total"
        :load="totals.load"
        :n_sent="totals.n_sent"
        :n_recv="totals.n_recv"
        :workers="[]"
      ></tbody>
    </table>

    <template v-if="error">
      <h2>{{error}}</h2>
    </template>

    <h2 v-if="last_update" :class="outdated_cls">Last updated {{age}} ago</h2>

    <div class="cols">
      <div class="tasks">
        <template v-if="tasks && !error">
          <h2>tasks</h2>
          <ul>
            <!-- tasks is an object: v is the count, k the task name. -->
            <li v-for="(v, k) in tasks" :key="k">{{k}}: {{v}}</li>
          </ul>
        </template>
      </div>
      <div class="ray_config">
        <template v-if="ray_config">
          <h2>ray config</h2>
          <pre>{{ray_config}}</pre>
        </template>
      </div>
    </div>
  </div>
  <!-- Loaded last so that #dashboard exists when the Vue instance mounts. -->
  <script src="res/main.js"></script>
</body>
</html>
+58
View File
@@ -0,0 +1,58 @@
/* Global reset: monospace everywhere, no default spacing. */
* {
  font-family: monospace;
  margin: 0;
  padding: 0;
}

/* Headings. */
h1, h2 {
  text-align: center;
  margin: 1rem 0;
}

h1 {
  font-size: 3rem;
  margin: 0.5rem auto;
  text-align: center;
}

h2 {
  font-size: 2rem;
  margin: 1rem auto;
  text-align: center;
}

/* Fixed-width, centred page container. */
div#dashboard {
  width: 116rem;
  margin: 1rem auto;
}

/* Node grid. */
table, tbody, thead {
  width: 100%;
  margin: 0;
  padding: 0;
}

tr {
  cursor: pointer;
}

/* Per-worker detail rows use a smaller font. */
tr.workerlist td, tr.workerlist th {
  font-size: 0.8rem;
}

tr:hover {
  filter: brightness(0.85);
}

td, th {
  padding: 0.3rem;
  font-size: 1.4rem;
  font-family: monospace;
  text-align: center;
  background-color: white;
}

th {
  background-color: #eeeeee;
  border: 1px solid black;
}

tbody.totals {
  font-weight: bold;
  font-size: 1.5rem;
}

ul {
  list-style-position: inside;
}

/* Usage severity colours, from worst to best. */
.critical {
  background-color: magenta;
}

.bad {
  background-color: red;
}

.high {
  background-color: orange;
}

.average {
  background-color: limegreen;
}

.low {
  background-color: aquamarine;
}

/* Two equal columns below the grid. */
div.cols {
  width: 100%;
  margin: 1rem 0;
  display: grid;
  grid-template-columns: 1fr 1fr;
}

div.cols div {
  padding: 1rem 0;
}

/* Applied to the "Last updated" heading when the data is stale. */
.outdated {
  background-color: red;
}
+265
View File
@@ -0,0 +1,265 @@
// Root Vue instance: polls the REST API and holds the state rendered by
// the #dashboard template.
let dashboard = new Vue({
    el: "#dashboard",
    data: {
        // Client clock in seconds, advanced by tickClock().
        now: (new Date()).getTime() / 1000,
        shown: {},
        // Message shown in place of the node grid; falsy when all is well.
        error: "loading...",
        // Timestamp (seconds) of the last successful node_info reply.
        last_update: undefined,
        clients: undefined,
        totals: undefined,
        tasks: undefined,
        ray_config: undefined,
    },
    methods: {
        // Fetch node statistics, then re-arm itself 500 ms after the
        // request settles, whether it succeeded or failed.
        updateNodeInfo: function() {
            var self = this;
            fetch("/api/node_info").then(function (resp) {
                return resp.json();
            }).then(function(data) {
                self.error = data.error;
                if (data.error) {
                    self.clients = undefined;
                    self.tasks = undefined;
                    self.totals = undefined;
                    return;
                }
                self.last_update = data.timestamp;
                self.clients = data.result.clients;
                self.tasks = data.result.tasks;
                self.totals = data.result.totals;
            }).catch(function() {
                self.error = "request error";
                self.clients = undefined;
                self.tasks = undefined;
                self.totals = undefined;
            }).finally(function() {
                setTimeout(self.updateNodeInfo, 500);
            });
        },
        // Fetch the cluster configuration, re-arming itself every 10 s.
        updateRayConfig: function() {
            var self = this;
            fetch("/api/ray_config").then(function (resp) {
                return resp.json();
            }).then(function(data) {
                if (data.error) {
                    self.ray_config = undefined;
                    return;
                }
                self.ray_config = data.result;
            }).catch(function() {
                // Only the config panel depends on this request. The shared
                // `error` field belongs to updateNodeInfo; setting it here
                // would blank the node grid.
                self.ray_config = undefined;
            }).finally(function() {
                setTimeout(self.updateRayConfig, 10000);
            });
        },
        updateAll: function() {
            this.updateNodeInfo();
            this.updateRayConfig();
        },
        tickClock: function() {
            this.now = (new Date()).getTime() / 1000;
        }
    },
    computed: {
        // CSS class flagging data older than five seconds.
        outdated_cls: function() {
            if ((this.now - this.last_update) > 5) {
                return "outdated";
            }
            return "";
        },
        // Whole seconds since the last successful update, e.g. "3s".
        age: function() {
            return ((this.now - this.last_update) | 0) + "s";
        },
    },
    filters: {
        // Format a byte count with a binary prefix, e.g. 1536 -> "1.5K".
        si: function(x) {
            const prefixes = ["B", "K", "M", "G", "T"];
            let i = 0;
            // Stop at the largest known prefix rather than running off the
            // end of the list.
            while (x >= 1024 && i < prefixes.length - 1) {
                x /= 1024;
                i += 1;
            }
            return `${x.toFixed(1)}${prefixes[i]}`;
        },
    },
});
// Table cell showing the worker count relative to the core count, coloured
// by how oversubscribed the cores are.
Vue.component("worker-usage", {
    props: ["cores", "workers"],
    computed: {
        // Workers per core.
        frac() {
            return this.workers / this.cores;
        },
        // Severity class: the first threshold the ratio exceeds wins.
        cls() {
            const ratio = this.frac;
            const levels = [
                [3, "critical"],
                [2, "bad"],
                [1.5, "high"],
                [1, "average"],
            ];
            for (const [limit, name] of levels) {
                if (ratio > limit) {
                    return name;
                }
            }
            return "low";
        },
    },
    template: `
        <td class="workers" :class="cls">
            {{workers}}/{{cores}} {{(frac*100).toFixed(0)}}%
        </td>
    `,
});
// One row group of the node grid: a summary row and, when expanded by a
// click, one detail row per worker. Also renders the totals row, in which
// case `hostname` carries the number of nodes.
Vue.component("node", {
    props: [
        "now",
        "hostname",
        "boot_time",
        "n_workers",
        "n_cores",
        "m_avail",
        "m_total",
        "d_avail",
        "d_total",
        "load",
        "n_sent",
        "n_recv",
        "workers",
    ],
    data: function() {
        return {
            // Worker detail rows start collapsed.
            hidden: true,
        };
    },
    computed: {
        // Uptime formatted as "1h 2m 3s", or "?" when boot_time is unset.
        age: function() {
            if (this.boot_time) {
                let n = this.now;
                if (this.boot_time > 2840140800) {
                    // Hack. It's a sum of multiple nodes: boot_time is too
                    // large to be a single timestamp, so scale `now` by the
                    // node count (passed as hostname on the totals row) to
                    // obtain the summed uptime.
                    n *= this.hostname;
                }
                let rs = n - this.boot_time | 0;
                let s = rs % 60;
                let m = ((rs / 60) % 60) | 0;
                let h = (rs / 3600) | 0;
                if (h) {
                    return `${h}h ${m}m ${s}s`;
                }
                if (m) {
                    return `${m}m ${s}s`;
                }
                return `${s}s`;
            }
            return "?";
        },
    },
    methods: {
        toggleHide: function() {
            this.hidden = !this.hidden;
        }
    },
    filters: {
        // Bytes -> mebibytes with three decimals, e.g. "1.500M".
        mib(x) {
            return `${(x/(1024**2)).toFixed(3)}M`;
        },
        // A numeric "hostname" marks the totals row.
        hostnamefilter(x) {
            if (isNaN(x)) {
                return x;
            }
            return `Totals: ${x} nodes`;
        },
    },
    // The worker list is rendered only when there is at least one worker.
    // An empty array is truthy, so the length is checked explicitly to avoid
    // a header-only table.
    template: `
        <tbody v-on:click="toggleHide()">
            <tr class="ray_node">
                <td class="hostname">{{hostname | hostnamefilter}}</td>
                <td class="uptime">{{age}}</td>
                <worker-usage
                    :workers="n_workers"
                    :cores="n_cores"
                ></worker-usage>
                <usagebar
                    :avail="m_avail" :total="m_total"
                    stat="mem"
                ></usagebar>
                <usagebar
                    :avail="d_avail" :total="d_total"
                    stat="storage"
                ></usagebar>
                <loadbar
                    :cores="n_cores"
                    :onem="load[0]"
                    :fivem="load[1]"
                    :fifteenm="load[2]"
                >
                </loadbar>
                <td class="netsent">{{n_sent | mib}}/s</td>
                <td class="netrecv">{{n_recv | mib}}/s</td>
            </tr>
            <template v-if="!hidden && workers && workers.length">
                <tr class="workerlist">
                    <th>time</th>
                    <th>name</th>
                    <th>pid</th>
                    <th>uss</th>
                </tr>
                <tr class="workerlist" v-for="x in workers" :key="x.pid">
                    <td>user: {{x.cpu_times.user}}s</td>
                    <td>{{x.name}}</td>
                    <td>{{x.pid}}</td>
                    <td>{{(x.memory_full_info.uss/1048576).toFixed(0)}}MiB</td>
                </tr>
            </template>
        </tbody>
    `,
});
// Table cell showing used/total for a resource (memory or storage),
// coloured by the fraction in use.
Vue.component("usagebar", {
    props: ["stat", "avail", "total"], // e.g. free -m avail
    computed: {
        // Amount in use, in the same unit as `total`.
        used() {
            return this.total - this.avail;
        },
        // Fraction of the resource in use.
        frac() {
            return (this.total - this.avail) / this.total;
        },
        // Severity class: the first threshold the fraction exceeds wins.
        cls() {
            const share = this.frac;
            const levels = [
                [0.95, "critical"],
                [0.9, "bad"],
                [0.8, "high"],
                [0.5, "average"],
            ];
            for (const [limit, name] of levels) {
                if (share > limit) {
                    return name;
                }
            }
            return "low";
        },
        // Combined class string: the stat name plus the severity class.
        tcls() {
            return this.stat + " " + this.cls;
        },
    },
    filters: {
        // Bytes -> gibibytes with one decimal, e.g. "1.5G".
        gib(x) {
            const gibibytes = x / Math.pow(1024, 3);
            return gibibytes.toFixed(1) + "G";
        },
        // Fraction -> whole percentage, e.g. "42%".
        pct(x) {
            const percent = x * 100;
            return percent.toFixed(0) + "%";
        },
    },
    template: `
        <td class="usagebar" :class="tcls">
            {{used | gib}}/{{total | gib}} {{ frac | pct }}
        </td>
    `,
});
// Table cell showing the three load figures, coloured by the one-minute
// load per core.
Vue.component("loadbar", {
    props: ["cores", "onem", "fivem", "fifteenm"],
    computed: {
        // One-minute load divided by the core count.
        frac() {
            return this.onem / this.cores;
        },
        // Severity class: the first threshold the ratio exceeds wins.
        cls() {
            const perCore = this.frac;
            const levels = [
                [3, "critical"],
                [2.5, "bad"],
                [2, "high"],
                [1.5, "average"],
            ];
            for (const [limit, name] of levels) {
                if (perCore > limit) {
                    return name;
                }
            }
            return "low";
        },
    },
    template: `
        <td class="load loadbar" :class="cls">
            {{onem.toFixed(2)}}, {{fivem.toFixed(2)}}, {{fifteenm.toFixed(2)}}
        </td>
    `,
});
// Advance the dashboard's `now` clock once per second; the "last updated"
// age and the per-node uptime are computed from it.
setInterval(dashboard.tickClock, 1000);
// Start both polling loops (node info and ray config); each one re-arms
// itself with setTimeout after every request.
dashboard.updateAll();