mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 17:49:47 +08:00
Metrics Export Service (#7809)
This commit is contained in:
+2
-1
@@ -1191,7 +1191,8 @@ filegroup(
|
||||
"python/ray/core/generated/__init__.py",
|
||||
"python/ray/core/generated/ray/__init__.py",
|
||||
"python/ray/core/generated/ray/protocol/__init__.py",
|
||||
"python/ray/dashboard/dashboard.py",
|
||||
"python/ray/dashboard/*.py",
|
||||
"python/ray/dashboard/metrics_exporter/*.py",
|
||||
"python/ray/experimental/*.py",
|
||||
"python/ray/util/*.py",
|
||||
"python/ray/internal/*.py",
|
||||
|
||||
@@ -62,7 +62,11 @@ const slice = createSlice({
|
||||
tuneAvailability: TuneAvailabilityResponse;
|
||||
}>,
|
||||
) => {
|
||||
state.tuneAvailability = action.payload.tuneAvailability["available"];
|
||||
const tuneAvailability =
|
||||
action.payload.tuneAvailability === null
|
||||
? false
|
||||
: action.payload.tuneAvailability["available"];
|
||||
state.tuneAvailability = tuneAvailability;
|
||||
state.lastUpdatedAt = Date.now();
|
||||
},
|
||||
setError: (state, action: PayloadAction<string | null>) => {
|
||||
|
||||
+475
-264
@@ -27,13 +27,18 @@ from typing import Dict
|
||||
import grpc
|
||||
from google.protobuf.json_format import MessageToDict
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
from ray.core.generated import node_manager_pb2
|
||||
from ray.core.generated import node_manager_pb2_grpc
|
||||
from ray.core.generated import reporter_pb2
|
||||
from ray.core.generated import reporter_pb2_grpc
|
||||
from ray.core.generated import core_worker_pb2
|
||||
from ray.core.generated import core_worker_pb2_grpc
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.dashboard.interface import BaseDashboardController
|
||||
from ray.dashboard.interface import BaseDashboardRouteHandler
|
||||
from ray.dashboard.metrics_exporter.client import Exporter
|
||||
from ray.dashboard.metrics_exporter.client import MetricsExportClient
|
||||
|
||||
try:
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||
@@ -96,15 +101,381 @@ def b64_decode(reply):
|
||||
return b64decode(reply).decode("utf-8")
|
||||
|
||||
|
||||
class Dashboard(object):
|
||||
async def json_response(is_dev, result=None, error=None,
                        ts=None) -> aiohttp.web.Response:
    """Wrap a result/error pair in the dashboard's JSON envelope.

    Args:
        is_dev: When truthy, an ``Access-Control-Allow-Origin: *`` header
            is attached to the response.
        result: Value stored under the ``"result"`` key.
        error: Value stored under the ``"error"`` key.
        ts: Timestamp passed to ``to_unix_time``; defaults to the current
            ``datetime.datetime.utcnow()``.

    Returns:
        An aiohttp JSON response with ``result``, ``timestamp`` and
        ``error`` keys.
    """
    timestamp = datetime.datetime.utcnow() if ts is None else ts
    cors_headers = {"Access-Control-Allow-Origin": "*"} if is_dev else None
    payload = {
        "result": result,
        "timestamp": to_unix_time(timestamp),
        "error": error,
    }
    return aiohttp.web.json_response(payload, headers=cors_headers)
|
||||
|
||||
|
||||
class DashboardController(BaseDashboardController):
    """Gathers node, raylet and (optionally) Tune statistics for the dashboard.

    Owns a ``NodeStats`` and a ``RayletStats`` collector. A ``TuneCollector``
    is created only when ``Analysis`` is not ``None``; the Tune accessors
    return empty/unavailable results otherwise.
    """

    # Views of a raylet reply that feed the per-node ``extraInfo`` summary.
    _SUMMARY_VIEWS = ("local_available_resource", "local_total_resource",
                      "object_manager_stats")

    def __init__(self, redis_address, redis_password):
        """Create the stats collectors.

        Args:
            redis_address: Passed to ``NodeStats`` and ``RayletStats``.
            redis_password: Passed to ``NodeStats`` and ``RayletStats``.
        """
        self.node_stats = NodeStats(redis_address, redis_password)
        self.raylet_stats = RayletStats(
            redis_address, redis_password=redis_password)
        if Analysis is not None:
            self.tune_stats = TuneCollector(DEFAULT_RESULTS_DIR, 2.0)

    @staticmethod
    def _format_actor_tree(actor_tree):
        """Render the actor tree as sorted, indented JSON with padded lines."""
        lines = json.dumps(actor_tree, indent=2, sort_keys=True).split("\n")
        width = max(map(len, lines))
        return "\n".join(line.ljust(width) for line in lines)

    def _construct_raylet_info(self):
        """Build the payload served by the raylet info endpoint.

        Each node entry is annotated in place with an ``extraInfo`` string
        summarising resource usage. When the ``RAY_DASHBOARD_DEBUG``
        environment variable is set, object-manager stats and a dump of the
        actor tree are appended to it.

        Returns:
            ``{"nodes": <raylet stats by address>, "actors": <actor tree>}``.
        """
        raylet_stats = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in raylet_stats.values()
        }
        infeasible_tasks = [
            task for data in raylet_stats.values()
            for task in data.get("infeasibleTasks", [])
        ]
        # ready_tasks are used to render tasks that are not schedulable
        # due to resource limitations.
        # (e.g., Actor requires 2 GPUs but there is only 1 gpu available).
        ready_tasks = [
            task for data in raylet_stats.values()
            for task in data.get("readyTasks", [])
        ]
        actor_tree = self.node_stats.get_actor_tree(
            workers_info_by_node, infeasible_tasks, ready_tasks)

        debug = bool(os.environ.get("RAY_DASHBOARD_DEBUG"))
        # The rendered tree is the same for every node; build it at most once.
        actor_tree_text = None
        resource_prefix = "ResourceName:"
        value_prefix = "ValueType:"

        for data in raylet_stats.values():
            # Collect the measures of the views we summarise. Optional
            # fields are read with .get() like the task/worker fields above:
            # NOTE(review): replies appear to be MessageToDict output, which
            # omits empty fields by default, so a view may be absent.
            measures_by_view = {}
            for view_data in data.get("viewData", []):
                view_name = view_data["viewName"]
                if view_name in self._SUMMARY_VIEWS:
                    measures_by_view[view_name] = measures_to_dict(
                        view_data["measures"])

            total = measures_by_view.get("local_total_resource", {})
            available = measures_by_view.get("local_available_resource", {})
            usage = []
            for key, total_amount in total.items():
                available_amount = available.get(key, .0)
                resource_name = key[len(resource_prefix):]
                usage.append("{}: {} / {}".format(
                    resource_name,
                    format_resource(resource_name,
                                    total_amount - available_amount),
                    format_resource(resource_name, total_amount)))
            data["extraInfo"] = ", ".join(usage) + "\n"

            if debug:
                object_stats = measures_by_view.get("object_manager_stats", {})
                data["extraInfo"] += ", ".join(
                    "{}: {}".format(
                        stats_name,
                        object_stats.get(value_prefix + stats_name, .0))
                    for stats_name in ("used_object_store_memory",
                                       "num_local_objects"))
                if actor_tree_text is None:
                    actor_tree_text = self._format_actor_tree(actor_tree)
                data["extraInfo"] += "\n" + actor_tree_text

        return {"nodes": raylet_stats, "actors": actor_tree}

    def get_ray_config(self):
        """Read autoscaler settings from ``~/ray_bootstrap_config.yaml``.

        Returns:
            An ``(error, config)`` pair: ``("No config", None)`` when the
            file cannot be opened or parsed, otherwise ``(None, config)``.

        Raises:
            KeyError: If the loaded config lacks one of the worker-count,
                autoscaling-mode or idle-timeout keys.
        """
        try:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except Exception:
            # Best-effort by design: any failure to load is reported to the
            # caller as a missing config instead of propagating.
            return "No config", None

        config = {
            key: cfg[key]
            for key in ("min_workers", "max_workers", "initial_workers",
                        "autoscaling_mode", "idle_timeout_minutes")
        }
        for field, section in (("head_type", "head_node"),
                               ("worker_type", "worker_nodes")):
            try:
                config[field] = cfg[section]["InstanceType"]
            except KeyError:
                config[field] = "unknown"

        return None, config

    def get_node_info(self):
        """Return ``NodeStats.get_node_stats()``."""
        return self.node_stats.get_node_stats()

    def get_raylet_info(self):
        """Return the raylet/actor payload from ``_construct_raylet_info``."""
        return self._construct_raylet_info()

    def tune_info(self):
        """Return Tune stats, or ``{}`` when ``Analysis`` is ``None``."""
        if Analysis is None:
            return {}
        return self.tune_stats.get_stats()

    def tune_availability(self):
        """Return Tune availability; ``{"available": False}`` without Tune."""
        if Analysis is None:
            return {"available": False}
        return self.tune_stats.get_availability()

    def launch_profiling(self, node_id, pid, duration):
        """Start profiling via ``RayletStats`` and return its profiling id."""
        return self.raylet_stats.launch_profiling(
            node_id=node_id, pid=pid, duration=duration)

    def check_profiling_status(self, profiling_id):
        """Return ``RayletStats.check_profiling_status(profiling_id)``."""
        return self.raylet_stats.check_profiling_status(profiling_id)

    def get_profiling_info(self, profiling_id):
        """Return ``RayletStats.get_profiling_info(profiling_id)``."""
        return self.raylet_stats.get_profiling_info(profiling_id)

    def kill_actor(self, actor_id, ip_address, port):
        """Forward an actor kill request to ``RayletStats.kill_actor``."""
        return self.raylet_stats.kill_actor(actor_id, ip_address, port)

    def get_logs(self, hostname, pid):
        """Return ``NodeStats.get_logs(hostname, pid)``."""
        return self.node_stats.get_logs(hostname, pid)

    def get_errors(self, hostname, pid):
        """Return ``NodeStats.get_errors(hostname, pid)``."""
        return self.node_stats.get_errors(hostname, pid)

    def start_collecting_metrics(self):
        """Start every collector this controller owns."""
        self.node_stats.start()
        self.raylet_stats.start()
        if Analysis is not None:
            self.tune_stats.start()
|
||||
|
||||
|
||||
class DashboardRouteHandler(BaseDashboardRouteHandler):
    """aiohttp request handlers backed by a ``DashboardController``.

    Unless noted otherwise, each handler returns the controller's result
    wrapped by ``json_response`` (result / timestamp / error envelope).
    """

    def __init__(self, dashboard_controller: DashboardController,
                 is_dev=False):
        """Store the controller and the dev-mode flag.

        Args:
            dashboard_controller: Source of all data served by the handlers.
            is_dev: Forwarded to ``json_response`` for every reply.
        """
        self.dashboard_controller = dashboard_controller
        self.is_dev = is_dev

    def forbidden(self) -> aiohttp.web.Response:
        """Return a plain-text 403 response."""
        return aiohttp.web.Response(status=403, text="403 Forbidden")

    async def get_forbidden(self, _) -> aiohttp.web.Response:
        """Handler that always answers 403."""
        return self.forbidden()

    async def get_index(self, req) -> aiohttp.web.Response:
        """Serve ``client/build/index.html`` located next to this module."""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return aiohttp.web.FileResponse(
            os.path.join(module_dir, "client/build/index.html"))

    async def get_favicon(self, req) -> aiohttp.web.Response:
        """Serve ``client/build/favicon.ico`` located next to this module."""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return aiohttp.web.FileResponse(
            os.path.join(module_dir, "client/build/favicon.ico"))

    async def ray_config(self, req) -> aiohttp.web.Response:
        """Return the autoscaler config, or the controller's error string."""
        error, result = self.dashboard_controller.get_ray_config()
        if error:
            return await json_response(self.is_dev, error=error)
        return await json_response(self.is_dev, result=result)

    async def node_info(self, req) -> aiohttp.web.Response:
        """Return node stats, timestamped from before they were gathered."""
        now = datetime.datetime.utcnow()
        node_stats = self.dashboard_controller.get_node_info()
        return await json_response(self.is_dev, result=node_stats, ts=now)

    async def raylet_info(self, req) -> aiohttp.web.Response:
        """Return the controller's raylet info."""
        result = self.dashboard_controller.get_raylet_info()
        return await json_response(self.is_dev, result=result)

    async def tune_info(self, req) -> aiohttp.web.Response:
        """Return the controller's Tune info."""
        result = self.dashboard_controller.tune_info()
        return await json_response(self.is_dev, result=result)

    async def tune_availability(self, req) -> aiohttp.web.Response:
        """Return the controller's Tune availability."""
        result = self.dashboard_controller.tune_availability()
        return await json_response(self.is_dev, result=result)

    async def launch_profiling(self, req) -> aiohttp.web.Response:
        """Start profiling and return the profiling id as a string.

        Reads ``node_id``, ``pid`` and ``duration`` from the query string;
        ``pid`` and ``duration`` are converted with ``int``.
        """
        node_id = req.query.get("node_id")
        pid = int(req.query.get("pid"))
        duration = int(req.query.get("duration"))
        profiling_id = self.dashboard_controller.launch_profiling(
            node_id, pid, duration)
        return await json_response(self.is_dev, result=str(profiling_id))

    async def check_profiling_status(self, req) -> aiohttp.web.Response:
        """Return the status of the run named by ``profiling_id``."""
        profiling_id = req.query.get("profiling_id")
        status = self.dashboard_controller.check_profiling_status(profiling_id)
        return await json_response(self.is_dev, result=status)

    async def get_profiling_info(self, req) -> aiohttp.web.Response:
        """Return the profiling data for ``profiling_id`` as bare JSON.

        Unlike the other handlers, the payload is not wrapped in the
        result/timestamp/error envelope.
        """
        profiling_id = req.query.get("profiling_id")
        profiling_info = self.dashboard_controller.get_profiling_info(
            profiling_id)
        # aiohttp.web.json_response accepts only the payload positionally;
        # passing is_dev in front of it raised TypeError on every request.
        # NOTE(review): kept un-enveloped, presumably so the profile can be
        # loaded directly by the speedscope viewer - confirm.
        return aiohttp.web.json_response(profiling_info)

    async def kill_actor(self, req) -> aiohttp.web.Response:
        """Ask the controller to kill an actor and return its reply.

        Reads ``actor_id``, ``ip_address`` and ``port`` from the query
        string and forwards them unchanged.
        """
        actor_id = req.query.get("actor_id")
        ip_address = req.query.get("ip_address")
        port = req.query.get("port")
        result = self.dashboard_controller.kill_actor(actor_id, ip_address,
                                                      port)
        return await json_response(self.is_dev, result=result)

    async def logs(self, req) -> aiohttp.web.Response:
        """Return logs for the ``hostname`` / ``pid`` query parameters."""
        hostname = req.query.get("hostname")
        pid = req.query.get("pid")
        result = self.dashboard_controller.get_logs(hostname, pid)
        return await json_response(self.is_dev, result=result)

    async def errors(self, req) -> aiohttp.web.Response:
        """Return errors for the ``hostname`` / ``pid`` query parameters."""
        hostname = req.query.get("hostname")
        pid = req.query.get("pid")
        result = self.dashboard_controller.get_errors(hostname, pid)
        return await json_response(self.is_dev, result=result)
|
||||
|
||||
|
||||
class MetricsExportHandler:
    """aiohttp handlers for enabling and locating the metrics export."""

    def __init__(self,
                 dashboard_controller: DashboardController,
                 metrics_export_client: MetricsExportClient,
                 dashboard_id,
                 is_dev=False):
        """Store the collaborators used by the handlers.

        Args:
            dashboard_controller: Kept on the instance.
            metrics_export_client: Client queried by every handler; must
                not be ``None``.
            dashboard_id: Accepted but not stored.
            is_dev: Forwarded to ``json_response`` for every reply.
        """
        assert metrics_export_client is not None
        self.metrics_export_client = metrics_export_client
        self.dashboard_controller = dashboard_controller
        self.is_dev = is_dev

    async def _url_response(self, url, error=None) -> aiohttp.web.Response:
        """Reply with ``{"url": url}`` as the result and an optional error."""
        return await json_response(
            self.is_dev, result={"url": url}, error=error)

    async def enable_export_metrics(self, req) -> aiohttp.web.Response:
        """Start exporting metrics and reply with the dashboard URL.

        Replies with a null URL and an error when exporting is already
        enabled or when starting it fails.
        """
        client = self.metrics_export_client
        if client.enabled:
            return await self._url_response(None, error="Already enabled")

        succeed, error = client.start_exporting_metrics()
        error_msg = "Failed to enable it. Error: {}".format(error)
        if succeed:
            return await self._url_response(client.dashboard_url)
        return await self._url_response(None, error=error_msg)

    async def get_dashboard_address(self, req) -> aiohttp.web.Response:
        """Reply with the dashboard URL, or an error when not enabled."""
        client = self.metrics_export_client
        if client.enabled:
            return await self._url_response(client.dashboard_url)
        return await self._url_response(
            None, error="Metrics exporting is not enabled.")

    async def redirect_to_dashboard(self, req) -> aiohttp.web.Response:
        """Redirect (HTTP 302) to the dashboard URL when export is enabled.

        Replies with a JSON error instead of redirecting when it is not.
        """
        client = self.metrics_export_client
        if client.enabled:
            raise aiohttp.web.HTTPFound(client.dashboard_url)
        return await self._url_response(
            None,
            error="You should enable metrics export to use this endpoint.")
|
||||
|
||||
|
||||
def setup_metrics_export_routes(app: aiohttp.web.Application,
                                handler: MetricsExportHandler):
    """Register the metrics-export GET endpoints of ``handler`` on ``app``.

    Adds ``/api/metrics/enable``, ``/api/metrics/url`` and
    ``/metrics/redirect``.
    """
    router = app.router
    router.add_get("/api/metrics/enable", handler.enable_export_metrics)
    router.add_get("/api/metrics/url", handler.get_dashboard_address)
    router.add_get("/metrics/redirect", handler.redirect_to_dashboard)
|
||||
|
||||
|
||||
def setup_static_dir(app):
    """Serve the built dashboard client's static assets under ``/static``.

    Args:
        app: Application whose router receives the static route.

    Returns:
        The path of the ``client/build`` directory next to this module.

    Raises:
        OSError: With ``errno.ENOENT`` when the build directory does not
            exist; the message explains how to build the client.
    """
    build_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "client/build")
    if not os.path.isdir(build_dir):
        # The pieces are joined by implicit concatenation, so every
        # fragment must carry its own trailing space.
        raise OSError(
            errno.ENOENT, "Dashboard build directory not found. If installing "
            "from source, please follow the additional steps "
            "required to build the dashboard "
            "(cd python/ray/dashboard/client "
            "&& npm ci "
            "&& npm run build)", build_dir)

    static_dir = os.path.join(build_dir, "static")
    app.router.add_static("/static", static_dir)
    return build_dir
|
||||
|
||||
|
||||
def setup_speedscope_dir(app, build_dir):
    """Serve ``<build_dir>/speedscope-1.5.3`` under the ``/speedscope`` route.

    Args:
        app: Application whose router receives the static route.
        build_dir: Directory that contains the ``speedscope-1.5.3`` folder.
    """
    app.router.add_static(
        "/speedscope", os.path.join(build_dir, "speedscope-1.5.3"))
|
||||
|
||||
|
||||
def setup_dashboard_route(app: aiohttp.web.Application,
                          handler: BaseDashboardRouteHandler,
                          index=None,
                          favicon=None,
                          ray_config=None,
                          node_info=None,
                          raylet_info=None,
                          tune_info=None,
                          tune_availability=None,
                          launch_profiling=None,
                          check_profiling_status=None,
                          get_profiling_info=None,
                          kill_actor=None,
                          logs=None,
                          errors=None):
    """Register GET routes on ``app`` for the endpoints that are given a path.

    Each keyword argument is the URL path for the matching ``handler``
    method; an endpoint whose path is ``None`` is not registered.

    Args:
        app: Application whose router receives the routes.
        handler: Object providing the handler methods.
        index, favicon, ray_config, node_info, raylet_info, tune_info,
        tune_availability, launch_profiling, check_profiling_status,
        get_profiling_info, kill_actor, logs, errors: Optional URL paths.
    """
    # (path, handler method) pairs, in registration order.
    route_table = (
        (index, handler.get_index),
        (favicon, handler.get_favicon),
        (ray_config, handler.ray_config),
        (node_info, handler.node_info),
        (raylet_info, handler.raylet_info),
        (tune_info, handler.tune_info),
        (tune_availability, handler.tune_availability),
        (launch_profiling, handler.launch_profiling),
        (check_profiling_status, handler.check_profiling_status),
        (get_profiling_info, handler.get_profiling_info),
        (kill_actor, handler.kill_actor),
        (logs, handler.logs),
        (errors, handler.errors),
    )
    for route, handler_func in route_table:
        if route is not None:
            app.router.add_get(route, handler_func)
|
||||
|
||||
|
||||
class Dashboard:
|
||||
"""A dashboard process for monitoring Ray nodes.
|
||||
|
||||
This dashboard is made up of a REST API which collates data published by
|
||||
Reporter processes on nodes into a json structure, and a webserver
|
||||
which polls said API for display purposes.
|
||||
|
||||
Attributes:
|
||||
redis_client: A client used to communicate with the Redis server.
|
||||
Args:
|
||||
host(str): Host address of dashboard aiohttp server.
|
||||
port(str): Port number of dashboard aiohttp server.
|
||||
redis_address(str): GCS address of a Ray cluster
|
||||
temp_dir (str): The temporary directory used for log files and
|
||||
information for this Ray session.
|
||||
redis_password(str): Redis password to access GCS
|
||||
metrics_export_address(str): The address where users host their dashboard.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -112,18 +483,16 @@ class Dashboard(object):
|
||||
port,
|
||||
redis_address,
|
||||
temp_dir,
|
||||
redis_password=None):
|
||||
"""Initialize the dashboard object."""
|
||||
redis_password=None,
|
||||
metrics_export_address=None):
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.redis_client = ray.services.create_redis_client(
|
||||
redis_address, password=redis_password)
|
||||
self.temp_dir = temp_dir
|
||||
|
||||
self.node_stats = NodeStats(redis_address, redis_password)
|
||||
self.raylet_stats = RayletStats(redis_address, redis_password)
|
||||
if Analysis is not None:
|
||||
self.tune_stats = TuneCollector(DEFAULT_RESULTS_DIR, 2.0)
|
||||
self.dashboard_id = str(uuid.uuid4())
|
||||
self.dashboard_controller = DashboardController(
|
||||
redis_address, redis_password)
|
||||
|
||||
# Setting the environment variable RAY_DASHBOARD_DEV=1 disables some
|
||||
# security checks in the dashboard server to ease development while
|
||||
@@ -132,240 +501,72 @@ class Dashboard(object):
|
||||
self.is_dev = os.environ.get("RAY_DASHBOARD_DEV") == "1"
|
||||
|
||||
self.app = aiohttp.web.Application()
|
||||
self.setup_routes()
|
||||
route_handler = DashboardRouteHandler(
|
||||
self.dashboard_controller, is_dev=self.is_dev)
|
||||
|
||||
def setup_routes(self):
|
||||
def forbidden() -> aiohttp.web.Response:
|
||||
return aiohttp.web.Response(status=403, text="403 Forbidden")
|
||||
# Setup Metrics exporting service if necessary.
|
||||
self.metrics_export_address = metrics_export_address
|
||||
if self.metrics_export_address:
|
||||
self._setup_metrics_export()
|
||||
|
||||
def get_forbidden(_) -> aiohttp.web.Response:
|
||||
return forbidden()
|
||||
# Setup Dashboard Routes
|
||||
build_dir = setup_static_dir(self.app)
|
||||
setup_speedscope_dir(self.app, build_dir)
|
||||
setup_dashboard_route(
|
||||
self.app,
|
||||
route_handler,
|
||||
index="/",
|
||||
favicon="/favicon.ico",
|
||||
ray_config="/api/ray_config",
|
||||
node_info="/api/node_info",
|
||||
raylet_info="/api/raylet_info",
|
||||
tune_info="/api/tune_info",
|
||||
tune_availability="/api/tune_availability",
|
||||
launch_profiling="/api/launch_profiling",
|
||||
check_profiling_status="/api/check_profiling_status",
|
||||
get_profiling_info="/api/get_profiling_info",
|
||||
kill_actor="/api/kill_actor",
|
||||
logs="/api/logs",
|
||||
errors="/api/errors")
|
||||
self.app.router.add_get("/{_}", route_handler.get_forbidden)
|
||||
|
||||
async def get_index(req) -> aiohttp.web.Response:
|
||||
return aiohttp.web.FileResponse(
|
||||
os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)),
|
||||
"client/build/index.html"))
|
||||
def _setup_metrics_export(self):
|
||||
exporter = Exporter(self.dashboard_id, self.metrics_export_address,
|
||||
self.dashboard_controller)
|
||||
self.metrics_export_client = MetricsExportClient(
|
||||
self.metrics_export_address, self.dashboard_controller,
|
||||
self.dashboard_id, exporter)
|
||||
|
||||
async def get_favicon(req) -> aiohttp.web.Response:
|
||||
return aiohttp.web.FileResponse(
|
||||
os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)),
|
||||
"client/build/favicon.ico"))
|
||||
# Setup endpoints
|
||||
metrics_export_handler = MetricsExportHandler(
|
||||
self.dashboard_controller,
|
||||
self.metrics_export_client,
|
||||
self.dashboard_id,
|
||||
is_dev=self.is_dev)
|
||||
setup_metrics_export_routes(self.app, metrics_export_handler)
|
||||
|
||||
async def json_response(result=None, error=None,
|
||||
ts=None) -> aiohttp.web.Response:
|
||||
if ts is None:
|
||||
ts = datetime.datetime.utcnow()
|
||||
|
||||
headers = None
|
||||
if self.is_dev:
|
||||
headers = {"Access-Control-Allow-Origin": "*"}
|
||||
|
||||
return aiohttp.web.json_response(
|
||||
{
|
||||
"result": result,
|
||||
"timestamp": to_unix_time(ts),
|
||||
"error": error,
|
||||
},
|
||||
headers=headers)
|
||||
|
||||
async def ray_config(_) -> aiohttp.web.Response:
|
||||
try:
|
||||
config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
|
||||
with open(config_path) as f:
|
||||
cfg = yaml.safe_load(f)
|
||||
except Exception:
|
||||
return await json_response(error="No config")
|
||||
|
||||
D = {
|
||||
"min_workers": cfg["min_workers"],
|
||||
"max_workers": cfg["max_workers"],
|
||||
"initial_workers": cfg["initial_workers"],
|
||||
"autoscaling_mode": cfg["autoscaling_mode"],
|
||||
"idle_timeout_minutes": cfg["idle_timeout_minutes"],
|
||||
}
|
||||
|
||||
try:
|
||||
D["head_type"] = cfg["head_node"]["InstanceType"]
|
||||
except KeyError:
|
||||
D["head_type"] = "unknown"
|
||||
|
||||
try:
|
||||
D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
|
||||
except KeyError:
|
||||
D["worker_type"] = "unknown"
|
||||
|
||||
return await json_response(result=D)
|
||||
|
||||
async def node_info(req) -> aiohttp.web.Response:
|
||||
now = datetime.datetime.utcnow()
|
||||
D = self.node_stats.get_node_stats()
|
||||
return await json_response(result=D, ts=now)
|
||||
|
||||
async def raylet_info(req) -> aiohttp.web.Response:
|
||||
D = self.raylet_stats.get_raylet_stats()
|
||||
workers_info_by_node = {
|
||||
data["nodeId"]: data.get("workersStats")
|
||||
for data in D.values()
|
||||
}
|
||||
infeasible_tasks = sum(
|
||||
(data.get("infeasibleTasks", []) for data in D.values()), [])
|
||||
# ready_tasks are used to render tasks that are not schedulable
|
||||
# due to resource limitations.
|
||||
# (e.g., Actor requires 2 GPUs but there is only 1 gpu available).
|
||||
ready_tasks = sum(
|
||||
(data.get("readyTasks", []) for data in D.values()), [])
|
||||
actor_tree = self.node_stats.get_actor_tree(
|
||||
workers_info_by_node, infeasible_tasks, ready_tasks)
|
||||
for address, data in D.items():
|
||||
# process view data
|
||||
measures_dicts = {}
|
||||
for view_data in data["viewData"]:
|
||||
view_name = view_data["viewName"]
|
||||
if view_name in ("local_available_resource",
|
||||
"local_total_resource",
|
||||
"object_manager_stats"):
|
||||
measures_dicts[view_name] = measures_to_dict(
|
||||
view_data["measures"])
|
||||
# process resources info
|
||||
extra_info_strings = []
|
||||
prefix = "ResourceName:"
|
||||
for resource_name, total_resource in measures_dicts[
|
||||
"local_total_resource"].items():
|
||||
available_resource = measures_dicts[
|
||||
"local_available_resource"].get(resource_name, .0)
|
||||
resource_name = resource_name[len(prefix):]
|
||||
extra_info_strings.append("{}: {} / {}".format(
|
||||
resource_name,
|
||||
format_resource(resource_name,
|
||||
total_resource - available_resource),
|
||||
format_resource(resource_name, total_resource)))
|
||||
data["extraInfo"] = ", ".join(extra_info_strings) + "\n"
|
||||
if os.environ.get("RAY_DASHBOARD_DEBUG"):
|
||||
# process object store info
|
||||
extra_info_strings = []
|
||||
prefix = "ValueType:"
|
||||
for stats_name in [
|
||||
"used_object_store_memory", "num_local_objects"
|
||||
]:
|
||||
stats_value = measures_dicts[
|
||||
"object_manager_stats"].get(
|
||||
prefix + stats_name, .0)
|
||||
extra_info_strings.append("{}: {}".format(
|
||||
stats_name, stats_value))
|
||||
data["extraInfo"] += ", ".join(extra_info_strings)
|
||||
# process actor info
|
||||
actor_tree_str = json.dumps(
|
||||
actor_tree, indent=2, sort_keys=True)
|
||||
lines = actor_tree_str.split("\n")
|
||||
max_line_length = max(map(len, lines))
|
||||
to_print = []
|
||||
for line in lines:
|
||||
to_print.append(line +
|
||||
(max_line_length - len(line)) * " ")
|
||||
data["extraInfo"] += "\n" + "\n".join(to_print)
|
||||
result = {"nodes": D, "actors": actor_tree}
|
||||
return await json_response(result=result)
|
||||
|
||||
async def tune_info(req) -> aiohttp.web.Response:
|
||||
if Analysis is not None:
|
||||
D = self.tune_stats.get_stats()
|
||||
else:
|
||||
D = {}
|
||||
return await json_response(result=D)
|
||||
|
||||
async def tune_availability(req) -> aiohttp.web.Response:
|
||||
if Analysis is not None:
|
||||
D = self.tune_stats.get_availability()
|
||||
else:
|
||||
D = {"available": False}
|
||||
return await json_response(result=D)
|
||||
|
||||
async def launch_profiling(req) -> aiohttp.web.Response:
|
||||
node_id = req.query.get("node_id")
|
||||
pid = int(req.query.get("pid"))
|
||||
duration = int(req.query.get("duration"))
|
||||
profiling_id = self.raylet_stats.launch_profiling(
|
||||
node_id=node_id, pid=pid, duration=duration)
|
||||
return await json_response(str(profiling_id))
|
||||
|
||||
async def check_profiling_status(req) -> aiohttp.web.Response:
|
||||
profiling_id = req.query.get("profiling_id")
|
||||
return await json_response(
|
||||
self.raylet_stats.check_profiling_status(profiling_id))
|
||||
|
||||
async def get_profiling_info(req) -> aiohttp.web.Response:
|
||||
profiling_id = req.query.get("profiling_id")
|
||||
return aiohttp.web.json_response(
|
||||
self.raylet_stats.get_profiling_info(profiling_id))
|
||||
|
||||
async def kill_actor(req) -> aiohttp.web.Response:
|
||||
actor_id = req.query.get("actor_id")
|
||||
ip_address = req.query.get("ip_address")
|
||||
port = req.query.get("port")
|
||||
return await json_response(
|
||||
self.raylet_stats.kill_actor(actor_id, ip_address, port))
|
||||
|
||||
async def logs(req) -> aiohttp.web.Response:
|
||||
hostname = req.query.get("hostname")
|
||||
pid = req.query.get("pid")
|
||||
result = self.node_stats.get_logs(hostname, pid)
|
||||
return await json_response(result=result)
|
||||
|
||||
async def errors(req) -> aiohttp.web.Response:
|
||||
hostname = req.query.get("hostname")
|
||||
pid = req.query.get("pid")
|
||||
result = self.node_stats.get_errors(hostname, pid)
|
||||
return await json_response(result=result)
|
||||
|
||||
self.app.router.add_get("/", get_index)
|
||||
self.app.router.add_get("/favicon.ico", get_favicon)
|
||||
|
||||
build_dir = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "client/build")
|
||||
if not os.path.isdir(build_dir):
|
||||
raise OSError(
|
||||
errno.ENOENT,
|
||||
"Dashboard build directory not found. If installing "
|
||||
"from source, please follow the additional steps required to "
|
||||
"build the dashboard "
|
||||
"(cd python/ray/dashboard/client && npm ci && npm run build)",
|
||||
build_dir)
|
||||
|
||||
static_dir = os.path.join(build_dir, "static")
|
||||
self.app.router.add_static("/static", static_dir)
|
||||
|
||||
speedscope_dir = os.path.join(build_dir, "speedscope-1.5.3")
|
||||
self.app.router.add_static("/speedscope", speedscope_dir)
|
||||
|
||||
self.app.router.add_get("/api/ray_config", ray_config)
|
||||
self.app.router.add_get("/api/node_info", node_info)
|
||||
self.app.router.add_get("/api/raylet_info", raylet_info)
|
||||
self.app.router.add_get("/api/tune_info", tune_info)
|
||||
self.app.router.add_get("/api/tune_availability", tune_availability)
|
||||
self.app.router.add_get("/api/launch_profiling", launch_profiling)
|
||||
self.app.router.add_get("/api/check_profiling_status",
|
||||
check_profiling_status)
|
||||
self.app.router.add_get("/api/get_profiling_info", get_profiling_info)
|
||||
self.app.router.add_get("/api/kill_actor", kill_actor)
|
||||
self.app.router.add_get("/api/logs", logs)
|
||||
self.app.router.add_get("/api/errors", errors)
|
||||
|
||||
self.app.router.add_get("/{_}", get_forbidden)
|
||||
def _start_exporting_metrics(self):
|
||||
result, error = self.metrics_export_client.start_exporting_metrics()
|
||||
if not result and error:
|
||||
url = ray.services.get_webui_url_from_redis(self.redis_client)
|
||||
error += (" Please reenable the metrics export by going to "
|
||||
"the url: {}/api/metrics/enable".format(url))
|
||||
ray.utils.push_error_to_driver_through_redis(
|
||||
self.redis_client, "metrics export failed", error)
|
||||
|
||||
def log_dashboard_url(self):
|
||||
url = ray.services.get_webui_url_from_redis(self.redis_client)
|
||||
if url is None:
|
||||
raise ValueError("WebUI URL is not present in GCS.")
|
||||
with open(os.path.join(self.temp_dir, "dashboard_url"), "w") as f:
|
||||
f.write(url)
|
||||
logger.info("Dashboard running on {}".format(url))
|
||||
|
||||
def run(self):
|
||||
self.log_dashboard_url()
|
||||
self.node_stats.start()
|
||||
self.raylet_stats.start()
|
||||
if Analysis is not None:
|
||||
self.tune_stats.start()
|
||||
self.dashboard_controller.start_collecting_metrics()
|
||||
if self.metrics_export_address:
|
||||
self._start_exporting_metrics()
|
||||
aiohttp.web.run_app(self.app, host=self.host, port=self.port)
|
||||
|
||||
|
||||
@@ -409,7 +610,7 @@ class NodeStats(threading.Thread):
|
||||
|
||||
super().__init__()
|
||||
|
||||
def calculate_log_counts(self):
|
||||
def _calculate_log_counts(self):
|
||||
return {
|
||||
ip: {
|
||||
pid: len(logs_for_pid)
|
||||
@@ -418,7 +619,7 @@ class NodeStats(threading.Thread):
|
||||
for ip, logs_for_ip in self._logs.items()
|
||||
}
|
||||
|
||||
def calculate_error_counts(self):
|
||||
def _calculate_error_counts(self):
|
||||
return {
|
||||
ip: {
|
||||
pid: len(errors_for_pid)
|
||||
@@ -427,7 +628,7 @@ class NodeStats(threading.Thread):
|
||||
for ip, errors_for_ip in self._errors.items()
|
||||
}
|
||||
|
||||
def purge_outdated_stats(self):
|
||||
def _purge_outdated_stats(self):
|
||||
def current(then, now):
|
||||
if (now - then) > 5:
|
||||
return False
|
||||
@@ -442,14 +643,14 @@ class NodeStats(threading.Thread):
|
||||
|
||||
def get_node_stats(self) -> Dict:
|
||||
with self._node_stats_lock:
|
||||
self.purge_outdated_stats()
|
||||
self._purge_outdated_stats()
|
||||
node_stats = sorted(
|
||||
(v for v in self._node_stats.values()),
|
||||
key=itemgetter("boot_time"))
|
||||
return {
|
||||
"clients": node_stats,
|
||||
"log_counts": self.calculate_log_counts(),
|
||||
"error_counts": self.calculate_error_counts(),
|
||||
"log_counts": self._calculate_log_counts(),
|
||||
"error_counts": self._calculate_error_counts(),
|
||||
}
|
||||
|
||||
def get_actor_tree(self, workers_info_by_node, infeasible_tasks,
|
||||
@@ -606,6 +807,7 @@ class NodeStats(threading.Thread):
|
||||
else:
|
||||
data = json.loads(ray.utils.decode(data))
|
||||
self._node_stats[data["hostname"]] = data
|
||||
|
||||
except Exception:
|
||||
logger.exception(traceback.format_exc())
|
||||
continue
|
||||
@@ -624,11 +826,11 @@ class RayletStats(threading.Thread):
|
||||
self._raylet_stats = {}
|
||||
self._profiling_stats = {}
|
||||
|
||||
self.update_nodes()
|
||||
self._update_nodes()
|
||||
|
||||
super().__init__()
|
||||
|
||||
def update_nodes(self):
|
||||
def _update_nodes(self):
|
||||
with self.nodes_lock:
|
||||
self.nodes = ray.nodes()
|
||||
node_ids = [node["NodeID"] for node in self.nodes]
|
||||
@@ -688,15 +890,15 @@ class RayletStats(threading.Thread):
|
||||
def check_profiling_status(self, profiling_id):
|
||||
with self._raylet_stats_lock:
|
||||
is_present = profiling_id in self._profiling_stats
|
||||
if is_present:
|
||||
reply = self._profiling_stats[profiling_id]
|
||||
if reply.stderr:
|
||||
return {"status": "error", "error": reply.stderr}
|
||||
else:
|
||||
return {"status": "finished"}
|
||||
else:
|
||||
if not is_present:
|
||||
return {"status": "pending"}
|
||||
|
||||
reply = self._profiling_stats[profiling_id]
|
||||
if reply.stderr:
|
||||
return {"status": "error", "error": reply.stderr}
|
||||
else:
|
||||
return {"status": "finished"}
|
||||
|
||||
def get_profiling_info(self, profiling_id):
|
||||
with self._raylet_stats_lock:
|
||||
profiling_stats = self._profiling_stats.get(profiling_id)
|
||||
@@ -721,22 +923,27 @@ class RayletStats(threading.Thread):
|
||||
while True:
|
||||
time.sleep(1.0)
|
||||
replies = {}
|
||||
for node in self.nodes:
|
||||
node_id = node["NodeID"]
|
||||
stub = self.stubs[node_id]
|
||||
reply = stub.GetNodeStats(
|
||||
node_manager_pb2.GetNodeStatsRequest(), timeout=2)
|
||||
reply_dict = MessageToDict(reply)
|
||||
reply_dict["nodeId"] = node_id
|
||||
replies[node["NodeManagerAddress"]] = reply_dict
|
||||
with self._raylet_stats_lock:
|
||||
for address, reply_dict in replies.items():
|
||||
self._raylet_stats[address] = reply_dict
|
||||
counter += 1
|
||||
# From time to time, check if new nodes have joined the cluster
|
||||
# and update self.nodes
|
||||
if counter % 10:
|
||||
self.update_nodes()
|
||||
|
||||
try:
|
||||
for node in self.nodes:
|
||||
node_id = node["NodeID"]
|
||||
stub = self.stubs[node_id]
|
||||
reply = stub.GetNodeStats(
|
||||
node_manager_pb2.GetNodeStatsRequest(), timeout=2)
|
||||
reply_dict = MessageToDict(reply)
|
||||
reply_dict["nodeId"] = node_id
|
||||
replies[node["NodeManagerAddress"]] = reply_dict
|
||||
with self._raylet_stats_lock:
|
||||
for address, reply_dict in replies.items():
|
||||
self._raylet_stats[address] = reply_dict
|
||||
except Exception:
|
||||
logger.exception(traceback.format_exc())
|
||||
finally:
|
||||
counter += 1
|
||||
# From time to time, check if new nodes have joined the cluster
|
||||
# and update self.nodes
|
||||
if counter % 10:
|
||||
self._update_nodes()
|
||||
|
||||
|
||||
class TuneCollector(threading.Thread):
|
||||
@@ -749,7 +956,6 @@ class TuneCollector(threading.Thread):
|
||||
"""
|
||||
|
||||
def __init__(self, logdir, reload_interval):
|
||||
super().__init__()
|
||||
self._logdir = logdir
|
||||
self._trial_records = {}
|
||||
self._data_lock = threading.Lock()
|
||||
@@ -757,6 +963,9 @@ class TuneCollector(threading.Thread):
|
||||
self._available = False
|
||||
self._tensor_board_started = False
|
||||
|
||||
os.makedirs(self._logdir, exist_ok=True)
|
||||
super().__init__()
|
||||
|
||||
def get_stats(self):
|
||||
with self._data_lock:
|
||||
return {"trial_records": copy.deepcopy(self._trial_records)}
|
||||
@@ -920,6 +1129,8 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.utils.setup_logger(args.logging_level, args.logging_format)
|
||||
|
||||
metrics_export_address = os.environ.get("METRICS_EXPORT_ADDRESS")
|
||||
|
||||
try:
|
||||
dashboard = Dashboard(
|
||||
args.host,
|
||||
@@ -927,7 +1138,7 @@ if __name__ == "__main__":
|
||||
args.redis_address,
|
||||
args.temp_dir,
|
||||
redis_password=args.redis_password,
|
||||
)
|
||||
metrics_export_address=metrics_export_address)
|
||||
dashboard.run()
|
||||
except Exception as e:
|
||||
# Something went wrong, so push an error to all drivers.
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
import aiohttp
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseDashboardController(ABC):
    """Set of APIs to interact with a Dashboard class and routes.

    Make sure you run start_collecting_metrics function before using
    get_[stats]_info methods.

    Every method here is abstract: a concrete controller has to override
    all of them before it can be instantiated.
    """

    @abstractmethod
    def get_ray_config(self):
        """Abstract. The metrics exporter forwards the result as `ray_config`
        and serializes it to JSON."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def get_node_info(self):
        """Abstract. The metrics exporter forwards the result as `node_info`
        and serializes it to JSON."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def get_raylet_info(self):
        """Abstract. The metrics exporter forwards the result as
        `raylet_info` and serializes it to JSON."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def tune_info(self):
        """Abstract. The metrics exporter forwards the result as `tune_info`
        and serializes it to JSON."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def tune_availability(self):
        """Abstract. The metrics exporter forwards the result as
        `tune_availability` and serializes it to JSON."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def launch_profiling(self, node_id, pid, duration):
        """Abstract; must be overridden by the concrete controller."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def check_profiling_status(self, profiling_id):
        """Abstract; must be overridden by the concrete controller."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def get_profiling_info(self, profiling_id):
        """Abstract; must be overridden by the concrete controller."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def kill_actor(self, actor_id, ip_address, port):
        """Abstract; must be overridden by the concrete controller."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def get_logs(self, hostname, pid):
        """Abstract; must be overridden by the concrete controller."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def get_errors(self, hostname, pid):
        """Abstract; must be overridden by the concrete controller."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    def start_collecting_metrics(self):
        """Start threads/processes/actors to collect metrics

        NOTE: This interface should be called only once before using
            other api calls.
        """
        raise NotImplementedError("Please implement this method.")
|
||||
|
||||
|
||||
class BaseDashboardRouteHandler(ABC):
    """Collection of routes that should be implemented for dashboard.

    Every route is abstract, receives the incoming request as its only
    argument and is annotated to return an `aiohttp.web.Response`. All
    routes are coroutines except `get_forbidden`.
    """

    @abstractmethod
    def get_forbidden(self, _) -> aiohttp.web.Response:
        """Abstract, synchronous route; the request argument is unused by name."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def get_index(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def ray_config(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def node_info(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def raylet_info(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def tune_info(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def tune_availability(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def launch_profiling(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def check_profiling_status(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def get_profiling_info(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def kill_actor(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def logs(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")

    @abstractmethod
    async def errors(self, req) -> aiohttp.web.Response:
        """Abstract route; must be overridden by the concrete handler."""
        raise NotImplementedError("Please implement this method.")
|
||||
@@ -0,0 +1,31 @@
|
||||
try:
|
||||
import requests # `requests` is not part of stdlib.
|
||||
except ImportError:
|
||||
requests = None
|
||||
print("Couldn't import `requests` library. "
|
||||
"Be sure to install it on the client side.")
|
||||
|
||||
from ray.dashboard.metrics_exporter.schema import AuthRequest, AuthResponse
|
||||
from ray.dashboard.metrics_exporter.schema import IngestRequest, IngestResponse
|
||||
|
||||
|
||||
def authentication_request(url, cluster_id, timeout=10):
    """Send an authentication request and parse the server's answer.

    Args:
        url(str): Endpoint the request is POSTed to.
        cluster_id(str): Value sent as the `cluster_id` field.
        timeout(float): Seconds to wait for the server. `requests` waits
            forever when no timeout is given, which would hang the caller.

    Returns:
        AuthResponse built from the JSON body of the response.

    Raises:
        ImportError: If the optional `requests` library is not installed.
        requests.exceptions.RequestException: If the request times out,
            fails, or the server answers with an error status code.
        ValidationError: If the response body doesn't match AuthResponse.
    """
    # The module-level import falls back to `requests = None`; fail with a
    # clear message instead of an AttributeError on None.
    if requests is None:
        raise ImportError("Couldn't import `requests` library. "
                          "Be sure to install it on the client side.")

    auth_request = AuthRequest(cluster_id=cluster_id)
    response = requests.post(url, data=auth_request.json(), timeout=timeout)
    response.raise_for_status()
    return AuthResponse.parse_obj(response.json())
|
||||
|
||||
|
||||
def ingest_request(url, cluster_id, access_token, ray_config, node_info,
                   raylet_info, tune_info, tune_availability, timeout=10):
    """Send one batch of dashboard metrics to the export service.

    Args:
        url(str): Endpoint the request is POSTed to.
        cluster_id(str): Value sent as the `cluster_id` field.
        access_token(str): Value sent as the `access_token` field.
        ray_config: Value sent as the `ray_config` field.
        node_info: Value sent as the `node_info` field.
        raylet_info: Value sent as the `raylet_info` field.
        tune_info: Value sent as the `tune_info` field.
        tune_availability: Value sent as the `tune_availability` field.
        timeout(float): Seconds to wait for the server. `requests` waits
            forever when no timeout is given, which would hang the caller.

    All field values are serialized with `json.dumps`, so they have to be
    JSON-serializable.

    Returns:
        IngestResponse built from the JSON body of the response.

    Raises:
        ImportError: If the optional `requests` library is not installed.
        requests.exceptions.RequestException: If the request times out,
            fails, or the server answers with an error status code.
    """
    # The module-level import falls back to `requests = None`; fail with a
    # clear message instead of an AttributeError on None.
    if requests is None:
        raise ImportError("Couldn't import `requests` library. "
                          "Be sure to install it on the client side.")

    # Named `payload` so that it doesn't shadow this function's own name.
    payload = IngestRequest(
        cluster_id=cluster_id,
        access_token=access_token,
        ray_config=ray_config,
        node_info=node_info,
        raylet_info=raylet_info,
        tune_info=tune_info,
        tune_availability=tune_availability)
    response = requests.post(url, data=payload.json(), timeout=timeout)
    response.raise_for_status()
    return IngestResponse.parse_obj(response.json())
|
||||
@@ -0,0 +1,140 @@
|
||||
import logging
|
||||
import threading
|
||||
import traceback
|
||||
import time
|
||||
|
||||
from ray.dashboard.metrics_exporter import api
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MetricsExportClient:
    """Group of functionalities used by Dashboard to do external communication.

    start_exporting_metrics should not be called again once it has
    succeeded, as it would create multiple threads that export the same
    metrics.

    Args:
        address(str): Address to export metrics
        dashboard_controller(BaseDashboardController): Dashboard controller to
            run dashboard business logic.
        dashboard_id(str): Unique dashboard ID.
        exporter(Exporter): Thread to export metrics.
    """

    def __init__(self, address, dashboard_controller, dashboard_id, exporter):
        self.dashboard_id = dashboard_id
        self.dashboard_controller = dashboard_controller
        self.exporter = exporter
        self.auth_url = "{}/auth".format(address)

        # Filled in by a successful authentication request.
        self.auth_info = None
        self._dashboard_url = None

        # Client states
        self.is_authenticated = False
        self.is_exporting_started = False

    def _authenticate(self):
        """Authenticate against `auth_url` and remember the answer.

        Returns nothing; on success `auth_info` and the dashboard url are
        stored and `is_authenticated` becomes True. Any exception raised
        by the authentication request propagates to the caller.
        """
        auth_info = api.authentication_request(self.auth_url,
                                               self.dashboard_id)
        self.auth_info = auth_info
        self._dashboard_url = auth_info.dashboard_url
        self.is_authenticated = True

    @property
    def enabled(self):
        """Whether authentication has succeeded."""
        return self.is_authenticated

    @property
    def dashboard_url(self):
        """Dashboard url obtained from the authentication response."""
        # This function should be used only after authentication succeed.
        url = self._dashboard_url
        assert url is not None, ("dashboard url should be obtained by "
                                 "`start_exporting_metrics` method first.")
        return url

    def start_exporting_metrics(self):
        """Create a thread to export metrics.

        Authenticates first when that hasn't happened yet, then hands the
        access token to the exporter and starts it. Once this function
        succeeds, it should not be called again (doing so trips an
        assertion).

        Return:
            A `(succeeded, error)` tuple: `(True, None)` when the exporter
            was started, `(False, message)` when authentication failed.
        """
        assert not self.is_exporting_started

        if not self.is_authenticated:
            try:
                self._authenticate()
            except Exception as exc:
                template = ("Authentication failed with an error: {}\n"
                            "Traceback: {}")
                message = template.format(exc, traceback.format_exc())
                logger.error(message)
                return False, message

        exporter = self.exporter
        exporter.access_token = self.auth_info.access_token
        exporter.start()
        self.is_exporting_started = True
        return True, None
|
||||
|
||||
|
||||
class Exporter(threading.Thread):
    """Python thread that exports metrics periodically.

    Args:
        dashboard_id(str): Unique Dashboard ID.
        address(str): Address to export metrics.
        dashboard_controller(BaseDashboardController): dashboard
            controller for dashboard business logic.
        update_frequency(float): Seconds slept between two exports;
            must be at least 1.0.
    """

    def __init__(self,
                 dashboard_id,
                 address,
                 dashboard_controller,
                 update_frequency=1.0):
        assert update_frequency >= 1.0

        super().__init__()

        self.dashboard_id = dashboard_id
        self.dashboard_controller = dashboard_controller
        self.update_frequency = update_frequency
        self.export_address = "{}/ingest".format(address)
        # Has to be assigned through `access_token` before the thread runs.
        self._access_token = None

    @property
    def access_token(self):
        """Token sent along with every ingest request."""
        return self._access_token

    @access_token.setter
    def access_token(self, access_token):
        self._access_token = access_token

    def export(self, ray_config, node_info, raylet_info, tune_info,
               tune_availability):
        """Send one snapshot of metrics to the ingest endpoint."""
        api.ingest_request(self.export_address, self.dashboard_id,
                           self.access_token, ray_config, node_info,
                           raylet_info, tune_info, tune_availability)
        # TODO(sang): Add piggybacking response handler.

    def run(self):
        """Export metrics forever, sleeping `update_frequency` in between.

        A failure in one round is logged and the loop moves on to the next
        round.
        """
        assert self.access_token is not None, (
            "Set access token before running an exporter thread.")

        while True:
            try:
                time.sleep(self.update_frequency)
                controller = self.dashboard_controller
                ray_config = controller.get_ray_config()
                node_info = controller.get_node_info()
                raylet_info = controller.get_raylet_info()
                tune_info = controller.tune_info()
                tune_availability = controller.tune_availability()
                self.export(ray_config, node_info, raylet_info, tune_info,
                            tune_availability)
            except Exception as exc:
                logger.error("Exception occured while exporting metrics: {}.\n"
                             "Traceback: {}".format(exc,
                                                    traceback.format_exc()))
|
||||
@@ -0,0 +1,70 @@
|
||||
import json
|
||||
|
||||
|
||||
class ValidationError(Exception):
    """Raised by `BaseModel.parse_obj` when a dict doesn't match a schema."""
    pass


class BaseModel:
    """Base class to define schema.

    A subclass lists its field names in `__slots__`. A subclass that
    doesn't define `__slots__` is treated as a schema without fields.

    `parse_obj` will raise ValidationError if
        - The given dict has a key that the schema doesn't declare.
        - The given dict lacks a key that the schema declares.

    This doesn't
        - Validate types.
        - Validate keyword arguments given directly to the constructor.
    """

    def __init__(self, **kwargs):
        # Keep the raw mapping for `json()` and `__str__`, and mirror every
        # entry as an attribute.
        self._dict = kwargs
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __str__(self):
        name = "{}\n".format(self.__class__.__name__)
        return name + str(self._dict)

    def json(self):
        """Return the fields serialized as a JSON string."""
        return json.dumps(self._dict)

    @classmethod
    def parse_obj(cls, obj):
        """Build an instance of this schema from a dict.

        Args:
            obj(dict): Mapping of field name to value.

        Returns:
            A new instance of `cls` holding the entries of `obj`.

        Raises:
            AssertionError: If `obj` is not a dict.
            ValidationError: If the keys of `obj` are not exactly the
                fields declared by the schema.
        """
        # Raised explicitly (rather than via `assert`) so that the check
        # survives `python -O`; the exception type is unchanged.
        if not isinstance(obj, dict):
            raise AssertionError("It can only parse dict type object.")

        # A schema without `__slots__` has no attribute of that name at
        # all, so fall back to "no fields" instead of an AttributeError.
        required_args = getattr(cls, "__slots__", ())
        given_args = obj.keys()

        # Check if given_args have args that is not required.
        for arg in given_args:
            if arg not in required_args:
                raise ValidationError(
                    "Given argument has a key {}, which is not required "
                    "by this schema: {}".format(arg, required_args))

        # Check if given args have all required args.
        if any(arg not in obj for arg in required_args):
            raise ValidationError("Given args: {} doesn't have all the "
                                  "necessary args for this schema: {}".format(
                                      given_args, required_args))

        return cls(**obj)
|
||||
|
||||
|
||||
class IngestRequest(BaseModel):
    """Schema of the request that carries one batch of exported metrics."""

    __slots__ = [
        "cluster_id",
        "access_token",
        "ray_config",
        "node_info",
        "raylet_info",
        "tune_info",
        "tune_availability",
    ]
|
||||
|
||||
|
||||
# TODO(sang): Add piggybacked response.
class IngestResponse(BaseModel):
    """Schema of the ingest response; it has no fields yet."""

    # Declared explicitly because `BaseModel.parse_obj` reads
    # `cls.__slots__`; without it parsing any ingest response raised
    # AttributeError.
    __slots__ = []
|
||||
|
||||
|
||||
class AuthRequest(BaseModel):
    """Schema of the authentication request."""

    __slots__ = [
        "cluster_id",
    ]
|
||||
|
||||
|
||||
class AuthResponse(BaseModel):
    """Schema of the authentication response."""

    __slots__ = [
        "dashboard_url",
        "access_token",
    ]
|
||||
@@ -85,10 +85,10 @@ class RayServeHandle:
|
||||
# If both the slo's are None then then we use a high default
|
||||
# value so other queries can be prioritize and put in front of these
|
||||
# queries.
|
||||
assert not all(absolute_slo_ms,
|
||||
relative_slo_ms), ("Can't specify both "
|
||||
"relative and absolute "
|
||||
"slo's together!")
|
||||
assert not all([absolute_slo_ms, relative_slo_ms
|
||||
]), ("Can't specify both "
|
||||
"relative and absolute "
|
||||
"slo's together!")
|
||||
|
||||
# Don't override existing method
|
||||
if method_name is None and self.method_name is not None:
|
||||
|
||||
@@ -113,15 +113,16 @@ class RayServeMixin:
|
||||
"which is specified in the request. "
|
||||
"The avaiable methods are {}".format(
|
||||
method_name, dir(self)))
|
||||
|
||||
if method_name != "__call__":
|
||||
return getattr(self, method_name)
|
||||
else:
|
||||
# For simple callables, we should just return the object so
|
||||
# signature recoding will continue to funciton.
|
||||
return self
|
||||
return getattr(self, method_name)
|
||||
|
||||
def _ray_serve_count_num_positional(self, f):
|
||||
# NOTE:
|
||||
# In the case of simple functions, not actors, the f will be
|
||||
# a TaskRunner.__call__. What we really want here is the wrapped
|
||||
# functionso inspect.signature will figure out the underlying f.
|
||||
if hasattr(self, "__wrapped__"):
|
||||
f = self.__wrapped__
|
||||
|
||||
signature = inspect.signature(f)
|
||||
counter = 0
|
||||
for param in signature.parameters.values():
|
||||
|
||||
@@ -1112,13 +1112,9 @@ def start_dashboard(require_webui,
|
||||
dashboard_filepath = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "dashboard/dashboard.py")
|
||||
command = [
|
||||
sys.executable,
|
||||
"-u",
|
||||
dashboard_filepath,
|
||||
"--host={}".format(host),
|
||||
"--port={}".format(port),
|
||||
"--redis-address={}".format(redis_address),
|
||||
"--temp-dir={}".format(temp_dir),
|
||||
sys.executable, "-u", dashboard_filepath, "--host={}".format(host),
|
||||
"--port={}".format(port), "--redis-address={}".format(redis_address),
|
||||
"--temp-dir={}".format(temp_dir)
|
||||
]
|
||||
if redis_password:
|
||||
command += ["--redis-password", redis_password]
|
||||
@@ -1150,6 +1146,7 @@ def start_dashboard(require_webui,
|
||||
logger.info("View the Ray dashboard at {}{}{}{}{}".format(
|
||||
colorama.Style.BRIGHT, colorama.Fore.GREEN, dashboard_url,
|
||||
colorama.Fore.RESET, colorama.Style.NORMAL))
|
||||
|
||||
return dashboard_url, process_info
|
||||
else:
|
||||
return None, None
|
||||
|
||||
@@ -259,6 +259,14 @@ py_test(
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_metrics_export",
|
||||
size = "small",
|
||||
srcs = ["test_metrics_export.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_microbenchmarks",
|
||||
size = "small",
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
from ray.dashboard.metrics_exporter.client import MetricsExportClient
|
||||
from ray.dashboard.metrics_exporter.client import Exporter
|
||||
from ray.dashboard.metrics_exporter.schema import (AuthResponse, BaseModel,
|
||||
ValidationError)
|
||||
|
||||
MOCK_DASHBOARD_ID = "1234"
|
||||
MOCK_DASHBOARD_ADDRESS = "127.0.0.1:9081"
|
||||
MOCK_ACCESS_TOKEN = "1234"
|
||||
|
||||
|
||||
def _setup_client_and_exporter(controller):
    """Build an Exporter and a MetricsExportClient wired to `controller`.

    Returns:
        The `(exporter, client)` pair; the client owns that exporter.
    """
    exporter = Exporter(MOCK_DASHBOARD_ID, MOCK_DASHBOARD_ADDRESS, controller)
    client = MetricsExportClient(
        MOCK_DASHBOARD_ADDRESS,
        controller,
        MOCK_DASHBOARD_ID,
        exporter,
    )
    return exporter, client
|
||||
|
||||
|
||||
@patch("ray.dashboard.dashboard.DashboardController")
def test_verify_exporter_cannot_run_without_access_token(mock_controller):
    """Running an exporter that never got an access token must assert."""
    exporter, _ = _setup_client_and_exporter(mock_controller)
    # Should raise an assertion error because there's no access token set.
    with pytest.raises(AssertionError):
        exporter.run()
|
||||
|
||||
|
||||
@patch("ray.dashboard.dashboard.DashboardController")
@patch(
    "ray.dashboard.metrics_exporter.api.authentication_request",
    side_effect=requests.exceptions.HTTPError)
def test_client_invalid_request_status_returned(auth_request, mock_controller):
    """
    If authentication request fails with an invalid status code,
    `start_exporting_metrics` should fail.
    """
    _, client = _setup_client_and_exporter(mock_controller)

    # authenticate should throw an exception because API request fails.
    with pytest.raises(requests.exceptions.HTTPError):
        client._authenticate()

    # This should fail because authentication throws an exception.
    succeeded, _error = client.start_exporting_metrics()
    assert succeeded is False
|
||||
|
||||
|
||||
@patch("ray.dashboard.dashboard.DashboardController")
@patch("ray.dashboard.metrics_exporter.api.authentication_request")
def test_authentication(auth_request, mock_controller):
    """A successful authentication enables the client and sets its url."""
    auth_request.return_value = AuthResponse(
        dashboard_url=MOCK_DASHBOARD_ADDRESS, access_token=MOCK_ACCESS_TOKEN)
    _, client = _setup_client_and_exporter(mock_controller)

    # Not enabled until the authentication request went through.
    assert client.enabled is False

    client._authenticate()

    assert client.dashboard_url == MOCK_DASHBOARD_ADDRESS
    assert client.enabled is True
|
||||
|
||||
|
||||
@patch.object(Exporter, "start")
@patch("ray.dashboard.dashboard.DashboardController")
@patch("ray.dashboard.metrics_exporter.api.authentication_request")
def test_start_exporting_metrics_without_authentication(
        auth_request, mock_controller, start):
    """
    `start_exporting_metrics` should trigger authentication if users
    are not authenticated.
    """
    auth_request.return_value = AuthResponse(
        dashboard_url=MOCK_DASHBOARD_ADDRESS, access_token=MOCK_ACCESS_TOKEN)
    _, client = _setup_client_and_exporter(mock_controller)

    # start_exporting_metrics should succeed.
    succeeded, error = client.start_exporting_metrics()

    assert succeeded is True
    assert error is None
    assert client.enabled is True
|
||||
|
||||
|
||||
@patch.object(Exporter, "start")
@patch("ray.dashboard.dashboard.DashboardController")
@patch("ray.dashboard.metrics_exporter.api.authentication_request")
def test_start_exporting_metrics_with_authentication(auth_request,
                                                     mock_controller, start):
    """
    If users are already authenticated, `start_exporting_metrics`
    should not authenticate users.
    """
    auth_request.return_value = AuthResponse(
        dashboard_url=MOCK_DASHBOARD_ADDRESS, access_token=MOCK_ACCESS_TOKEN)
    exporter, client = _setup_client_and_exporter(mock_controller)
    # Already authenticated.
    client._authenticate()
    assert client.enabled is True

    result, error = client.start_exporting_metrics()
    # Auth request should be called only once because
    # it was already authenticated. (This used to be a bare comparison
    # without `assert`, which verified nothing.)
    assert auth_request.call_count == 1
    assert result is True
    assert error is None
|
||||
|
||||
|
||||
@patch.object(Exporter, "start")
@patch("ray.dashboard.dashboard.DashboardController")
@patch("ray.dashboard.metrics_exporter.api.authentication_request")
def test_start_exporting_metrics_succeed(auth_request, mock_controller, start):
    """A successful start runs the exporter once and can't be repeated."""
    auth_request.return_value = AuthResponse(
        dashboard_url=MOCK_DASHBOARD_ADDRESS, access_token=MOCK_ACCESS_TOKEN)
    exporter, client = _setup_client_and_exporter(mock_controller)

    result, error = client.start_exporting_metrics()
    assert result is True
    assert error is None
    assert client.is_exporting_started is True
    # This used to be a bare comparison without `assert`, which verified
    # nothing.
    assert start.call_count == 1

    # A second call must trip the assertion before starting the exporter
    # again.
    with pytest.raises(AssertionError):
        client.start_exporting_metrics()
    assert start.call_count == 1
|
||||
|
||||
|
||||
"""
|
||||
BaseModel Test
|
||||
"""
|
||||
|
||||
|
||||
def test_base_model():
    """Exercise BaseModel.parse_obj with matching and mismatching dicts."""

    class A(BaseModel):
        __slots__ = ["a", "b"]

    # Test the correct case.
    valid = {"a": "1", "b": "1"}
    model = A.parse_obj(valid)
    assert model.a == "1"
    assert model.b == "1"
    assert model._dict == valid
    expected = "{name}\n{dict}".format(name=A.__name__, dict=str(valid))
    assert str(model) == expected

    # Test wrong types. It is not checked in the current implementation.
    untyped = {"a": 1, "b": 2}
    model = A.parse_obj(untyped)
    assert model.a == 1
    assert model.b == 2

    # Test wrong types. parse_obj can only parse dictionary.
    with pytest.raises(AssertionError):
        A.parse_obj(None)

    # Test when fields are not sufficient.
    with pytest.raises(ValidationError):
        A.parse_obj({"a": "1"})

    # Test when fields are more than expected.
    with pytest.raises(ValidationError):
        A.parse_obj({"a": "1", "b": "1", "c": "1"})
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import os
    import sys

    # Pin a UTF-8 locale before handing control over to pytest.
    os.environ.update({"LC_ALL": "en_US.UTF-8", "LANG": "en_US.UTF-8"})
    sys.exit(pytest.main(["-v", __file__]))
|
||||
+1
-2
@@ -23,7 +23,6 @@ ray_files = [
|
||||
"ray/core/src/ray/raylet/raylet_monitor",
|
||||
"ray/core/src/ray/gcs/gcs_server",
|
||||
"ray/core/src/ray/raylet/raylet",
|
||||
"ray/dashboard/dashboard.py",
|
||||
"ray/streaming/_streaming.so",
|
||||
]
|
||||
|
||||
@@ -75,7 +74,7 @@ if "RAY_USE_NEW_GCS" in os.environ and os.environ["RAY_USE_NEW_GCS"] == "on":
|
||||
|
||||
extras = {
|
||||
"debug": [],
|
||||
"dashboard": [],
|
||||
"dashboard": ["requests"],
|
||||
"serve": ["uvicorn", "pygments", "werkzeug", "flask", "pandas", "blist"],
|
||||
"tune": ["tabulate", "tensorboardX", "pandas"]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user