Files
ray/python/ray/dashboard/dashboard.py
T

994 lines
38 KiB
Python

try:
import aiohttp.web
except ImportError:
print("The dashboard requires aiohttp to run.")
import sys
sys.exit(1)
import argparse
import copy
import datetime
import errno
import json
import logging
import os
import platform
import threading
import time
import traceback
import yaml
import uuid
import grpc
from google.protobuf.json_format import MessageToDict
import ray
import ray.ray_constants as ray_constants
from ray.core.generated import node_manager_pb2
from ray.core.generated import node_manager_pb2_grpc
from ray.core.generated import reporter_pb2
from ray.core.generated import reporter_pb2_grpc
from ray.core.generated import core_worker_pb2
from ray.core.generated import core_worker_pb2_grpc
from ray.dashboard.interface import BaseDashboardController
from ray.dashboard.interface import BaseDashboardRouteHandler
from ray.dashboard.memory import construct_memory_table, MemoryTable, \
GroupByType, SortingType
from ray.dashboard.metrics_exporter.client import Exporter
from ray.dashboard.metrics_exporter.client import MetricsExportClient
from ray.dashboard.node_stats import NodeStats
from ray.dashboard.util import to_unix_time, measures_to_dict, format_resource
from ray.metrics_agent import PrometheusServiceDiscoveryWriter
try:
from ray.tune import Analysis
from tensorboard import program
except ImportError:
Analysis = None
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray provides a default configuration at
# entry/init points.
logger = logging.getLogger(__name__)
async def json_response(is_dev, result=None, error=None,
ts=None) -> aiohttp.web.Response:
if ts is None:
ts = datetime.datetime.utcnow()
headers = None
if is_dev:
headers = {"Access-Control-Allow-Origin": "*"}
return aiohttp.web.json_response(
{
"result": result,
"timestamp": to_unix_time(ts),
"error": error,
},
headers=headers)
class DashboardController(BaseDashboardController):
def __init__(self, redis_address, redis_password):
self.node_stats = NodeStats(redis_address, redis_password)
self.raylet_stats = RayletStats(
redis_address, redis_password=redis_password)
if Analysis is not None:
self.tune_stats = TuneCollector(2.0)
self.memory_table = MemoryTable([])
def _construct_raylet_info(self):
D = self.raylet_stats.get_raylet_stats()
workers_info_by_node = {
data["nodeId"]: data.get("workersStats")
for data in D.values()
}
infeasible_tasks = sum(
(data.get("infeasibleTasks", []) for data in D.values()), [])
# ready_tasks are used to render tasks that are not schedulable
# due to resource limitations.
# (e.g., Actor requires 2 GPUs but there is only 1 gpu available).
ready_tasks = sum((data.get("readyTasks", []) for data in D.values()),
[])
actors = self.node_stats.get_actors(workers_info_by_node,
infeasible_tasks, ready_tasks)
for address, data in D.items():
# process view data
measures_dicts = {}
for view_data in data["viewData"]:
view_name = view_data["viewName"]
if view_name in ("local_available_resource",
"local_total_resource",
"object_manager_stats"):
measures_dicts[view_name] = measures_to_dict(
view_data["measures"])
# process resources info
extra_info_strings = []
prefix = "ResourceName:"
for resource_name, total_resource in measures_dicts[
"local_total_resource"].items():
available_resource = measures_dicts[
"local_available_resource"].get(resource_name, .0)
resource_name = resource_name[len(prefix):]
extra_info_strings.append("{}: {} / {}".format(
resource_name,
format_resource(resource_name,
total_resource - available_resource),
format_resource(resource_name, total_resource)))
data["extraInfo"] = ", ".join(extra_info_strings) + "\n"
if os.environ.get("RAY_DASHBOARD_DEBUG"):
# process object store info
extra_info_strings = []
prefix = "ValueType:"
for stats_name in [
"used_object_store_memory", "num_local_objects"
]:
stats_value = measures_dicts["object_manager_stats"].get(
prefix + stats_name, .0)
extra_info_strings.append("{}: {}".format(
stats_name, stats_value))
data["extraInfo"] += ", ".join(extra_info_strings)
# process actor info
actors_str = json.dumps(actors, indent=2, sort_keys=True)
lines = actors_str.split("\n")
max_line_length = max(map(len, lines))
to_print = []
for line in lines:
to_print.append(line + (max_line_length - len(line)) * " ")
data["extraInfo"] += "\n" + "\n".join(to_print)
return {"nodes": D, "actors": actors}
def get_ray_config(self):
try:
config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
with open(config_path) as f:
cfg = yaml.safe_load(f)
except Exception:
error = "No config"
return error, None
D = {
"min_workers": cfg["min_workers"],
"max_workers": cfg["max_workers"],
"initial_workers": cfg["initial_workers"],
"autoscaling_mode": cfg["autoscaling_mode"],
"idle_timeout_minutes": cfg["idle_timeout_minutes"],
}
try:
D["head_type"] = cfg["head_node"]["InstanceType"]
except KeyError:
D["head_type"] = "unknown"
try:
D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
except KeyError:
D["worker_type"] = "unknown"
return None, D
def get_node_info(self):
return self.node_stats.get_node_stats()
def get_raylet_info(self):
return self._construct_raylet_info()
def get_memory_table_info(self,
group_by=GroupByType.NODE_ADDRESS,
sort_by=SortingType.OBJECT_SIZE) -> MemoryTable:
# Collecting memory info adds big overhead to the cluster.
# This must be collected only when it is necessary.
self.raylet_stats.include_memory_info = True
D = self.raylet_stats.get_raylet_stats()
workers_info_by_node = {
data["nodeId"]: data.get("workersStats")
for data in D.values()
}
self.memory_table = construct_memory_table(
workers_info_by_node, group_by=group_by, sort_by=sort_by)
return self.memory_table
def stop_collecting_memory_table_info(self):
self.raylet_stats.include_memory_info = False
def tune_info(self):
if Analysis is not None:
D = self.tune_stats.get_stats()
else:
D = {}
return D
def tune_availability(self):
if Analysis is not None:
D = self.tune_stats.get_availability()
else:
D = {"available": False, "trials_available": False}
return D
def set_tune_experiment(self, experiment):
if Analysis is not None:
return self.tune_stats.set_experiment(experiment)
return "Tune Not Enabled", None
def enable_tune_tensorboard(self):
if Analysis is not None:
self.tune_stats.enable_tensorboard()
def launch_profiling(self, node_id, pid, duration):
profiling_id = self.raylet_stats.launch_profiling(
node_id=node_id, pid=pid, duration=duration)
return profiling_id
def check_profiling_status(self, profiling_id):
return self.raylet_stats.check_profiling_status(profiling_id)
def get_profiling_info(self, profiling_id):
return self.raylet_stats.get_profiling_info(profiling_id)
def kill_actor(self, actor_id, ip_address, port):
return self.raylet_stats.kill_actor(actor_id, ip_address, port)
def get_logs(self, hostname, pid):
return self.node_stats.get_logs(hostname, pid)
def get_errors(self, hostname, pid):
return self.node_stats.get_errors(hostname, pid)
def start_collecting_metrics(self):
self.node_stats.start()
self.raylet_stats.start()
if Analysis is not None:
self.tune_stats.start()
class DashboardRouteHandler(BaseDashboardRouteHandler):
def __init__(self, dashboard_controller: DashboardController,
is_dev=False):
self.dashboard_controller = dashboard_controller
self.is_dev = is_dev
def forbidden(self) -> aiohttp.web.Response:
return aiohttp.web.Response(status=403, text="403 Forbidden")
async def get_forbidden(self, _) -> aiohttp.web.Response:
return self.forbidden()
async def get_index(self, req) -> aiohttp.web.Response:
return aiohttp.web.FileResponse(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"client/build/index.html"))
async def get_favicon(self, req) -> aiohttp.web.Response:
return aiohttp.web.FileResponse(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"client/build/favicon.ico"))
async def ray_config(self, req) -> aiohttp.web.Response:
error, result = self.dashboard_controller.get_ray_config()
if error:
return await json_response(self.is_dev, error=error)
return await json_response(self.is_dev, result=result)
async def node_info(self, req) -> aiohttp.web.Response:
now = datetime.datetime.utcnow()
D = self.dashboard_controller.get_node_info()
return await json_response(self.is_dev, result=D, ts=now)
async def raylet_info(self, req) -> aiohttp.web.Response:
result = self.dashboard_controller.get_raylet_info()
return await json_response(self.is_dev, result=result)
async def memory_table_info(self, req) -> aiohttp.web.Response:
group_by = req.query.get("group_by")
sort_by = req.query.get("sort_by")
kwargs = {}
try:
if group_by:
kwargs["group_by"] = GroupByType(group_by)
if sort_by:
kwargs["sort_by"] = SortingType(sort_by)
except ValueError as e:
return aiohttp.web.HTTPBadRequest(reason=str(e))
memory_table = self.dashboard_controller.get_memory_table_info(
**kwargs)
return await json_response(self.is_dev, result=memory_table.__dict__())
async def stop_collecting_memory_table_info(self,
req) -> aiohttp.web.Response:
self.dashboard_controller.stop_collecting_memory_table_info()
return await json_response(self.is_dev, result={})
async def tune_info(self, req) -> aiohttp.web.Response:
result = self.dashboard_controller.tune_info()
return await json_response(self.is_dev, result=result)
async def tune_availability(self, req) -> aiohttp.web.Response:
result = self.dashboard_controller.tune_availability()
return await json_response(self.is_dev, result=result)
async def set_tune_experiment(self, req) -> aiohttp.web.Response:
data = await req.json()
error, result = self.dashboard_controller.set_tune_experiment(
data["experiment"])
if error:
return await json_response(self.is_dev, error=error)
return await json_response(self.is_dev, result=result)
async def enable_tune_tensorboard(self, req) -> aiohttp.web.Response:
self.dashboard_controller.enable_tune_tensorboard()
return await json_response(self.is_dev, result={})
async def launch_profiling(self, req) -> aiohttp.web.Response:
node_id = req.query.get("node_id")
pid = int(req.query.get("pid"))
duration = int(req.query.get("duration"))
profiling_id = self.dashboard_controller.launch_profiling(
node_id, pid, duration)
return await json_response(self.is_dev, result=str(profiling_id))
async def check_profiling_status(self, req) -> aiohttp.web.Response:
profiling_id = req.query.get("profiling_id")
status = self.dashboard_controller.check_profiling_status(profiling_id)
return await json_response(self.is_dev, result=status)
async def get_profiling_info(self, req) -> aiohttp.web.Response:
profiling_id = req.query.get("profiling_id")
profiling_info = self.dashboard_controller.get_profiling_info(
profiling_id)
return aiohttp.web.json_response(profiling_info)
async def kill_actor(self, req) -> aiohttp.web.Response:
actor_id = req.query.get("actor_id")
ip_address = req.query.get("ip_address")
port = req.query.get("port")
return await json_response(
self.is_dev,
self.dashboard_controller.kill_actor(actor_id, ip_address, port))
async def logs(self, req) -> aiohttp.web.Response:
hostname = req.query.get("hostname")
pid = req.query.get("pid")
result = self.dashboard_controller.get_logs(hostname, pid)
return await json_response(self.is_dev, result=result)
async def errors(self, req) -> aiohttp.web.Response:
hostname = req.query.get("hostname")
pid = req.query.get("pid")
result = self.dashboard_controller.get_errors(hostname, pid)
return await json_response(self.is_dev, result=result)
class MetricsExportHandler:
def __init__(self,
dashboard_controller: DashboardController,
metrics_export_client: MetricsExportClient,
dashboard_id,
is_dev=False):
assert metrics_export_client is not None
self.metrics_export_client = metrics_export_client
self.dashboard_controller = dashboard_controller
self.is_dev = is_dev
async def enable_export_metrics(self, req) -> aiohttp.web.Response:
if self.metrics_export_client.enabled:
return await json_response(
self.is_dev, result={"url": None}, error="Already enabled")
succeed, error = self.metrics_export_client.start_exporting_metrics()
error_msg = "Failed to enable it. Error: {}".format(error)
if not succeed:
return await json_response(
self.is_dev, result={"url": None}, error=error_msg)
url = self.metrics_export_client.dashboard_url
return await json_response(self.is_dev, result={"url": url})
async def get_dashboard_address(self, req) -> aiohttp.web.Response:
if not self.metrics_export_client.enabled:
return await json_response(
self.is_dev,
result={"url": None},
error="Metrics exporting is not enabled.")
url = self.metrics_export_client.dashboard_url
return await json_response(self.is_dev, result={"url": url})
async def redirect_to_dashboard(self, req) -> aiohttp.web.Response:
if not self.metrics_export_client.enabled:
return await json_response(
self.is_dev,
result={"url": None},
error="You should enable metrics export to use this endpoint.")
raise aiohttp.web.HTTPFound(self.metrics_export_client.dashboard_url)
def setup_metrics_export_routes(app: aiohttp.web.Application,
handler: MetricsExportHandler):
"""Routes that require dynamically changing class attributes."""
app.router.add_get("/api/metrics/enable", handler.enable_export_metrics)
app.router.add_get("/api/metrics/url", handler.get_dashboard_address)
app.router.add_get("/metrics/redirect", handler.redirect_to_dashboard)
def setup_static_dir(app):
build_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "client/build")
if not os.path.isdir(build_dir):
raise OSError(
errno.ENOENT, "Dashboard build directory not found. If installing "
"from source, please follow the additional steps "
"required to build the dashboard"
"(cd python/ray/dashboard/client "
"&& npm ci "
"&& npm run build)", build_dir)
static_dir = os.path.join(build_dir, "static")
app.router.add_static("/static", static_dir)
return build_dir
def setup_speedscope_dir(app, build_dir):
speedscope_dir = os.path.join(build_dir, "speedscope-1.5.3")
app.router.add_static("/speedscope", speedscope_dir)
def setup_dashboard_route(app: aiohttp.web.Application,
handler: BaseDashboardRouteHandler,
index=None,
favicon=None,
ray_config=None,
node_info=None,
raylet_info=None,
tune_info=None,
tune_availability=None,
launch_profiling=None,
check_profiling_status=None,
get_profiling_info=None,
kill_actor=None,
logs=None,
errors=None,
memory_table=None,
stop_memory_table=None):
def add_get_route(route, handler_func):
if route is not None:
app.router.add_get(route, handler_func)
add_get_route(index, handler.get_index)
add_get_route(favicon, handler.get_favicon)
add_get_route(ray_config, handler.ray_config)
add_get_route(node_info, handler.node_info)
add_get_route(raylet_info, handler.raylet_info)
add_get_route(tune_info, handler.tune_info)
add_get_route(tune_availability, handler.tune_availability)
add_get_route(launch_profiling, handler.launch_profiling)
add_get_route(check_profiling_status, handler.check_profiling_status)
add_get_route(get_profiling_info, handler.get_profiling_info)
add_get_route(kill_actor, handler.kill_actor)
add_get_route(logs, handler.logs)
add_get_route(errors, handler.errors)
add_get_route(memory_table, handler.memory_table_info)
add_get_route(stop_memory_table, handler.stop_collecting_memory_table_info)
class Dashboard:
"""A dashboard process for monitoring Ray nodes.
This dashboard is made up of a REST API which collates data published by
Reporter processes on nodes into a json structure, and a webserver
which polls said API for display purposes.
Args:
host(str): Host address of dashboard aiohttp server.
port(str): Port number of dashboard aiohttp server.
redis_address(str): GCS address of a Ray cluster
temp_dir (str): The temporary directory used for log files and
information for this Ray session.
redis_passord(str): Redis password to access GCS
metrics_export_address(str): The address users host their dashboard.
"""
def __init__(self,
host,
port,
redis_address,
temp_dir,
redis_password=None,
metrics_export_address=None):
self.host = host
self.port = port
self.redis_client = ray.services.create_redis_client(
redis_address, password=redis_password)
self.temp_dir = temp_dir
self.dashboard_id = str(uuid.uuid4())
self.dashboard_controller = DashboardController(
redis_address, redis_password)
self.service_discovery = PrometheusServiceDiscoveryWriter(
redis_address, redis_password, temp_dir)
# Setting the environment variable RAY_DASHBOARD_DEV=1 disables some
# security checks in the dashboard server to ease development while
# using the React dev server. Specifically, when this option is set, we
# allow cross-origin requests to be made.
self.is_dev = os.environ.get("RAY_DASHBOARD_DEV") == "1"
self.app = aiohttp.web.Application()
route_handler = DashboardRouteHandler(
self.dashboard_controller, is_dev=self.is_dev)
# Setup Metrics exporting service if necessary.
self.metrics_export_address = metrics_export_address
if self.metrics_export_address:
self._setup_metrics_export()
# Setup Dashboard Routes
build_dir = setup_static_dir(self.app)
setup_speedscope_dir(self.app, build_dir)
setup_dashboard_route(
self.app,
route_handler,
index="/",
favicon="/favicon.ico",
ray_config="/api/ray_config",
node_info="/api/node_info",
raylet_info="/api/raylet_info",
tune_info="/api/tune_info",
tune_availability="/api/tune_availability",
launch_profiling="/api/launch_profiling",
check_profiling_status="/api/check_profiling_status",
get_profiling_info="/api/get_profiling_info",
kill_actor="/api/kill_actor",
logs="/api/logs",
errors="/api/errors",
memory_table="/api/memory_table",
stop_memory_table="/api/stop_memory_table")
self.app.router.add_get("/{_}", route_handler.get_forbidden)
self.app.router.add_post("/api/set_tune_experiment",
route_handler.set_tune_experiment)
self.app.router.add_post("/api/enable_tune_tensorboard",
route_handler.enable_tune_tensorboard)
def _setup_metrics_export(self):
exporter = Exporter(self.dashboard_id, self.metrics_export_address,
self.dashboard_controller)
self.metrics_export_client = MetricsExportClient(
self.metrics_export_address, self.dashboard_controller,
self.dashboard_id, exporter)
# Setup endpoints
metrics_export_handler = MetricsExportHandler(
self.dashboard_controller,
self.metrics_export_client,
self.dashboard_id,
is_dev=self.is_dev)
setup_metrics_export_routes(self.app, metrics_export_handler)
def _start_exporting_metrics(self):
result, error = self.metrics_export_client.start_exporting_metrics()
if not result and error:
url = ray.services.get_webui_url_from_redis(self.redis_client)
error += (" Please reenable the metrics export by going to "
"the url: {}/api/metrics/enable".format(url))
ray.utils.push_error_to_driver_through_redis(
self.redis_client, "metrics export failed", error)
def log_dashboard_url(self):
url = ray.services.get_webui_url_from_redis(self.redis_client)
if url is None:
raise ValueError("WebUI URL is not present in GCS.")
with open(os.path.join(self.temp_dir, "dashboard_url"), "w") as f:
f.write(url)
logger.info("Dashboard running on {}".format(url))
def run(self):
self.log_dashboard_url()
self.dashboard_controller.start_collecting_metrics()
self.service_discovery.start()
if self.metrics_export_address:
self._start_exporting_metrics()
aiohttp.web.run_app(self.app, host=self.host, port=self.port)
class RayletStats(threading.Thread):
def __init__(self, redis_address, redis_password=None):
self.nodes_lock = threading.Lock()
self.nodes = []
self.stubs = {}
self.reporter_stubs = {}
self.redis_client = ray.services.create_redis_client(
redis_address, password=redis_password)
self._raylet_stats_lock = threading.Lock()
self._raylet_stats = {}
self._profiling_stats = {}
self._update_nodes()
self.include_memory_info = False
super().__init__()
def _update_nodes(self):
with self.nodes_lock:
self.nodes = ray.nodes()
node_ids = [node["NodeID"] for node in self.nodes]
# First remove node connections of disconnected nodes.
for node_id in self.stubs.keys():
if node_id not in node_ids:
stub = self.stubs.pop(node_id)
stub.close()
reporter_stub = self.reporter_stubs.pop(node_id)
reporter_stub.close()
# Now add node connections of new nodes.
for node in self.nodes:
node_id = node["NodeID"]
if node_id not in self.stubs:
node_ip = node["NodeManagerAddress"]
channel = grpc.insecure_channel("{}:{}".format(
node_ip, node["NodeManagerPort"]))
stub = node_manager_pb2_grpc.NodeManagerServiceStub(
channel)
self.stubs[node_id] = stub
# Block wait until the reporter for the node starts.
while True:
reporter_port = self.redis_client.get(
"REPORTER_PORT:{}".format(node_ip))
if reporter_port:
break
reporter_channel = grpc.insecure_channel("{}:{}".format(
node_ip, int(reporter_port)))
reporter_stub = reporter_pb2_grpc.ReporterServiceStub(
reporter_channel)
self.reporter_stubs[node_id] = reporter_stub
assert len(self.stubs) == len(
self.reporter_stubs), (self.stubs.keys(),
self.reporter_stubs.keys())
def get_raylet_stats(self):
with self._raylet_stats_lock:
return copy.deepcopy(self._raylet_stats)
def launch_profiling(self, node_id, pid, duration):
profiling_id = str(uuid.uuid4())
def _callback(reply_future):
reply = reply_future.result()
with self._raylet_stats_lock:
self._profiling_stats[profiling_id] = reply
reporter_stub = self.reporter_stubs[node_id]
reply_future = reporter_stub.GetProfilingStats.future(
reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
reply_future.add_done_callback(_callback)
return profiling_id
def check_profiling_status(self, profiling_id):
with self._raylet_stats_lock:
is_present = profiling_id in self._profiling_stats
if not is_present:
return {"status": "pending"}
reply = self._profiling_stats[profiling_id]
if reply.std_err:
return {"status": "error", "error": reply.std_err}
else:
return {"status": "finished"}
def get_profiling_info(self, profiling_id):
with self._raylet_stats_lock:
profiling_stats = self._profiling_stats.get(profiling_id)
assert profiling_stats, "profiling not finished"
return json.loads(profiling_stats.profiling_stats)
def kill_actor(self, actor_id, ip_address, port):
channel = grpc.insecure_channel("{}:{}".format(ip_address, int(port)))
stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel)
def _callback(reply_future):
_ = reply_future.result()
reply_future = stub.KillActor.future(
core_worker_pb2.KillActorRequest(
intended_actor_id=ray.utils.hex_to_binary(actor_id)))
reply_future.add_done_callback(_callback)
return {}
def run(self):
counter = 0
while True:
time.sleep(1.0)
replies = {}
try:
for node in self.nodes:
node_id = node["NodeID"]
stub = self.stubs[node_id]
reply = stub.GetNodeStats(
node_manager_pb2.GetNodeStatsRequest(
include_memory_info=self.include_memory_info),
timeout=2)
reply_dict = MessageToDict(reply)
reply_dict["nodeId"] = node_id
replies[node["NodeManagerAddress"]] = reply_dict
with self._raylet_stats_lock:
for address, reply_dict in replies.items():
self._raylet_stats[address] = reply_dict
except Exception:
logger.exception(traceback.format_exc())
finally:
counter += 1
# From time to time, check if new nodes have joined the cluster
# and update self.nodes
if counter % 10:
self._update_nodes()
class TuneCollector(threading.Thread):
"""Initialize collector worker thread.
Args
logdir (str): Directory path to save the status information of
jobs and trials.
reload_interval (float): Interval(in s) of space between loading
data from logs
"""
def __init__(self, reload_interval):
self._logdir = None
self._trial_records = {}
self._data_lock = threading.Lock()
self._reload_interval = reload_interval
self._trials_available = False
self._tensor_board_dir = ""
self._enable_tensor_board = False
self._errors = {}
super().__init__()
def get_stats(self):
with self._data_lock:
tensor_board_info = {
"tensorboard_current": self._logdir == self._tensor_board_dir,
"tensorboard_enabled": self._tensor_board_dir != ""
}
return {
"trial_records": copy.deepcopy(self._trial_records),
"errors": copy.deepcopy(self._errors),
"tensorboard": tensor_board_info
}
def set_experiment(self, experiment):
with self._data_lock:
if os.path.isdir(os.path.expanduser(experiment)):
self._logdir = os.path.expanduser(experiment)
return None, {"experiment": self._logdir}
else:
return "Not a Valid Directory", None
def enable_tensorboard(self):
with self._data_lock:
if not self._tensor_board_dir:
tb = program.TensorBoard()
tb.configure(argv=[None, "--logdir", str(self._logdir)])
tb.launch()
self._tensor_board_dir = self._logdir
def get_availability(self):
with self._data_lock:
return {
"available": True,
"trials_available": self._trials_available
}
def run(self):
while True:
with self._data_lock:
self.collect()
time.sleep(self._reload_interval)
def collect_errors(self, df):
sub_dirs = os.listdir(self._logdir)
trial_names = filter(
lambda d: os.path.isdir(os.path.join(self._logdir, d)), sub_dirs)
for trial in trial_names:
error_path = os.path.join(self._logdir, trial, "error.txt")
if os.path.isfile(error_path):
self._trials_available = True
with open(error_path) as f:
text = f.read()
self._errors[str(trial)] = {
"text": text,
"job_id": os.path.basename(self._logdir),
"trial_id": "No Trial ID"
}
other_data = df[df["logdir"].str.contains(trial)]
if len(other_data) > 0:
trial_id = other_data["trial_id"].values[0]
self._errors[str(trial)]["trial_id"] = str(trial_id)
if str(trial_id) in self._trial_records.keys():
self._trial_records[str(trial_id)]["error"] = text
self._trial_records[str(trial_id)][
"status"] = "ERROR"
def collect(self):
"""
Collects and cleans data on the running Tune experiment from the
Tune logs so that users can see this information in the front-end
client
"""
self._trial_records = {}
self._errors = {}
if not self._logdir:
return
# search through all the sub_directories in log directory
analysis = Analysis(str(self._logdir))
df = analysis.dataframe()
if len(df) == 0 or "trial_id" not in df.columns:
return
self._trials_available = True
# make sure that data will convert to JSON without error
df["trial_id_key"] = df["trial_id"].astype(str)
df = df.fillna(0)
trial_ids = df["trial_id"]
for i, value in df["trial_id"].iteritems():
if type(value) != str and type(value) != int:
trial_ids[i] = int(value)
df["trial_id"] = trial_ids
# convert df to python dict
df = df.set_index("trial_id_key")
trial_data = df.to_dict(orient="index")
# clean data and update class attribute
if len(trial_data) > 0:
trial_data = self.clean_trials(trial_data)
self._trial_records.update(trial_data)
self.collect_errors(df)
def clean_trials(self, trial_details):
first_trial = trial_details[list(trial_details.keys())[0]]
config_keys = []
float_keys = []
metric_keys = []
# list of static attributes for trial
default_names = [
"logdir", "time_this_iter_s", "done", "episodes_total",
"training_iteration", "timestamp", "timesteps_total",
"experiment_id", "date", "timestamp", "time_total_s", "pid",
"hostname", "node_ip", "time_since_restore",
"timesteps_since_restore", "iterations_since_restore",
"experiment_tag", "trial_id"
]
# filter attributes into floats, metrics, and config variables
for key, value in first_trial.items():
if isinstance(value, float):
float_keys.append(key)
if str(key).startswith("config/"):
config_keys.append(key)
elif key not in default_names:
metric_keys.append(key)
# clean data into a form that front-end client can handle
for trial, details in trial_details.items():
ts = os.path.getctime(details["logdir"])
formatted_time = datetime.datetime.fromtimestamp(ts).strftime(
"%Y-%m-%d %H:%M:%S")
details["start_time"] = formatted_time
details["params"] = {}
details["metrics"] = {}
# round all floats
for key in float_keys:
details[key] = round(details[key], 12)
# group together config attributes
for key in config_keys:
new_name = key[7:]
details["params"][new_name] = details[key]
details.pop(key)
# group together metric attributes
for key in metric_keys:
details["metrics"][key] = details[key]
details.pop(key)
if details["done"]:
details["status"] = "TERMINATED"
else:
details["status"] = "RUNNING"
details.pop("done")
details["job_id"] = os.path.basename(self._logdir)
details["error"] = "No Error"
return trial_details
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=("Parse Redis server for the "
"dashboard to connect to."))
parser.add_argument(
"--host",
required=True,
type=str,
help="The host to use for the HTTP server.")
parser.add_argument(
"--port",
required=True,
type=int,
help="The port to use for the HTTP server.")
parser.add_argument(
"--redis-address",
required=True,
type=str,
help="The address to use for Redis.")
parser.add_argument(
"--redis-password",
required=False,
type=str,
default=None,
help="the password to use for Redis")
parser.add_argument(
"--logging-level",
required=False,
type=str,
default=ray_constants.LOGGER_LEVEL,
choices=ray_constants.LOGGER_LEVEL_CHOICES,
help=ray_constants.LOGGER_LEVEL_HELP)
parser.add_argument(
"--logging-format",
required=False,
type=str,
default=ray_constants.LOGGER_FORMAT,
help=ray_constants.LOGGER_FORMAT_HELP)
parser.add_argument(
"--temp-dir",
required=False,
type=str,
default=None,
help="Specify the path of the temporary directory use by Ray process.")
args = parser.parse_args()
ray.utils.setup_logger(args.logging_level, args.logging_format)
# TODO(sang): Add a URL validation.
metrics_export_address = os.environ.get("METRICS_EXPORT_ADDRESS")
try:
dashboard = Dashboard(
args.host,
args.port,
args.redis_address,
args.temp_dir,
redis_password=args.redis_password,
metrics_export_address=metrics_export_address)
dashboard.run()
except Exception as e:
# Something went wrong, so push an error to all drivers.
redis_client = ray.services.create_redis_client(
args.redis_address, password=args.redis_password)
traceback_str = ray.utils.format_error_message(traceback.format_exc())
message = ("The dashboard on node {} failed with the following "
"error:\n{}".format(platform.node(), traceback_str))
ray.utils.push_error_to_driver_through_redis(
redis_client, ray_constants.DASHBOARD_DIED_ERROR, message)
if isinstance(e, OSError) and e.errno == errno.ENOENT:
logger.warning(message)
else:
raise e