From eb790bf3a3241b492171c321fc59234c66fae8d5 Mon Sep 17 00:00:00 2001 From: aannadi <31830056+aannadi@users.noreply.github.com> Date: Mon, 27 Apr 2020 20:17:52 -0700 Subject: [PATCH] [Dashboard] Set logdir in Tune Dashboard and TensorBoard Opt-in (#8074) --- python/ray/dashboard/client/src/api.ts | 38 +++++ .../client/src/pages/dashboard/Dashboard.tsx | 4 +- .../client/src/pages/dashboard/state.ts | 14 +- .../client/src/pages/dashboard/tune/Tune.tsx | 126 +++++++++++++-- .../src/pages/dashboard/tune/TuneErrors.tsx | 14 +- .../src/pages/dashboard/tune/TuneTable.tsx | 17 +- .../pages/dashboard/tune/TuneTensorBoard.tsx | 84 +++++++++- python/ray/dashboard/dashboard.py | 145 +++++++++++------- 8 files changed, 336 insertions(+), 106 deletions(-) diff --git a/python/ray/dashboard/client/src/api.ts b/python/ray/dashboard/client/src/api.ts index 32dae8a03..1419bb2b4 100644 --- a/python/ray/dashboard/client/src/api.ts +++ b/python/ray/dashboard/client/src/api.ts @@ -22,6 +22,27 @@ const get = async (path: string, params: { [key: string]: any }) => { return result as T; }; +const post = async (path: string, params: { [key: string]: any }) => { + const requestOptions = { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(params), + }; + + const url = new URL(path, base); + + const response = await fetch(url.toString(), requestOptions); + const json = await response.json(); + + const { result, error } = json; + + if (error !== null) { + throw Error(error); + } + + return result as T; +}; + export type RayConfigResponse = { min_workers: number; max_workers: number; @@ -234,13 +255,30 @@ export type TuneError = { export type TuneJobResponse = { trial_records: { [key: string]: TuneTrial }; errors: { [key: string]: TuneError }; + tensorboard: { + tensorboard_current: boolean; + tensorboard_enabled: boolean; + }; }; export const getTuneInfo = () => get("/api/tune_info", {}); export type TuneAvailabilityResponse = { available: boolean; + 
trials_available: boolean; }; export const getTuneAvailability = () => get("/api/tune_availability", {}); + +export type TuneSetExperimentReponse = { + experiment: string; +}; + +export const setTuneExperiment = (experiment: string) => + post("/api/set_tune_experiment", { + experiment: experiment, + }); + +export const enableTuneTensorBoard = () => + post<{}>("/api/enable_tune_tensorboard", {}); diff --git a/python/ray/dashboard/client/src/pages/dashboard/Dashboard.tsx b/python/ray/dashboard/client/src/pages/dashboard/Dashboard.tsx index f74ea253a..de9e4da5d 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -56,7 +56,7 @@ class Dashboard extends React.Component< getTuneAvailability(), ]); this.props.setNodeAndRayletInfo({ nodeInfo, rayletInfo }); - this.props.setTuneAvailability({ tuneAvailability }); + this.props.setTuneAvailability(tuneAvailability); this.props.setError(null); } catch (error) { this.props.setError(error.toString()); @@ -87,7 +87,7 @@ class Dashboard extends React.Component< ]; // if Tune information is not available, remove Tune tab from the dashboard - if (!tuneAvailability) { + if (tuneAvailability === null || !tuneAvailability.available) { tabs.splice(3); } diff --git a/python/ray/dashboard/client/src/pages/dashboard/state.ts b/python/ray/dashboard/client/src/pages/dashboard/state.ts index 2dce597a2..017d7de1b 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/state.ts +++ b/python/ray/dashboard/client/src/pages/dashboard/state.ts @@ -15,7 +15,7 @@ type State = { nodeInfo: NodeInfoResponse | null; rayletInfo: RayletInfoResponse | null; tuneInfo: TuneJobResponse | null; - tuneAvailability: boolean; + tuneAvailability: TuneAvailabilityResponse | null; lastUpdatedAt: number | null; error: string | null; }; @@ -26,7 +26,7 @@ const initialState: State = { nodeInfo: null, rayletInfo: null, tuneInfo: null, - tuneAvailability: false, + 
tuneAvailability: null, lastUpdatedAt: null, error: null, }; @@ -58,15 +58,9 @@ const slice = createSlice({ }, setTuneAvailability: ( state, - action: PayloadAction<{ - tuneAvailability: TuneAvailabilityResponse; - }>, + action: PayloadAction, ) => { - const tuneAvailability = - action.payload.tuneAvailability === null - ? false - : action.payload.tuneAvailability["available"]; - state.tuneAvailability = tuneAvailability; + state.tuneAvailability = action.payload; state.lastUpdatedAt = Date.now(); }, setError: (state, action: PayloadAction) => { diff --git a/python/ray/dashboard/client/src/pages/dashboard/tune/Tune.tsx b/python/ray/dashboard/client/src/pages/dashboard/tune/Tune.tsx index a2d20b079..54125722d 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/tune/Tune.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/tune/Tune.tsx @@ -1,7 +1,10 @@ import { + Button, + CircularProgress, createStyles, Tab, Tabs, + TextField, Theme, Typography, WithStyles, @@ -10,7 +13,7 @@ import { import WarningRoundedIcon from "@material-ui/icons/WarningRounded"; import React from "react"; import { connect } from "react-redux"; -import { getTuneInfo } from "../../../api"; +import { getTuneInfo, setTuneExperiment } from "../../../api"; import { StoreState } from "../../../store"; import { dashboardActions } from "../state"; import TuneErrors from "./TuneErrors"; @@ -27,23 +30,48 @@ const styles = (theme: Theme) => borderBottomStyle: "solid", borderBottomWidth: 1, }, + heading: { + fontSize: "0.9em", + marginTop: theme.spacing(2), + }, warning: { - fontSize: "0.8125rem", + fontSize: "1em", }, warningIcon: { fontSize: "1.25em", verticalAlign: "text-bottom", }, + formControl: { + margin: theme.spacing(1), + minWidth: 120, + }, + submit: { + marginLeft: theme.spacing(2), + fontSize: "0.8125em", + }, + prompt: { + fontSize: "1em", + marginTop: theme.spacing(1), + }, + input: { + width: "85%", + }, + progress: { + marginLeft: theme.spacing(2), + }, }); const 
mapStateToProps = (state: StoreState) => ({ tuneInfo: state.dashboard.tuneInfo, + tuneAvailability: state.dashboard.tuneAvailability, }); const mapDispatchToProps = dashboardActions; type State = { tabIndex: number; + experiment: string; + loading: boolean; }; class Tune extends React.Component< @@ -56,12 +84,19 @@ class Tune extends React.Component< state: State = { tabIndex: 0, + experiment: "", + loading: false, }; refreshTuneInfo = async () => { try { - const tuneInfo = await getTuneInfo(); - this.props.setTuneInfo(tuneInfo); + if ( + this.props.tuneAvailability && + this.props.tuneAvailability.available + ) { + const tuneInfo = await getTuneInfo(); + this.props.setTuneInfo(tuneInfo); + } } catch (error) { this.props.setError(error.toString()); } finally { @@ -69,10 +104,6 @@ class Tune extends React.Component< } }; - async componentDidMount() { - await this.refreshTuneInfo(); - } - async componentWillUnmount() { window.clearTimeout(this.timeout); } @@ -83,8 +114,78 @@ class Tune extends React.Component< }); }; + handleExperimentChange = (event: React.ChangeEvent<{ value: any }>) => { + this.setState({ + experiment: event.target.value, + }); + }; + + handleExperimentSubmit = async () => { + this.setState({ loading: true }); + try { + await setTuneExperiment(this.state.experiment); + window.clearTimeout(this.timeout); + await this.refreshTuneInfo(); + this.setState({ loading: false }); + } catch (error) { + this.props.setError(error.toString()); + this.setState({ loading: false }); + } + }; + + experimentChoice = (prompt: boolean) => { + const { classes } = this.props; + + const { loading } = this.state; + return ( +
+ + Note: This tab + is experimental. + + + {prompt && ( + + You can use this tab to monitor Tune jobs, their statuses, + hyperparameters, and more. For more information, read the + documentation{" "} + + here + + . + + )} +
+ + Enter Tune Log Directory Here: + + + + {loading && ( + + )} +
+
+ ); + }; + render() { - const { classes, tuneInfo } = this.props; + const { classes, tuneInfo, tuneAvailability } = this.props; + + if (tuneAvailability && !tuneAvailability.trials_available) { + return this.experimentChoice(true); + } const { tabIndex } = this.state; @@ -93,17 +194,14 @@ class Tune extends React.Component< { label: "TensorBoard", component: TuneTensorBoard }, ]; - if (tuneInfo !== null && Object.keys(tuneInfo["errors"]).length > 0) { + if (tuneInfo !== null && Object.keys(tuneInfo.errors).length > 0) { tabs.push({ label: "Errors", component: TuneErrors }); } const SelectedComponent = tabs[tabIndex].component; return (
- - Note: This tab - is experimental. - + {this.experimentChoice(false)} - {tuneInfo["errors"] !== null && - Object.keys(tuneInfo["errors"]).map((key, index) => ( + {tuneInfo.errors !== null && + Object.keys(tuneInfo.errors).map((key, index) => ( - {tuneInfo["errors"][key]["job_id"]} + {tuneInfo.errors[key].job_id} - {tuneInfo["errors"][key]["trial_id"]} + {tuneInfo.errors[key].trial_id} {key} @@ -115,9 +115,7 @@ class TuneErrors extends React.Component< {open && ( )} diff --git a/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTable.tsx b/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTable.tsx index d98dbfd86..7a0479bba 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTable.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTable.tsx @@ -183,14 +183,11 @@ class TuneTable extends React.Component< const { tuneInfo } = this.props; const { sortedColumn, ascending, metricParamColumn } = this.state; - if ( - tuneInfo === null || - Object.keys(tuneInfo["trial_records"]).length === 0 - ) { + if (tuneInfo === null || Object.keys(tuneInfo.trial_records).length === 0) { return null; } - const trialDetails = Object.values(tuneInfo["trial_records"]); + const trialDetails = Object.values(tuneInfo.trial_records); if (!sortedColumn) { return trialDetails; @@ -313,8 +310,8 @@ class TuneTable extends React.Component< return null; } - const firstTrial = Object.keys(tuneInfo["trial_records"])[0]; - const paramsDict = tuneInfo["trial_records"][firstTrial]["params"]; + const firstTrial = Object.keys(tuneInfo.trial_records)[0]; + const paramsDict = tuneInfo.trial_records[firstTrial].params; const paramNames = Object.keys(paramsDict).filter((k) => k !== "args"); let viewableParams = paramNames; @@ -328,9 +325,7 @@ class TuneTable extends React.Component< viewableParams = paramColumns; } - const metricNames = Object.keys( - tuneInfo["trial_records"][firstTrial]["metrics"], - ); + const metricNames = 
Object.keys(tuneInfo.trial_records[firstTrial].metrics); let viewableMetrics = metricNames; const metricOptions = metricNames.length > 3; @@ -429,7 +424,7 @@ class TuneTable extends React.Component< {open && ( diff --git a/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTensorBoard.tsx b/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTensorBoard.tsx index 98b8300dd..8135afdd1 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTensorBoard.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/tune/TuneTensorBoard.tsx @@ -1,12 +1,15 @@ import { + Button, + CircularProgress, createStyles, Theme, Typography, - WithStyles, withStyles, + WithStyles, } from "@material-ui/core"; import React from "react"; import { connect } from "react-redux"; +import { enableTuneTensorBoard } from "../../../api"; import { StoreState } from "../../../store"; import { dashboardActions } from "../state"; @@ -27,28 +30,64 @@ const styles = (theme: Theme) => warning: { fontSize: "0.8125rem", }, + progress: { + marginLeft: "10px", + marginTop: "2px", + }, }); const mapStateToProps = (state: StoreState) => ({ error: state.dashboard.error, + tuneInfo: state.dashboard.tuneInfo, }); +type State = { + tensorBoardEnabled: boolean; + loading: boolean; +}; + const mapDispatchToProps = dashboardActions; class TuneTensorBoard extends React.Component< WithStyles & ReturnType & - typeof mapDispatchToProps + typeof mapDispatchToProps, + State > { - render() { - const { classes, error } = this.props; + state: State = { + tensorBoardEnabled: false, + loading: false, + }; + + enableTensorBoard() { + enableTuneTensorBoard(); + this.setState({ + tensorBoardEnabled: true, + }); + } + + handleSubmit = () => { + this.setState({ loading: true }); + enableTuneTensorBoard().then(() => { + this.setState({ loading: false }); + }); + }; + + tensorBoard = () => { + const { classes, error, tuneInfo } = this.props; return ( -
+
{error === "TypeError: Failed to fetch" && ( - Warning: Tensorboard is currently not available. View Tensorboard by - running "tensorboard --logdir" if not displaying below. + Warning: Tensorboard server closed. View Tensorboard by running + "tensorboard --logdir" if not displaying below. + + )} + {tuneInfo && !tuneInfo.tensorboard.tensorboard_current && ( + + The below Tensorboard reflects a previously entered log directory. + Restart the Ray Dashboard to change the Tensorboard logdir. )}
); + }; + + render() { + const { classes, tuneInfo } = this.props; + + const { loading } = this.state; + + if (tuneInfo === null) { + return null; + } + const enabled = tuneInfo.tensorboard.tensorboard_enabled; + return ( +
+ {!enabled && ( +
+ + {loading && ( + + )} +
+ )} + + {enabled && this.tensorBoard()} +
+ ); } } diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py index 83efcd44f..1a226ecfd 100644 --- a/python/ray/dashboard/dashboard.py +++ b/python/ray/dashboard/dashboard.py @@ -41,8 +41,8 @@ from ray.dashboard.metrics_exporter.client import Exporter from ray.dashboard.metrics_exporter.client import MetricsExportClient try: - from ray.tune.result import DEFAULT_RESULTS_DIR from ray.tune import Analysis + from tensorboard import program except ImportError: Analysis = None @@ -124,7 +124,7 @@ class DashboardController(BaseDashboardController): self.raylet_stats = RayletStats( redis_address, redis_password=redis_password) if Analysis is not None: - self.tune_stats = TuneCollector(DEFAULT_RESULTS_DIR, 2.0) + self.tune_stats = TuneCollector(2.0) def _construct_raylet_info(self): D = self.raylet_stats.get_raylet_stats() @@ -234,9 +234,18 @@ class DashboardController(BaseDashboardController): if Analysis is not None: D = self.tune_stats.get_availability() else: - D = {"available": False} + D = {"available": False, "trials_available": False} return D + def set_tune_experiment(self, experiment): + if Analysis is not None: + return self.tune_stats.set_experiment(experiment) + return "Tune Not Enabled", None + + def enable_tune_tensorboard(self): + if Analysis is not None: + self.tune_stats.enable_tensorboard() + def launch_profiling(self, node_id, pid, duration): profiling_id = self.raylet_stats.launch_profiling( node_id=node_id, pid=pid, duration=duration) @@ -311,6 +320,18 @@ class DashboardRouteHandler(BaseDashboardRouteHandler): result = self.dashboard_controller.tune_availability() return await json_response(self.is_dev, result=result) + async def set_tune_experiment(self, req) -> aiohttp.web.Response: + data = await req.json() + error, result = self.dashboard_controller.set_tune_experiment( + data["experiment"]) + if error: + return await json_response(self.is_dev, error=error) + return await json_response(self.is_dev, result=result) + + 
async def enable_tune_tensorboard(self, req) -> aiohttp.web.Response: + self.dashboard_controller.enable_tune_tensorboard() + return await json_response(self.is_dev, result={}) + async def launch_profiling(self, req) -> aiohttp.web.Response: node_id = req.query.get("node_id") pid = int(req.query.get("pid")) @@ -528,6 +549,10 @@ class Dashboard: logs="/api/logs", errors="/api/errors") self.app.router.add_get("/{_}", route_handler.get_forbidden) + self.app.router.add_post("/api/set_tune_experiment", + route_handler.set_tune_experiment) + self.app.router.add_post("/api/enable_tune_tensorboard", + route_handler.enable_tune_tensorboard) def _setup_metrics_export(self): exporter = Exporter(self.dashboard_id, self.metrics_export_address, @@ -954,28 +979,52 @@ class TuneCollector(threading.Thread): data from logs """ - def __init__(self, logdir, reload_interval): - self._logdir = logdir + def __init__(self, reload_interval): + self._logdir = None self._trial_records = {} self._data_lock = threading.Lock() self._reload_interval = reload_interval - self._available = False - self._tensor_board_started = False + self._trials_available = False + self._tensor_board_dir = "" + self._enable_tensor_board = False self._errors = {} - os.makedirs(self._logdir, exist_ok=True) super().__init__() def get_stats(self): with self._data_lock: + tensor_board_info = { + "tensorboard_current": self._logdir == self._tensor_board_dir, + "tensorboard_enabled": self._tensor_board_dir != "" + } return { "trial_records": copy.deepcopy(self._trial_records), - "errors": copy.deepcopy(self._errors) + "errors": copy.deepcopy(self._errors), + "tensorboard": tensor_board_info } + def set_experiment(self, experiment): + with self._data_lock: + if os.path.isdir(os.path.expanduser(experiment)): + self._logdir = os.path.expanduser(experiment) + return None, {"experiment": self._logdir} + else: + return "Not a Valid Directory", None + + def enable_tensorboard(self): + with self._data_lock: + if not 
self._tensor_board_dir: + tb = program.TensorBoard() + tb.configure(argv=[None, "--logdir", str(self._logdir)]) + tb.launch() + self._tensor_board_dir = self._logdir + def get_availability(self): with self._data_lock: - return {"available": self._available} + return { + "available": True, + "trials_available": self._trials_available + } def run(self): while True: @@ -983,21 +1032,19 @@ class TuneCollector(threading.Thread): self.collect() time.sleep(self._reload_interval) - def collect_errors(self, job_name, df): - sub_dirs = os.listdir(os.path.join(self._logdir, job_name)) + def collect_errors(self, df): + sub_dirs = os.listdir(self._logdir) trial_names = filter( - lambda d: os.path.isdir(os.path.join(self._logdir, job_name, d)), - sub_dirs) + lambda d: os.path.isdir(os.path.join(self._logdir, d)), sub_dirs) for trial in trial_names: - error_path = os.path.join(self._logdir, job_name, trial, - "error.txt") + error_path = os.path.join(self._logdir, trial, "error.txt") if os.path.isfile(error_path): - self._available = True + self._trials_available = True with open(error_path) as f: text = f.read() self._errors[str(trial)] = { "text": text, - "job_id": job_name, + "job_id": os.path.basename(self._logdir), "trial_id": "No Trial ID" } other_data = df[df["logdir"].str.contains(trial)] @@ -1015,53 +1062,43 @@ class TuneCollector(threading.Thread): Tune logs so that users can see this information in the front-end client """ - - sub_dirs = os.listdir(self._logdir) - job_names = filter( - lambda d: os.path.isdir(os.path.join(self._logdir, d)), sub_dirs) - self._trial_records = {} + self._errors = {} + if not self._logdir: + return # search through all the sub_directories in log directory - for job_name in job_names: - analysis = Analysis(str(os.path.join(self._logdir, job_name))) - df = analysis.dataframe() + analysis = Analysis(str(self._logdir)) + df = analysis.dataframe() - if len(df) == 0 or "trial_id" not in df.columns: - continue + if len(df) == 0 or "trial_id" not 
in df.columns: + return - # # start TensorBoard server if not started yet - # if not self._tensor_board_started: - # tb = program.TensorBoard() - # tb.configure(argv=[None, "--logdir", self._logdir]) - # tb.launch() - # self._tensor_board_started = True + self._trials_available = True - self._available = True + # make sure that data will convert to JSON without error + df["trial_id_key"] = df["trial_id"].astype(str) + df = df.fillna(0) - # make sure that data will convert to JSON without error - df["trial_id_key"] = df["trial_id"].astype(str) - df = df.fillna(0) + trial_ids = df["trial_id"] + for i, value in df["trial_id"].iteritems(): + if type(value) != str and type(value) != int: + trial_ids[i] = int(value) - trial_ids = df["trial_id"] - for i, value in df["trial_id"].iteritems(): - if type(value) != str and type(value) != int: - trial_ids[i] = int(value) + df["trial_id"] = trial_ids - df["trial_id"] = trial_ids + # convert df to python dict + df = df.set_index("trial_id_key") + trial_data = df.to_dict(orient="index") - # convert df to python dict - df = df.set_index("trial_id_key") - trial_data = df.to_dict(orient="index") + # clean data and update class attribute + if len(trial_data) > 0: + trial_data = self.clean_trials(trial_data) + self._trial_records.update(trial_data) - # clean data and update class attribute - if len(trial_data) > 0: - trial_data = self.clean_trials(trial_data, job_name) - self._trial_records.update(trial_data) + self.collect_errors(df) - self.collect_errors(job_name, df) - - def clean_trials(self, trial_details, job_name): + def clean_trials(self, trial_details): first_trial = trial_details[list(trial_details.keys())[0]] config_keys = [] float_keys = [] @@ -1116,7 +1153,7 @@ class TuneCollector(threading.Thread): details["status"] = "RUNNING" details.pop("done") - details["job_id"] = job_name + details["job_id"] = os.path.basename(self._logdir) details["error"] = "No Error" return trial_details