mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 05:17:38 +08:00
[Dashboard] Set logdir in Tune Dashboard and TensorBoard Opt-in (#8074)
This commit is contained in:
@@ -22,6 +22,27 @@ const get = async <T>(path: string, params: { [key: string]: any }) => {
|
||||
return result as T;
|
||||
};
|
||||
|
||||
/**
 * Sends a JSON-encoded POST request and unwraps the `{ result, error }`
 * envelope of the response.
 *
 * @param path - Request path; resolved against `base` via `new URL`.
 * @param params - Payload serialized with `JSON.stringify` as the request body.
 * @returns The `result` field of the parsed response, cast to `T`.
 * @throws Error carrying the response's `error` value when one is present.
 */
const post = async <T>(path: string, params: { [key: string]: any }) => {
  const requestOptions = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(params),
  };

  const url = new URL(path, base);

  const response = await fetch(url.toString(), requestOptions);
  const json = await response.json();

  const { result, error } = json;

  // `!= null` matches both null and undefined: a response that simply omits
  // the `error` key is a success, not an `Error(undefined)`.
  if (error != null) {
    throw new Error(error);
  }

  return result as T;
};
|
||||
|
||||
export type RayConfigResponse = {
|
||||
min_workers: number;
|
||||
max_workers: number;
|
||||
@@ -234,13 +255,30 @@ export type TuneError = {
|
||||
export type TuneJobResponse = {
|
||||
trial_records: { [key: string]: TuneTrial };
|
||||
errors: { [key: string]: TuneError };
|
||||
tensorboard: {
|
||||
tensorboard_current: boolean;
|
||||
tensorboard_enabled: boolean;
|
||||
};
|
||||
};
|
||||
|
||||
export const getTuneInfo = () => get<TuneJobResponse>("/api/tune_info", {});
|
||||
|
||||
export type TuneAvailabilityResponse = {
|
||||
available: boolean;
|
||||
trials_available: boolean;
|
||||
};
|
||||
|
||||
export const getTuneAvailability = () =>
|
||||
get<TuneAvailabilityResponse>("/api/tune_availability", {});
|
||||
|
||||
export type TuneSetExperimentReponse = {
|
||||
experiment: string;
|
||||
};
|
||||
|
||||
export const setTuneExperiment = (experiment: string) =>
|
||||
post<TuneSetExperimentReponse>("/api/set_tune_experiment", {
|
||||
experiment: experiment,
|
||||
});
|
||||
|
||||
export const enableTuneTensorBoard = () =>
|
||||
post<{}>("/api/enable_tune_tensorboard", {});
|
||||
|
||||
@@ -56,7 +56,7 @@ class Dashboard extends React.Component<
|
||||
getTuneAvailability(),
|
||||
]);
|
||||
this.props.setNodeAndRayletInfo({ nodeInfo, rayletInfo });
|
||||
this.props.setTuneAvailability({ tuneAvailability });
|
||||
this.props.setTuneAvailability(tuneAvailability);
|
||||
this.props.setError(null);
|
||||
} catch (error) {
|
||||
this.props.setError(error.toString());
|
||||
@@ -87,7 +87,7 @@ class Dashboard extends React.Component<
|
||||
];
|
||||
|
||||
// if Tune information is not available, remove Tune tab from the dashboard
|
||||
if (!tuneAvailability) {
|
||||
if (tuneAvailability === null || !tuneAvailability.available) {
|
||||
tabs.splice(3);
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ type State = {
|
||||
nodeInfo: NodeInfoResponse | null;
|
||||
rayletInfo: RayletInfoResponse | null;
|
||||
tuneInfo: TuneJobResponse | null;
|
||||
tuneAvailability: boolean;
|
||||
tuneAvailability: TuneAvailabilityResponse | null;
|
||||
lastUpdatedAt: number | null;
|
||||
error: string | null;
|
||||
};
|
||||
@@ -26,7 +26,7 @@ const initialState: State = {
|
||||
nodeInfo: null,
|
||||
rayletInfo: null,
|
||||
tuneInfo: null,
|
||||
tuneAvailability: false,
|
||||
tuneAvailability: null,
|
||||
lastUpdatedAt: null,
|
||||
error: null,
|
||||
};
|
||||
@@ -58,15 +58,9 @@ const slice = createSlice({
|
||||
},
|
||||
setTuneAvailability: (
|
||||
state,
|
||||
action: PayloadAction<{
|
||||
tuneAvailability: TuneAvailabilityResponse;
|
||||
}>,
|
||||
action: PayloadAction<TuneAvailabilityResponse>,
|
||||
) => {
|
||||
const tuneAvailability =
|
||||
action.payload.tuneAvailability === null
|
||||
? false
|
||||
: action.payload.tuneAvailability["available"];
|
||||
state.tuneAvailability = tuneAvailability;
|
||||
state.tuneAvailability = action.payload;
|
||||
state.lastUpdatedAt = Date.now();
|
||||
},
|
||||
setError: (state, action: PayloadAction<string | null>) => {
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import {
|
||||
Button,
|
||||
CircularProgress,
|
||||
createStyles,
|
||||
Tab,
|
||||
Tabs,
|
||||
TextField,
|
||||
Theme,
|
||||
Typography,
|
||||
WithStyles,
|
||||
@@ -10,7 +13,7 @@ import {
|
||||
import WarningRoundedIcon from "@material-ui/icons/WarningRounded";
|
||||
import React from "react";
|
||||
import { connect } from "react-redux";
|
||||
import { getTuneInfo } from "../../../api";
|
||||
import { getTuneInfo, setTuneExperiment } from "../../../api";
|
||||
import { StoreState } from "../../../store";
|
||||
import { dashboardActions } from "../state";
|
||||
import TuneErrors from "./TuneErrors";
|
||||
@@ -27,23 +30,48 @@ const styles = (theme: Theme) =>
|
||||
borderBottomStyle: "solid",
|
||||
borderBottomWidth: 1,
|
||||
},
|
||||
heading: {
  // Key must be camelCase `fontSize` like the other style entries in this
  // object; the lowercase `fontsize` spelling is not a CSS property, so the
  // heading size was never applied.
  fontSize: "0.9em",
  marginTop: theme.spacing(2),
},
|
||||
warning: {
|
||||
fontSize: "0.8125rem",
|
||||
fontSize: "1em",
|
||||
},
|
||||
warningIcon: {
|
||||
fontSize: "1.25em",
|
||||
verticalAlign: "text-bottom",
|
||||
},
|
||||
formControl: {
|
||||
margin: theme.spacing(1),
|
||||
minWidth: 120,
|
||||
},
|
||||
submit: {
|
||||
marginLeft: theme.spacing(2),
|
||||
fontSize: "0.8125em",
|
||||
},
|
||||
prompt: {
|
||||
fontSize: "1em",
|
||||
marginTop: theme.spacing(1),
|
||||
},
|
||||
input: {
|
||||
width: "85%",
|
||||
},
|
||||
progress: {
|
||||
marginLeft: theme.spacing(2),
|
||||
},
|
||||
});
|
||||
|
||||
const mapStateToProps = (state: StoreState) => ({
|
||||
tuneInfo: state.dashboard.tuneInfo,
|
||||
tuneAvailability: state.dashboard.tuneAvailability,
|
||||
});
|
||||
|
||||
const mapDispatchToProps = dashboardActions;
|
||||
|
||||
type State = {
|
||||
tabIndex: number;
|
||||
experiment: string;
|
||||
loading: boolean;
|
||||
};
|
||||
|
||||
class Tune extends React.Component<
|
||||
@@ -56,12 +84,19 @@ class Tune extends React.Component<
|
||||
|
||||
state: State = {
|
||||
tabIndex: 0,
|
||||
experiment: "",
|
||||
loading: false,
|
||||
};
|
||||
|
||||
refreshTuneInfo = async () => {
|
||||
try {
|
||||
const tuneInfo = await getTuneInfo();
|
||||
this.props.setTuneInfo(tuneInfo);
|
||||
if (
|
||||
this.props.tuneAvailability &&
|
||||
this.props.tuneAvailability.available
|
||||
) {
|
||||
const tuneInfo = await getTuneInfo();
|
||||
this.props.setTuneInfo(tuneInfo);
|
||||
}
|
||||
} catch (error) {
|
||||
this.props.setError(error.toString());
|
||||
} finally {
|
||||
@@ -69,10 +104,6 @@ class Tune extends React.Component<
|
||||
}
|
||||
};
|
||||
|
||||
async componentDidMount() {
|
||||
await this.refreshTuneInfo();
|
||||
}
|
||||
|
||||
async componentWillUnmount() {
|
||||
window.clearTimeout(this.timeout);
|
||||
}
|
||||
@@ -83,8 +114,78 @@ class Tune extends React.Component<
|
||||
});
|
||||
};
|
||||
|
||||
handleExperimentChange = (event: React.ChangeEvent<{ value: any }>) => {
|
||||
this.setState({
|
||||
experiment: event.target.value,
|
||||
});
|
||||
};
|
||||
|
||||
/**
 * Submits the log directory currently held in `state.experiment` to the
 * backend, then re-fetches Tune info.
 *
 * Shows the loading spinner for the duration of the request. A failure is
 * reported through `props.setError`; the spinner is cleared on every exit
 * path.
 */
handleExperimentSubmit = async () => {
  this.setState({ loading: true });
  try {
    await setTuneExperiment(this.state.experiment);
    // Cancel the pending timer before refreshing manually
    // (presumably the scheduled periodic refresh; confirm against
    // refreshTuneInfo's `finally` block).
    window.clearTimeout(this.timeout);
    await this.refreshTuneInfo();
  } catch (error) {
    this.props.setError(error.toString());
  } finally {
    // Single reset point so the spinner cannot get stuck on any path.
    this.setState({ loading: false });
  }
};
|
||||
|
||||
experimentChoice = (prompt: boolean) => {
|
||||
const { classes } = this.props;
|
||||
|
||||
const { loading } = this.state;
|
||||
return (
|
||||
<div>
|
||||
<Typography className={classes.warning} color="textSecondary">
|
||||
<WarningRoundedIcon className={classes.warningIcon} /> Note: This tab
|
||||
is experimental.
|
||||
</Typography>
|
||||
|
||||
{prompt && (
|
||||
<Typography className={classes.heading} color="textPrimary">
|
||||
You can use this tab to monitor Tune jobs, their statuses,
|
||||
hyperparameters, and more. For more information, read the
|
||||
documentation{" "}
|
||||
<a href="https://docs.ray.io/en/latest/ray-dashboard.html#tune">
|
||||
here
|
||||
</a>
|
||||
.
|
||||
</Typography>
|
||||
)}
|
||||
<div>
|
||||
<Typography className={classes.prompt} color="textSecondary">
|
||||
Enter Tune Log Directory Here:
|
||||
</Typography>
|
||||
<TextField
|
||||
className={classes.input}
|
||||
id="standard-basic"
|
||||
value={this.state.experiment}
|
||||
onChange={this.handleExperimentChange}
|
||||
/>
|
||||
<Button
|
||||
className={classes.submit}
|
||||
variant="outlined"
|
||||
onClick={this.handleExperimentSubmit}
|
||||
>
|
||||
Submit
|
||||
</Button>
|
||||
{loading && (
|
||||
<CircularProgress className={classes.progress} size={25} />
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
render() {
|
||||
const { classes, tuneInfo } = this.props;
|
||||
const { classes, tuneInfo, tuneAvailability } = this.props;
|
||||
|
||||
if (tuneAvailability && !tuneAvailability.trials_available) {
|
||||
return this.experimentChoice(true);
|
||||
}
|
||||
|
||||
const { tabIndex } = this.state;
|
||||
|
||||
@@ -93,17 +194,14 @@ class Tune extends React.Component<
|
||||
{ label: "TensorBoard", component: TuneTensorBoard },
|
||||
];
|
||||
|
||||
if (tuneInfo !== null && Object.keys(tuneInfo["errors"]).length > 0) {
|
||||
if (tuneInfo !== null && Object.keys(tuneInfo.errors).length > 0) {
|
||||
tabs.push({ label: "Errors", component: TuneErrors });
|
||||
}
|
||||
|
||||
const SelectedComponent = tabs[tabIndex].component;
|
||||
return (
|
||||
<div className={classes.root}>
|
||||
<Typography className={classes.warning} color="textSecondary">
|
||||
<WarningRoundedIcon className={classes.warningIcon} /> Note: This tab
|
||||
is experimental.
|
||||
</Typography>
|
||||
{this.experimentChoice(false)}
|
||||
<Tabs
|
||||
className={classes.tabs}
|
||||
indicatorColor="primary"
|
||||
|
||||
@@ -70,7 +70,7 @@ class TuneErrors extends React.Component<
|
||||
const { classes, tuneInfo } = this.props;
|
||||
const { currentError, open } = this.state;
|
||||
|
||||
if (tuneInfo === null || Object.keys(tuneInfo["errors"]).length === 0) {
|
||||
if (tuneInfo === null || Object.keys(tuneInfo.errors).length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -86,14 +86,14 @@ class TuneErrors extends React.Component<
|
||||
</TableRow>
|
||||
</TableHead>
|
||||
<TableBody>
|
||||
{tuneInfo["errors"] !== null &&
|
||||
Object.keys(tuneInfo["errors"]).map((key, index) => (
|
||||
{tuneInfo.errors !== null &&
|
||||
Object.keys(tuneInfo.errors).map((key, index) => (
|
||||
<TableRow key={index}>
|
||||
<TableCell className={classes.cell}>
|
||||
{tuneInfo["errors"][key]["job_id"]}
|
||||
{tuneInfo.errors[key].job_id}
|
||||
</TableCell>
|
||||
<TableCell className={classes.cell}>
|
||||
{tuneInfo["errors"][key]["trial_id"]}
|
||||
{tuneInfo.errors[key].trial_id}
|
||||
</TableCell>
|
||||
<TableCell className={classes.cell}>{key}</TableCell>
|
||||
<TableCell className={classes.cell}>
|
||||
@@ -115,9 +115,7 @@ class TuneErrors extends React.Component<
|
||||
<DialogWithTitle handleClose={this.handleClose} title="Error Log">
|
||||
{open && (
|
||||
<NumberedLines
|
||||
lines={tuneInfo["errors"][currentError]["text"]
|
||||
.trim()
|
||||
.split("\n")}
|
||||
lines={tuneInfo.errors[currentError].text.trim().split("\n")}
|
||||
/>
|
||||
)}
|
||||
</DialogWithTitle>
|
||||
|
||||
@@ -183,14 +183,11 @@ class TuneTable extends React.Component<
|
||||
const { tuneInfo } = this.props;
|
||||
const { sortedColumn, ascending, metricParamColumn } = this.state;
|
||||
|
||||
if (
|
||||
tuneInfo === null ||
|
||||
Object.keys(tuneInfo["trial_records"]).length === 0
|
||||
) {
|
||||
if (tuneInfo === null || Object.keys(tuneInfo.trial_records).length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const trialDetails = Object.values(tuneInfo["trial_records"]);
|
||||
const trialDetails = Object.values(tuneInfo.trial_records);
|
||||
|
||||
if (!sortedColumn) {
|
||||
return trialDetails;
|
||||
@@ -313,8 +310,8 @@ class TuneTable extends React.Component<
|
||||
return null;
|
||||
}
|
||||
|
||||
const firstTrial = Object.keys(tuneInfo["trial_records"])[0];
|
||||
const paramsDict = tuneInfo["trial_records"][firstTrial]["params"];
|
||||
const firstTrial = Object.keys(tuneInfo.trial_records)[0];
|
||||
const paramsDict = tuneInfo.trial_records[firstTrial].params;
|
||||
const paramNames = Object.keys(paramsDict).filter((k) => k !== "args");
|
||||
|
||||
let viewableParams = paramNames;
|
||||
@@ -328,9 +325,7 @@ class TuneTable extends React.Component<
|
||||
viewableParams = paramColumns;
|
||||
}
|
||||
|
||||
const metricNames = Object.keys(
|
||||
tuneInfo["trial_records"][firstTrial]["metrics"],
|
||||
);
|
||||
const metricNames = Object.keys(tuneInfo.trial_records[firstTrial].metrics);
|
||||
|
||||
let viewableMetrics = metricNames;
|
||||
const metricOptions = metricNames.length > 3;
|
||||
@@ -429,7 +424,7 @@ class TuneTable extends React.Component<
|
||||
<DialogWithTitle handleClose={this.handleClose} title="Error Log">
|
||||
{open && (
|
||||
<NumberedLines
|
||||
lines={tuneInfo["trial_records"][errorTrial]["error"]
|
||||
lines={tuneInfo.trial_records[errorTrial].error
|
||||
.trim()
|
||||
.split("\n")}
|
||||
/>
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
import {
|
||||
Button,
|
||||
CircularProgress,
|
||||
createStyles,
|
||||
Theme,
|
||||
Typography,
|
||||
WithStyles,
|
||||
withStyles,
|
||||
WithStyles,
|
||||
} from "@material-ui/core";
|
||||
import React from "react";
|
||||
import { connect } from "react-redux";
|
||||
import { enableTuneTensorBoard } from "../../../api";
|
||||
import { StoreState } from "../../../store";
|
||||
import { dashboardActions } from "../state";
|
||||
|
||||
@@ -27,28 +30,64 @@ const styles = (theme: Theme) =>
|
||||
warning: {
|
||||
fontSize: "0.8125rem",
|
||||
},
|
||||
progress: {
|
||||
marginLeft: "10px",
|
||||
marginTop: "2px",
|
||||
},
|
||||
});
|
||||
|
||||
const mapStateToProps = (state: StoreState) => ({
|
||||
error: state.dashboard.error,
|
||||
tuneInfo: state.dashboard.tuneInfo,
|
||||
});
|
||||
|
||||
type State = {
|
||||
tensorBoardEnabled: boolean;
|
||||
loading: boolean;
|
||||
};
|
||||
|
||||
const mapDispatchToProps = dashboardActions;
|
||||
|
||||
class TuneTensorBoard extends React.Component<
|
||||
WithStyles<typeof styles> &
|
||||
ReturnType<typeof mapStateToProps> &
|
||||
typeof mapDispatchToProps
|
||||
typeof mapDispatchToProps,
|
||||
State
|
||||
> {
|
||||
render() {
|
||||
const { classes, error } = this.props;
|
||||
state: State = {
|
||||
tensorBoardEnabled: false,
|
||||
loading: false,
|
||||
};
|
||||
|
||||
enableTensorBoard() {
|
||||
enableTuneTensorBoard();
|
||||
this.setState({
|
||||
tensorBoardEnabled: true,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Click handler for the "Enable TensorBoard" button.
 *
 * Shows the loading spinner while the enable request is in flight. A failed
 * request is reported through `props.setError` instead of being left as an
 * unhandled rejection, and the spinner is cleared whether the request
 * succeeds or fails.
 */
handleSubmit = async () => {
  this.setState({ loading: true });
  try {
    await enableTuneTensorBoard();
  } catch (error) {
    this.props.setError(error.toString());
  } finally {
    this.setState({ loading: false });
  }
};
|
||||
|
||||
tensorBoard = () => {
|
||||
const { classes, error, tuneInfo } = this.props;
|
||||
|
||||
return (
|
||||
<div className={classes.root}>
|
||||
<div>
|
||||
{error === "TypeError: Failed to fetch" && (
|
||||
<Typography className={classes.warning} color="textSecondary">
|
||||
Warning: Tensorboard is currently not available. View Tensorboard by
|
||||
running "tensorboard --logdir" if not displaying below.
|
||||
Warning: Tensorboard server closed. View Tensorboard by running
|
||||
"tensorboard --logdir" if not displaying below.
|
||||
</Typography>
|
||||
)}
|
||||
{tuneInfo && !tuneInfo.tensorboard.tensorboard_current && (
|
||||
<Typography className={classes.warning} color="textSecondary">
|
||||
The below Tensorboard reflects a previously entered log directory.
|
||||
Restart the Ray Dashboard to change the Tensorboard logdir.
|
||||
</Typography>
|
||||
)}
|
||||
<iframe
|
||||
@@ -58,6 +97,37 @@ class TuneTensorBoard extends React.Component<
|
||||
></iframe>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
/**
 * Renders either the "Enable TensorBoard" button (with a spinner while the
 * enable request is loading) or the TensorBoard view, depending on
 * `tuneInfo.tensorboard.tensorboard_enabled`.
 */
render() {
  const { classes, tuneInfo } = this.props;
  const { loading } = this.state;

  // Nothing to show until Tune info has been fetched. Return `null`
  // explicitly: a bare `return;` yields `undefined`, which React 16/17
  // rejects as a render result.
  if (tuneInfo === null) {
    return null;
  }

  const enabled = tuneInfo.tensorboard.tensorboard_enabled;
  return (
    <div className={classes.root}>
      {!enabled && (
        <div>
          <Button
            variant="outlined"
            onClick={this.handleSubmit}
            className={classes.warning}
          >
            Enable TensorBoard
          </Button>
          {loading && (
            <CircularProgress className={classes.progress} size={25} />
          )}
        </div>
      )}

      {enabled && this.tensorBoard()}
    </div>
  );
}
|
||||
}
|
||||
|
||||
|
||||
@@ -41,8 +41,8 @@ from ray.dashboard.metrics_exporter.client import Exporter
|
||||
from ray.dashboard.metrics_exporter.client import MetricsExportClient
|
||||
|
||||
try:
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||
from ray.tune import Analysis
|
||||
from tensorboard import program
|
||||
except ImportError:
|
||||
Analysis = None
|
||||
|
||||
@@ -124,7 +124,7 @@ class DashboardController(BaseDashboardController):
|
||||
self.raylet_stats = RayletStats(
|
||||
redis_address, redis_password=redis_password)
|
||||
if Analysis is not None:
|
||||
self.tune_stats = TuneCollector(DEFAULT_RESULTS_DIR, 2.0)
|
||||
self.tune_stats = TuneCollector(2.0)
|
||||
|
||||
def _construct_raylet_info(self):
|
||||
D = self.raylet_stats.get_raylet_stats()
|
||||
@@ -234,9 +234,18 @@ class DashboardController(BaseDashboardController):
|
||||
if Analysis is not None:
|
||||
D = self.tune_stats.get_availability()
|
||||
else:
|
||||
D = {"available": False}
|
||||
D = {"available": False, "trials_available": False}
|
||||
return D
|
||||
|
||||
def set_tune_experiment(self, experiment):
|
||||
if Analysis is not None:
|
||||
return self.tune_stats.set_experiment(experiment)
|
||||
return "Tune Not Enabled", None
|
||||
|
||||
def enable_tune_tensorboard(self):
|
||||
if Analysis is not None:
|
||||
self.tune_stats.enable_tensorboard()
|
||||
|
||||
def launch_profiling(self, node_id, pid, duration):
|
||||
profiling_id = self.raylet_stats.launch_profiling(
|
||||
node_id=node_id, pid=pid, duration=duration)
|
||||
@@ -311,6 +320,18 @@ class DashboardRouteHandler(BaseDashboardRouteHandler):
|
||||
result = self.dashboard_controller.tune_availability()
|
||||
return await json_response(self.is_dev, result=result)
|
||||
|
||||
async def set_tune_experiment(self, req) -> aiohttp.web.Response:
|
||||
data = await req.json()
|
||||
error, result = self.dashboard_controller.set_tune_experiment(
|
||||
data["experiment"])
|
||||
if error:
|
||||
return await json_response(self.is_dev, error=error)
|
||||
return await json_response(self.is_dev, result=result)
|
||||
|
||||
async def enable_tune_tensorboard(self, req) -> aiohttp.web.Response:
|
||||
self.dashboard_controller.enable_tune_tensorboard()
|
||||
return await json_response(self.is_dev, result={})
|
||||
|
||||
async def launch_profiling(self, req) -> aiohttp.web.Response:
|
||||
node_id = req.query.get("node_id")
|
||||
pid = int(req.query.get("pid"))
|
||||
@@ -528,6 +549,10 @@ class Dashboard:
|
||||
logs="/api/logs",
|
||||
errors="/api/errors")
|
||||
self.app.router.add_get("/{_}", route_handler.get_forbidden)
|
||||
self.app.router.add_post("/api/set_tune_experiment",
|
||||
route_handler.set_tune_experiment)
|
||||
self.app.router.add_post("/api/enable_tune_tensorboard",
|
||||
route_handler.enable_tune_tensorboard)
|
||||
|
||||
def _setup_metrics_export(self):
|
||||
exporter = Exporter(self.dashboard_id, self.metrics_export_address,
|
||||
@@ -954,28 +979,52 @@ class TuneCollector(threading.Thread):
|
||||
data from logs
|
||||
"""
|
||||
|
||||
def __init__(self, logdir, reload_interval):
|
||||
self._logdir = logdir
|
||||
def __init__(self, reload_interval):
|
||||
self._logdir = None
|
||||
self._trial_records = {}
|
||||
self._data_lock = threading.Lock()
|
||||
self._reload_interval = reload_interval
|
||||
self._available = False
|
||||
self._tensor_board_started = False
|
||||
self._trials_available = False
|
||||
self._tensor_board_dir = ""
|
||||
self._enable_tensor_board = False
|
||||
self._errors = {}
|
||||
|
||||
os.makedirs(self._logdir, exist_ok=True)
|
||||
super().__init__()
|
||||
|
||||
def get_stats(self):
|
||||
with self._data_lock:
|
||||
tensor_board_info = {
|
||||
"tensorboard_current": self._logdir == self._tensor_board_dir,
|
||||
"tensorboard_enabled": self._tensor_board_dir != ""
|
||||
}
|
||||
return {
|
||||
"trial_records": copy.deepcopy(self._trial_records),
|
||||
"errors": copy.deepcopy(self._errors)
|
||||
"errors": copy.deepcopy(self._errors),
|
||||
"tensorboard": tensor_board_info
|
||||
}
|
||||
|
||||
def set_experiment(self, experiment):
    """Point the collector at a new Tune log directory.

    Args:
        experiment: Path to the log directory; a leading ``~`` is expanded.

    Returns:
        A ``(error, result)`` tuple. On success ``error`` is None and
        ``result`` is ``{"experiment": <expanded path>}``. If the value is
        not a path-like string or does not name an existing directory,
        ``error`` is ``"Not a Valid Directory"`` and ``result`` is None;
        the current log directory is left unchanged.
    """
    with self._data_lock:
        try:
            # Expand once and reuse the result for both check and store.
            logdir = os.path.expanduser(experiment)
        except TypeError:
            # Non-path input (e.g. None or a number from the JSON body)
            # is reported like any other invalid directory.
            return "Not a Valid Directory", None

        if not os.path.isdir(logdir):
            return "Not a Valid Directory", None

        self._logdir = logdir
        return None, {"experiment": self._logdir}
|
||||
|
||||
def enable_tensorboard(self):
    """Launch a TensorBoard server on the current log directory.

    Does nothing when no log directory has been set yet, or when
    ``_tensor_board_dir`` already records a previous launch. On launch,
    ``_tensor_board_dir`` is set to the directory TensorBoard was started
    with.
    """
    with self._data_lock:
        # Without a log directory, ``str(self._logdir)`` would pass the
        # literal "None" to TensorBoard and ``_tensor_board_dir`` would
        # become None, which compares unequal to "" and so would read as
        # "enabled" even though nothing useful was started.
        if not self._logdir:
            return

        if not self._tensor_board_dir:
            tb = program.TensorBoard()
            tb.configure(argv=[None, "--logdir", str(self._logdir)])
            tb.launch()
            self._tensor_board_dir = self._logdir
|
||||
|
||||
def get_availability(self):
|
||||
with self._data_lock:
|
||||
return {"available": self._available}
|
||||
return {
|
||||
"available": True,
|
||||
"trials_available": self._trials_available
|
||||
}
|
||||
|
||||
def run(self):
|
||||
while True:
|
||||
@@ -983,21 +1032,19 @@ class TuneCollector(threading.Thread):
|
||||
self.collect()
|
||||
time.sleep(self._reload_interval)
|
||||
|
||||
def collect_errors(self, job_name, df):
|
||||
sub_dirs = os.listdir(os.path.join(self._logdir, job_name))
|
||||
def collect_errors(self, df):
|
||||
sub_dirs = os.listdir(self._logdir)
|
||||
trial_names = filter(
|
||||
lambda d: os.path.isdir(os.path.join(self._logdir, job_name, d)),
|
||||
sub_dirs)
|
||||
lambda d: os.path.isdir(os.path.join(self._logdir, d)), sub_dirs)
|
||||
for trial in trial_names:
|
||||
error_path = os.path.join(self._logdir, job_name, trial,
|
||||
"error.txt")
|
||||
error_path = os.path.join(self._logdir, trial, "error.txt")
|
||||
if os.path.isfile(error_path):
|
||||
self._available = True
|
||||
self._trials_available = True
|
||||
with open(error_path) as f:
|
||||
text = f.read()
|
||||
self._errors[str(trial)] = {
|
||||
"text": text,
|
||||
"job_id": job_name,
|
||||
"job_id": os.path.basename(self._logdir),
|
||||
"trial_id": "No Trial ID"
|
||||
}
|
||||
other_data = df[df["logdir"].str.contains(trial)]
|
||||
@@ -1015,53 +1062,43 @@ class TuneCollector(threading.Thread):
|
||||
Tune logs so that users can see this information in the front-end
|
||||
client
|
||||
"""
|
||||
|
||||
sub_dirs = os.listdir(self._logdir)
|
||||
job_names = filter(
|
||||
lambda d: os.path.isdir(os.path.join(self._logdir, d)), sub_dirs)
|
||||
|
||||
self._trial_records = {}
|
||||
self._errors = {}
|
||||
if not self._logdir:
|
||||
return
|
||||
|
||||
# search through all the sub_directories in log directory
|
||||
for job_name in job_names:
|
||||
analysis = Analysis(str(os.path.join(self._logdir, job_name)))
|
||||
df = analysis.dataframe()
|
||||
analysis = Analysis(str(self._logdir))
|
||||
df = analysis.dataframe()
|
||||
|
||||
if len(df) == 0 or "trial_id" not in df.columns:
|
||||
continue
|
||||
if len(df) == 0 or "trial_id" not in df.columns:
|
||||
return
|
||||
|
||||
# # start TensorBoard server if not started yet
|
||||
# if not self._tensor_board_started:
|
||||
# tb = program.TensorBoard()
|
||||
# tb.configure(argv=[None, "--logdir", self._logdir])
|
||||
# tb.launch()
|
||||
# self._tensor_board_started = True
|
||||
self._trials_available = True
|
||||
|
||||
self._available = True
|
||||
# make sure that data will convert to JSON without error
|
||||
df["trial_id_key"] = df["trial_id"].astype(str)
|
||||
df = df.fillna(0)
|
||||
|
||||
# make sure that data will convert to JSON without error
|
||||
df["trial_id_key"] = df["trial_id"].astype(str)
|
||||
df = df.fillna(0)
|
||||
trial_ids = df["trial_id"]
|
||||
for i, value in df["trial_id"].iteritems():
|
||||
if type(value) != str and type(value) != int:
|
||||
trial_ids[i] = int(value)
|
||||
|
||||
trial_ids = df["trial_id"]
|
||||
for i, value in df["trial_id"].iteritems():
|
||||
if type(value) != str and type(value) != int:
|
||||
trial_ids[i] = int(value)
|
||||
df["trial_id"] = trial_ids
|
||||
|
||||
df["trial_id"] = trial_ids
|
||||
# convert df to python dict
|
||||
df = df.set_index("trial_id_key")
|
||||
trial_data = df.to_dict(orient="index")
|
||||
|
||||
# convert df to python dict
|
||||
df = df.set_index("trial_id_key")
|
||||
trial_data = df.to_dict(orient="index")
|
||||
# clean data and update class attribute
|
||||
if len(trial_data) > 0:
|
||||
trial_data = self.clean_trials(trial_data)
|
||||
self._trial_records.update(trial_data)
|
||||
|
||||
# clean data and update class attribute
|
||||
if len(trial_data) > 0:
|
||||
trial_data = self.clean_trials(trial_data, job_name)
|
||||
self._trial_records.update(trial_data)
|
||||
self.collect_errors(df)
|
||||
|
||||
self.collect_errors(job_name, df)
|
||||
|
||||
def clean_trials(self, trial_details, job_name):
|
||||
def clean_trials(self, trial_details):
|
||||
first_trial = trial_details[list(trial_details.keys())[0]]
|
||||
config_keys = []
|
||||
float_keys = []
|
||||
@@ -1116,7 +1153,7 @@ class TuneCollector(threading.Thread):
|
||||
details["status"] = "RUNNING"
|
||||
details.pop("done")
|
||||
|
||||
details["job_id"] = job_name
|
||||
details["job_id"] = os.path.basename(self._logdir)
|
||||
details["error"] = "No Error"
|
||||
|
||||
return trial_details
|
||||
|
||||
Reference in New Issue
Block a user