mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 11:21:15 +08:00
[tune] Clean up result logging: move out of /tmp, add timestamp (#1297)
This commit is contained in:
@@ -18,7 +18,7 @@ import uuid
|
||||
import tensorflow as tf
|
||||
from ray.tune.logger import UnifiedLogger
|
||||
from ray.tune.registry import ENV_CREATOR
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
|
||||
from ray.tune.trainable import Trainable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -72,7 +72,6 @@ class Agent(Trainable):
|
||||
|
||||
_allow_unknown_configs = False
|
||||
_allow_unknown_subkeys = []
|
||||
_default_logdir = "/tmp/ray"
|
||||
|
||||
def __init__(
|
||||
self, config={}, env=None, registry=None, logger_creator=None):
|
||||
@@ -111,10 +110,10 @@ class Agent(Trainable):
|
||||
logdir_suffix = "{}_{}_{}".format(
|
||||
env, self._agent_name,
|
||||
datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
|
||||
if not os.path.exists(self._default_logdir):
|
||||
os.makedirs(self._default_logdir)
|
||||
if not os.path.exists(DEFAULT_RESULTS_DIR):
|
||||
os.makedirs(DEFAULT_RESULTS_DIR)
|
||||
self.logdir = tempfile.mkdtemp(
|
||||
prefix=logdir_suffix, dir=self._default_logdir)
|
||||
prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
|
||||
self._result_logger = UnifiedLogger(self.config, self.logdir, None)
|
||||
|
||||
self._iteration = 0
|
||||
@@ -155,8 +154,11 @@ class Agent(Trainable):
|
||||
self._time_total += time_this_iter
|
||||
self._timesteps_total += result.timesteps_this_iter
|
||||
|
||||
now = datetime.today()
|
||||
result = result._replace(
|
||||
experiment_id=self._experiment_id,
|
||||
date=now.strftime("%Y-%m-%d_%H-%M-%S"),
|
||||
timestamp=int(time.mktime(now.timetuple())),
|
||||
training_iteration=self._iteration,
|
||||
timesteps_total=self._timesteps_total,
|
||||
time_this_iter_s=time_this_iter,
|
||||
|
||||
@@ -57,7 +57,7 @@ if __name__ == "__main__":
|
||||
else:
|
||||
# Note: keep this in sync with tune/config_parser.py
|
||||
experiments = {
|
||||
args.experiment_name: { # i.e. log to /tmp/ray/default
|
||||
args.experiment_name: { # i.e. log to ~/ray_results/default
|
||||
"run": args.run,
|
||||
"checkpoint_freq": args.checkpoint_freq,
|
||||
"local_dir": args.local_dir,
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import pandas as pd\n",
|
||||
"from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
|
||||
"import plotly\n",
|
||||
@@ -46,7 +47,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RESULTS_DIR = \"/tmp/ray/\"\n",
|
||||
"RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
|
||||
"df = load_results_to_df(RESULTS_DIR)\n",
|
||||
"[key for key in df]"
|
||||
]
|
||||
|
||||
@@ -7,6 +7,7 @@ import argparse
|
||||
import json
|
||||
|
||||
from ray.tune import TuneError
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||
from ray.tune.trial import Resources
|
||||
|
||||
|
||||
@@ -63,8 +64,9 @@ def make_parser(**kwargs):
|
||||
"--repeat", default=1, type=int,
|
||||
help="Number of times to repeat each trial.")
|
||||
parser.add_argument(
|
||||
"--local-dir", default="/tmp/ray", type=str,
|
||||
help="Local dir to save training results to. Defaults to '/tmp/ray'.")
|
||||
"--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
|
||||
help="Local dir to save training results to. Defaults to '{}'.".format(
|
||||
DEFAULT_RESULTS_DIR))
|
||||
parser.add_argument(
|
||||
"--upload-dir", default="", type=str,
|
||||
help="Optional URI to upload training results to.")
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import print_function
|
||||
|
||||
from collections import namedtuple
|
||||
import json
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
@@ -20,6 +21,9 @@ Most of the fields are optional, the only required one is timesteps_total.
|
||||
In RLlib, the supplied algorithms fill in TrainingResult for you.
|
||||
"""
|
||||
|
||||
# Where ray.tune writes result files by default
|
||||
DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")
|
||||
|
||||
|
||||
TrainingResult = namedtuple("TrainingResult", [
|
||||
# (Required) Accumulated timesteps for this entire experiment.
|
||||
@@ -40,9 +44,12 @@ TrainingResult = namedtuple("TrainingResult", [
|
||||
# (Optional) The number of episodes total.
|
||||
"episodes_total",
|
||||
|
||||
# (Optional) The current training accuracy if applicable>
|
||||
# (Optional) The current training accuracy if applicable.
|
||||
"mean_accuracy",
|
||||
|
||||
# (Optional) The current validation accuracy if applicable.
|
||||
"mean_validation_accuracy",
|
||||
|
||||
# (Optional) The current training loss if applicable.
|
||||
"mean_loss",
|
||||
|
||||
@@ -69,6 +76,12 @@ TrainingResult = namedtuple("TrainingResult", [
|
||||
# (Auto-filled) The pid of the training process.
|
||||
"pid",
|
||||
|
||||
# (Auto-filled) A formatted date of when the result was processed.
|
||||
"date",
|
||||
|
||||
# (Auto-filled) A UNIX timestamp of when the result was processed.
|
||||
"timestamp",
|
||||
|
||||
# (Auto-filled) The hostname of the machine hosting the training process.
|
||||
"hostname",
|
||||
])
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime
|
||||
import tempfile
|
||||
import traceback
|
||||
import ray
|
||||
@@ -10,7 +11,7 @@ import os
|
||||
from collections import namedtuple
|
||||
from ray.tune import TuneError
|
||||
from ray.tune.logger import NoopLogger, UnifiedLogger
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
|
||||
from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
|
||||
|
||||
|
||||
@@ -62,7 +63,7 @@ class Trial(object):
|
||||
ERROR = "ERROR"
|
||||
|
||||
def __init__(
|
||||
self, trainable_name, config={}, local_dir='/tmp/ray',
|
||||
self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR,
|
||||
experiment_tag=None, resources=Resources(cpu=1, gpu=0),
|
||||
stopping_criterion={}, checkpoint_freq=0,
|
||||
restore_path=None, upload_dir=None):
|
||||
@@ -295,16 +296,22 @@ class Trial(object):
|
||||
if not os.path.exists(self.local_dir):
|
||||
os.makedirs(self.local_dir)
|
||||
self.logdir = tempfile.mkdtemp(
|
||||
prefix=str(self), dir=self.local_dir)
|
||||
prefix=str(self), dir=self.local_dir,
|
||||
suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S"))
|
||||
self.result_logger = UnifiedLogger(
|
||||
self.config, self.logdir, self.upload_dir)
|
||||
remote_logdir = self.logdir
|
||||
|
||||
def logger_creator(config):
|
||||
# Set the working dir in the remote process, for user file writes
|
||||
os.chdir(remote_logdir)
|
||||
return NoopLogger(config, remote_logdir)
|
||||
|
||||
# Logging for trials is handled centrally by TrialRunner, so
|
||||
# configure the remote runner to use a noop-logger.
|
||||
self.runner = cls.remote(
|
||||
config=self.config,
|
||||
registry=get_registry(),
|
||||
logger_creator=lambda config: NoopLogger(config, remote_logdir))
|
||||
config=self.config, registry=get_registry(),
|
||||
logger_creator=logger_creator)
|
||||
|
||||
def __str__(self):
|
||||
if "env" in self.config:
|
||||
|
||||
Reference in New Issue
Block a user