[tune] Clean up result logging: move out of /tmp, add timestamp (#1297)

2026-06-28 11:21:15 +08:00 · 2017-12-15 14:19:08 -08:00
parent 12fdb3f53a
commit fbf1806b8a
11 changed files with 64 additions and 26 deletions
@@ -18,7 +18,7 @@ import uuid
 import tensorflow as tf
 from ray.tune.logger import UnifiedLogger
 from ray.tune.registry import ENV_CREATOR
-from ray.tune.result import TrainingResult
+from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
 from ray.tune.trainable import Trainable

 logger = logging.getLogger(__name__)
@@ -72,7 +72,6 @@ class Agent(Trainable):

    _allow_unknown_configs = False
    _allow_unknown_subkeys = []
-    _default_logdir = "/tmp/ray"

    def __init__(
            self, config={}, env=None, registry=None, logger_creator=None):
@@ -111,10 +110,10 @@ class Agent(Trainable):
            logdir_suffix = "{}_{}_{}".format(
                env, self._agent_name,
                datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
-            if not os.path.exists(self._default_logdir):
-                os.makedirs(self._default_logdir)
+            if not os.path.exists(DEFAULT_RESULTS_DIR):
+                os.makedirs(DEFAULT_RESULTS_DIR)
            self.logdir = tempfile.mkdtemp(
-                prefix=logdir_suffix, dir=self._default_logdir)
+                prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
            self._result_logger = UnifiedLogger(self.config, self.logdir, None)

        self._iteration = 0
@@ -155,8 +154,11 @@ class Agent(Trainable):
        self._time_total += time_this_iter
        self._timesteps_total += result.timesteps_this_iter

+        now = datetime.today()
        result = result._replace(
            experiment_id=self._experiment_id,
+            date=now.strftime("%Y-%m-%d_%H-%M-%S"),
+            timestamp=int(time.mktime(now.timetuple())),
            training_iteration=self._iteration,
            timesteps_total=self._timesteps_total,
            time_this_iter_s=time_this_iter,
@@ -57,7 +57,7 @@ if __name__ == "__main__":
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
-            args.experiment_name: {  # i.e. log to /tmp/ray/default
+            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
@@ -24,6 +24,7 @@
   },
   "outputs": [],
   "source": [
+    "import os\n",
    "import pandas as pd\n",
    "from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
    "import plotly\n",
@@ -46,7 +47,7 @@
   },
   "outputs": [],
   "source": [
-    "RESULTS_DIR = \"/tmp/ray/\"\n",
+    "RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
    "df = load_results_to_df(RESULTS_DIR)\n",
    "[key for key in df]"
   ]
@@ -7,6 +7,7 @@ import argparse
 import json

 from ray.tune import TuneError
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Resources


@@ -63,8 +64,9 @@ def make_parser(**kwargs):
        "--repeat", default=1, type=int,
        help="Number of times to repeat each trial.")
    parser.add_argument(
-        "--local-dir", default="/tmp/ray", type=str,
-        help="Local dir to save training results to. Defaults to '/tmp/ray'.")
+        "--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
+        help="Local dir to save training results to. Defaults to '{}'.".format(
+            DEFAULT_RESULTS_DIR))
    parser.add_argument(
        "--upload-dir", default="", type=str,
        help="Optional URI to upload training results to.")
@@ -4,6 +4,7 @@ from __future__ import print_function

 from collections import namedtuple
 import json
+import os

 try:
    import yaml
@@ -20,6 +21,9 @@ Most of the fields are optional, the only required one is timesteps_total.
 In RLlib, the supplied algorithms fill in TrainingResult for you.
 """

+# Where ray.tune writes result files by default
+DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")
+

 TrainingResult = namedtuple("TrainingResult", [
    # (Required) Accumulated timesteps for this entire experiment.
@@ -40,9 +44,12 @@ TrainingResult = namedtuple("TrainingResult", [
    # (Optional) The number of episodes total.
    "episodes_total",

-    # (Optional) The current training accuracy if applicable>
+    # (Optional) The current training accuracy if applicable.
    "mean_accuracy",

+    # (Optional) The current validation accuracy if applicable.
+    "mean_validation_accuracy",
+
    # (Optional) The current training loss if applicable.
    "mean_loss",

@@ -69,6 +76,12 @@ TrainingResult = namedtuple("TrainingResult", [
    # (Auto-filled) The pid of the training process.
    "pid",

+    # (Auto-filled) A formatted date of when the result was processed.
+    "date",
+
+    # (Auto-filled) A UNIX timestamp of when the result was processed.
+    "timestamp",
+
    # (Auto-filled) The hostname of the machine hosting the training process.
    "hostname",
 ])
@@ -2,6 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from datetime import datetime
 import tempfile
 import traceback
 import ray
@@ -10,7 +11,7 @@ import os
 from collections import namedtuple
 from ray.tune import TuneError
 from ray.tune.logger import NoopLogger, UnifiedLogger
-from ray.tune.result import TrainingResult
+from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
 from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS


@@ -62,7 +63,7 @@ class Trial(object):
    ERROR = "ERROR"

    def __init__(
-            self, trainable_name, config={}, local_dir='/tmp/ray',
+            self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR,
            experiment_tag=None, resources=Resources(cpu=1, gpu=0),
            stopping_criterion={}, checkpoint_freq=0,
            restore_path=None, upload_dir=None):
@@ -295,16 +296,22 @@ class Trial(object):
            if not os.path.exists(self.local_dir):
                os.makedirs(self.local_dir)
            self.logdir = tempfile.mkdtemp(
-                prefix=str(self), dir=self.local_dir)
+                prefix=str(self), dir=self.local_dir,
+                suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S"))
            self.result_logger = UnifiedLogger(
                self.config, self.logdir, self.upload_dir)
        remote_logdir = self.logdir
+
+        def logger_creator(config):
+            # Set the working dir in the remote process, for user file writes
+            os.chdir(remote_logdir)
+            return NoopLogger(config, remote_logdir)
+
        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        self.runner = cls.remote(
-            config=self.config,
-            registry=get_registry(),
-            logger_creator=lambda config: NoopLogger(config, remote_logdir))
+            config=self.config, registry=get_registry(),
+            logger_creator=logger_creator)

    def __str__(self):
        if "env" in self.config: