From 66734847bb7834a91b8f3a880c286e08476cf168 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Mon, 3 Jul 2017 09:01:47 -0700
Subject: [PATCH] [rllib] Standardize writing output logs and other files to
 /tmp/ray (#706)

* rllib v0

* fix imports

* lint

* comments

* update docs

* a3c wip

* a3c wip

* report stats

* update doc

* add common logdir attr

* name is too long

* fix small bug

* propagate exception on error

* fetch metrics

* fix small nits
---
 doc/source/example-a3c.rst                    |  2 +-
 python/ray/rllib/a3c/a3c.py                   |  5 ++--
 python/ray/rllib/common.py                    | 28 +++++++++++++++++--
 .../evolution_strategies.py                   |  2 +-
 python/ray/rllib/policy_gradient/agent.py     |  7 +++--
 .../rllib/policy_gradient/policy_gradient.py  | 19 ++++++-------
 6 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/doc/source/example-a3c.rst b/doc/source/example-a3c.rst
index 5e7f0c6ca..617775442 100644
--- a/doc/source/example-a3c.rst
+++ b/doc/source/example-a3c.rst
@@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes.
 
 You can visualize performance by running
 :code:`tensorboard --logdir [directory]` in a separate screen, where
-:code:`[directory]` is defaulted to :code:`/tmp/ray/a3c/`. If you are running
+:code:`[directory]` defaults to :code:`/tmp/ray/`. If you are running
 multiple experiments, be sure to vary the directory to which Tensorflow saves
 its progress (found in :code:`a3c.py`).
diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py
index f47ec3962..7c9babea9 100644
--- a/python/ray/rllib/a3c/a3c.py
+++ b/python/ray/rllib/a3c/a3c.py
@@ -26,7 +26,7 @@ class Runner(object):
 
   The gradient computation is also executed from this object.
   """
-  def __init__(self, env_name, actor_id, logdir="/tmp/ray/a3c/", start=True):
+  def __init__(self, env_name, actor_id, logdir, start=True):
     env = create_env(env_name)
     self.id = actor_id
     num_actions = env.action_space.n
@@ -89,7 +89,8 @@ class A3C(Algorithm):
     self.policy = LSTMPolicy(
         self.env.observation_space.shape, self.env.action_space.n, 0)
     self.agents = [
-        Runner.remote(env_name, i) for i in range(config["num_workers"])]
+        Runner.remote(env_name, i, self.logdir)
+        for i in range(config["num_workers"])]
     self.parameters = self.policy.get_weights()
     self.iteration = 0
 
diff --git a/python/ray/rllib/common.py b/python/ray/rllib/common.py
index 050ccf535..e3758bf01 100644
--- a/python/ray/rllib/common.py
+++ b/python/ray/rllib/common.py
@@ -1,4 +1,12 @@
 from collections import namedtuple
+from datetime import datetime
+import json
+import logging
+import os
+import tempfile
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 
 
 TrainingResult = namedtuple("TrainingResult", [
@@ -14,16 +22,32 @@ class Algorithm(object):
   Algorithm objects retain internal model state between calls to train(), so
   you should create a new algorithm instance for each training session.
 
+  Attributes:
+    env_name (str): Name of the OpenAI gym environment to train against.
+    config (obj): Algorithm-specific configuration data.
+    logdir (str): Directory in which training outputs should be placed.
+
   TODO(ekl): support checkpoint / restore of training state.
   """
 
   def __init__(self, env_name, config):
     self.env_name = env_name
     self.config = config
+    self.logdir = tempfile.mkdtemp(
+        prefix="{}_{}_{}".format(
+            env_name,
+            self.__class__.__name__,
+            datetime.today().strftime("%Y-%m-%d_%H-%M-%S")),
+        dir="/tmp/ray")
+    # Use a context manager so the config file is flushed and closed.
+    with open(os.path.join(self.logdir, "config.json"), "w") as f:
+      json.dump(self.config, f, sort_keys=True, indent=4)
+    logger.info(
+        "%s algorithm created with logdir '%s'",
+        self.__class__.__name__, self.logdir)
 
   def train(self):
-    """
-    Runs one logical iteration of training.
+    """Runs one logical iteration of training.
 
     Returns:
       A TrainingResult that describes training progress.
diff --git a/python/ray/rllib/evolution_strategies/evolution_strategies.py b/python/ray/rllib/evolution_strategies/evolution_strategies.py
index b9c6bbe42..64fa307f4 100644
--- a/python/ray/rllib/evolution_strategies/evolution_strategies.py
+++ b/python/ray/rllib/evolution_strategies/evolution_strategies.py
@@ -277,7 +277,7 @@ class EvolutionStrategies(Algorithm):
     if (config.snapshot_freq != 0 and
             self.iteration % config.snapshot_freq == 0):
       filename = os.path.join(
-          "/tmp", "snapshot_iter{:05d}.h5".format(self.iteration))
+          self.logdir, "snapshot_iter{:05d}.h5".format(self.iteration))
       assert not os.path.exists(filename)
       self.policy.save(filename)
       tlogger.log("Saved snapshot {}".format(filename))
diff --git a/python/ray/rllib/policy_gradient/agent.py b/python/ray/rllib/policy_gradient/agent.py
index cfe1386e1..9c9cd2110 100644
--- a/python/ray/rllib/policy_gradient/agent.py
+++ b/python/ray/rllib/policy_gradient/agent.py
@@ -48,7 +48,7 @@ class Agent(object):
   this GPU-local data.
   """
 
-  def __init__(self, name, batchsize, preprocessor, config, is_remote):
+  def __init__(self, name, batchsize, preprocessor, config, logdir, is_remote):
     if is_remote:
       os.environ["CUDA_VISIBLE_DEVICES"] = ""
       devices = ["/cpu:0"]
@@ -56,6 +56,7 @@ class Agent(object):
       devices = config["devices"]
     self.devices = devices
     self.config = config
+    self.logdir = logdir
     self.env = BatchedEnv(name, batchsize, preprocessor=preprocessor)
     if preprocessor.shape is None:
       preprocessor.shape = self.env.observation_space.shape
@@ -220,7 +221,7 @@ class Agent(object):
         run_metadata=run_metadata)
     if full_trace:
       trace = timeline.Timeline(step_stats=run_metadata.step_stats)
-      trace_file = open("/tmp/ray/timeline-load.json", "w")
+      trace_file = open(os.path.join(self.logdir, "timeline-load.json"), "w")
       trace_file.write(trace.generate_chrome_trace_format())
 
     tuples_per_device = len(truncated_obs) / len(self.devices)
@@ -254,7 +255,7 @@ class Agent(object):
 
     if full_trace:
       trace = timeline.Timeline(step_stats=run_metadata.step_stats)
-      trace_file = open("/tmp/ray/timeline-sgd.json", "w")
+      trace_file = open(os.path.join(self.logdir, "timeline-sgd.json"), "w")
       trace_file.write(trace.generate_chrome_trace_format())
       file_writer.add_run_metadata(
           run_metadata, "sgd_train_{}".format(batch_index))
diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/policy_gradient/policy_gradient.py
index 7850af337..752574fca 100644
--- a/python/ray/rllib/policy_gradient/policy_gradient.py
+++ b/python/ray/rllib/policy_gradient/policy_gradient.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from datetime import datetime
+import os
 import time
 
 import numpy as np
@@ -36,11 +36,10 @@ DEFAULT_CONFIG = {
     "kl_target": 0.01,
     "timesteps_per_batch": 40000,
     "num_agents": 5,
-    "tensorboard_log_dir": "/tmp/ray",
     "full_trace_nth_sgd_batch": -1,
     "full_trace_data_load": False,
     "use_tf_debugger": False,
-    "model_checkpoint_file": "/tmp/iteration-%s.ckpt"}
+    "model_checkpoint_file": "iteration-%s.ckpt"}
 
 
 class PolicyGradient(Algorithm):
@@ -64,10 +63,11 @@ class PolicyGradient(Algorithm):
     self.j = 0
     self.kl_coeff = config["kl_coeff"]
     self.model = Agent(
-        self.env_name, 1, self.preprocessor, self.config, False)
+        self.env_name, 1, self.preprocessor, self.config, self.logdir, False)
     self.agents = [
         RemoteAgent.remote(
-            self.env_name, 1, self.preprocessor, self.config, True)
+            self.env_name, 1, self.preprocessor, self.config,
+            self.logdir, True)
         for _ in range(config["num_agents"])]
 
   def train(self):
@@ -81,15 +81,12 @@ class PolicyGradient(Algorithm):
     if "load_checkpoint" in config:
       saver.restore(model.sess, config["load_checkpoint"])
 
-    file_writer = tf.summary.FileWriter(
-        "{}/trpo_{}_{}".format(
-            config["tensorboard_log_dir"], self.env_name,
-            str(datetime.today()).replace(" ", "_")),
-        model.sess.graph)
+    file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
     iter_start = time.time()
     if config["model_checkpoint_file"]:
       checkpoint_path = saver.save(
-          model.sess, config["model_checkpoint_file"] % j)
+          model.sess,
+          os.path.join(self.logdir, config["model_checkpoint_file"] % j))
       print("Checkpoint saved in file: %s" % checkpoint_path)
     checkpointing_end = time.time()
     weights = ray.put(model.get_weights())