From 66734847bb7834a91b8f3a880c286e08476cf168 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 3 Jul 2017 09:01:47 -0700 Subject: [PATCH] [rllib] Standardize writing output logs and other files to /tmp/ray (#706) * rllib v0 * fix imports * lint * comments * update docs * a3c wip * a3c wip * report stats * update doc * add common logdir attr * name is too long * fix small bug * propagate exception on error * fetch metrics * fix small nits --- doc/source/example-a3c.rst | 2 +- python/ray/rllib/a3c/a3c.py | 5 ++-- python/ray/rllib/common.py | 28 +++++++++++++++++-- .../evolution_strategies.py | 2 +- python/ray/rllib/policy_gradient/agent.py | 7 +++-- .../rllib/policy_gradient/policy_gradient.py | 19 ++++++------- 6 files changed, 43 insertions(+), 20 deletions(-) diff --git a/doc/source/example-a3c.rst b/doc/source/example-a3c.rst index 5e7f0c6ca..617775442 100644 --- a/doc/source/example-a3c.rst +++ b/doc/source/example-a3c.rst @@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes. You can visualize performance by running :code:`tensorboard --logdir [directory]` in a separate screen, where -:code:`[directory]` is defaulted to :code:`/tmp/ray/a3c/`. If you are running +:code:`[directory]` is defaulted to :code:`/tmp/ray/`. If you are running multiple experiments, be sure to vary the directory to which Tensorflow saves its progress (found in :code:`a3c.py`). diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index f47ec3962..7c9babea9 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -26,7 +26,7 @@ class Runner(object): The gradient computation is also executed from this object. """ - def __init__(self, env_name, actor_id, logdir="/tmp/ray/a3c/", start=True): + def __init__(self, env_name, actor_id, logdir, start=True): env = create_env(env_name) self.id = actor_id num_actions = env.action_space.n @@ -89,7 +89,8 @@ class A3C(Algorithm): self.policy = LSTMPolicy( self.env.observation_space.shape, self.env.action_space.n, 0) self.agents = [ - Runner.remote(env_name, i) for i in range(config["num_workers"])] + Runner.remote(env_name, i, self.logdir) + for i in range(config["num_workers"])] self.parameters = self.policy.get_weights() self.iteration = 0 diff --git a/python/ray/rllib/common.py b/python/ray/rllib/common.py index 050ccf535..e3758bf01 100644 --- a/python/ray/rllib/common.py +++ b/python/ray/rllib/common.py @@ -1,4 +1,12 @@ from collections import namedtuple +from datetime import datetime +import json +import logging +import os +import tempfile + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) TrainingResult = namedtuple("TrainingResult", [ @@ -14,16 +22,32 @@ class Algorithm(object): Algorithm objects retain internal model state between calls to train(), so you should create a new algorithm instance for each training session. + Attributes: + env_name (str): Name of the OpenAI gym environment to train against. + config (obj): Algorithm-specific configuration data. + logdir (str): Directory in which training outputs should be placed. + TODO(ekl): support checkpoint / restore of training state. """ def __init__(self, env_name, config): self.env_name = env_name self.config = config + self.logdir = tempfile.mkdtemp( + prefix="{}_{}_{}".format( + env_name, + self.__class__.__name__, + datetime.today().strftime("%Y-%m-%d_%H-%M-%S")), + dir="/tmp/ray") + json.dump( + self.config, open(os.path.join(self.logdir, "config.json"), "w"), + sort_keys=True, indent=4) + logger.info( + "%s algorithm created with logdir '%s'", + self.__class__.__name__, self.logdir) def train(self): - """ - Runs one logical iteration of training. + """Runs one logical iteration of training. Returns: A TrainingResult that describes training progress. diff --git a/python/ray/rllib/evolution_strategies/evolution_strategies.py b/python/ray/rllib/evolution_strategies/evolution_strategies.py index b9c6bbe42..64fa307f4 100644 --- a/python/ray/rllib/evolution_strategies/evolution_strategies.py +++ b/python/ray/rllib/evolution_strategies/evolution_strategies.py @@ -277,7 +277,7 @@ class EvolutionStrategies(Algorithm): if (config.snapshot_freq != 0 and self.iteration % config.snapshot_freq == 0): filename = os.path.join( - "/tmp", "snapshot_iter{:05d}.h5".format(self.iteration)) + self.logdir, "snapshot_iter{:05d}.h5".format(self.iteration)) assert not os.path.exists(filename) self.policy.save(filename) tlogger.log("Saved snapshot {}".format(filename)) diff --git a/python/ray/rllib/policy_gradient/agent.py b/python/ray/rllib/policy_gradient/agent.py index cfe1386e1..9c9cd2110 100644 --- a/python/ray/rllib/policy_gradient/agent.py +++ b/python/ray/rllib/policy_gradient/agent.py @@ -48,7 +48,7 @@ class Agent(object): this GPU-local data. """ - def __init__(self, name, batchsize, preprocessor, config, is_remote): + def __init__(self, name, batchsize, preprocessor, config, logdir, is_remote): if is_remote: os.environ["CUDA_VISIBLE_DEVICES"] = "" devices = ["/cpu:0"] @@ -56,6 +56,7 @@ class Agent(object): devices = config["devices"] self.devices = devices self.config = config + self.logdir = logdir self.env = BatchedEnv(name, batchsize, preprocessor=preprocessor) if preprocessor.shape is None: preprocessor.shape = self.env.observation_space.shape @@ -220,7 +221,7 @@ class Agent(object): run_metadata=run_metadata) if full_trace: trace = timeline.Timeline(step_stats=run_metadata.step_stats) - trace_file = open("/tmp/ray/timeline-load.json", "w") + trace_file = open(os.path.join(self.logdir, "timeline-load.json"), "w") trace_file.write(trace.generate_chrome_trace_format()) tuples_per_device = len(truncated_obs) / len(self.devices) @@ -254,7 +255,7 @@ class Agent(object): if full_trace: trace = timeline.Timeline(step_stats=run_metadata.step_stats) - trace_file = open("/tmp/ray/timeline-sgd.json", "w") + trace_file = open(os.path.join(self.logdir, "timeline-sgd.json"), "w") trace_file.write(trace.generate_chrome_trace_format()) file_writer.add_run_metadata( run_metadata, "sgd_train_{}".format(batch_index)) diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/policy_gradient/policy_gradient.py index 7850af337..752574fca 100644 --- a/python/ray/rllib/policy_gradient/policy_gradient.py +++ b/python/ray/rllib/policy_gradient/policy_gradient.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from datetime import datetime +import os import time import numpy as np @@ -36,11 +36,10 @@ DEFAULT_CONFIG = { "kl_target": 0.01, "timesteps_per_batch": 40000, "num_agents": 5, - "tensorboard_log_dir": "/tmp/ray", "full_trace_nth_sgd_batch": -1, "full_trace_data_load": False, "use_tf_debugger": False, - "model_checkpoint_file": "/tmp/iteration-%s.ckpt"} + "model_checkpoint_file": "iteration-%s.ckpt"} class PolicyGradient(Algorithm): @@ -64,10 +63,11 @@ class PolicyGradient(Algorithm): self.j = 0 self.kl_coeff = config["kl_coeff"] self.model = Agent( - self.env_name, 1, self.preprocessor, self.config, False) + self.env_name, 1, self.preprocessor, self.config, self.logdir, False) self.agents = [ RemoteAgent.remote( - self.env_name, 1, self.preprocessor, self.config, True) + self.env_name, 1, self.preprocessor, self.config, + self.logdir, True) for _ in range(config["num_agents"])] def train(self): @@ -81,15 +81,12 @@ class PolicyGradient(Algorithm): if "load_checkpoint" in config: saver.restore(model.sess, config["load_checkpoint"]) - file_writer = tf.summary.FileWriter( - "{}/trpo_{}_{}".format( - config["tensorboard_log_dir"], self.env_name, - str(datetime.today()).replace(" ", "_")), - model.sess.graph) + file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph) iter_start = time.time() if config["model_checkpoint_file"]: checkpoint_path = saver.save( - model.sess, config["model_checkpoint_file"] % j) + model.sess, + os.path.join(self.logdir, config["model_checkpoint_file"] % j)) print("Checkpoint saved in file: %s" % checkpoint_path) checkpointing_end = time.time() weights = ray.put(model.get_weights())