From 06241daf6140d692b3da9c4e3f6e2fe642beae6f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 21 May 2017 14:51:24 -0700 Subject: [PATCH] Policy gradient example: record stats for tensorboard (#577) * add tf metrics * comments * fix network scopes * add doc * use format string * fix trace level * plot intermediate and final sgd stats * add back a global step --- .gitignore | 2 + doc/source/example-policy-gradient.rst | 10 +++ examples/policy_gradient/examples/example.py | 71 ++++++++++++++++--- examples/policy_gradient/reinforce/agent.py | 21 +++--- .../policy_gradient/reinforce/models/fcnet.py | 37 +++++----- .../reinforce/models/visionnet.py | 13 ++-- examples/policy_gradient/reinforce/rollout.py | 4 +- 7 files changed, 113 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 8166a90a4..ade7f374f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # The build output should clearly not be checked in /python/ray/core +/python/build +/python/dist /src/common/thirdparty/redis /src/numbuf/thirdparty/arrow diff --git a/doc/source/example-policy-gradient.rst b/doc/source/example-policy-gradient.rst index 92d049bcc..a8f1302ec 100644 --- a/doc/source/example-policy-gradient.rst +++ b/doc/source/example-policy-gradient.rst @@ -30,5 +30,15 @@ try passing in the ``Pong-v0`` environment or the ``CartPole-v0`` environment. If you wish to use a different environment, you will need to change a few lines in ``example.py``. +Current and historical training progress can be monitored by pointing +TensorBoard to the log output directory as follows. + +.. code-block:: bash + + tensorboard --logdir=/tmp/ray + +Many of the TensorBoard metrics are also printed to the console, but you might +find it easier to visualize and compare between runs using the TensorBoard UI. + .. _`TensorFlow with GPU support`: https://www.tensorflow.org/install/ .. _`code for this example`: https://github.com/ray-project/ray/tree/master/examples/policy_gradient diff --git a/examples/policy_gradient/examples/example.py b/examples/policy_gradient/examples/example.py index 5b2e8e65a..ab5fd87a7 100644 --- a/examples/policy_gradient/examples/example.py +++ b/examples/policy_gradient/examples/example.py @@ -2,8 +2,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from datetime import datetime + import argparse import ray +import tensorflow as tf from reinforce.env import (NoPreprocessor, AtariRamPreprocessor, AtariPixelPreprocessor) @@ -13,12 +16,16 @@ from reinforce.utils import iterate, shuffle config = {"kl_coeff": 0.2, "num_sgd_iter": 30, + "max_iterations": 1000, "sgd_stepsize": 5e-5, "sgd_batchsize": 128, "entropy_coeff": 0.0, "clip_param": 0.3, "kl_target": 0.01, - "timesteps_per_batch": 40000} + "timesteps_per_batch": 40000, + "num_agents": 5, + "tensorboard_log_dir": "/tmp/ray", + "trace_level": tf.RunOptions.NO_TRACE} if __name__ == "__main__": @@ -47,12 +54,17 @@ if __name__ == "__main__": print("Using the environment {}.".format(mdp_name)) agents = [RemoteAgent.remote(mdp_name, 1, preprocessor, config, False) - for _ in range(5)] + for _ in range(config["num_agents"])] agent = Agent(mdp_name, 1, preprocessor, config, True) kl_coeff = config["kl_coeff"] - for j in range(1000): + file_writer = tf.summary.FileWriter( + '{}/trpo_{}_{}'.format( + config["tensorboard_log_dir"], mdp_name, datetime.today()), + agent.sess.graph) + global_step = 0 + for j in range(config["max_iterations"]): print("== iteration", j) weights = ray.put(agent.get_weights()) [a.load_weights.remote(weights) for a in agents] @@ -61,6 +73,15 @@ if __name__ == "__main__": print("total reward is ", total_reward) print("trajectory length mean is ", traj_len_mean) print("timesteps: ", trajectory["dones"].shape[0]) + traj_stats = tf.Summary(value=[ + tf.Summary.Value( + tag="policy_gradient/rollouts/mean_reward", + simple_value=total_reward), + tf.Summary.Value( + tag="policy_gradient/rollouts/traj_len_mean", + simple_value=traj_len_mean)]) + file_writer.add_summary(traj_stats, global_step) + global_step += 1 trajectory["advantages"] = ((trajectory["advantages"] - trajectory["advantages"].mean()) / trajectory["advantages"].std()) @@ -73,22 +94,52 @@ if __name__ == "__main__": ppo = agent.ppo for i in range(config["num_sgd_iter"]): # Test on current set of rollouts. + run_options = tf.RunOptions(trace_level=config["trace_level"]) + run_metadata = tf.RunMetadata() loss, kl, entropy = agent.sess.run( [ppo.loss, ppo.mean_kl, ppo.mean_entropy], feed_dict={ppo.observations: trajectory["observations"], ppo.advantages: trajectory["advantages"], ppo.actions: trajectory["actions"].squeeze(), ppo.prev_logits: trajectory["logprobs"], - ppo.kl_coeff: kl_coeff}) + ppo.kl_coeff: kl_coeff}, + options=run_options, + run_metadata=run_metadata) print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy)) # Run SGD for training on current set of rollouts. for batch in iterate(trajectory, config["sgd_batchsize"]): - agent.sess.run([agent.train_op], - feed_dict={ppo.observations: batch["observations"], - ppo.advantages: batch["advantages"], - ppo.actions: batch["actions"].squeeze(), - ppo.prev_logits: batch["logprobs"], - ppo.kl_coeff: kl_coeff}) + run_options = tf.RunOptions(trace_level=config["trace_level"]) + run_metadata = tf.RunMetadata() + agent.sess.run( + [agent.train_op], + feed_dict={ppo.observations: batch["observations"], + ppo.advantages: batch["advantages"], + ppo.actions: batch["actions"].squeeze(), + ppo.prev_logits: batch["logprobs"], + ppo.kl_coeff: kl_coeff}, + options=run_options, + run_metadata=run_metadata) + values = [] + if i == config["num_sgd_iter"] - 1: + metric_prefix = "policy_gradient/sgd/final_iter/" + values.append(tf.Summary.Value( + tag=metric_prefix + "kl_coeff", + simple_value=kl_coeff)) + else: + metric_prefix = "policy_gradient/sgd/intermediate_iters/" + values.extend([ + tf.Summary.Value( + tag=metric_prefix + "mean_entropy", + simple_value=entropy), + tf.Summary.Value( + tag=metric_prefix + "mean_loss", + simple_value=loss), + tf.Summary.Value( + tag=metric_prefix + "mean_kl", + simple_value=kl)]) + sgd_stats = tf.Summary(value=values) + file_writer.add_summary(sgd_stats, global_step) + global_step += 1 if kl > 2.0 * config["kl_target"]: kl_coeff *= 1.5 elif kl < 0.5 * config["kl_target"]: diff --git a/examples/policy_gradient/reinforce/agent.py b/examples/policy_gradient/reinforce/agent.py index c8f4254b9..d2a797362 100644 --- a/examples/policy_gradient/reinforce/agent.py +++ b/examples/policy_gradient/reinforce/agent.py @@ -21,15 +21,18 @@ class Agent(object): if preprocessor.shape is None: preprocessor.shape = self.env.observation_space.shape self.sess = tf.Session() - self.ppo = ProximalPolicyLoss(self.env.observation_space, - self.env.action_space, preprocessor, config, - self.sess) - self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"]) - self.train_op = self.optimizer.minimize(self.ppo.loss) - self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss, - self.sess) - self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None) - self.reward_filter = MeanStdFilter((), clip=5.0) + with tf.name_scope("policy_gradient/train"): + with tf.name_scope("proximal_policy_loss"): + self.ppo = ProximalPolicyLoss(self.env.observation_space, + self.env.action_space, preprocessor, + config, self.sess) + with tf.name_scope("adam_optimizer"): + self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"]) + self.train_op = self.optimizer.minimize(self.ppo.loss) + self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss, + self.sess) + self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None) + self.reward_filter = MeanStdFilter((), clip=5.0) self.sess.run(tf.global_variables_initializer()) def get_weights(self): diff --git a/examples/policy_gradient/reinforce/models/fcnet.py b/examples/policy_gradient/reinforce/models/fcnet.py index 51a4bb7d7..ac2db1802 100644 --- a/examples/policy_gradient/reinforce/models/fcnet.py +++ b/examples/policy_gradient/reinforce/models/fcnet.py @@ -17,21 +17,22 @@ def normc_initializer(std=1.0): def fc_net(inputs, num_classes=10, logstd=False): - fc1 = slim.fully_connected(inputs, 128, - weights_initializer=normc_initializer(1.0), - scope="fc1") - fc2 = slim.fully_connected(fc1, 128, - weights_initializer=normc_initializer(1.0), - scope="fc2") - fc3 = slim.fully_connected(fc2, 128, - weights_initializer=normc_initializer(1.0), - scope="fc3") - fc4 = slim.fully_connected(fc3, num_classes, - weights_initializer=normc_initializer(0.01), - activation_fn=None, scope="fc4") - if logstd: - logstd = tf.get_variable(name="logstd", shape=[num_classes], - initializer=tf.zeros_initializer) - return tf.concat(1, [fc4, logstd]) - else: - return fc4 + with tf.name_scope("fc_net") as net: + fc1 = slim.fully_connected(inputs, 128, + weights_initializer=normc_initializer(1.0), + scope=net + "fc1") + fc2 = slim.fully_connected(fc1, 128, + weights_initializer=normc_initializer(1.0), + scope=net + "fc2") + fc3 = slim.fully_connected(fc2, 128, + weights_initializer=normc_initializer(1.0), + scope=net + "fc3") + fc4 = slim.fully_connected(fc3, num_classes, + weights_initializer=normc_initializer(0.01), + activation_fn=None, scope=net + "fc4") + if logstd: + logstd = tf.get_variable(name="logstd", shape=[num_classes], + initializer=tf.zeros_initializer) + return tf.concat(1, [fc4, logstd]) + else: + return fc4 diff --git a/examples/policy_gradient/reinforce/models/visionnet.py b/examples/policy_gradient/reinforce/models/visionnet.py index c3e240eb1..9226d3ec2 100644 --- a/examples/policy_gradient/reinforce/models/visionnet.py +++ b/examples/policy_gradient/reinforce/models/visionnet.py @@ -7,9 +7,10 @@ import tensorflow.contrib.slim as slim def vision_net(inputs, num_classes=10): - conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope="conv1") - conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope="conv2") - fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope="fc1") - fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None, - normalizer_fn=None, scope="fc2") - return tf.squeeze(fc2, [1, 2]) + with tf.name_scope("vision_net") as net: + conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope=net + "conv1") + conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope=net + "conv2") + fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope=net + "fc1") + fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope=net + "fc2") + return tf.squeeze(fc2, [1, 2]) diff --git a/examples/policy_gradient/reinforce/rollout.py b/examples/policy_gradient/reinforce/rollout.py index 63a43c11b..092b45571 100644 --- a/examples/policy_gradient/reinforce/rollout.py +++ b/examples/policy_gradient/reinforce/rollout.py @@ -91,10 +91,10 @@ def collect_samples(agents, num_timesteps, gamma, lam, horizon, [agent.compute_trajectory.remote(gamma, lam, horizon) for agent in agents]) trajectory = concatenate(trajectory_batch) - total_rewards.append( - trajectory["raw_rewards"].sum(axis=0).mean() / len(agents)) trajectory = flatten(trajectory) not_done = np.logical_not(trajectory["dones"]) + total_rewards.append( + trajectory["raw_rewards"][not_done].sum(axis=0).mean() / len(agents)) traj_len_means.append(not_done.sum(axis=0).mean() / len(agents)) trajectory = {key: val[not_done] for key, val in trajectory.items()} num_timesteps_so_far += len(trajectory["dones"])