Policy gradient example: record stats for tensorboard (#577)

* add tf metrics
* comments
* fix network scopes
* add doc
* use format string
* fix trace level
* plot intermediate and final sgd stats
* add back a global step
2026-07-04 05:52:54 +08:00 · 2017-05-21 14:51:24 -07:00
parent c440010cbd
commit 06241daf61
7 changed files with 113 additions and 45 deletions
@@ -1,5 +1,7 @@
 # The build output should clearly not be checked in
 /python/ray/core
+/python/build
+/python/dist
 /src/common/thirdparty/redis
 /src/numbuf/thirdparty/arrow

@@ -30,5 +30,15 @@ try passing in the ``Pong-v0`` environment or the ``CartPole-v0`` environment.
 If you wish to use a different environment, you will need to change a few lines
 in ``example.py``.

+Current and historical training progress can be monitored by pointing
+TensorBoard to the log output directory as follows.
+
+.. code-block:: bash
+
+  tensorboard --logdir=/tmp/ray
+
+Many of the TensorBoard metrics are also printed to the console, but you might
+find it easier to visualize and compare runs using the TensorBoard UI.
+
 .. _`TensorFlow with GPU support`: https://www.tensorflow.org/install/
 .. _`code for this example`: https://github.com/ray-project/ray/tree/master/examples/policy_gradient
@@ -2,8 +2,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from datetime import datetime
+
 import argparse
 import ray
+import tensorflow as tf

 from reinforce.env import (NoPreprocessor, AtariRamPreprocessor,
                           AtariPixelPreprocessor)
@@ -13,12 +16,16 @@ from reinforce.utils import iterate, shuffle

 config = {"kl_coeff": 0.2,
          "num_sgd_iter": 30,
+          "max_iterations": 1000,
          "sgd_stepsize": 5e-5,
          "sgd_batchsize": 128,
          "entropy_coeff": 0.0,
          "clip_param": 0.3,
          "kl_target": 0.01,
-          "timesteps_per_batch": 40000}
+          "timesteps_per_batch": 40000,
+          "num_agents": 5,
+          "tensorboard_log_dir": "/tmp/ray",
+          "trace_level": tf.RunOptions.NO_TRACE}


 if __name__ == "__main__":
@@ -47,12 +54,17 @@ if __name__ == "__main__":

  print("Using the environment {}.".format(mdp_name))
  agents = [RemoteAgent.remote(mdp_name, 1, preprocessor, config, False)
-            for _ in range(5)]
+            for _ in range(config["num_agents"])]
  agent = Agent(mdp_name, 1, preprocessor, config, True)

  kl_coeff = config["kl_coeff"]

-  for j in range(1000):
+  file_writer = tf.summary.FileWriter(
+      '{}/trpo_{}_{}'.format(
+          config["tensorboard_log_dir"], mdp_name, datetime.today()),
+      agent.sess.graph)
+  global_step = 0
+  for j in range(config["max_iterations"]):
    print("== iteration", j)
    weights = ray.put(agent.get_weights())
    [a.load_weights.remote(weights) for a in agents]
@@ -61,6 +73,15 @@ if __name__ == "__main__":
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps: ", trajectory["dones"].shape[0])
+    traj_stats = tf.Summary(value=[
+        tf.Summary.Value(
+            tag="policy_gradient/rollouts/mean_reward",
+            simple_value=total_reward),
+        tf.Summary.Value(
+            tag="policy_gradient/rollouts/traj_len_mean",
+            simple_value=traj_len_mean)])
+    file_writer.add_summary(traj_stats, global_step)
+    global_step += 1
    trajectory["advantages"] = ((trajectory["advantages"] -
                                 trajectory["advantages"].mean()) /
                                trajectory["advantages"].std())
@@ -73,22 +94,52 @@ if __name__ == "__main__":
    ppo = agent.ppo
    for i in range(config["num_sgd_iter"]):
      # Test on current set of rollouts.
+      run_options = tf.RunOptions(trace_level=config["trace_level"])
+      run_metadata = tf.RunMetadata()
      loss, kl, entropy = agent.sess.run(
          [ppo.loss, ppo.mean_kl, ppo.mean_entropy],
          feed_dict={ppo.observations: trajectory["observations"],
                     ppo.advantages: trajectory["advantages"],
                     ppo.actions: trajectory["actions"].squeeze(),
                     ppo.prev_logits: trajectory["logprobs"],
-                     ppo.kl_coeff: kl_coeff})
+                     ppo.kl_coeff: kl_coeff},
+          options=run_options,
+          run_metadata=run_metadata)
      print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))
      # Run SGD for training on current set of rollouts.
      for batch in iterate(trajectory, config["sgd_batchsize"]):
-        agent.sess.run([agent.train_op],
-                       feed_dict={ppo.observations: batch["observations"],
-                                  ppo.advantages: batch["advantages"],
-                                  ppo.actions: batch["actions"].squeeze(),
-                                  ppo.prev_logits: batch["logprobs"],
-                                  ppo.kl_coeff: kl_coeff})
+        run_options = tf.RunOptions(trace_level=config["trace_level"])
+        run_metadata = tf.RunMetadata()
+        agent.sess.run(
+            [agent.train_op],
+            feed_dict={ppo.observations: batch["observations"],
+                       ppo.advantages: batch["advantages"],
+                       ppo.actions: batch["actions"].squeeze(),
+                       ppo.prev_logits: batch["logprobs"],
+                       ppo.kl_coeff: kl_coeff},
+            options=run_options,
+            run_metadata=run_metadata)
+      values = []
+      if i == config["num_sgd_iter"] - 1:
+        metric_prefix = "policy_gradient/sgd/final_iter/"
+        values.append(tf.Summary.Value(
+            tag=metric_prefix + "kl_coeff",
+            simple_value=kl_coeff))
+      else:
+        metric_prefix = "policy_gradient/sgd/intermediate_iters/"
+      values.extend([
+          tf.Summary.Value(
+              tag=metric_prefix + "mean_entropy",
+              simple_value=entropy),
+          tf.Summary.Value(
+              tag=metric_prefix + "mean_loss",
+              simple_value=loss),
+          tf.Summary.Value(
+              tag=metric_prefix + "mean_kl",
+              simple_value=kl)])
+      sgd_stats = tf.Summary(value=values)
+      file_writer.add_summary(sgd_stats, global_step)
+      global_step += 1
    if kl > 2.0 * config["kl_target"]:
      kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
@@ -21,15 +21,18 @@ class Agent(object):
    if preprocessor.shape is None:
      preprocessor.shape = self.env.observation_space.shape
    self.sess = tf.Session()
-    self.ppo = ProximalPolicyLoss(self.env.observation_space,
-                                  self.env.action_space, preprocessor, config,
-                                  self.sess)
-    self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"])
-    self.train_op = self.optimizer.minimize(self.ppo.loss)
-    self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss,
-                                                          self.sess)
-    self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
-    self.reward_filter = MeanStdFilter((), clip=5.0)
+    with tf.name_scope("policy_gradient/train"):
+      with tf.name_scope("proximal_policy_loss"):
+        self.ppo = ProximalPolicyLoss(self.env.observation_space,
+                                      self.env.action_space, preprocessor,
+                                      config, self.sess)
+      with tf.name_scope("adam_optimizer"):
+        self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"])
+        self.train_op = self.optimizer.minimize(self.ppo.loss)
+      self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss,
+                                                            self.sess)
+      self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
+      self.reward_filter = MeanStdFilter((), clip=5.0)
    self.sess.run(tf.global_variables_initializer())

  def get_weights(self):
@@ -17,21 +17,22 @@ def normc_initializer(std=1.0):


 def fc_net(inputs, num_classes=10, logstd=False):
-  fc1 = slim.fully_connected(inputs, 128,
-                             weights_initializer=normc_initializer(1.0),
-                             scope="fc1")
-  fc2 = slim.fully_connected(fc1, 128,
-                             weights_initializer=normc_initializer(1.0),
-                             scope="fc2")
-  fc3 = slim.fully_connected(fc2, 128,
-                             weights_initializer=normc_initializer(1.0),
-                             scope="fc3")
-  fc4 = slim.fully_connected(fc3, num_classes,
-                             weights_initializer=normc_initializer(0.01),
-                             activation_fn=None, scope="fc4")
-  if logstd:
-    logstd = tf.get_variable(name="logstd", shape=[num_classes],
-                             initializer=tf.zeros_initializer)
-    return tf.concat(1, [fc4, logstd])
-  else:
-    return fc4
+  with tf.name_scope("fc_net") as net:
+    fc1 = slim.fully_connected(inputs, 128,
+                               weights_initializer=normc_initializer(1.0),
+                               scope=net + "fc1")
+    fc2 = slim.fully_connected(fc1, 128,
+                               weights_initializer=normc_initializer(1.0),
+                               scope=net + "fc2")
+    fc3 = slim.fully_connected(fc2, 128,
+                               weights_initializer=normc_initializer(1.0),
+                               scope=net + "fc3")
+    fc4 = slim.fully_connected(fc3, num_classes,
+                               weights_initializer=normc_initializer(0.01),
+                               activation_fn=None, scope=net + "fc4")
+    if logstd:
+      logstd = tf.get_variable(name="logstd", shape=[num_classes],
+                               initializer=tf.zeros_initializer)
+      return tf.concat(1, [fc4, logstd])
+    else:
+      return fc4
@@ -7,9 +7,10 @@ import tensorflow.contrib.slim as slim


 def vision_net(inputs, num_classes=10):
-  conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope="conv1")
-  conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope="conv2")
-  fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope="fc1")
-  fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
-                    normalizer_fn=None, scope="fc2")
-  return tf.squeeze(fc2, [1, 2])
+  with tf.name_scope("vision_net") as net:
+    conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope=net + "conv1")
+    conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope=net + "conv2")
+    fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope=net + "fc1")
+    fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
+                      normalizer_fn=None, scope=net + "fc2")
+    return tf.squeeze(fc2, [1, 2])
@@ -91,10 +91,10 @@ def collect_samples(agents, num_timesteps, gamma, lam, horizon,
        [agent.compute_trajectory.remote(gamma, lam, horizon)
         for agent in agents])
    trajectory = concatenate(trajectory_batch)
-    total_rewards.append(
-        trajectory["raw_rewards"].sum(axis=0).mean() / len(agents))
    trajectory = flatten(trajectory)
    not_done = np.logical_not(trajectory["dones"])
+    total_rewards.append(
+        trajectory["raw_rewards"][not_done].sum(axis=0).mean() / len(agents))
    traj_len_means.append(not_done.sum(axis=0).mean() / len(agents))
    trajectory = {key: val[not_done] for key, val in trajectory.items()}
    num_timesteps_so_far += len(trajectory["dones"])