Policy gradient example: record stats for tensorboard (#577)

* add tf metrics

* comments

* fix network scopes

* add doc

* use format string

* fix trace level

* plot intermediate and final sgd stats

* add back a global step
This commit is contained in:
Eric Liang
2017-05-21 14:51:24 -07:00
committed by Philipp Moritz
parent c440010cbd
commit 06241daf61
7 changed files with 113 additions and 45 deletions
+2
View File
@@ -1,5 +1,7 @@
# The build output should clearly not be checked in
/python/ray/core
/python/build
/python/dist
/src/common/thirdparty/redis
/src/numbuf/thirdparty/arrow
+10
View File
@@ -30,5 +30,15 @@ try passing in the ``Pong-v0`` environment or the ``CartPole-v0`` environment.
If you wish to use a different environment, you will need to change a few lines
in ``example.py``.
Current and historical training progress can be monitored by pointing
TensorBoard to the log output directory as follows.

.. code-block:: bash

  tensorboard --logdir=/tmp/ray

Many of the TensorBoard metrics are also printed to the console, but you might
find it easier to visualize them and compare runs using the TensorBoard UI.
.. _`TensorFlow with GPU support`: https://www.tensorflow.org/install/
.. _`code for this example`: https://github.com/ray-project/ray/tree/master/examples/policy_gradient
+61 -10
View File
@@ -2,8 +2,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import argparse
import ray
import tensorflow as tf
from reinforce.env import (NoPreprocessor, AtariRamPreprocessor,
AtariPixelPreprocessor)
@@ -13,12 +16,16 @@ from reinforce.utils import iterate, shuffle
config = {"kl_coeff": 0.2,
"num_sgd_iter": 30,
"max_iterations": 1000,
"sgd_stepsize": 5e-5,
"sgd_batchsize": 128,
"entropy_coeff": 0.0,
"clip_param": 0.3,
"kl_target": 0.01,
"timesteps_per_batch": 40000}
"timesteps_per_batch": 40000,
"num_agents": 5,
"tensorboard_log_dir": "/tmp/ray",
"trace_level": tf.RunOptions.NO_TRACE}
if __name__ == "__main__":
@@ -47,12 +54,17 @@ if __name__ == "__main__":
print("Using the environment {}.".format(mdp_name))
agents = [RemoteAgent.remote(mdp_name, 1, preprocessor, config, False)
for _ in range(5)]
for _ in range(config["num_agents"])]
agent = Agent(mdp_name, 1, preprocessor, config, True)
kl_coeff = config["kl_coeff"]
for j in range(1000):
file_writer = tf.summary.FileWriter(
'{}/trpo_{}_{}'.format(
config["tensorboard_log_dir"], mdp_name, datetime.today()),
agent.sess.graph)
global_step = 0
for j in range(config["max_iterations"]):
print("== iteration", j)
weights = ray.put(agent.get_weights())
[a.load_weights.remote(weights) for a in agents]
@@ -61,6 +73,15 @@ if __name__ == "__main__":
print("total reward is ", total_reward)
print("trajectory length mean is ", traj_len_mean)
print("timesteps: ", trajectory["dones"].shape[0])
traj_stats = tf.Summary(value=[
tf.Summary.Value(
tag="policy_gradient/rollouts/mean_reward",
simple_value=total_reward),
tf.Summary.Value(
tag="policy_gradient/rollouts/traj_len_mean",
simple_value=traj_len_mean)])
file_writer.add_summary(traj_stats, global_step)
global_step += 1
trajectory["advantages"] = ((trajectory["advantages"] -
trajectory["advantages"].mean()) /
trajectory["advantages"].std())
@@ -73,22 +94,52 @@ if __name__ == "__main__":
ppo = agent.ppo
for i in range(config["num_sgd_iter"]):
# Test on current set of rollouts.
run_options = tf.RunOptions(trace_level=config["trace_level"])
run_metadata = tf.RunMetadata()
loss, kl, entropy = agent.sess.run(
[ppo.loss, ppo.mean_kl, ppo.mean_entropy],
feed_dict={ppo.observations: trajectory["observations"],
ppo.advantages: trajectory["advantages"],
ppo.actions: trajectory["actions"].squeeze(),
ppo.prev_logits: trajectory["logprobs"],
ppo.kl_coeff: kl_coeff})
ppo.kl_coeff: kl_coeff},
options=run_options,
run_metadata=run_metadata)
print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))
# Run SGD for training on current set of rollouts.
for batch in iterate(trajectory, config["sgd_batchsize"]):
agent.sess.run([agent.train_op],
feed_dict={ppo.observations: batch["observations"],
ppo.advantages: batch["advantages"],
ppo.actions: batch["actions"].squeeze(),
ppo.prev_logits: batch["logprobs"],
ppo.kl_coeff: kl_coeff})
run_options = tf.RunOptions(trace_level=config["trace_level"])
run_metadata = tf.RunMetadata()
agent.sess.run(
[agent.train_op],
feed_dict={ppo.observations: batch["observations"],
ppo.advantages: batch["advantages"],
ppo.actions: batch["actions"].squeeze(),
ppo.prev_logits: batch["logprobs"],
ppo.kl_coeff: kl_coeff},
options=run_options,
run_metadata=run_metadata)
values = []
if i == config["num_sgd_iter"] - 1:
metric_prefix = "policy_gradient/sgd/final_iter/"
values.append(tf.Summary.Value(
tag=metric_prefix + "kl_coeff",
simple_value=kl_coeff))
else:
metric_prefix = "policy_gradient/sgd/intermediate_iters/"
values.extend([
tf.Summary.Value(
tag=metric_prefix + "mean_entropy",
simple_value=entropy),
tf.Summary.Value(
tag=metric_prefix + "mean_loss",
simple_value=loss),
tf.Summary.Value(
tag=metric_prefix + "mean_kl",
simple_value=kl)])
sgd_stats = tf.Summary(value=values)
file_writer.add_summary(sgd_stats, global_step)
global_step += 1
if kl > 2.0 * config["kl_target"]:
kl_coeff *= 1.5
elif kl < 0.5 * config["kl_target"]:
+12 -9
View File
@@ -21,15 +21,18 @@ class Agent(object):
if preprocessor.shape is None:
preprocessor.shape = self.env.observation_space.shape
self.sess = tf.Session()
self.ppo = ProximalPolicyLoss(self.env.observation_space,
self.env.action_space, preprocessor, config,
self.sess)
self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"])
self.train_op = self.optimizer.minimize(self.ppo.loss)
self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss,
self.sess)
self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
self.reward_filter = MeanStdFilter((), clip=5.0)
with tf.name_scope("policy_gradient/train"):
with tf.name_scope("proximal_policy_loss"):
self.ppo = ProximalPolicyLoss(self.env.observation_space,
self.env.action_space, preprocessor,
config, self.sess)
with tf.name_scope("adam_optimizer"):
self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"])
self.train_op = self.optimizer.minimize(self.ppo.loss)
self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss,
self.sess)
self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
self.reward_filter = MeanStdFilter((), clip=5.0)
self.sess.run(tf.global_variables_initializer())
def get_weights(self):
@@ -17,21 +17,22 @@ def normc_initializer(std=1.0):
def fc_net(inputs, num_classes=10, logstd=False):
fc1 = slim.fully_connected(inputs, 128,
weights_initializer=normc_initializer(1.0),
scope="fc1")
fc2 = slim.fully_connected(fc1, 128,
weights_initializer=normc_initializer(1.0),
scope="fc2")
fc3 = slim.fully_connected(fc2, 128,
weights_initializer=normc_initializer(1.0),
scope="fc3")
fc4 = slim.fully_connected(fc3, num_classes,
weights_initializer=normc_initializer(0.01),
activation_fn=None, scope="fc4")
if logstd:
logstd = tf.get_variable(name="logstd", shape=[num_classes],
initializer=tf.zeros_initializer)
return tf.concat(1, [fc4, logstd])
else:
return fc4
with tf.name_scope("fc_net") as net:
fc1 = slim.fully_connected(inputs, 128,
weights_initializer=normc_initializer(1.0),
scope=net + "fc1")
fc2 = slim.fully_connected(fc1, 128,
weights_initializer=normc_initializer(1.0),
scope=net + "fc2")
fc3 = slim.fully_connected(fc2, 128,
weights_initializer=normc_initializer(1.0),
scope=net + "fc3")
fc4 = slim.fully_connected(fc3, num_classes,
weights_initializer=normc_initializer(0.01),
activation_fn=None, scope=net + "fc4")
if logstd:
logstd = tf.get_variable(name="logstd", shape=[num_classes],
initializer=tf.zeros_initializer)
return tf.concat(1, [fc4, logstd])
else:
return fc4
@@ -7,9 +7,10 @@ import tensorflow.contrib.slim as slim
def vision_net(inputs, num_classes=10):
conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope="conv1")
conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope="conv2")
fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope="fc1")
fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, scope="fc2")
return tf.squeeze(fc2, [1, 2])
with tf.name_scope("vision_net") as net:
conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope=net + "conv1")
conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope=net + "conv2")
fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope=net + "fc1")
fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, scope=net + "fc2")
return tf.squeeze(fc2, [1, 2])
@@ -91,10 +91,10 @@ def collect_samples(agents, num_timesteps, gamma, lam, horizon,
[agent.compute_trajectory.remote(gamma, lam, horizon)
for agent in agents])
trajectory = concatenate(trajectory_batch)
total_rewards.append(
trajectory["raw_rewards"].sum(axis=0).mean() / len(agents))
trajectory = flatten(trajectory)
not_done = np.logical_not(trajectory["dones"])
total_rewards.append(
trajectory["raw_rewards"][not_done].sum(axis=0).mean() / len(agents))
traj_len_means.append(not_done.sum(axis=0).mean() / len(agents))
trajectory = {key: val[not_done] for key, val in trajectory.items()}
num_timesteps_so_far += len(trajectory["dones"])