mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 05:52:54 +08:00
Policy gradient example: record stats for tensorboard (#577)
* add tf metrics
* comments
* fix network scopes
* add doc
* use format string
* fix trace level
* plot intermediate and final sgd stats
* add back a global step
This commit is contained in:
committed by
Philipp Moritz
parent
c440010cbd
commit
06241daf61
@@ -1,5 +1,7 @@
|
||||
# The build output should clearly not be checked in
|
||||
/python/ray/core
|
||||
/python/build
|
||||
/python/dist
|
||||
/src/common/thirdparty/redis
|
||||
/src/numbuf/thirdparty/arrow
|
||||
|
||||
|
||||
@@ -30,5 +30,15 @@ try passing in the ``Pong-v0`` environment or the ``CartPole-v0`` environment.
|
||||
If you wish to use a different environment, you will need to change a few lines
|
||||
in ``example.py``.
|
||||
|
||||
Current and historical training progress can be monitored by pointing
|
||||
TensorBoard to the log output directory as follows.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
tensorboard --logdir=/tmp/ray
|
||||
|
||||
Many of the TensorBoard metrics are also printed to the console, but you might
|
||||
find it easier to visualize and compare between runs using the TensorBoard UI.
|
||||
|
||||
.. _`TensorFlow with GPU support`: https://www.tensorflow.org/install/
|
||||
.. _`code for this example`: https://github.com/ray-project/ray/tree/master/examples/policy_gradient
|
||||
|
||||
@@ -2,8 +2,11 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import argparse
|
||||
import ray
|
||||
import tensorflow as tf
|
||||
|
||||
from reinforce.env import (NoPreprocessor, AtariRamPreprocessor,
|
||||
AtariPixelPreprocessor)
|
||||
@@ -13,12 +16,16 @@ from reinforce.utils import iterate, shuffle
|
||||
|
||||
config = {"kl_coeff": 0.2,
|
||||
"num_sgd_iter": 30,
|
||||
"max_iterations": 1000,
|
||||
"sgd_stepsize": 5e-5,
|
||||
"sgd_batchsize": 128,
|
||||
"entropy_coeff": 0.0,
|
||||
"clip_param": 0.3,
|
||||
"kl_target": 0.01,
|
||||
"timesteps_per_batch": 40000}
|
||||
"timesteps_per_batch": 40000,
|
||||
"num_agents": 5,
|
||||
"tensorboard_log_dir": "/tmp/ray",
|
||||
"trace_level": tf.RunOptions.NO_TRACE}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -47,12 +54,17 @@ if __name__ == "__main__":
|
||||
|
||||
print("Using the environment {}.".format(mdp_name))
|
||||
agents = [RemoteAgent.remote(mdp_name, 1, preprocessor, config, False)
|
||||
for _ in range(5)]
|
||||
for _ in range(config["num_agents"])]
|
||||
agent = Agent(mdp_name, 1, preprocessor, config, True)
|
||||
|
||||
kl_coeff = config["kl_coeff"]
|
||||
|
||||
for j in range(1000):
|
||||
file_writer = tf.summary.FileWriter(
|
||||
'{}/trpo_{}_{}'.format(
|
||||
config["tensorboard_log_dir"], mdp_name, datetime.today()),
|
||||
agent.sess.graph)
|
||||
global_step = 0
|
||||
for j in range(config["max_iterations"]):
|
||||
print("== iteration", j)
|
||||
weights = ray.put(agent.get_weights())
|
||||
[a.load_weights.remote(weights) for a in agents]
|
||||
@@ -61,6 +73,15 @@ if __name__ == "__main__":
|
||||
print("total reward is ", total_reward)
|
||||
print("trajectory length mean is ", traj_len_mean)
|
||||
print("timesteps: ", trajectory["dones"].shape[0])
|
||||
traj_stats = tf.Summary(value=[
|
||||
tf.Summary.Value(
|
||||
tag="policy_gradient/rollouts/mean_reward",
|
||||
simple_value=total_reward),
|
||||
tf.Summary.Value(
|
||||
tag="policy_gradient/rollouts/traj_len_mean",
|
||||
simple_value=traj_len_mean)])
|
||||
file_writer.add_summary(traj_stats, global_step)
|
||||
global_step += 1
|
||||
trajectory["advantages"] = ((trajectory["advantages"] -
|
||||
trajectory["advantages"].mean()) /
|
||||
trajectory["advantages"].std())
|
||||
@@ -73,22 +94,52 @@ if __name__ == "__main__":
|
||||
ppo = agent.ppo
|
||||
for i in range(config["num_sgd_iter"]):
|
||||
# Test on current set of rollouts.
|
||||
run_options = tf.RunOptions(trace_level=config["trace_level"])
|
||||
run_metadata = tf.RunMetadata()
|
||||
loss, kl, entropy = agent.sess.run(
|
||||
[ppo.loss, ppo.mean_kl, ppo.mean_entropy],
|
||||
feed_dict={ppo.observations: trajectory["observations"],
|
||||
ppo.advantages: trajectory["advantages"],
|
||||
ppo.actions: trajectory["actions"].squeeze(),
|
||||
ppo.prev_logits: trajectory["logprobs"],
|
||||
ppo.kl_coeff: kl_coeff})
|
||||
ppo.kl_coeff: kl_coeff},
|
||||
options=run_options,
|
||||
run_metadata=run_metadata)
|
||||
print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))
|
||||
# Run SGD for training on current set of rollouts.
|
||||
for batch in iterate(trajectory, config["sgd_batchsize"]):
|
||||
agent.sess.run([agent.train_op],
|
||||
feed_dict={ppo.observations: batch["observations"],
|
||||
ppo.advantages: batch["advantages"],
|
||||
ppo.actions: batch["actions"].squeeze(),
|
||||
ppo.prev_logits: batch["logprobs"],
|
||||
ppo.kl_coeff: kl_coeff})
|
||||
run_options = tf.RunOptions(trace_level=config["trace_level"])
|
||||
run_metadata = tf.RunMetadata()
|
||||
agent.sess.run(
|
||||
[agent.train_op],
|
||||
feed_dict={ppo.observations: batch["observations"],
|
||||
ppo.advantages: batch["advantages"],
|
||||
ppo.actions: batch["actions"].squeeze(),
|
||||
ppo.prev_logits: batch["logprobs"],
|
||||
ppo.kl_coeff: kl_coeff},
|
||||
options=run_options,
|
||||
run_metadata=run_metadata)
|
||||
values = []
|
||||
if i == config["num_sgd_iter"] - 1:
|
||||
metric_prefix = "policy_gradient/sgd/final_iter/"
|
||||
values.append(tf.Summary.Value(
|
||||
tag=metric_prefix + "kl_coeff",
|
||||
simple_value=kl_coeff))
|
||||
else:
|
||||
metric_prefix = "policy_gradient/sgd/intermediate_iters/"
|
||||
values.extend([
|
||||
tf.Summary.Value(
|
||||
tag=metric_prefix + "mean_entropy",
|
||||
simple_value=entropy),
|
||||
tf.Summary.Value(
|
||||
tag=metric_prefix + "mean_loss",
|
||||
simple_value=loss),
|
||||
tf.Summary.Value(
|
||||
tag=metric_prefix + "mean_kl",
|
||||
simple_value=kl)])
|
||||
sgd_stats = tf.Summary(value=values)
|
||||
file_writer.add_summary(sgd_stats, global_step)
|
||||
global_step += 1
|
||||
if kl > 2.0 * config["kl_target"]:
|
||||
kl_coeff *= 1.5
|
||||
elif kl < 0.5 * config["kl_target"]:
|
||||
|
||||
@@ -21,15 +21,18 @@ class Agent(object):
|
||||
if preprocessor.shape is None:
|
||||
preprocessor.shape = self.env.observation_space.shape
|
||||
self.sess = tf.Session()
|
||||
self.ppo = ProximalPolicyLoss(self.env.observation_space,
|
||||
self.env.action_space, preprocessor, config,
|
||||
self.sess)
|
||||
self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"])
|
||||
self.train_op = self.optimizer.minimize(self.ppo.loss)
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss,
|
||||
self.sess)
|
||||
self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
|
||||
self.reward_filter = MeanStdFilter((), clip=5.0)
|
||||
with tf.name_scope("policy_gradient/train"):
|
||||
with tf.name_scope("proximal_policy_loss"):
|
||||
self.ppo = ProximalPolicyLoss(self.env.observation_space,
|
||||
self.env.action_space, preprocessor,
|
||||
config, self.sess)
|
||||
with tf.name_scope("adam_optimizer"):
|
||||
self.optimizer = tf.train.AdamOptimizer(config["sgd_stepsize"])
|
||||
self.train_op = self.optimizer.minimize(self.ppo.loss)
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.ppo.loss,
|
||||
self.sess)
|
||||
self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
|
||||
self.reward_filter = MeanStdFilter((), clip=5.0)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def get_weights(self):
|
||||
|
||||
@@ -17,21 +17,22 @@ def normc_initializer(std=1.0):
|
||||
|
||||
|
||||
def fc_net(inputs, num_classes=10, logstd=False):
|
||||
fc1 = slim.fully_connected(inputs, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope="fc1")
|
||||
fc2 = slim.fully_connected(fc1, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope="fc2")
|
||||
fc3 = slim.fully_connected(fc2, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope="fc3")
|
||||
fc4 = slim.fully_connected(fc3, num_classes,
|
||||
weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope="fc4")
|
||||
if logstd:
|
||||
logstd = tf.get_variable(name="logstd", shape=[num_classes],
|
||||
initializer=tf.zeros_initializer)
|
||||
return tf.concat(1, [fc4, logstd])
|
||||
else:
|
||||
return fc4
|
||||
with tf.name_scope("fc_net") as net:
|
||||
fc1 = slim.fully_connected(inputs, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope=net + "fc1")
|
||||
fc2 = slim.fully_connected(fc1, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope=net + "fc2")
|
||||
fc3 = slim.fully_connected(fc2, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope=net + "fc3")
|
||||
fc4 = slim.fully_connected(fc3, num_classes,
|
||||
weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope=net + "fc4")
|
||||
if logstd:
|
||||
logstd = tf.get_variable(name="logstd", shape=[num_classes],
|
||||
initializer=tf.zeros_initializer)
|
||||
return tf.concat(1, [fc4, logstd])
|
||||
else:
|
||||
return fc4
|
||||
|
||||
@@ -7,9 +7,10 @@ import tensorflow.contrib.slim as slim
|
||||
|
||||
|
||||
def vision_net(inputs, num_classes=10):
|
||||
conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope="conv1")
|
||||
conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope="conv2")
|
||||
fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope="fc1")
|
||||
fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
|
||||
normalizer_fn=None, scope="fc2")
|
||||
return tf.squeeze(fc2, [1, 2])
|
||||
with tf.name_scope("vision_net") as net:
|
||||
conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope=net + "conv1")
|
||||
conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope=net + "conv2")
|
||||
fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope=net + "fc1")
|
||||
fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
|
||||
normalizer_fn=None, scope=net + "fc2")
|
||||
return tf.squeeze(fc2, [1, 2])
|
||||
|
||||
@@ -91,10 +91,10 @@ def collect_samples(agents, num_timesteps, gamma, lam, horizon,
|
||||
[agent.compute_trajectory.remote(gamma, lam, horizon)
|
||||
for agent in agents])
|
||||
trajectory = concatenate(trajectory_batch)
|
||||
total_rewards.append(
|
||||
trajectory["raw_rewards"].sum(axis=0).mean() / len(agents))
|
||||
trajectory = flatten(trajectory)
|
||||
not_done = np.logical_not(trajectory["dones"])
|
||||
total_rewards.append(
|
||||
trajectory["raw_rewards"][not_done].sum(axis=0).mean() / len(agents))
|
||||
traj_len_means.append(not_done.sum(axis=0).mean() / len(agents))
|
||||
trajectory = {key: val[not_done] for key, val in trajectory.items()}
|
||||
num_timesteps_so_far += len(trajectory["dones"])
|
||||
|
||||
Reference in New Issue
Block a user