[rllib] Tune ppo more on control tasks (#777)

* tune ppo on control tasks

* introduce free log_std

* fix

* flag for writing logs

* fixes

* fixes
This commit is contained in:
Philipp Moritz
2017-08-03 16:34:06 -07:00
committed by Robert Nishihara
parent 99badc7ae4
commit df65e87fc7
5 changed files with 38 additions and 7 deletions
+1
View File
@@ -26,6 +26,7 @@ class ModelCatalog(object):
Args:
action_space (Space): Action space of the target gym env.
dist_type (Optional[str]): Identifier of the action distribution.
Returns:
dist_class (ActionDistribution): Python class of the distribution.
+18 -1
View File
@@ -19,13 +19,26 @@ def normc_initializer(std=1.0):
class FullyConnectedNetwork(Model):
"""Generic fully connected network."""
"""Generic fully connected network.
Options to construct the network are passed to the _init function.
If options["free_logstd"] is True, the second half of the
output layer will be free variables that do not depend on the
inputs. This is often used if the output of the network is used
to parametrize a probability distribution. In this case, the
first half of the parameters can be interpreted as a location
parameter (like a mean) and the second half can be interpreted as
a scale parameter (like a standard deviation).
"""
def _init(self, inputs, num_outputs, options):
hiddens = options.get("fcnet_hiddens", [256, 256])
activation = options.get("fcnet_activation", tf.nn.tanh)
print("Constructing fcnet {} {}".format(hiddens, activation))
if options.get("free_logstd", False):
num_outputs = num_outputs // 2
with tf.name_scope("fc_net"):
i = 1
last_layer = inputs
@@ -40,4 +53,8 @@ class FullyConnectedNetwork(Model):
last_layer, num_outputs,
weights_initializer=normc_initializer(0.01),
activation_fn=None, scope="fc_out")
if options.get("free_logstd", False):
logstd = tf.get_variable(name="logstd", shape=[num_outputs],
initializer=tf.zeros_initializer)
output = tf.concat([output, 0.0 * output + logstd], 1)
return output, last_layer
+1 -1
View File
@@ -22,7 +22,7 @@ class ProximalPolicyLoss(object):
self.observations = observations
self.curr_logits = ModelCatalog.get_model(
observations, logit_dim).outputs
observations, logit_dim, config["model"]).outputs
self.curr_dist = distribution_class(self.curr_logits)
self.sampler = self.curr_dist.sample()
@@ -18,6 +18,7 @@ from ray.rllib.policy_gradient.utils import shuffle
DEFAULT_CONFIG = {
"gamma": 0.995,
"kl_coeff": 0.2,
"num_sgd_iter": 30,
"max_iterations": 1000,
@@ -34,11 +35,13 @@ DEFAULT_CONFIG = {
"entropy_coeff": 0.0,
"clip_param": 0.3,
"kl_target": 0.01,
"model": {"free_logstd": False},
"timesteps_per_batch": 40000,
"num_agents": 5,
"full_trace_nth_sgd_batch": -1,
"full_trace_data_load": False,
"use_tf_debugger": False,
"write_logs": True, # write checkpoints and tensorflow logging?
"model_checkpoint_file": "iteration-%s.ckpt"}
@@ -53,7 +56,9 @@ class PolicyGradient(Algorithm):
preprocessor = AtariPixelPreprocessor()
elif self.env_name == "Pong-ram-v3":
preprocessor = AtariRamPreprocessor()
elif self.env_name == "CartPole-v0":
elif self.env_name == "CartPole-v0" or self.env_name == "CartPole-v1":
preprocessor = NoPreprocessor()
elif self.env_name == "Hopper-v1":
preprocessor = NoPreprocessor()
elif self.env_name == "Walker2d-v1":
preprocessor = NoPreprocessor()
@@ -82,12 +87,14 @@ class PolicyGradient(Algorithm):
j = self.j
self.j += 1
print("===> iteration", self.j)
saver = tf.train.Saver(max_to_keep=None)
if "load_checkpoint" in config:
saver.restore(model.sess, config["load_checkpoint"])
# TF does not support writing logs to S3 at the moment
write_tf_logs = self.logdir.startswith("file")
write_tf_logs = config["write_logs"] and self.logdir.startswith("file")
iter_start = time.time()
if write_tf_logs:
file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
@@ -101,7 +108,7 @@ class PolicyGradient(Algorithm):
weights = ray.put(model.get_weights())
[a.load_weights.remote(weights) for a in agents]
trajectory, total_reward, traj_len_mean = collect_samples(
agents, config["timesteps_per_batch"], 0.995, 1.0, 2000)
agents, config["timesteps_per_batch"], config["gamma"], 1.0, 2000)
print("total reward is ", total_reward)
print("trajectory length mean is ", traj_len_mean)
print("timesteps:", trajectory["dones"].shape[0])
+8 -2
View File
@@ -1,7 +1,13 @@
#!/bin/bash
python train.py --env Walker2d-v1 --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --alg PolicyGradient --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --upload-dir s3://bucketname/
python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}}' --alg PolicyGradient --upload-dir s3://bucketname/
python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/