From df65e87fc71c21bbce837063144250073c1cfa37 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Thu, 3 Aug 2017 16:34:06 -0700
Subject: [PATCH] [rllib] Tune ppo more on control tasks (#777)

* tune ppo on control tasks

* introduce free_logstd model option

* fix

* flag for writing logs

* fixes

* fixes
---
 python/ray/rllib/models/catalog.py            |  1 +
 python/ray/rllib/models/fcnet.py              | 19 ++++++++++++++++++-
 python/ray/rllib/policy_gradient/loss.py      |  2 +-
 .../rllib/policy_gradient/policy_gradient.py  | 13 ++++++++++---
 python/ray/rllib/test.sh                      | 10 ++++++++--
 5 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index 1c1de23cf..a5548643f 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -26,6 +26,7 @@ class ModelCatalog(object):
 
         Args:
             action_space (Space): Action space of the target gym env.
+            dist_type (Optional[str]): Identifier of the action distribution.
 
         Returns:
             dist_class (ActionDistribution): Python class of the distribution.
diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py
index 34d599aea..13b2155f9 100644
--- a/python/ray/rllib/models/fcnet.py
+++ b/python/ray/rllib/models/fcnet.py
@@ -19,13 +19,26 @@ def normc_initializer(std=1.0):
 
 
 class FullyConnectedNetwork(Model):
-    """Generic fully connected network."""
+    """Generic fully connected network.
+
+    Options to construct the network are passed to the _init function.
+    If options["free_logstd"] is True, the last half of the
+    output layer will be free variables that are not dependent on
+    inputs. This is often used if the output of the network is used
+    to parametrize a probability distribution. In this case, the
+    first half of the parameters can be interpreted as a location
+    parameter (like a mean) and the second half can be interpreted as
+    a scale parameter (like a standard deviation).
+    """
 
     def _init(self, inputs, num_outputs, options):
         hiddens = options.get("fcnet_hiddens", [256, 256])
         activation = options.get("fcnet_activation", tf.nn.tanh)
         print("Constructing fcnet {} {}".format(hiddens, activation))
 
+        if options.get("free_logstd", False):
+            num_outputs = num_outputs // 2
+
         with tf.name_scope("fc_net"):
             i = 1
             last_layer = inputs
@@ -40,4 +53,8 @@ class FullyConnectedNetwork(Model):
                 last_layer, num_outputs,
                 weights_initializer=normc_initializer(0.01),
                 activation_fn=None, scope="fc_out")
+            if options.get("free_logstd", False):
+                logstd = tf.get_variable(name="logstd", shape=[num_outputs],
+                                         initializer=tf.zeros_initializer)
+                output = tf.concat([output, 0.0 * output + logstd], 1)
             return output, last_layer
diff --git a/python/ray/rllib/policy_gradient/loss.py b/python/ray/rllib/policy_gradient/loss.py
index ef40f577a..dfb82d071 100644
--- a/python/ray/rllib/policy_gradient/loss.py
+++ b/python/ray/rllib/policy_gradient/loss.py
@@ -22,7 +22,7 @@ class ProximalPolicyLoss(object):
         self.observations = observations
 
         self.curr_logits = ModelCatalog.get_model(
-            observations, logit_dim).outputs
+            observations, logit_dim, config["model"]).outputs
         self.curr_dist = distribution_class(self.curr_logits)
         self.sampler = self.curr_dist.sample()
 
diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/policy_gradient/policy_gradient.py
index df2e2b0ee..47af6eacf 100644
--- a/python/ray/rllib/policy_gradient/policy_gradient.py
+++ b/python/ray/rllib/policy_gradient/policy_gradient.py
@@ -18,6 +18,7 @@ from ray.rllib.policy_gradient.utils import shuffle
 
 
 DEFAULT_CONFIG = {
+    "gamma": 0.995,
     "kl_coeff": 0.2,
     "num_sgd_iter": 30,
     "max_iterations": 1000,
@@ -34,11 +35,13 @@ DEFAULT_CONFIG = {
     "entropy_coeff": 0.0,
     "clip_param": 0.3,
     "kl_target": 0.01,
+    "model": {"free_logstd": False},
     "timesteps_per_batch": 40000,
     "num_agents": 5,
     "full_trace_nth_sgd_batch": -1,
     "full_trace_data_load": False,
     "use_tf_debugger": False,
+    "write_logs": True,  # whether to write checkpoints and TensorFlow logs
     "model_checkpoint_file": "iteration-%s.ckpt"}
 
 
@@ -53,7 +56,9 @@ class PolicyGradient(Algorithm):
             preprocessor = AtariPixelPreprocessor()
         elif self.env_name == "Pong-ram-v3":
             preprocessor = AtariRamPreprocessor()
-        elif self.env_name == "CartPole-v0":
+        elif self.env_name == "CartPole-v0" or self.env_name == "CartPole-v1":
+            preprocessor = NoPreprocessor()
+        elif self.env_name == "Hopper-v1":
             preprocessor = NoPreprocessor()
         elif self.env_name == "Walker2d-v1":
             preprocessor = NoPreprocessor()
@@ -82,12 +87,14 @@ class PolicyGradient(Algorithm):
         j = self.j
         self.j += 1
 
+        print("===> iteration", self.j)
+
         saver = tf.train.Saver(max_to_keep=None)
         if "load_checkpoint" in config:
             saver.restore(model.sess, config["load_checkpoint"])
 
         # TF does not support to write logs to S3 at the moment
-        write_tf_logs = self.logdir.startswith("file")
+        write_tf_logs = config["write_logs"] and self.logdir.startswith("file")
         iter_start = time.time()
         if write_tf_logs:
             file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
@@ -101,7 +108,7 @@ class PolicyGradient(Algorithm):
         weights = ray.put(model.get_weights())
         [a.load_weights.remote(weights) for a in agents]
         trajectory, total_reward, traj_len_mean = collect_samples(
-            agents, config["timesteps_per_batch"], 0.995, 1.0, 2000)
+            agents, config["timesteps_per_batch"], config["gamma"], 1.0, 2000)
         print("total reward is ", total_reward)
         print("trajectory length mean is ", traj_len_mean)
         print("timesteps:", trajectory["dones"].shape[0])
diff --git a/python/ray/rllib/test.sh b/python/ray/rllib/test.sh
index 2e11229cd..f721dc4c4 100755
--- a/python/ray/rllib/test.sh
+++ b/python/ray/rllib/test.sh
@@ -1,7 +1,13 @@
 #!/bin/bash
 
-python train.py --env Walker2d-v1 --alg PolicyGradient --upload-dir s3://bucketname/
-python train.py --env Humanoid-v1 --alg PolicyGradient --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --upload-dir s3://bucketname/
+python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+
+python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+
+python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+
+python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}}' --alg PolicyGradient --upload-dir s3://bucketname/
+
 python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
 python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/
 python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/