[rllib] Tune ppo more on control tasks (#777)

* tune ppo on control tasks * introduce free log_std * fix * flag for writing logs * fixes * fixes
2026-06-27 21:38:18 +08:00 · 2017-08-03 16:34:06 -07:00
parent 99badc7ae4
commit df65e87fc7
5 changed files with 38 additions and 7 deletions
@@ -26,6 +26,7 @@ class ModelCatalog(object):

        Args:
            action_space (Space): Action space of the target gym env.
+            dist_type (Optional[str]): Identifier of the action distribution.

        Returns:
            dist_class (ActionDistribution): Python class of the distribution.
@@ -19,13 +19,26 @@ def normc_initializer(std=1.0):


 class FullyConnectedNetwork(Model):
-    """Generic fully connected network."""
+    """Generic fully connected network.
+
+    Options to construct the network are passed to the _init function.
+    If options["free_logstd"] is True, the last half of the
+    output layer will be free variables that are not dependent on
+    inputs. This is often used if the output of the network is used
+    to parametrize a probability distribution. In this case, the
+    first half of the parameters can be interpreted as a location
+    parameter (like a mean) and the second half can be interpreted as
+    a scale parameter (like a standard deviation).
+    """

    def _init(self, inputs, num_outputs, options):
        hiddens = options.get("fcnet_hiddens", [256, 256])
        activation = options.get("fcnet_activation", tf.nn.tanh)
        print("Constructing fcnet {} {}".format(hiddens, activation))

+        if options.get("free_logstd", False):
+            num_outputs = num_outputs // 2
+
        with tf.name_scope("fc_net"):
            i = 1
            last_layer = inputs
@@ -40,4 +53,8 @@ class FullyConnectedNetwork(Model):
                last_layer, num_outputs,
                weights_initializer=normc_initializer(0.01),
                activation_fn=None, scope="fc_out")
+            if options.get("free_logstd", False):
+                logstd = tf.get_variable(name="logstd", shape=[num_outputs],
+                                         initializer=tf.zeros_initializer)
+                output = tf.concat([output, 0.0 * output + logstd], 1)
            return output, last_layer
@@ -22,7 +22,7 @@ class ProximalPolicyLoss(object):
        self.observations = observations

        self.curr_logits = ModelCatalog.get_model(
-            observations, logit_dim).outputs
+            observations, logit_dim, config["model"]).outputs
        self.curr_dist = distribution_class(self.curr_logits)
        self.sampler = self.curr_dist.sample()

@@ -18,6 +18,7 @@ from ray.rllib.policy_gradient.utils import shuffle


 DEFAULT_CONFIG = {
+    "gamma": 0.995,
    "kl_coeff": 0.2,
    "num_sgd_iter": 30,
    "max_iterations": 1000,
@@ -34,11 +35,13 @@ DEFAULT_CONFIG = {
    "entropy_coeff": 0.0,
    "clip_param": 0.3,
    "kl_target": 0.01,
+    "model": {"free_logstd": False},
    "timesteps_per_batch": 40000,
    "num_agents": 5,
    "full_trace_nth_sgd_batch": -1,
    "full_trace_data_load": False,
    "use_tf_debugger": False,
+    "write_logs": True,  # write checkpoints and tensorflow logging?
    "model_checkpoint_file": "iteration-%s.ckpt"}


@@ -53,7 +56,9 @@ class PolicyGradient(Algorithm):
            preprocessor = AtariPixelPreprocessor()
        elif self.env_name == "Pong-ram-v3":
            preprocessor = AtariRamPreprocessor()
-        elif self.env_name == "CartPole-v0":
+        elif self.env_name == "CartPole-v0" or self.env_name == "CartPole-v1":
+            preprocessor = NoPreprocessor()
+        elif self.env_name == "Hopper-v1":
            preprocessor = NoPreprocessor()
        elif self.env_name == "Walker2d-v1":
            preprocessor = NoPreprocessor()
@@ -82,12 +87,14 @@ class PolicyGradient(Algorithm):
        j = self.j
        self.j += 1

+        print("===> iteration", self.j)
+
        saver = tf.train.Saver(max_to_keep=None)
        if "load_checkpoint" in config:
            saver.restore(model.sess, config["load_checkpoint"])

        # TF does not support writing logs to S3 at the moment
-        write_tf_logs = self.logdir.startswith("file")
+        write_tf_logs = config["write_logs"] and self.logdir.startswith("file")
        iter_start = time.time()
        if write_tf_logs:
            file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
@@ -101,7 +108,7 @@ class PolicyGradient(Algorithm):
        weights = ray.put(model.get_weights())
        [a.load_weights.remote(weights) for a in agents]
        trajectory, total_reward, traj_len_mean = collect_samples(
-            agents, config["timesteps_per_batch"], 0.995, 1.0, 2000)
+            agents, config["timesteps_per_batch"], config["gamma"], 1.0, 2000)
        print("total reward is ", total_reward)
        print("trajectory length mean is ", traj_len_mean)
        print("timesteps:", trajectory["dones"].shape[0])
@@ -1,7 +1,13 @@
 #!/bin/bash

-python train.py --env Walker2d-v1 --alg PolicyGradient --upload-dir s3://bucketname/
-python train.py --env Humanoid-v1 --alg PolicyGradient --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --upload-dir s3://bucketname/
+python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+
+python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+
+python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
+
+python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}}' --alg PolicyGradient --upload-dir s3://bucketname/
+
 python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
 python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/
 python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/