[rllib] Add vf clipping param to fix pendulum example (#2921)

* add vf clip * fix test * Update ppo.py
2026-06-29 07:58:26 +08:00 · 2018-09-23 13:11:17 -07:00
parent 9f9e49e4a1
commit 8331d1ebe0
4 changed files with 11 additions and 5 deletions
@@ -40,6 +40,9 @@ DEFAULT_CONFIG = with_common_config({
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
+    # Clip param for the value function. Note that this is sensitive to the
+    # scale of the rewards. If your expected V is large, increase this.
+    "vf_clip_param": 10.0,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Number of GPUs to use for SGD
@@ -26,6 +26,7 @@ class PPOLoss(object):
                 cur_kl_coeff,
                 entropy_coeff=0,
                 clip_param=0.1,
+                 vf_clip_param=0.1,
                 vf_loss_coeff=1.0,
                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.
@@ -49,6 +50,7 @@ class PPOLoss(object):
                coefficient.
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter
+            vf_clip_param (float): Max abs. change of value_fn from vf_preds
            vf_loss_coeff (float): Coefficient of the value function loss
            use_gae (bool): If true, use the Generalized Advantage Estimator.
        """
@@ -71,8 +73,8 @@ class PPOLoss(object):

        if use_gae:
            vf_loss1 = tf.square(value_fn - value_targets)
-            vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
-                                                     -clip_param, clip_param)
+            vf_clipped = vf_preds + tf.clip_by_value(
+                value_fn - vf_preds, -vf_clip_param, vf_clip_param)
            vf_loss2 = tf.square(vf_clipped - value_targets)
            vf_loss = tf.maximum(vf_loss1, vf_loss2)
            self.mean_vf_loss = tf.reduce_mean(vf_loss)
@@ -188,6 +190,7 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
            self.kl_coeff,
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"],
+            vf_clip_param=self.config["vf_clip_param"],
            vf_loss_coeff=self.config["vf_loss_coeff"],
            use_gae=self.config["use_gae"])

@@ -4,7 +4,8 @@ pendulum-ppo:
    run: PPO
    config:
        train_batch_size: 2048
-        num_workers: 4
+        vf_clip_param: 10.0
+        num_workers: 2
        lambda: 0.1
        gamma: 0.95
        lr: 0.0003
@@ -3,10 +3,10 @@ pendulum-ppo:
    run: PPO
    stop:
        episode_reward_mean: -160
-        # expect -140 within 300-500k steps
        timesteps_total: 600000
    config:
        train_batch_size: 2048
+        vf_clip_param: 10.0
        num_workers: 4
        lambda: 0.1
        gamma: 0.95
@@ -15,4 +15,3 @@ pendulum-ppo:
        num_sgd_iter: 10
        model:
            fcnet_hiddens: [64, 64]
-            squash_to_range: True