diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 369d9db45..41c6db5ba 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -40,6 +40,9 @@ DEFAULT_CONFIG = with_common_config({ "entropy_coeff": 0.0, # PPO clip parameter "clip_param": 0.3, + # Clip param for the value function. Note that this is sensitive to the + # scale of the rewards. If your expected V is large, increase this. + "vf_clip_param": 10.0, # Target value for KL divergence "kl_target": 0.01, # Number of GPUs to use for SGD diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index e23f0a5b3..e6fc90d1c 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -26,6 +26,7 @@ class PPOLoss(object): cur_kl_coeff, entropy_coeff=0, clip_param=0.1, + vf_clip_param=0.1, vf_loss_coeff=1.0, use_gae=True): """Constructs the loss for Proximal Policy Objective. @@ -49,6 +50,7 @@ class PPOLoss(object): coefficient. entropy_coeff (float): Coefficient of the entropy regularizer. clip_param (float): Clip parameter + vf_clip_param (float): Clip parameter for the value function vf_loss_coeff (float): Coefficient of the value function loss use_gae (bool): If true, use the Generalized Advantage Estimator. """ @@ -71,8 +73,8 @@ class PPOLoss(object): if use_gae: vf_loss1 = tf.square(value_fn - value_targets) - vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds, - -clip_param, clip_param) + vf_clipped = vf_preds + tf.clip_by_value( + value_fn - vf_preds, -vf_clip_param, vf_clip_param) vf_loss2 = tf.square(vf_clipped - value_targets) vf_loss = tf.maximum(vf_loss1, vf_loss2) self.mean_vf_loss = tf.reduce_mean(vf_loss) @@ -188,6 +190,7 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): self.kl_coeff, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], + vf_clip_param=self.config["vf_clip_param"], vf_loss_coeff=self.config["vf_loss_coeff"], use_gae=self.config["use_gae"]) diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml index dcb2775fa..60df6825b 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml @@ -4,7 +4,8 @@ pendulum-ppo: run: PPO config: train_batch_size: 2048 - num_workers: 4 + vf_clip_param: 10.0 + num_workers: 2 lambda: 0.1 gamma: 0.95 lr: 0.0003 diff --git a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml index 36830dcd6..8b9d69fce 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml @@ -3,10 +3,10 @@ pendulum-ppo: run: PPO stop: episode_reward_mean: -160 - # expect -140 within 300-500k steps timesteps_total: 600000 config: train_batch_size: 2048 + vf_clip_param: 10.0 num_workers: 4 lambda: 0.1 gamma: 0.95 @@ -15,4 +15,3 @@ pendulum-ppo: num_sgd_iter: 10 model: fcnet_hiddens: [64, 64] - squash_to_range: True