[rllib] Add vf clipping param to fix pendulum example (#2921)

* add vf clip

* fix test

* Update ppo.py
This commit is contained in:
Eric Liang
2018-09-23 13:11:17 -07:00
committed by GitHub
parent 9f9e49e4a1
commit 8331d1ebe0
4 changed files with 11 additions and 5 deletions
+3
View File
@@ -40,6 +40,9 @@ DEFAULT_CONFIG = with_common_config({
"entropy_coeff": 0.0,
# PPO clip parameter
"clip_param": 0.3,
# Clip parameter for the value function. Note that this is sensitive to the
# scale of the rewards: if your expected value (V) is large, increase this.
"vf_clip_param": 10.0,
# Target value for KL divergence
"kl_target": 0.01,
# Number of GPUs to use for SGD
@@ -26,6 +26,7 @@ class PPOLoss(object):
cur_kl_coeff,
entropy_coeff=0,
clip_param=0.1,
vf_clip_param=0.1,
vf_loss_coeff=1.0,
use_gae=True):
"""Constructs the loss for the Proximal Policy Optimization objective.
@@ -49,6 +50,7 @@ class PPOLoss(object):
coefficient.
entropy_coeff (float): Coefficient of the entropy regularizer.
clip_param (float): Clip parameter
vf_clip_param (float): Clip parameter for the value function
vf_loss_coeff (float): Coefficient of the value function loss
use_gae (bool): If true, use the Generalized Advantage Estimator.
"""
@@ -71,8 +73,8 @@ class PPOLoss(object):
if use_gae:
vf_loss1 = tf.square(value_fn - value_targets)
vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
-clip_param, clip_param)
vf_clipped = vf_preds + tf.clip_by_value(
value_fn - vf_preds, -vf_clip_param, vf_clip_param)
vf_loss2 = tf.square(vf_clipped - value_targets)
vf_loss = tf.maximum(vf_loss1, vf_loss2)
self.mean_vf_loss = tf.reduce_mean(vf_loss)
@@ -188,6 +190,7 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
self.kl_coeff,
entropy_coeff=self.config["entropy_coeff"],
clip_param=self.config["clip_param"],
vf_clip_param=self.config["vf_clip_param"],
vf_loss_coeff=self.config["vf_loss_coeff"],
use_gae=self.config["use_gae"])
@@ -4,7 +4,8 @@ pendulum-ppo:
run: PPO
config:
train_batch_size: 2048
num_workers: 4
vf_clip_param: 10.0
num_workers: 2
lambda: 0.1
gamma: 0.95
lr: 0.0003
@@ -3,10 +3,10 @@ pendulum-ppo:
run: PPO
stop:
episode_reward_mean: -160
# Expect episode_reward_mean of about -140 within 300k-500k timesteps
timesteps_total: 600000
config:
train_batch_size: 2048
vf_clip_param: 10.0
num_workers: 4
lambda: 0.1
gamma: 0.95
@@ -15,4 +15,3 @@ pendulum-ppo:
num_sgd_iter: 10
model:
fcnet_hiddens: [64, 64]
squash_to_range: True