mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 07:58:26 +08:00
[rllib] Add vf clipping param to fix pendulum example (#2921)
* add vf clip
* fix test
* Update ppo.py
This commit is contained in:
@@ -40,6 +40,9 @@ DEFAULT_CONFIG = with_common_config({
|
||||
"entropy_coeff": 0.0,
|
||||
# PPO clip parameter
|
||||
"clip_param": 0.3,
|
||||
# Clip param for the value function. Note that this is sensitive to the
|
||||
# scale of the rewards. If your expected V is large, increase this.
|
||||
"vf_clip_param": 10.0,
|
||||
# Target value for KL divergence
|
||||
"kl_target": 0.01,
|
||||
# Number of GPUs to use for SGD
|
||||
|
||||
@@ -26,6 +26,7 @@ class PPOLoss(object):
|
||||
cur_kl_coeff,
|
||||
entropy_coeff=0,
|
||||
clip_param=0.1,
|
||||
vf_clip_param=0.1,
|
||||
vf_loss_coeff=1.0,
|
||||
use_gae=True):
|
||||
"""Constructs the loss for the Proximal Policy Optimization (PPO) objective.
|
||||
@@ -49,6 +50,7 @@ class PPOLoss(object):
|
||||
coefficient.
|
||||
entropy_coeff (float): Coefficient of the entropy regularizer.
|
||||
clip_param (float): Clip parameter
|
||||
vf_clip_param (float): Clip parameter for the value function
|
||||
vf_loss_coeff (float): Coefficient of the value function loss
|
||||
use_gae (bool): If true, use the Generalized Advantage Estimator.
|
||||
"""
|
||||
@@ -71,8 +73,8 @@ class PPOLoss(object):
|
||||
|
||||
if use_gae:
|
||||
vf_loss1 = tf.square(value_fn - value_targets)
|
||||
vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
|
||||
-clip_param, clip_param)
|
||||
vf_clipped = vf_preds + tf.clip_by_value(
|
||||
value_fn - vf_preds, -vf_clip_param, vf_clip_param)
|
||||
vf_loss2 = tf.square(vf_clipped - value_targets)
|
||||
vf_loss = tf.maximum(vf_loss1, vf_loss2)
|
||||
self.mean_vf_loss = tf.reduce_mean(vf_loss)
|
||||
@@ -188,6 +190,7 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
self.kl_coeff,
|
||||
entropy_coeff=self.config["entropy_coeff"],
|
||||
clip_param=self.config["clip_param"],
|
||||
vf_clip_param=self.config["vf_clip_param"],
|
||||
vf_loss_coeff=self.config["vf_loss_coeff"],
|
||||
use_gae=self.config["use_gae"])
|
||||
|
||||
|
||||
@@ -4,7 +4,8 @@ pendulum-ppo:
|
||||
run: PPO
|
||||
config:
|
||||
train_batch_size: 2048
|
||||
num_workers: 4
|
||||
vf_clip_param: 10.0
|
||||
num_workers: 2
|
||||
lambda: 0.1
|
||||
gamma: 0.95
|
||||
lr: 0.0003
|
||||
|
||||
@@ -3,10 +3,10 @@ pendulum-ppo:
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: -160
|
||||
# expect -140 within 300-500k steps
|
||||
timesteps_total: 600000
|
||||
config:
|
||||
train_batch_size: 2048
|
||||
vf_clip_param: 10.0
|
||||
num_workers: 4
|
||||
lambda: 0.1
|
||||
gamma: 0.95
|
||||
@@ -15,4 +15,3 @@ pendulum-ppo:
|
||||
num_sgd_iter: 10
|
||||
model:
|
||||
fcnet_hiddens: [64, 64]
|
||||
squash_to_range: True
|
||||
|
||||
Reference in New Issue
Block a user