From 27cd6ea40171f8d01e516c95d9f010405d3bfd12 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 17 Mar 2019 18:07:37 -0700 Subject: [PATCH] [rllib] Flip sign of A2C, IMPALA entropy coefficient; raise DeprecationWarning if negative (#4374) --- python/ray/rllib/agents/a3c/a3c.py | 5 ++++- python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py | 4 ++-- python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py | 4 ++-- python/ray/rllib/agents/impala/impala.py | 4 +++- python/ray/rllib/agents/impala/vtrace_policy_graph.py | 4 ++-- python/ray/rllib/agents/ppo/appo.py | 2 +- python/ray/rllib/agents/ppo/appo_policy_graph.py | 8 ++++---- python/ray/rllib/agents/ppo/ppo.py | 2 ++ python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml | 2 +- python/ray/rllib/tuned_examples/pong-a3c.yaml | 2 +- 10 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index 43daa0b3e..4eb4988d9 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -27,7 +27,7 @@ DEFAULT_CONFIG = with_common_config({ # Value Function Loss coefficient "vf_loss_coeff": 0.5, # Entropy coefficient - "entropy_coeff": -0.01, + "entropy_coeff": 0.01, # Min time per iteration "min_iter_time_s": 5, # Workers sample async. Note that this increases the effective @@ -54,6 +54,9 @@ class A3CAgent(Agent): else: policy_cls = self._policy_graph + if self.config["entropy_coeff"] < 0: + raise DeprecationWarning("entropy_coeff must be >= 0") + self.local_evaluator = self.make_local_evaluator( self.env_creator, policy_cls) self.remote_evaluators = self.make_remote_evaluators( diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index a5a91abb5..88fba485c 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -26,7 +26,7 @@ class A3CLoss(object): v_target, vf, vf_loss_coeff=0.5, - entropy_coeff=-0.01): + entropy_coeff=0.01): log_prob = action_dist.logp(actions) # The "policy gradients" loss @@ -35,7 +35,7 @@ class A3CLoss(object): delta = vf - v_target self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) self.entropy = tf.reduce_sum(action_dist.entropy()) - self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff - self.entropy * entropy_coeff) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index ed72f653b..6db8c90f2 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -15,7 +15,7 @@ from ray.rllib.utils.annotations import override class A3CLoss(nn.Module): - def __init__(self, policy_model, vf_loss_coeff=0.5, entropy_coeff=-0.01): + def __init__(self, policy_model, vf_loss_coeff=0.5, entropy_coeff=0.01): nn.Module.__init__(self) self.policy_model = policy_model self.vf_loss_coeff = vf_loss_coeff @@ -32,7 +32,7 @@ class A3CLoss(nn.Module): overall_err = sum([ pi_err, self.vf_loss_coeff * value_err, - self.entropy_coeff * entropy, + -self.entropy_coeff * entropy, ]) return overall_err diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py index cf87c773b..200bd07ae 100644 --- a/python/ray/rllib/agents/impala/impala.py +++ b/python/ray/rllib/agents/impala/impala.py @@ -84,7 +84,7 @@ DEFAULT_CONFIG = with_common_config({ "epsilon": 0.1, # balancing the three losses "vf_loss_coeff": 0.5, - "entropy_coeff": -0.01, + "entropy_coeff": 0.01, }) # __sphinx_doc_end__ # yapf: enable @@ -110,6 +110,8 @@ class ImpalaAgent(Agent): self.optimizer = AsyncSamplesOptimizer(self.local_evaluator, self.remote_evaluators, self.config["optimizer"]) + if self.config["entropy_coeff"] < 0: + raise DeprecationWarning("entropy_coeff must be >= 0") @override(Agent) def _train(self): diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py index 9d16c337d..347cde70e 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py +++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py @@ -35,7 +35,7 @@ class VTraceLoss(object): bootstrap_value, valid_mask, vf_loss_coeff=0.5, - entropy_coeff=-0.01, + entropy_coeff=0.01, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0): """Policy gradient loss with vtrace importance weighting. @@ -94,7 +94,7 @@ class VTraceLoss(object): tf.boolean_mask(actions_entropy, valid_mask)) # The summed weighted loss - self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff - self.entropy * entropy_coeff) diff --git a/python/ray/rllib/agents/ppo/appo.py b/python/ray/rllib/agents/ppo/appo.py index d5f9f4fa4..c0ab74b44 100644 --- a/python/ray/rllib/agents/ppo/appo.py +++ b/python/ray/rllib/agents/ppo/appo.py @@ -46,7 +46,7 @@ DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, { "momentum": 0.0, "epsilon": 0.1, "vf_loss_coeff": 0.5, - "entropy_coeff": -0.01, + "entropy_coeff": 0.01, }) # __sphinx_doc_end__ # yapf: enable diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index 378e089c5..55d182c5f 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -48,7 +48,7 @@ class PPOSurrogateLoss(object): advantages, value_targets, vf_loss_coeff=0.5, - entropy_coeff=-0.01, + entropy_coeff=0.01, clip_param=0.3): logp_ratio = tf.exp(actions_logp - prev_actions_logp) @@ -71,7 +71,7 @@ class PPOSurrogateLoss(object): tf.boolean_mask(actions_entropy, valid_mask)) # The summed weighted loss - self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff - self.entropy * entropy_coeff) @@ -91,7 +91,7 @@ class VTraceSurrogateLoss(object): bootstrap_value, valid_mask, vf_loss_coeff=0.5, - entropy_coeff=-0.01, + entropy_coeff=0.01, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0, clip_param=0.3): @@ -152,7 +152,7 @@ class VTraceSurrogateLoss(object): tf.boolean_mask(actions_entropy, valid_mask)) # The summed weighted loss - self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff - self.entropy * entropy_coeff) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 790aa4b65..f39e83ffc 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -150,6 +150,8 @@ class PPOAgent(Agent): return res def _validate_config(self): + if self.config["entropy_coeff"] < 0: + raise DeprecationWarning("entropy_coeff must be >= 0") if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]: raise ValueError( "Minibatch size {} must be <= train batch size {}.".format( diff --git a/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml b/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml index 7aebde727..9268d98cf 100644 --- a/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml +++ b/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml @@ -6,7 +6,7 @@ pong-a3c-pytorch-cnn: sample_batch_size: 20 use_pytorch: true vf_loss_coeff: 0.5 - entropy_coeff: -0.01 + entropy_coeff: 0.01 gamma: 0.99 grad_clip: 40.0 lambda: 1.0 diff --git a/python/ray/rllib/tuned_examples/pong-a3c.yaml b/python/ray/rllib/tuned_examples/pong-a3c.yaml index 5b2f40508..ad3254dda 100644 --- a/python/ray/rllib/tuned_examples/pong-a3c.yaml +++ b/python/ray/rllib/tuned_examples/pong-a3c.yaml @@ -8,7 +8,7 @@ pong-a3c: sample_batch_size: 20 use_pytorch: false vf_loss_coeff: 0.5 - entropy_coeff: -0.01 + entropy_coeff: 0.01 gamma: 0.99 grad_clip: 40.0 lambda: 1.0