From dfc94ce7bcd5d9d008822efdeec17c3f6bb9c606 Mon Sep 17 00:00:00 2001
From: Stefan Pantic <stefanpantic13@gmail.com>
Date: Tue, 9 Jul 2019 03:30:32 +0200
Subject: [PATCH] [rllib]Add entropy coeff decay (#5043)

---
 python/ray/rllib/agents/impala/impala.py      |  1 +
 .../ray/rllib/agents/impala/vtrace_policy.py  | 10 +++++--
 python/ray/rllib/agents/ppo/appo.py           |  1 +
 python/ray/rllib/agents/ppo/appo_policy.py    |  4 +--
 python/ray/rllib/agents/ppo/ppo.py            |  6 ++--
 python/ray/rllib/agents/ppo/ppo_policy.py     | 14 +++++++--
 python/ray/rllib/policy/tf_policy.py          | 29 ++++++++++++++++---
 7 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py
index b9699888b..f907a74fa 100644
--- a/python/ray/rllib/agents/impala/impala.py
+++ b/python/ray/rllib/agents/impala/impala.py
@@ -75,6 +75,7 @@ DEFAULT_CONFIG = with_common_config({
     # balancing the three losses
     "vf_loss_coeff": 0.5,
     "entropy_coeff": 0.01,
+    "entropy_coeff_schedule": None,
 
     # use fake (infinite speed) sampler for testing
     "_fake_sampler": False,
diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py
index e3fa88348..a13764285 100644
--- a/python/ray/rllib/agents/impala/vtrace_policy.py
+++ b/python/ray/rllib/agents/impala/vtrace_policy.py
@@ -15,7 +15,8 @@ from ray.rllib.agents.impala import vtrace
 from ray.rllib.models.action_dist import Categorical
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.tf_policy import LearningRateSchedule
+from ray.rllib.policy.tf_policy import LearningRateSchedule, \
+    EntropyCoeffSchedule
 from ray.rllib.utils.explained_variance import explained_variance
 from ray.rllib.utils import try_import_tf
 
@@ -195,7 +196,7 @@ def build_vtrace_loss(policy, batch_tensors):
         dist_class=Categorical if is_multidiscrete else policy.dist_class,
         valid_mask=make_time_major(mask, drop_last=True),
         vf_loss_coeff=policy.config["vf_loss_coeff"],
-        entropy_coeff=policy.config["entropy_coeff"],
+        entropy_coeff=policy.entropy_coeff,
         clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
         clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])
 
@@ -210,6 +211,7 @@ def stats(policy, batch_tensors):
         "cur_lr": tf.cast(policy.cur_lr, tf.float64),
         "policy_loss": policy.loss.pi_loss,
         "entropy": policy.loss.entropy,
+        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
         "var_gnorm": tf.global_norm(policy.var_list),
         "vf_loss": policy.loss.vf_loss,
         "vf_explained_var": explained_variance(
@@ -278,6 +280,8 @@ class ValueNetworkMixin(object):
 
 def setup_mixins(policy, obs_space, action_space, config):
     LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
+    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
+                                  config["entropy_coeff_schedule"])
     ValueNetworkMixin.__init__(policy)
 
 
@@ -293,5 +297,5 @@ VTraceTFPolicy = build_tf_policy(
     extra_action_fetches_fn=add_behaviour_logits,
     before_init=validate_config,
     before_loss_init=setup_mixins,
-    mixins=[LearningRateSchedule, ValueNetworkMixin],
+    mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
     get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
diff --git a/python/ray/rllib/agents/ppo/appo.py b/python/ray/rllib/agents/ppo/appo.py
index 4b0d9945d..f941f25e9 100644
--- a/python/ray/rllib/agents/ppo/appo.py
+++ b/python/ray/rllib/agents/ppo/appo.py
@@ -46,6 +46,7 @@ DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
     "epsilon": 0.1,
     "vf_loss_coeff": 0.5,
     "entropy_coeff": 0.01,
+    "entropy_coeff_schedule": None,
 })
 # __sphinx_doc_end__
 # yapf: enable
diff --git a/python/ray/rllib/agents/ppo/appo_policy.py b/python/ray/rllib/agents/ppo/appo_policy.py
index b6d3378b8..ad8452162 100644
--- a/python/ray/rllib/agents/ppo/appo_policy.py
+++ b/python/ray/rllib/agents/ppo/appo_policy.py
@@ -219,7 +219,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
             dist_class=Categorical if is_multidiscrete else policy.dist_class,
             valid_mask=make_time_major(mask, drop_last=True),
             vf_loss_coeff=policy.config["vf_loss_coeff"],
-            entropy_coeff=policy.config["entropy_coeff"],
+            entropy_coeff=policy.entropy_coeff,
             clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
             clip_pg_rho_threshold=policy.config[
                 "vtrace_clip_pg_rho_threshold"],
@@ -238,7 +238,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
             value_targets=make_time_major(
                 batch_tensors[Postprocessing.VALUE_TARGETS]),
             vf_loss_coeff=policy.config["vf_loss_coeff"],
-            entropy_coeff=policy.config["entropy_coeff"],
+            entropy_coeff=policy.entropy_coeff,
             clip_param=policy.config["clip_param"])
 
     return policy.loss.total_loss
diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py
index eea1c5361..e31d74862 100644
--- a/python/ray/rllib/agents/ppo/ppo.py
+++ b/python/ray/rllib/agents/ppo/ppo.py
@@ -41,6 +41,8 @@ DEFAULT_CONFIG = with_common_config({
     "vf_loss_coeff": 1.0,
     # Coefficient of the entropy regularizer
     "entropy_coeff": 0.0,
+    # Decay horizon in timesteps for the entropy coefficient (None: no decay)
+    "entropy_coeff_schedule": None,
     # PPO clip parameter
     "clip_param": 0.3,
     # Clip param for the value function. Note that this is sensitive to the
@@ -140,11 +142,11 @@ def validate_config(config):
         raise ValueError(
             "Minibatch size {} must be <= train batch size {}.".format(
                 config["sgd_minibatch_size"], config["train_batch_size"]))
-    if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]):
+    if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
         raise ValueError(
             "Episode truncation is not supported without a value "
             "function. Consider setting batch_mode=complete_episodes.")
-    if (config["multiagent"]["policies"] and not config["simple_optimizer"]):
+    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
         logger.info(
             "In multi-agent mode, policies will be optimized sequentially "
             "by the multi-GPU optimizer. Consider setting "
diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py
index 1ca54d900..cf99debf3 100644
--- a/python/ray/rllib/agents/ppo/ppo_policy.py
+++ b/python/ray/rllib/agents/ppo/ppo_policy.py
@@ -8,7 +8,8 @@ import ray
 from ray.rllib.evaluation.postprocessing import compute_advantages, \
     Postprocessing
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tf_policy import LearningRateSchedule
+from ray.rllib.policy.tf_policy import LearningRateSchedule, \
+    EntropyCoeffSchedule
 from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.utils.explained_variance import explained_variance
@@ -125,7 +126,7 @@ def ppo_surrogate_loss(policy, batch_tensors):
         policy.convert_to_eager(policy.value_function),
         policy.convert_to_eager(policy.kl_coeff),
         mask,
-        entropy_coeff=policy.config["entropy_coeff"],
+        entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
         clip_param=policy.config["clip_param"],
         vf_clip_param=policy.config["vf_clip_param"],
         vf_loss_coeff=policy.config["vf_loss_coeff"],
@@ -147,6 +148,8 @@ def kl_and_loss_stats(policy, batch_tensors):
             policy.convert_to_eager(policy.value_function)),
         "kl": policy.loss_obj.mean_kl,
         "entropy": policy.loss_obj.mean_entropy,
+        "entropy_coeff": tf.cast(
+            policy.convert_to_eager(policy.entropy_coeff), tf.float64),
     }
 
 
@@ -249,6 +252,8 @@ def setup_config(policy, obs_space, action_space, config):
 def setup_mixins(policy, obs_space, action_space, config):
     ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
     KLCoeffMixin.__init__(policy, config)
+    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
+                                  config["entropy_coeff_schedule"])
     LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
 
 
@@ -262,4 +267,7 @@ PPOTFPolicy = build_tf_policy(
     gradients_fn=clip_gradients,
     before_init=setup_config,
     before_loss_init=setup_mixins,
-    mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin])
+    mixins=[
+        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
+        ValueNetworkMixin
+    ])
diff --git a/python/ray/rllib/policy/tf_policy.py b/python/ray/rllib/policy/tf_policy.py
index a4d456f83..c5a53abbc 100644
--- a/python/ray/rllib/policy/tf_policy.py
+++ b/python/ray/rllib/policy/tf_policy.py
@@ -2,16 +2,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import errno
 import logging
-import numpy as np
+import os
 
+import numpy as np
 import ray
 import ray.experimental.tf_utils
+from ray.rllib.models.lstm import chop_into_sequences
 from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.models.lstm import chop_into_sequences
 from ray.rllib.models.modelv2 import ModelV2
 from ray.rllib.utils.annotations import override, DeveloperAPI
 from ray.rllib.utils.debug import log_once, summarize
@@ -555,7 +555,7 @@ class LearningRateSchedule(object):
 
     @DeveloperAPI
     def __init__(self, lr, lr_schedule):
-        self.cur_lr = tf.get_variable("lr", initializer=lr)
+        self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
         if lr_schedule is None:
             self.lr_schedule = ConstantSchedule(lr)
         else:
@@ -572,3 +572,49 @@ class LearningRateSchedule(object):
     @override(TFPolicy)
     def optimizer(self):
         return tf.train.AdamOptimizer(self.cur_lr)
+
+
+@DeveloperAPI
+class EntropyCoeffSchedule(object):
+    """Mixin for TFPolicy that linearly decays the entropy coefficient.
+
+    The coefficient is held in the non-trainable TF variable
+    ``self.entropy_coeff``; its value is reloaded on every global var
+    update. The class using this mixin must provide ``self._sess``.
+    """
+
+    @DeveloperAPI
+    def __init__(self, entropy_coeff, entropy_coeff_schedule):
+        """Create the coefficient variable and remember the decay horizon.
+
+        Arguments:
+            entropy_coeff (float): initial entropy coefficient.
+            entropy_coeff_schedule (float|None): number of timesteps over
+                which the coefficient is annealed to 0, or None to keep it
+                constant.
+
+        Raises:
+            ValueError: if entropy_coeff_schedule is not None and <= 0.
+        """
+        if entropy_coeff_schedule is not None and entropy_coeff_schedule <= 0:
+            raise ValueError(
+                "entropy_coeff_schedule must be a positive number of "
+                "timesteps, got {}".format(entropy_coeff_schedule))
+        # Kept separately from the variable: the decay is always computed
+        # from the initial value, never from the already-decayed one.
+        self._initial_entropy_coeff = entropy_coeff
+        self.entropy_coeff = tf.get_variable(
+            "entropy_coeff", initializer=entropy_coeff, trainable=False)
+        self._entropy_schedule = entropy_coeff_schedule
+
+    @override(Policy)
+    def on_global_var_update(self, global_vars):
+        """Load the decayed coefficient for the current global timestep."""
+        super(EntropyCoeffSchedule, self).on_global_var_update(global_vars)
+        if self._entropy_schedule is not None:
+            horizon = float(self._entropy_schedule)
+            # Clamp at 0 so the entropy bonus never flips sign once the
+            # timestep passes the horizon.
+            remaining = max(0.0, 1.0 - global_vars["timestep"] / horizon)
+            self.entropy_coeff.load(
+                self._initial_entropy_coeff * remaining, session=self._sess)