[rllib]Add entropy coeff decay (#5043)

2026-06-28 03:34:48 +08:00 · 2019-07-09 03:30:32 +02:00
parent eeb67db861
commit dfc94ce7bc
7 changed files with 51 additions and 14 deletions
@@ -75,6 +75,7 @@ DEFAULT_CONFIG = with_common_config({
    # balancing the three losses
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
+    "entropy_coeff_schedule": None,

    # use fake (infinite speed) sampler for testing
    "_fake_sampler": False,
@@ -15,7 +15,8 @@ from ray.rllib.agents.impala import vtrace
 from ray.rllib.models.action_dist import Categorical
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.tf_policy import LearningRateSchedule
+from ray.rllib.policy.tf_policy import LearningRateSchedule, \
+    EntropyCoeffSchedule
 from ray.rllib.utils.explained_variance import explained_variance
 from ray.rllib.utils import try_import_tf

@@ -195,7 +196,7 @@ def build_vtrace_loss(policy, batch_tensors):
        dist_class=Categorical if is_multidiscrete else policy.dist_class,
        valid_mask=make_time_major(mask, drop_last=True),
        vf_loss_coeff=policy.config["vf_loss_coeff"],
-        entropy_coeff=policy.config["entropy_coeff"],
+        entropy_coeff=policy.entropy_coeff,
        clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])

@@ -210,6 +211,7 @@ def stats(policy, batch_tensors):
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.pi_loss,
        "entropy": policy.loss.entropy,
+        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
        "var_gnorm": tf.global_norm(policy.var_list),
        "vf_loss": policy.loss.vf_loss,
        "vf_explained_var": explained_variance(
@@ -278,6 +280,8 @@ class ValueNetworkMixin(object):

 def setup_mixins(policy, obs_space, action_space, config):
+    """Initialize the mixins on `policy`.
+
+    Passed to build_tf_policy as `before_loss_init`. Each mixin's __init__
+    is called explicitly with `policy` as `self`. The loss reads
+    `policy.entropy_coeff`, which EntropyCoeffSchedule.__init__ creates.
+    `obs_space` and `action_space` are accepted but not used here.
+    """
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
+    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
+                                  config["entropy_coeff_schedule"])
    ValueNetworkMixin.__init__(policy)


@@ -293,5 +297,5 @@ VTraceTFPolicy = build_tf_policy(
    extra_action_fetches_fn=add_behaviour_logits,
    before_init=validate_config,
    before_loss_init=setup_mixins,
-    mixins=[LearningRateSchedule, ValueNetworkMixin],
+    mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
@@ -46,6 +46,7 @@ DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
    "epsilon": 0.1,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
+    "entropy_coeff_schedule": None,
 })
 # __sphinx_doc_end__
 # yapf: enable
@@ -219,7 +219,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
            dist_class=Categorical if is_multidiscrete else policy.dist_class,
            valid_mask=make_time_major(mask, drop_last=True),
            vf_loss_coeff=policy.config["vf_loss_coeff"],
-            entropy_coeff=policy.config["entropy_coeff"],
+            entropy_coeff=policy.entropy_coeff,
            clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=policy.config[
                "vtrace_clip_pg_rho_threshold"],
@@ -238,7 +238,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
            value_targets=make_time_major(
                batch_tensors[Postprocessing.VALUE_TARGETS]),
            vf_loss_coeff=policy.config["vf_loss_coeff"],
-            entropy_coeff=policy.config["entropy_coeff"],
+            entropy_coeff=policy.entropy_coeff,
            clip_param=policy.config["clip_param"])

    return policy.loss.total_loss
@@ -41,6 +41,8 @@ DEFAULT_CONFIG = with_common_config({
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
+    # Decay schedule for the entropy regularizer: the number of timesteps the
+    # global timestep is divided by when decaying (None disables decay)
+    "entropy_coeff_schedule": None,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
@@ -140,11 +142,11 @@ def validate_config(config):
        raise ValueError(
            "Minibatch size {} must be <= train batch size {}.".format(
                config["sgd_minibatch_size"], config["train_batch_size"]))
-    if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]):
+    if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
        raise ValueError(
            "Episode truncation is not supported without a value "
            "function. Consider setting batch_mode=complete_episodes.")
-    if (config["multiagent"]["policies"] and not config["simple_optimizer"]):
+    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
        logger.info(
            "In multi-agent mode, policies will be optimized sequentially "
            "by the multi-GPU optimizer. Consider setting "
@@ -8,7 +8,8 @@ import ray
 from ray.rllib.evaluation.postprocessing import compute_advantages, \
    Postprocessing
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tf_policy import LearningRateSchedule
+from ray.rllib.policy.tf_policy import LearningRateSchedule, \
+    EntropyCoeffSchedule
 from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.utils.explained_variance import explained_variance
@@ -125,7 +126,7 @@ def ppo_surrogate_loss(policy, batch_tensors):
        policy.convert_to_eager(policy.value_function),
        policy.convert_to_eager(policy.kl_coeff),
        mask,
-        entropy_coeff=policy.config["entropy_coeff"],
+        entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
        clip_param=policy.config["clip_param"],
        vf_clip_param=policy.config["vf_clip_param"],
        vf_loss_coeff=policy.config["vf_loss_coeff"],
@@ -147,6 +148,8 @@ def kl_and_loss_stats(policy, batch_tensors):
            policy.convert_to_eager(policy.value_function)),
        "kl": policy.loss_obj.mean_kl,
        "entropy": policy.loss_obj.mean_entropy,
+        "entropy_coeff": tf.cast(
+            policy.convert_to_eager(policy.entropy_coeff), tf.float64),
    }


@@ -249,6 +252,8 @@ def setup_config(policy, obs_space, action_space, config):
 def setup_mixins(policy, obs_space, action_space, config):
+    """Initialize the mixins on `policy`.
+
+    Passed to build_tf_policy as `before_loss_init`. Each mixin's __init__
+    is called explicitly with `policy` as `self`. The surrogate loss reads
+    `policy.entropy_coeff`, which EntropyCoeffSchedule.__init__ creates.
+    """
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)
+    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
+                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


@@ -262,4 +267,7 @@ PPOTFPolicy = build_tf_policy(
    gradients_fn=clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
-    mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin])
+    mixins=[
+        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
+        ValueNetworkMixin
+    ])
@@ -2,16 +2,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
 import errno
 import logging
-import numpy as np
+import os

+import numpy as np
 import ray
 import ray.experimental.tf_utils
+from ray.rllib.models.lstm import chop_into_sequences
 from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.models.lstm import chop_into_sequences
 from ray.rllib.models.modelv2 import ModelV2
 from ray.rllib.utils.annotations import override, DeveloperAPI
 from ray.rllib.utils.debug import log_once, summarize
@@ -555,7 +555,7 @@ class LearningRateSchedule(object):

    @DeveloperAPI
    def __init__(self, lr, lr_schedule):
-        self.cur_lr = tf.get_variable("lr", initializer=lr)
+        self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
        if lr_schedule is None:
            self.lr_schedule = ConstantSchedule(lr)
        else:
@@ -572,3 +572,24 @@ class LearningRateSchedule(object):
    @override(TFPolicy)
    def optimizer(self):
+        """Return an Adam optimizer whose learning rate is `self.cur_lr`."""
        return tf.train.AdamOptimizer(self.cur_lr)
+
+
+@DeveloperAPI
+class EntropyCoeffSchedule(object):
+    """Mixin for TFPolicy that adds entropy coeff decay.
+
+    The coefficient lives in a non-trainable TF variable (`entropy_coeff`)
+    that loss functions read. When a schedule is given, the coefficient is
+    annealed linearly from its initial value to zero over
+    `entropy_coeff_schedule` timesteps and stays at zero afterwards.
+    """
+
+    @DeveloperAPI
+    def __init__(self, entropy_coeff, entropy_coeff_schedule):
+        """Create the coefficient variable and remember the schedule.
+
+        Args:
+            entropy_coeff: Initial value of the entropy coefficient.
+            entropy_coeff_schedule: Number of timesteps over which the
+                coefficient decays to zero, or None for no decay.
+        """
+        self.entropy_coeff = tf.get_variable(
+            "entropy_coeff", initializer=entropy_coeff, trainable=False)
+        # Keep the starting value so every update is computed from it.
+        # Scaling the variable's *current* value would compound the decay
+        # and make the result depend on how often updates are delivered.
+        self._initial_entropy_coeff = entropy_coeff
+        self._entropy_schedule = entropy_coeff_schedule
+
+    @override(Policy)
+    def on_global_var_update(self, global_vars):
+        """Load the decayed coefficient for the current global timestep.
+
+        Args:
+            global_vars: Dict of global variables; `global_vars["timestep"]`
+                is read when a schedule is configured.
+        """
+        super(EntropyCoeffSchedule, self).on_global_var_update(global_vars)
+        if self._entropy_schedule is not None:
+            # Use the schedule stored at construction rather than
+            # self.config, so the guard and the computation cannot disagree.
+            remaining = 1.0 - (
+                float(global_vars["timestep"]) / self._entropy_schedule)
+            # Clamp at zero: past the end of the schedule the factor would
+            # turn negative and flip the sign of the entropy term.
+            self.entropy_coeff.load(
+                self._initial_entropy_coeff * max(0.0, remaining),
+                session=self._sess)