From dfc94ce7bcd5d9d008822efdeec17c3f6bb9c606 Mon Sep 17 00:00:00 2001 From: Stefan Pantic Date: Tue, 9 Jul 2019 03:30:32 +0200 Subject: [PATCH] [rllib]Add entropy coeff decay (#5043) --- python/ray/rllib/agents/impala/impala.py | 1 + .../ray/rllib/agents/impala/vtrace_policy.py | 10 +++++-- python/ray/rllib/agents/ppo/appo.py | 1 + python/ray/rllib/agents/ppo/appo_policy.py | 4 +-- python/ray/rllib/agents/ppo/ppo.py | 6 ++-- python/ray/rllib/agents/ppo/ppo_policy.py | 14 +++++++-- python/ray/rllib/policy/tf_policy.py | 29 ++++++++++++++++--- 7 files changed, 51 insertions(+), 14 deletions(-) diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py index b9699888b..f907a74fa 100644 --- a/python/ray/rllib/agents/impala/impala.py +++ b/python/ray/rllib/agents/impala/impala.py @@ -75,6 +75,7 @@ DEFAULT_CONFIG = with_common_config({ # balancing the three losses "vf_loss_coeff": 0.5, "entropy_coeff": 0.01, + "entropy_coeff_schedule": None, # use fake (infinite speed) sampler for testing "_fake_sampler": False, diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index e3fa88348..a13764285 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -15,7 +15,8 @@ from ray.rllib.agents.impala import vtrace from ray.rllib.models.action_dist import Categorical from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.policy.tf_policy import LearningRateSchedule +from ray.rllib.policy.tf_policy import LearningRateSchedule, \ + EntropyCoeffSchedule from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -195,7 +196,7 @@ def build_vtrace_loss(policy, batch_tensors): dist_class=Categorical if is_multidiscrete else policy.dist_class, valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=policy.config["vf_loss_coeff"], - entropy_coeff=policy.config["entropy_coeff"], + entropy_coeff=policy.entropy_coeff, clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"]) @@ -210,6 +211,7 @@ def stats(policy, batch_tensors): "cur_lr": tf.cast(policy.cur_lr, tf.float64), "policy_loss": policy.loss.pi_loss, "entropy": policy.loss.entropy, + "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64), "var_gnorm": tf.global_norm(policy.var_list), "vf_loss": policy.loss.vf_loss, "vf_explained_var": explained_variance( @@ -278,6 +280,8 @@ class ValueNetworkMixin(object): def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], + config["entropy_coeff_schedule"]) ValueNetworkMixin.__init__(policy) @@ -293,5 +297,5 @@ VTraceTFPolicy = build_tf_policy( extra_action_fetches_fn=add_behaviour_logits, before_init=validate_config, before_loss_init=setup_mixins, - mixins=[LearningRateSchedule, ValueNetworkMixin], + mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin], get_batch_divisibility_req=lambda p: p.config["sample_batch_size"]) diff --git a/python/ray/rllib/agents/ppo/appo.py b/python/ray/rllib/agents/ppo/appo.py index 4b0d9945d..f941f25e9 100644 --- a/python/ray/rllib/agents/ppo/appo.py +++ b/python/ray/rllib/agents/ppo/appo.py @@ -46,6 +46,7 @@ DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, { "epsilon": 0.1, "vf_loss_coeff": 0.5, "entropy_coeff": 0.01, + "entropy_coeff_schedule": None, }) # __sphinx_doc_end__ # yapf: enable diff --git a/python/ray/rllib/agents/ppo/appo_policy.py b/python/ray/rllib/agents/ppo/appo_policy.py index b6d3378b8..ad8452162 100644 --- a/python/ray/rllib/agents/ppo/appo_policy.py +++ b/python/ray/rllib/agents/ppo/appo_policy.py @@ -219,7 +219,7 @@ def build_appo_surrogate_loss(policy, batch_tensors): dist_class=Categorical if is_multidiscrete else policy.dist_class, valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=policy.config["vf_loss_coeff"], - entropy_coeff=policy.config["entropy_coeff"], + entropy_coeff=policy.entropy_coeff, clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=policy.config[ "vtrace_clip_pg_rho_threshold"], @@ -238,7 +238,7 @@ def build_appo_surrogate_loss(policy, batch_tensors): value_targets=make_time_major( batch_tensors[Postprocessing.VALUE_TARGETS]), vf_loss_coeff=policy.config["vf_loss_coeff"], - entropy_coeff=policy.config["entropy_coeff"], + entropy_coeff=policy.entropy_coeff, clip_param=policy.config["clip_param"]) return policy.loss.total_loss diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index eea1c5361..e31d74862 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -41,6 +41,8 @@ DEFAULT_CONFIG = with_common_config({ "vf_loss_coeff": 1.0, # Coefficient of the entropy regularizer "entropy_coeff": 0.0, + # Decay schedule for the entropy regularizer + "entropy_coeff_schedule": None, # PPO clip parameter "clip_param": 0.3, # Clip param for the value function. Note that this is sensitive to the @@ -140,11 +142,11 @@ def validate_config(config): raise ValueError( "Minibatch size {} must be <= train batch size {}.".format( config["sgd_minibatch_size"], config["train_batch_size"])) - if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]): + if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]: raise ValueError( "Episode truncation is not supported without a value " "function. Consider setting batch_mode=complete_episodes.") - if (config["multiagent"]["policies"] and not config["simple_optimizer"]): + if config["multiagent"]["policies"] and not config["simple_optimizer"]: logger.info( "In multi-agent mode, policies will be optimized sequentially " "by the multi-GPU optimizer. Consider setting " diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index 1ca54d900..cf99debf3 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -8,7 +8,8 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_policy import LearningRateSchedule +from ray.rllib.policy.tf_policy import LearningRateSchedule, \ + EntropyCoeffSchedule from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance @@ -125,7 +126,7 @@ def ppo_surrogate_loss(policy, batch_tensors): policy.convert_to_eager(policy.value_function), policy.convert_to_eager(policy.kl_coeff), mask, - entropy_coeff=policy.config["entropy_coeff"], + entropy_coeff=policy.convert_to_eager(policy.entropy_coeff), clip_param=policy.config["clip_param"], vf_clip_param=policy.config["vf_clip_param"], vf_loss_coeff=policy.config["vf_loss_coeff"], @@ -147,6 +148,8 @@ def kl_and_loss_stats(policy, batch_tensors): policy.convert_to_eager(policy.value_function)), "kl": policy.loss_obj.mean_kl, "entropy": policy.loss_obj.mean_entropy, + "entropy_coeff": tf.cast( + policy.convert_to_eager(policy.entropy_coeff), tf.float64), } @@ -249,6 +252,8 @@ def setup_config(policy, obs_space, action_space, config): def setup_mixins(policy, obs_space, action_space, config): ValueNetworkMixin.__init__(policy, obs_space, action_space, config) KLCoeffMixin.__init__(policy, config) + EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], + config["entropy_coeff_schedule"]) LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) @@ -262,4 +267,7 @@ PPOTFPolicy = build_tf_policy( gradients_fn=clip_gradients, before_init=setup_config, before_loss_init=setup_mixins, - mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) + mixins=[ + LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, + ValueNetworkMixin + ]) diff --git a/python/ray/rllib/policy/tf_policy.py b/python/ray/rllib/policy/tf_policy.py index a4d456f83..c5a53abbc 100644 --- a/python/ray/rllib/policy/tf_policy.py +++ b/python/ray/rllib/policy/tf_policy.py @@ -2,16 +2,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import errno import logging -import numpy as np +import os +import numpy as np import ray import ray.experimental.tf_utils +from ray.rllib.models.lstm import chop_into_sequences from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.models.lstm import chop_into_sequences from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.debug import log_once, summarize @@ -555,7 +555,7 @@ class LearningRateSchedule(object): @DeveloperAPI def __init__(self, lr, lr_schedule): - self.cur_lr = tf.get_variable("lr", initializer=lr) + self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False) if lr_schedule is None: self.lr_schedule = ConstantSchedule(lr) else: @@ -572,3 +572,24 @@ class LearningRateSchedule(object): @override(TFPolicy) def optimizer(self): return tf.train.AdamOptimizer(self.cur_lr) + + +@DeveloperAPI +class EntropyCoeffSchedule(object): + """Mixin for TFPolicy that adds entropy coeff decay.""" + + @DeveloperAPI + def __init__(self, entropy_coeff, entropy_coeff_schedule): + self.entropy_coeff = tf.get_variable( + "entropy_coeff", initializer=entropy_coeff, trainable=False) + self._entropy_schedule = entropy_coeff_schedule + + @override(Policy) + def on_global_var_update(self, global_vars): + super(EntropyCoeffSchedule, self).on_global_var_update(global_vars) + if self._entropy_schedule is not None: + self.entropy_coeff.load( + self.entropy_coeff.eval(session=self._sess) * + (1 - global_vars["timestep"] / + self.config["entropy_coeff_schedule"]), + session=self._sess)