[rllib]Add entropy coeff decay (#5043)

This commit is contained in:
Stefan Pantic
2019-07-09 03:30:32 +02:00
committed by Eric Liang
parent eeb67db861
commit dfc94ce7bc
7 changed files with 51 additions and 14 deletions
+1
View File
@@ -75,6 +75,7 @@ DEFAULT_CONFIG = with_common_config({
# balancing the three losses
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
# use fake (infinite speed) sampler for testing
"_fake_sampler": False,
@@ -15,7 +15,8 @@ from ray.rllib.agents.impala import vtrace
from ray.rllib.models.action_dist import Categorical
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
EntropyCoeffSchedule
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils import try_import_tf
@@ -195,7 +196,7 @@ def build_vtrace_loss(policy, batch_tensors):
dist_class=Categorical if is_multidiscrete else policy.dist_class,
valid_mask=make_time_major(mask, drop_last=True),
vf_loss_coeff=policy.config["vf_loss_coeff"],
entropy_coeff=policy.config["entropy_coeff"],
entropy_coeff=policy.entropy_coeff,
clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])
@@ -210,6 +211,7 @@ def stats(policy, batch_tensors):
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"entropy": policy.loss.entropy,
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
"var_gnorm": tf.global_norm(policy.var_list),
"vf_loss": policy.loss.vf_loss,
"vf_explained_var": explained_variance(
@@ -278,6 +280,8 @@ class ValueNetworkMixin(object):
def setup_mixins(policy, obs_space, action_space, config):
    """Run the mixin constructors on ``policy`` before the loss is built.

    Initializes, in order, the learning-rate schedule, the entropy
    coefficient schedule and the value-network mixin.  ``obs_space`` and
    ``action_space`` are accepted for signature compatibility but are not
    used here.
    """
    # Learning rate and its (optional) schedule.
    lr, lr_schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, lr, lr_schedule)

    # Entropy coefficient and its (optional) decay horizon.
    coeff, coeff_schedule = (config["entropy_coeff"],
                             config["entropy_coeff_schedule"])
    EntropyCoeffSchedule.__init__(policy, coeff, coeff_schedule)

    ValueNetworkMixin.__init__(policy)
@@ -293,5 +297,5 @@ VTraceTFPolicy = build_tf_policy(
extra_action_fetches_fn=add_behaviour_logits,
before_init=validate_config,
before_loss_init=setup_mixins,
mixins=[LearningRateSchedule, ValueNetworkMixin],
mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
+1
View File
@@ -46,6 +46,7 @@ DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
"epsilon": 0.1,
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
})
# __sphinx_doc_end__
# yapf: enable
+2 -2
View File
@@ -219,7 +219,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
dist_class=Categorical if is_multidiscrete else policy.dist_class,
valid_mask=make_time_major(mask, drop_last=True),
vf_loss_coeff=policy.config["vf_loss_coeff"],
entropy_coeff=policy.config["entropy_coeff"],
entropy_coeff=policy.entropy_coeff,
clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
clip_pg_rho_threshold=policy.config[
"vtrace_clip_pg_rho_threshold"],
@@ -238,7 +238,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
value_targets=make_time_major(
batch_tensors[Postprocessing.VALUE_TARGETS]),
vf_loss_coeff=policy.config["vf_loss_coeff"],
entropy_coeff=policy.config["entropy_coeff"],
entropy_coeff=policy.entropy_coeff,
clip_param=policy.config["clip_param"])
return policy.loss.total_loss
+4 -2
View File
@@ -41,6 +41,8 @@ DEFAULT_CONFIG = with_common_config({
"vf_loss_coeff": 1.0,
# Coefficient of the entropy regularizer
"entropy_coeff": 0.0,
# Decay schedule for the entropy regularizer
"entropy_coeff_schedule": None,
# PPO clip parameter
"clip_param": 0.3,
# Clip param for the value function. Note that this is sensitive to the
@@ -140,11 +142,11 @@ def validate_config(config):
raise ValueError(
"Minibatch size {} must be <= train batch size {}.".format(
config["sgd_minibatch_size"], config["train_batch_size"]))
if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]):
if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
raise ValueError(
"Episode truncation is not supported without a value "
"function. Consider setting batch_mode=complete_episodes.")
if (config["multiagent"]["policies"] and not config["simple_optimizer"]):
if config["multiagent"]["policies"] and not config["simple_optimizer"]:
logger.info(
"In multi-agent mode, policies will be optimized sequentially "
"by the multi-GPU optimizer. Consider setting "
+11 -3
View File
@@ -8,7 +8,8 @@ import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
EntropyCoeffSchedule
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.explained_variance import explained_variance
@@ -125,7 +126,7 @@ def ppo_surrogate_loss(policy, batch_tensors):
policy.convert_to_eager(policy.value_function),
policy.convert_to_eager(policy.kl_coeff),
mask,
entropy_coeff=policy.config["entropy_coeff"],
entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
clip_param=policy.config["clip_param"],
vf_clip_param=policy.config["vf_clip_param"],
vf_loss_coeff=policy.config["vf_loss_coeff"],
@@ -147,6 +148,8 @@ def kl_and_loss_stats(policy, batch_tensors):
policy.convert_to_eager(policy.value_function)),
"kl": policy.loss_obj.mean_kl,
"entropy": policy.loss_obj.mean_entropy,
"entropy_coeff": tf.cast(
policy.convert_to_eager(policy.entropy_coeff), tf.float64),
}
@@ -249,6 +252,8 @@ def setup_config(policy, obs_space, action_space, config):
def setup_mixins(policy, obs_space, action_space, config):
    """Run the PPO mixin constructors on ``policy`` before the loss is built.

    Initializes, in order, the value network, the KL coefficient, the
    entropy coefficient schedule and the learning-rate schedule.
    """
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)

    # Entropy coefficient and its (optional) decay horizon.
    coeff, coeff_schedule = (config["entropy_coeff"],
                             config["entropy_coeff_schedule"])
    EntropyCoeffSchedule.__init__(policy, coeff, coeff_schedule)

    # Learning rate and its (optional) schedule.
    lr, lr_schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, lr, lr_schedule)
@@ -262,4 +267,7 @@ PPOTFPolicy = build_tf_policy(
gradients_fn=clip_gradients,
before_init=setup_config,
before_loss_init=setup_mixins,
mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin])
mixins=[
LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
ValueNetworkMixin
])
+25 -4
View File
@@ -2,16 +2,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import errno
import logging
import numpy as np
import os
import numpy as np
import ray
import ray.experimental.tf_utils
from ray.rllib.models.lstm import chop_into_sequences
from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models.lstm import chop_into_sequences
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils.debug import log_once, summarize
@@ -555,7 +555,7 @@ class LearningRateSchedule(object):
@DeveloperAPI
def __init__(self, lr, lr_schedule):
self.cur_lr = tf.get_variable("lr", initializer=lr)
self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
if lr_schedule is None:
self.lr_schedule = ConstantSchedule(lr)
else:
@@ -572,3 +572,24 @@ class LearningRateSchedule(object):
@override(TFPolicy)
def optimizer(self):
return tf.train.AdamOptimizer(self.cur_lr)
@DeveloperAPI
class EntropyCoeffSchedule(object):
    """Mixin for TFPolicy that adds entropy coeff decay.

    The coefficient is held in the non-trainable TF variable
    ``self.entropy_coeff``.  When a schedule is given, the variable is
    reloaded on every global-var update with a value that decays linearly
    from the initial coefficient towards zero as the global timestep
    approaches ``entropy_coeff_schedule``.
    """

    @DeveloperAPI
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        """Create the coefficient variable and remember the schedule.

        Arguments:
            entropy_coeff: Initial value of the entropy coefficient.
            entropy_coeff_schedule: Timestep count used as the denominator
                of the linear decay, or None to keep the coefficient
                constant.
        """
        self.entropy_coeff = tf.get_variable(
            "entropy_coeff", initializer=entropy_coeff, trainable=False)
        self._entropy_schedule = entropy_coeff_schedule
        # Kept so the decay is always computed from the starting value
        # rather than from whatever the variable currently holds.
        self._initial_entropy_coeff = entropy_coeff

    @override(Policy)
    def on_global_var_update(self, global_vars):
        """Reload the coefficient variable for the current global timestep.

        Arguments:
            global_vars: Dict of global variables; ``"timestep"`` is read.
        """
        super(EntropyCoeffSchedule, self).on_global_var_update(global_vars)
        if self._entropy_schedule is None:
            return

        # Fraction of the initial coefficient still in effect.  Multiplying
        # the *current* variable value by this factor would compound the
        # decay on every call, so the initial value is used instead.  The
        # fraction is clamped at zero so the coefficient never turns
        # negative once the timestep passes the end of the schedule.
        remaining = max(
            0.0,
            1.0 - global_vars["timestep"] / float(self._entropy_schedule))
        self.entropy_coeff.load(
            self._initial_entropy_coeff * remaining, session=self._sess)