mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 03:34:48 +08:00
[rllib]Add entropy coeff decay (#5043)
This commit is contained in:
committed by
Eric Liang
parent
eeb67db861
commit
dfc94ce7bc
@@ -75,6 +75,7 @@ DEFAULT_CONFIG = with_common_config({
|
||||
# balancing the three losses
|
||||
"vf_loss_coeff": 0.5,
|
||||
"entropy_coeff": 0.01,
|
||||
"entropy_coeff_schedule": None,
|
||||
|
||||
# use fake (infinite speed) sampler for testing
|
||||
"_fake_sampler": False,
|
||||
|
||||
@@ -15,7 +15,8 @@ from ray.rllib.agents.impala import vtrace
|
||||
from ray.rllib.models.action_dist import Categorical
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
|
||||
EntropyCoeffSchedule
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
@@ -195,7 +196,7 @@ def build_vtrace_loss(policy, batch_tensors):
|
||||
dist_class=Categorical if is_multidiscrete else policy.dist_class,
|
||||
valid_mask=make_time_major(mask, drop_last=True),
|
||||
vf_loss_coeff=policy.config["vf_loss_coeff"],
|
||||
entropy_coeff=policy.config["entropy_coeff"],
|
||||
entropy_coeff=policy.entropy_coeff,
|
||||
clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
|
||||
clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])
|
||||
|
||||
@@ -210,6 +211,7 @@ def stats(policy, batch_tensors):
|
||||
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
|
||||
"policy_loss": policy.loss.pi_loss,
|
||||
"entropy": policy.loss.entropy,
|
||||
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
|
||||
"var_gnorm": tf.global_norm(policy.var_list),
|
||||
"vf_loss": policy.loss.vf_loss,
|
||||
"vf_explained_var": explained_variance(
|
||||
@@ -278,6 +280,8 @@ class ValueNetworkMixin(object):
|
||||
|
||||
def setup_mixins(policy, obs_space, action_space, config):
|
||||
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
|
||||
EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
|
||||
config["entropy_coeff_schedule"])
|
||||
ValueNetworkMixin.__init__(policy)
|
||||
|
||||
|
||||
@@ -293,5 +297,5 @@ VTraceTFPolicy = build_tf_policy(
|
||||
extra_action_fetches_fn=add_behaviour_logits,
|
||||
before_init=validate_config,
|
||||
before_loss_init=setup_mixins,
|
||||
mixins=[LearningRateSchedule, ValueNetworkMixin],
|
||||
mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
|
||||
get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
|
||||
|
||||
@@ -46,6 +46,7 @@ DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
|
||||
"epsilon": 0.1,
|
||||
"vf_loss_coeff": 0.5,
|
||||
"entropy_coeff": 0.01,
|
||||
"entropy_coeff_schedule": None,
|
||||
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
@@ -219,7 +219,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
|
||||
dist_class=Categorical if is_multidiscrete else policy.dist_class,
|
||||
valid_mask=make_time_major(mask, drop_last=True),
|
||||
vf_loss_coeff=policy.config["vf_loss_coeff"],
|
||||
entropy_coeff=policy.config["entropy_coeff"],
|
||||
entropy_coeff=policy.entropy_coeff,
|
||||
clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
|
||||
clip_pg_rho_threshold=policy.config[
|
||||
"vtrace_clip_pg_rho_threshold"],
|
||||
@@ -238,7 +238,7 @@ def build_appo_surrogate_loss(policy, batch_tensors):
|
||||
value_targets=make_time_major(
|
||||
batch_tensors[Postprocessing.VALUE_TARGETS]),
|
||||
vf_loss_coeff=policy.config["vf_loss_coeff"],
|
||||
entropy_coeff=policy.config["entropy_coeff"],
|
||||
entropy_coeff=policy.entropy_coeff,
|
||||
clip_param=policy.config["clip_param"])
|
||||
|
||||
return policy.loss.total_loss
|
||||
|
||||
@@ -41,6 +41,8 @@ DEFAULT_CONFIG = with_common_config({
|
||||
"vf_loss_coeff": 1.0,
|
||||
# Coefficient of the entropy regularizer
|
||||
"entropy_coeff": 0.0,
|
||||
# Timestep horizon for entropy coeff decay (None disables decay)
|
||||
"entropy_coeff_schedule": None,
|
||||
# PPO clip parameter
|
||||
"clip_param": 0.3,
|
||||
# Clip param for the value function. Note that this is sensitive to the
|
||||
@@ -140,11 +142,11 @@ def validate_config(config):
|
||||
raise ValueError(
|
||||
"Minibatch size {} must be <= train batch size {}.".format(
|
||||
config["sgd_minibatch_size"], config["train_batch_size"]))
|
||||
if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]):
|
||||
if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
|
||||
raise ValueError(
|
||||
"Episode truncation is not supported without a value "
|
||||
"function. Consider setting batch_mode=complete_episodes.")
|
||||
if (config["multiagent"]["policies"] and not config["simple_optimizer"]):
|
||||
if config["multiagent"]["policies"] and not config["simple_optimizer"]:
|
||||
logger.info(
|
||||
"In multi-agent mode, policies will be optimized sequentially "
|
||||
"by the multi-GPU optimizer. Consider setting "
|
||||
|
||||
@@ -8,7 +8,8 @@ import ray
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
|
||||
EntropyCoeffSchedule
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
@@ -125,7 +126,7 @@ def ppo_surrogate_loss(policy, batch_tensors):
|
||||
policy.convert_to_eager(policy.value_function),
|
||||
policy.convert_to_eager(policy.kl_coeff),
|
||||
mask,
|
||||
entropy_coeff=policy.config["entropy_coeff"],
|
||||
entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
|
||||
clip_param=policy.config["clip_param"],
|
||||
vf_clip_param=policy.config["vf_clip_param"],
|
||||
vf_loss_coeff=policy.config["vf_loss_coeff"],
|
||||
@@ -147,6 +148,8 @@ def kl_and_loss_stats(policy, batch_tensors):
|
||||
policy.convert_to_eager(policy.value_function)),
|
||||
"kl": policy.loss_obj.mean_kl,
|
||||
"entropy": policy.loss_obj.mean_entropy,
|
||||
"entropy_coeff": tf.cast(
|
||||
policy.convert_to_eager(policy.entropy_coeff), tf.float64),
|
||||
}
|
||||
|
||||
|
||||
@@ -249,6 +252,8 @@ def setup_config(policy, obs_space, action_space, config):
|
||||
def setup_mixins(policy, obs_space, action_space, config):
|
||||
ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
|
||||
KLCoeffMixin.__init__(policy, config)
|
||||
EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
|
||||
config["entropy_coeff_schedule"])
|
||||
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
|
||||
|
||||
|
||||
@@ -262,4 +267,7 @@ PPOTFPolicy = build_tf_policy(
|
||||
gradients_fn=clip_gradients,
|
||||
before_init=setup_config,
|
||||
before_loss_init=setup_mixins,
|
||||
mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin])
|
||||
mixins=[
|
||||
LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
|
||||
ValueNetworkMixin
|
||||
])
|
||||
|
||||
@@ -2,16 +2,16 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import errno
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import ray
|
||||
import ray.experimental.tf_utils
|
||||
from ray.rllib.models.lstm import chop_into_sequences
|
||||
from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.models.lstm import chop_into_sequences
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
from ray.rllib.utils.debug import log_once, summarize
|
||||
@@ -555,7 +555,7 @@ class LearningRateSchedule(object):
|
||||
|
||||
@DeveloperAPI
|
||||
def __init__(self, lr, lr_schedule):
|
||||
self.cur_lr = tf.get_variable("lr", initializer=lr)
|
||||
self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
|
||||
if lr_schedule is None:
|
||||
self.lr_schedule = ConstantSchedule(lr)
|
||||
else:
|
||||
@@ -572,3 +572,24 @@ class LearningRateSchedule(object):
|
||||
@override(TFPolicy)
|
||||
def optimizer(self):
|
||||
return tf.train.AdamOptimizer(self.cur_lr)
|
||||
|
||||
|
||||
@DeveloperAPI
class EntropyCoeffSchedule(object):
    """Mixin for TFPolicy that adds entropy coeff decay.

    The coefficient is held in a non-trainable TF variable,
    ``self.entropy_coeff``. When a schedule is given, the coefficient is
    annealed linearly from its initial value to zero as the global timestep
    goes from 0 to ``entropy_coeff_schedule``, and is held at zero after
    that. Without a schedule the variable is never modified.
    """

    @DeveloperAPI
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        """Create the coefficient variable and remember the schedule.

        Arguments:
            entropy_coeff: initial value of the entropy coefficient; used
                as the initializer of the ``entropy_coeff`` variable.
            entropy_coeff_schedule: timestep at which the coefficient
                reaches zero, or None to keep the coefficient constant.
        """
        self.entropy_coeff = tf.get_variable(
            "entropy_coeff", initializer=entropy_coeff, trainable=False)
        # Keep the starting value so every update is derived from it rather
        # than from the already-decayed variable (which would compound the
        # decay and make it depend on how often updates arrive).
        self._initial_entropy_coeff = entropy_coeff
        self._entropy_schedule = entropy_coeff_schedule

    @override(Policy)
    def on_global_var_update(self, global_vars):
        """Load the coefficient for the current global timestep.

        Arguments:
            global_vars: dict of global variables; only ``"timestep"`` is
                read here.
        """
        super(EntropyCoeffSchedule, self).on_global_var_update(global_vars)
        if self._entropy_schedule is None:
            return
        # Fraction of the initial coefficient still in effect. It is clamped
        # at zero so the entropy term never changes sign once the timestep
        # passes the end of the schedule.
        progress = float(global_vars["timestep"]) / self._entropy_schedule
        remaining = max(0.0, 1.0 - progress)
        # NOTE(review): self._sess is expected to be provided by the policy
        # class this mixin is combined with -- confirm against TFPolicy.
        self.entropy_coeff.load(
            self._initial_entropy_coeff * remaining, session=self._sess)
|
||||
|
||||
Reference in New Issue
Block a user