mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 03:04:28 +08:00
[rllib] TD3/DDPG improvements and MuJoCo benchmarks (#4694)
* [rllib] Separate optimisers for DDPG actor & critic.
* [rllib] Better names for DDPG variables & options.
  Config changes:
  - noise_scale -> exploration_ou_noise_scale
  - exploration_theta -> exploration_ou_theta
  - exploration_sigma -> exploration_ou_sigma
  - act_noise -> exploration_gaussian_sigma
  - noise_clip -> target_noise_clip
* [rllib] Make DDPG less class-y.
  Used functions to replace three classes that each had only an __init__ method & a handful of unrelated attributes.
* [rllib] Refactor DDPG noise.
* [rllib] Unify DDPG exploration annealing.
  Added option "exploration_should_anneal" to enable linear annealing of exploration noise. By default this is off, for consistency with the DDPG & TD3 papers. Also renamed "exploration_final_eps" to "exploration_final_scale" (that name seems to have been carried over from DQN, and doesn't really make sense here). Finally, tried to rename "eps" to "noise_scale" wherever possible.
This commit is contained in:
@@ -4,6 +4,7 @@ from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
|
||||
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.agents.ddpg.td3 import TD3Trainer
|
||||
from ray.rllib.utils import renamed_class
|
||||
|
||||
ApexDDPGAgent = renamed_class(ApexDDPGTrainer)
|
||||
@@ -11,5 +12,5 @@ DDPGAgent = renamed_class(DDPGTrainer)
|
||||
|
||||
__all__ = [
|
||||
"DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
|
||||
"DEFAULT_CONFIG"
|
||||
"TD3Trainer", "DEFAULT_CONFIG"
|
||||
]
|
||||
|
||||
@@ -13,19 +13,21 @@ from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
|
||||
DEFAULT_CONFIG = with_common_config({
|
||||
# === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
|
||||
# TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
|
||||
# In addition to settings below, you can use "exploration_noise_type" and
|
||||
# "exploration_gaussian_sigma" to get IID Gaussian exploration noise
|
||||
# instead of OU exploration noise.
|
||||
# twin Q-net
|
||||
"twin_q": False,
|
||||
# delayed policy update
|
||||
"policy_delay": 1,
|
||||
# target policy smoothing
|
||||
# this also forces the use of gaussian instead of OU noise for exploration
|
||||
# (this also replaces OU exploration noise with IID Gaussian exploration
|
||||
# noise, for now)
|
||||
"smooth_target_policy": False,
|
||||
# gaussian stddev of act noise
|
||||
"act_noise": 0.1,
|
||||
# gaussian stddev of target noise
|
||||
# gaussian stddev of target action noise for smoothing
|
||||
"target_noise": 0.2,
|
||||
# target noise limit (bound)
|
||||
"noise_clip": 0.5,
|
||||
"target_noise_clip": 0.5,
|
||||
|
||||
# === Evaluation ===
|
||||
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
|
||||
@@ -37,42 +39,64 @@ DEFAULT_CONFIG = with_common_config({
|
||||
"evaluation_num_episodes": 10,
|
||||
|
||||
# === Model ===
|
||||
# Postprocess the policy network model output with these hidden layers
|
||||
"actor_hiddens": [64, 64],
|
||||
# Hidden layers activation of the policy network
|
||||
# Apply a state preprocessor with spec given by the "model" config option
|
||||
# (like other RL algorithms). This is mostly useful if you have a weird
|
||||
# observation shape, like an image. Disabled by default.
|
||||
"use_state_preprocessor": False,
|
||||
# Postprocess the policy network model output with these hidden layers. If
|
||||
# use_state_preprocessor is False, then these will be the *only* hidden
|
||||
# layers in the network.
|
||||
"actor_hiddens": [400, 300],
|
||||
# Hidden layers activation of the postprocessing stage of the policy
|
||||
# network
|
||||
"actor_hidden_activation": "relu",
|
||||
# Postprocess the critic network model output with these hidden layers
|
||||
"critic_hiddens": [64, 64],
|
||||
# Hidden layers activation of the critic network
|
||||
# Postprocess the critic network model output with these hidden layers;
|
||||
# again, if use_state_preprocessor is True, then the state will be
|
||||
# preprocessed by the model specified with the "model" config option first.
|
||||
"critic_hiddens": [400, 300],
|
||||
# Hidden layers activation of the postprocessing stage of the critic.
|
||||
"critic_hidden_activation": "relu",
|
||||
# N-step Q learning
|
||||
"n_step": 1,
|
||||
|
||||
# === Exploration ===
|
||||
# Max num timesteps for annealing schedules. Exploration is annealed from
|
||||
# 1.0 to exploration_fraction over this number of timesteps scaled by
|
||||
# exploration_fraction
|
||||
# Turns on annealing schedule for exploration noise. Exploration is
|
||||
# annealed from 1.0 to exploration_final_scale over schedule_max_timesteps
|
||||
# scaled by exploration_fraction. Original DDPG and TD3 papers do not
|
||||
# anneal noise, so this is False by default.
|
||||
"exploration_should_anneal": False,
|
||||
# Max num timesteps for annealing schedules.
|
||||
"schedule_max_timesteps": 100000,
|
||||
# Number of env steps to optimize for before returning
|
||||
"timesteps_per_iteration": 1000,
|
||||
# Fraction of entire training period over which the exploration rate is
|
||||
# annealed
|
||||
"exploration_fraction": 0.1,
|
||||
# Final value of random action probability
|
||||
"exploration_final_eps": 0.02,
|
||||
# OU-noise scale
|
||||
"noise_scale": 0.1,
|
||||
# theta
|
||||
"exploration_theta": 0.15,
|
||||
# sigma
|
||||
"exploration_sigma": 0.2,
|
||||
# Update the target network every `target_network_update_freq` steps.
|
||||
"target_network_update_freq": 0,
|
||||
# Update the target by \tau * policy + (1-\tau) * target_policy
|
||||
"tau": 0.002,
|
||||
# Final scaling multiplier for action noise (initial is 1.0)
|
||||
"exploration_final_scale": 0.02,
|
||||
# valid values: "ou" (time-correlated, like original DDPG paper),
|
||||
# "gaussian" (IID, like TD3 paper)
|
||||
"exploration_noise_type": "ou",
|
||||
# OU-noise scale; this can be used to scale down magnitude of OU noise
|
||||
# before adding to actions (requires "exploration_noise_type" to be "ou")
|
||||
"exploration_ou_noise_scale": 0.1,
|
||||
# theta for OU
|
||||
"exploration_ou_theta": 0.15,
|
||||
# sigma for OU
|
||||
"exploration_ou_sigma": 0.2,
|
||||
# gaussian stddev of act noise for exploration (requires
|
||||
# "exploration_noise_type" to be "gaussian")
|
||||
"exploration_gaussian_sigma": 0.1,
|
||||
# If True parameter space noise will be used for exploration
|
||||
# See https://blog.openai.com/better-exploration-with-parameter-noise/
|
||||
"parameter_noise": False,
|
||||
# Until this many timesteps have elapsed, the agent's policy will be
|
||||
# ignored & it will instead take uniform random actions. Can be used in
|
||||
# conjunction with learning_starts (which controls when the first
|
||||
# optimization step happens) to decrease dependence of exploration &
|
||||
# optimization on initial policy parameters. Note that this will be
|
||||
# disabled when the action noise scale is set to 0 (e.g during evaluation).
|
||||
"pure_exploration_steps": 1000,
|
||||
|
||||
# === Replay buffer ===
|
||||
# Size of the replay buffer. Note that if async_updates is set, then
|
||||
@@ -90,11 +114,14 @@ DEFAULT_CONFIG = with_common_config({
|
||||
"compress_observations": False,
|
||||
|
||||
# === Optimization ===
|
||||
# Learning rate for adam optimizer.
|
||||
# Instead of using two optimizers, we use two different loss coefficients
|
||||
"lr": 1e-3,
|
||||
"actor_loss_coeff": 0.1,
|
||||
"critic_loss_coeff": 1.0,
|
||||
# Learning rate for the critic (Q-function) optimizer.
|
||||
"critic_lr": 1e-3,
|
||||
# Learning rate for the actor (policy) optimizer.
|
||||
"actor_lr": 1e-3,
|
||||
# Update the target network every `target_network_update_freq` steps.
|
||||
"target_network_update_freq": 0,
|
||||
# Update the target by \tau * policy + (1-\tau) * target_policy
|
||||
"tau": 0.002,
|
||||
# If True, use huber loss instead of squared loss for critic network
|
||||
# Conventionally, no need to clip gradients if using a huber loss
|
||||
"use_huber": False,
|
||||
@@ -117,7 +144,7 @@ DEFAULT_CONFIG = with_common_config({
|
||||
# === Parallelism ===
|
||||
# Number of workers for collecting samples with. This only makes sense
|
||||
# to increase if your environment is particularly slow to sample, or if
|
||||
# you"re using the Async or Ape-X optimizers.
|
||||
# you're using the Async or Ape-X optimizers.
|
||||
"num_workers": 0,
|
||||
# Optimizer class to use.
|
||||
"optimizer_class": "SyncReplayOptimizer",
|
||||
@@ -138,26 +165,41 @@ class DDPGTrainer(DQNTrainer):
|
||||
_default_config = DEFAULT_CONFIG
|
||||
_policy_graph = DDPGPolicyGraph
|
||||
|
||||
@override(DQNTrainer)
|
||||
def _train(self):
|
||||
pure_expl_steps = self.config["pure_exploration_steps"]
|
||||
if pure_expl_steps:
|
||||
# tell workers whether they should do pure exploration
|
||||
only_explore = self.global_timestep < pure_expl_steps
|
||||
self.local_evaluator.foreach_trainable_policy(
|
||||
lambda p, _: p.set_pure_exploration_phase(only_explore))
|
||||
for e in self.remote_evaluators:
|
||||
e.foreach_trainable_policy.remote(
|
||||
lambda p, _: p.set_pure_exploration_phase(only_explore))
|
||||
return super(DDPGTrainer, self)._train()
|
||||
|
||||
@override(DQNTrainer)
|
||||
def _make_exploration_schedule(self, worker_index):
|
||||
# Override DQN's schedule to take into account `noise_scale`
|
||||
# Override DQN's schedule to take into account
|
||||
# `exploration_ou_noise_scale`
|
||||
if self.config["per_worker_exploration"]:
|
||||
assert self.config["num_workers"] > 1, \
|
||||
"This requires multiple workers"
|
||||
if worker_index >= 0:
|
||||
exponent = (
|
||||
1 +
|
||||
worker_index / float(self.config["num_workers"] - 1) * 7)
|
||||
return ConstantSchedule(
|
||||
self.config["noise_scale"] * 0.4**exponent)
|
||||
# FIXME: what do magic constants mean? (0.4, 7)
|
||||
max_index = float(self.config["num_workers"] - 1)
|
||||
exponent = 1 + worker_index / max_index * 7
|
||||
return ConstantSchedule(0.4**exponent)
|
||||
else:
|
||||
# local ev should have zero exploration so that eval rollouts
|
||||
# run properly
|
||||
return ConstantSchedule(0.0)
|
||||
else:
|
||||
elif self.config["exploration_should_anneal"]:
|
||||
return LinearSchedule(
|
||||
schedule_timesteps=int(self.config["exploration_fraction"] *
|
||||
self.config["schedule_max_timesteps"]),
|
||||
initial_p=self.config["noise_scale"] * 1.0,
|
||||
final_p=self.config["noise_scale"] *
|
||||
self.config["exploration_final_eps"])
|
||||
initial_p=1.0,
|
||||
final_p=self.config["exploration_final_scale"])
|
||||
else:
|
||||
# *always* add exploration noise
|
||||
return ConstantSchedule(1.0)
|
||||
|
||||
@@ -19,80 +19,18 @@ from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.evaluation.policy_graph import PolicyGraph
|
||||
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
ACTION_SCOPE = "a_func"
|
||||
POLICY_SCOPE = "p_func"
|
||||
POLICY_TARGET_SCOPE = "target_p_func"
|
||||
Q_SCOPE = "q_func"
|
||||
Q_TARGET_SCOPE = "target_q_func"
|
||||
TWIN_Q_SCOPE = "twin_q_func"
|
||||
TWIN_Q_TARGET_SCOPE = "twin_target_q_func"
|
||||
ACTION_SCOPE = "action"
|
||||
POLICY_SCOPE = "policy"
|
||||
POLICY_TARGET_SCOPE = "target_policy"
|
||||
Q_SCOPE = "critic"
|
||||
Q_TARGET_SCOPE = "target_critic"
|
||||
TWIN_Q_SCOPE = "twin_critic"
|
||||
TWIN_Q_TARGET_SCOPE = "twin_target_critic"
|
||||
|
||||
# Importance sampling weights for prioritized replay
|
||||
PRIO_WEIGHTS = "weights"
|
||||
|
||||
|
||||
class ActorCriticLoss(object):
|
||||
def __init__(self,
|
||||
q_t,
|
||||
q_tp1,
|
||||
q_tp0,
|
||||
importance_weights,
|
||||
rewards,
|
||||
done_mask,
|
||||
twin_q_t,
|
||||
twin_q_tp1,
|
||||
actor_loss_coeff=0.1,
|
||||
critic_loss_coeff=1.0,
|
||||
gamma=0.99,
|
||||
n_step=1,
|
||||
use_huber=False,
|
||||
huber_threshold=1.0,
|
||||
twin_q=False,
|
||||
policy_delay=1):
|
||||
|
||||
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
|
||||
if twin_q:
|
||||
twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
|
||||
q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
|
||||
|
||||
q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
|
||||
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
|
||||
|
||||
# compute RHS of bellman equation
|
||||
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
|
||||
|
||||
# compute the error (potentially clipped)
|
||||
if twin_q:
|
||||
td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
|
||||
twin_td_error = twin_q_t_selected - tf.stop_gradient(
|
||||
q_t_selected_target)
|
||||
self.td_error = td_error + twin_td_error
|
||||
if use_huber:
|
||||
errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
|
||||
twin_td_error, huber_threshold)
|
||||
else:
|
||||
errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
|
||||
twin_td_error)
|
||||
else:
|
||||
self.td_error = (
|
||||
q_t_selected - tf.stop_gradient(q_t_selected_target))
|
||||
if use_huber:
|
||||
errors = _huber_loss(self.td_error, huber_threshold)
|
||||
else:
|
||||
errors = 0.5 * tf.square(self.td_error)
|
||||
|
||||
self.critic_loss = critic_loss_coeff * tf.reduce_mean(
|
||||
importance_weights * errors)
|
||||
|
||||
# for policy gradient, update policy net one time v.s.
|
||||
# update critic net `policy_delay` time(s)
|
||||
global_step = tf.train.get_or_create_global_step()
|
||||
policy_delay_mask = tf.to_float(
|
||||
tf.equal(tf.mod(global_step, policy_delay), 0))
|
||||
self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
|
||||
tf.reduce_mean(q_tp0))
|
||||
|
||||
|
||||
class DDPGPostprocessing(object):
|
||||
"""Implements n-step learning and param noise adjustments."""
|
||||
|
||||
@@ -113,12 +51,13 @@ class DDPGPostprocessing(object):
|
||||
feed_dict={
|
||||
self.cur_observations: states,
|
||||
self.stochastic: False,
|
||||
self.eps: .0
|
||||
self.noise_scale: .0,
|
||||
self.pure_exploration_phase: False,
|
||||
})
|
||||
distance_in_action_space = np.sqrt(
|
||||
np.mean(np.square(clean_actions - noisy_actions)))
|
||||
self.pi_distance = distance_in_action_space
|
||||
if distance_in_action_space < self.config["exploration_sigma"]:
|
||||
if distance_in_action_space < self.config["exploration_ou_sigma"]:
|
||||
self.parameter_noise_sigma_val *= 1.01
|
||||
else:
|
||||
self.parameter_noise_sigma_val /= 1.01
|
||||
@@ -128,107 +67,6 @@ class DDPGPostprocessing(object):
|
||||
return _postprocess_dqn(self, sample_batch)
|
||||
|
||||
|
||||
class PolicyNetwork(object):
|
||||
"""Maps an observations (i.e., state) to an action where each entry takes
|
||||
value from (0, 1) due to the sigmoid function."""
|
||||
|
||||
def __init__(self,
|
||||
model,
|
||||
dim_actions,
|
||||
hiddens=[64, 64],
|
||||
activation="relu",
|
||||
parameter_noise=False):
|
||||
action_out = model.last_layer
|
||||
activation = tf.nn.__dict__[activation]
|
||||
for hidden in hiddens:
|
||||
action_out = layers.fully_connected(
|
||||
action_out,
|
||||
num_outputs=hidden,
|
||||
activation_fn=activation,
|
||||
normalizer_fn=layers.layer_norm if parameter_noise else None)
|
||||
# Use sigmoid layer to bound values within (0, 1)
|
||||
# shape of action_scores is [batch_size, dim_actions]
|
||||
self.action_scores = layers.fully_connected(
|
||||
action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
|
||||
self.model = model
|
||||
|
||||
|
||||
class ActionNetwork(object):
|
||||
"""Acts as a stochastic policy for inference, but a deterministic policy
|
||||
for training, thus ignoring the batch_size issue when constructing a
|
||||
stochastic action."""
|
||||
|
||||
def __init__(self,
|
||||
p_values,
|
||||
low_action,
|
||||
high_action,
|
||||
stochastic,
|
||||
eps,
|
||||
theta=0.15,
|
||||
sigma=0.2,
|
||||
use_gaussian_noise=False,
|
||||
act_noise=0.1,
|
||||
is_target=False,
|
||||
target_noise=0.2,
|
||||
noise_clip=0.5,
|
||||
parameter_noise=False):
|
||||
|
||||
# shape is [None, dim_action]
|
||||
deterministic_actions = (
|
||||
(high_action - low_action) * p_values + low_action)
|
||||
|
||||
if use_gaussian_noise:
|
||||
if is_target:
|
||||
normal_sample = tf.random_normal(
|
||||
tf.shape(deterministic_actions), stddev=target_noise)
|
||||
normal_sample = tf.clip_by_value(normal_sample, -noise_clip,
|
||||
noise_clip)
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + normal_sample, low_action,
|
||||
high_action)
|
||||
else:
|
||||
normal_sample = tf.random_normal(
|
||||
tf.shape(deterministic_actions), stddev=act_noise)
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + normal_sample, low_action,
|
||||
high_action)
|
||||
else:
|
||||
exploration_sample = tf.get_variable(
|
||||
name="ornstein_uhlenbeck",
|
||||
dtype=tf.float32,
|
||||
initializer=low_action.size * [.0],
|
||||
trainable=False)
|
||||
normal_sample = tf.random_normal(
|
||||
shape=[low_action.size], mean=0.0, stddev=1.0)
|
||||
exploration_value = tf.assign_add(
|
||||
exploration_sample,
|
||||
theta * (.0 - exploration_sample) + sigma * normal_sample)
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions +
|
||||
eps * (high_action - low_action) * exploration_value,
|
||||
low_action, high_action)
|
||||
|
||||
self.actions = tf.cond(
|
||||
tf.logical_and(stochastic, not parameter_noise),
|
||||
lambda: stochastic_actions, lambda: deterministic_actions)
|
||||
|
||||
|
||||
class QNetwork(object):
|
||||
def __init__(self,
|
||||
model,
|
||||
action_inputs,
|
||||
hiddens=[64, 64],
|
||||
activation="relu"):
|
||||
q_out = tf.concat([model.last_layer, action_inputs], axis=1)
|
||||
activation = tf.nn.__dict__[activation]
|
||||
for hidden in hiddens:
|
||||
q_out = layers.fully_connected(
|
||||
q_out, num_outputs=hidden, activation_fn=activation)
|
||||
self.value = layers.fully_connected(
|
||||
q_out, num_outputs=1, activation_fn=None)
|
||||
self.model = model
|
||||
|
||||
|
||||
class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
def __init__(self, observation_space, action_space, config):
|
||||
config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
|
||||
@@ -238,7 +76,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
action_space))
|
||||
|
||||
self.config = config
|
||||
self.cur_epsilon = 1.0
|
||||
self.cur_noise_scale = 1.0
|
||||
self.cur_pure_exploration_phase = False
|
||||
self.dim_actions = action_space.shape[0]
|
||||
self.low_action = action_space.low
|
||||
self.high_action = action_space.high
|
||||
@@ -246,30 +85,38 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
# create global step for counting the number of update operations
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
|
||||
# use separate optimizers for actor & critic
|
||||
self._actor_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=self.config["actor_lr"])
|
||||
self._critic_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=self.config["critic_lr"])
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
self.eps = tf.placeholder(tf.float32, (), name="eps")
|
||||
self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
|
||||
self.pure_exploration_phase = tf.placeholder(
|
||||
tf.bool, (), name="pure_exploration_phase")
|
||||
self.cur_observations = tf.placeholder(
|
||||
tf.float32,
|
||||
shape=(None, ) + observation_space.shape,
|
||||
name="cur_obs")
|
||||
|
||||
# Actor: P (policy) network
|
||||
with tf.variable_scope(POLICY_SCOPE) as scope:
|
||||
p_values, self.p_model = self._build_p_network(
|
||||
policy_out, self.policy_model = self._build_policy_network(
|
||||
self.cur_observations, observation_space, action_space)
|
||||
self.p_func_vars = _scope_vars(scope.name)
|
||||
self.policy_vars = _scope_vars(scope.name)
|
||||
|
||||
# Noise vars for P network except for layer normalization vars
|
||||
if self.config["parameter_noise"]:
|
||||
self._build_parameter_noise([
|
||||
var for var in self.p_func_vars if "LayerNorm" not in var.name
|
||||
var for var in self.policy_vars if "LayerNorm" not in var.name
|
||||
])
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(ACTION_SCOPE):
|
||||
self.output_actions = self._build_action_network(
|
||||
p_values, self.stochastic, self.eps)
|
||||
self.output_actions = self._add_exploration_noise(
|
||||
policy_out, self.stochastic, self.noise_scale,
|
||||
self.pure_exploration_phase, action_space)
|
||||
|
||||
if self.config["smooth_target_policy"]:
|
||||
self.reset_noise_op = tf.no_op()
|
||||
@@ -293,37 +140,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
self.importance_weights = tf.placeholder(
|
||||
tf.float32, [None], name="weight")
|
||||
|
||||
# p network evaluation
|
||||
# policy network evaluation
|
||||
with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
|
||||
prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
self.p_t, _ = self._build_p_network(self.obs_t, observation_space,
|
||||
action_space)
|
||||
p_batchnorm_update_ops = list(
|
||||
self.policy_t, _ = self._build_policy_network(
|
||||
self.obs_t, observation_space, action_space)
|
||||
policy_batchnorm_update_ops = list(
|
||||
set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
|
||||
prev_update_ops)
|
||||
|
||||
# target p network evaluation
|
||||
# target policy network evaluation
|
||||
with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
|
||||
p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space,
|
||||
action_space)
|
||||
target_p_func_vars = _scope_vars(scope.name)
|
||||
policy_tp1, _ = self._build_policy_network(
|
||||
self.obs_tp1, observation_space, action_space)
|
||||
target_policy_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(ACTION_SCOPE, reuse=True):
|
||||
output_actions = self._build_action_network(
|
||||
self.p_t,
|
||||
stochastic=tf.constant(value=False, dtype=tf.bool),
|
||||
eps=.0)
|
||||
output_actions_estimated = self._build_action_network(
|
||||
p_tp1,
|
||||
stochastic=tf.constant(
|
||||
value=self.config["smooth_target_policy"], dtype=tf.bool),
|
||||
eps=.0,
|
||||
is_target=True)
|
||||
if config["smooth_target_policy"]:
|
||||
target_noise_clip = self.config["target_noise_clip"]
|
||||
clipped_normal_sample = tf.clip_by_value(
|
||||
tf.random_normal(
|
||||
tf.shape(policy_tp1),
|
||||
stddev=self.config["target_noise"]),
|
||||
-target_noise_clip, target_noise_clip)
|
||||
policy_tp1_smoothed = tf.clip_by_value(
|
||||
policy_tp1 + clipped_normal_sample, action_space.low,
|
||||
action_space.high)
|
||||
else:
|
||||
# no smoothing, just use deterministic actions
|
||||
policy_tp1_smoothed = policy_tp1
|
||||
|
||||
# q network evaluation
|
||||
prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
# Q-values for given actions & observations in given current state
|
||||
q_t, self.q_model = self._build_q_network(
|
||||
self.obs_t, observation_space, action_space, self.act_t)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
@@ -333,8 +184,9 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
"min_q": tf.reduce_min(q_t),
|
||||
}
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp0, _ = self._build_q_network(self.obs_t, observation_space,
|
||||
action_space, output_actions)
|
||||
# Q-values for current policy (no noise) in given current state
|
||||
q_t_det_policy, _ = self._build_q_network(
|
||||
self.obs_t, observation_space, action_space, self.policy_t)
|
||||
if self.config["twin_q"]:
|
||||
with tf.variable_scope(TWIN_Q_SCOPE) as scope:
|
||||
twin_q_t, self.twin_q_model = self._build_q_network(
|
||||
@@ -343,38 +195,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
q_batchnorm_update_ops = list(
|
||||
set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
|
||||
# target q network evalution
|
||||
# target q network evaluation
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space,
|
||||
action_space,
|
||||
output_actions_estimated)
|
||||
action_space, policy_tp1_smoothed)
|
||||
target_q_func_vars = _scope_vars(scope.name)
|
||||
if self.config["twin_q"]:
|
||||
with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
|
||||
twin_q_tp1, _ = self._build_q_network(
|
||||
self.obs_tp1, observation_space, action_space,
|
||||
output_actions_estimated)
|
||||
policy_tp1_smoothed)
|
||||
twin_target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
if self.config["twin_q"]:
|
||||
self.loss = self._build_actor_critic_loss(
|
||||
q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1)
|
||||
self.critic_loss, self.actor_loss, self.td_error \
|
||||
= self._build_actor_critic_loss(
|
||||
q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t,
|
||||
twin_q_tp1=twin_q_tp1)
|
||||
else:
|
||||
self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
|
||||
self.critic_loss, self.actor_loss, self.td_error \
|
||||
= self._build_actor_critic_loss(
|
||||
q_t, q_tp1, q_t_det_policy)
|
||||
|
||||
if config["l2_reg"] is not None:
|
||||
for var in self.p_func_vars:
|
||||
for var in self.policy_vars:
|
||||
if "bias" not in var.name:
|
||||
self.loss.actor_loss += (
|
||||
self.actor_loss += (
|
||||
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
|
||||
for var in self.q_func_vars:
|
||||
if "bias" not in var.name:
|
||||
self.loss.critic_loss += (
|
||||
self.critic_loss += (
|
||||
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
|
||||
if self.config["twin_q"]:
|
||||
for var in self.twin_q_func_vars:
|
||||
if "bias" not in var.name:
|
||||
self.loss.critic_loss += (
|
||||
self.critic_loss += (
|
||||
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
|
||||
|
||||
# update_target_fn will be called periodically to copy Q network to
|
||||
@@ -396,8 +251,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
var_target.assign(self.tau * var +
|
||||
(1.0 - self.tau) * var_target))
|
||||
for var, var_target in zip(
|
||||
sorted(self.p_func_vars, key=lambda v: v.name),
|
||||
sorted(target_p_func_vars, key=lambda v: v.name)):
|
||||
sorted(self.policy_vars, key=lambda v: v.name),
|
||||
sorted(target_policy_vars, key=lambda v: v.name)):
|
||||
update_target_expr.append(
|
||||
var_target.assign(self.tau * var +
|
||||
(1.0 - self.tau) * var_target))
|
||||
@@ -414,14 +269,15 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
]
|
||||
input_dict = dict(self.loss_inputs)
|
||||
|
||||
# Model self-supervised losses
|
||||
self.loss.actor_loss = self.p_model.custom_loss(
|
||||
self.loss.actor_loss, input_dict)
|
||||
self.loss.critic_loss = self.q_model.custom_loss(
|
||||
self.loss.critic_loss, input_dict)
|
||||
if self.config["twin_q"]:
|
||||
self.loss.critic_loss = self.twin_q_model.custom_loss(
|
||||
self.loss.critic_loss, input_dict)
|
||||
if self.config["use_state_preprocessor"]:
|
||||
# Model self-supervised losses
|
||||
self.actor_loss = self.policy_model.custom_loss(
|
||||
self.actor_loss, input_dict)
|
||||
self.critic_loss = self.q_model.custom_loss(
|
||||
self.critic_loss, input_dict)
|
||||
if self.config["twin_q"]:
|
||||
self.critic_loss = self.twin_q_model.custom_loss(
|
||||
self.critic_loss, input_dict)
|
||||
|
||||
TFPolicyGraph.__init__(
|
||||
self,
|
||||
@@ -430,62 +286,92 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
self.sess,
|
||||
obs_input=self.cur_observations,
|
||||
action_sampler=self.output_actions,
|
||||
loss=self.loss.actor_loss + self.loss.critic_loss,
|
||||
loss=self.actor_loss + self.critic_loss,
|
||||
loss_inputs=self.loss_inputs,
|
||||
update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops)
|
||||
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
# Note that this encompasses both the policy and Q-value networks and
|
||||
# their corresponding target networks
|
||||
self.variables = ray.experimental.tf_utils.TensorFlowVariables(
|
||||
tf.group(q_tp0, q_tp1), self.sess)
|
||||
tf.group(q_t_det_policy, q_tp1), self.sess)
|
||||
|
||||
# Hard initial update
|
||||
self.update_target(tau=1.0)
|
||||
|
||||
@override(TFPolicyGraph)
|
||||
def optimizer(self):
|
||||
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
|
||||
# we don't use this because we have two separate optimisers
|
||||
return None
|
||||
|
||||
@override(TFPolicyGraph)
|
||||
def build_apply_op(self, optimizer, grads_and_vars):
|
||||
# for policy gradient, update policy net one time v.s.
|
||||
# update critic net `policy_delay` time(s)
|
||||
should_apply_actor_opt = tf.equal(
|
||||
tf.mod(self.global_step, self.config["policy_delay"]), 0)
|
||||
|
||||
def make_apply_op():
|
||||
return self._actor_optimizer.apply_gradients(
|
||||
self._actor_grads_and_vars)
|
||||
|
||||
actor_op = tf.cond(
|
||||
should_apply_actor_opt,
|
||||
true_fn=make_apply_op,
|
||||
false_fn=lambda: tf.no_op())
|
||||
critic_op = self._critic_optimizer.apply_gradients(
|
||||
self._critic_grads_and_vars)
|
||||
# increment global step & apply ops
|
||||
with tf.control_dependencies([tf.assign_add(self.global_step, 1)]):
|
||||
return tf.group(actor_op, critic_op)
|
||||
|
||||
@override(TFPolicyGraph)
|
||||
def gradients(self, optimizer, loss):
|
||||
if self.config["grad_norm_clipping"] is not None:
|
||||
actor_grads_and_vars = _minimize_and_clip(
|
||||
optimizer,
|
||||
self.loss.actor_loss,
|
||||
var_list=self.p_func_vars,
|
||||
self._actor_optimizer,
|
||||
self.actor_loss,
|
||||
var_list=self.policy_vars,
|
||||
clip_val=self.config["grad_norm_clipping"])
|
||||
critic_grads_and_vars = _minimize_and_clip(
|
||||
optimizer,
|
||||
self.loss.critic_loss,
|
||||
self._critic_optimizer,
|
||||
self.critic_loss,
|
||||
var_list=self.q_func_vars + self.twin_q_func_vars
|
||||
if self.config["twin_q"] else self.q_func_vars,
|
||||
clip_val=self.config["grad_norm_clipping"])
|
||||
else:
|
||||
actor_grads_and_vars = optimizer.compute_gradients(
|
||||
self.loss.actor_loss, var_list=self.p_func_vars)
|
||||
critic_grads_and_vars = optimizer.compute_gradients(
|
||||
self.loss.critic_loss,
|
||||
var_list=self.q_func_vars + self.twin_q_func_vars
|
||||
if self.config["twin_q"] else self.q_func_vars)
|
||||
actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
|
||||
if g is not None]
|
||||
critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
|
||||
if g is not None]
|
||||
grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
|
||||
actor_grads_and_vars = self._actor_optimizer.compute_gradients(
|
||||
self.actor_loss, var_list=self.policy_vars)
|
||||
if self.config["twin_q"]:
|
||||
critic_vars = self.q_func_vars + self.twin_q_func_vars
|
||||
else:
|
||||
critic_vars = self.q_func_vars
|
||||
critic_grads_and_vars = self._critic_optimizer.compute_gradients(
|
||||
self.critic_loss, var_list=critic_vars)
|
||||
# save these for later use in build_apply_op
|
||||
self._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
|
||||
if g is not None]
|
||||
self._critic_grads_and_vars = [(g, v)
|
||||
for (g, v) in critic_grads_and_vars
|
||||
if g is not None]
|
||||
grads_and_vars = self._actor_grads_and_vars \
|
||||
+ self._critic_grads_and_vars
|
||||
return grads_and_vars
|
||||
|
||||
@override(TFPolicyGraph)
|
||||
def extra_compute_action_feed_dict(self):
|
||||
return {
|
||||
# FIXME: what about turning off exploration? Isn't that a good
|
||||
# idea?
|
||||
self.stochastic: True,
|
||||
self.eps: self.cur_epsilon,
|
||||
self.noise_scale: self.cur_noise_scale,
|
||||
self.pure_exploration_phase: self.cur_pure_exploration_phase,
|
||||
}
|
||||
|
||||
@override(TFPolicyGraph)
|
||||
def extra_compute_grad_fetches(self):
|
||||
return {
|
||||
"td_error": self.loss.td_error,
|
||||
"td_error": self.td_error,
|
||||
LEARNER_STATS_KEY: self.stats,
|
||||
}
|
||||
|
||||
@@ -499,59 +385,192 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
|
||||
@override(PolicyGraph)
|
||||
def get_state(self):
|
||||
return [TFPolicyGraph.get_state(self), self.cur_epsilon]
|
||||
return [
|
||||
TFPolicyGraph.get_state(self), self.cur_noise_scale,
|
||||
self.cur_pure_exploration_phase
|
||||
]
|
||||
|
||||
@override(PolicyGraph)
|
||||
def set_state(self, state):
|
||||
TFPolicyGraph.set_state(self, state[0])
|
||||
self.set_epsilon(state[1])
|
||||
self.set_pure_exploration_phase(state[2])
|
||||
|
||||
def _build_q_network(self, obs, obs_space, action_space, actions):
|
||||
q_net = QNetwork(
|
||||
ModelCatalog.get_model({
|
||||
if self.config["use_state_preprocessor"]:
|
||||
q_model = ModelCatalog.get_model({
|
||||
"obs": obs,
|
||||
"is_training": self._get_is_training_placeholder(),
|
||||
}, obs_space, action_space, 1, self.config["model"]), actions,
|
||||
self.config["critic_hiddens"],
|
||||
self.config["critic_hidden_activation"])
|
||||
return q_net.value, q_net.model
|
||||
}, obs_space, action_space, 1, self.config["model"])
|
||||
q_out = tf.concat([q_model.last_layer, actions], axis=1)
|
||||
else:
|
||||
q_model = None
|
||||
q_out = tf.concat([obs, actions], axis=1)
|
||||
|
||||
def _build_p_network(self, obs, obs_space, action_space):
|
||||
policy_net = PolicyNetwork(
|
||||
ModelCatalog.get_model({
|
||||
activation = getattr(tf.nn, self.config["critic_hidden_activation"])
|
||||
for hidden in self.config["critic_hiddens"]:
|
||||
q_out = layers.fully_connected(
|
||||
q_out, num_outputs=hidden, activation_fn=activation)
|
||||
q_values = layers.fully_connected(
|
||||
q_out, num_outputs=1, activation_fn=None)
|
||||
|
||||
return q_values, q_model
|
||||
|
||||
def _build_policy_network(self, obs, obs_space, action_space):
|
||||
if self.config["use_state_preprocessor"]:
|
||||
model = ModelCatalog.get_model({
|
||||
"obs": obs,
|
||||
"is_training": self._get_is_training_placeholder(),
|
||||
}, obs_space, action_space, 1, self.config["model"]),
|
||||
self.dim_actions, self.config["actor_hiddens"],
|
||||
self.config["actor_hidden_activation"],
|
||||
self.config["parameter_noise"])
|
||||
return policy_net.action_scores, policy_net.model
|
||||
}, obs_space, action_space, 1, self.config["model"])
|
||||
action_out = model.last_layer
|
||||
else:
|
||||
model = None
|
||||
action_out = obs
|
||||
|
||||
def _build_action_network(self, p_values, stochastic, eps,
|
||||
is_target=False):
|
||||
return ActionNetwork(
|
||||
p_values, self.low_action, self.high_action, stochastic, eps,
|
||||
self.config["exploration_theta"], self.config["exploration_sigma"],
|
||||
self.config["smooth_target_policy"], self.config["act_noise"],
|
||||
is_target, self.config["target_noise"],
|
||||
self.config["noise_clip"]).actions
|
||||
activation = getattr(tf.nn, self.config["actor_hidden_activation"])
|
||||
normalizer_fn = layers.layer_norm if self.config["parameter_noise"] \
|
||||
else None
|
||||
for hidden in self.config["actor_hiddens"]:
|
||||
action_out = layers.fully_connected(
|
||||
action_out,
|
||||
num_outputs=hidden,
|
||||
activation_fn=activation,
|
||||
normalizer_fn=normalizer_fn)
|
||||
action_out = layers.fully_connected(
|
||||
action_out, num_outputs=self.dim_actions, activation_fn=None)
|
||||
|
||||
# Use sigmoid to scale to [0,1], but also double magnitude of input to
|
||||
# emulate behaviour of tanh activation used in DDPG and TD3 papers.
|
||||
sigmoid_out = tf.nn.sigmoid(2 * action_out)
|
||||
# Rescale to actual env policy scale
|
||||
# (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
|
||||
# get same dims)
|
||||
action_range = (action_space.high - action_space.low)[None]
|
||||
low_action = action_space.low[None]
|
||||
actions = action_range * sigmoid_out + low_action
|
||||
|
||||
return actions, model
|
||||
|
||||
def _add_exploration_noise(self, deterministic_actions,
|
||||
should_be_stochastic, noise_scale,
|
||||
enable_pure_exploration, action_space):
|
||||
noise_type = self.config["exploration_noise_type"]
|
||||
action_low = action_space.low
|
||||
action_high = action_space.high
|
||||
action_range = action_space.high - action_low
|
||||
|
||||
def compute_stochastic_actions():
|
||||
def make_noisy_actions():
|
||||
# shape of deterministic_actions is [None, dim_action]
|
||||
if noise_type == "gaussian":
|
||||
# add IID Gaussian noise for exploration, TD3-style
|
||||
normal_sample = noise_scale * tf.random_normal(
|
||||
tf.shape(deterministic_actions),
|
||||
stddev=self.config["exploration_gaussian_sigma"])
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + normal_sample, action_low,
|
||||
action_high)
|
||||
elif noise_type == "ou":
|
||||
# add OU noise for exploration, DDPG-style
|
||||
zero_acts = action_low.size * [.0]
|
||||
exploration_sample = tf.get_variable(
|
||||
name="ornstein_uhlenbeck",
|
||||
dtype=tf.float32,
|
||||
initializer=zero_acts,
|
||||
trainable=False)
|
||||
normal_sample = tf.random_normal(
|
||||
shape=[action_low.size], mean=0.0, stddev=1.0)
|
||||
ou_new = self.config["exploration_ou_theta"] \
|
||||
* -exploration_sample \
|
||||
+ self.config["exploration_ou_sigma"] * normal_sample
|
||||
exploration_value = tf.assign_add(exploration_sample,
|
||||
ou_new)
|
||||
base_scale = self.config["exploration_ou_noise_scale"]
|
||||
noise = noise_scale * base_scale \
|
||||
* exploration_value * action_range
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + noise, action_low, action_high)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unknown noise type '%s' (try 'ou' or 'gaussian')" %
|
||||
noise_type)
|
||||
return stochastic_actions
|
||||
|
||||
def make_uniform_random_actions():
|
||||
# pure random exploration option
|
||||
uniform_random_actions = tf.random.uniform(
|
||||
tf.shape(deterministic_actions))
|
||||
# rescale uniform random actions according to action range
|
||||
tf_range = tf.constant(action_range[None], dtype="float32")
|
||||
tf_low = tf.constant(action_low[None], dtype="float32")
|
||||
uniform_random_actions = uniform_random_actions * tf_range \
|
||||
+ tf_low
|
||||
return uniform_random_actions
|
||||
|
||||
stochastic_actions = tf.cond(
|
||||
# need to condition on noise_scale > 0 because zeroing
|
||||
# noise_scale is how evaluator signals no noise should be used
|
||||
# (this is ugly and should be fixed by adding an "eval_mode"
|
||||
# config flag or something)
|
||||
tf.logical_and(enable_pure_exploration, noise_scale > 0),
|
||||
true_fn=make_uniform_random_actions,
|
||||
false_fn=make_noisy_actions)
|
||||
return stochastic_actions
|
||||
|
||||
enable_stochastic = tf.logical_and(should_be_stochastic,
|
||||
not self.config["parameter_noise"])
|
||||
actions = tf.cond(enable_stochastic, compute_stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
return actions
|
||||
|
||||
def _build_actor_critic_loss(self,
|
||||
q_t,
|
||||
q_tp1,
|
||||
q_tp0,
|
||||
q_t_det_policy,
|
||||
twin_q_t=None,
|
||||
twin_q_tp1=None):
|
||||
return ActorCriticLoss(
|
||||
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
|
||||
self.done_mask, twin_q_t, twin_q_tp1,
|
||||
self.config["actor_loss_coeff"], self.config["critic_loss_coeff"],
|
||||
self.config["gamma"], self.config["n_step"],
|
||||
self.config["use_huber"], self.config["huber_threshold"],
|
||||
self.config["twin_q"])
|
||||
twin_q = self.config["twin_q"]
|
||||
gamma = self.config["gamma"]
|
||||
n_step = self.config["n_step"]
|
||||
use_huber = self.config["use_huber"]
|
||||
huber_threshold = self.config["huber_threshold"]
|
||||
|
||||
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
|
||||
if twin_q:
|
||||
twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
|
||||
q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
|
||||
|
||||
q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
|
||||
q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
|
||||
|
||||
# compute RHS of bellman equation
|
||||
q_t_selected_target = tf.stop_gradient(
|
||||
self.rew_t + gamma**n_step * q_tp1_best_masked)
|
||||
|
||||
# compute the error (potentially clipped)
|
||||
if twin_q:
|
||||
td_error = q_t_selected - q_t_selected_target
|
||||
twin_td_error = twin_q_t_selected - q_t_selected_target
|
||||
td_error = td_error + twin_td_error
|
||||
if use_huber:
|
||||
errors = _huber_loss(td_error, huber_threshold) \
|
||||
+ _huber_loss(twin_td_error, huber_threshold)
|
||||
else:
|
||||
errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
|
||||
twin_td_error)
|
||||
else:
|
||||
td_error = q_t_selected - q_t_selected_target
|
||||
if use_huber:
|
||||
errors = _huber_loss(td_error, huber_threshold)
|
||||
else:
|
||||
errors = 0.5 * tf.square(td_error)
|
||||
|
||||
critic_loss = tf.reduce_mean(self.importance_weights * errors)
|
||||
actor_loss = -tf.reduce_mean(q_t_det_policy)
|
||||
return critic_loss, actor_loss, td_error
|
||||
|
||||
def _build_parameter_noise(self, pnet_params):
|
||||
self.parameter_noise_sigma_val = self.config["exploration_sigma"]
|
||||
self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
|
||||
self.parameter_noise_sigma = tf.get_variable(
|
||||
initializer=tf.constant_initializer(
|
||||
self.parameter_noise_sigma_val),
|
||||
@@ -590,7 +609,7 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
|
||||
importance_weights):
|
||||
td_err = self.sess.run(
|
||||
self.loss.td_error,
|
||||
self.td_error,
|
||||
feed_dict={
|
||||
self.obs_t: [np.array(ob) for ob in obs_t],
|
||||
self.act_t: act_t,
|
||||
@@ -610,9 +629,16 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
|
||||
|
||||
# support both hard and soft sync
|
||||
def update_target(self, tau=None):
|
||||
tau = tau or self.tau_value
|
||||
return self.sess.run(
|
||||
self.update_target_expr,
|
||||
feed_dict={self.tau: tau or self.tau_value})
|
||||
self.update_target_expr, feed_dict={self.tau: tau})
|
||||
|
||||
def set_epsilon(self, epsilon):
|
||||
self.cur_epsilon = epsilon
|
||||
# set_epsilon is called by optimizer to anneal exploration as
|
||||
# necessary, and to turn it off during evaluation. The "epsilon" part
|
||||
# is a carry-over from DQN, which uses epsilon-greedy exploration
|
||||
# rather than adding action noise to the output of a policy network.
|
||||
self.cur_noise_scale = epsilon
|
||||
|
||||
def set_pure_exploration_phase(self, pure_exploration_phase):
|
||||
self.cur_pure_exploration_phase = pure_exploration_phase
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
|
||||
DEFAULT_CONFIG as DDPG_CONFIG
|
||||
from ray.rllib.utils import merge_dicts
|
||||
|
||||
TD3_DEFAULT_CONFIG = merge_dicts(
|
||||
DDPG_CONFIG,
|
||||
{
|
||||
# largest changes: twin Q functions, delayed policy updates, and target
|
||||
# smoothing
|
||||
"twin_q": True,
|
||||
"policy_delay": 2,
|
||||
"smooth_target_policy": True,
|
||||
"target_noise": 0.2,
|
||||
"target_noise_clip": 0.5,
|
||||
|
||||
# other changes & things we want to keep fixed: IID Gaussian
|
||||
# exploration noise, larger actor learning rate, no l2 regularisation,
|
||||
# no Huber loss, etc.
|
||||
"exploration_should_anneal": False,
|
||||
"exploration_noise_type": "gaussian",
|
||||
"exploration_gaussian_sigma": 0.1,
|
||||
"learning_starts": 10000,
|
||||
"pure_exploration_steps": 10000,
|
||||
"actor_hiddens": [400, 300],
|
||||
"critic_hiddens": [400, 300],
|
||||
"n_step": 1,
|
||||
"gamma": 0.99,
|
||||
"actor_lr": 1e-3,
|
||||
"critic_lr": 1e-3,
|
||||
"l2_reg": 0.0,
|
||||
"tau": 5e-3,
|
||||
"train_batch_size": 100,
|
||||
"use_huber": False,
|
||||
"target_network_update_freq": 0,
|
||||
"optimizer_class": "SyncReplayOptimizer",
|
||||
"num_workers": 0,
|
||||
"num_gpus_per_worker": 0,
|
||||
"per_worker_exploration": False,
|
||||
"worker_side_prioritization": False,
|
||||
"buffer_size": 1000000,
|
||||
"prioritized_replay": False,
|
||||
"clip_rewards": False,
|
||||
"use_state_preprocessor": False,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
class TD3Trainer(DDPGTrainer):
|
||||
"""A more stable successor to DDPG. By default, this uses a near-identical
|
||||
configuration to that reported in the TD3 paper."""
|
||||
|
||||
_name = "TD3"
|
||||
_default_config = TD3_DEFAULT_CONFIG
|
||||
@@ -34,6 +34,11 @@ def _import_apex_ddpg():
|
||||
return ddpg.ApexDDPGTrainer
|
||||
|
||||
|
||||
def _import_td3():
|
||||
from ray.rllib.agents import ddpg
|
||||
return ddpg.TD3Trainer
|
||||
|
||||
|
||||
def _import_ppo():
|
||||
from ray.rllib.agents import ppo
|
||||
return ppo.PPOTrainer
|
||||
@@ -87,6 +92,7 @@ def _import_marwil():
|
||||
ALGORITHMS = {
|
||||
"DDPG": _import_ddpg,
|
||||
"APEX_DDPG": _import_apex_ddpg,
|
||||
"TD3": _import_td3,
|
||||
"PPO": _import_ppo,
|
||||
"ES": _import_es,
|
||||
"ARS": _import_ars,
|
||||
|
||||
@@ -40,7 +40,8 @@ CONFIGS = {
|
||||
},
|
||||
},
|
||||
"DDPG": {
|
||||
"noise_scale": 0.0,
|
||||
"pure_exploration_steps": 0,
|
||||
"exploration_ou_noise_scale": 0.0,
|
||||
"timesteps_per_iteration": 100
|
||||
},
|
||||
"PPO": {
|
||||
|
||||
@@ -116,8 +116,9 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
check_support("APPO", {"num_gpus": 0, "vtrace": False}, stats)
|
||||
check_support(
|
||||
"DDPG", {
|
||||
"noise_scale": 100.0,
|
||||
"timesteps_per_iteration": 1
|
||||
"exploration_ou_noise_scale": 100.0,
|
||||
"timesteps_per_iteration": 1,
|
||||
"use_state_preprocessor": True,
|
||||
},
|
||||
stats,
|
||||
check_bounds=True)
|
||||
@@ -188,6 +189,7 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
"min_iter_time_s": 1,
|
||||
"learning_starts": 1000,
|
||||
"target_network_update_freq": 100,
|
||||
"use_state_preprocessor": True,
|
||||
})
|
||||
check_support_multiagent("IMPALA", {"num_gpus": 0})
|
||||
check_support_multiagent("DQN", {"timesteps_per_iteration": 1})
|
||||
@@ -206,7 +208,10 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
"sgd_minibatch_size": 1,
|
||||
})
|
||||
check_support_multiagent("PG", {"num_workers": 1, "optimizer": {}})
|
||||
check_support_multiagent("DDPG", {"timesteps_per_iteration": 1})
|
||||
check_support_multiagent("DDPG", {
|
||||
"timesteps_per_iteration": 1,
|
||||
"use_state_preprocessor": True,
|
||||
})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -15,13 +15,14 @@ halfcheetah-ddpg:
|
||||
env_config: {}
|
||||
|
||||
# === Exploration ===
|
||||
exploration_should_anneal: True
|
||||
schedule_max_timesteps: 100000
|
||||
timesteps_per_iteration: 1000
|
||||
exploration_fraction: 0.1
|
||||
exploration_final_eps: 0.02
|
||||
noise_scale: 0.1
|
||||
exploration_theta: 0.15
|
||||
exploration_sigma: 0.2
|
||||
exploration_final_scale: 0.02
|
||||
exploration_ou_noise_scale: 0.1
|
||||
exploration_ou_theta: 0.15
|
||||
exploration_ou_sigma: 0.2
|
||||
target_network_update_freq: 0
|
||||
tau: 0.001
|
||||
|
||||
@@ -34,9 +35,8 @@ halfcheetah-ddpg:
|
||||
clip_rewards: False
|
||||
|
||||
# === Optimization ===
|
||||
lr: 0.001
|
||||
actor_loss_coeff: 0.1
|
||||
critic_loss_coeff: 1.0
|
||||
actor_lr: 0.001
|
||||
critic_lr: 0.001
|
||||
use_huber: False
|
||||
huber_threshold: 1.0
|
||||
l2_reg: 0.000001
|
||||
@@ -50,3 +50,7 @@ halfcheetah-ddpg:
|
||||
optimizer_class: "SyncReplayOptimizer"
|
||||
per_worker_exploration: False
|
||||
worker_side_prioritization: False
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 5
|
||||
evaluation_num_episodes: 10
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
invertedpendulum-td3:
|
||||
# This is a TD3 with stopping conditions and network size tuned specifically
|
||||
# for InvertedPendulum. Should be able to reach 1,000 reward (the maximum
|
||||
# achievable) in 10,000 to 20,000 steps.
|
||||
env: InvertedPendulum-v2
|
||||
run: TD3
|
||||
stop:
|
||||
episode_reward_mean: 9999.9
|
||||
time_total_s: 900 # 15 minutes
|
||||
timesteps_total: 1000000
|
||||
config:
|
||||
# === Model ===
|
||||
actor_hiddens: [32, 32]
|
||||
critic_hiddens: [32, 32]
|
||||
|
||||
# === Exploration ===
|
||||
learning_starts: 1000
|
||||
pure_exploration_steps: 1000
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 1
|
||||
evaluation_num_episodes: 5
|
||||
@@ -7,7 +7,9 @@ mountaincarcontinuous-apex-ddpg:
|
||||
config:
|
||||
clip_rewards: False
|
||||
num_workers: 16
|
||||
noise_scale: 1.0
|
||||
exploration_ou_noise_scale: 1.0
|
||||
n_step: 3
|
||||
target_network_update_freq: 50000
|
||||
tau: 1.0
|
||||
evaluation_interval: 5
|
||||
evaluation_num_episodes: 10
|
||||
|
||||
@@ -15,13 +15,14 @@ mountaincarcontinuous-ddpg:
|
||||
env_config: {}
|
||||
|
||||
# === Exploration ===
|
||||
exploration_should_anneal: True
|
||||
schedule_max_timesteps: 100000
|
||||
timesteps_per_iteration: 1000
|
||||
exploration_fraction: 0.4
|
||||
exploration_final_eps: 0.02
|
||||
noise_scale: 0.75
|
||||
exploration_theta: 0.15
|
||||
exploration_sigma: 0.2
|
||||
exploration_final_scale: 0.02
|
||||
exploration_ou_noise_scale: 0.75
|
||||
exploration_ou_theta: 0.15
|
||||
exploration_ou_sigma: 0.2
|
||||
target_network_update_freq: 0
|
||||
tau: 0.01
|
||||
|
||||
@@ -34,9 +35,8 @@ mountaincarcontinuous-ddpg:
|
||||
clip_rewards: False
|
||||
|
||||
# === Optimization ===
|
||||
lr: 0.001
|
||||
actor_loss_coeff: 0.1
|
||||
critic_loss_coeff: 1.0
|
||||
actor_lr: 0.001
|
||||
critic_lr: 0.001
|
||||
use_huber: False
|
||||
huber_threshold: 1.0
|
||||
l2_reg: 0.00001
|
||||
@@ -50,3 +50,7 @@ mountaincarcontinuous-ddpg:
|
||||
optimizer_class: "SyncReplayOptimizer"
|
||||
per_worker_exploration: False
|
||||
worker_side_prioritization: False
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 5
|
||||
evaluation_num_episodes: 10
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
mujoco-td3:
|
||||
# Solve latest versions of the four hardest Mujoco tasks benchmarked in the
|
||||
# original TD3 paper. Average returns over 10 trials at the end of 1,000,000
|
||||
# timesteps (taken from Table 2 of the paper) are given in parens at the end
|
||||
# of each environment name.
|
||||
#
|
||||
# Paper is at https://arxiv.org/pdf/1802.09477.pdf
|
||||
env:
|
||||
grid_search:
|
||||
- HalfCheetah-v2 # (9,532.99)
|
||||
- Hopper-v2 # (3,304.75)
|
||||
- Walker2d-v2 # (4,565.24)
|
||||
- Ant-v2 # (4,185.06)
|
||||
run: TD3
|
||||
stop:
|
||||
timesteps_total: 1000000
|
||||
config:
|
||||
# === Exploration ===
|
||||
learning_starts: 10000
|
||||
pure_exploration_steps: 10000
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 5
|
||||
evaluation_num_episodes: 10
|
||||
@@ -11,3 +11,5 @@ pendulum-apex-ddpg:
|
||||
n_step: 1
|
||||
target_network_update_freq: 50000
|
||||
tau: 1.0
|
||||
evaluation_interval: 5
|
||||
evaluation_num_episodes: 10
|
||||
|
||||
@@ -15,13 +15,14 @@ pendulum-ddpg:
|
||||
env_config: {}
|
||||
|
||||
# === Exploration ===
|
||||
exploration_should_anneal: True
|
||||
schedule_max_timesteps: 100000
|
||||
timesteps_per_iteration: 600
|
||||
exploration_fraction: 0.1
|
||||
exploration_final_eps: 0.02
|
||||
noise_scale: 0.1
|
||||
exploration_theta: 0.15
|
||||
exploration_sigma: 0.2
|
||||
exploration_final_scale: 0.02
|
||||
exploration_ou_noise_scale: 0.1
|
||||
exploration_ou_theta: 0.15
|
||||
exploration_ou_sigma: 0.2
|
||||
target_network_update_freq: 0
|
||||
tau: 0.001
|
||||
|
||||
@@ -34,9 +35,8 @@ pendulum-ddpg:
|
||||
clip_rewards: False
|
||||
|
||||
# === Optimization ===
|
||||
lr: 0.001
|
||||
actor_loss_coeff: 0.1
|
||||
critic_loss_coeff: 1.0
|
||||
actor_lr: 0.001
|
||||
critic_lr: 0.001
|
||||
use_huber: True
|
||||
huber_threshold: 1.0
|
||||
l2_reg: 0.000001
|
||||
@@ -50,3 +50,7 @@ pendulum-ddpg:
|
||||
optimizer_class: "SyncReplayOptimizer"
|
||||
per_worker_exploration: False
|
||||
worker_side_prioritization: False
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 5
|
||||
evaluation_num_episodes: 10
|
||||
|
||||
@@ -1,60 +1,19 @@
|
||||
# This configuration can expect to reach -160 reward in 10k-20k timesteps
|
||||
pendulum-ddpg:
|
||||
env: Pendulum-v0
|
||||
run: DDPG
|
||||
run: TD3
|
||||
stop:
|
||||
episode_reward_mean: -160
|
||||
time_total_s: 600 # 10 minutes
|
||||
episode_reward_mean: -130
|
||||
time_total_s: 900 # 15 minutes
|
||||
config:
|
||||
# === Tricks ===
|
||||
twin_q: True
|
||||
policy_delay: 2
|
||||
smooth_target_policy: True
|
||||
act_noise: 0.1
|
||||
target_noise: 0.2
|
||||
noise_clip: 0.5
|
||||
|
||||
# === Model ===
|
||||
actor_hiddens: [64, 64]
|
||||
critic_hiddens: [64, 64]
|
||||
n_step: 1
|
||||
model: {}
|
||||
gamma: 0.99
|
||||
env_config: {}
|
||||
|
||||
# === Exploration ===
|
||||
schedule_max_timesteps: 100000
|
||||
timesteps_per_iteration: 600
|
||||
exploration_fraction: 0.1
|
||||
exploration_final_eps: 0.02
|
||||
noise_scale: 0.1
|
||||
exploration_theta: 0.15
|
||||
exploration_sigma: 0.2
|
||||
target_network_update_freq: 0
|
||||
tau: 0.001
|
||||
learning_starts: 5000
|
||||
pure_exploration_steps: 5000
|
||||
|
||||
# === Replay buffer ===
|
||||
buffer_size: 10000
|
||||
prioritized_replay: True
|
||||
prioritized_replay_alpha: 0.6
|
||||
prioritized_replay_beta: 0.4
|
||||
prioritized_replay_eps: 0.000001
|
||||
clip_rewards: False
|
||||
|
||||
# === Optimization ===
|
||||
lr: 0.001
|
||||
actor_loss_coeff: 0.1
|
||||
critic_loss_coeff: 1.0
|
||||
use_huber: True
|
||||
huber_threshold: 1.0
|
||||
l2_reg: 0.000001
|
||||
learning_starts: 500
|
||||
sample_batch_size: 1
|
||||
train_batch_size: 64
|
||||
|
||||
# === Parallelism ===
|
||||
num_workers: 0
|
||||
num_gpus_per_worker: 0
|
||||
optimizer_class: "SyncReplayOptimizer"
|
||||
per_worker_exploration: False
|
||||
worker_side_prioritization: False
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 1
|
||||
evaluation_num_episodes: 5
|
||||
|
||||
Reference in New Issue
Block a user