[rllib] TD3/DDPG improvements and MuJoCo benchmarks (#4694)

* [rllib] Separate optimisers for DDPG actor & crit.

* [rllib] Better names for DDPG variables & options

Config changes:

- noise_scale -> exploration_ou_noise_scale
- exploration_theta -> exploration_ou_theta
- exploration_sigma -> exploration_ou_sigma
- act_noise -> exploration_gaussian_sigma
- noise_clip -> target_noise_clip

* [rllib] Make DDPG less class-y

Used functions to replace three classes with only an __init__ method & a
handful of unrelated attributes.

* [rllib] Refactor DDPG noise

* [rllib] Unify DDPG exploration annealing

Added option "exploration_should_anneal" to enable linear annealing of
exploration noise. By default this is off, for consistency with DDPG &
TD3 papers. Also renamed "exploration_final_eps" to
"exploration_final_scale" (that name seems to have been carried over
from DQN, and doesn't really make sense here). Finally, tried to rename
"eps" to "noise_scale" wherever possible.
This commit is contained in:
Sam Toyer
2019-04-26 17:49:53 -07:00
committed by Eric Liang
parent 05c896d6f7
commit 663e92ab3f
16 changed files with 557 additions and 398 deletions
+2 -1
View File
@@ -4,6 +4,7 @@ from __future__ import print_function
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ddpg.td3 import TD3Trainer
from ray.rllib.utils import renamed_class
ApexDDPGAgent = renamed_class(ApexDDPGTrainer)
@@ -11,5 +12,5 @@ DDPGAgent = renamed_class(DDPGTrainer)
__all__ = [
"DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
"DEFAULT_CONFIG"
"TD3Trainer", "DEFAULT_CONFIG"
]
+84 -42
View File
@@ -13,19 +13,21 @@ from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
DEFAULT_CONFIG = with_common_config({
# === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
# TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
# In addition to settings below, you can use "exploration_noise_type" and
# "exploration_gaussian_sigma" to get IID Gaussian exploration noise
# instead of OU exploration noise.
# twin Q-net
"twin_q": False,
# delayed policy update
"policy_delay": 1,
# target policy smoothing
# this also forces the use of gaussian instead of OU noise for exploration
# (this also replaces OU exploration noise with IID Gaussian exploration
# noise, for now)
"smooth_target_policy": False,
# gaussian stddev of act noise
"act_noise": 0.1,
# gaussian stddev of target noise
# gaussian stddev of target action noise for smoothing
"target_noise": 0.2,
# target noise limit (bound)
"noise_clip": 0.5,
"target_noise_clip": 0.5,
# === Evaluation ===
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
@@ -37,42 +39,64 @@ DEFAULT_CONFIG = with_common_config({
"evaluation_num_episodes": 10,
# === Model ===
# Postprocess the policy network model output with these hidden layers
"actor_hiddens": [64, 64],
# Hidden layers activation of the policy network
# Apply a state preprocessor with spec given by the "model" config option
# (like other RL algorithms). This is mostly useful if you have a weird
# observation shape, like an image. Disabled by default.
"use_state_preprocessor": False,
# Postprocess the policy network model output with these hidden layers. If
# use_state_preprocessor is False, then these will be the *only* hidden
# layers in the network.
"actor_hiddens": [400, 300],
# Hidden layers activation of the postprocessing stage of the policy
# network
"actor_hidden_activation": "relu",
# Postprocess the critic network model output with these hidden layers
"critic_hiddens": [64, 64],
# Hidden layers activation of the critic network
# Postprocess the critic network model output with these hidden layers;
# again, if use_state_preprocessor is True, then the state will be
# preprocessed by the model specified with the "model" config option first.
"critic_hiddens": [400, 300],
# Hidden layers activation of the postprocessing stage of the critic.
"critic_hidden_activation": "relu",
# N-step Q learning
"n_step": 1,
# === Exploration ===
# Max num timesteps for annealing schedules. Exploration is annealed from
# 1.0 to exploration_fraction over this number of timesteps scaled by
# exploration_fraction
# Turns on annealing schedule for exploration noise. Exploration is
# annealed from 1.0 to exploration_final_scale over schedule_max_timesteps
# scaled by exploration_fraction. Original DDPG and TD3 papers do not
# anneal noise, so this is False by default.
"exploration_should_anneal": False,
# Max num timesteps for annealing schedules.
"schedule_max_timesteps": 100000,
# Number of env steps to optimize for before returning
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
"exploration_fraction": 0.1,
# Final value of random action probability
"exploration_final_eps": 0.02,
# OU-noise scale
"noise_scale": 0.1,
# theta
"exploration_theta": 0.15,
# sigma
"exploration_sigma": 0.2,
# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 0,
# Update the target by \tau * policy + (1-\tau) * target_policy
"tau": 0.002,
# Final scaling multiplier for action noise (initial is 1.0)
"exploration_final_scale": 0.02,
# valid values: "ou" (time-correlated, like original DDPG paper),
# "gaussian" (IID, like TD3 paper)
"exploration_noise_type": "ou",
# OU-noise scale; this can be used to scale down magnitude of OU noise
# before adding to actions (requires "exploration_noise_type" to be "ou")
"exploration_ou_noise_scale": 0.1,
# theta for OU
"exploration_ou_theta": 0.15,
# sigma for OU
"exploration_ou_sigma": 0.2,
# gaussian stddev of act noise for exploration (requires
# "exploration_noise_type" to be "gaussian")
"exploration_gaussian_sigma": 0.1,
# If True parameter space noise will be used for exploration
# See https://blog.openai.com/better-exploration-with-parameter-noise/
"parameter_noise": False,
# Until this many timesteps have elapsed, the agent's policy will be
# ignored & it will instead take uniform random actions. Can be used in
# conjunction with learning_starts (which controls when the first
# optimization step happens) to decrease dependence of exploration &
# optimization on initial policy parameters. Note that this will be
# disabled when the action noise scale is set to 0 (e.g. during evaluation).
"pure_exploration_steps": 1000,
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
@@ -90,11 +114,14 @@ DEFAULT_CONFIG = with_common_config({
"compress_observations": False,
# === Optimization ===
# Learning rate for adam optimizer.
# Instead of using two optimizers, we use two different loss coefficients
"lr": 1e-3,
"actor_loss_coeff": 0.1,
"critic_loss_coeff": 1.0,
# Learning rate for the critic (Q-function) optimizer.
"critic_lr": 1e-3,
# Learning rate for the actor (policy) optimizer.
"actor_lr": 1e-3,
# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 0,
# Update the target by \tau * policy + (1-\tau) * target_policy
"tau": 0.002,
# If True, use huber loss instead of squared loss for critic network
# Conventionally, no need to clip gradients if using a huber loss
"use_huber": False,
@@ -117,7 +144,7 @@ DEFAULT_CONFIG = with_common_config({
# === Parallelism ===
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Optimizer class to use.
"optimizer_class": "SyncReplayOptimizer",
@@ -138,26 +165,41 @@ class DDPGTrainer(DQNTrainer):
_default_config = DEFAULT_CONFIG
_policy_graph = DDPGPolicyGraph
@override(DQNTrainer)
def _train(self):
    """Run one training iteration, syncing the pure-exploration flag first.

    When the "pure_exploration_steps" config value is non-zero, every
    trainable policy on the local evaluator and on each remote evaluator
    is told whether the global timestep is still below that threshold.
    The actual training step is then delegated to the parent class.

    Returns:
        Whatever the parent class's ``_train`` returns.
    """
    warmup_steps = self.config["pure_exploration_steps"]
    if warmup_steps:
        # tell workers whether they should do pure exploration
        only_explore = self.global_timestep < warmup_steps

        def push_phase(policy, _):
            # second argument (supplied by foreach_trainable_policy) is
            # not needed here
            return policy.set_pure_exploration_phase(only_explore)

        self.local_evaluator.foreach_trainable_policy(push_phase)
        for evaluator in self.remote_evaluators:
            evaluator.foreach_trainable_policy.remote(push_phase)
    return super(DDPGTrainer, self)._train()
@override(DQNTrainer)
def _make_exploration_schedule(self, worker_index):
# Override DQN's schedule to take into account `noise_scale`
# Override DQN's schedule to take into account
# `exploration_ou_noise_scale`
if self.config["per_worker_exploration"]:
assert self.config["num_workers"] > 1, \
"This requires multiple workers"
if worker_index >= 0:
exponent = (
1 +
worker_index / float(self.config["num_workers"] - 1) * 7)
return ConstantSchedule(
self.config["noise_scale"] * 0.4**exponent)
# FIXME: what do magic constants mean? (0.4, 7)
max_index = float(self.config["num_workers"] - 1)
exponent = 1 + worker_index / max_index * 7
return ConstantSchedule(0.4**exponent)
else:
# local ev should have zero exploration so that eval rollouts
# run properly
return ConstantSchedule(0.0)
else:
elif self.config["exploration_should_anneal"]:
return LinearSchedule(
schedule_timesteps=int(self.config["exploration_fraction"] *
self.config["schedule_max_timesteps"]),
initial_p=self.config["noise_scale"] * 1.0,
final_p=self.config["noise_scale"] *
self.config["exploration_final_eps"])
initial_p=1.0,
final_p=self.config["exploration_final_scale"])
else:
# *always* add exploration noise
return ConstantSchedule(1.0)
+305 -279
View File
@@ -19,80 +19,18 @@ from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
ACTION_SCOPE = "a_func"
POLICY_SCOPE = "p_func"
POLICY_TARGET_SCOPE = "target_p_func"
Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
TWIN_Q_SCOPE = "twin_q_func"
TWIN_Q_TARGET_SCOPE = "twin_target_q_func"
ACTION_SCOPE = "action"
POLICY_SCOPE = "policy"
POLICY_TARGET_SCOPE = "target_policy"
Q_SCOPE = "critic"
Q_TARGET_SCOPE = "target_critic"
TWIN_Q_SCOPE = "twin_critic"
TWIN_Q_TARGET_SCOPE = "twin_target_critic"
# Importance sampling weights for prioritized replay
PRIO_WEIGHTS = "weights"
class ActorCriticLoss(object):
    """Builds the critic loss, actor loss and TD-error tensors.

    After construction the instance exposes three attributes:
    ``td_error``, ``critic_loss`` and ``actor_loss``.
    """

    def __init__(self,
                 q_t,
                 q_tp1,
                 q_tp0,
                 importance_weights,
                 rewards,
                 done_mask,
                 twin_q_t,
                 twin_q_tp1,
                 actor_loss_coeff=0.1,
                 critic_loss_coeff=1.0,
                 gamma=0.99,
                 n_step=1,
                 use_huber=False,
                 huber_threshold=1.0,
                 twin_q=False,
                 policy_delay=1):
        """Assemble the loss tensors.

        Args:
            q_t: critic output whose last axis is squeezed away.
            q_tp1: target critic output used for the bootstrap term.
            q_tp0: critic output whose mean drives the actor loss.
            importance_weights: per-sample weights multiplied into the
                critic errors before averaging.
            rewards: reward term of the Bellman target.
            done_mask: mask; the bootstrap term is scaled by
                ``1.0 - done_mask``.
            twin_q_t: twin critic counterpart of ``q_t`` (read only when
                ``twin_q`` is true).
            twin_q_tp1: twin critic counterpart of ``q_tp1`` (read only
                when ``twin_q`` is true).
            actor_loss_coeff: multiplier on the actor loss.
            critic_loss_coeff: multiplier on the critic loss.
            gamma: discount factor, raised to the power ``n_step``.
            n_step: exponent applied to ``gamma``.
            use_huber: use ``_huber_loss`` instead of half squared error.
            huber_threshold: threshold handed to ``_huber_loss``.
            twin_q: enable the twin-critic terms.
            policy_delay: the actor loss is zeroed unless the global step
                is a multiple of this value.
        """
        squeeze_axis = len(q_t.shape) - 1
        q_t_selected = tf.squeeze(q_t, axis=squeeze_axis)
        if twin_q:
            twin_q_t_selected = tf.squeeze(twin_q_t, axis=squeeze_axis)
            # bootstrap from the smaller of the two target estimates
            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
        bellman_target = rewards + gamma**n_step * q_tp1_best_masked

        def per_sample_loss(err):
            # the error (potentially clipped by the huber loss)
            if use_huber:
                return _huber_loss(err, huber_threshold)
            return 0.5 * tf.square(err)

        if twin_q:
            td_error = q_t_selected - tf.stop_gradient(bellman_target)
            twin_td_error = twin_q_t_selected - tf.stop_gradient(
                bellman_target)
            self.td_error = td_error + twin_td_error
            errors = per_sample_loss(td_error) + per_sample_loss(
                twin_td_error)
        else:
            self.td_error = q_t_selected - tf.stop_gradient(bellman_target)
            errors = per_sample_loss(self.td_error)

        self.critic_loss = critic_loss_coeff * tf.reduce_mean(
            importance_weights * errors)

        # for policy gradient, update policy net one time v.s.
        # update critic net `policy_delay` time(s)
        global_step = tf.train.get_or_create_global_step()
        on_actor_step = tf.equal(tf.mod(global_step, policy_delay), 0)
        policy_delay_mask = tf.to_float(on_actor_step)
        self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                           tf.reduce_mean(q_tp0))
class DDPGPostprocessing(object):
"""Implements n-step learning and param noise adjustments."""
@@ -113,12 +51,13 @@ class DDPGPostprocessing(object):
feed_dict={
self.cur_observations: states,
self.stochastic: False,
self.eps: .0
self.noise_scale: .0,
self.pure_exploration_phase: False,
})
distance_in_action_space = np.sqrt(
np.mean(np.square(clean_actions - noisy_actions)))
self.pi_distance = distance_in_action_space
if distance_in_action_space < self.config["exploration_sigma"]:
if distance_in_action_space < self.config["exploration_ou_sigma"]:
self.parameter_noise_sigma_val *= 1.01
else:
self.parameter_noise_sigma_val /= 1.01
@@ -128,107 +67,6 @@ class DDPGPostprocessing(object):
return _postprocess_dqn(self, sample_batch)
class PolicyNetwork(object):
    """Policy head: hidden layers on top of a model, then a sigmoid layer.

    The final sigmoid keeps every entry of ``action_scores`` within
    (0, 1). The wrapped model is kept on ``self.model``.
    """

    def __init__(self,
                 model,
                 dim_actions,
                 hiddens=[64, 64],
                 activation="relu",
                 parameter_noise=False):
        """Build the layers.

        Args:
            model: object whose ``last_layer`` feeds the first hidden
                layer.
            dim_actions: number of outputs of the final sigmoid layer.
            hiddens: sizes of the fully connected hidden layers.
            activation: name of a function looked up in ``tf.nn``.
            parameter_noise: when true, hidden layers use layer
                normalization.
        """
        activation_fn = tf.nn.__dict__[activation]
        # the normalizer choice does not depend on the layer, so pick once
        normalizer_fn = layers.layer_norm if parameter_noise else None

        net = model.last_layer
        for width in hiddens:
            net = layers.fully_connected(
                net,
                num_outputs=width,
                activation_fn=activation_fn,
                normalizer_fn=normalizer_fn)

        # Use sigmoid layer to bound values within (0, 1)
        # shape of action_scores is [batch_size, dim_actions]
        self.action_scores = layers.fully_connected(
            net, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
        self.model = model
class ActionNetwork(object):
    """Turns policy outputs into environment actions, optionally noisy.

    ``self.actions`` evaluates to the noisy actions when ``stochastic`` is
    true and ``parameter_noise`` is false; otherwise it evaluates to the
    deterministic rescaled policy output.
    """

    def __init__(self,
                 p_values,
                 low_action,
                 high_action,
                 stochastic,
                 eps,
                 theta=0.15,
                 sigma=0.2,
                 use_gaussian_noise=False,
                 act_noise=0.1,
                 is_target=False,
                 target_noise=0.2,
                 noise_clip=0.5,
                 parameter_noise=False):
        """Build the action tensor.

        Args:
            p_values: policy output, rescaled linearly into
                [low_action, high_action].
            low_action: lower action bound (its ``size`` is read for the
                OU state).
            high_action: upper action bound.
            stochastic: boolean tensor selecting the noisy actions.
            eps: scale applied to the OU noise (unused for Gaussian
                noise).
            theta: OU mean-reversion coefficient.
            sigma: OU noise coefficient.
            use_gaussian_noise: use IID Gaussian noise instead of OU.
            act_noise: Gaussian stddev when ``is_target`` is false.
            is_target: use ``target_noise`` as stddev and clip the sample
                to +/- ``noise_clip``.
            target_noise: Gaussian stddev when ``is_target`` is true.
            noise_clip: bound on the Gaussian sample for the target case.
            parameter_noise: when true, action noise is never selected.
        """
        action_range = high_action - low_action
        # shape is [None, dim_action]
        deterministic_actions = action_range * p_values + low_action

        if use_gaussian_noise:
            noise_stddev = target_noise if is_target else act_noise
            noise = tf.random_normal(
                tf.shape(deterministic_actions), stddev=noise_stddev)
            if is_target:
                # only the target-smoothing sample is clipped
                noise = tf.clip_by_value(noise, -noise_clip, noise_clip)
            stochastic_actions = tf.clip_by_value(
                deterministic_actions + noise, low_action, high_action)
        else:
            ou_state = tf.get_variable(
                name="ornstein_uhlenbeck",
                dtype=tf.float32,
                initializer=low_action.size * [.0],
                trainable=False)
            unit_normal = tf.random_normal(
                shape=[low_action.size], mean=0.0, stddev=1.0)
            ou_value = tf.assign_add(
                ou_state, theta * (.0 - ou_state) + sigma * unit_normal)
            stochastic_actions = tf.clip_by_value(
                deterministic_actions + eps * action_range * ou_value,
                low_action, high_action)

        use_noisy_actions = tf.logical_and(stochastic, not parameter_noise)
        self.actions = tf.cond(use_noisy_actions,
                               lambda: stochastic_actions,
                               lambda: deterministic_actions)
class QNetwork(object):
    """Critic head: hidden layers over [model features, actions].

    The single-output, activation-free final layer is exposed as
    ``self.value``; the wrapped model is kept on ``self.model``.
    """

    def __init__(self,
                 model,
                 action_inputs,
                 hiddens=[64, 64],
                 activation="relu"):
        """Build the layers.

        Args:
            model: object whose ``last_layer`` is concatenated with the
                actions along axis 1.
            action_inputs: action tensor fed to the critic.
            hiddens: sizes of the fully connected hidden layers.
            activation: name of a function looked up in ``tf.nn``.
        """
        net = tf.concat([model.last_layer, action_inputs], axis=1)
        activation_fn = tf.nn.__dict__[activation]
        for width in hiddens:
            net = layers.fully_connected(
                net, num_outputs=width, activation_fn=activation_fn)
        self.value = layers.fully_connected(
            net, num_outputs=1, activation_fn=None)
        self.model = model
class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
def __init__(self, observation_space, action_space, config):
config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
@@ -238,7 +76,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
action_space))
self.config = config
self.cur_epsilon = 1.0
self.cur_noise_scale = 1.0
self.cur_pure_exploration_phase = False
self.dim_actions = action_space.shape[0]
self.low_action = action_space.low
self.high_action = action_space.high
@@ -246,30 +85,38 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
# create global step for counting the number of update operations
self.global_step = tf.train.get_or_create_global_step()
# use separate optimizers for actor & critic
self._actor_optimizer = tf.train.AdamOptimizer(
learning_rate=self.config["actor_lr"])
self._critic_optimizer = tf.train.AdamOptimizer(
learning_rate=self.config["critic_lr"])
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
self.pure_exploration_phase = tf.placeholder(
tf.bool, (), name="pure_exploration_phase")
self.cur_observations = tf.placeholder(
tf.float32,
shape=(None, ) + observation_space.shape,
name="cur_obs")
# Actor: P (policy) network
with tf.variable_scope(POLICY_SCOPE) as scope:
p_values, self.p_model = self._build_p_network(
policy_out, self.policy_model = self._build_policy_network(
self.cur_observations, observation_space, action_space)
self.p_func_vars = _scope_vars(scope.name)
self.policy_vars = _scope_vars(scope.name)
# Noise vars for P network except for layer normalization vars
if self.config["parameter_noise"]:
self._build_parameter_noise([
var for var in self.p_func_vars if "LayerNorm" not in var.name
var for var in self.policy_vars if "LayerNorm" not in var.name
])
# Action outputs
with tf.variable_scope(ACTION_SCOPE):
self.output_actions = self._build_action_network(
p_values, self.stochastic, self.eps)
self.output_actions = self._add_exploration_noise(
policy_out, self.stochastic, self.noise_scale,
self.pure_exploration_phase, action_space)
if self.config["smooth_target_policy"]:
self.reset_noise_op = tf.no_op()
@@ -293,37 +140,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
self.importance_weights = tf.placeholder(
tf.float32, [None], name="weight")
# p network evaluation
# policy network evaluation
with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
self.p_t, _ = self._build_p_network(self.obs_t, observation_space,
action_space)
p_batchnorm_update_ops = list(
self.policy_t, _ = self._build_policy_network(
self.obs_t, observation_space, action_space)
policy_batchnorm_update_ops = list(
set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
prev_update_ops)
# target p network evaluation
# target policy network evaluation
with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space,
action_space)
target_p_func_vars = _scope_vars(scope.name)
policy_tp1, _ = self._build_policy_network(
self.obs_tp1, observation_space, action_space)
target_policy_vars = _scope_vars(scope.name)
# Action outputs
with tf.variable_scope(ACTION_SCOPE, reuse=True):
output_actions = self._build_action_network(
self.p_t,
stochastic=tf.constant(value=False, dtype=tf.bool),
eps=.0)
output_actions_estimated = self._build_action_network(
p_tp1,
stochastic=tf.constant(
value=self.config["smooth_target_policy"], dtype=tf.bool),
eps=.0,
is_target=True)
if config["smooth_target_policy"]:
target_noise_clip = self.config["target_noise_clip"]
clipped_normal_sample = tf.clip_by_value(
tf.random_normal(
tf.shape(policy_tp1),
stddev=self.config["target_noise"]),
-target_noise_clip, target_noise_clip)
policy_tp1_smoothed = tf.clip_by_value(
policy_tp1 + clipped_normal_sample, action_space.low,
action_space.high)
else:
# no smoothing, just use deterministic actions
policy_tp1_smoothed = policy_tp1
# q network evaluation
prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf.variable_scope(Q_SCOPE) as scope:
# Q-values for given actions & observations in given current state
q_t, self.q_model = self._build_q_network(
self.obs_t, observation_space, action_space, self.act_t)
self.q_func_vars = _scope_vars(scope.name)
@@ -333,8 +184,9 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
"min_q": tf.reduce_min(q_t),
}
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp0, _ = self._build_q_network(self.obs_t, observation_space,
action_space, output_actions)
# Q-values for current policy (no noise) in given current state
q_t_det_policy, _ = self._build_q_network(
self.obs_t, observation_space, action_space, self.policy_t)
if self.config["twin_q"]:
with tf.variable_scope(TWIN_Q_SCOPE) as scope:
twin_q_t, self.twin_q_model = self._build_q_network(
@@ -343,38 +195,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
q_batchnorm_update_ops = list(
set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# target q network evalution
# target q network evaluation
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space,
action_space,
output_actions_estimated)
action_space, policy_tp1_smoothed)
target_q_func_vars = _scope_vars(scope.name)
if self.config["twin_q"]:
with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
twin_q_tp1, _ = self._build_q_network(
self.obs_tp1, observation_space, action_space,
output_actions_estimated)
policy_tp1_smoothed)
twin_target_q_func_vars = _scope_vars(scope.name)
if self.config["twin_q"]:
self.loss = self._build_actor_critic_loss(
q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1)
self.critic_loss, self.actor_loss, self.td_error \
= self._build_actor_critic_loss(
q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t,
twin_q_tp1=twin_q_tp1)
else:
self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
self.critic_loss, self.actor_loss, self.td_error \
= self._build_actor_critic_loss(
q_t, q_tp1, q_t_det_policy)
if config["l2_reg"] is not None:
for var in self.p_func_vars:
for var in self.policy_vars:
if "bias" not in var.name:
self.loss.actor_loss += (
self.actor_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
for var in self.q_func_vars:
if "bias" not in var.name:
self.loss.critic_loss += (
self.critic_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
if self.config["twin_q"]:
for var in self.twin_q_func_vars:
if "bias" not in var.name:
self.loss.critic_loss += (
self.critic_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
# update_target_fn will be called periodically to copy Q network to
@@ -396,8 +251,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
var_target.assign(self.tau * var +
(1.0 - self.tau) * var_target))
for var, var_target in zip(
sorted(self.p_func_vars, key=lambda v: v.name),
sorted(target_p_func_vars, key=lambda v: v.name)):
sorted(self.policy_vars, key=lambda v: v.name),
sorted(target_policy_vars, key=lambda v: v.name)):
update_target_expr.append(
var_target.assign(self.tau * var +
(1.0 - self.tau) * var_target))
@@ -414,14 +269,15 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
]
input_dict = dict(self.loss_inputs)
# Model self-supervised losses
self.loss.actor_loss = self.p_model.custom_loss(
self.loss.actor_loss, input_dict)
self.loss.critic_loss = self.q_model.custom_loss(
self.loss.critic_loss, input_dict)
if self.config["twin_q"]:
self.loss.critic_loss = self.twin_q_model.custom_loss(
self.loss.critic_loss, input_dict)
if self.config["use_state_preprocessor"]:
# Model self-supervised losses
self.actor_loss = self.policy_model.custom_loss(
self.actor_loss, input_dict)
self.critic_loss = self.q_model.custom_loss(
self.critic_loss, input_dict)
if self.config["twin_q"]:
self.critic_loss = self.twin_q_model.custom_loss(
self.critic_loss, input_dict)
TFPolicyGraph.__init__(
self,
@@ -430,62 +286,92 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions,
loss=self.loss.actor_loss + self.loss.critic_loss,
loss=self.actor_loss + self.critic_loss,
loss_inputs=self.loss_inputs,
update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops)
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
self.sess.run(tf.global_variables_initializer())
# Note that this encompasses both the policy and Q-value networks and
# their corresponding target networks
self.variables = ray.experimental.tf_utils.TensorFlowVariables(
tf.group(q_tp0, q_tp1), self.sess)
tf.group(q_t_det_policy, q_tp1), self.sess)
# Hard initial update
self.update_target(tau=1.0)
@override(TFPolicyGraph)
def optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
# we don't use this because we have two separate optimisers
return None
@override(TFPolicyGraph)
def build_apply_op(self, optimizer, grads_and_vars):
    """Build the op that applies actor and critic gradients.

    The ``optimizer`` and ``grads_and_vars`` arguments are not read. The
    op is built from ``self._actor_optimizer`` and
    ``self._critic_optimizer``, and from the ``self._actor_grads_and_vars``
    and ``self._critic_grads_and_vars`` attributes.

    Returns:
        A grouped op. It applies the critic gradients, applies the actor
        gradients only when ``global_step`` is a multiple of the
        "policy_delay" config value, and carries a control dependency on
        incrementing ``global_step``.
    """
    # for policy gradient, update policy net one time v.s.
    # update critic net `policy_delay` time(s)
    steps_into_cycle = tf.mod(self.global_step, self.config["policy_delay"])
    actor_update_due = tf.equal(steps_into_cycle, 0)

    def apply_actor_grads():
        return self._actor_optimizer.apply_gradients(
            self._actor_grads_and_vars)

    actor_op = tf.cond(
        actor_update_due,
        true_fn=apply_actor_grads,
        false_fn=lambda: tf.no_op())
    critic_op = self._critic_optimizer.apply_gradients(
        self._critic_grads_and_vars)

    # increment global step & apply ops
    step_increment = tf.assign_add(self.global_step, 1)
    with tf.control_dependencies([step_increment]):
        return tf.group(actor_op, critic_op)
@override(TFPolicyGraph)
def gradients(self, optimizer, loss):
if self.config["grad_norm_clipping"] is not None:
actor_grads_and_vars = _minimize_and_clip(
optimizer,
self.loss.actor_loss,
var_list=self.p_func_vars,
self._actor_optimizer,
self.actor_loss,
var_list=self.policy_vars,
clip_val=self.config["grad_norm_clipping"])
critic_grads_and_vars = _minimize_and_clip(
optimizer,
self.loss.critic_loss,
self._critic_optimizer,
self.critic_loss,
var_list=self.q_func_vars + self.twin_q_func_vars
if self.config["twin_q"] else self.q_func_vars,
clip_val=self.config["grad_norm_clipping"])
else:
actor_grads_and_vars = optimizer.compute_gradients(
self.loss.actor_loss, var_list=self.p_func_vars)
critic_grads_and_vars = optimizer.compute_gradients(
self.loss.critic_loss,
var_list=self.q_func_vars + self.twin_q_func_vars
if self.config["twin_q"] else self.q_func_vars)
actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
if g is not None]
critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
if g is not None]
grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
actor_grads_and_vars = self._actor_optimizer.compute_gradients(
self.actor_loss, var_list=self.policy_vars)
if self.config["twin_q"]:
critic_vars = self.q_func_vars + self.twin_q_func_vars
else:
critic_vars = self.q_func_vars
critic_grads_and_vars = self._critic_optimizer.compute_gradients(
self.critic_loss, var_list=critic_vars)
# save these for later use in build_apply_op
self._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
if g is not None]
self._critic_grads_and_vars = [(g, v)
for (g, v) in critic_grads_and_vars
if g is not None]
grads_and_vars = self._actor_grads_and_vars \
+ self._critic_grads_and_vars
return grads_and_vars
@override(TFPolicyGraph)
def extra_compute_action_feed_dict(self):
return {
# FIXME: what about turning off exploration? Isn't that a good
# idea?
self.stochastic: True,
self.eps: self.cur_epsilon,
self.noise_scale: self.cur_noise_scale,
self.pure_exploration_phase: self.cur_pure_exploration_phase,
}
@override(TFPolicyGraph)
def extra_compute_grad_fetches(self):
return {
"td_error": self.loss.td_error,
"td_error": self.td_error,
LEARNER_STATS_KEY: self.stats,
}
@@ -499,59 +385,192 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
@override(PolicyGraph)
def get_state(self):
return [TFPolicyGraph.get_state(self), self.cur_epsilon]
return [
TFPolicyGraph.get_state(self), self.cur_noise_scale,
self.cur_pure_exploration_phase
]
@override(PolicyGraph)
def set_state(self, state):
TFPolicyGraph.set_state(self, state[0])
self.set_epsilon(state[1])
self.set_pure_exploration_phase(state[2])
def _build_q_network(self, obs, obs_space, action_space, actions):
q_net = QNetwork(
ModelCatalog.get_model({
if self.config["use_state_preprocessor"]:
q_model = ModelCatalog.get_model({
"obs": obs,
"is_training": self._get_is_training_placeholder(),
}, obs_space, action_space, 1, self.config["model"]), actions,
self.config["critic_hiddens"],
self.config["critic_hidden_activation"])
return q_net.value, q_net.model
}, obs_space, action_space, 1, self.config["model"])
q_out = tf.concat([q_model.last_layer, actions], axis=1)
else:
q_model = None
q_out = tf.concat([obs, actions], axis=1)
def _build_p_network(self, obs, obs_space, action_space):
policy_net = PolicyNetwork(
ModelCatalog.get_model({
activation = getattr(tf.nn, self.config["critic_hidden_activation"])
for hidden in self.config["critic_hiddens"]:
q_out = layers.fully_connected(
q_out, num_outputs=hidden, activation_fn=activation)
q_values = layers.fully_connected(
q_out, num_outputs=1, activation_fn=None)
return q_values, q_model
def _build_policy_network(self, obs, obs_space, action_space):
if self.config["use_state_preprocessor"]:
model = ModelCatalog.get_model({
"obs": obs,
"is_training": self._get_is_training_placeholder(),
}, obs_space, action_space, 1, self.config["model"]),
self.dim_actions, self.config["actor_hiddens"],
self.config["actor_hidden_activation"],
self.config["parameter_noise"])
return policy_net.action_scores, policy_net.model
}, obs_space, action_space, 1, self.config["model"])
action_out = model.last_layer
else:
model = None
action_out = obs
def _build_action_network(self, p_values, stochastic, eps,
is_target=False):
return ActionNetwork(
p_values, self.low_action, self.high_action, stochastic, eps,
self.config["exploration_theta"], self.config["exploration_sigma"],
self.config["smooth_target_policy"], self.config["act_noise"],
is_target, self.config["target_noise"],
self.config["noise_clip"]).actions
activation = getattr(tf.nn, self.config["actor_hidden_activation"])
normalizer_fn = layers.layer_norm if self.config["parameter_noise"] \
else None
for hidden in self.config["actor_hiddens"]:
action_out = layers.fully_connected(
action_out,
num_outputs=hidden,
activation_fn=activation,
normalizer_fn=normalizer_fn)
action_out = layers.fully_connected(
action_out, num_outputs=self.dim_actions, activation_fn=None)
# Use sigmoid to scale to [0,1], but also double magnitude of input to
# emulate behaviour of tanh activation used in DDPG and TD3 papers.
sigmoid_out = tf.nn.sigmoid(2 * action_out)
# Rescale to actual env policy scale
# (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
# get same dims)
action_range = (action_space.high - action_space.low)[None]
low_action = action_space.low[None]
actions = action_range * sigmoid_out + low_action
return actions, model
def _add_exploration_noise(self, deterministic_actions,
                           should_be_stochastic, noise_scale,
                           enable_pure_exploration, action_space):
    """Wrap the deterministic policy output with exploration behaviour.

    Args:
        deterministic_actions: noise-free policy output.
        should_be_stochastic: boolean tensor; when false (or when the
            "parameter_noise" config value is true) the deterministic
            actions are returned unchanged.
        noise_scale: multiplier applied to the exploration noise; uniform
            random actions are only used while it is greater than 0.
        enable_pure_exploration: boolean tensor selecting uniform random
            actions instead of noisy policy actions.
        action_space: space whose ``low`` / ``high`` bounds are used for
            clipping and rescaling.

    Returns:
        Tensor selecting between deterministic, noisy and uniformly
        random actions.

    Raises:
        ValueError: if the "exploration_noise_type" config value is
            neither "ou" nor "gaussian".
    """
    noise_type = self.config["exploration_noise_type"]
    low = action_space.low
    high = action_space.high
    span = action_space.high - low

    def explore():
        def noisy_policy_actions():
            # shape of deterministic_actions is [None, dim_action]
            if noise_type == "gaussian":
                # add IID Gaussian noise for exploration, TD3-style
                gaussian_noise = noise_scale * tf.random_normal(
                    tf.shape(deterministic_actions),
                    stddev=self.config["exploration_gaussian_sigma"])
                return tf.clip_by_value(
                    deterministic_actions + gaussian_noise, low, high)

            if noise_type == "ou":
                # add OU noise for exploration, DDPG-style
                ou_state = tf.get_variable(
                    name="ornstein_uhlenbeck",
                    dtype=tf.float32,
                    initializer=low.size * [.0],
                    trainable=False)
                unit_normal = tf.random_normal(
                    shape=[low.size], mean=0.0, stddev=1.0)
                ou_delta = self.config["exploration_ou_theta"] \
                    * -ou_state \
                    + self.config["exploration_ou_sigma"] * unit_normal
                ou_value = tf.assign_add(ou_state, ou_delta)
                ou_scale = self.config["exploration_ou_noise_scale"]
                ou_noise = noise_scale * ou_scale * ou_value * span
                return tf.clip_by_value(
                    deterministic_actions + ou_noise, low, high)

            raise ValueError(
                "Unknown noise type '%s' (try 'ou' or 'gaussian')" %
                noise_type)

        def uniform_random_actions():
            # pure random exploration option
            unit_samples = tf.random.uniform(
                tf.shape(deterministic_actions))
            # rescale uniform random actions according to action range
            span_const = tf.constant(span[None], dtype="float32")
            low_const = tf.constant(low[None], dtype="float32")
            return unit_samples * span_const + low_const

        # need to condition on noise_scale > 0 because zeroing
        # noise_scale is how evaluator signals no noise should be used
        # (this is ugly and should be fixed by adding an "eval_mode"
        # config flag or something)
        use_uniform = tf.logical_and(enable_pure_exploration,
                                     noise_scale > 0)
        return tf.cond(
            use_uniform,
            true_fn=uniform_random_actions,
            false_fn=noisy_policy_actions)

    enable_stochastic = tf.logical_and(should_be_stochastic,
                                       not self.config["parameter_noise"])
    return tf.cond(enable_stochastic, explore,
                   lambda: deterministic_actions)
def _build_actor_critic_loss(self,
                             q_t,
                             q_tp1,
                             q_tp0,
                             q_t_det_policy,
                             twin_q_t=None,
                             twin_q_tp1=None):
    """Build the critic loss, actor loss and TD error tensors.

    Args:
        q_t: Q-values of the sampled (obs_t, act_t) pairs; the last axis
            is squeezed away.
        q_tp1: target Q-values for the next observations.
        q_tp0: unused by this implementation; kept so existing callers
            that pass it positionally keep working.
        q_t_det_policy: Q-values of the actions chosen by the current
            deterministic policy; the actor loss is their negated mean.
        twin_q_t: second critic's Q-values for (obs_t, act_t). Only read
            when ``config["twin_q"]`` is true.
        twin_q_tp1: second critic's target Q-values. Only read when
            ``config["twin_q"]`` is true.

    Returns:
        Tuple ``(critic_loss, actor_loss, td_error)``. With twin Q enabled,
        ``td_error`` is the sum of both critics' TD errors.
    """
    twin_q = self.config["twin_q"]
    gamma = self.config["gamma"]
    n_step = self.config["n_step"]
    use_huber = self.config["use_huber"]
    huber_threshold = self.config["huber_threshold"]

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        # Clipped double-Q: bootstrap from the smaller of the two targets.
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    # Terminal transitions contribute no bootstrapped value.
    q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = tf.stop_gradient(
        self.rew_t + gamma**n_step * q_tp1_best_masked)

    # compute the error (potentially clipped)
    td_error = q_t_selected - q_t_selected_target
    if twin_q:
        twin_td_error = twin_q_t_selected - q_t_selected_target
        # Each critic is penalised on its OWN TD error; the two errors are
        # only summed afterwards for the value reported to the caller.
        if use_huber:
            errors = _huber_loss(td_error, huber_threshold) \
                + _huber_loss(twin_td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                twin_td_error)
        td_error = td_error + twin_td_error
    else:
        if use_huber:
            errors = _huber_loss(td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(td_error)

    critic_loss = tf.reduce_mean(self.importance_weights * errors)
    actor_loss = -tf.reduce_mean(q_t_det_policy)
    return critic_loss, actor_loss, td_error
def _build_parameter_noise(self, pnet_params):
self.parameter_noise_sigma_val = self.config["exploration_sigma"]
self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
self.parameter_noise_sigma = tf.get_variable(
initializer=tf.constant_initializer(
self.parameter_noise_sigma_val),
@@ -590,7 +609,7 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
importance_weights):
td_err = self.sess.run(
self.loss.td_error,
self.td_error,
feed_dict={
self.obs_t: [np.array(ob) for ob in obs_t],
self.act_t: act_t,
@@ -610,9 +629,16 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
# support both hard and soft sync
def update_target(self, tau=None):
    """Run the target-network update op and return its result.

    Args:
        tau: value fed to the ``self.tau`` placeholder. When it is falsy
            (``None``, or ``0``) the policy's ``self.tau_value`` is used
            instead.

    Returns:
        Whatever ``self.sess.run`` returns for ``self.update_target_expr``.
    """
    tau = tau or self.tau_value
    return self.sess.run(
        self.update_target_expr, feed_dict={self.tau: tau})
def set_epsilon(self, epsilon):
    """Store the exploration noise scale on this policy.

    Args:
        epsilon: new noise scale; written to both ``cur_epsilon`` and
            ``cur_noise_scale``.
    """
    # NOTE(review): cur_epsilon looks like the pre-rename attribute name;
    # confirm whether anything still reads it.
    self.cur_epsilon = epsilon
    # set_epsilon is called by optimizer to anneal exploration as
    # necessary, and to turn it off during evaluation. The "epsilon" part
    # is a carry-over from DQN, which uses epsilon-greedy exploration
    # rather than adding action noise to the output of a policy network.
    self.cur_noise_scale = epsilon
def set_pure_exploration_phase(self, pure_exploration_phase):
    """Store the pure-exploration-phase flag on this policy.

    Args:
        pure_exploration_phase: value written to
            ``cur_pure_exploration_phase``.
            (presumably a bool that is later fed into the action graph --
            TODO confirm against the caller)
    """
    self.cur_pure_exploration_phase = pure_exploration_phase
+57
View File
@@ -0,0 +1,57 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
# Default configuration for TD3Trainer: DDPG_CONFIG combined with the
# TD3-specific settings below via merge_dicts.
TD3_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,
    {
        # largest changes: twin Q functions, delayed policy updates, and target
        # smoothing
        "twin_q": True,
        "policy_delay": 2,
        "smooth_target_policy": True,
        "target_noise": 0.2,
        "target_noise_clip": 0.5,

        # other changes & things we want to keep fixed: IID Gaussian
        # exploration noise, larger actor learning rate, no l2 regularisation,
        # no Huber loss, etc.
        "exploration_should_anneal": False,
        "exploration_noise_type": "gaussian",
        "exploration_gaussian_sigma": 0.1,
        "learning_starts": 10000,
        "pure_exploration_steps": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
        "gamma": 0.99,
        "actor_lr": 1e-3,
        "critic_lr": 1e-3,
        "l2_reg": 0.0,
        "tau": 5e-3,
        "train_batch_size": 100,
        "use_huber": False,
        "target_network_update_freq": 0,
        # single local worker with a plain (non-prioritized) replay buffer
        "optimizer_class": "SyncReplayOptimizer",
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "per_worker_exploration": False,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    },
)
class TD3Trainer(DDPGTrainer):
    """A more stable successor to DDPG. By default, this uses a near-identical
    configuration to that reported in the TD3 paper."""

    # Name of this trainer; set to "TD3".
    _name = "TD3"
    # Defaults: the DDPG config merged with the TD3 settings defined above.
    _default_config = TD3_DEFAULT_CONFIG
+6
View File
@@ -34,6 +34,11 @@ def _import_apex_ddpg():
return ddpg.ApexDDPGTrainer
def _import_td3():
    """Import the ddpg agents package on demand and return ``TD3Trainer``."""
    # Imported inside the function so the package is only loaded when this
    # function is called.
    from ray.rllib.agents import ddpg
    return ddpg.TD3Trainer
def _import_ppo():
    """Import the ppo agents package on demand and return ``PPOTrainer``."""
    # Imported inside the function so the package is only loaded when this
    # function is called.
    from ray.rllib.agents import ppo
    return ppo.PPOTrainer
@@ -87,6 +92,7 @@ def _import_marwil():
ALGORITHMS = {
"DDPG": _import_ddpg,
"APEX_DDPG": _import_apex_ddpg,
"TD3": _import_td3,
"PPO": _import_ppo,
"ES": _import_es,
"ARS": _import_ars,
@@ -40,7 +40,8 @@ CONFIGS = {
},
},
"DDPG": {
"noise_scale": 0.0,
"pure_exploration_steps": 0,
"exploration_ou_noise_scale": 0.0,
"timesteps_per_iteration": 100
},
"PPO": {
@@ -116,8 +116,9 @@ class ModelSupportedSpaces(unittest.TestCase):
check_support("APPO", {"num_gpus": 0, "vtrace": False}, stats)
check_support(
"DDPG", {
"noise_scale": 100.0,
"timesteps_per_iteration": 1
"exploration_ou_noise_scale": 100.0,
"timesteps_per_iteration": 1,
"use_state_preprocessor": True,
},
stats,
check_bounds=True)
@@ -188,6 +189,7 @@ class ModelSupportedSpaces(unittest.TestCase):
"min_iter_time_s": 1,
"learning_starts": 1000,
"target_network_update_freq": 100,
"use_state_preprocessor": True,
})
check_support_multiagent("IMPALA", {"num_gpus": 0})
check_support_multiagent("DQN", {"timesteps_per_iteration": 1})
@@ -206,7 +208,10 @@ class ModelSupportedSpaces(unittest.TestCase):
"sgd_minibatch_size": 1,
})
check_support_multiagent("PG", {"num_workers": 1, "optimizer": {}})
check_support_multiagent("DDPG", {"timesteps_per_iteration": 1})
check_support_multiagent("DDPG", {
"timesteps_per_iteration": 1,
"use_state_preprocessor": True,
})
if __name__ == "__main__":
@@ -15,13 +15,14 @@ halfcheetah-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
timesteps_per_iteration: 1000
exploration_fraction: 0.1
exploration_final_eps: 0.02
noise_scale: 0.1
exploration_theta: 0.15
exploration_sigma: 0.2
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.1
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
@@ -34,9 +35,8 @@ halfcheetah-ddpg:
clip_rewards: False
# === Optimization ===
lr: 0.001
actor_loss_coeff: 0.1
critic_loss_coeff: 1.0
actor_lr: 0.001
critic_lr: 0.001
use_huber: False
huber_threshold: 1.0
l2_reg: 0.000001
@@ -50,3 +50,7 @@ halfcheetah-ddpg:
optimizer_class: "SyncReplayOptimizer"
per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===
evaluation_interval: 5
evaluation_num_episodes: 10
@@ -0,0 +1,22 @@
invertedpendulum-td3:
# This is a TD3 configuration with stopping conditions and network size tuned specifically
# for InvertedPendulum. Should be able to reach 1,000 reward (the maximum
# achievable) in 10,000 to 20,000 steps.
env: InvertedPendulum-v2
run: TD3
stop:
episode_reward_mean: 9999.9
time_total_s: 900 # 15 minutes
timesteps_total: 1000000
config:
# === Model ===
actor_hiddens: [32, 32]
critic_hiddens: [32, 32]
# === Exploration ===
learning_starts: 1000
pure_exploration_steps: 1000
# === Evaluation ===
evaluation_interval: 1
evaluation_num_episodes: 5
@@ -7,7 +7,9 @@ mountaincarcontinuous-apex-ddpg:
config:
clip_rewards: False
num_workers: 16
noise_scale: 1.0
exploration_ou_noise_scale: 1.0
n_step: 3
target_network_update_freq: 50000
tau: 1.0
evaluation_interval: 5
evaluation_num_episodes: 10
@@ -15,13 +15,14 @@ mountaincarcontinuous-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
timesteps_per_iteration: 1000
exploration_fraction: 0.4
exploration_final_eps: 0.02
noise_scale: 0.75
exploration_theta: 0.15
exploration_sigma: 0.2
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.75
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.01
@@ -34,9 +35,8 @@ mountaincarcontinuous-ddpg:
clip_rewards: False
# === Optimization ===
lr: 0.001
actor_loss_coeff: 0.1
critic_loss_coeff: 1.0
actor_lr: 0.001
critic_lr: 0.001
use_huber: False
huber_threshold: 1.0
l2_reg: 0.00001
@@ -50,3 +50,7 @@ mountaincarcontinuous-ddpg:
optimizer_class: "SyncReplayOptimizer"
per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===
evaluation_interval: 5
evaluation_num_episodes: 10
@@ -0,0 +1,24 @@
mujoco-td3:
# Solve latest versions of the four hardest Mujoco tasks benchmarked in the
# original TD3 paper. Average return over 10 trials at end of 1,000,000
# timesteps (taken from Table 2 of the paper) are given in parens at the end
# of each environment name.
#
# Paper is at https://arxiv.org/pdf/1802.09477.pdf
env:
grid_search:
- HalfCheetah-v2 # (9,532.99)
- Hopper-v2 # (3,304.75)
- Walker2d-v2 # (4,565.24)
- Ant-v2 # (4,185.06)
run: TD3
stop:
timesteps_total: 1000000
config:
# === Exploration ===
learning_starts: 10000
pure_exploration_steps: 10000
# === Evaluation ===
evaluation_interval: 5
evaluation_num_episodes: 10
@@ -11,3 +11,5 @@ pendulum-apex-ddpg:
n_step: 1
target_network_update_freq: 50000
tau: 1.0
evaluation_interval: 5
evaluation_num_episodes: 10
@@ -15,13 +15,14 @@ pendulum-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
timesteps_per_iteration: 600
exploration_fraction: 0.1
exploration_final_eps: 0.02
noise_scale: 0.1
exploration_theta: 0.15
exploration_sigma: 0.2
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.1
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
@@ -34,9 +35,8 @@ pendulum-ddpg:
clip_rewards: False
# === Optimization ===
lr: 0.001
actor_loss_coeff: 0.1
critic_loss_coeff: 1.0
actor_lr: 0.001
critic_lr: 0.001
use_huber: True
huber_threshold: 1.0
l2_reg: 0.000001
@@ -50,3 +50,7 @@ pendulum-ddpg:
optimizer_class: "SyncReplayOptimizer"
per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===
evaluation_interval: 5
evaluation_num_episodes: 10
@@ -1,60 +1,19 @@
# This configuration can expect to reach -160 reward in 10k-20k timesteps
pendulum-ddpg:
env: Pendulum-v0
run: DDPG
run: TD3
stop:
episode_reward_mean: -160
time_total_s: 600 # 10 minutes
episode_reward_mean: -130
time_total_s: 900 # 15 minutes
config:
# === Tricks ===
twin_q: True
policy_delay: 2
smooth_target_policy: True
act_noise: 0.1
target_noise: 0.2
noise_clip: 0.5
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
n_step: 1
model: {}
gamma: 0.99
env_config: {}
# === Exploration ===
schedule_max_timesteps: 100000
timesteps_per_iteration: 600
exploration_fraction: 0.1
exploration_final_eps: 0.02
noise_scale: 0.1
exploration_theta: 0.15
exploration_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
learning_starts: 5000
pure_exploration_steps: 5000
# === Replay buffer ===
buffer_size: 10000
prioritized_replay: True
prioritized_replay_alpha: 0.6
prioritized_replay_beta: 0.4
prioritized_replay_eps: 0.000001
clip_rewards: False
# === Optimization ===
lr: 0.001
actor_loss_coeff: 0.1
critic_loss_coeff: 1.0
use_huber: True
huber_threshold: 1.0
l2_reg: 0.000001
learning_starts: 500
sample_batch_size: 1
train_batch_size: 64
# === Parallelism ===
num_workers: 0
num_gpus_per_worker: 0
optimizer_class: "SyncReplayOptimizer"
per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===
evaluation_interval: 1
evaluation_num_episodes: 5