[rllib] TD3/DDPG improvements and MuJoCo benchmarks (#4694)

* [rllib] Separate optimisers for DDPG actor & crit. * [rllib] Better names for DDPG variables & options Config changes: - noise_scale -> exploration_ou_noise_scale - exploration_theta -> exploration_ou_theta - exploration_sigma -> exploration_ou_sigma - act_noise -> exploration_gaussian_sigma - noise_clip -> target_noise_clip * [rllib] Make DDPG less class-y Used functions to replace three classes with only an __init__ method & a handful of unrelated attributes. * [rllib] Refactor DDPG noise * [rllib] Unify DDPG exploration annealing Added option "exploration_should_anneal" to enable linear annealing of exploration noise. By default this is off, for consistency with DDPG & TD3 papers. Also renamed "exploration_final_eps" to "exploration_final_scale" (that name seems to have been carried over from DQN, and doesn't really make sense here). Finally, tried to rename "eps" to "noise_scale" wherever possible.
2026-06-29 03:04:28 +08:00 · 2019-04-26 17:49:53 -07:00
parent 05c896d6f7
commit 663e92ab3f
16 changed files with 557 additions and 398 deletions
@@ -4,6 +4,7 @@ from __future__ import print_function

 from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
 from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
+from ray.rllib.agents.ddpg.td3 import TD3Trainer
 from ray.rllib.utils import renamed_class

 ApexDDPGAgent = renamed_class(ApexDDPGTrainer)
@@ -11,5 +12,5 @@ DDPGAgent = renamed_class(DDPGTrainer)

 __all__ = [
    "DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
-    "DEFAULT_CONFIG"
+    "TD3Trainer", "DEFAULT_CONFIG"
 ]
@@ -13,19 +13,21 @@ from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
 DEFAULT_CONFIG = with_common_config({
    # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
    # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
+    # In addition to settings below, you can use "exploration_noise_type" and
+    # "exploration_gaussian_sigma" to get IID Gaussian exploration noise
+    # instead of OU exploration noise.
    # twin Q-net
    "twin_q": False,
    # delayed policy update
    "policy_delay": 1,
    # target policy smoothing
-    # this also forces the use of gaussian instead of OU noise for exploration
+    # (this also replaces OU exploration noise with IID Gaussian exploration
+    # noise, for now)
    "smooth_target_policy": False,
-    # gaussian stddev of act noise
-    "act_noise": 0.1,
-    # gaussian stddev of target noise
+    # gaussian stddev of target action noise for smoothing
    "target_noise": 0.2,
    # target noise limit (bound)
-    "noise_clip": 0.5,
+    "target_noise_clip": 0.5,

    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
@@ -37,42 +39,64 @@ DEFAULT_CONFIG = with_common_config({
    "evaluation_num_episodes": 10,

    # === Model ===
-    # Postprocess the policy network model output with these hidden layers
-    "actor_hiddens": [64, 64],
-    # Hidden layers activation of the policy network
+    # Apply a state preprocessor with spec given by the "model" config option
+    # (like other RL algorithms). This is mostly useful if you have a weird
+    # observation shape, like an image. Disabled by default.
+    "use_state_preprocessor": False,
+    # Postprocess the policy network model output with these hidden layers. If
+    # use_state_preprocessor is False, then these will be the *only* hidden
+    # layers in the network.
+    "actor_hiddens": [400, 300],
+    # Hidden layers activation of the postprocessing stage of the policy
+    # network
    "actor_hidden_activation": "relu",
-    # Postprocess the critic network model output with these hidden layers
-    "critic_hiddens": [64, 64],
-    # Hidden layers activation of the critic network
+    # Postprocess the critic network model output with these hidden layers;
+    # again, if use_state_preprocessor is True, then the state will be
+    # preprocessed by the model specified with the "model" config option first.
+    "critic_hiddens": [400, 300],
+    # Hidden layers activation of the postprocessing stage of the critic.
    "critic_hidden_activation": "relu",
    # N-step Q learning
    "n_step": 1,

    # === Exploration ===
-    # Max num timesteps for annealing schedules. Exploration is annealed from
-    # 1.0 to exploration_fraction over this number of timesteps scaled by
-    # exploration_fraction
+    # Turns on annealing schedule for exploration noise. Exploration is
+    # annealed from 1.0 to exploration_final_scale over schedule_max_timesteps
+    # scaled by exploration_fraction. Original DDPG and TD3 papers do not
+    # anneal noise, so this is False by default.
+    "exploration_should_anneal": False,
+    # Max num timesteps for annealing schedules.
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
-    # Final value of random action probability
-    "exploration_final_eps": 0.02,
-    # OU-noise scale
-    "noise_scale": 0.1,
-    # theta
-    "exploration_theta": 0.15,
-    # sigma
-    "exploration_sigma": 0.2,
-    # Update the target network every `target_network_update_freq` steps.
-    "target_network_update_freq": 0,
-    # Update the target by \tau * policy + (1-\tau) * target_policy
-    "tau": 0.002,
+    # Final scaling multiplier for action noise (initial is 1.0)
+    "exploration_final_scale": 0.02,
+    # valid values: "ou" (time-correlated, like original DDPG paper),
+    # "gaussian" (IID, like TD3 paper)
+    "exploration_noise_type": "ou",
+    # OU-noise scale; this can be used to scale down magnitude of OU noise
+    # before adding to actions (requires "exploration_noise_type" to be "ou")
+    "exploration_ou_noise_scale": 0.1,
+    # theta for OU
+    "exploration_ou_theta": 0.15,
+    # sigma for OU
+    "exploration_ou_sigma": 0.2,
+    # gaussian stddev of act noise for exploration (requires
+    # "exploration_noise_type" to be "gaussian")
+    "exploration_gaussian_sigma": 0.1,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    "parameter_noise": False,
+    # Until this many timesteps have elapsed, the agent's policy will be
+    # ignored & it will instead take uniform random actions. Can be used in
+    # conjunction with learning_starts (which controls when the first
+    # optimization step happens) to decrease dependence of exploration &
+    # optimization on initial policy parameters. Note that this will be
+    # disabled when the action noise scale is set to 0 (e.g during evaluation).
+    "pure_exploration_steps": 1000,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
@@ -90,11 +114,14 @@ DEFAULT_CONFIG = with_common_config({
    "compress_observations": False,

    # === Optimization ===
-    # Learning rate for adam optimizer.
-    # Instead of using two optimizers, we use two different loss coefficients
-    "lr": 1e-3,
-    "actor_loss_coeff": 0.1,
-    "critic_loss_coeff": 1.0,
+    # Learning rate for the critic (Q-function) optimizer.
+    "critic_lr": 1e-3,
+    # Learning rate for the actor (policy) optimizer.
+    "actor_lr": 1e-3,
+    # Update the target network every `target_network_update_freq` steps.
+    "target_network_update_freq": 0,
+    # Update the target by \tau * policy + (1-\tau) * target_policy
+    "tau": 0.002,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
@@ -117,7 +144,7 @@ DEFAULT_CONFIG = with_common_config({
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
-    # you"re using the Async or Ape-X optimizers.
+    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
@@ -138,26 +165,41 @@ class DDPGTrainer(DQNTrainer):
    _default_config = DEFAULT_CONFIG
    _policy_graph = DDPGPolicyGraph

+    @override(DQNTrainer)
+    def _train(self):
+        pure_expl_steps = self.config["pure_exploration_steps"]
+        if pure_expl_steps:
+            # tell workers whether they should do pure exploration
+            only_explore = self.global_timestep < pure_expl_steps
+            self.local_evaluator.foreach_trainable_policy(
+                lambda p, _: p.set_pure_exploration_phase(only_explore))
+            for e in self.remote_evaluators:
+                e.foreach_trainable_policy.remote(
+                    lambda p, _: p.set_pure_exploration_phase(only_explore))
+        return super(DDPGTrainer, self)._train()
+
    @override(DQNTrainer)
    def _make_exploration_schedule(self, worker_index):
-        # Override DQN's schedule to take into account `noise_scale`
+        # Override DQN's schedule to return a scaling multiplier for the
+        # exploration noise
        if self.config["per_worker_exploration"]:
            assert self.config["num_workers"] > 1, \
                "This requires multiple workers"
            if worker_index >= 0:
-                exponent = (
-                    1 +
-                    worker_index / float(self.config["num_workers"] - 1) * 7)
-                return ConstantSchedule(
-                    self.config["noise_scale"] * 0.4**exponent)
+                # FIXME: what do magic constants mean? (0.4, 7)
+                max_index = float(self.config["num_workers"] - 1)
+                exponent = 1 + worker_index / max_index * 7
+                return ConstantSchedule(0.4**exponent)
            else:
                # local ev should have zero exploration so that eval rollouts
                # run properly
                return ConstantSchedule(0.0)
-        else:
+        elif self.config["exploration_should_anneal"]:
            return LinearSchedule(
                schedule_timesteps=int(self.config["exploration_fraction"] *
                                       self.config["schedule_max_timesteps"]),
-                initial_p=self.config["noise_scale"] * 1.0,
-                final_p=self.config["noise_scale"] *
-                self.config["exploration_final_eps"])
+                initial_p=1.0,
+                final_p=self.config["exploration_final_scale"])
+        else:
+            # *always* add exploration noise
+            return ConstantSchedule(1.0)
@@ -19,80 +19,18 @@ from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.evaluation.policy_graph import PolicyGraph
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph

-ACTION_SCOPE = "a_func"
-POLICY_SCOPE = "p_func"
-POLICY_TARGET_SCOPE = "target_p_func"
-Q_SCOPE = "q_func"
-Q_TARGET_SCOPE = "target_q_func"
-TWIN_Q_SCOPE = "twin_q_func"
-TWIN_Q_TARGET_SCOPE = "twin_target_q_func"
+ACTION_SCOPE = "action"
+POLICY_SCOPE = "policy"
+POLICY_TARGET_SCOPE = "target_policy"
+Q_SCOPE = "critic"
+Q_TARGET_SCOPE = "target_critic"
+TWIN_Q_SCOPE = "twin_critic"
+TWIN_Q_TARGET_SCOPE = "twin_target_critic"

 # Importance sampling weights for prioritized replay
 PRIO_WEIGHTS = "weights"


-class ActorCriticLoss(object):
-    def __init__(self,
-                 q_t,
-                 q_tp1,
-                 q_tp0,
-                 importance_weights,
-                 rewards,
-                 done_mask,
-                 twin_q_t,
-                 twin_q_tp1,
-                 actor_loss_coeff=0.1,
-                 critic_loss_coeff=1.0,
-                 gamma=0.99,
-                 n_step=1,
-                 use_huber=False,
-                 huber_threshold=1.0,
-                 twin_q=False,
-                 policy_delay=1):
-
-        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
-        if twin_q:
-            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
-            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
-
-        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
-        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
-
-        # compute RHS of bellman equation
-        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
-
-        # compute the error (potentially clipped)
-        if twin_q:
-            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
-            twin_td_error = twin_q_t_selected - tf.stop_gradient(
-                q_t_selected_target)
-            self.td_error = td_error + twin_td_error
-            if use_huber:
-                errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
-                    twin_td_error, huber_threshold)
-            else:
-                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
-                    twin_td_error)
-        else:
-            self.td_error = (
-                q_t_selected - tf.stop_gradient(q_t_selected_target))
-            if use_huber:
-                errors = _huber_loss(self.td_error, huber_threshold)
-            else:
-                errors = 0.5 * tf.square(self.td_error)
-
-        self.critic_loss = critic_loss_coeff * tf.reduce_mean(
-            importance_weights * errors)
-
-        # for policy gradient, update policy net one time v.s.
-        # update critic net `policy_delay` time(s)
-        global_step = tf.train.get_or_create_global_step()
-        policy_delay_mask = tf.to_float(
-            tf.equal(tf.mod(global_step, policy_delay), 0))
-        self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
-                           tf.reduce_mean(q_tp0))
-
-
 class DDPGPostprocessing(object):
    """Implements n-step learning and param noise adjustments."""

@@ -113,12 +51,13 @@ class DDPGPostprocessing(object):
                feed_dict={
                    self.cur_observations: states,
                    self.stochastic: False,
-                    self.eps: .0
+                    self.noise_scale: .0,
+                    self.pure_exploration_phase: False,
                })
            distance_in_action_space = np.sqrt(
                np.mean(np.square(clean_actions - noisy_actions)))
            self.pi_distance = distance_in_action_space
-            if distance_in_action_space < self.config["exploration_sigma"]:
+            if distance_in_action_space < self.config["exploration_ou_sigma"]:
                self.parameter_noise_sigma_val *= 1.01
            else:
                self.parameter_noise_sigma_val /= 1.01
@@ -128,107 +67,6 @@ class DDPGPostprocessing(object):
        return _postprocess_dqn(self, sample_batch)


-class PolicyNetwork(object):
-    """Maps an observations (i.e., state) to an action where each entry takes
-    value from (0, 1) due to the sigmoid function."""
-
-    def __init__(self,
-                 model,
-                 dim_actions,
-                 hiddens=[64, 64],
-                 activation="relu",
-                 parameter_noise=False):
-        action_out = model.last_layer
-        activation = tf.nn.__dict__[activation]
-        for hidden in hiddens:
-            action_out = layers.fully_connected(
-                action_out,
-                num_outputs=hidden,
-                activation_fn=activation,
-                normalizer_fn=layers.layer_norm if parameter_noise else None)
-        # Use sigmoid layer to bound values within (0, 1)
-        # shape of action_scores is [batch_size, dim_actions]
-        self.action_scores = layers.fully_connected(
-            action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
-        self.model = model
-
-
-class ActionNetwork(object):
-    """Acts as a stochastic policy for inference, but a deterministic policy
-    for training, thus ignoring the batch_size issue when constructing a
-    stochastic action."""
-
-    def __init__(self,
-                 p_values,
-                 low_action,
-                 high_action,
-                 stochastic,
-                 eps,
-                 theta=0.15,
-                 sigma=0.2,
-                 use_gaussian_noise=False,
-                 act_noise=0.1,
-                 is_target=False,
-                 target_noise=0.2,
-                 noise_clip=0.5,
-                 parameter_noise=False):
-
-        # shape is [None, dim_action]
-        deterministic_actions = (
-            (high_action - low_action) * p_values + low_action)
-
-        if use_gaussian_noise:
-            if is_target:
-                normal_sample = tf.random_normal(
-                    tf.shape(deterministic_actions), stddev=target_noise)
-                normal_sample = tf.clip_by_value(normal_sample, -noise_clip,
-                                                 noise_clip)
-                stochastic_actions = tf.clip_by_value(
-                    deterministic_actions + normal_sample, low_action,
-                    high_action)
-            else:
-                normal_sample = tf.random_normal(
-                    tf.shape(deterministic_actions), stddev=act_noise)
-                stochastic_actions = tf.clip_by_value(
-                    deterministic_actions + normal_sample, low_action,
-                    high_action)
-        else:
-            exploration_sample = tf.get_variable(
-                name="ornstein_uhlenbeck",
-                dtype=tf.float32,
-                initializer=low_action.size * [.0],
-                trainable=False)
-            normal_sample = tf.random_normal(
-                shape=[low_action.size], mean=0.0, stddev=1.0)
-            exploration_value = tf.assign_add(
-                exploration_sample,
-                theta * (.0 - exploration_sample) + sigma * normal_sample)
-            stochastic_actions = tf.clip_by_value(
-                deterministic_actions +
-                eps * (high_action - low_action) * exploration_value,
-                low_action, high_action)
-
-        self.actions = tf.cond(
-            tf.logical_and(stochastic, not parameter_noise),
-            lambda: stochastic_actions, lambda: deterministic_actions)
-
-
-class QNetwork(object):
-    def __init__(self,
-                 model,
-                 action_inputs,
-                 hiddens=[64, 64],
-                 activation="relu"):
-        q_out = tf.concat([model.last_layer, action_inputs], axis=1)
-        activation = tf.nn.__dict__[activation]
-        for hidden in hiddens:
-            q_out = layers.fully_connected(
-                q_out, num_outputs=hidden, activation_fn=activation)
-        self.value = layers.fully_connected(
-            q_out, num_outputs=1, activation_fn=None)
-        self.model = model
-
-
 class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
@@ -238,7 +76,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
                    action_space))

        self.config = config
-        self.cur_epsilon = 1.0
+        self.cur_noise_scale = 1.0
+        self.cur_pure_exploration_phase = False
        self.dim_actions = action_space.shape[0]
        self.low_action = action_space.low
        self.high_action = action_space.high
@@ -246,30 +85,38 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
        # create global step for counting the number of update operations
        self.global_step = tf.train.get_or_create_global_step()

+        # use separate optimizers for actor & critic
+        self._actor_optimizer = tf.train.AdamOptimizer(
+            learning_rate=self.config["actor_lr"])
+        self._critic_optimizer = tf.train.AdamOptimizer(
+            learning_rate=self.config["critic_lr"])
+
        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
-        self.eps = tf.placeholder(tf.float32, (), name="eps")
+        self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
+        self.pure_exploration_phase = tf.placeholder(
+            tf.bool, (), name="pure_exploration_phase")
        self.cur_observations = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="cur_obs")

-        # Actor: P (policy) network
        with tf.variable_scope(POLICY_SCOPE) as scope:
-            p_values, self.p_model = self._build_p_network(
+            policy_out, self.policy_model = self._build_policy_network(
                self.cur_observations, observation_space, action_space)
-            self.p_func_vars = _scope_vars(scope.name)
+            self.policy_vars = _scope_vars(scope.name)

        # Noise vars for P network except for layer normalization vars
        if self.config["parameter_noise"]:
            self._build_parameter_noise([
-                var for var in self.p_func_vars if "LayerNorm" not in var.name
+                var for var in self.policy_vars if "LayerNorm" not in var.name
            ])

        # Action outputs
        with tf.variable_scope(ACTION_SCOPE):
-            self.output_actions = self._build_action_network(
-                p_values, self.stochastic, self.eps)
+            self.output_actions = self._add_exploration_noise(
+                policy_out, self.stochastic, self.noise_scale,
+                self.pure_exploration_phase, action_space)

        if self.config["smooth_target_policy"]:
            self.reset_noise_op = tf.no_op()
@@ -293,37 +140,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

-        # p network evaluation
+        # policy network evaluation
        with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
            prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
-            self.p_t, _ = self._build_p_network(self.obs_t, observation_space,
-                                                action_space)
-            p_batchnorm_update_ops = list(
+            self.policy_t, _ = self._build_policy_network(
+                self.obs_t, observation_space, action_space)
+            policy_batchnorm_update_ops = list(
                set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
                prev_update_ops)

-        # target p network evaluation
+        # target policy network evaluation
        with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
-            p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space,
-                                             action_space)
-            target_p_func_vars = _scope_vars(scope.name)
+            policy_tp1, _ = self._build_policy_network(
+                self.obs_tp1, observation_space, action_space)
+            target_policy_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(ACTION_SCOPE, reuse=True):
-            output_actions = self._build_action_network(
-                self.p_t,
-                stochastic=tf.constant(value=False, dtype=tf.bool),
-                eps=.0)
-            output_actions_estimated = self._build_action_network(
-                p_tp1,
-                stochastic=tf.constant(
-                    value=self.config["smooth_target_policy"], dtype=tf.bool),
-                eps=.0,
-                is_target=True)
+            if config["smooth_target_policy"]:
+                target_noise_clip = self.config["target_noise_clip"]
+                clipped_normal_sample = tf.clip_by_value(
+                    tf.random_normal(
+                        tf.shape(policy_tp1),
+                        stddev=self.config["target_noise"]),
+                    -target_noise_clip, target_noise_clip)
+                policy_tp1_smoothed = tf.clip_by_value(
+                    policy_tp1 + clipped_normal_sample, action_space.low,
+                    action_space.high)
+            else:
+                # no smoothing, just use deterministic actions
+                policy_tp1_smoothed = policy_tp1

        # q network evaluation
        prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        with tf.variable_scope(Q_SCOPE) as scope:
+            # Q-values for given actions in given current state
            q_t, self.q_model = self._build_q_network(
                self.obs_t, observation_space, action_space, self.act_t)
            self.q_func_vars = _scope_vars(scope.name)
@@ -333,8 +184,9 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
            "min_q": tf.reduce_min(q_t),
        }
        with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_tp0, _ = self._build_q_network(self.obs_t, observation_space,
-                                             action_space, output_actions)
+            # Q-values for current policy (no noise) in given current state
+            q_t_det_policy, _ = self._build_q_network(
+                self.obs_t, observation_space, action_space, self.policy_t)
        if self.config["twin_q"]:
            with tf.variable_scope(TWIN_Q_SCOPE) as scope:
                twin_q_t, self.twin_q_model = self._build_q_network(
@@ -343,38 +195,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
        q_batchnorm_update_ops = list(
            set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

-        # target q network evalution
+        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space,
-                                             action_space,
-                                             output_actions_estimated)
+                                             action_space, policy_tp1_smoothed)
            target_q_func_vars = _scope_vars(scope.name)
        if self.config["twin_q"]:
            with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
                twin_q_tp1, _ = self._build_q_network(
                    self.obs_tp1, observation_space, action_space,
-                    output_actions_estimated)
+                    policy_tp1_smoothed)
                twin_target_q_func_vars = _scope_vars(scope.name)

        if self.config["twin_q"]:
-            self.loss = self._build_actor_critic_loss(
-                q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1)
+            self.critic_loss, self.actor_loss, self.td_error \
+                = self._build_actor_critic_loss(
+                    q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t,
+                    twin_q_tp1=twin_q_tp1)
        else:
-            self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
+            self.critic_loss, self.actor_loss, self.td_error \
+                = self._build_actor_critic_loss(
+                    q_t, q_tp1, q_t_det_policy)

        if config["l2_reg"] is not None:
-            for var in self.p_func_vars:
+            for var in self.policy_vars:
                if "bias" not in var.name:
-                    self.loss.actor_loss += (
+                    self.actor_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
            for var in self.q_func_vars:
                if "bias" not in var.name:
-                    self.loss.critic_loss += (
+                    self.critic_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
            if self.config["twin_q"]:
                for var in self.twin_q_func_vars:
                    if "bias" not in var.name:
-                        self.loss.critic_loss += (
+                        self.critic_loss += (
                            config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))

        # update_target_fn will be called periodically to copy Q network to
@@ -396,8 +251,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
                    var_target.assign(self.tau * var +
                                      (1.0 - self.tau) * var_target))
        for var, var_target in zip(
-                sorted(self.p_func_vars, key=lambda v: v.name),
-                sorted(target_p_func_vars, key=lambda v: v.name)):
+                sorted(self.policy_vars, key=lambda v: v.name),
+                sorted(target_policy_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
@@ -414,14 +269,15 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
        ]
        input_dict = dict(self.loss_inputs)

-        # Model self-supervised losses
-        self.loss.actor_loss = self.p_model.custom_loss(
-            self.loss.actor_loss, input_dict)
-        self.loss.critic_loss = self.q_model.custom_loss(
-            self.loss.critic_loss, input_dict)
-        if self.config["twin_q"]:
-            self.loss.critic_loss = self.twin_q_model.custom_loss(
-                self.loss.critic_loss, input_dict)
+        if self.config["use_state_preprocessor"]:
+            # Model self-supervised losses
+            self.actor_loss = self.policy_model.custom_loss(
+                self.actor_loss, input_dict)
+            self.critic_loss = self.q_model.custom_loss(
+                self.critic_loss, input_dict)
+            if self.config["twin_q"]:
+                self.critic_loss = self.twin_q_model.custom_loss(
+                    self.critic_loss, input_dict)

        TFPolicyGraph.__init__(
            self,
@@ -430,62 +286,92 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
            self.sess,
            obs_input=self.cur_observations,
            action_sampler=self.output_actions,
-            loss=self.loss.actor_loss + self.loss.critic_loss,
+            loss=self.actor_loss + self.critic_loss,
            loss_inputs=self.loss_inputs,
-            update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops)
+            update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
        self.sess.run(tf.global_variables_initializer())

        # Note that this encompasses both the policy and Q-value networks and
        # their corresponding target networks
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
-            tf.group(q_tp0, q_tp1), self.sess)
+            tf.group(q_t_det_policy, q_tp1), self.sess)

        # Hard initial update
        self.update_target(tau=1.0)

    @override(TFPolicyGraph)
    def optimizer(self):
-        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
+        """Return None; this policy graph does not use a single optimizer."""
+        # The actor and critic are trained by two separate optimisers
+        # (self._actor_optimizer and self._critic_optimizer), which are used
+        # directly in gradients() and build_apply_op().
+        return None
+
+    @override(TFPolicyGraph)
+    def build_apply_op(self, optimizer, grads_and_vars):
+        """Build the op that applies one actor/critic update step.
+
+        The `optimizer` and `grads_and_vars` arguments are not read here.
+        Instead, self._actor_grads_and_vars and self._critic_grads_and_vars
+        (stored by gradients()) are applied by their own optimizers.
+
+        Returns:
+            An op grouping the actor and critic updates, with a control
+            dependency on incrementing self.global_step by one.
+        """
+        # Delayed policy updates: the critic update is applied on every
+        # call, while the actor update is applied only when global_step is a
+        # multiple of `policy_delay`.
+        should_apply_actor_opt = tf.equal(
+            tf.mod(self.global_step, self.config["policy_delay"]), 0)
+
+        def make_apply_op():
+            return self._actor_optimizer.apply_gradients(
+                self._actor_grads_and_vars)
+
+        actor_op = tf.cond(
+            should_apply_actor_opt,
+            true_fn=make_apply_op,
+            false_fn=lambda: tf.no_op())
+        critic_op = self._critic_optimizer.apply_gradients(
+            self._critic_grads_and_vars)
+        # increment global step & apply ops
+        with tf.control_dependencies([tf.assign_add(self.global_step, 1)]):
+            return tf.group(actor_op, critic_op)

    @override(TFPolicyGraph)
    def gradients(self, optimizer, loss):
+        """Compute actor and critic gradients with their own optimizers.
+
+        The `optimizer` and `loss` arguments are not read by the new code:
+        the actor loss is differentiated w.r.t. self.policy_vars using
+        self._actor_optimizer, and the critic loss w.r.t. the Q-function
+        variables (plus the twin Q-function variables when "twin_q" is set)
+        using self._critic_optimizer. If "grad_norm_clipping" is not None,
+        _minimize_and_clip is used with that value as clip_val.
+
+        Returns:
+            The actor (gradient, variable) pairs followed by the critic
+            pairs, with pairs whose gradient is None removed.
+        """
        if self.config["grad_norm_clipping"] is not None:
            actor_grads_and_vars = _minimize_and_clip(
-                optimizer,
-                self.loss.actor_loss,
-                var_list=self.p_func_vars,
+                self._actor_optimizer,
+                self.actor_loss,
+                var_list=self.policy_vars,
                clip_val=self.config["grad_norm_clipping"])
            critic_grads_and_vars = _minimize_and_clip(
-                optimizer,
-                self.loss.critic_loss,
+                self._critic_optimizer,
+                self.critic_loss,
                var_list=self.q_func_vars + self.twin_q_func_vars
                if self.config["twin_q"] else self.q_func_vars,
                clip_val=self.config["grad_norm_clipping"])
        else:
-            actor_grads_and_vars = optimizer.compute_gradients(
-                self.loss.actor_loss, var_list=self.p_func_vars)
-            critic_grads_and_vars = optimizer.compute_gradients(
-                self.loss.critic_loss,
-                var_list=self.q_func_vars + self.twin_q_func_vars
-                if self.config["twin_q"] else self.q_func_vars)
-        actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
-                                if g is not None]
-        critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
-                                 if g is not None]
-        grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
+            actor_grads_and_vars = self._actor_optimizer.compute_gradients(
+                self.actor_loss, var_list=self.policy_vars)
+            if self.config["twin_q"]:
+                critic_vars = self.q_func_vars + self.twin_q_func_vars
+            else:
+                critic_vars = self.q_func_vars
+            critic_grads_and_vars = self._critic_optimizer.compute_gradients(
+                self.critic_loss, var_list=critic_vars)
+        # save these for later use in build_apply_op
+        self._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
+                                      if g is not None]
+        self._critic_grads_and_vars = [(g, v)
+                                       for (g, v) in critic_grads_and_vars
+                                       if g is not None]
+        grads_and_vars = self._actor_grads_and_vars \
+            + self._critic_grads_and_vars
        return grads_and_vars

    @override(TFPolicyGraph)
    def extra_compute_action_feed_dict(self):
+        """Return the extra placeholder feeds used when computing actions.
+
+        Feeds the stochastic flag (always True), the current noise scale
+        and the current pure-exploration flag.
+        """
        return {
+            # FIXME: what about turning off exploration? Isn't that a good
+            # idea?
            self.stochastic: True,
-            self.eps: self.cur_epsilon,
+            self.noise_scale: self.cur_noise_scale,
+            self.pure_exploration_phase: self.cur_pure_exploration_phase,
        }

    @override(TFPolicyGraph)
    def extra_compute_grad_fetches(self):
+        """Return extra fetches for gradient computation: TD error + stats."""
        return {
-            "td_error": self.loss.td_error,
+            "td_error": self.td_error,
            LEARNER_STATS_KEY: self.stats,
        }

@@ -499,59 +385,192 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):

    @override(PolicyGraph)
    def get_state(self):
+        """Return [TF policy state, noise scale, pure-exploration flag].
+
+        The order matches the indices read back by set_state().
+        """
-        return [TFPolicyGraph.get_state(self), self.cur_epsilon]
+        return [
+            TFPolicyGraph.get_state(self), self.cur_noise_scale,
+            self.cur_pure_exploration_phase
+        ]

    @override(PolicyGraph)
    def set_state(self, state):
+        """Restore a state produced by get_state().
+
+        Expects a 3-element sequence: state[0] is passed to
+        TFPolicyGraph.set_state, state[1] is the noise scale and state[2]
+        the pure-exploration flag.
+        """
        TFPolicyGraph.set_state(self, state[0])
        self.set_epsilon(state[1])
+        self.set_pure_exploration_phase(state[2])

    def _build_q_network(self, obs, obs_space, action_space, actions):
+        """Build a critic network mapping (obs, actions) to a single Q-value.
+
+        When "use_state_preprocessor" is set, the observation is first passed
+        through a ModelCatalog model and that model's last layer is
+        concatenated with the actions; otherwise the raw observation is
+        concatenated with the actions directly. The result goes through the
+        "critic_hiddens" fully connected layers and a linear 1-unit output.
+
+        Returns:
+            (q_values, q_model), where q_model is None when no state
+            preprocessor is used.
+        """
-        q_net = QNetwork(
-            ModelCatalog.get_model({
+        if self.config["use_state_preprocessor"]:
+            q_model = ModelCatalog.get_model({
                "obs": obs,
                "is_training": self._get_is_training_placeholder(),
-            }, obs_space, action_space, 1, self.config["model"]), actions,
-            self.config["critic_hiddens"],
-            self.config["critic_hidden_activation"])
-        return q_net.value, q_net.model
+            }, obs_space, action_space, 1, self.config["model"])
+            q_out = tf.concat([q_model.last_layer, actions], axis=1)
+        else:
+            q_model = None
+            q_out = tf.concat([obs, actions], axis=1)

-    def _build_p_network(self, obs, obs_space, action_space):
-        policy_net = PolicyNetwork(
-            ModelCatalog.get_model({
+        # The activation is looked up by name in tf.nn (e.g. "relu").
+        activation = getattr(tf.nn, self.config["critic_hidden_activation"])
+        for hidden in self.config["critic_hiddens"]:
+            q_out = layers.fully_connected(
+                q_out, num_outputs=hidden, activation_fn=activation)
+        q_values = layers.fully_connected(
+            q_out, num_outputs=1, activation_fn=None)
+
+        return q_values, q_model
+
+    def _build_policy_network(self, obs, obs_space, action_space):
+        """Build the deterministic policy network.
+
+        When "use_state_preprocessor" is set, the observation is first passed
+        through a ModelCatalog model and its last layer is used as input;
+        otherwise the raw observation is used. The input goes through the
+        "actor_hiddens" fully connected layers (with layer normalisation when
+        "parameter_noise" is set) and a linear layer of self.dim_actions
+        units, and is then squashed and rescaled to the bounds of
+        `action_space`.
+
+        Returns:
+            (actions, model), where model is None when no state preprocessor
+            is used.
+        """
+        if self.config["use_state_preprocessor"]:
+            model = ModelCatalog.get_model({
                "obs": obs,
                "is_training": self._get_is_training_placeholder(),
-            }, obs_space, action_space, 1, self.config["model"]),
-            self.dim_actions, self.config["actor_hiddens"],
-            self.config["actor_hidden_activation"],
-            self.config["parameter_noise"])
-        return policy_net.action_scores, policy_net.model
+            }, obs_space, action_space, 1, self.config["model"])
+            action_out = model.last_layer
+        else:
+            model = None
+            action_out = obs

-    def _build_action_network(self, p_values, stochastic, eps,
-                              is_target=False):
-        return ActionNetwork(
-            p_values, self.low_action, self.high_action, stochastic, eps,
-            self.config["exploration_theta"], self.config["exploration_sigma"],
-            self.config["smooth_target_policy"], self.config["act_noise"],
-            is_target, self.config["target_noise"],
-            self.config["noise_clip"]).actions
+        # The activation is looked up by name in tf.nn (e.g. "relu").
+        activation = getattr(tf.nn, self.config["actor_hidden_activation"])
+        normalizer_fn = layers.layer_norm if self.config["parameter_noise"] \
+            else None
+        for hidden in self.config["actor_hiddens"]:
+            action_out = layers.fully_connected(
+                action_out,
+                num_outputs=hidden,
+                activation_fn=activation,
+                normalizer_fn=normalizer_fn)
+        action_out = layers.fully_connected(
+            action_out, num_outputs=self.dim_actions, activation_fn=None)
+
+        # Use sigmoid to scale to [0,1], but also double magnitude of input to
+        # emulate behaviour of tanh activation used in DDPG and TD3 papers.
+        sigmoid_out = tf.nn.sigmoid(2 * action_out)
+        # Rescale to actual env policy scale
+        # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
+        # get same dims)
+        action_range = (action_space.high - action_space.low)[None]
+        low_action = action_space.low[None]
+        actions = action_range * sigmoid_out + low_action
+
+        return actions, model
+
+    def _add_exploration_noise(self, deterministic_actions,
+                               should_be_stochastic, noise_scale,
+                               enable_pure_exploration, action_space):
+        """Wrap deterministic actions with the configured exploration noise.
+
+        Args:
+            deterministic_actions: policy output; returned unchanged when
+                exploration is disabled.
+            should_be_stochastic: boolean tensor; noise is only added when
+                this is true and the "parameter_noise" config is off.
+            noise_scale: scalar multiplier applied to the sampled noise. A
+                value of zero also disables the pure-exploration branch.
+            enable_pure_exploration: boolean tensor; when true (and
+                noise_scale > 0) actions are drawn uniformly at random over
+                the action range instead of perturbing the policy output.
+            action_space: space providing the `low` / `high` action bounds.
+
+        Returns:
+            A tensor of actions with the same shape as
+            `deterministic_actions`.
+
+        Raises:
+            ValueError: if "exploration_noise_type" is neither "gaussian"
+                nor "ou".
+        """
+        noise_type = self.config["exploration_noise_type"]
+        action_low = action_space.low
+        action_high = action_space.high
+        action_range = action_space.high - action_low
+
+        def compute_stochastic_actions():
+            def make_noisy_actions():
+                # shape of deterministic_actions is [None, dim_action]
+                if noise_type == "gaussian":
+                    # add IID Gaussian noise for exploration, TD3-style
+                    normal_sample = noise_scale * tf.random_normal(
+                        tf.shape(deterministic_actions),
+                        stddev=self.config["exploration_gaussian_sigma"])
+                    stochastic_actions = tf.clip_by_value(
+                        deterministic_actions + normal_sample, action_low,
+                        action_high)
+                elif noise_type == "ou":
+                    # add OU noise for exploration, DDPG-style
+                    zero_acts = action_low.size * [.0]
+                    # Non-trainable variable holding the noise process state;
+                    # it is updated in place by the assign_add below.
+                    exploration_sample = tf.get_variable(
+                        name="ornstein_uhlenbeck",
+                        dtype=tf.float32,
+                        initializer=zero_acts,
+                        trainable=False)
+                    normal_sample = tf.random_normal(
+                        shape=[action_low.size], mean=0.0, stddev=1.0)
+                    ou_new = self.config["exploration_ou_theta"] \
+                        * -exploration_sample \
+                        + self.config["exploration_ou_sigma"] * normal_sample
+                    exploration_value = tf.assign_add(exploration_sample,
+                                                      ou_new)
+                    base_scale = self.config["exploration_ou_noise_scale"]
+                    noise = noise_scale * base_scale \
+                        * exploration_value * action_range
+                    stochastic_actions = tf.clip_by_value(
+                        deterministic_actions + noise, action_low, action_high)
+                else:
+                    raise ValueError(
+                        "Unknown noise type '%s' (try 'ou' or 'gaussian')" %
+                        noise_type)
+                return stochastic_actions
+
+            def make_uniform_random_actions():
+                # pure random exploration option
+                uniform_random_actions = tf.random.uniform(
+                    tf.shape(deterministic_actions))
+                # rescale uniform random actions according to action range
+                tf_range = tf.constant(action_range[None], dtype="float32")
+                tf_low = tf.constant(action_low[None], dtype="float32")
+                uniform_random_actions = uniform_random_actions * tf_range \
+                    + tf_low
+                return uniform_random_actions
+
+            stochastic_actions = tf.cond(
+                # need to condition on noise_scale > 0 because zeroing
+                # noise_scale is how evaluator signals no noise should be used
+                # (this is ugly and should be fixed by adding an "eval_mode"
+                # config flag or something)
+                tf.logical_and(enable_pure_exploration, noise_scale > 0),
+                true_fn=make_uniform_random_actions,
+                false_fn=make_noisy_actions)
+            return stochastic_actions
+
+        # Action-space noise is skipped entirely when parameter noise is on.
+        enable_stochastic = tf.logical_and(should_be_stochastic,
+                                           not self.config["parameter_noise"])
+        actions = tf.cond(enable_stochastic, compute_stochastic_actions,
+                          lambda: deterministic_actions)
+        return actions

    def _build_actor_critic_loss(self,
                                 q_t,
                                 q_tp1,
-                                 q_tp0,
+                                 q_t_det_policy,
                                 twin_q_t=None,
                                 twin_q_tp1=None):
-        return ActorCriticLoss(
-            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
-            self.done_mask, twin_q_t, twin_q_tp1,
-            self.config["actor_loss_coeff"], self.config["critic_loss_coeff"],
-            self.config["gamma"], self.config["n_step"],
-            self.config["use_huber"], self.config["huber_threshold"],
-            self.config["twin_q"])
+        twin_q = self.config["twin_q"]
+        gamma = self.config["gamma"]
+        n_step = self.config["n_step"]
+        use_huber = self.config["use_huber"]
+        huber_threshold = self.config["huber_threshold"]
+
+        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
+        if twin_q:
+            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
+            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
+
+        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
+        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
+
+        # compute RHS of bellman equation
+        q_t_selected_target = tf.stop_gradient(
+            self.rew_t + gamma**n_step * q_tp1_best_masked)
+
+        # compute the error (potentially clipped)
+        if twin_q:
+            td_error = q_t_selected - q_t_selected_target
+            twin_td_error = twin_q_t_selected - q_t_selected_target
+            td_error = td_error + twin_td_error
+            if use_huber:
+                errors = _huber_loss(td_error, huber_threshold) \
+                    + _huber_loss(twin_td_error, huber_threshold)
+            else:
+                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
+                    twin_td_error)
+        else:
+            td_error = q_t_selected - q_t_selected_target
+            if use_huber:
+                errors = _huber_loss(td_error, huber_threshold)
+            else:
+                errors = 0.5 * tf.square(td_error)
+
+        critic_loss = tf.reduce_mean(self.importance_weights * errors)
+        actor_loss = -tf.reduce_mean(q_t_det_policy)
+        return critic_loss, actor_loss, td_error

    def _build_parameter_noise(self, pnet_params):
-        self.parameter_noise_sigma_val = self.config["exploration_sigma"]
+        self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
        self.parameter_noise_sigma = tf.get_variable(
            initializer=tf.constant_initializer(
                self.parameter_noise_sigma_val),
@@ -590,7 +609,7 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        td_err = self.sess.run(
-            self.loss.td_error,
+            self.td_error,
            feed_dict={
                self.obs_t: [np.array(ob) for ob in obs_t],
                self.act_t: act_t,
@@ -610,9 +629,16 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):

    # support both hard and soft sync
    def update_target(self, tau=None):
+        """Run the target-network update ops with the given mixing factor.
+
+        Args:
+            tau: value fed to the self.tau placeholder. Any falsy value
+                (None or 0) falls back to self.tau_value.
+        """
+        tau = tau or self.tau_value
        return self.sess.run(
-            self.update_target_expr,
-            feed_dict={self.tau: tau or self.tau_value})
+            self.update_target_expr, feed_dict={self.tau: tau})

    def set_epsilon(self, epsilon):
-        self.cur_epsilon = epsilon
+        """Store `epsilon` as the current exploration noise scale."""
+        # set_epsilon is called by optimizer to anneal exploration as
+        # necessary, and to turn it off during evaluation. The "epsilon" part
+        # is a carry-over from DQN, which uses epsilon-greedy exploration
+        # rather than adding action noise to the output of a policy network.
+        self.cur_noise_scale = epsilon
+
+    def set_pure_exploration_phase(self, pure_exploration_phase):
+        """Store the flag fed to the pure_exploration_phase placeholder."""
+        self.cur_pure_exploration_phase = pure_exploration_phase
@@ -0,0 +1,57 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
+    DEFAULT_CONFIG as DDPG_CONFIG
+from ray.rllib.utils import merge_dicts
+
+# Default TD3 configuration: the DDPG defaults with the overrides below
+# merged on top via merge_dicts.
+TD3_DEFAULT_CONFIG = merge_dicts(
+    DDPG_CONFIG,
+    {
+        # largest changes: twin Q functions, delayed policy updates, and target
+        # smoothing
+        "twin_q": True,
+        "policy_delay": 2,
+        "smooth_target_policy": True,
+        "target_noise": 0.2,
+        "target_noise_clip": 0.5,
+
+        # other changes & things we want to keep fixed: IID Gaussian
+        # exploration noise, larger actor learning rate, no l2 regularisation,
+        # no Huber loss, etc.
+        "exploration_should_anneal": False,
+        "exploration_noise_type": "gaussian",
+        "exploration_gaussian_sigma": 0.1,
+        "learning_starts": 10000,
+        "pure_exploration_steps": 10000,
+        "actor_hiddens": [400, 300],
+        "critic_hiddens": [400, 300],
+        "n_step": 1,
+        "gamma": 0.99,
+        "actor_lr": 1e-3,
+        "critic_lr": 1e-3,
+        "l2_reg": 0.0,
+        "tau": 5e-3,
+        "train_batch_size": 100,
+        "use_huber": False,
+        "target_network_update_freq": 0,
+        "optimizer_class": "SyncReplayOptimizer",
+        "num_workers": 0,
+        "num_gpus_per_worker": 0,
+        "per_worker_exploration": False,
+        "worker_side_prioritization": False,
+        "buffer_size": 1000000,
+        "prioritized_replay": False,
+        "clip_rewards": False,
+        "use_state_preprocessor": False,
+    },
+)
+
+
+class TD3Trainer(DDPGTrainer):
+    """A more stable successor to DDPG. By default, this uses a near-identical
+    configuration to that reported in the TD3 paper."""
+
+    _name = "TD3"
+    _default_config = TD3_DEFAULT_CONFIG
@@ -34,6 +34,11 @@ def _import_apex_ddpg():
    return ddpg.ApexDDPGTrainer


+def _import_td3():
+    """Import the ddpg agents package on demand and return TD3Trainer."""
+    from ray.rllib.agents import ddpg
+    return ddpg.TD3Trainer
+
+
 def _import_ppo():
    from ray.rllib.agents import ppo
    return ppo.PPOTrainer
@@ -87,6 +92,7 @@ def _import_marwil():
 ALGORITHMS = {
    "DDPG": _import_ddpg,
    "APEX_DDPG": _import_apex_ddpg,
+    "TD3": _import_td3,
    "PPO": _import_ppo,
    "ES": _import_es,
    "ARS": _import_ars,
@@ -40,7 +40,8 @@ CONFIGS = {
        },
    },
    "DDPG": {
-        "noise_scale": 0.0,
+        "pure_exploration_steps": 0,
+        "exploration_ou_noise_scale": 0.0,
        "timesteps_per_iteration": 100
    },
    "PPO": {
@@ -116,8 +116,9 @@ class ModelSupportedSpaces(unittest.TestCase):
        check_support("APPO", {"num_gpus": 0, "vtrace": False}, stats)
        check_support(
            "DDPG", {
-                "noise_scale": 100.0,
-                "timesteps_per_iteration": 1
+                "exploration_ou_noise_scale": 100.0,
+                "timesteps_per_iteration": 1,
+                "use_state_preprocessor": True,
            },
            stats,
            check_bounds=True)
@@ -188,6 +189,7 @@ class ModelSupportedSpaces(unittest.TestCase):
                "min_iter_time_s": 1,
                "learning_starts": 1000,
                "target_network_update_freq": 100,
+                "use_state_preprocessor": True,
            })
        check_support_multiagent("IMPALA", {"num_gpus": 0})
        check_support_multiagent("DQN", {"timesteps_per_iteration": 1})
@@ -206,7 +208,10 @@ class ModelSupportedSpaces(unittest.TestCase):
                "sgd_minibatch_size": 1,
            })
        check_support_multiagent("PG", {"num_workers": 1, "optimizer": {}})
-        check_support_multiagent("DDPG", {"timesteps_per_iteration": 1})
+        check_support_multiagent("DDPG", {
+            "timesteps_per_iteration": 1,
+            "use_state_preprocessor": True,
+        })


 if __name__ == "__main__":
@@ -15,13 +15,14 @@ halfcheetah-ddpg:
        env_config: {}

        # === Exploration ===
+        exploration_should_anneal: True
        schedule_max_timesteps: 100000
        timesteps_per_iteration: 1000
        exploration_fraction: 0.1
-        exploration_final_eps: 0.02
-        noise_scale: 0.1
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
+        exploration_final_scale: 0.02
+        exploration_ou_noise_scale: 0.1
+        exploration_ou_theta: 0.15
+        exploration_ou_sigma: 0.2
        target_network_update_freq: 0
        tau: 0.001

@@ -34,9 +35,8 @@ halfcheetah-ddpg:
        clip_rewards: False

        # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
+        actor_lr: 0.001
+        critic_lr: 0.001
        use_huber: False
        huber_threshold: 1.0
        l2_reg: 0.000001
@@ -50,3 +50,7 @@ halfcheetah-ddpg:
        optimizer_class: "SyncReplayOptimizer"
        per_worker_exploration: False
        worker_side_prioritization: False
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
@@ -0,0 +1,22 @@
+invertedpendulum-td3:
+    # This is a TD3 with stopping conditions and network size tuned specifically
+    # for InvertedPendulum. Should be able to reach 1,000 reward (the maximum
+    # achievable) in 10,000 to 20,000 steps.
+    env: InvertedPendulum-v2
+    run: TD3
+    stop:
+        episode_reward_mean: 9999.9
+        time_total_s: 900 # 15 minutes
+        timesteps_total: 1000000
+    config:
+        # === Model ===
+        actor_hiddens: [32, 32]
+        critic_hiddens: [32, 32]
+
+        # === Exploration ===
+        learning_starts: 1000
+        pure_exploration_steps: 1000
+
+        # === Evaluation ===
+        evaluation_interval: 1
+        evaluation_num_episodes: 5
@@ -7,7 +7,9 @@ mountaincarcontinuous-apex-ddpg:
    config:
        clip_rewards: False
        num_workers: 16
-        noise_scale: 1.0
+        exploration_ou_noise_scale: 1.0
        n_step: 3
        target_network_update_freq: 50000
        tau: 1.0
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
@@ -15,13 +15,14 @@ mountaincarcontinuous-ddpg:
        env_config: {}

        # === Exploration ===
+        exploration_should_anneal: True
        schedule_max_timesteps: 100000
        timesteps_per_iteration: 1000
        exploration_fraction: 0.4
-        exploration_final_eps: 0.02
-        noise_scale: 0.75
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
+        exploration_final_scale: 0.02
+        exploration_ou_noise_scale: 0.75
+        exploration_ou_theta: 0.15
+        exploration_ou_sigma: 0.2
        target_network_update_freq: 0
        tau: 0.01

@@ -34,9 +35,8 @@ mountaincarcontinuous-ddpg:
        clip_rewards: False

        # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
+        actor_lr: 0.001
+        critic_lr: 0.001
        use_huber: False
        huber_threshold: 1.0
        l2_reg: 0.00001
@@ -50,3 +50,7 @@ mountaincarcontinuous-ddpg:
        optimizer_class: "SyncReplayOptimizer"
        per_worker_exploration: False
        worker_side_prioritization: False
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
@@ -0,0 +1,24 @@
+mujoco-td3:
+    # Solve latest versions of the four hardest Mujoco tasks benchmarked in the
+    # original TD3 paper. Average return over 10 trials at end of 1,000,000
+    # timesteps (taken from Table 2 of the paper) are given in parens at the end
+    # of each environment name.
+    #
+    # Paper is at https://arxiv.org/pdf/1802.09477.pdf
+    env:
+        grid_search:
+            - HalfCheetah-v2  # (9,532.99)
+            - Hopper-v2  # (3,304.75)
+            - Walker2d-v2  # (4,565.24)
+            - Ant-v2  # (4,185.06)
+    run: TD3
+    stop:
+        timesteps_total: 1000000
+    config:
+        # === Exploration ===
+        learning_starts: 10000
+        pure_exploration_steps: 10000
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
@@ -11,3 +11,5 @@ pendulum-apex-ddpg:
        n_step: 1
        target_network_update_freq: 50000
        tau: 1.0
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
@@ -15,13 +15,14 @@ pendulum-ddpg:
        env_config: {}

        # === Exploration ===
+        exploration_should_anneal: True
        schedule_max_timesteps: 100000
        timesteps_per_iteration: 600
        exploration_fraction: 0.1
-        exploration_final_eps: 0.02
-        noise_scale: 0.1
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
+        exploration_final_scale: 0.02
+        exploration_ou_noise_scale: 0.1
+        exploration_ou_theta: 0.15
+        exploration_ou_sigma: 0.2
        target_network_update_freq: 0
        tau: 0.001

@@ -34,9 +35,8 @@ pendulum-ddpg:
        clip_rewards: False

        # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
+        actor_lr: 0.001
+        critic_lr: 0.001
        use_huber: True
        huber_threshold: 1.0
        l2_reg: 0.000001
@@ -50,3 +50,7 @@ pendulum-ddpg:
        optimizer_class: "SyncReplayOptimizer"
        per_worker_exploration: False
        worker_side_prioritization: False
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
@@ -1,60 +1,19 @@
 # This configuration can expect to reach -160 reward in 10k-20k timesteps
 pendulum-ddpg:
    env: Pendulum-v0
-    run: DDPG
+    run: TD3
    stop:
-        episode_reward_mean: -160
-        time_total_s: 600 # 10 minutes
+        episode_reward_mean: -130
+        time_total_s: 900 # 15 minutes
    config:
-        # === Tricks ===
-        twin_q: True
-        policy_delay: 2
-        smooth_target_policy: True
-        act_noise: 0.1
-        target_noise: 0.2
-        noise_clip: 0.5
-
        # === Model ===
        actor_hiddens: [64, 64]
        critic_hiddens: [64, 64]
-        n_step: 1
-        model: {}
-        gamma: 0.99
-        env_config: {}

        # === Exploration ===
-        schedule_max_timesteps: 100000
-        timesteps_per_iteration: 600
-        exploration_fraction: 0.1
-        exploration_final_eps: 0.02
-        noise_scale: 0.1
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
-        target_network_update_freq: 0
-        tau: 0.001
+        learning_starts: 5000
+        pure_exploration_steps: 5000

-        # === Replay buffer ===
-        buffer_size: 10000
-        prioritized_replay: True
-        prioritized_replay_alpha: 0.6
-        prioritized_replay_beta: 0.4
-        prioritized_replay_eps: 0.000001
-        clip_rewards: False
-
-        # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
-        use_huber: True
-        huber_threshold: 1.0
-        l2_reg: 0.000001
-        learning_starts: 500
-        sample_batch_size: 1
-        train_batch_size: 64
-
-        # === Parallelism ===
-        num_workers: 0
-        num_gpus_per_worker: 0
-        optimizer_class: "SyncReplayOptimizer"
-        per_worker_exploration: False
-        worker_side_prioritization: False
+        # === Evaluation ===
+        evaluation_interval: 1
+        evaluation_num_episodes: 5