From 663e92ab3f010491ef7420480eb6be7f57a0485c Mon Sep 17 00:00:00 2001
From: Sam Toyer <qxcv@users.noreply.github.com>
Date: Fri, 26 Apr 2019 17:49:53 -0700
Subject: [PATCH] [rllib] TD3/DDPG improvements and MuJoCo benchmarks (#4694)

* [rllib] Separate optimisers for DDPG actor & crit.

* [rllib] Better names for DDPG variables & options

Config changes:

- noise_scale -> exploration_ou_noise_scale
- exploration_theta -> exploration_ou_theta
- exploration_sigma -> exploration_ou_sigma
- act_noise -> exploration_gaussian_sigma
- noise_clip -> target_noise_clip

* [rllib] Make DDPG less class-y

Used functions to replace three classes with only an __init__ method & a
handful of unrelated attributes.

* [rllib] Refactor DDPG noise

* [rllib] Unify DDPG exploration annealing

Added option "exploration_should_anneal" to enable linear annealing of
exploration noise. By default this is off, for consistency with DDPG &
TD3 papers. Also renamed "exploration_final_eps" to
"exploration_final_scale" (that name seems to have been carried over
from DQN, and doesn't really make sense here). Finally, tried to rename
"eps" to "noise_scale" wherever possible.
---
 doc/source/rllib-algorithms.rst               |   2 +-
 python/ray/rllib/agents/ddpg/__init__.py      |   3 +-
 python/ray/rllib/agents/ddpg/ddpg.py          | 126 ++--
 .../rllib/agents/ddpg/ddpg_policy_graph.py    | 584 +++++++++---------
 python/ray/rllib/agents/ddpg/td3.py           |  57 ++
 python/ray/rllib/agents/registry.py           |   6 +
 .../rllib/tests/test_checkpoint_restore.py    |   3 +-
 .../ray/rllib/tests/test_supported_spaces.py  |  11 +-
 .../tuned_examples/halfcheetah-ddpg.yaml      |  18 +-
 .../tuned_examples/invertedpendulum-td3.yaml  |  22 +
 .../mountaincarcontinuous-apex-ddpg.yaml      |   4 +-
 .../mountaincarcontinuous-ddpg.yaml           |  18 +-
 .../ray/rllib/tuned_examples/mujoco-td3.yaml  |  24 +
 .../tuned_examples/pendulum-apex-ddpg.yaml    |   2 +
 .../rllib/tuned_examples/pendulum-ddpg.yaml   |  18 +-
 .../rllib/tuned_examples/pendulum-td3.yaml    |  57 +-
 16 files changed, 557 insertions(+), 398 deletions(-)
 create mode 100644 python/ray/rllib/agents/ddpg/td3.py
 create mode 100644 python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml
 create mode 100644 python/ray/rllib/tuned_examples/mujoco-td3.yaml

diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst
index fd07bdc1b..9ee610847 100644
--- a/doc/source/rllib-algorithms.rst
+++ b/doc/source/rllib-algorithms.rst
@@ -142,7 +142,7 @@ Deep Deterministic Policy Gradients (DDPG, TD3)
 `[paper] <https://arxiv.org/abs/1509.02971>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ddpg/ddpg.py>`__
 DDPG is implemented similarly to DQN (below). The algorithm can be scaled by increasing the number of workers, switching to AsyncGradientsOptimizer, or using Ape-X. The improvements from `TD3 <https://spinningup.openai.com/en/latest/algorithms/td3.html>`__ are available though not enabled by default.
 
-Tuned examples: `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml>`__, `TD3 configuration <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-td3.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml>`__
+Tuned examples: `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml>`__, `TD3 Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-td3.yaml>`__, `TD3 InvertedPendulum-v2 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml>`__, `TD3 Mujoco suite (Ant-v2, HalfCheetah-v2, Hopper-v2, Walker2d-v2) <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/mujoco-td3.yaml>`__.
 
 **DDPG-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
 
diff --git a/python/ray/rllib/agents/ddpg/__init__.py b/python/ray/rllib/agents/ddpg/__init__.py
index 5d2099187..9b90ca842 100644
--- a/python/ray/rllib/agents/ddpg/__init__.py
+++ b/python/ray/rllib/agents/ddpg/__init__.py
@@ -4,6 +4,7 @@ from __future__ import print_function
 
 from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
 from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
+from ray.rllib.agents.ddpg.td3 import TD3Trainer
 from ray.rllib.utils import renamed_class
 
 ApexDDPGAgent = renamed_class(ApexDDPGTrainer)
@@ -11,5 +12,5 @@ DDPGAgent = renamed_class(DDPGTrainer)
 
 __all__ = [
     "DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
-    "DEFAULT_CONFIG"
+    "TD3Trainer", "DEFAULT_CONFIG"
 ]
diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py
index e5031ca11..7a140beee 100644
--- a/python/ray/rllib/agents/ddpg/ddpg.py
+++ b/python/ray/rllib/agents/ddpg/ddpg.py
@@ -13,19 +13,21 @@ from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
 DEFAULT_CONFIG = with_common_config({
     # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
     # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
+    # In addition to settings below, you can use "exploration_noise_type" and
+    # "exploration_gaussian_sigma" to get IID Gaussian exploration noise
+    # instead of OU exploration noise.
     # twin Q-net
     "twin_q": False,
     # delayed policy update
     "policy_delay": 1,
     # target policy smoothing
-    # this also forces the use of gaussian instead of OU noise for exploration
+    # (this also replaces OU exploration noise with IID Gaussian exploration
+    # noise, for now)
     "smooth_target_policy": False,
-    # gaussian stddev of act noise
-    "act_noise": 0.1,
-    # gaussian stddev of target noise
+    # gaussian stddev of target action noise for smoothing
     "target_noise": 0.2,
     # target noise limit (bound)
-    "noise_clip": 0.5,
+    "target_noise_clip": 0.5,
 
     # === Evaluation ===
     # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
@@ -37,42 +39,64 @@ DEFAULT_CONFIG = with_common_config({
     "evaluation_num_episodes": 10,
 
     # === Model ===
-    # Postprocess the policy network model output with these hidden layers
-    "actor_hiddens": [64, 64],
-    # Hidden layers activation of the policy network
+    # Apply a state preprocessor with spec given by the "model" config option
+    # (like other RL algorithms). This is mostly useful if you have a weird
+    # observation shape, like an image. Disabled by default.
+    "use_state_preprocessor": False,
+    # Postprocess the policy network model output with these hidden layers. If
+    # use_state_preprocessor is False, then these will be the *only* hidden
+    # layers in the network.
+    "actor_hiddens": [400, 300],
+    # Hidden layers activation of the postprocessing stage of the policy
+    # network
     "actor_hidden_activation": "relu",
-    # Postprocess the critic network model output with these hidden layers
-    "critic_hiddens": [64, 64],
-    # Hidden layers activation of the critic network
+    # Postprocess the critic network model output with these hidden layers;
+    # again, if use_state_preprocessor is True, then the state will be
+    # preprocessed by the model specified with the "model" config option first.
+    "critic_hiddens": [400, 300],
+    # Hidden layers activation of the postprocessing stage of the critic.
     "critic_hidden_activation": "relu",
     # N-step Q learning
     "n_step": 1,
 
     # === Exploration ===
-    # Max num timesteps for annealing schedules. Exploration is annealed from
-    # 1.0 to exploration_fraction over this number of timesteps scaled by
-    # exploration_fraction
+    # Turns on annealing schedule for exploration noise. Exploration is
+    # annealed from 1.0 to exploration_final_scale over schedule_max_timesteps
+    # scaled by exploration_fraction. Original DDPG and TD3 papers do not
+    # anneal noise, so this is False by default.
+    "exploration_should_anneal": False,
+    # Max num timesteps for annealing schedules.
     "schedule_max_timesteps": 100000,
     # Number of env steps to optimize for before returning
     "timesteps_per_iteration": 1000,
     # Fraction of entire training period over which the exploration rate is
     # annealed
     "exploration_fraction": 0.1,
-    # Final value of random action probability
-    "exploration_final_eps": 0.02,
-    # OU-noise scale
-    "noise_scale": 0.1,
-    # theta
-    "exploration_theta": 0.15,
-    # sigma
-    "exploration_sigma": 0.2,
-    # Update the target network every `target_network_update_freq` steps.
-    "target_network_update_freq": 0,
-    # Update the target by \tau * policy + (1-\tau) * target_policy
-    "tau": 0.002,
+    # Final scaling multiplier for action noise (initial is 1.0)
+    "exploration_final_scale": 0.02,
+    # valid values: "ou" (time-correlated, like original DDPG paper),
+    # "gaussian" (IID, like TD3 paper)
+    "exploration_noise_type": "ou",
+    # OU-noise scale; this can be used to scale down magnitude of OU noise
+    # before adding to actions (requires "exploration_noise_type" to be "ou")
+    "exploration_ou_noise_scale": 0.1,
+    # theta for OU
+    "exploration_ou_theta": 0.15,
+    # sigma for OU
+    "exploration_ou_sigma": 0.2,
+    # gaussian stddev of act noise for exploration (requires
+    # "exploration_noise_type" to be "gaussian")
+    "exploration_gaussian_sigma": 0.1,
     # If True parameter space noise will be used for exploration
     # See https://blog.openai.com/better-exploration-with-parameter-noise/
     "parameter_noise": False,
+    # Until this many timesteps have elapsed, the agent's policy will be
+    # ignored & it will instead take uniform random actions. Can be used in
+    # conjunction with learning_starts (which controls when the first
+    # optimization step happens) to decrease dependence of exploration &
+    # optimization on initial policy parameters. Note that this will be
+    # disabled when the action noise scale is set to 0 (e.g during evaluation).
+    "pure_exploration_steps": 1000,
 
     # === Replay buffer ===
     # Size of the replay buffer. Note that if async_updates is set, then
@@ -90,11 +114,14 @@ DEFAULT_CONFIG = with_common_config({
     "compress_observations": False,
 
     # === Optimization ===
-    # Learning rate for adam optimizer.
-    # Instead of using two optimizers, we use two different loss coefficients
-    "lr": 1e-3,
-    "actor_loss_coeff": 0.1,
-    "critic_loss_coeff": 1.0,
+    # Learning rate for the critic (Q-function) optimizer.
+    "critic_lr": 1e-3,
+    # Learning rate for the actor (policy) optimizer.
+    "actor_lr": 1e-3,
+    # Update the target network every `target_network_update_freq` steps.
+    "target_network_update_freq": 0,
+    # Update the target by \tau * policy + (1-\tau) * target_policy
+    "tau": 0.002,
     # If True, use huber loss instead of squared loss for critic network
     # Conventionally, no need to clip gradients if using a huber loss
     "use_huber": False,
@@ -117,7 +144,7 @@ DEFAULT_CONFIG = with_common_config({
     # === Parallelism ===
     # Number of workers for collecting samples with. This only makes sense
     # to increase if your environment is particularly slow to sample, or if
-    # you"re using the Async or Ape-X optimizers.
+    # you're using the Async or Ape-X optimizers.
     "num_workers": 0,
     # Optimizer class to use.
     "optimizer_class": "SyncReplayOptimizer",
@@ -138,26 +165,41 @@ class DDPGTrainer(DQNTrainer):
     _default_config = DEFAULT_CONFIG
     _policy_graph = DDPGPolicyGraph
 
+    @override(DQNTrainer)
+    def _train(self):
+        pure_expl_steps = self.config["pure_exploration_steps"]
+        if pure_expl_steps:
+            # tell workers whether they should do pure exploration
+            only_explore = self.global_timestep < pure_expl_steps
+            self.local_evaluator.foreach_trainable_policy(
+                lambda p, _: p.set_pure_exploration_phase(only_explore))
+            for e in self.remote_evaluators:
+                e.foreach_trainable_policy.remote(
+                    lambda p, _: p.set_pure_exploration_phase(only_explore))
+        return super(DDPGTrainer, self)._train()
+
     @override(DQNTrainer)
     def _make_exploration_schedule(self, worker_index):
-        # Override DQN's schedule to take into account `noise_scale`
+        # Override DQN's schedule: values are unitless multipliers for the
+        # exploration noise (1.0 = full noise, 0.0 = no noise)
         if self.config["per_worker_exploration"]:
             assert self.config["num_workers"] > 1, \
                 "This requires multiple workers"
             if worker_index >= 0:
-                exponent = (
-                    1 +
-                    worker_index / float(self.config["num_workers"] - 1) * 7)
-                return ConstantSchedule(
-                    self.config["noise_scale"] * 0.4**exponent)
+                # FIXME: what do magic constants mean? (0.4, 7)
+                max_index = float(self.config["num_workers"] - 1)
+                exponent = 1 + worker_index / max_index * 7
+                return ConstantSchedule(0.4**exponent)
             else:
                 # local ev should have zero exploration so that eval rollouts
                 # run properly
                 return ConstantSchedule(0.0)
-        else:
+        elif self.config["exploration_should_anneal"]:
             return LinearSchedule(
                 schedule_timesteps=int(self.config["exploration_fraction"] *
                                        self.config["schedule_max_timesteps"]),
-                initial_p=self.config["noise_scale"] * 1.0,
-                final_p=self.config["noise_scale"] *
-                self.config["exploration_final_eps"])
+                initial_p=1.0,
+                final_p=self.config["exploration_final_scale"])
+        else:
+            # *always* add exploration noise
+            return ConstantSchedule(1.0)
diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
index ad4f879e7..9304cbe0b 100644
--- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
+++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -19,80 +19,18 @@ from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.evaluation.policy_graph import PolicyGraph
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
 
-ACTION_SCOPE = "a_func"
-POLICY_SCOPE = "p_func"
-POLICY_TARGET_SCOPE = "target_p_func"
-Q_SCOPE = "q_func"
-Q_TARGET_SCOPE = "target_q_func"
-TWIN_Q_SCOPE = "twin_q_func"
-TWIN_Q_TARGET_SCOPE = "twin_target_q_func"
+ACTION_SCOPE = "action"
+POLICY_SCOPE = "policy"
+POLICY_TARGET_SCOPE = "target_policy"
+Q_SCOPE = "critic"
+Q_TARGET_SCOPE = "target_critic"
+TWIN_Q_SCOPE = "twin_critic"
+TWIN_Q_TARGET_SCOPE = "twin_target_critic"
 
 # Importance sampling weights for prioritized replay
 PRIO_WEIGHTS = "weights"
 
 
-class ActorCriticLoss(object):
-    def __init__(self,
-                 q_t,
-                 q_tp1,
-                 q_tp0,
-                 importance_weights,
-                 rewards,
-                 done_mask,
-                 twin_q_t,
-                 twin_q_tp1,
-                 actor_loss_coeff=0.1,
-                 critic_loss_coeff=1.0,
-                 gamma=0.99,
-                 n_step=1,
-                 use_huber=False,
-                 huber_threshold=1.0,
-                 twin_q=False,
-                 policy_delay=1):
-
-        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
-        if twin_q:
-            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
-            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
-
-        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
-        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
-
-        # compute RHS of bellman equation
-        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
-
-        # compute the error (potentially clipped)
-        if twin_q:
-            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
-            twin_td_error = twin_q_t_selected - tf.stop_gradient(
-                q_t_selected_target)
-            self.td_error = td_error + twin_td_error
-            if use_huber:
-                errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
-                    twin_td_error, huber_threshold)
-            else:
-                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
-                    twin_td_error)
-        else:
-            self.td_error = (
-                q_t_selected - tf.stop_gradient(q_t_selected_target))
-            if use_huber:
-                errors = _huber_loss(self.td_error, huber_threshold)
-            else:
-                errors = 0.5 * tf.square(self.td_error)
-
-        self.critic_loss = critic_loss_coeff * tf.reduce_mean(
-            importance_weights * errors)
-
-        # for policy gradient, update policy net one time v.s.
-        # update critic net `policy_delay` time(s)
-        global_step = tf.train.get_or_create_global_step()
-        policy_delay_mask = tf.to_float(
-            tf.equal(tf.mod(global_step, policy_delay), 0))
-        self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
-                           tf.reduce_mean(q_tp0))
-
-
 class DDPGPostprocessing(object):
     """Implements n-step learning and param noise adjustments."""
 
@@ -113,12 +51,13 @@ class DDPGPostprocessing(object):
                 feed_dict={
                     self.cur_observations: states,
                     self.stochastic: False,
-                    self.eps: .0
+                    self.noise_scale: .0,
+                    self.pure_exploration_phase: False,
                 })
             distance_in_action_space = np.sqrt(
                 np.mean(np.square(clean_actions - noisy_actions)))
             self.pi_distance = distance_in_action_space
-            if distance_in_action_space < self.config["exploration_sigma"]:
+            if distance_in_action_space < self.config["exploration_ou_sigma"]:
                 self.parameter_noise_sigma_val *= 1.01
             else:
                 self.parameter_noise_sigma_val /= 1.01
@@ -128,107 +67,6 @@ class DDPGPostprocessing(object):
         return _postprocess_dqn(self, sample_batch)
 
 
-class PolicyNetwork(object):
-    """Maps an observations (i.e., state) to an action where each entry takes
-    value from (0, 1) due to the sigmoid function."""
-
-    def __init__(self,
-                 model,
-                 dim_actions,
-                 hiddens=[64, 64],
-                 activation="relu",
-                 parameter_noise=False):
-        action_out = model.last_layer
-        activation = tf.nn.__dict__[activation]
-        for hidden in hiddens:
-            action_out = layers.fully_connected(
-                action_out,
-                num_outputs=hidden,
-                activation_fn=activation,
-                normalizer_fn=layers.layer_norm if parameter_noise else None)
-        # Use sigmoid layer to bound values within (0, 1)
-        # shape of action_scores is [batch_size, dim_actions]
-        self.action_scores = layers.fully_connected(
-            action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
-        self.model = model
-
-
-class ActionNetwork(object):
-    """Acts as a stochastic policy for inference, but a deterministic policy
-    for training, thus ignoring the batch_size issue when constructing a
-    stochastic action."""
-
-    def __init__(self,
-                 p_values,
-                 low_action,
-                 high_action,
-                 stochastic,
-                 eps,
-                 theta=0.15,
-                 sigma=0.2,
-                 use_gaussian_noise=False,
-                 act_noise=0.1,
-                 is_target=False,
-                 target_noise=0.2,
-                 noise_clip=0.5,
-                 parameter_noise=False):
-
-        # shape is [None, dim_action]
-        deterministic_actions = (
-            (high_action - low_action) * p_values + low_action)
-
-        if use_gaussian_noise:
-            if is_target:
-                normal_sample = tf.random_normal(
-                    tf.shape(deterministic_actions), stddev=target_noise)
-                normal_sample = tf.clip_by_value(normal_sample, -noise_clip,
-                                                 noise_clip)
-                stochastic_actions = tf.clip_by_value(
-                    deterministic_actions + normal_sample, low_action,
-                    high_action)
-            else:
-                normal_sample = tf.random_normal(
-                    tf.shape(deterministic_actions), stddev=act_noise)
-                stochastic_actions = tf.clip_by_value(
-                    deterministic_actions + normal_sample, low_action,
-                    high_action)
-        else:
-            exploration_sample = tf.get_variable(
-                name="ornstein_uhlenbeck",
-                dtype=tf.float32,
-                initializer=low_action.size * [.0],
-                trainable=False)
-            normal_sample = tf.random_normal(
-                shape=[low_action.size], mean=0.0, stddev=1.0)
-            exploration_value = tf.assign_add(
-                exploration_sample,
-                theta * (.0 - exploration_sample) + sigma * normal_sample)
-            stochastic_actions = tf.clip_by_value(
-                deterministic_actions +
-                eps * (high_action - low_action) * exploration_value,
-                low_action, high_action)
-
-        self.actions = tf.cond(
-            tf.logical_and(stochastic, not parameter_noise),
-            lambda: stochastic_actions, lambda: deterministic_actions)
-
-
-class QNetwork(object):
-    def __init__(self,
-                 model,
-                 action_inputs,
-                 hiddens=[64, 64],
-                 activation="relu"):
-        q_out = tf.concat([model.last_layer, action_inputs], axis=1)
-        activation = tf.nn.__dict__[activation]
-        for hidden in hiddens:
-            q_out = layers.fully_connected(
-                q_out, num_outputs=hidden, activation_fn=activation)
-        self.value = layers.fully_connected(
-            q_out, num_outputs=1, activation_fn=None)
-        self.model = model
-
-
 class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
     def __init__(self, observation_space, action_space, config):
         config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
@@ -238,7 +76,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
                     action_space))
 
         self.config = config
-        self.cur_epsilon = 1.0
+        self.cur_noise_scale = 1.0
+        self.cur_pure_exploration_phase = False
         self.dim_actions = action_space.shape[0]
         self.low_action = action_space.low
         self.high_action = action_space.high
@@ -246,30 +85,38 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
         # create global step for counting the number of update operations
         self.global_step = tf.train.get_or_create_global_step()
 
+        # use separate optimizers for actor & critic
+        self._actor_optimizer = tf.train.AdamOptimizer(
+            learning_rate=self.config["actor_lr"])
+        self._critic_optimizer = tf.train.AdamOptimizer(
+            learning_rate=self.config["critic_lr"])
+
         # Action inputs
         self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
-        self.eps = tf.placeholder(tf.float32, (), name="eps")
+        self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
+        self.pure_exploration_phase = tf.placeholder(
+            tf.bool, (), name="pure_exploration_phase")
         self.cur_observations = tf.placeholder(
             tf.float32,
             shape=(None, ) + observation_space.shape,
             name="cur_obs")
 
-        # Actor: P (policy) network
         with tf.variable_scope(POLICY_SCOPE) as scope:
-            p_values, self.p_model = self._build_p_network(
+            policy_out, self.policy_model = self._build_policy_network(
                 self.cur_observations, observation_space, action_space)
-            self.p_func_vars = _scope_vars(scope.name)
+            self.policy_vars = _scope_vars(scope.name)
 
         # Noise vars for P network except for layer normalization vars
         if self.config["parameter_noise"]:
             self._build_parameter_noise([
-                var for var in self.p_func_vars if "LayerNorm" not in var.name
+                var for var in self.policy_vars if "LayerNorm" not in var.name
             ])
 
         # Action outputs
         with tf.variable_scope(ACTION_SCOPE):
-            self.output_actions = self._build_action_network(
-                p_values, self.stochastic, self.eps)
+            self.output_actions = self._add_exploration_noise(
+                policy_out, self.stochastic, self.noise_scale,
+                self.pure_exploration_phase, action_space)
 
         if self.config["smooth_target_policy"]:
             self.reset_noise_op = tf.no_op()
@@ -293,37 +140,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
         self.importance_weights = tf.placeholder(
             tf.float32, [None], name="weight")
 
-        # p network evaluation
+        # policy network evaluation
         with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
             prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
-            self.p_t, _ = self._build_p_network(self.obs_t, observation_space,
-                                                action_space)
-            p_batchnorm_update_ops = list(
+            self.policy_t, _ = self._build_policy_network(
+                self.obs_t, observation_space, action_space)
+            policy_batchnorm_update_ops = list(
                 set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
                 prev_update_ops)
 
-        # target p network evaluation
+        # target policy network evaluation
         with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
-            p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space,
-                                             action_space)
-            target_p_func_vars = _scope_vars(scope.name)
+            policy_tp1, _ = self._build_policy_network(
+                self.obs_tp1, observation_space, action_space)
+            target_policy_vars = _scope_vars(scope.name)
 
         # Action outputs
         with tf.variable_scope(ACTION_SCOPE, reuse=True):
-            output_actions = self._build_action_network(
-                self.p_t,
-                stochastic=tf.constant(value=False, dtype=tf.bool),
-                eps=.0)
-            output_actions_estimated = self._build_action_network(
-                p_tp1,
-                stochastic=tf.constant(
-                    value=self.config["smooth_target_policy"], dtype=tf.bool),
-                eps=.0,
-                is_target=True)
+            if config["smooth_target_policy"]:
+                target_noise_clip = self.config["target_noise_clip"]
+                clipped_normal_sample = tf.clip_by_value(
+                    tf.random_normal(
+                        tf.shape(policy_tp1),
+                        stddev=self.config["target_noise"]),
+                    -target_noise_clip, target_noise_clip)
+                policy_tp1_smoothed = tf.clip_by_value(
+                    policy_tp1 + clipped_normal_sample, action_space.low,
+                    action_space.high)
+            else:
+                # no smoothing, just use deterministic actions
+                policy_tp1_smoothed = policy_tp1
 
         # q network evaluation
         prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
         with tf.variable_scope(Q_SCOPE) as scope:
+            # Q-values for given actions & observations in given current state
             q_t, self.q_model = self._build_q_network(
                 self.obs_t, observation_space, action_space, self.act_t)
             self.q_func_vars = _scope_vars(scope.name)
@@ -333,8 +184,9 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
             "min_q": tf.reduce_min(q_t),
         }
         with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_tp0, _ = self._build_q_network(self.obs_t, observation_space,
-                                             action_space, output_actions)
+            # Q-values for current policy (no noise) in given current state
+            q_t_det_policy, _ = self._build_q_network(
+                self.obs_t, observation_space, action_space, self.policy_t)
         if self.config["twin_q"]:
             with tf.variable_scope(TWIN_Q_SCOPE) as scope:
                 twin_q_t, self.twin_q_model = self._build_q_network(
@@ -343,38 +195,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
         q_batchnorm_update_ops = list(
             set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
 
-        # target q network evalution
+        # target q network evaluation
         with tf.variable_scope(Q_TARGET_SCOPE) as scope:
             q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space,
-                                             action_space,
-                                             output_actions_estimated)
+                                             action_space, policy_tp1_smoothed)
             target_q_func_vars = _scope_vars(scope.name)
         if self.config["twin_q"]:
             with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
                 twin_q_tp1, _ = self._build_q_network(
                     self.obs_tp1, observation_space, action_space,
-                    output_actions_estimated)
+                    policy_tp1_smoothed)
                 twin_target_q_func_vars = _scope_vars(scope.name)
 
         if self.config["twin_q"]:
-            self.loss = self._build_actor_critic_loss(
-                q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1)
+            self.critic_loss, self.actor_loss, self.td_error \
+                = self._build_actor_critic_loss(
+                    q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t,
+                    twin_q_tp1=twin_q_tp1)
         else:
-            self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
+            self.critic_loss, self.actor_loss, self.td_error \
+                = self._build_actor_critic_loss(
+                    q_t, q_tp1, q_t_det_policy)
 
         if config["l2_reg"] is not None:
-            for var in self.p_func_vars:
+            for var in self.policy_vars:
                 if "bias" not in var.name:
-                    self.loss.actor_loss += (
+                    self.actor_loss += (
                         config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
             for var in self.q_func_vars:
                 if "bias" not in var.name:
-                    self.loss.critic_loss += (
+                    self.critic_loss += (
                         config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
             if self.config["twin_q"]:
                 for var in self.twin_q_func_vars:
                     if "bias" not in var.name:
-                        self.loss.critic_loss += (
+                        self.critic_loss += (
                             config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
 
         # update_target_fn will be called periodically to copy Q network to
@@ -396,8 +251,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
                     var_target.assign(self.tau * var +
                                       (1.0 - self.tau) * var_target))
         for var, var_target in zip(
-                sorted(self.p_func_vars, key=lambda v: v.name),
-                sorted(target_p_func_vars, key=lambda v: v.name)):
+                sorted(self.policy_vars, key=lambda v: v.name),
+                sorted(target_policy_vars, key=lambda v: v.name)):
             update_target_expr.append(
                 var_target.assign(self.tau * var +
                                   (1.0 - self.tau) * var_target))
@@ -414,14 +269,15 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
         ]
         input_dict = dict(self.loss_inputs)
 
-        # Model self-supervised losses
-        self.loss.actor_loss = self.p_model.custom_loss(
-            self.loss.actor_loss, input_dict)
-        self.loss.critic_loss = self.q_model.custom_loss(
-            self.loss.critic_loss, input_dict)
-        if self.config["twin_q"]:
-            self.loss.critic_loss = self.twin_q_model.custom_loss(
-                self.loss.critic_loss, input_dict)
+        if self.config["use_state_preprocessor"]:
+            # Model self-supervised losses
+            self.actor_loss = self.policy_model.custom_loss(
+                self.actor_loss, input_dict)
+            self.critic_loss = self.q_model.custom_loss(
+                self.critic_loss, input_dict)
+            if self.config["twin_q"]:
+                self.critic_loss = self.twin_q_model.custom_loss(
+                    self.critic_loss, input_dict)
 
         TFPolicyGraph.__init__(
             self,
@@ -430,62 +286,92 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
             self.sess,
             obs_input=self.cur_observations,
             action_sampler=self.output_actions,
-            loss=self.loss.actor_loss + self.loss.critic_loss,
+            loss=self.actor_loss + self.critic_loss,
             loss_inputs=self.loss_inputs,
-            update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops)
+            update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
         self.sess.run(tf.global_variables_initializer())
 
         # Note that this encompasses both the policy and Q-value networks and
         # their corresponding target networks
         self.variables = ray.experimental.tf_utils.TensorFlowVariables(
-            tf.group(q_tp0, q_tp1), self.sess)
+            tf.group(q_t_det_policy, q_tp1), self.sess)
 
         # Hard initial update
         self.update_target(tau=1.0)
 
     @override(TFPolicyGraph)
     def optimizer(self):
-        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
+        # we don't use this because we have two separate optimisers
+        return None
+
+    @override(TFPolicyGraph)
+    def build_apply_op(self, optimizer, grads_and_vars):
+        # The policy (actor) net is updated only once for every
+        # `policy_delay` update(s) of the critic net.
+        should_apply_actor_opt = tf.equal(
+            tf.mod(self.global_step, self.config["policy_delay"]), 0)
+
+        def make_apply_op():
+            return self._actor_optimizer.apply_gradients(
+                self._actor_grads_and_vars)
+
+        actor_op = tf.cond(
+            should_apply_actor_opt,
+            true_fn=make_apply_op,
+            false_fn=lambda: tf.no_op())
+        critic_op = self._critic_optimizer.apply_gradients(
+            self._critic_grads_and_vars)
+        # increment global step & apply ops
+        with tf.control_dependencies([tf.assign_add(self.global_step, 1)]):
+            return tf.group(actor_op, critic_op)
 
     @override(TFPolicyGraph)
     def gradients(self, optimizer, loss):
         if self.config["grad_norm_clipping"] is not None:
             actor_grads_and_vars = _minimize_and_clip(
-                optimizer,
-                self.loss.actor_loss,
-                var_list=self.p_func_vars,
+                self._actor_optimizer,
+                self.actor_loss,
+                var_list=self.policy_vars,
                 clip_val=self.config["grad_norm_clipping"])
             critic_grads_and_vars = _minimize_and_clip(
-                optimizer,
-                self.loss.critic_loss,
+                self._critic_optimizer,
+                self.critic_loss,
                 var_list=self.q_func_vars + self.twin_q_func_vars
                 if self.config["twin_q"] else self.q_func_vars,
                 clip_val=self.config["grad_norm_clipping"])
         else:
-            actor_grads_and_vars = optimizer.compute_gradients(
-                self.loss.actor_loss, var_list=self.p_func_vars)
-            critic_grads_and_vars = optimizer.compute_gradients(
-                self.loss.critic_loss,
-                var_list=self.q_func_vars + self.twin_q_func_vars
-                if self.config["twin_q"] else self.q_func_vars)
-        actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
-                                if g is not None]
-        critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
-                                 if g is not None]
-        grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
+            actor_grads_and_vars = self._actor_optimizer.compute_gradients(
+                self.actor_loss, var_list=self.policy_vars)
+            if self.config["twin_q"]:
+                critic_vars = self.q_func_vars + self.twin_q_func_vars
+            else:
+                critic_vars = self.q_func_vars
+            critic_grads_and_vars = self._critic_optimizer.compute_gradients(
+                self.critic_loss, var_list=critic_vars)
+        # save these for later use in build_apply_op
+        self._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
+                                      if g is not None]
+        self._critic_grads_and_vars = [(g, v)
+                                       for (g, v) in critic_grads_and_vars
+                                       if g is not None]
+        grads_and_vars = self._actor_grads_and_vars \
+            + self._critic_grads_and_vars
         return grads_and_vars
 
     @override(TFPolicyGraph)
     def extra_compute_action_feed_dict(self):
         return {
+            # FIXME: what about turning off exploration? Isn't that a good
+            # idea?
             self.stochastic: True,
-            self.eps: self.cur_epsilon,
+            self.noise_scale: self.cur_noise_scale,
+            self.pure_exploration_phase: self.cur_pure_exploration_phase,
         }
 
     @override(TFPolicyGraph)
     def extra_compute_grad_fetches(self):
         return {
-            "td_error": self.loss.td_error,
+            "td_error": self.td_error,
             LEARNER_STATS_KEY: self.stats,
         }
 
@@ -499,59 +385,192 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
 
     @override(PolicyGraph)
     def get_state(self):
-        return [TFPolicyGraph.get_state(self), self.cur_epsilon]
+        return [
+            TFPolicyGraph.get_state(self), self.cur_noise_scale,
+            self.cur_pure_exploration_phase
+        ]
 
     @override(PolicyGraph)
     def set_state(self, state):
         TFPolicyGraph.set_state(self, state[0])
         self.set_epsilon(state[1])
+        self.set_pure_exploration_phase(state[2])
 
     def _build_q_network(self, obs, obs_space, action_space, actions):
-        q_net = QNetwork(
-            ModelCatalog.get_model({
+        if self.config["use_state_preprocessor"]:
+            q_model = ModelCatalog.get_model({
                 "obs": obs,
                 "is_training": self._get_is_training_placeholder(),
-            }, obs_space, action_space, 1, self.config["model"]), actions,
-            self.config["critic_hiddens"],
-            self.config["critic_hidden_activation"])
-        return q_net.value, q_net.model
+            }, obs_space, action_space, 1, self.config["model"])
+            q_out = tf.concat([q_model.last_layer, actions], axis=1)
+        else:
+            q_model = None
+            q_out = tf.concat([obs, actions], axis=1)
 
-    def _build_p_network(self, obs, obs_space, action_space):
-        policy_net = PolicyNetwork(
-            ModelCatalog.get_model({
+        activation = getattr(tf.nn, self.config["critic_hidden_activation"])
+        for hidden in self.config["critic_hiddens"]:
+            q_out = layers.fully_connected(
+                q_out, num_outputs=hidden, activation_fn=activation)
+        q_values = layers.fully_connected(
+            q_out, num_outputs=1, activation_fn=None)
+
+        return q_values, q_model
+
+    def _build_policy_network(self, obs, obs_space, action_space):
+        if self.config["use_state_preprocessor"]:
+            model = ModelCatalog.get_model({
                 "obs": obs,
                 "is_training": self._get_is_training_placeholder(),
-            }, obs_space, action_space, 1, self.config["model"]),
-            self.dim_actions, self.config["actor_hiddens"],
-            self.config["actor_hidden_activation"],
-            self.config["parameter_noise"])
-        return policy_net.action_scores, policy_net.model
+            }, obs_space, action_space, 1, self.config["model"])
+            action_out = model.last_layer
+        else:
+            model = None
+            action_out = obs
 
-    def _build_action_network(self, p_values, stochastic, eps,
-                              is_target=False):
-        return ActionNetwork(
-            p_values, self.low_action, self.high_action, stochastic, eps,
-            self.config["exploration_theta"], self.config["exploration_sigma"],
-            self.config["smooth_target_policy"], self.config["act_noise"],
-            is_target, self.config["target_noise"],
-            self.config["noise_clip"]).actions
+        activation = getattr(tf.nn, self.config["actor_hidden_activation"])
+        normalizer_fn = layers.layer_norm if self.config["parameter_noise"] \
+            else None
+        for hidden in self.config["actor_hiddens"]:
+            action_out = layers.fully_connected(
+                action_out,
+                num_outputs=hidden,
+                activation_fn=activation,
+                normalizer_fn=normalizer_fn)
+        action_out = layers.fully_connected(
+            action_out, num_outputs=self.dim_actions, activation_fn=None)
+
+        # Use sigmoid to scale to [0,1], but also double magnitude of input to
+        # emulate behaviour of tanh activation used in DDPG and TD3 papers.
+        sigmoid_out = tf.nn.sigmoid(2 * action_out)
+        # Rescale to actual env policy scale
+        # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
+        # get same dims)
+        action_range = (action_space.high - action_space.low)[None]
+        low_action = action_space.low[None]
+        actions = action_range * sigmoid_out + low_action
+
+        return actions, model
+
+    def _add_exploration_noise(self, deterministic_actions,
+                               should_be_stochastic, noise_scale,
+                               enable_pure_exploration, action_space):
+        noise_type = self.config["exploration_noise_type"]
+        action_low = action_space.low
+        action_high = action_space.high
+        action_range = action_space.high - action_low
+
+        def compute_stochastic_actions():
+            def make_noisy_actions():
+                # shape of deterministic_actions is [None, dim_action]
+                if noise_type == "gaussian":
+                    # add IID Gaussian noise for exploration, TD3-style
+                    normal_sample = noise_scale * tf.random_normal(
+                        tf.shape(deterministic_actions),
+                        stddev=self.config["exploration_gaussian_sigma"])
+                    stochastic_actions = tf.clip_by_value(
+                        deterministic_actions + normal_sample, action_low,
+                        action_high)
+                elif noise_type == "ou":
+                    # add OU noise for exploration, DDPG-style
+                    zero_acts = action_low.size * [.0]
+                    exploration_sample = tf.get_variable(
+                        name="ornstein_uhlenbeck",
+                        dtype=tf.float32,
+                        initializer=zero_acts,
+                        trainable=False)
+                    normal_sample = tf.random_normal(
+                        shape=[action_low.size], mean=0.0, stddev=1.0)
+                    ou_new = self.config["exploration_ou_theta"] \
+                        * -exploration_sample \
+                        + self.config["exploration_ou_sigma"] * normal_sample
+                    exploration_value = tf.assign_add(exploration_sample,
+                                                      ou_new)
+                    base_scale = self.config["exploration_ou_noise_scale"]
+                    noise = noise_scale * base_scale \
+                        * exploration_value * action_range
+                    stochastic_actions = tf.clip_by_value(
+                        deterministic_actions + noise, action_low, action_high)
+                else:
+                    raise ValueError(
+                        "Unknown noise type '%s' (try 'ou' or 'gaussian')" %
+                        noise_type)
+                return stochastic_actions
+
+            def make_uniform_random_actions():
+                # pure random exploration option
+                uniform_random_actions = tf.random.uniform(
+                    tf.shape(deterministic_actions))
+                # rescale uniform random actions according to action range
+                tf_range = tf.constant(action_range[None], dtype="float32")
+                tf_low = tf.constant(action_low[None], dtype="float32")
+                uniform_random_actions = uniform_random_actions * tf_range \
+                    + tf_low
+                return uniform_random_actions
+
+            stochastic_actions = tf.cond(
+                # need to condition on noise_scale > 0 because zeroing
+                # noise_scale is how evaluator signals no noise should be used
+                # (this is ugly and should be fixed by adding an "eval_mode"
+                # config flag or something)
+                tf.logical_and(enable_pure_exploration, noise_scale > 0),
+                true_fn=make_uniform_random_actions,
+                false_fn=make_noisy_actions)
+            return stochastic_actions
+
+        enable_stochastic = tf.logical_and(should_be_stochastic,
+                                           not self.config["parameter_noise"])
+        actions = tf.cond(enable_stochastic, compute_stochastic_actions,
+                          lambda: deterministic_actions)
+        return actions
 
     def _build_actor_critic_loss(self,
                                  q_t,
                                  q_tp1,
-                                 q_tp0,
+                                 q_t_det_policy,
                                  twin_q_t=None,
                                  twin_q_tp1=None):
-        return ActorCriticLoss(
-            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
-            self.done_mask, twin_q_t, twin_q_tp1,
-            self.config["actor_loss_coeff"], self.config["critic_loss_coeff"],
-            self.config["gamma"], self.config["n_step"],
-            self.config["use_huber"], self.config["huber_threshold"],
-            self.config["twin_q"])
+        twin_q = self.config["twin_q"]
+        gamma = self.config["gamma"]
+        n_step = self.config["n_step"]
+        use_huber = self.config["use_huber"]
+        huber_threshold = self.config["huber_threshold"]
+
+        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
+        if twin_q:
+            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
+            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
+
+        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
+        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
+
+        # compute RHS of bellman equation
+        q_t_selected_target = tf.stop_gradient(
+            self.rew_t + gamma**n_step * q_tp1_best_masked)
+
+        # compute the error (potentially clipped)
+        if twin_q:
+            td_error = q_t_selected - q_t_selected_target
+            twin_td_error = twin_q_t_selected - q_t_selected_target
+            td_error = td_error + twin_td_error
+            if use_huber:
+                errors = _huber_loss(td_error, huber_threshold) \
+                    + _huber_loss(twin_td_error, huber_threshold)
+            else:
+                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
+                    twin_td_error)
+        else:
+            td_error = q_t_selected - q_t_selected_target
+            if use_huber:
+                errors = _huber_loss(td_error, huber_threshold)
+            else:
+                errors = 0.5 * tf.square(td_error)
+
+        critic_loss = tf.reduce_mean(self.importance_weights * errors)
+        actor_loss = -tf.reduce_mean(q_t_det_policy)
+        return critic_loss, actor_loss, td_error
 
     def _build_parameter_noise(self, pnet_params):
-        self.parameter_noise_sigma_val = self.config["exploration_sigma"]
+        self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
         self.parameter_noise_sigma = tf.get_variable(
             initializer=tf.constant_initializer(
                 self.parameter_noise_sigma_val),
@@ -590,7 +609,7 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
     def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                          importance_weights):
         td_err = self.sess.run(
-            self.loss.td_error,
+            self.td_error,
             feed_dict={
                 self.obs_t: [np.array(ob) for ob in obs_t],
                 self.act_t: act_t,
@@ -610,9 +629,16 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph):
 
     # support both hard and soft sync
     def update_target(self, tau=None):
+        tau = tau or self.tau_value
         return self.sess.run(
-            self.update_target_expr,
-            feed_dict={self.tau: tau or self.tau_value})
+            self.update_target_expr, feed_dict={self.tau: tau})
 
     def set_epsilon(self, epsilon):
-        self.cur_epsilon = epsilon
+        # set_epsilon is called by optimizer to anneal exploration as
+        # necessary, and to turn it off during evaluation. The "epsilon" part
+        # is a carry-over from DQN, which uses epsilon-greedy exploration
+        # rather than adding action noise to the output of a policy network.
+        self.cur_noise_scale = epsilon
+
+    def set_pure_exploration_phase(self, pure_exploration_phase):
+        self.cur_pure_exploration_phase = pure_exploration_phase
diff --git a/python/ray/rllib/agents/ddpg/td3.py b/python/ray/rllib/agents/ddpg/td3.py
new file mode 100644
index 000000000..714c39c6b
--- /dev/null
+++ b/python/ray/rllib/agents/ddpg/td3.py
@@ -0,0 +1,57 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
+    DEFAULT_CONFIG as DDPG_CONFIG
+from ray.rllib.utils import merge_dicts
+
+TD3_DEFAULT_CONFIG = merge_dicts(
+    DDPG_CONFIG,
+    {
+        # largest changes: twin Q functions, delayed policy updates, and target
+        # smoothing
+        "twin_q": True,
+        "policy_delay": 2,
+        "smooth_target_policy": True,
+        "target_noise": 0.2,
+        "target_noise_clip": 0.5,
+
+        # other changes & things we want to keep fixed: IID Gaussian
+        # exploration noise, larger actor learning rate, no l2 regularisation,
+        # no Huber loss, etc.
+        "exploration_should_anneal": False,
+        "exploration_noise_type": "gaussian",
+        "exploration_gaussian_sigma": 0.1,
+        "learning_starts": 10000,
+        "pure_exploration_steps": 10000,
+        "actor_hiddens": [400, 300],
+        "critic_hiddens": [400, 300],
+        "n_step": 1,
+        "gamma": 0.99,
+        "actor_lr": 1e-3,
+        "critic_lr": 1e-3,
+        "l2_reg": 0.0,
+        "tau": 5e-3,
+        "train_batch_size": 100,
+        "use_huber": False,
+        "target_network_update_freq": 0,
+        "optimizer_class": "SyncReplayOptimizer",
+        "num_workers": 0,
+        "num_gpus_per_worker": 0,
+        "per_worker_exploration": False,
+        "worker_side_prioritization": False,
+        "buffer_size": 1000000,
+        "prioritized_replay": False,
+        "clip_rewards": False,
+        "use_state_preprocessor": False,
+    },
+)
+
+
+class TD3Trainer(DDPGTrainer):
+    """A more stable successor to DDPG. By default, this uses a near-identical
+    configuration to that reported in the TD3 paper."""
+
+    _name = "TD3"
+    _default_config = TD3_DEFAULT_CONFIG
diff --git a/python/ray/rllib/agents/registry.py b/python/ray/rllib/agents/registry.py
index 7b133fa81..aa70275ad 100644
--- a/python/ray/rllib/agents/registry.py
+++ b/python/ray/rllib/agents/registry.py
@@ -34,6 +34,11 @@ def _import_apex_ddpg():
     return ddpg.ApexDDPGTrainer
 
 
+def _import_td3():
+    from ray.rllib.agents import ddpg
+    return ddpg.TD3Trainer
+
+
 def _import_ppo():
     from ray.rllib.agents import ppo
     return ppo.PPOTrainer
@@ -87,6 +92,7 @@ def _import_marwil():
 ALGORITHMS = {
     "DDPG": _import_ddpg,
     "APEX_DDPG": _import_apex_ddpg,
+    "TD3": _import_td3,
     "PPO": _import_ppo,
     "ES": _import_es,
     "ARS": _import_ars,
diff --git a/python/ray/rllib/tests/test_checkpoint_restore.py b/python/ray/rllib/tests/test_checkpoint_restore.py
index 3b16ad1dd..68fe6e7cb 100644
--- a/python/ray/rllib/tests/test_checkpoint_restore.py
+++ b/python/ray/rllib/tests/test_checkpoint_restore.py
@@ -40,7 +40,8 @@ CONFIGS = {
         },
     },
     "DDPG": {
-        "noise_scale": 0.0,
+        "pure_exploration_steps": 0,
+        "exploration_ou_noise_scale": 0.0,
         "timesteps_per_iteration": 100
     },
     "PPO": {
diff --git a/python/ray/rllib/tests/test_supported_spaces.py b/python/ray/rllib/tests/test_supported_spaces.py
index 7d59a04fb..c3ea442c8 100644
--- a/python/ray/rllib/tests/test_supported_spaces.py
+++ b/python/ray/rllib/tests/test_supported_spaces.py
@@ -116,8 +116,9 @@ class ModelSupportedSpaces(unittest.TestCase):
         check_support("APPO", {"num_gpus": 0, "vtrace": False}, stats)
         check_support(
             "DDPG", {
-                "noise_scale": 100.0,
-                "timesteps_per_iteration": 1
+                "exploration_ou_noise_scale": 100.0,
+                "timesteps_per_iteration": 1,
+                "use_state_preprocessor": True,
             },
             stats,
             check_bounds=True)
@@ -188,6 +189,7 @@ class ModelSupportedSpaces(unittest.TestCase):
                 "min_iter_time_s": 1,
                 "learning_starts": 1000,
                 "target_network_update_freq": 100,
+                "use_state_preprocessor": True,
             })
         check_support_multiagent("IMPALA", {"num_gpus": 0})
         check_support_multiagent("DQN", {"timesteps_per_iteration": 1})
@@ -206,7 +208,10 @@ class ModelSupportedSpaces(unittest.TestCase):
                 "sgd_minibatch_size": 1,
             })
         check_support_multiagent("PG", {"num_workers": 1, "optimizer": {}})
-        check_support_multiagent("DDPG", {"timesteps_per_iteration": 1})
+        check_support_multiagent("DDPG", {
+            "timesteps_per_iteration": 1,
+            "use_state_preprocessor": True,
+        })
 
 
 if __name__ == "__main__":
diff --git a/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml b/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml
index f02399ab3..6a4bd52e7 100644
--- a/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml
+++ b/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml
@@ -15,13 +15,14 @@ halfcheetah-ddpg:
         env_config: {}
 
         # === Exploration ===
+        exploration_should_anneal: True
         schedule_max_timesteps: 100000
         timesteps_per_iteration: 1000
         exploration_fraction: 0.1
-        exploration_final_eps: 0.02
-        noise_scale: 0.1
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
+        exploration_final_scale: 0.02
+        exploration_ou_noise_scale: 0.1
+        exploration_ou_theta: 0.15
+        exploration_ou_sigma: 0.2
         target_network_update_freq: 0
         tau: 0.001
 
@@ -34,9 +35,8 @@ halfcheetah-ddpg:
         clip_rewards: False
 
         # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
+        actor_lr: 0.001
+        critic_lr: 0.001
         use_huber: False
         huber_threshold: 1.0
         l2_reg: 0.000001
@@ -50,3 +50,7 @@ halfcheetah-ddpg:
         optimizer_class: "SyncReplayOptimizer"
         per_worker_exploration: False
         worker_side_prioritization: False
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
diff --git a/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml b/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml
new file mode 100644
index 000000000..f215c0300
--- /dev/null
+++ b/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml
@@ -0,0 +1,22 @@
+invertedpendulum-td3:
+    # TD3 config with stopping conditions and network size tuned specifically
+    # for InvertedPendulum. Should be able to reach 1,000 reward (the maximum
+    # achievable) in 10,000 to 20,000 steps.
+    env: InvertedPendulum-v2
+    run: TD3
+    stop:
+        episode_reward_mean: 9999.9
+        time_total_s: 900 # 15 minutes
+        timesteps_total: 1000000
+    config:
+        # === Model ===
+        actor_hiddens: [32, 32]
+        critic_hiddens: [32, 32]
+
+        # === Exploration ===
+        learning_starts: 1000
+        pure_exploration_steps: 1000
+
+        # === Evaluation ===
+        evaluation_interval: 1
+        evaluation_num_episodes: 5
diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml
index 82947d872..9e8923ffe 100644
--- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml
+++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml
@@ -7,7 +7,9 @@ mountaincarcontinuous-apex-ddpg:
     config:
         clip_rewards: False
         num_workers: 16
-        noise_scale: 1.0
+        exploration_ou_noise_scale: 1.0
         n_step: 3
         target_network_update_freq: 50000
         tau: 1.0
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml
index e74b2e0f1..3a8f61229 100644
--- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml
+++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml
@@ -15,13 +15,14 @@ mountaincarcontinuous-ddpg:
         env_config: {}
 
         # === Exploration ===
+        exploration_should_anneal: True
         schedule_max_timesteps: 100000
         timesteps_per_iteration: 1000
         exploration_fraction: 0.4
-        exploration_final_eps: 0.02
-        noise_scale: 0.75
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
+        exploration_final_scale: 0.02
+        exploration_ou_noise_scale: 0.75
+        exploration_ou_theta: 0.15
+        exploration_ou_sigma: 0.2
         target_network_update_freq: 0
         tau: 0.01
 
@@ -34,9 +35,8 @@ mountaincarcontinuous-ddpg:
         clip_rewards: False
 
         # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
+        actor_lr: 0.001
+        critic_lr: 0.001
         use_huber: False
         huber_threshold: 1.0
         l2_reg: 0.00001
@@ -50,3 +50,7 @@ mountaincarcontinuous-ddpg:
         optimizer_class: "SyncReplayOptimizer"
         per_worker_exploration: False
         worker_side_prioritization: False
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
diff --git a/python/ray/rllib/tuned_examples/mujoco-td3.yaml b/python/ray/rllib/tuned_examples/mujoco-td3.yaml
new file mode 100644
index 000000000..8f626b40b
--- /dev/null
+++ b/python/ray/rllib/tuned_examples/mujoco-td3.yaml
@@ -0,0 +1,24 @@
+mujoco-td3:
+    # Solve latest versions of the four hardest Mujoco tasks benchmarked in the
+    # original TD3 paper. Average return over 10 trials at end of 1,000,000
+    # timesteps (taken from Table 2 of the paper) is given in parens at the end
+    # of each environment name.
+    #
+    # Paper is at https://arxiv.org/pdf/1802.09477.pdf
+    env:
+        grid_search:
+            - HalfCheetah-v2  # (9,532.99)
+            - Hopper-v2  # (3,304.75)
+            - Walker2d-v2  # (4,565.24)
+            - Ant-v2  # (4,185.06)
+    run: TD3
+    stop:
+        timesteps_total: 1000000
+    config:
+        # === Exploration ===
+        learning_starts: 10000
+        pure_exploration_steps: 10000
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
diff --git a/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml
index f7a7c71f6..7122b577e 100644
--- a/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml
@@ -11,3 +11,5 @@ pendulum-apex-ddpg:
         n_step: 1
         target_network_update_freq: 50000
         tau: 1.0
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
index 38b93ea72..59891a86b 100644
--- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
@@ -15,13 +15,14 @@ pendulum-ddpg:
         env_config: {}
 
         # === Exploration ===
+        exploration_should_anneal: True
         schedule_max_timesteps: 100000
         timesteps_per_iteration: 600
         exploration_fraction: 0.1
-        exploration_final_eps: 0.02
-        noise_scale: 0.1
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
+        exploration_final_scale: 0.02
+        exploration_ou_noise_scale: 0.1
+        exploration_ou_theta: 0.15
+        exploration_ou_sigma: 0.2
         target_network_update_freq: 0
         tau: 0.001
 
@@ -34,9 +35,8 @@ pendulum-ddpg:
         clip_rewards: False
 
         # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
+        actor_lr: 0.001
+        critic_lr: 0.001
         use_huber: True
         huber_threshold: 1.0
         l2_reg: 0.000001
@@ -50,3 +50,7 @@ pendulum-ddpg:
         optimizer_class: "SyncReplayOptimizer"
         per_worker_exploration: False
         worker_side_prioritization: False
+
+        # === Evaluation ===
+        evaluation_interval: 5
+        evaluation_num_episodes: 10
diff --git a/python/ray/rllib/tuned_examples/pendulum-td3.yaml b/python/ray/rllib/tuned_examples/pendulum-td3.yaml
index 25b0900d6..77211cf7a 100644
--- a/python/ray/rllib/tuned_examples/pendulum-td3.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-td3.yaml
@@ -1,60 +1,19 @@
 # This configuration can expect to reach -160 reward in 10k-20k timesteps
 pendulum-ddpg:
     env: Pendulum-v0
-    run: DDPG
+    run: TD3
     stop:
-        episode_reward_mean: -160
-        time_total_s: 600 # 10 minutes
+        episode_reward_mean: -130
+        time_total_s: 900 # 15 minutes
     config:
-        # === Tricks ===
-        twin_q: True
-        policy_delay: 2
-        smooth_target_policy: True
-        act_noise: 0.1
-        target_noise: 0.2
-        noise_clip: 0.5
-
         # === Model ===
         actor_hiddens: [64, 64]
         critic_hiddens: [64, 64]
-        n_step: 1
-        model: {}
-        gamma: 0.99
-        env_config: {}
 
         # === Exploration ===
-        schedule_max_timesteps: 100000
-        timesteps_per_iteration: 600
-        exploration_fraction: 0.1
-        exploration_final_eps: 0.02
-        noise_scale: 0.1
-        exploration_theta: 0.15
-        exploration_sigma: 0.2
-        target_network_update_freq: 0
-        tau: 0.001
+        learning_starts: 5000
+        pure_exploration_steps: 5000
 
-        # === Replay buffer ===
-        buffer_size: 10000
-        prioritized_replay: True
-        prioritized_replay_alpha: 0.6
-        prioritized_replay_beta: 0.4
-        prioritized_replay_eps: 0.000001
-        clip_rewards: False
-
-        # === Optimization ===
-        lr: 0.001
-        actor_loss_coeff: 0.1
-        critic_loss_coeff: 1.0
-        use_huber: True
-        huber_threshold: 1.0
-        l2_reg: 0.000001
-        learning_starts: 500
-        sample_batch_size: 1
-        train_batch_size: 64
-
-        # === Parallelism ===
-        num_workers: 0
-        num_gpus_per_worker: 0
-        optimizer_class: "SyncReplayOptimizer"
-        per_worker_exploration: False
-        worker_side_prioritization: False
+        # === Evaluation ===
+        evaluation_interval: 1
+        evaluation_num_episodes: 5