diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst
index 09d94730b..cfe8ee90c 100644
--- a/doc/source/rllib-algorithms.rst
+++ b/doc/source/rllib-algorithms.rst
@@ -16,6 +16,7 @@ Algorithm           Frameworks Discrete Actions        Continuous Actions Multi-
 `ES`_               tf + torch **Yes**                 **Yes**            No
 `DDPG`_, `TD3`_     tf + torch No                      **Yes**            **Yes**
 `APEX-DDPG`_        tf + torch No                      **Yes**            **Yes**
+`Dreamer`_          torch      No                      **Yes**            No          `+RNN`_
 `DQN`_, `Rainbow`_  tf + torch **Yes** `+parametric`_  No                 **Yes**
 `APEX-DQN`_         tf + torch **Yes** `+parametric`_  No                 **Yes**
 `IMPALA`_           tf + torch **Yes** `+parametric`_  **Yes**            **Yes**     `+RNN`_, `+LSTM auto-wrapping`_, `+Transformer`_, `+autoreg`_
@@ -35,7 +36,7 @@ Algorithm           Frameworks Discrete Actions        Continuous Actions Multi-
 .. _`+LSTM auto-wrapping`: rllib-models.html#built-in-models
 .. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces
 .. _`+RNN`: rllib-models.html#recurrent-models
-.. _`+Transformer`: rllib-models.html#attention-networks-transformers
+.. _`+Transformer`: rllib-models.html#attention-networks
 .. _`A2C, A3C`: rllib-algorithms.html#a3c
 .. _`APEX-DQN`: rllib-algorithms.html#apex
 .. _`APEX-DDPG`: rllib-algorithms.html#apex
@@ -304,22 +305,16 @@ SpaceInvaders  650                       1001                           1025
 
 Policy Gradients
 ----------------
-|pytorch| |tensorflow| An `implementation <https://github.com/ray-project/ray/blob/master/rllib/agents/pg/pg.py>`__ of a vanilla policy gradient algorithm for TensorFlow and PyTorch.
-
-**Papers**:
-`[1] - Policy Gradient Methods for Reinforcement Learning with Function Approximation. <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`__
-and
-`[2] - Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning. <http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf>`__
-
+|pytorch| |tensorflow|
+`[paper] <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/rllib/agents/pg/pg.py>`__ We include a vanilla policy gradients implementation as an example algorithm.
 
 .. figure:: a2c-arch.svg
 
     Policy gradients architecture (same as A2C)
 
-**Tuned examples**: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/pg/cartpole-pg.yaml>`__
+Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/pg/cartpole-pg.yaml>`__
 
-**PG-specific configs**: The following updates will overwrite/be added to the
-(base) Trainer config in `rllib/agents/trainer.py <rllib-training.html#common-parameters>`__ (*COMMON_CONFIG* dict):
+**PG-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
 
 .. literalinclude:: ../../rllib/agents/pg/pg.py
    :language: python
@@ -435,6 +430,35 @@ Tuned examples: HalfCheetahRandDirecEnv (`Env <https://github.com/ray-project/ra
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _dreamer:
+
+Dreamer
+-------
+|pytorch|
+`[paper] <https://arxiv.org/abs/1912.01603>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/rllib/agents/dreamer/dreamer.py>`__
+
+Dreamer is an image-only model-based RL method that learns by imagining trajectories in the future and is evaluated on the DeepMind Control Suite `environments <https://github.com/ray-project/ray/blob/master/rllib/examples/env/dm_control_suite.py>`__. RLlib's Dreamer is adapted from the `official Google research repo <https://github.com/google-research/dreamer>`__.
+
+To visualize learning, RLlib Dreamer's imagined trajectories are logged as GIFs in TensorBoard. Examples can be seen `here <https://github.com/ray-project/rl-experiments>`__.
+
+Tuned examples: `Deepmind Control Environments <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml>`__
+
+**Deepmind Control results @1M steps:** `more details <https://github.com/ray-project/rl-experiments>`__
+
+=============  ==============  ======================
+DMC env        RLlib Dreamer   Danijar et al Dreamer
+=============  ==============  ======================
+Walker-Walk    920             ~930
+Cheetah-Run    640             ~800
+=============  ==============  ======================
+
+**Dreamer-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../rllib/agents/dreamer/dreamer.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Derivative-free
 ~~~~~~~~~~~~~~~
 
diff --git a/doc/source/rllib-toc.rst b/doc/source/rllib-toc.rst
index e460305f5..1a22fcda0 100644
--- a/doc/source/rllib-toc.rst
+++ b/doc/source/rllib-toc.rst
@@ -104,6 +104,8 @@ Algorithms
 
    -  |pytorch| |tensorflow| :ref:`Deep Deterministic Policy Gradients (DDPG, TD3) <ddpg>`
 
+   -  |pytorch| :ref:`Dreamer <dreamer>`
+
    -  |pytorch| |tensorflow| :ref:`Deep Q Networks (DQN, Rainbow, Parametric DQN) <dqn>`
 
    -  |pytorch| |tensorflow| :ref:`Model-Agnostic Meta-Learning (MAML) <maml>`
diff --git a/python/ray/tune/logger.py b/python/ray/tune/logger.py
index 945c25537..54cf903c4 100644
--- a/python/ray/tune/logger.py
+++ b/python/ray/tune/logger.py
@@ -227,6 +227,13 @@ class TBXLogger(Logger):
                   and len(value) > 0) or (type(value) == np.ndarray
                                           and value.size > 0):
                 valid_result[full_attr] = value
+
+                # Must be video
+                if type(value) == np.ndarray and value.ndim == 5:
+                    self._file_writer.add_video(
+                        full_attr, value, global_step=step, fps=20)
+                    continue
+
                 try:
                     self._file_writer.add_histogram(
                         full_attr, value, global_step=step)
diff --git a/rllib/agents/dreamer/__init__.py b/rllib/agents/dreamer/__init__.py
new file mode 100644
index 000000000..c71cd58cc
--- /dev/null
+++ b/rllib/agents/dreamer/__init__.py
@@ -0,0 +1,6 @@
+from ray.rllib.agents.dreamer.dreamer import DREAMERTrainer, DEFAULT_CONFIG
+
+__all__ = [
+    "DREAMERTrainer",
+    "DEFAULT_CONFIG",
+]
diff --git a/rllib/agents/dreamer/dreamer.py b/rllib/agents/dreamer/dreamer.py
new file mode 100644
index 000000000..4543d1401
--- /dev/null
+++ b/rllib/agents/dreamer/dreamer.py
@@ -0,0 +1,267 @@
+import logging
+
+import random
+import numpy as np
+
+from ray.rllib.agents import with_common_config
+from ray.rllib.agents.dreamer.dreamer_torch_policy import DreamerTorchPolicy
+from ray.rllib.agents.trainer_template import build_trainer
+from ray.rllib.execution.common import STEPS_SAMPLED_COUNTER, \
+    LEARNER_INFO, _get_shared_metrics
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
+from ray.rllib.evaluation.metrics import collect_metrics
+from ray.rllib.agents.dreamer.dreamer_model import DreamerModel
+from ray.rllib.execution.rollout_ops import ParallelRollouts
+from ray.rllib.utils.typing import SampleBatchType
+
+logger = logging.getLogger(__name__)
+
+# yapf: disable
+# __sphinx_doc_begin__
+DEFAULT_CONFIG = with_common_config({
+    # PlaNET Model LR
+    "td_model_lr": 6e-4,
+    # Actor LR
+    "actor_lr": 8e-5,
+    # Critic LR
+    "critic_lr": 8e-5,
+    # Grad Clipping
+    "grad_clip": 100.0,
+    # Discount
+    "discount": 0.99,
+    # Lambda
+    "lambda": 0.95,
+    # Training iterations per data collection from real env
+    "dreamer_train_iters": 100,
+    # Horizon for Environment (1000 for Mujoco/DMC)
+    "horizon": 1000,
+    # Number of episodes to sample for Loss Calculation
+    "batch_size": 50,
+    # Length of each episode to sample for Loss Calculation
+    "batch_length": 50,
+    # Imagination Horizon for Training Actor and Critic
+    "imagine_horizon": 15,
+    # Free Nats
+    "free_nats": 3.0,
+    # KL Coeff for the Model Loss
+    "kl_coeff": 1.0,
+    # Distributed Dreamer not implemented yet
+    "num_workers": 0,
+    # Prefill Timesteps
+    "prefill_timesteps": 5000,
+    # This should be kept at 1 to preserve sample efficiency
+    "num_envs_per_worker": 1,
+    # Exploration Gaussian
+    "explore_noise": 0.3,
+    # Batch mode
+    "batch_mode": "complete_episodes",
+    # Custom Model
+    "dreamer_model": {
+        "custom_model": DreamerModel,
+        # RSSM/PlaNET parameters
+        "deter_size": 200,
+        "stoch_size": 30,
+        # CNN Decoder Encoder
+        "depth_size": 32,
+        # General Network Parameters
+        "hidden_size": 400,
+        # Action STD
+        "action_init_std": 5.0,
+    },
+
+    "env_config": {
+        # Repeats action sent by policy for frame_skip times in env
+        "frame_skip": 2,
+    }
+})
+# __sphinx_doc_end__
+# yapf: enable
+
+
+class EpisodicBuffer(object):
+    def __init__(self, max_length: int = 1000, length: int = 50):
+        """Data structure that stores episodes and samples chunks
+        of size length from episodes
+
+        Args:
+            max_length: Maximum episodes it can store
+            length: Episode chunking length in sample()
+        """
+
+        # Stores all episodes into a list: List[SampleBatchType]
+        self.episodes = []
+        self.max_length = max_length
+        self.timesteps = 0
+        self.length = length
+
+    def add(self, batch: SampleBatchType):
+        """Splits a SampleBatch into episodes and adds episodes
+        to the episode buffer
+
+        Args:
+            batch: SampleBatch to be added
+        """
+
+        self.timesteps += batch.count
+        episodes = batch.split_by_episode()
+
+        for i, e in enumerate(episodes):
+            episodes[i] = self.preprocess_episode(e)
+        self.episodes.extend(episodes)
+
+        if len(self.episodes) > self.max_length:
+            delta = len(self.episodes) - self.max_length
+            # Drop oldest episodes
+            self.episodes = self.episodes[delta:]
+
+    def preprocess_episode(self, episode: SampleBatchType):
+        """Batch format should be in the form of (s_t, a_(t-1), r_(t-1))
+        When t=0, the reset obs is paired with action and reward of 0.
+
+        Args:
+            episode: SampleBatch representing an episode
+        """
+        obs = episode["obs"]
+        new_obs = episode["new_obs"]
+        action = episode["actions"]
+        reward = episode["rewards"]
+
+        act_shape = action.shape
+        act_reset = np.array([0.0] * act_shape[-1])[None]
+        rew_reset = np.array(0.0)[None]
+        obs_end = np.array(new_obs[act_shape[0] - 1])[None]
+
+        batch_obs = np.concatenate([obs, obs_end], axis=0)
+        batch_action = np.concatenate([act_reset, action], axis=0)
+        batch_rew = np.concatenate([rew_reset, reward], axis=0)
+
+        new_batch = {
+            "obs": batch_obs,
+            "rewards": batch_rew,
+            "actions": batch_action
+        }
+        return SampleBatch(new_batch)
+
+    def sample(self, batch_size: int):
+        """Samples [batch_size, length] from the list of episodes
+
+        Args:
+            batch_size: batch_size to be sampled
+        """
+        episodes_buffer = []
+        while len(episodes_buffer) < batch_size:
+            rand_index = random.randint(0, len(self.episodes) - 1)
+            episode = self.episodes[rand_index]
+            if episode.count < self.length:
+                continue
+            available = episode.count - self.length
+            index = int(random.randint(0, available))
+            episodes_buffer.append(episode.slice(index, index + self.length))
+
+        batch = {}
+        for k in episodes_buffer[0].keys():
+            batch[k] = np.stack([e[k] for e in episodes_buffer], axis=0)
+
+        return SampleBatch(batch)
+
+
+def total_sampled_timesteps(worker):
+    return worker.policy_map[DEFAULT_POLICY_ID].global_timestep
+
+
+class DreamerIteration:
+    def __init__(self, worker, episode_buffer, dreamer_train_iters, batch_size,
+                 act_repeat):
+        self.worker = worker
+        self.episode_buffer = episode_buffer
+        self.dreamer_train_iters = dreamer_train_iters
+        self.repeat = act_repeat
+        self.batch_size = batch_size
+
+    def __call__(self, samples):
+        """Runs the train loop, collects metrics, then buffers `samples`."""
+        # Dreamer Training Loop (only the last iteration requests a gif)
+        for n in range(self.dreamer_train_iters):
+            logger.debug("Dreamer train iteration %d", n)
+            batch = self.episode_buffer.sample(self.batch_size)
+            if n == self.dreamer_train_iters - 1:
+                batch["log_gif"] = True
+            fetches = self.worker.learn_on_batch(batch)
+
+        # Custom Logging
+        policy_fetches = self.policy_stats(fetches)
+        if "log_gif" in policy_fetches:
+            gif = policy_fetches["log_gif"]
+            policy_fetches["log_gif"] = self.postprocess_gif(gif)
+
+        # Metrics Calculation (sampled steps are scaled by the action repeat)
+        metrics = _get_shared_metrics()
+        metrics.info[LEARNER_INFO] = fetches
+        metrics.counters[STEPS_SAMPLED_COUNTER] = self.episode_buffer.timesteps
+        metrics.counters[STEPS_SAMPLED_COUNTER] *= self.repeat
+        res = collect_metrics(local_worker=self.worker)
+        res["info"] = metrics.info
+        res["info"].update(metrics.counters)
+        res["timesteps_total"] = metrics.counters[STEPS_SAMPLED_COUNTER]
+
+        self.episode_buffer.add(samples)
+        return res
+
+    def postprocess_gif(self, gif: np.ndarray):
+        gif = np.clip(255 * gif, 0, 255).astype(np.uint8)
+        B, T, C, H, W = gif.shape
+        frames = gif.transpose((1, 2, 3, 0, 4)).reshape((1, T, C, H, B * W))
+        return frames
+
+    def policy_stats(self, fetches):
+        return fetches["default_policy"]["learner_stats"]
+
+
+def execution_plan(workers, config):
+    # Special Replay Buffer for Dreamer agent
+    episode_buffer = EpisodicBuffer(length=config["batch_length"])
+
+    local_worker = workers.local_worker()
+
+    # Prefill episode buffer with initial exploration (uniform sampling)
+    while total_sampled_timesteps(local_worker) < config["prefill_timesteps"]:
+        samples = local_worker.sample()
+        episode_buffer.add(samples)
+
+    batch_size = config["batch_size"]
+    dreamer_train_iters = config["dreamer_train_iters"]
+    act_repeat = config["action_repeat"]
+
+    rollouts = ParallelRollouts(workers)
+    rollouts = rollouts.for_each(
+        DreamerIteration(local_worker, episode_buffer, dreamer_train_iters,
+                         batch_size, act_repeat))
+    return rollouts
+
+
+def get_policy_class(config):
+    return DreamerTorchPolicy
+
+
+def validate_config(config):
+    config["action_repeat"] = config["env_config"]["frame_skip"]
+    if config["framework"] != "torch":
+        raise ValueError("Dreamer not supported in Tensorflow yet!")
+    if config["batch_mode"] != "complete_episodes":
+        raise ValueError("truncate_episodes not supported")
+    if config["num_workers"] != 0:
+        raise ValueError("Distributed Dreamer not supported yet!")
+    if config["clip_actions"]:
+        raise ValueError("Clipping is done inherently via policy tanh!")
+    if config["action_repeat"] > 1:
+        config["horizon"] = config["horizon"] / config["action_repeat"]
+
+
+DREAMERTrainer = build_trainer(
+    name="Dreamer",
+    default_config=DEFAULT_CONFIG,
+    default_policy=DreamerTorchPolicy,
+    get_policy_class=get_policy_class,
+    execution_plan=execution_plan,
+    validate_config=validate_config)
diff --git a/rllib/agents/dreamer/dreamer_model.py b/rllib/agents/dreamer/dreamer_model.py
new file mode 100644
index 000000000..2daeee643
--- /dev/null
+++ b/rllib/agents/dreamer/dreamer_model.py
@@ -0,0 +1,559 @@
+import numpy as np
+from typing import Any, List, Tuple
+from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.framework import TensorType
+
+torch, nn = try_import_torch()
+if torch:
+    from torch import distributions as td
+    from ray.rllib.agents.dreamer.utils import Linear, Conv2d, \
+        ConvTranspose2d, GRUCell, TanhBijector
+
+ActFunc = Any
+
+# Encoder, part of PlaNET
+if torch:
+
+    class ConvEncoder(nn.Module):
+        """Standard Convolutional Encoder for Dreamer. This encoder is used
+      to encode images from an environment into a latent state for the
+      RSSM model in PlaNET.
+      """
+
+        def __init__(self,
+                     depth: int = 32,
+                     act: ActFunc = None,
+                     shape: List = [3, 64, 64]):
+            """Initializes Conv Encoder
+
+          Args:
+            depth (int): Number of channels in the first conv layer
+            act (Any): Activation for Encoder, default ReLU
+            shape (List): Shape of observation input
+          """
+            super().__init__()
+            self.act = act
+            if not act:
+                self.act = nn.ReLU
+            self.depth = depth
+            self.shape = shape
+
+            init_channels = self.shape[0]
+            self.layers = [
+                Conv2d(init_channels, self.depth, 4, stride=2),
+                self.act(),
+                Conv2d(self.depth, 2 * self.depth, 4, stride=2),
+                self.act(),
+                Conv2d(2 * self.depth, 4 * self.depth, 4, stride=2),
+                self.act(),
+                Conv2d(4 * self.depth, 8 * self.depth, 4, stride=2),
+                self.act(),
+            ]
+            self.model = nn.Sequential(*self.layers)
+
+        def forward(self, x):
+            # Flatten to [batch*horizon, 3, 64, 64] in loss function
+            orig_shape = list(x.size())
+            x = x.view(-1, *(orig_shape[-3:]))
+            x = self.model(x)
+
+            new_shape = orig_shape[:-3] + [32 * self.depth]
+            x = x.view(*new_shape)
+            return x
+
+
+if torch:
+
+    class Reshape(nn.Module):
+        """Standard module that reshapes/views a tensor
+    """
+
+        def __init__(self, shape: List):
+            super().__init__()
+            self.shape = shape
+
+        def forward(self, x):
+            return x.view(*self.shape)
+
+
+# Decoder, part of PlaNET
+if torch:
+
+    class ConvDecoder(nn.Module):
+        """Standard Convolutional Decoder for Dreamer. This decoder is used
+      to decode images from the latent state generated by the transition
+      dynamics model. This is used in calculating loss and logging gifs for
+      imagined trajectories.
+      """
+
+        def __init__(self,
+                     input_size: int,
+                     depth: int = 32,
+                     act: ActFunc = None,
+                     shape: List = [3, 64, 64]):
+            """Initializes Conv Decoder
+
+          Args:
+            input_size (int): Input size, usually feature size output from RSSM
+            depth (int): Number of channels in the first conv layer
+            act (Any): Activation for Encoder, default ReLU
+            shape (List): Shape of observation input
+          """
+            super().__init__()
+            self.act = act
+            if not act:
+                self.act = nn.ReLU
+            self.depth = depth
+            self.shape = shape
+
+            self.layers = [
+                Linear(input_size, 32 * self.depth),
+                Reshape((-1, 32 * self.depth, 1, 1)),
+                ConvTranspose2d(32 * self.depth, 4 * self.depth, 5, stride=2),
+                self.act(),
+                ConvTranspose2d(4 * self.depth, 2 * self.depth, 5, stride=2),
+                self.act(),
+                ConvTranspose2d(2 * self.depth, self.depth, 6, stride=2),
+                self.act(),
+                ConvTranspose2d(self.depth, self.shape[0], 6, stride=2),
+            ]
+            self.model = nn.Sequential(*self.layers)
+
+        def forward(self, x):
+            # x is [batch, hor_length, input_size]
+            orig_shape = list(x.size())
+            x = self.model(x)
+
+            reshape_size = orig_shape[:-1] + self.shape
+            mean = x.view(*reshape_size)
+
+            # Equivalent to making a multivariate diag
+            return td.Independent(td.Normal(mean, 1), len(self.shape))
+
+
+# Reward Model (PlaNET), and Value Function
+if torch:
+
+    class DenseDecoder(nn.Module):
+        """Fully Connected network that outputs a distribution for calculating log_prob
+      later in DreamerLoss
+      """
+
+        def __init__(self,
+                     input_size: int,
+                     output_size: int,
+                     layers: int,
+                     units: int,
+                     dist: str = "normal",
+                     act: ActFunc = None):
+            """Initializes FC network
+
+          Args:
+            input_size (int): Input size to network
+            output_size (int): Output size to network
+            layers (int): Number of layers in network
+            units (int): Size of the hidden layers
+            dist (str): Output distribution, parameterized by FC output logits
+            act (Any): Activation function
+          """
+            super().__init__()
+            self.layrs = layers
+            self.units = units
+            self.act = act
+            if not act:
+                self.act = nn.ELU
+            self.dist = dist
+            self.input_size = input_size
+            self.output_size = output_size
+            self.layers = []
+            cur_size = input_size
+            for _ in range(self.layrs):
+                self.layers.extend([Linear(cur_size, self.units), self.act()])
+                cur_size = units
+            self.layers.append(Linear(cur_size, output_size))
+            self.model = nn.Sequential(*self.layers)
+
+        def forward(self, x):
+            x = self.model(x)
+            if self.output_size == 1:
+                x = torch.squeeze(x, dim=-1)  # keep size-1 batch/time dims
+            if self.dist == "normal":
+                output_dist = td.Normal(x, 1)
+            elif self.dist == "binary":
+                output_dist = td.Bernoulli(logits=x)
+            else:
+                raise NotImplementedError("Distribution type not implemented!")
+            return td.Independent(output_dist, 0)
+
+
+# Represents dreamer policy
+if torch:
+
+    class ActionDecoder(nn.Module):
+        """ActionDecoder is the policy module in Dreamer. It outputs a distribution
+      parameterized by mean and std, later to be transformed by a custom
+      TanhBijector in utils.py for Dreamer.
+      """
+
+        def __init__(self,
+                     input_size: int,
+                     action_size: int,
+                     layers: int,
+                     units: int,
+                     dist: str = "tanh_normal",
+                     act: ActFunc = None,
+                     min_std: float = 1e-4,
+                     init_std: float = 5.0,
+                     mean_scale: float = 5.0):
+            """Initializes Policy
+
+          Args:
+            input_size (int): Input size to network
+            action_size (int): Action space size
+            layers (int): Number of layers in network
+            units (int): Size of the hidden layers
+            dist (str): Output distribution, with tanh_normal implemented
+            act (Any): Activation function
+            min_std (float): Minimum std for output distribution
+            init_std (float): Initial std
+            mean_scale (float): Augmenting mean output from FC network
+          """
+            super().__init__()
+            self.layrs = layers
+            self.units = units
+            self.dist = dist
+            self.act = act
+            if not act:
+                self.act = nn.ReLU
+            self.min_std = min_std
+            self.init_std = init_std
+            self.mean_scale = mean_scale
+            self.action_size = action_size
+
+            self.layers = []
+            self.softplus = nn.Softplus()
+
+            # MLP Construction
+            cur_size = input_size
+            for _ in range(self.layrs):
+                self.layers.extend([Linear(cur_size, self.units), self.act()])
+                cur_size = self.units
+            if self.dist == "tanh_normal":
+                self.layers.append(Linear(cur_size, 2 * action_size))
+            elif self.dist == "onehot":
+                self.layers.append(Linear(cur_size, action_size))
+            self.model = nn.Sequential(*self.layers)
+
+        # Returns distribution
+        def forward(self, x):
+            raw_init_std = np.log(np.exp(self.init_std) - 1)
+            x = self.model(x)
+            if self.dist == "tanh_normal":
+                mean, std = torch.chunk(x, 2, dim=-1)
+                mean = self.mean_scale * torch.tanh(mean / self.mean_scale)
+                std = self.softplus(std + raw_init_std) + self.min_std
+                dist = td.Normal(mean, std)
+                transforms = [TanhBijector()]
+                dist = td.transformed_distribution.TransformedDistribution(
+                    dist, transforms)
+                dist = td.Independent(dist, 1)
+            elif self.dist == "onehot":
+                dist = td.OneHotCategorical(logits=x)
+                raise NotImplementedError("Atari not implemented yet!")
+            return dist
+
+
+# Represents TD model in PlaNET
+if torch:
+
+    class RSSM(nn.Module):
+        """RSSM is the core recurrent part of the PlaNET module. It consists of
+      two networks, one (obs) to calculate posterior beliefs and states and
+      the second (img) to calculate prior beliefs and states. The prior network
+      takes in the previous state and action, while the posterior network takes
+      in the previous state, action, and a latent embedding of the most recent
+      observation.
+      """
+
+        def __init__(self,
+                     action_size: int,
+                     embed_size: int,
+                     stoch: int = 30,
+                     deter: int = 200,
+                     hidden: int = 200,
+                     act: ActFunc = None):
+            """Initializes RSSM
+
+          Args:
+            action_size (int): Action space size
+            embed_size (int): Size of ConvEncoder embedding
+            stoch (int): Size of the distributional hidden state
+            deter (int): Size of the deterministic hidden state
+            hidden (int): General size of hidden layers
+            act (Any): Activation function
+          """
+            super().__init__()
+            self.stoch_size = stoch
+            self.deter_size = deter
+            self.hidden_size = hidden
+            self.act = act
+            if act is None:
+                self.act = nn.ELU
+
+            self.obs1 = Linear(embed_size + deter, hidden)
+            self.obs2 = Linear(hidden, 2 * stoch)
+
+            self.cell = GRUCell(self.hidden_size, hidden_size=self.deter_size)
+            self.img1 = Linear(stoch + action_size, hidden)
+            self.img2 = Linear(deter, hidden)
+            self.img3 = Linear(hidden, 2 * stoch)
+
+            self.softplus = nn.Softplus
+
+            self.device = (torch.device("cuda") if torch.cuda.is_available()
+                           else torch.device("cpu"))
+
+        def get_initial_state(self, batch_size: int) -> List[TensorType]:
+            """Returns the initial state for the RSSM, which consists of mean, std
+          for the stochastic state, the sampled stochastic hidden state
+          (from mean, std), and the deterministic hidden state, which is pushed
+          through the GRUCell.
+
+          Args:
+            batch_size (int): Batch size for initial state
+
+          Returns:
+            List of tensors
+          """
+            return [
+                torch.zeros(batch_size, self.stoch_size).to(self.device),
+                torch.zeros(batch_size, self.stoch_size).to(self.device),
+                torch.zeros(batch_size, self.stoch_size).to(self.device),
+                torch.zeros(batch_size, self.deter_size).to(self.device),
+            ]
+
+        def observe(self,
+                    embed: TensorType,
+                    action: TensorType,
+                    state: List[TensorType] = None
+                    ) -> Tuple[List[TensorType], List[TensorType]]:
+            """Returns the corresponding states from the embedding from ConvEncoder
+          and actions. This is accomplished by rolling out the RNN from the
+          starting state through each index of embed and action, saving all
+          intermediate states between.
+
+          Args:
+            embed (TensorType): ConvEncoder embedding
+            action (TensorType): Actions
+            state (List[TensorType]): Initial state before rollout
+
+          Returns:
+            Posterior states and prior states (both List[TensorType])
+          """
+            if state is None:
+                state = self.get_initial_state(action.size()[0])
+
+            embed = embed.permute(1, 0, 2)
+            action = action.permute(1, 0, 2)
+
+            priors = [[] for i in range(len(state))]
+            posts = [[] for i in range(len(state))]
+            last = (state, state)
+            for index in range(len(action)):
+                # Tuple of post and prior
+                last = self.obs_step(last[0], action[index], embed[index])
+                [o.append(l) for l, o in zip(last[0], posts)]
+                [o.append(l) for l, o in zip(last[1], priors)]
+
+            prior = [torch.stack(x, dim=0) for x in priors]
+            post = [torch.stack(x, dim=0) for x in posts]
+
+            prior = [e.permute(1, 0, 2) for e in prior]
+            post = [e.permute(1, 0, 2) for e in post]
+
+            return post, prior
+
+        def imagine(self, action: TensorType,
+                    state: List[TensorType] = None) -> List[TensorType]:
+            """Imagines the trajectory starting from state through a list of actions.
+          Similar to observe(), requires rolling out the RNN for each timestep.
+
+          Args:
+            action (TensorType): Actions
+            state (List[TensorType]): Starting state before rollout
+
+          Returns:
+            Prior states
+          """
+            if state is None:
+                state = self.get_initial_state(action.size()[0])
+
+            action = action.permute(1, 0, 2)
+
+            indices = range(len(action))
+            priors = [[] for _ in range(len(state))]
+            last = state
+            for index in indices:
+                last = self.img_step(last, action[index])
+                [o.append(l) for l, o in zip(last, priors)]
+
+            prior = [torch.stack(x, dim=0) for x in priors]
+            prior = [e.permute(1, 0, 2) for e in prior]
+            return prior
+
+        def obs_step(self, prev_state: List[TensorType],
+                     prev_action: TensorType, embed: TensorType
+                     ) -> Tuple[List[TensorType], List[TensorType]]:
+            """Runs one step of the posterior model.
+
+            Args:
+                prev_state (List[TensorType]): [mean, std, stoch, deter]
+                prev_action (TensorType): The previous action
+                embed (TensorType): Embedding from ConvEncoder
+
+            Returns:
+                (post, prior), each laid out as [mean, std, stoch, deter]
+            """
+            prior = self.img_step(prev_state, prev_action)
+            x = torch.cat([prior[3], embed], dim=-1)  # deter + embedding
+            x = self.obs1(x)
+            x = self.act()(x)
+            x = self.obs2(x)
+            mean, std = torch.chunk(x, 2, dim=-1)
+            std = self.softplus()(std) + 0.1  # offset by a constant 0.1
+            stoch = self.get_dist(mean, std).rsample()
+            post = [mean, std, stoch, prior[3]]  # deter is shared with prior
+            return post, prior
+
+        def img_step(self, prev_state: List[TensorType],
+                     prev_action: TensorType) -> List[TensorType]:
+            """Runs one step of the prior model.
+
+            Args:
+                prev_state (List[TensorType]): [mean, std, stoch, deter]
+                prev_action (TensorType): The previous action
+
+            Returns:
+                Prior state as [mean, std, stoch, deter]
+            """
+            x = torch.cat([prev_state[2], prev_action], dim=-1)
+            x = self.img1(x)
+            x = self.act()(x)
+            deter = self.cell(x, prev_state[3])  # prev deter: cell's 2nd arg
+            x = deter
+            x = self.img2(x)
+            x = self.act()(x)
+            x = self.img3(x)
+            mean, std = torch.chunk(x, 2, dim=-1)
+            std = self.softplus()(std) + 0.1  # offset by a constant 0.1
+            stoch = self.get_dist(mean, std).rsample()
+            return [mean, std, stoch, deter]
+
+        def get_feature(self, state: List[TensorType]) -> TensorType:
+            # Feature for reward/decoder/actor/critic: stoch + deter, last dim
+            return torch.cat([state[2], state[3]], dim=-1)
+
+        def get_dist(self, mean: TensorType, std: TensorType) -> "td.Normal":
+            return td.Normal(mean, std)
+
+
+# Represents all models in Dreamer, unifies them all into a single interface
+if torch:
+
+    class DreamerModel(TorchModelV2, nn.Module):
+        def __init__(self, obs_space, action_space, num_outputs, model_config,
+                     name):
+            super().__init__(obs_space, action_space, num_outputs,
+                             model_config, name)
+            # nn.Module is initialized explicitly, after the super() call.
+            nn.Module.__init__(self)
+            self.depth = model_config["depth_size"]
+            self.deter_size = model_config["deter_size"]
+            self.stoch_size = model_config["stoch_size"]
+            self.hidden_size = model_config["hidden_size"]
+
+            self.action_size = action_space.shape[0]
+            # Decoder, reward, actor and value all take stoch + deter inputs.
+            self.encoder = ConvEncoder(self.depth)
+            self.decoder = ConvDecoder(
+                self.stoch_size + self.deter_size, depth=self.depth)
+            self.reward = DenseDecoder(self.stoch_size + self.deter_size, 1, 2,
+                                       self.hidden_size)
+            self.dynamics = RSSM(
+                self.action_size,
+                32 * self.depth,
+                stoch=self.stoch_size,
+                deter=self.deter_size)
+            self.actor = ActionDecoder(self.stoch_size + self.deter_size,
+                                       self.action_size, 4, self.hidden_size)
+            self.value = DenseDecoder(self.stoch_size + self.deter_size, 1, 3,
+                                      self.hidden_size)
+            self.state = None  # set by get_initial_state() and policy()
+            # Picks CUDA whenever it is available, without consulting config.
+            self.device = (torch.device("cuda") if torch.cuda.is_available()
+                           else torch.device("cpu"))
+
+        def policy(self,
+                   obs: TensorType,
+                   state: List[TensorType],
+                   explore=True
+                   ) -> Tuple[TensorType, List[float], List[TensorType]]:
+            """Returns the action. Runs through the encoder, recurrent model,
+            and policy to obtain action. `state` holds four posterior entries
+            and the previous action; None starts from get_initial_state().
+            """
+            # get_initial_state() is the reset hook this class defines.
+            self.state = self.get_initial_state() if state is None else state
+            post = self.state[:4]
+            action = self.state[4]
+
+            embed = self.encoder(obs)
+            post, _ = self.dynamics.obs_step(post, action, embed)
+            feat = self.dynamics.get_feature(post)
+
+            action_dist = self.actor(feat)
+            # Sample when exploring; otherwise take the distribution's mean.
+            if explore:
+                action = action_dist.sample()
+            else:
+                action = action_dist.mean
+            logp = action_dist.log_prob(action)
+
+            self.state = post + [action]
+            return action, logp, self.state
+
+        def imagine_ahead(self, state: List[TensorType],
+                          horizon: int) -> TensorType:
+            """Rolls the prior model forward for `horizon` steps, picking
+            actions with the actor, and returns the stacked features.
+            """
+            # Detach and merge the two leading dims of every state entry.
+            current = [
+                s.contiguous().detach().view(-1, *s.size()[2:]) for s in state
+            ]
+
+            def step(prev):
+                feature = self.dynamics.get_feature(prev).detach()
+                action = self.actor(feature).rsample()
+                return self.dynamics.img_step(prev, action)
+
+            # One list of per-step tensors for each state entry.
+            rollout = [[] for _ in current]
+            for _ in range(horizon):
+                current = step(current)
+                for history, entry in zip(rollout, current):
+                    history.append(entry)
+            stacked = [torch.stack(steps, dim=0) for steps in rollout]
+
+            # Features are built from the step-stacked state entries.
+            return self.dynamics.get_feature(stacked)
+
+        def get_initial_state(self) -> List[TensorType]:
+            self.state = self.dynamics.get_initial_state(1) + [
+                torch.zeros(1, self.action_space.shape[0]).to(self.device)
+            ]  # dynamics state (argument 1) plus an all-zero action entry
+            return self.state
+
+        def value_function(self) -> TensorType:
+            return None  # always None; the critic head lives in self.value
diff --git a/rllib/agents/dreamer/dreamer_torch_policy.py b/rllib/agents/dreamer/dreamer_torch_policy.py
new file mode 100644
index 000000000..f9abd10c8
--- /dev/null
+++ b/rllib/agents/dreamer/dreamer_torch_policy.py
@@ -0,0 +1,247 @@
+import logging
+
+import ray
+from ray.rllib.policy.torch_policy_template import build_torch_policy
+from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.agents.dreamer.utils import FreezeParameters
+
+torch, nn = try_import_torch()
+if torch:
+    from torch import distributions as td
+
+logger = logging.getLogger(__name__)
+
+
+# Builds the Dreamer losses for the world model, the actor and the critic
+def compute_dreamer_loss(obs,
+                         action,
+                         reward,
+                         model,
+                         imagine_horizon,
+                         discount=0.99,
+                         lambda_=0.95,
+                         kl_coeff=1.0,
+                         free_nats=3.0,
+                         log=False):
+    """Constructs loss for the Dreamer objective
+
+    Args:
+        obs (TensorType): Observations (o_t)
+        action (TensorType): Actions (a_(t-1))
+        reward (TensorType): Rewards (r_(t-1))
+        model (TorchModelV2): DreamerModel, encompassing all other models
+        imagine_horizon (int): Imagine horizon for actor and critic loss
+        discount (float): Discount
+        lambda_ (float): Lambda, like in GAE
+        kl_coeff (float): KL Coefficient for Divergence loss in model loss
+        free_nats (float): Threshold for minimum divergence in model loss
+        log (bool): If log, generate gifs
+
+    Returns:
+        dict: Losses and stats keyed by name, plus "log_gif" when log is set
+    """
+    encoder_weights = list(model.encoder.parameters())
+    decoder_weights = list(model.decoder.parameters())
+    reward_weights = list(model.reward.parameters())
+    dynamics_weights = list(model.dynamics.parameters())
+    critic_weights = list(model.value.parameters())
+    model_weights = list(encoder_weights + decoder_weights + reward_weights +
+                         dynamics_weights)
+
+    # PlaNET Model Loss
+    latent = model.encoder(obs)
+    post, prior = model.dynamics.observe(latent, action)
+    features = model.dynamics.get_feature(post)
+    image_pred = model.decoder(features)
+    reward_pred = model.reward(features)
+    image_loss = -torch.mean(image_pred.log_prob(obs))
+    reward_loss = -torch.mean(reward_pred.log_prob(reward))
+    prior_dist = model.dynamics.get_dist(prior[0], prior[1])
+    post_dist = model.dynamics.get_dist(post[0], post[1])
+    div = torch.mean(
+        torch.distributions.kl_divergence(post_dist, prior_dist).sum(dim=2))
+    div = torch.clamp(div, min=free_nats)
+    model_loss = kl_coeff * div + reward_loss + image_loss
+
+    # Actor Loss
+    # [imagine_horizon, batch_length*batch_size, feature_size]
+    with torch.no_grad():
+        actor_states = [v.detach() for v in post]
+    with FreezeParameters(model_weights):
+        imag_feat = model.imagine_ahead(actor_states, imagine_horizon)
+    with FreezeParameters(model_weights + critic_weights):
+        reward = model.reward(imag_feat).mean
+        value = model.value(imag_feat).mean
+    pcont = discount * torch.ones_like(reward)
+    returns = lambda_return(reward[:-1], value[:-1], pcont[:-1], value[-1],
+                            lambda_)
+    # The leading ones come from ones_like so they always share pcont's
+    # device and dtype; no separate CUDA-availability guess is needed.
+    discount = torch.cumprod(
+        torch.cat([torch.ones_like(pcont[:1]), pcont[:-2]], dim=0), dim=0)
+    actor_loss = -torch.mean(discount * returns)
+
+    # Critic Loss
+    with torch.no_grad():
+        val_feat = imag_feat.detach()[:-1]
+        target = returns.detach()
+        val_discount = discount.detach()
+    val_pred = model.value(val_feat)
+    critic_loss = -torch.mean(val_discount * val_pred.log_prob(target))
+
+    # Logging purposes
+    prior_ent = torch.mean(prior_dist.entropy())
+    post_ent = torch.mean(post_dist.entropy())
+
+    log_gif = None
+    if log:
+        log_gif = log_summary(obs, action, latent, image_pred, model)
+
+    return_dict = {
+        "model_loss": model_loss,
+        "reward_loss": reward_loss,
+        "image_loss": image_loss,
+        "divergence": div,
+        "actor_loss": actor_loss,
+        "critic_loss": critic_loss,
+        "prior_ent": prior_ent,
+        "post_ent": post_ent,
+    }
+
+    if log_gif is not None:
+        return_dict["log_gif"] = log_gif
+    return return_dict
+
+
+# Similar to GAE-Lambda, calculate value targets
+def lambda_return(reward, value, pcont, bootstrap, lambda_):
+    """Computes lambda-returns with a backward recursion over dim 0.
+
+    R[t] = reward[t] + pcont[t] * ((1 - lambda_) * V[t+1] + lambda_ * R[t+1])
+    where V[t+1] is value[t+1], and both V and R past the end are bootstrap.
+    """
+    next_values = torch.cat([value[1:], bootstrap[None]], dim=0)
+    one_step = reward + pcont * next_values * (1 - lambda_)
+
+    running = bootstrap
+    backward = []
+    for t in range(len(one_step) - 1, -1, -1):
+        running = one_step[t] + pcont[t] * lambda_ * running
+        backward.append(running)
+
+    return torch.stack(backward[::-1], dim=0)
+
+
+# Builds the gif frames: ground truth, model prediction and their error
+def log_summary(obs, action, embed, image_pred, model):
+    truth = obs[:6] + 0.5  # only the first 6 batch entries are rendered
+    recon = image_pred.mean[:6]
+    init, _ = model.dynamics.observe(embed[:6, :5], action[:6, :5])
+    init = [itm[:, -1] for itm in init]  # state after the 5 observed steps
+    prior = model.dynamics.imagine(action[:6, 5:], init)  # remaining steps
+    openl = model.decoder(model.dynamics.get_feature(prior)).mean
+    # Video = 5 reconstructed steps followed by the imagined (open-loop) ones.
+    mod = torch.cat([recon[:, :5] + 0.5, openl + 0.5], 1)
+    error = (mod - truth + 1.0) / 2.0
+    return torch.cat([truth, mod, error], 3)  # joined along dim 3
+
+
+def dreamer_loss(policy, model, dist_class, train_batch):
+    """Loss function: returns the (model, actor, critic) losses.
+
+    All stats from compute_dreamer_loss are kept on policy.stats_dict; a
+    "log_gif" key in train_batch switches gif generation on.
+    """
+    log_gif = "log_gif" in train_batch
+    stats = compute_dreamer_loss(
+        train_batch["obs"],
+        train_batch["actions"],
+        train_batch["rewards"],
+        policy.model,
+        policy.config["imagine_horizon"],
+        policy.config["discount"],
+        policy.config["lambda"],
+        policy.config["kl_coeff"],
+        policy.config["free_nats"],
+        log_gif,
+    )
+    policy.stats_dict = stats
+
+    return stats["model_loss"], stats["actor_loss"], stats["critic_loss"]
+
+
+def build_dreamer_model(policy, obs_space, action_space, config):
+    """Builds the model from config["dreamer_model"]; sets policy.model."""
+    policy.model = ModelCatalog.get_model_v2(
+        obs_space,
+        action_space,
+        1,  # num_outputs
+        config["dreamer_model"],
+        name="DreamerModel",
+        framework="torch")
+
+    policy.model_variables = policy.model.variables()
+
+    return policy.model
+
+
+def action_sampler_fn(policy, model, input_dict, state, explore, timestep):
+    """Action sampler function has two phases. During the prefill phase,
+    actions are sampled uniformly in [-1, 1]. During the training phase,
+    actions come from model.policy(); Gaussian noise is then added to
+    incentivize exploration and the result is clamped to [-1, 1].
+    """
+    obs = input_dict["obs"]
+
+    # Custom Exploration
+    if timestep <= policy.config["prefill_timesteps"]:
+        logp = [0.0]  # placeholder log-prob for the random action
+        # Random action in space [-1.0, 1.0]
+        action = 2.0 * torch.rand(1, model.action_space.shape[0]) - 1.0
+        state = model.get_initial_state()
+    else:
+        # Weird RLlib handling, this happens when the env resets
+        if len(state[0].size()) == 3:
+            # Very hacky, but works on all envs: restart from a fresh state
+            state = model.get_initial_state()
+        action, logp, state = model.policy(obs, state, explore)
+        action = td.Normal(action, policy.config["explore_noise"]).sample()
+        action = torch.clamp(action, min=-1.0, max=1.0)
+    # The global timestep advances by action_repeat per sampled action.
+    policy.global_timestep += policy.config["action_repeat"]
+
+    return action, logp, state
+
+
+def dreamer_stats(policy, train_batch):
+    return policy.stats_dict  # filled in by dreamer_loss()
+
+
+def dreamer_optimizer_fn(policy, config):
+    """Creates one Adam optimizer each for world model, actor and critic."""
+    model = policy.model
+
+    def params_of(*modules):
+        return [p for module in modules for p in module.parameters()]
+
+    world_model_params = params_of(model.encoder, model.decoder, model.reward,
+                                   model.dynamics)
+    model_opt = torch.optim.Adam(world_model_params, lr=config["td_model_lr"])
+    actor_opt = torch.optim.Adam(
+        params_of(model.actor), lr=config["actor_lr"])
+    critic_opt = torch.optim.Adam(
+        params_of(model.value), lr=config["critic_lr"])
+    return (model_opt, actor_opt, critic_opt)
+
+
+DreamerTorchPolicy = build_torch_policy(
+    name="DreamerTorchPolicy",
+    get_default_config=lambda: ray.rllib.agents.dreamer.dreamer.DEFAULT_CONFIG,
+    action_sampler_fn=action_sampler_fn,
+    loss_fn=dreamer_loss,  # returns (model, actor, critic) losses
+    stats_fn=dreamer_stats,
+    make_model=build_dreamer_model,
+    optimizer_fn=dreamer_optimizer_fn,  # (model, actor, critic) optimizers
+    extra_grad_process_fn=apply_grad_clipping)
diff --git a/rllib/agents/dreamer/utils.py b/rllib/agents/dreamer/utils.py
new file mode 100644
index 000000000..e86707ade
--- /dev/null
+++ b/rllib/agents/dreamer/utils.py
@@ -0,0 +1,94 @@
+from ray.rllib.utils.framework import try_import_torch
+import numpy as np
+
+torch, nn = try_import_torch()
+
+# Custom initialization for different types of layers
+if torch:
+
+    class Linear(nn.Linear):
+        """nn.Linear with Xavier-uniform weights and zero biases."""
+
+        def reset_parameters(self):
+            # Called from the parent constructor once the parameters exist.
+            nn.init.xavier_uniform_(self.weight)
+            if self.bias is not None:
+                nn.init.zeros_(self.bias)
+
+
+if torch:
+
+    class Conv2d(nn.Conv2d):
+        """nn.Conv2d with Xavier-uniform weights and zero biases."""
+
+        def reset_parameters(self):
+            # Called from the parent constructor once the parameters exist.
+            nn.init.xavier_uniform_(self.weight)
+            if self.bias is not None:
+                nn.init.zeros_(self.bias)
+
+
+if torch:
+
+    class ConvTranspose2d(nn.ConvTranspose2d):
+        """nn.ConvTranspose2d with Xavier-uniform weights and zero biases."""
+
+        def reset_parameters(self):
+            # Called from the parent constructor once the parameters exist.
+            nn.init.xavier_uniform_(self.weight)
+            if self.bias is not None:
+                nn.init.zeros_(self.bias)
+
+
+if torch:
+
+    class GRUCell(nn.GRUCell):
+        """nn.GRUCell with Xavier/orthogonal weights and zero biases."""
+
+        def reset_parameters(self):
+            nn.init.xavier_uniform_(self.weight_ih)
+            nn.init.orthogonal_(self.weight_hh)
+            for bias in (self.bias_ih, self.bias_hh):
+                if bias is not None:  # both are None when bias=False
+                    nn.init.zeros_(bias)
+
+
+# Custom Tanh Bijector due to big gradients through Dreamer Actor
+if torch:
+
+    class TanhBijector(torch.distributions.Transform):
+        """Tanh transform, y = tanh(x), with a softplus-form log-det."""
+
+        def atanh(self, x):
+            return 0.5 * torch.log((1 + x) / (1 - x))
+
+        def sign(self):
+            return 1.
+
+        def _call(self, x):
+            return torch.tanh(x)
+
+        def _inverse(self, y):
+            # Values inside [-1, 1] are clamped just off the boundary first.
+            in_range = torch.abs(y) <= 1.
+            clipped = torch.clamp(y, -0.99999997, 0.99999997)
+            return self.atanh(torch.where(in_range, clipped, y))
+
+        def log_abs_det_jacobian(self, x, y):
+            # Equals log(1 - tanh(x)^2), written with softplus.
+            return 2. * (np.log(2) - x - nn.functional.softplus(-2. * x))
+
+
+# Modified from https://github.com/juliusfrost/dreamer-pytorch
+class FreezeParameters:
+    def __init__(self, parameters):
+        self.parameters = list(parameters)  # a generator would run dry
+        self.param_states = [p.requires_grad for p in self.parameters]
+
+    def __enter__(self):
+        for param in self.parameters:
+            param.requires_grad = False  # frozen for the with-block
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        for param, state in zip(self.parameters, self.param_states):
+            param.requires_grad = state  # flag recorded in __init__
diff --git a/rllib/agents/registry.py b/rllib/agents/registry.py
index a1dbe2b2d..2f46106c7 100644
--- a/rllib/agents/registry.py
+++ b/rllib/agents/registry.py
@@ -105,6 +105,11 @@ def _import_mbmpo():
     return mbmpo.MBMPOTrainer
 
 
+def _import_dreamer():
+    from ray.rllib.agents import dreamer  # imported only when called
+    return dreamer.DREAMERTrainer
+
+
 ALGORITHMS = {
     "SAC": _import_sac,
     "DDPG": _import_ddpg,
@@ -126,6 +131,7 @@ ALGORITHMS = {
     "MARWIL": _import_marwil,
     "MAML": _import_maml,
     "MBMPO": _import_mbmpo,
+    "DREAMER": _import_dreamer,
 }
 
 
diff --git a/rllib/env/dm_control_wrapper.py b/rllib/env/dm_control_wrapper.py
index 212bd9e9b..6734e2a3a 100644
--- a/rllib/env/dm_control_wrapper.py
+++ b/rllib/env/dm_control_wrapper.py
@@ -73,19 +73,20 @@ class DMCEnv(core.Env):
                  task_kwargs=None,
                  visualize_reward=False,
                  from_pixels=False,
-                 height=84,
-                 width=84,
+                 height=64,
+                 width=64,
                  camera_id=0,
-                 frame_skip=1,
+                 frame_skip=2,
                  environment_kwargs=None,
-                 channels_first=False):
-        assert "random" in task_kwargs, "Seed for deterministic behaviour"
+                 channels_first=True,
+                 preprocess=True):
         self._from_pixels = from_pixels
         self._height = height
         self._width = width
         self._camera_id = camera_id
         self._frame_skip = frame_skip
         self._channels_first = channels_first
+        self.preprocess = preprocess
 
         if specs is None:
             raise RuntimeError((
@@ -120,6 +121,9 @@ class DMCEnv(core.Env):
                      width] if channels_first else [height, width, 3]
             self._observation_space = spaces.Box(
                 low=0, high=255, shape=shape, dtype=np.uint8)
+            if preprocess:
+                self._observation_space = spaces.Box(
+                    low=-0.5, high=0.5, shape=shape, dtype=np.float32)
         else:
             self._observation_space = _spec_to_box(
                 self._env.observation_spec().values())
@@ -128,9 +132,6 @@ class DMCEnv(core.Env):
 
         self.current_state = None
 
-        # set seed
-        self.seed(seed=task_kwargs.get("random", 1))
-
     def __getattr__(self, name):
         return getattr(self._env, name)
 
@@ -142,6 +143,8 @@ class DMCEnv(core.Env):
                 camera_id=self._camera_id)
             if self._channels_first:
                 obs = obs.transpose(2, 0, 1).copy()
+            if self.preprocess:
+                obs = obs / 255.0 - 0.5
         else:
             obs = _flatten_obs(time_step.observation)
         return obs
@@ -167,11 +170,6 @@ class DMCEnv(core.Env):
     def action_space(self):
         return self._norm_action_space
 
-    def seed(self, seed):
-        self._true_action_space.seed(seed)
-        self._norm_action_space.seed(seed)
-        self._observation_space.seed(seed)
-
     def step(self, action):
         assert self._norm_action_space.contains(action)
         action = self._convert_action(action)
diff --git a/rllib/examples/env/dm_control_suite.py b/rllib/examples/env/dm_control_suite.py
index abddc3eb9..165344794 100644
--- a/rllib/examples/env/dm_control_suite.py
+++ b/rllib/examples/env/dm_control_suite.py
@@ -1,85 +1,139 @@
 from ray.rllib.env.dm_control_wrapper import DMCEnv
-import numpy as np
 """
 8 Environments from Deepmind Control Suite
 """
 
 
-def acrobot_swingup():
+def acrobot_swingup(from_pixels=True,
+                    height=64,
+                    width=64,
+                    frame_skip=2,
+                    channels_first=True):
     return DMCEnv(
         "acrobot",
         "swingup",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def hopper_hop():
+def walker_walk(from_pixels=True,
+                height=64,
+                width=64,
+                frame_skip=2,
+                channels_first=True):
+    return DMCEnv(
+        "walker",
+        "walk",
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
+
+
+def hopper_hop(from_pixels=True,
+               height=64,
+               width=64,
+               frame_skip=2,
+               channels_first=True):
     return DMCEnv(
         "hopper",
         "hop",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def hopper_stand():
+def hopper_stand(from_pixels=True,
+                 height=64,
+                 width=64,
+                 frame_skip=2,
+                 channels_first=True):
     return DMCEnv(
         "hopper",
         "stand",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def cheetah_run():
+def cheetah_run(from_pixels=True,
+                height=64,
+                width=64,
+                frame_skip=2,
+                channels_first=True):
     return DMCEnv(
         "cheetah",
         "run",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def walker_run():
+def walker_run(from_pixels=True,
+               height=64,
+               width=64,
+               frame_skip=2,
+               channels_first=True):
     return DMCEnv(
         "walker",
         "run",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def pendulum_swingup():
+def pendulum_swingup(from_pixels=True,
+                     height=64,
+                     width=64,
+                     frame_skip=2,
+                     channels_first=True):
     return DMCEnv(
         "pendulum",
         "swingup",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def cartpole_swingup():
+def cartpole_swingup(from_pixels=True,
+                     height=64,
+                     width=64,
+                     frame_skip=2,
+                     channels_first=True):
     return DMCEnv(
         "cartpole",
         "swingup",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
 
 
-def humanoid_walk():
+def humanoid_walk(from_pixels=True,
+                  height=64,
+                  width=64,
+                  frame_skip=2,
+                  channels_first=True):
     return DMCEnv(
         "humanoid",
         "walk",
-        from_pixels=True,
-        height=64,
-        width=64,
-        task_kwargs={"random": np.random.randint(low=0, high=1e9)})
+        from_pixels=from_pixels,
+        height=height,
+        width=width,
+        frame_skip=frame_skip,
+        channels_first=channels_first)
diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py
index 1a908ec86..f5dd7be39 100644
--- a/rllib/policy/torch_policy.py
+++ b/rllib/policy/torch_policy.py
@@ -225,11 +225,12 @@ class TorchPolicy(Policy):
         """
         if self.action_sampler_fn:
             action_dist = dist_inputs = None
-            state_out = []
-            actions, logp = self.action_sampler_fn(
+            state_out = state_batches
+            actions, logp, state_out = self.action_sampler_fn(
                 self,
                 self.model,
-                input_dict[SampleBatch.CUR_OBS],
+                input_dict,
+                state_out,
                 explore=explore,
                 timestep=timestep)
         else:
@@ -363,6 +364,7 @@ class TorchPolicy(Policy):
 
         # Loop through all optimizers.
         grad_info = {"allreduce_latency": 0.0}
+
         for i, opt in enumerate(self._optimizers):
             # Erase gradients in all vars of this optimizer.
             opt.zero_grad()
@@ -394,7 +396,8 @@ class TorchPolicy(Policy):
 
                 grad_info["allreduce_latency"] += time.time() - start
 
-            # Step the optimizer.
+        # Step the optimizer
+        for i, opt in enumerate(self._optimizers):
             opt.step()
 
         grad_info["allreduce_latency"] /= len(self._optimizers)
diff --git a/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml b/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml
new file mode 100644
index 000000000..71d28e7fe
--- /dev/null
+++ b/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml
@@ -0,0 +1,26 @@
+dmc-dreamer:
+    run: DREAMER
+    env:
+        grid_search:
+            - ray.rllib.examples.env.dm_control_suite.walker_walk
+            - ray.rllib.examples.env.dm_control_suite.cheetah_run
+            - ray.rllib.examples.env.dm_control_suite.hopper_hop
+    stop:
+      timesteps_total: 1000000
+    config:
+        framework: torch
+        td_model_lr: 0.0006
+        actor_lr: 0.00008
+        critic_lr: 0.00008
+        discount: 0.99
+        lambda: 0.95
+        dreamer_train_iters: 100
+        horizon: 1000
+        batch_size: 50
+        batch_length: 50
+        imagine_horizon: 15
+        free_nats: 3.0
+        batch_mode: complete_episodes
+        num_gpus: 1
+        num_workers: 0
+        clip_actions: False