[rllib] Refactor rllib to have a common sample collection pathway (#2149)

2026-06-28 15:22:56 +08:00 · 2018-06-09 00:21:35 -07:00
parent cb5e6e6d68
commit 71eb558eb0
54 changed files with 1981 additions and 2192 deletions
@@ -6,6 +6,11 @@ from __future__ import print_function
 # This file is imported from the tune module in order to register RLlib agents.
 from ray.tune.registry import register_trainable

+from ray.rllib.utils.policy_graph import PolicyGraph
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
+from ray.rllib.optimizers.sample_batch import SampleBatch
+

 def _register_all():
    for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
@@ -16,3 +21,7 @@ def _register_all():


 _register_all()
+
+__all__ = [
+    "PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch"
+]
@@ -2,7 +2,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import numpy as np
 import pickle
 import os

@@ -10,14 +9,14 @@ import ray
 from ray.rllib.agent import Agent
 from ray.rllib.optimizers import AsyncOptimizer
 from ray.rllib.utils import FilterManager
-from ray.rllib.a3c.a3c_evaluator import A3CEvaluator, RemoteA3CEvaluator, \
-    GPURemoteA3CEvaluator
-from ray.tune.result import TrainingResult
+from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
+    collect_metrics
+from ray.rllib.a3c.common import get_policy_cls
 from ray.tune.trial import Resources

 DEFAULT_CONFIG = {
    # Number of workers (excluding master)
-    "num_workers": 4,
+    "num_workers": 2,
    # Size of rollout batch
    "batch_size": 10,
    # Use LSTM model - only applicable for image states
@@ -42,6 +41,8 @@ DEFAULT_CONFIG = {
    "entropy_coeff": -0.01,
    # Whether to place workers on GPUs
    "use_gpu_for_workers": False,
+    # Whether to emit extra summary stats
+    "summarize": False,
    # Model and preprocessor options
    "model": {
        # (Image statespace) - Converts image to Channels = 1
@@ -78,56 +79,48 @@ class A3CAgent(Agent):
            extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0)

    def _init(self):
-        self.local_evaluator = A3CEvaluator(
-            self.registry,
-            self.env_creator,
-            self.config,
-            self.logdir,
-            start_sampler=False)
-        if self.config["use_gpu_for_workers"]:
-            remote_cls = GPURemoteA3CEvaluator
+        self.policy_cls = get_policy_cls(self.config)
+
+        if self.config["use_pytorch"]:
+            session_creator = None
        else:
-            remote_cls = RemoteA3CEvaluator
+            import tensorflow as tf
+
+            def session_creator():
+                return tf.Session(
+                    config=tf.ConfigProto(
+                        intra_op_parallelism_threads=1,
+                        inter_op_parallelism_threads=1,
+                        gpu_options=tf.GPUOptions(allow_growth=True)))
+
+        remote_cls = CommonPolicyEvaluator.as_remote(
+            num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
+        self.local_evaluator = CommonPolicyEvaluator(
+            self.env_creator, self.policy_cls,
+            batch_steps=self.config["batch_size"],
+            batch_mode="truncate_episodes",
+            tf_session_creator=session_creator,
+            registry=self.registry, env_config=self.config["env_config"],
+            model_config=self.config["model"], policy_config=self.config)
        self.remote_evaluators = [
-            remote_cls.remote(self.registry, self.env_creator, self.config,
-                              self.logdir)
-            for i in range(self.config["num_workers"])
-        ]
-        self.optimizer = AsyncOptimizer(self.config["optimizer"],
-                                        self.local_evaluator,
-                                        self.remote_evaluators)
+            remote_cls.remote(
+                self.env_creator, self.policy_cls,
+                batch_steps=self.config["batch_size"],
+                batch_mode="truncate_episodes", sample_async=True,
+                tf_session_creator=session_creator,
+                registry=self.registry, env_config=self.config["env_config"],
+                model_config=self.config["model"], policy_config=self.config)
+            for i in range(self.config["num_workers"])]
+
+        self.optimizer = AsyncOptimizer(
+            self.config["optimizer"], self.local_evaluator,
+            self.remote_evaluators)

    def _train(self):
        self.optimizer.step()
-        FilterManager.synchronize(self.local_evaluator.filters,
-                                  self.remote_evaluators)
-        res = self._fetch_metrics_from_remote_evaluators()
-        return res
-
-    def _fetch_metrics_from_remote_evaluators(self):
-        episode_rewards = []
-        episode_lengths = []
-        metric_lists = [
-            a.get_completed_rollout_metrics.remote()
-            for a in self.remote_evaluators
-        ]
-        for metrics in metric_lists:
-            for episode in ray.get(metrics):
-                episode_lengths.append(episode.episode_length)
-                episode_rewards.append(episode.episode_reward)
-        avg_reward = (np.mean(episode_rewards)
-                      if episode_rewards else float('nan'))
-        avg_length = (np.mean(episode_lengths)
-                      if episode_lengths else float('nan'))
-        timesteps = np.sum(episode_lengths) if episode_lengths else 0
-
-        result = TrainingResult(
-            episode_reward_mean=avg_reward,
-            episode_len_mean=avg_length,
-            timesteps_this_iter=timesteps,
-            info={})
-
-        return result
+        FilterManager.synchronize(
+            self.local_evaluator.filters, self.remote_evaluators)
+        return collect_metrics(self.local_evaluator, self.remote_evaluators)

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
@@ -154,7 +147,10 @@ class A3CAgent(Agent):
        ])
        self.local_evaluator.restore(extra_data["local_state"])

-    def compute_action(self, observation):
+    def compute_action(self, observation, state=None):
+        if state is None:
+            state = []
        obs = self.local_evaluator.obs_filter(observation, update=False)
-        action, info = self.local_evaluator.policy.compute(obs)
-        return action
+        return self.local_evaluator.for_policy(
+            lambda p: p.compute_single_action(
+                obs, state, is_training=False)[0])
@@ -1,119 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pickle
-
-import ray
-from ray.rllib.models import ModelCatalog
-from ray.rllib.optimizers import PolicyEvaluator
-from ray.rllib.a3c.common import get_policy_cls
-from ray.rllib.utils.filter import get_filter
-from ray.rllib.utils.sampler import AsyncSampler
-from ray.rllib.utils.process_rollout import process_rollout
-
-
-class A3CEvaluator(PolicyEvaluator):
-    """Actor object to start running simulation on workers.
-
-    The gradient computation is also executed from this object.
-
-    Attributes:
-        policy: Copy of graph used for policy. Used by sampler and gradients.
-        obs_filter: Observation filter used in environment sampling
-        rew_filter: Reward filter used in rollout post-processing.
-        sampler: Component for interacting with environment and generating
-            rollouts.
-        logdir: Directory for logging.
-    """
-    def __init__(
-            self, registry, env_creator, config, logdir, start_sampler=True):
-        env = ModelCatalog.get_preprocessor_as_wrapper(
-            registry, env_creator(config["env_config"]), config["model"])
-        self.env = env
-        policy_cls = get_policy_cls(config)
-        # TODO(rliaw): should change this to be just env.observation_space
-        self.policy = policy_cls(
-            registry, env.observation_space.shape, env.action_space, config)
-        self.config = config
-
-        # Technically not needed when not remote
-        self.obs_filter = get_filter(
-            config["observation_filter"], env.observation_space.shape)
-        self.rew_filter = get_filter(config["reward_filter"], ())
-        self.filters = {"obs_filter": self.obs_filter,
-                        "rew_filter": self.rew_filter}
-        self.sampler = AsyncSampler(env, self.policy, self.obs_filter,
-                                    config["batch_size"])
-        if start_sampler and self.sampler._async:
-            self.sampler.start()
-        self.logdir = logdir
-
-    def sample(self):
-        rollout = self.sampler.get_data()
-        samples = process_rollout(
-            rollout, self.rew_filter, gamma=self.config["gamma"],
-            lambda_=self.config["lambda"], use_gae=True)
-        return samples
-
-    def get_completed_rollout_metrics(self):
-        """Returns metrics on previously completed rollouts.
-
-        Calling this clears the queue of completed rollout metrics.
-        """
-        return self.sampler.get_metrics()
-
-    def compute_gradients(self, samples):
-        gradient, info = self.policy.compute_gradients(samples)
-        return gradient, {}
-
-    def apply_gradients(self, grads):
-        self.policy.apply_gradients(grads)
-
-    def get_weights(self):
-        return self.policy.get_weights()
-
-    def set_weights(self, params):
-        self.policy.set_weights(params)
-
-    def save(self):
-        filters = self.get_filters(flush_after=True)
-        weights = self.get_weights()
-        return pickle.dumps({
-            "filters": filters,
-            "weights": weights})
-
-    def restore(self, objs):
-        objs = pickle.loads(objs)
-        self.sync_filters(objs["filters"])
-        self.set_weights(objs["weights"])
-
-    def sync_filters(self, new_filters):
-        """Changes self's filter to given and rebases any accumulated delta.
-
-        Args:
-            new_filters (dict): Filters with new state to update local copy.
-        """
-        assert all(k in new_filters for k in self.filters)
-        for k in self.filters:
-            self.filters[k].sync(new_filters[k])
-
-    def get_filters(self, flush_after=False):
-        """Returns a snapshot of filters.
-
-        Args:
-            flush_after (bool): Clears the filter buffer state.
-
-        Returns:
-            return_filters (dict): Dict for serializable filters
-        """
-        return_filters = {}
-        for k, f in self.filters.items():
-            return_filters[k] = f.as_serializable()
-            if flush_after:
-                f.clear_buffer()
-        return return_filters
-
-
-RemoteA3CEvaluator = ray.remote(A3CEvaluator)
-GPURemoteA3CEvaluator = ray.remote(num_gpus=1)(A3CEvaluator)
@@ -0,0 +1,103 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import gym
+
+from ray.rllib.utils.error import UnsupportedSpaceException
+from ray.rllib.utils.process_rollout import compute_advantages
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+
+
+class A3CTFPolicyGraph(TFPolicyGraph):
+    """Shared A3C TF policy graph; subclasses supply _setup_graph()."""
+
+    def __init__(self, ob_space, action_space, registry, config):
+        self.registry = registry
+        self.local_steps = 0
+        self.config = config
+        self.summarize = config.get("summarize")
+
+        self._setup_graph(ob_space, action_space)
+        assert all(hasattr(self, attr)  # _setup_graph must define these
+                   for attr in ["vf", "logits", "x", "var_list"])
+        print("Setting up loss")
+        self.setup_loss(action_space)
+        self.is_training = tf.placeholder_with_default(True, ())
+        self.sess = tf.get_default_session()  # must be active: used below
+
+        TFPolicyGraph.__init__(
+            self, self.sess, obs_input=self.x,
+            action_sampler=self.action_dist.sample(), loss=self.loss,
+            loss_inputs=self.loss_in, is_training=self.is_training,
+            state_inputs=self.state_in, state_outputs=self.state_out)
+
+        self.sess.run(tf.global_variables_initializer())
+
+        if self.summarize:
+            bs = tf.to_float(tf.shape(self.x)[0])
+            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
+            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
+            tf.summary.scalar("model/entropy", self.entropy / bs)
+            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
+            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
+            self.summary_op = tf.summary.merge_all()
+
+    def _setup_graph(self, ob_space, ac_space):
+        raise NotImplementedError  # subclasses build the model here
+
+    def setup_loss(self, action_space):
+        if isinstance(action_space, gym.spaces.Box):
+            ac_size = action_space.shape[0]
+            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
+        elif isinstance(action_space, gym.spaces.Discrete):
+            self.ac = tf.placeholder(tf.int64, [None], name="ac")
+        else:
+            raise UnsupportedSpaceException(
+                "Action space {} is not supported for A3C.".format(
+                    action_space))
+        self.adv = tf.placeholder(tf.float32, [None], name="adv")
+        self.r = tf.placeholder(tf.float32, [None], name="r")
+
+        log_prob = self.action_dist.logp(self.ac)
+
+        # The "policy gradients" loss: its derivative is precisely the policy
+        # gradient. Notice that self.ac is a placeholder that is provided
+        # externally. adv will contain the advantages, as calculated in
+        # compute_advantages.
+        self.pi_loss = - tf.reduce_sum(log_prob * self.adv)
+
+        delta = self.vf - self.r
+        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
+        self.entropy = tf.reduce_sum(self.action_dist.entropy())
+        self.loss = (self.pi_loss +
+                     self.vf_loss * self.config["vf_loss_coeff"] +
+                     self.entropy * self.config["entropy_coeff"])
+
+    def optimizer(self):
+        return tf.train.AdamOptimizer(self.config["lr"])
+
+    def gradients(self, optimizer):
+        grads = tf.gradients(self.loss, self.var_list)
+        self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
+        clipped_grads = list(zip(self.grads, self.var_list))
+        return clipped_grads
+
+    def extra_compute_grad_fetches(self):
+        if self.summarize:
+            return {"summary": self.summary_op}
+        else:
+            return {}
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+        completed = sample_batch["dones"][-1]
+        if completed:
+            last_r = 0.0
+        else:
+            next_state = [
+                [sample_batch["state_out_{}".format(i)][-1]]
+                for i in range(len(self.state_in))]
+            last_r = self.value(sample_batch["new_obs"][-1], *next_state)
+        return compute_advantages(
+            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
@@ -0,0 +1,113 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from threading import Lock
+
+import torch
+import torch.nn.functional as F
+
+from ray.rllib.models.pytorch.misc import var_to_np, convert_batch
+from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.utils.process_rollout import compute_advantages
+from ray.rllib.utils.policy_graph import PolicyGraph
+
+
+class SharedTorchPolicy(PolicyGraph):
+    """A simple, non-recurrent PyTorch policy example."""
+
+    def __init__(self, obs_space, action_space, registry, config):
+        self.registry = registry
+        self.local_steps = 0
+        self.config = config
+        self.summarize = config.get("summarize")
+        self.setup_graph(obs_space, action_space)
+        torch.set_num_threads(2)
+        self.lock = Lock()
+
+    def setup_graph(self, obs_space, action_space):
+        _, self.logit_dim = ModelCatalog.get_action_dist(action_space)
+        self._model = ModelCatalog.get_torch_model(
+            self.registry, obs_space.shape, self.logit_dim,
+            self.config["model"])
+        self.optimizer = torch.optim.Adam(
+            self._model.parameters(), lr=self.config["lr"])
+
+    def compute_single_action(self, obs, state, is_training=False):
+        assert not state, "RNN not supported"
+        with self.lock:
+            ob = torch.from_numpy(obs).float().unsqueeze(0)
+            logits, values = self._model(ob)
+            samples = F.softmax(logits, dim=1).multinomial(1).squeeze()
+            values = values.squeeze()
+            return var_to_np(samples), [], {"vf_preds": var_to_np(values)}
+
+    def compute_gradients(self, samples):
+        with self.lock:
+            self.backward(samples)
+            # Note that return values are just references;
+            # calling zero_grad will modify the values
+            return [p.grad.data.numpy() for p in self._model.parameters()], {}
+
+    def apply_gradients(self, grads):
+        self.optimizer.zero_grad()
+        for g, p in zip(grads, self._model.parameters()):
+            p.grad = torch.from_numpy(g)
+        self.optimizer.step()
+        return {}
+
+    def get_weights(self):
+        # !! This only returns references to the data.
+        return self._model.state_dict()
+
+    def set_weights(self, weights):
+        with self.lock:
+            self._model.load_state_dict(weights)
+
+    def value(self, obs):
+        with self.lock:
+            obs = torch.from_numpy(obs).float().unsqueeze(0)
+            res = self._model.hidden_layers(obs)
+            res = self._model.value_branch(res)
+            res = res.squeeze()
+            return var_to_np(res)
+
+    def forward(self, obs_batch, actions):
+        logits, values = self._model(obs_batch)
+        log_probs = F.log_softmax(logits, dim=1)
+        probs = F.softmax(logits, dim=1)
+        action_log_probs = log_probs.gather(1, actions.view(-1, 1))
+        entropy = -(log_probs * probs).sum(-1).sum()
+        return values, action_log_probs, entropy
+
+    def backward(self, sample_batch):
+        """Loss is encoded here.
+
+        Defining a new loss function would start by rewriting this function.
+        """
+
+        states, actions, advs, rs = convert_batch(sample_batch)
+        values, action_log_probs, entropy = self.forward(states, actions)
+        pi_err = -advs.dot(action_log_probs.reshape(-1))
+        value_err = F.mse_loss(values.reshape(-1), rs)
+
+        self.optimizer.zero_grad()
+
+        overall_err = sum([
+            pi_err,
+            self.config["vf_loss_coeff"] * value_err,
+            self.config["entropy_coeff"] * entropy,
+        ])
+
+        overall_err.backward()
+        torch.nn.utils.clip_grad_norm_(self._model.parameters(),
+                                       self.config["grad_clip"])
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+        completed = sample_batch["dones"][-1]
+        if completed:
+            last_r = 0.0
+        else:
+            last_r = self.value(sample_batch["new_obs"][-1])
+        return compute_advantages(
+            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
@@ -8,7 +8,7 @@ def get_policy_cls(config):
        from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
        policy_cls = SharedModelLSTM
    elif config["use_pytorch"]:
-        from ray.rllib.a3c.shared_torch_policy import SharedTorchPolicy
+        from ray.rllib.a3c.a3c_torch_policy import SharedTorchPolicy
        policy_cls = SharedTorchPolicy
    else:
        from ray.rllib.a3c.shared_model import SharedModel
@@ -1,28 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-class Policy(object):
-    """The policy base class."""
-    def __init__(self, ob_space, action_space, name="local", summarize=True):
-        pass
-
-    def apply_gradients(self, grads):
-        raise NotImplementedError
-
-    def get_weights(self):
-        raise NotImplementedError
-
-    def set_weights(self, weights):
-        raise NotImplementedError
-
-    def compute_gradients(self, samples):
-        raise NotImplementedError
-
-    def compute(self, observations):
-        """Compute action for a _single_ observation"""
-        raise NotImplementedError
-
-    def value(self, ob):
-        raise NotImplementedError
@@ -4,30 +4,27 @@ from __future__ import print_function

 import tensorflow as tf
 from ray.rllib.models.misc import linear, normc_initializer
-from ray.rllib.a3c.tfpolicy import TFPolicy
+from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
 from ray.rllib.models.catalog import ModelCatalog


-class SharedModel(TFPolicy):
+class SharedModel(A3CTFPolicyGraph):

-    other_output = ["vf_preds"]
-    is_recurrent = False
-
-    def __init__(self, registry, ob_space, ac_space, config, **kwargs):
+    def __init__(self, ob_space, ac_space, registry, config, **kwargs):
        super(SharedModel, self).__init__(
-            registry, ob_space, ac_space, config, **kwargs)
+            ob_space, ac_space, registry, config, **kwargs)

    def _setup_graph(self, ob_space, ac_space):
-        self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
+        self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = ModelCatalog.get_model(
            self.registry, self.x, self.logit_dim, self.config["model"])
        self.logits = self._model.outputs
-        self.curr_dist = dist_class(self.logits)
+        self.action_dist = dist_class(self.logits)
        self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
                                    normc_initializer(1.0)), [-1])

-        self.sample = self.curr_dist.sample()
+        self.sample = self.action_dist.sample()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        self.global_step = tf.get_variable(
@@ -35,28 +32,20 @@ class SharedModel(TFPolicy):
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

-    def compute_gradients(self, samples):
-        info = {}
-        feed_dict = {
-            self.x: samples["obs"],
-            self.ac: samples["actions"],
-            self.adv: samples["advantages"],
-            self.r: samples["value_targets"],
-        }
-        self.grads = [g for g in self.grads if g is not None]
-        self.local_steps += 1
-        if self.summarize:
-            grad, summ = self.sess.run([self.grads, self.summary_op],
-                                       feed_dict=feed_dict)
-            info['summary'] = summ
-        else:
-            grad = self.sess.run(self.grads, feed_dict=feed_dict)
-        return grad, info
+        self.state_in = []
+        self.state_out = []

-    def compute(self, ob, *args):
-        action, vf = self.sess.run([self.sample, self.vf],
-                                   {self.x: [ob]})
-        return action[0], {"vf_preds": vf[0]}
+    def setup_loss(self, action_space):
+        A3CTFPolicyGraph.setup_loss(self, action_space)
+        self.loss_in = [
+            ("obs", self.x),
+            ("actions", self.ac),
+            ("advantages", self.adv),
+            ("value_targets", self.r),
+        ]
+
+    def extra_compute_action_fetches(self):
+        return {"vf_preds": self.vf}

    def value(self, ob, *args):
        vf = self.sess.run(self.vf, {self.x: [ob]})
@@ -5,43 +5,32 @@ from __future__ import print_function
 import tensorflow as tf
 from ray.rllib.models.misc import linear, normc_initializer
 from ray.rllib.models.catalog import ModelCatalog
-from ray.rllib.a3c.tfpolicy import TFPolicy
+from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
 from ray.rllib.models.lstm import LSTM


-class SharedModelLSTM(TFPolicy):
-    """
-    Attributes:
-        other_output (list): Other than `action`, the other return values from
-            `compute_gradients`.
-        is_recurrent (bool): True if is a recurrent network (requires features
-            to be tracked).
-    """
+class SharedModelLSTM(A3CTFPolicyGraph):

-    other_output = ["vf_preds", "features"]
-    is_recurrent = True
-
-    def __init__(self, registry, ob_space, ac_space, config, **kwargs):
+    def __init__(self, ob_space, ac_space, registry, config, **kwargs):
        super(SharedModelLSTM, self).__init__(
-            registry, ob_space, ac_space, config, **kwargs)
+            ob_space, ac_space, registry, config, **kwargs)

    def _setup_graph(self, ob_space, ac_space):
-        self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
+        self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = LSTM(self.x, self.logit_dim, {})

-        self.state_init = self._model.state_init
        self.state_in = self._model.state_in
        self.state_out = self._model.state_out

        self.logits = self._model.outputs
-        self.curr_dist = dist_class(self.logits)
+        self.action_dist = dist_class(self.logits)
        # with tf.variable_scope("vf"):
        #     vf_model = ModelCatalog.get_model(self.x, 1)
        self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
                                    normc_initializer(1.0)), [-1])

-        self.sample = self.curr_dist.sample()
+        self.sample = self.action_dist.sample()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        self.global_step = tf.get_variable(
@@ -49,42 +38,25 @@ class SharedModelLSTM(TFPolicy):
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

-    def compute_gradients(self, samples):
-        """Computing the gradient is actually model-dependent.
+    def get_initial_state(self):
+        return self._model.state_init

-        The LSTM needs its hidden states in order to compute the gradient
-        accurately.
-        """
-        features = samples["features"][0]
-        feed_dict = {
-            self.x: samples["obs"],
-            self.ac: samples["actions"],
-            self.adv: samples["advantages"],
-            self.r: samples["value_targets"],
-            self.state_in[0]: features[0],
-            self.state_in[1]: features[1]
-        }
-        info = {}
-        self.local_steps += 1
-        if self.summarize and self.local_steps % 10 == 0:
-            grad, summ = self.sess.run([self.grads, self.summary_op],
-                                       feed_dict=feed_dict)
-            info['summary'] = summ
-        else:
-            grad = self.sess.run(self.grads, feed_dict=feed_dict)
-        return grad, info
+    def setup_loss(self, action_space):
+        A3CTFPolicyGraph.setup_loss(self, action_space)
+        self.loss_in = [
+            ("obs", self.x),
+            ("actions", self.ac),
+            ("advantages", self.adv),
+            ("value_targets", self.r),
+            ("state_in_0", self.state_in[0]),
+            ("state_in_1", self.state_in[1]),
+        ]

-    def compute(self, ob, c, h):
-        action, vf, c, h = self.sess.run(
-            [self.sample, self.vf] + self.state_out,
-            {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})
-        return action[0], {"vf_preds": vf[0], "features": (c, h)}
+    def extra_compute_action_fetches(self):
+        return {"vf_preds": self.vf}

    def value(self, ob, c, h):
        vf = self.sess.run(self.vf, {self.x: [ob],
                                     self.state_in[0]: c,
                                     self.state_in[1]: h})
        return vf[0]
-
-    def get_initial_features(self):
-        return self.state_init
@@ -1,106 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import ray
-import gym
-from ray.rllib.a3c.policy import Policy
-
-
-class TFPolicy(Policy):
-    """The policy base class."""
-    def __init__(self, registry, ob_space, action_space, config,
-                 name="local", summarize=True):
-        self.registry = registry
-        self.local_steps = 0
-        self.config = config
-        self.summarize = summarize
-        worker_device = "/job:localhost/replica:0/task:0/cpu:0"
-        self.g = tf.Graph()
-        with self.g.as_default(), tf.device(worker_device):
-            with tf.variable_scope(name):
-                self._setup_graph(ob_space, action_space)
-                assert all(hasattr(self, attr)
-                           for attr in ["vf", "logits", "x", "var_list"])
-            print("Setting up loss")
-            self.setup_loss(action_space)
-            self.setup_gradients()
-            self.initialize()
-
-    def _setup_graph(self, ob_space, ac_space):
-        raise NotImplementedError
-
-    def setup_loss(self, action_space):
-        if isinstance(action_space, gym.spaces.Box):
-            ac_size = action_space.shape[0]
-            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
-        elif isinstance(action_space, gym.spaces.Discrete):
-            self.ac = tf.placeholder(tf.int64, [None], name="ac")
-        else:
-            raise NotImplementedError(
-                "action space" + str(type(action_space)) +
-                "currently not supported")
-        self.adv = tf.placeholder(tf.float32, [None], name="adv")
-        self.r = tf.placeholder(tf.float32, [None], name="r")
-
-        log_prob = self.curr_dist.logp(self.ac)
-
-        # The "policy gradients" loss: its derivative is precisely the policy
-        # gradient. Notice that self.ac is a placeholder that is provided
-        # externally. adv will contain the advantages, as calculated in
-        # process_rollout.
-        self.pi_loss = - tf.reduce_sum(log_prob * self.adv)
-
-        delta = self.vf - self.r
-        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
-        self.entropy = tf.reduce_sum(self.curr_dist.entropy())
-        self.loss = (self.pi_loss +
-                     self.vf_loss * self.config["vf_loss_coeff"] +
-                     self.entropy * self.config["entropy_coeff"])
-
-    def setup_gradients(self):
-        grads = tf.gradients(self.loss, self.var_list)
-        self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
-        grads_and_vars = list(zip(self.grads, self.var_list))
-        opt = tf.train.AdamOptimizer(self.config["lr"])
-        self._apply_gradients = opt.apply_gradients(grads_and_vars)
-
-    def initialize(self):
-        if self.summarize:
-            bs = tf.to_float(tf.shape(self.x)[0])
-            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
-            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
-            tf.summary.scalar("model/entropy", self.entropy / bs)
-            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
-            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
-            self.summary_op = tf.summary.merge_all()
-
-        # TODO(rliaw): Can consider exposing these parameters
-        self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
-            intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
-            gpu_options=tf.GPUOptions(allow_growth=True)))
-        self.variables = ray.experimental.TensorFlowVariables(self.loss,
-                                                              self.sess)
-        self.sess.run(tf.global_variables_initializer())
-
-    def apply_gradients(self, grads):
-        feed_dict = {self.grads[i]: grads[i]
-                     for i in range(len(grads))}
-        self.sess.run(self._apply_gradients, feed_dict=feed_dict)
-
-    def get_weights(self):
-        weights = self.variables.get_weights()
-        return weights
-
-    def set_weights(self, weights):
-        self.variables.set_weights(weights)
-
-    def compute_gradients(self, samples):
-        raise NotImplementedError
-
-    def compute(self, observation):
-        raise NotImplementedError
-
-    def value(self, ob):
-        raise NotImplementedError
@@ -61,7 +61,7 @@ class Agent(Trainable):
    """

    _allow_unknown_configs = False
-    _allow_unknown_subkeys = []
+    _allow_unknown_subkeys = ["env_config", "model", "optimizer"]

    @classmethod
    def resource_help(cls, config):
@@ -17,8 +17,7 @@ class BCEvaluator(PolicyEvaluator):
        env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator(
            config["env_config"]), config["model"])
        self.dataset = ExperienceDataset(config["dataset_path"])
-        # TODO(rliaw): should change this to be just env.observation_space
-        self.policy = BCPolicy(registry, env.observation_space.shape,
+        self.policy = BCPolicy(registry, env.observation_space,
                               env.action_space, config)
        self.config = config
        self.logdir = logdir
@@ -6,30 +6,22 @@ import tensorflow as tf
 import gym

 import ray
-from ray.rllib.a3c.policy import Policy
 from ray.rllib.models.catalog import ModelCatalog


-class BCPolicy(Policy):
-    def __init__(self, registry, ob_space, action_space, config, name="local",
-                 summarize=True):
-        super(BCPolicy, self).__init__(ob_space, action_space, name, summarize)
+class BCPolicy(object):
+    def __init__(self, registry, obs_space, action_space, config):
        self.registry = registry
        self.local_steps = 0
        self.config = config
-        self.summarize = summarize
-        worker_device = "/job:localhost/replica:0/task:0/cpu:0"
-        self.g = tf.Graph()
-        with self.g.as_default(), tf.device(worker_device):
-            with tf.variable_scope(name):
-                self._setup_graph(ob_space, action_space)
-            print("Setting up loss")
-            self.setup_loss(action_space)
-            self.setup_gradients()
-            self.initialize()
+        self.summarize = config.get("summarize")
+        self._setup_graph(obs_space, action_space)
+        self.setup_loss(action_space)
+        self.setup_gradients()
+        self.initialize()

-    def _setup_graph(self, ob_space, ac_space):
-        self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
+    def _setup_graph(self, obs_space, ac_space):
+        self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = ModelCatalog.get_model(
            self.registry, self.x, self.logit_dim, self.config["model"])
@@ -8,25 +8,25 @@ from ray.utils import merge_dicts
 APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,
    {
-        'optimizer_class': 'ApexOptimizer',
-        'optimizer_config':
+        "optimizer_class": "ApexOptimizer",
+        "optimizer_config":
            merge_dicts(
-                DDPG_CONFIG['optimizer_config'], {
-                    'max_weight_sync_delay': 400,
-                    'num_replay_buffer_shards': 4,
-                    'debug': False
+                DDPG_CONFIG["optimizer_config"], {
+                    "max_weight_sync_delay": 400,
+                    "num_replay_buffer_shards": 4,
+                    "debug": False
                }),
-        'n_step': 3,
-        'num_workers': 32,
-        'buffer_size': 2000000,
-        'learning_starts': 50000,
-        'train_batch_size': 512,
-        'sample_batch_size': 50,
-        'max_weight_sync_delay': 400,
-        'target_network_update_freq': 500000,
-        'timesteps_per_iteration': 25000,
-        'per_worker_exploration': True,
-        'worker_side_prioritization': True,
+        "n_step": 3,
+        "num_workers": 32,
+        "buffer_size": 2000000,
+        "learning_starts": 50000,
+        "train_batch_size": 512,
+        "sample_batch_size": 50,
+        "max_weight_sync_delay": 400,
+        "target_network_update_freq": 500000,
+        "timesteps_per_iteration": 25000,
+        "per_worker_exploration": True,
+        "worker_side_prioritization": True,
    },
 )

@@ -2,17 +2,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import pickle
-import os
-
-import numpy as np
-import tensorflow as tf
-
-import ray
-from ray.rllib import optimizers
-from ray.rllib.ddpg.ddpg_evaluator import DDPGEvaluator
-from ray.rllib.agent import Agent
-from ray.tune.result import TrainingResult
+from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
+from ray.rllib.dqn.dqn import DQNAgent
+from ray.rllib.ddpg.ddpg_policy_graph import DDPGPolicyGraph

 OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
@@ -23,247 +15,120 @@ OPTIMIZER_SHARED_CONFIGS = [
 DEFAULT_CONFIG = {
    # === Model ===
    # Hidden layer sizes of the policy networks
-    'actor_hiddens': [64, 64],
+    "actor_hiddens": [64, 64],
    # Hidden layer sizes of the policy networks
-    'critic_hiddens': [64, 64],
+    "critic_hiddens": [64, 64],
    # N-step Q learning
-    'n_step': 1,
+    "n_step": 1,
    # Config options to pass to the model constructor
-    'model': {},
+    "model": {},
    # Discount factor for the MDP
-    'gamma': 0.99,
+    "gamma": 0.99,
    # Arguments to pass to the env creator
-    'env_config': {},
+    "env_config": {},

    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_fraction over this number of timesteps scaled by
    # exploration_fraction
-    'schedule_max_timesteps': 100000,
+    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
-    'timesteps_per_iteration': 1000,
+    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
-    'exploration_fraction': 0.1,
+    "exploration_fraction": 0.1,
    # Final value of random action probability
-    'exploration_final_eps': 0.02,
+    "exploration_final_eps": 0.02,
    # OU-noise scale
-    'noise_scale': 0.1,
+    "noise_scale": 0.1,
    # theta
-    'exploration_theta': 0.15,
+    "exploration_theta": 0.15,
    # sigma
-    'exploration_sigma': 0.2,
+    "exploration_sigma": 0.2,
    # Update the target network every `target_network_update_freq` steps.
-    'target_network_update_freq': 0,
+    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
-    'tau': 0.002,
-    # Whether to start with random actions instead of noops.
-    'random_starts': True,
+    "tau": 0.002,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
-    'buffer_size': 50000,
+    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
-    'prioritized_replay': True,
+    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
-    'prioritized_replay_alpha': 0.6,
+    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
-    'prioritized_replay_beta': 0.4,
+    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
-    'prioritized_replay_eps': 1e-6,
+    "prioritized_replay_eps": 1e-6,
    # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
-    'clip_rewards': True,
+    "clip_rewards": True,

    # === Optimization ===
    # Learning rate for adam optimizer
-    'actor_lr': 1e-4,
-    'critic_lr': 1e-3,
+    "actor_lr": 1e-4,
+    "critic_lr": 1e-3,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
-    'use_huber': False,
+    "use_huber": False,
    # Threshold of a huber loss
-    'huber_threshold': 1.0,
+    "huber_threshold": 1.0,
    # Weights for L2 regularization
-    'l2_reg': 1e-6,
+    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
-    'grad_norm_clipping': None,
+    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
-    'learning_starts': 1500,
+    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
-    'sample_batch_size': 1,
+    "sample_batch_size": 1,
    # Size of a batched sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
-    'train_batch_size': 256,
-    # Smooth the current average reward over this many previous episodes.
-    'smoothing_num_episodes': 100,
-
-    # === Tensorflow ===
-    # Arguments to pass to tensorflow
-    'tf_session_args': {
-        "device_count": {
-            "CPU": 2
-        },
-        "log_device_placement": False,
-        "allow_soft_placement": True,
-        "gpu_options": {
-            "allow_growth": True
-        },
-        "inter_op_parallelism_threads": 1,
-        "intra_op_parallelism_threads": 1,
-    },
+    "train_batch_size": 256,

    # === Parallelism ===
+    # Whether to use a GPU for local optimization.
+    "gpu": False,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    'num_workers': 0,
+    # you're using the Async or Ape-X optimizers.
+    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
-    'num_gpus_per_worker': 0,
+    "num_gpus_per_worker": 0,
+    # Number of CPUs to allocate per worker.
+    "num_cpus_per_worker": 1,
    # Optimizer class to use.
-    'optimizer_class': "LocalSyncReplayOptimizer",
+    "optimizer_class": "LocalSyncReplayOptimizer",
    # Config to pass to the optimizer.
-    'optimizer_config': {},
+    "optimizer_config": {},
    # Whether to use a distribution of epsilons across workers for exploration.
-    'per_worker_exploration': False,
+    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
-    'worker_side_prioritization': False
+    "worker_side_prioritization": False
 }


-class DDPGAgent(Agent):
+class DDPGAgent(DQNAgent):
    _agent_name = "DDPG"
    _allow_unknown_subkeys = [
-        "model", "optimizer", "tf_session_args", "env_config"
-    ]
+        "model", "optimizer", "tf_session_args", "env_config"]
    _default_config = DEFAULT_CONFIG
+    _policy_graph = DDPGPolicyGraph

-    def _init(self):
-        self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator,
-                                             self.config, self.logdir, 0)
-        remote_cls = ray.remote(
-            num_cpus=1,
-            num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator)
-        self.remote_evaluators = [
-            remote_cls.remote(self.registry, self.env_creator, self.config,
-                              self.logdir, i)
-            for i in range(self.config["num_workers"])
-        ]
-
-        for k in OPTIMIZER_SHARED_CONFIGS:
-            if k not in self.config["optimizer_config"]:
-                self.config["optimizer_config"][k] = self.config[k]
-
-        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
-            self.config["optimizer_config"], self.local_evaluator,
-            self.remote_evaluators)
-
-        self.saver = tf.train.Saver(max_to_keep=None)
-        self.last_target_update_ts = 0
-        self.num_target_updates = 0
-
-    @property
-    def global_timestep(self):
-        return self.optimizer.num_steps_sampled
-
-    def update_target_if_needed(self):
-        if self.global_timestep - self.last_target_update_ts > \
-                self.config["target_network_update_freq"]:
-            self.local_evaluator.update_target()
-            self.last_target_update_ts = self.global_timestep
-            self.num_target_updates += 1
-
-    def _train(self):
-        start_timestep = self.global_timestep
-
-        while (self.global_timestep - start_timestep <
-               self.config["timesteps_per_iteration"]):
-
-            self.optimizer.step()
-            self.update_target_if_needed()
-
-        self.local_evaluator.set_global_timestep(self.global_timestep)
-        for e in self.remote_evaluators:
-            e.set_global_timestep.remote(self.global_timestep)
-
-        return self._train_stats(start_timestep)
-
-    def _train_stats(self, start_timestep):
-        if self.remote_evaluators:
-            stats = ray.get([e.stats.remote() for e in self.remote_evaluators])
-        else:
-            stats = self.local_evaluator.stats()
-            if not isinstance(stats, list):
-                stats = [stats]
-
-        mean_100ep_reward = 0.0
-        mean_100ep_length = 0.0
-        num_episodes = 0
-        explorations = []
-
+    def _make_exploration_schedule(self, worker_index):
+        # Override DQN's schedule to take into account `noise_scale`
        if self.config["per_worker_exploration"]:
-            # Return stats from workers with the lowest 20% of exploration
-            test_stats = stats[-int(max(1, len(stats) * 0.2)):]
+            assert self.config["num_workers"] > 1, \
+                "This requires multiple workers"
+            return ConstantSchedule(
+                self.config["noise_scale"] * 0.4 **
+                (1 + worker_index / float(self.config["num_workers"] - 1) * 7))
        else:
-            test_stats = stats
-
-        for s in test_stats:
-            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
-            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
-
-        for s in stats:
-            num_episodes += s["num_episodes"]
-            explorations.append(s["exploration"])
-
-        opt_stats = self.optimizer.stats()
-
-        result = TrainingResult(
-            episode_reward_mean=mean_100ep_reward,
-            episode_len_mean=mean_100ep_length,
-            episodes_total=num_episodes,
-            timesteps_this_iter=self.global_timestep - start_timestep,
-            info=dict({
-                "min_exploration": min(explorations),
-                "max_exploration": max(explorations),
-                "num_target_updates": self.num_target_updates,
-            }, **opt_stats))
-
-        return result
-
-    def _stop(self):
-        # workaround for https://github.com/ray-project/ray/issues/1516
-        for ev in self.remote_evaluators:
-            ev.__ray_terminate__.remote()
-
-    def _save(self, checkpoint_dir):
-        checkpoint_path = self.saver.save(
-            self.local_evaluator.sess,
-            os.path.join(checkpoint_dir, "checkpoint"),
-            global_step=self.iteration)
-        extra_data = [
-            self.local_evaluator.save(),
-            ray.get([e.save.remote() for e in self.remote_evaluators]),
-            self.optimizer.save(), self.num_target_updates,
-            self.last_target_update_ts
-        ]
-        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
-        return checkpoint_path
-
-    def _restore(self, checkpoint_path):
-        self.saver.restore(self.local_evaluator.sess, checkpoint_path)
-        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
-        self.local_evaluator.restore(extra_data[0])
-        ray.get([
-            e.restore.remote(d)
-            for (d, e) in zip(extra_data[1], self.remote_evaluators)
-        ])
-        self.optimizer.restore(extra_data[2])
-        self.num_target_updates = extra_data[3]
-        self.last_target_update_ts = extra_data[4]
-
-    def compute_action(self, observation):
-        return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess,
-                                                   np.array(observation)[None],
-                                                   0.0)[0]
+            return LinearSchedule(
+                schedule_timesteps=int(self.config["exploration_fraction"] *
+                                       self.config["schedule_max_timesteps"]),
+                initial_p=self.config["noise_scale"] * 1.0,
+                final_p=self.config["noise_scale"] *
+                self.config["exploration_final_eps"])
@@ -1,186 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from gym.spaces import Box
-import numpy as np
-import tensorflow as tf
-
-import ray
-from ray.rllib.utils.error import UnsupportedSpaceException
-from ray.rllib.ddpg import models
-from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
-from ray.rllib.optimizers import SampleBatch, PolicyEvaluator
-from ray.rllib.utils.compression import pack
-from ray.rllib.dqn.dqn_evaluator import adjust_nstep
-from ray.rllib.dqn.common.wrappers import wrap_dqn
-
-
-class DDPGEvaluator(PolicyEvaluator):
-    """The base DDPG Evaluator."""
-
-    def __init__(self, registry, env_creator, config, logdir, worker_index):
-        env = env_creator(config["env_config"])
-        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
-        self.env = env
-        self.config = config
-
-        # when env.action_space is of Box type, e.g., Pendulum-v0
-        # action_space.low is [-2.0], high is [2.0]
-        # take action by calling, e.g., env.step([3.5])
-        if not isinstance(env.action_space, Box):
-            raise UnsupportedSpaceException(
-                "Action space {} is not supported for DDPG.".format(
-                    env.action_space))
-
-        tf_config = tf.ConfigProto(**config["tf_session_args"])
-        self.sess = tf.Session(config=tf_config)
-        self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)
-
-        # Use either a different `eps` per worker, or a linear schedule.
-        if config["per_worker_exploration"]:
-            assert config["num_workers"] > 1, "This requires multiple workers"
-            self.exploration = ConstantSchedule(
-                config["noise_scale"] * 0.4 **
-                (1 + worker_index / float(config["num_workers"] - 1) * 7))
-        else:
-            self.exploration = LinearSchedule(
-                schedule_timesteps=int(config["exploration_fraction"] *
-                                       config["schedule_max_timesteps"]),
-                initial_p=config["noise_scale"] * 1.0,
-                final_p=config["noise_scale"] *
-                config["exploration_final_eps"])
-
-        # Initialize the parameters and copy them to the target network.
-        self.sess.run(tf.global_variables_initializer())
-        # hard instead of soft
-        self.ddpg_graph.update_target(self.sess, 1.0)
-        self.global_timestep = 0
-        self.local_timestep = 0
-
-        # Note that this encompasses both the policy and Q-value networks and
-        # their corresponding target networks
-        self.variables = ray.experimental.TensorFlowVariables(
-            tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess)
-
-        self.episode_rewards = [0.0]
-        self.episode_lengths = [0.0]
-        self.saved_mean_reward = None
-
-        self.obs = self.env.reset()
-
-    def set_global_timestep(self, global_timestep):
-        self.global_timestep = global_timestep
-
-    def update_target(self):
-        self.ddpg_graph.update_target(self.sess)
-
-    def sample(self):
-        obs, actions, rewards, new_obs, dones = [], [], [], [], []
-        for _ in range(
-                self.config["sample_batch_size"] + self.config["n_step"] - 1):
-            ob, act, rew, ob1, done = self._step(self.global_timestep)
-            obs.append(ob)
-            actions.append(act)
-            rewards.append(rew)
-            new_obs.append(ob1)
-            dones.append(done)
-
-        # N-step Q adjustments
-        if self.config["n_step"] > 1:
-            # Adjust for steps lost from truncation
-            self.local_timestep -= (self.config["n_step"] - 1)
-            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
-                         actions, rewards, new_obs, dones)
-
-        batch = SampleBatch({
-            "obs": [pack(np.array(o)) for o in obs],
-            "actions": actions,
-            "rewards": rewards,
-            "new_obs": [pack(np.array(o)) for o in new_obs],
-            "dones": dones,
-            "weights": np.ones_like(rewards)
-        })
-        assert (batch.count == self.config["sample_batch_size"])
-
-        # Prioritize on the worker side
-        if self.config["worker_side_prioritization"]:
-            td_errors = self.ddpg_graph.compute_td_error(
-                self.sess, obs, batch["actions"], batch["rewards"], new_obs,
-                batch["dones"], batch["weights"])
-            new_priorities = (
-                np.abs(td_errors) + self.config["prioritized_replay_eps"])
-            batch.data["weights"] = new_priorities
-
-        return batch
-
-    def compute_gradients(self, samples):
-        td_err, grads = self.ddpg_graph.compute_gradients(
-            self.sess, samples["obs"], samples["actions"], samples["rewards"],
-            samples["new_obs"], samples["dones"], samples["weights"])
-        return grads, {"td_error": td_err}
-
-    def apply_gradients(self, grads):
-        self.ddpg_graph.apply_gradients(self.sess, grads)
-
-    def compute_apply(self, samples):
-        td_error = self.ddpg_graph.compute_apply(
-            self.sess, samples["obs"], samples["actions"], samples["rewards"],
-            samples["new_obs"], samples["dones"], samples["weights"])
-        return {"td_error": td_error}
-
-    def get_weights(self):
-        return self.variables.get_weights()
-
-    def set_weights(self, weights):
-        self.variables.set_weights(weights)
-
-    def _step(self, global_timestep):
-        """Takes a single step, and returns the result of the step."""
-        action = self.ddpg_graph.act(
-            self.sess,
-            np.array(self.obs)[None],
-            self.exploration.value(global_timestep))[0]
-        new_obs, rew, done, _ = self.env.step(action)
-        ret = (self.obs, action, rew, new_obs, float(done))
-        self.obs = new_obs
-        self.episode_rewards[-1] += rew
-        self.episode_lengths[-1] += 1
-        if done:
-            self.obs = self.env.reset()
-            self.episode_rewards.append(0.0)
-            self.episode_lengths.append(0.0)
-            # reset UO noise for each episode
-            self.ddpg_graph.reset_noise(self.sess)
-
-        self.local_timestep += 1
-        return ret
-
-    def stats(self):
-        n = self.config["smoothing_num_episodes"] + 1
-        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
-        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
-        exploration = self.exploration.value(self.global_timestep)
-        return {
-            "mean_100ep_reward": mean_100ep_reward,
-            "mean_100ep_length": mean_100ep_length,
-            "num_episodes": len(self.episode_rewards),
-            "exploration": exploration,
-            "local_timestep": self.local_timestep,
-        }
-
-    def save(self):
-        return [
-            self.exploration, self.episode_rewards, self.episode_lengths,
-            self.saved_mean_reward, self.obs, self.global_timestep,
-            self.local_timestep
-        ]
-
-    def restore(self, data):
-        self.exploration = data[0]
-        self.episode_rewards = data[1]
-        self.episode_lengths = data[2]
-        self.saved_mean_reward = data[3]
-        self.obs = data[4]
-        self.global_timestep = data[5]
-        self.local_timestep = data[6]
@@ -0,0 +1,327 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from gym.spaces import Box
+import numpy as np
+import tensorflow as tf
+import tensorflow.contrib.layers as layers
+
+import ray
+from ray.rllib.dqn.dqn_policy_graph import _huber_loss, _minimize_and_clip, \
+    _scope_vars, _postprocess_dqn
+from ray.rllib.models import ModelCatalog
+from ray.rllib.utils.error import UnsupportedSpaceException
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+
+
+A_SCOPE = "a_func"
+P_SCOPE = "p_func"
+P_TARGET_SCOPE = "target_p_func"
+Q_SCOPE = "q_func"
+Q_TARGET_SCOPE = "target_q_func"
+
+
+def _build_p_network(registry, inputs, dim_actions, config):
+    """Build the actor (policy) network.
+
+    Maps an observation (i.e., state) to an action vector in which each
+    entry takes a value in (0, 1), because the output layer uses a sigmoid
+    activation.
+
+    Args:
+        registry: Passed through unchanged to ModelCatalog.get_model.
+        inputs: Observation tensor fed to the model frontend.
+        dim_actions: Number of units in the final (sigmoid) layer.
+        config: Dict; the "model" and "actor_hiddens" keys are read here.
+
+    Returns:
+        The output tensor of the final sigmoid layer, of shape
+        [batch_size, dim_actions].
+    """
+    # Only frontend.last_layer is consumed below; the frontend's own output
+    # head (requested with size 1, presumably num_outputs -- TODO confirm)
+    # is not read in this function.
+    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
+
+    hiddens = config["actor_hiddens"]
+    action_out = frontend.last_layer
+    # Stack one ReLU fully-connected layer per entry of "actor_hiddens".
+    for hidden in hiddens:
+        action_out = layers.fully_connected(
+            action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
+    # Use sigmoid layer to bound values within (0, 1)
+    # shape of action_scores is [batch_size, dim_actions]
+    action_scores = layers.fully_connected(
+        action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
+
+    return action_scores
+
+
+# Acts as a stochastic policy for inference but as a deterministic policy for
+# training, so the batch-size issue is ignored when constructing a stochastic
+# action (a single noise vector is shared by every row of the batch).
+def _build_action_network(p_values, low_action, high_action, stochastic, eps,
+                          theta, sigma):
+    """Rescale actor outputs to the action range, optionally adding noise.
+
+    Args:
+        p_values: Actor output tensor; it is scaled linearly from the unit
+            interval onto [low_action, high_action].
+        low_action: Lower action bounds; `.size` is read from it, so it is
+            presumably a numpy array -- TODO confirm against caller.
+        high_action: Upper action bounds, combined elementwise with
+            low_action.
+        stochastic: Boolean predicate passed to tf.cond to select between
+            the noisy and the noise-free actions.
+        eps: Multiplier applied to the exploration noise.
+        theta: Coefficient of the pull-towards-zero term in the noise update.
+        sigma: Coefficient of the Gaussian sample in the noise update.
+
+    Returns:
+        The tf.cond tensor yielding `stochastic_actions` when `stochastic`
+        is true and `deterministic_actions` otherwise.
+    """
+    # shape is [None, dim_action]
+    deterministic_actions = (high_action - low_action) * p_values + low_action
+
+    # Persistent, non-trainable noise state with one entry per action
+    # dimension, initialised to zeros. It is obtained via tf.get_variable, so
+    # the enclosing variable scope decides whether it is created or reused.
+    exploration_sample = tf.get_variable(
+        name="ornstein_uhlenbeck",
+        dtype=tf.float32,
+        initializer=low_action.size * [.0],
+        trainable=False)
+    normal_sample = tf.random_normal(
+        shape=[low_action.size], mean=0.0, stddev=1.0)
+    # In-place update of the noise state:
+    #   x <- x + theta * (0 - x) + sigma * N(0, 1)
+    exploration_value = tf.assign_add(
+        exploration_sample,
+        theta * (.0 - exploration_sample) + sigma * normal_sample)
+    # Noise is scaled by eps and by the width of the action range.
+    stochastic_actions = deterministic_actions + eps * (
+        high_action - low_action) * exploration_value
+
+    # NOTE(review): stochastic_actions (and the assign_add it depends on) is
+    # built outside the tf.cond branch functions -- confirm whether the noise
+    # state is meant to advance when `stochastic` is False.
+    return tf.cond(stochastic, lambda: stochastic_actions,
+                   lambda: deterministic_actions)
+
+
+def _build_q_network(registry, inputs, action_inputs, config):
+    """Build the critic (Q) network.
+
+    Args:
+        registry: Passed through unchanged to ModelCatalog.get_model.
+        inputs: Observation tensor fed to the model frontend.
+        action_inputs: Action tensor concatenated with the frontend features.
+        config: Dict; the "model" and "critic_hiddens" keys are read here.
+
+    Returns:
+        The output tensor of a final fully-connected layer with a single
+        unit and no activation.
+    """
+    # Only frontend.last_layer is consumed below; the frontend's own output
+    # head (requested with size 1) is not read in this function.
+    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
+
+    hiddens = config["critic_hiddens"]
+
+    # Join observation features and actions along axis 1 so the critic
+    # scores the (observation, action) pair.
+    q_out = tf.concat([frontend.last_layer, action_inputs], axis=1)
+    # Stack one ReLU fully-connected layer per entry of "critic_hiddens".
+    for hidden in hiddens:
+        q_out = layers.fully_connected(
+            q_out, num_outputs=hidden, activation_fn=tf.nn.relu)
+    q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None)
+
+    return q_scores
+
+
+class DDPGPolicyGraph(TFPolicyGraph):
+    def __init__(self, observation_space, action_space, registry, config):
+        if not isinstance(action_space, Box):
+            raise UnsupportedSpaceException(
+                "Action space {} is not supported for DDPG.".format(
+                    action_space))
+
+        self.config = config
+        self.cur_epsilon = 1.0
+        dim_actions = action_space.shape[0]
+        low_action = action_space.low
+        high_action = action_space.high
+        self.actor_optimizer = tf.train.AdamOptimizer(
+            learning_rate=config["actor_lr"])
+        self.critic_optimizer = tf.train.AdamOptimizer(
+            learning_rate=config["critic_lr"])
+
+        # Action inputs
+        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
+        self.eps = tf.placeholder(tf.float32, (), name="eps")
+        self.cur_observations = tf.placeholder(
+            tf.float32, shape=(None, ) + observation_space.shape)
+
+        # Actor: P (policy) network
+        with tf.variable_scope(P_SCOPE) as scope:
+            p_values = _build_p_network(registry, self.cur_observations,
+                                        dim_actions, config)
+            self.p_func_vars = _scope_vars(scope.name)
+
+        # Action outputs
+        with tf.variable_scope(A_SCOPE):
+            self.output_actions = _build_action_network(
+                p_values, low_action, high_action, self.stochastic, self.eps,
+                config["exploration_theta"], config["exploration_sigma"])
+
+        with tf.variable_scope(A_SCOPE, reuse=True):
+            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
+            self.reset_noise_op = tf.assign(exploration_sample,
+                                            dim_actions * [.0])
+
+        # Replay inputs
+        self.obs_t = tf.placeholder(
+            tf.float32,
+            shape=(None, ) + observation_space.shape,
+            name="observation")
+        self.act_t = tf.placeholder(
+            tf.float32, shape=(None, ) + action_space.shape, name="action")
+        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
+        self.obs_tp1 = tf.placeholder(
+            tf.float32, shape=(None, ) + observation_space.shape)
+        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
+        self.importance_weights = tf.placeholder(
+            tf.float32, [None], name="weight")
+
+        # p network evaluation
+        with tf.variable_scope(P_SCOPE, reuse=True) as scope:
+            self.p_t = _build_p_network(
+                registry, self.obs_t, dim_actions, config)
+
+        # target p network evaluation
+        with tf.variable_scope(P_TARGET_SCOPE) as scope:
+            p_tp1 = _build_p_network(
+                registry, self.obs_tp1, dim_actions, config)
+            target_p_func_vars = _scope_vars(scope.name)
+
+        # Action outputs
+        with tf.variable_scope(A_SCOPE, reuse=True):
+            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
+            zero_eps = tf.constant(value=.0, dtype=tf.float32)
+            output_actions = _build_action_network(
+                self.p_t, low_action, high_action, deterministic_flag,
+                zero_eps, config["exploration_theta"],
+                config["exploration_sigma"])
+
+            output_actions_estimated = _build_action_network(
+                p_tp1, low_action, high_action, deterministic_flag,
+                zero_eps, config["exploration_theta"],
+                config["exploration_sigma"])
+
+        # q network evaluation
+        with tf.variable_scope(Q_SCOPE) as scope:
+            q_t = _build_q_network(
+                registry, self.obs_t, self.act_t, config)
+            self.q_func_vars = _scope_vars(scope.name)
+        with tf.variable_scope(Q_SCOPE, reuse=True):
+            q_tp0 = _build_q_network(
+                registry, self.obs_t, output_actions, config)
+
+        # target q network evaluation
+        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
+            q_tp1 = _build_q_network(
+                registry, self.obs_tp1, output_actions_estimated, config)
+            target_q_func_vars = _scope_vars(scope.name)
+
+        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
+
+        q_tp1_best = tf.squeeze(
+            input=q_tp1, axis=len(q_tp1.shape) - 1)
+        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
+
+        # compute RHS of bellman equation
+        q_t_selected_target = (
+            self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)
+
+        # compute the error (potentially clipped)
+        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
+        if config.get("use_huber"):
+            errors = _huber_loss(self.td_error, config.get("huber_threshold"))
+        else:
+            errors = 0.5 * tf.square(self.td_error)
+
+        self.loss = tf.reduce_mean(self.importance_weights * errors)
+
+        # for policy gradient
+        self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)
+
+        if config["l2_reg"] is not None:
+            for var in self.p_func_vars:
+                if "bias" not in var.name:
+                    self.actor_loss += (
+                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
+            for var in self.q_func_vars:
+                if "bias" not in var.name:
+                    self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
+                        var)
+
+        # update_target() is called periodically to blend the Q and P network
+        # weights into their target networks (tau=1.0 gives a hard copy)
+        self.tau_value = config.get("tau")
+        self.tau = tf.placeholder(tf.float32, (), name="tau")
+        update_target_expr = []
+        for var, var_target in zip(
+                sorted(self.q_func_vars, key=lambda v: v.name),
+                sorted(target_q_func_vars, key=lambda v: v.name)):
+            update_target_expr.append(
+                var_target.assign(self.tau * var +
+                                  (1.0 - self.tau) * var_target))
+        for var, var_target in zip(
+                sorted(self.p_func_vars, key=lambda v: v.name),
+                sorted(target_p_func_vars, key=lambda v: v.name)):
+            update_target_expr.append(
+                var_target.assign(self.tau * var +
+                                  (1.0 - self.tau) * var_target))
+        self.update_target_expr = tf.group(*update_target_expr)
+
+        self.sess = tf.get_default_session()
+        self.loss_inputs = [
+            ("obs", self.obs_t),
+            ("actions", self.act_t),
+            ("rewards", self.rew_t),
+            ("new_obs", self.obs_tp1),
+            ("dones", self.done_mask),
+            ("weights", self.importance_weights),
+        ]
+        self.is_training = tf.placeholder_with_default(True, ())
+        TFPolicyGraph.__init__(
+            self, self.sess, obs_input=self.cur_observations,
+            action_sampler=self.output_actions, loss=self.loss,
+            loss_inputs=self.loss_inputs, is_training=self.is_training)
+        self.sess.run(tf.global_variables_initializer())
+
+        # Note that this encompasses both the policy and Q-value networks and
+        # their corresponding target networks
+        self.variables = ray.experimental.TensorFlowVariables(
+            tf.group(q_tp0, q_tp1), self.sess)
+
+        # Hard initial update
+        self.update_target(tau=1.0)
+
+    def gradients(self, optimizer):
+        """Build the (gradient, variable) pairs for the actor and the critic.
+
+        The actor loss is differentiated w.r.t. ``self.p_func_vars`` with
+        ``self.actor_optimizer`` and the critic loss w.r.t.
+        ``self.q_func_vars`` with ``self.critic_optimizer``. When
+        ``config["grad_norm_clipping"]`` is not None the pairs come from
+        ``_minimize_and_clip`` with that value as ``clip_val``.
+
+        Args:
+            optimizer: not used by this implementation; the two optimizers
+                stored on ``self`` are used instead.
+
+        Returns:
+            list: actor pairs followed by critic pairs, with every pair whose
+                gradient is None removed.
+        """
+        clip_val = self.config["grad_norm_clipping"]
+        # Actor first, then critic, so the returned list keeps that order.
+        targets = [
+            (self.actor_optimizer, self.actor_loss, self.p_func_vars),
+            (self.critic_optimizer, self.loss, self.q_func_vars),
+        ]
+        grads_and_vars = []
+        for opt, objective, var_list in targets:
+            if clip_val is None:
+                pairs = opt.compute_gradients(objective, var_list=var_list)
+            else:
+                pairs = _minimize_and_clip(
+                    opt, objective, var_list=var_list, clip_val=clip_val)
+            grads_and_vars.extend(
+                (g, v) for (g, v) in pairs if g is not None)
+        return grads_and_vars
+
+    def extra_compute_action_feed_dict(self):
+        """Return the extra feed-dict entries used when computing actions.
+
+        ``self.stochastic`` is always fed True and ``self.eps`` is fed the
+        current ``self.cur_epsilon``.
+        """
+        feed = {self.stochastic: True}
+        feed[self.eps] = self.cur_epsilon
+        return feed
+
+    def extra_compute_grad_fetches(self):
+        """Return the extra fetches for gradient computation.
+
+        The only entry is ``self.td_error`` under the key "td_error".
+        """
+        return dict(td_error=self.td_error)
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+        """Post-process a sample batch by delegating to ``_postprocess_dqn``.
+
+        Args:
+            sample_batch: the batch handed to ``_postprocess_dqn``.
+            other_agent_batches: accepted but not used here.
+
+        Returns:
+            Whatever ``_postprocess_dqn(self, sample_batch)`` returns.
+        """
+        return _postprocess_dqn(self, sample_batch)
+
+    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
+                         importance_weights):
+        """Evaluate ``self.td_error`` for one batch of transitions.
+
+        Every entry of ``obs_t`` and ``obs_tp1`` is passed through
+        ``np.array`` before being fed; the other arguments are fed as given.
+
+        Returns:
+            The result of running ``self.td_error`` in ``self.sess``.
+        """
+        cur_obs = [np.array(ob) for ob in obs_t]
+        next_obs = [np.array(ob) for ob in obs_tp1]
+        feed_dict = {
+            self.obs_t: cur_obs,
+            self.act_t: act_t,
+            self.rew_t: rew_t,
+            self.obs_tp1: next_obs,
+            self.done_mask: done_mask,
+            self.importance_weights: importance_weights,
+        }
+        return self.sess.run(self.td_error, feed_dict=feed_dict)
+
+    def reset_noise(self, sess):
+        """Run ``self.reset_noise_op`` in the given session.
+
+        Args:
+            sess: session used to run the op. Note that this is the caller's
+                session, not ``self.sess`` as in the sibling methods.
+        """
+        # NOTE(review): reset_noise_op is defined outside this view;
+        # presumably it resets the exploration-noise state -- confirm.
+        sess.run(self.reset_noise_op)
+
+    def update_target(self, tau=None):
+        """Run the target-network update op in ``self.sess``.
+
+        Supports both hard and soft sync: the fed ``tau`` is the blend
+        factor of ``self.update_target_expr`` (1.0 is a hard copy).
+
+        Args:
+            tau: blend factor to feed. When None, ``self.tau_value`` (the
+                configured "tau") is fed instead. An explicit 0.0 is honored
+                rather than being treated as "not given".
+
+        Returns:
+            Whatever ``self.sess.run`` returns for the grouped update op.
+        """
+        # Compare against None explicitly: `tau or default` would silently
+        # swap a falsy tau such as 0.0 for the configured value.
+        tau_to_feed = self.tau_value if tau is None else tau
+        return self.sess.run(
+            self.update_target_expr, feed_dict={self.tau: tau_to_feed})
+
+    def set_epsilon(self, epsilon):
+        """Store ``epsilon`` as ``self.cur_epsilon``.
+
+        That value is what ``extra_compute_action_feed_dict`` feeds for
+        ``self.eps``.
+        """
+        self.cur_epsilon = epsilon
+
+    def get_weights(self):
+        """Return the weights held by ``self.variables``.
+
+        ``self.variables`` is the ``TensorFlowVariables`` helper built in
+        ``__init__`` from the ``q_tp0`` and ``q_tp1`` ops.
+        """
+        return self.variables.get_weights()
+
+    def set_weights(self, weights):
+        """Load ``weights`` into ``self.variables``.
+
+        Args:
+            weights: passed straight to ``self.variables.set_weights``;
+                presumably the structure returned by ``get_weights``.
+        """
+        self.variables.set_weights(weights)
+
+    def get_state(self):
+        """Return the state as a two-item list.
+
+        Item 0 is the ``TFPolicyGraph`` state and item 1 is the current
+        epsilon, which is the layout ``set_state`` reads back.
+        """
+        base_state = TFPolicyGraph.get_state(self)
+        return [base_state, self.cur_epsilon]
+
+    def set_state(self, state):
+        """Restore a state laid out as ``get_state`` returns it.
+
+        Args:
+            state: two-item sequence; ``state[0]`` is handed to
+                ``TFPolicyGraph.set_state`` and ``state[1]`` becomes the
+                current epsilon.
+        """
+        TFPolicyGraph.set_state(self, state[0])
+        self.set_epsilon(state[1])
@@ -1,391 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-import tensorflow as tf
-import tensorflow.contrib.layers as layers
-
-from ray.rllib.models import ModelCatalog
-
-
-def _build_p_network(registry, inputs, dim_actions, config):
-    """
-    map an observation (i.e., state) to an action where
-    each entry takes value from (0, 1) due to the sigmoid function
-    """
-    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
-
-    hiddens = config["actor_hiddens"]
-    action_out = frontend.last_layer
-    for hidden in hiddens:
-        action_out = layers.fully_connected(
-            action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
-    # Use sigmoid layer to bound values within (0, 1)
-    # shape of action_scores is [batch_size, dim_actions]
-    action_scores = layers.fully_connected(
-        action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
-
-    return action_scores
-
-
-# As a stochastic policy for inference, but a deterministic policy for training
-# thus ignore batch_size issue when constructing a stochastic action
-def _build_action_network(p_values, low_action, high_action, stochastic, eps,
-                          theta, sigma):
-    # shape is [None, dim_action]
-    deterministic_actions = (high_action - low_action) * p_values + low_action
-
-    exploration_sample = tf.get_variable(
-        name="ornstein_uhlenbeck",
-        dtype=tf.float32,
-        initializer=low_action.size * [.0],
-        trainable=False)
-    normal_sample = tf.random_normal(
-        shape=[low_action.size], mean=0.0, stddev=1.0)
-    exploration_value = tf.assign_add(
-        exploration_sample,
-        theta * (.0 - exploration_sample) + sigma * normal_sample)
-    stochastic_actions = deterministic_actions + eps * (
-        high_action - low_action) * exploration_value
-
-    return tf.cond(stochastic, lambda: stochastic_actions,
-                   lambda: deterministic_actions)
-
-
-def _build_q_network(registry, inputs, action_inputs, config):
-    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
-
-    hiddens = config["critic_hiddens"]
-
-    q_out = tf.concat([frontend.last_layer, action_inputs], axis=1)
-    for hidden in hiddens:
-        q_out = layers.fully_connected(
-            q_out, num_outputs=hidden, activation_fn=tf.nn.relu)
-    q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None)
-
-    return q_scores
-
-
-def _huber_loss(x, delta=1.0):
-    """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
-    return tf.where(
-        tf.abs(x) < delta,
-        tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
-
-
-def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
-    """Minimized `objective` using `optimizer` w.r.t. variables in
-    `var_list` while ensure the norm of the gradients for each
-    variable is clipped to `clip_val`
-    """
-    gradients = optimizer.compute_gradients(objective, var_list=var_list)
-    for i, (grad, var) in enumerate(gradients):
-        if grad is not None:
-            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
-    return gradients
-
-
-def _scope_vars(scope, trainable_only=False):
-    """
-    Get variables inside a scope
-    The scope can be specified as a string
-
-    Parameters
-    ----------
-    scope: str or VariableScope
-      scope in which the variables reside.
-    trainable_only: bool
-      whether or not to return only the variables that were marked as
-      trainable.
-
-    Returns
-    -------
-    vars: [tf.Variable]
-      list of variables in `scope`.
-    """
-    return tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES
-        if trainable_only else tf.GraphKeys.VARIABLES,
-        scope=scope if isinstance(scope, str) else scope.name)
-
-
-class ModelAndLoss(object):
-    """Holds the model and loss function.
-
-    Both graphs are necessary in order for the multi-gpu SGD implementation
-    to create towers on each device.
-    """
-
-    def __init__(self, registry, dim_actions, low_action, high_action, config,
-                 obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
-        # p network evaluation
-        with tf.variable_scope("p_func", reuse=True) as scope:
-            self.p_t = _build_p_network(registry, obs_t, dim_actions, config)
-
-        # target p network evaluation
-        with tf.variable_scope("target_p_func") as scope:
-            self.p_tp1 = _build_p_network(registry, obs_tp1, dim_actions,
-                                          config)
-            self.target_p_func_vars = _scope_vars(scope.name)
-
-        # Action outputs
-        with tf.variable_scope("a_func", reuse=True):
-            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
-            zero_eps = tf.constant(value=.0, dtype=tf.float32)
-            output_actions = _build_action_network(
-                self.p_t, low_action, high_action, deterministic_flag,
-                zero_eps, config["exploration_theta"],
-                config["exploration_sigma"])
-
-            output_actions_estimated = _build_action_network(
-                self.p_tp1, low_action, high_action, deterministic_flag,
-                zero_eps, config["exploration_theta"],
-                config["exploration_sigma"])
-
-        # q network evaluation
-        with tf.variable_scope("q_func") as scope:
-            self.q_t = _build_q_network(registry, obs_t, act_t, config)
-            self.q_func_vars = _scope_vars(scope.name)
-        with tf.variable_scope("q_func", reuse=True):
-            self.q_tp0 = _build_q_network(registry, obs_t, output_actions,
-                                          config)
-
-        # target q network evalution
-        with tf.variable_scope("target_q_func") as scope:
-            self.q_tp1 = _build_q_network(registry, obs_tp1,
-                                          output_actions_estimated, config)
-            self.target_q_func_vars = _scope_vars(scope.name)
-
-        q_t_selected = tf.squeeze(self.q_t, axis=len(self.q_t.shape) - 1)
-
-        q_tp1_best = tf.squeeze(
-            input=self.q_tp1, axis=len(self.q_tp1.shape) - 1)
-        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
-
-        # compute RHS of bellman equation
-        q_t_selected_target = (
-            rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)
-
-        # compute the error (potentially clipped)
-        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
-        if config.get("use_huber"):
-            errors = _huber_loss(self.td_error, config.get("huber_threshold"))
-        else:
-            errors = 0.5 * tf.square(self.td_error)
-
-        weighted_error = tf.reduce_mean(importance_weights * errors)
-
-        self.loss = weighted_error
-
-        # for policy gradient
-        self.actor_loss = -1.0 * tf.reduce_mean(self.q_tp0)
-
-
-class DDPGGraph(object):
-    def __init__(self, registry, env, config, logdir):
-        self.env = env
-        dim_actions = env.action_space.shape[0]
-        low_action = env.action_space.low
-        high_action = env.action_space.high
-        actor_optimizer = tf.train.AdamOptimizer(
-            learning_rate=config["actor_lr"])
-        critic_optimizer = tf.train.AdamOptimizer(
-            learning_rate=config["critic_lr"])
-
-        # Action inputs
-        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
-        self.eps = tf.placeholder(tf.float32, (), name="eps")
-        self.cur_observations = tf.placeholder(
-            tf.float32, shape=(None, ) + env.observation_space.shape)
-
-        # Actor: P (policy) network
-        p_scope_name = "p_func"
-        with tf.variable_scope(p_scope_name) as scope:
-            p_values = _build_p_network(registry, self.cur_observations,
-                                        dim_actions, config)
-            p_func_vars = _scope_vars(scope.name)
-
-        # Action outputs
-        a_scope_name = "a_func"
-        with tf.variable_scope(a_scope_name):
-            self.output_actions = _build_action_network(
-                p_values, low_action, high_action, self.stochastic, self.eps,
-                config["exploration_theta"], config["exploration_sigma"])
-
-        with tf.variable_scope(a_scope_name, reuse=True):
-            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
-            self.reset_noise_op = tf.assign(exploration_sample,
-                                            dim_actions * [.0])
-
-        # Replay inputs
-        self.obs_t = tf.placeholder(
-            tf.float32,
-            shape=(None, ) + env.observation_space.shape,
-            name="observation")
-        self.act_t = tf.placeholder(
-            tf.float32, shape=(None, ) + env.action_space.shape, name="action")
-        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
-        self.obs_tp1 = tf.placeholder(
-            tf.float32, shape=(None, ) + env.observation_space.shape)
-        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
-        self.importance_weights = tf.placeholder(
-            tf.float32, [None], name="weight")
-
-        def build_loss(obs_t, act_t, rew_t, obs_tp1, done_mask,
-                       importance_weights):
-            return ModelAndLoss(registry, dim_actions, low_action, high_action,
-                                config, obs_t, act_t, rew_t, obs_tp1,
-                                done_mask, importance_weights)
-
-        self.loss_inputs = [
-            ("obs", self.obs_t),
-            ("actions", self.act_t),
-            ("rewards", self.rew_t),
-            ("new_obs", self.obs_tp1),
-            ("dones", self.done_mask),
-            ("weights", self.importance_weights),
-        ]
-
-        loss_obj = build_loss(self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
-                              self.done_mask, self.importance_weights)
-
-        self.build_loss = build_loss
-
-        actor_loss = loss_obj.actor_loss
-        weighted_error = loss_obj.loss
-        q_func_vars = loss_obj.q_func_vars
-        target_p_func_vars = loss_obj.target_p_func_vars
-        target_q_func_vars = loss_obj.target_q_func_vars
-        self.p_t = loss_obj.p_t
-        self.q_t = loss_obj.q_t
-        self.q_tp0 = loss_obj.q_tp0
-        self.q_tp1 = loss_obj.q_tp1
-        self.td_error = loss_obj.td_error
-
-        if config["l2_reg"] is not None:
-            for var in p_func_vars:
-                if "bias" not in var.name:
-                    actor_loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)
-            for var in q_func_vars:
-                if "bias" not in var.name:
-                    weighted_error += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
-                        var)
-
-        # compute optimization op (potentially with gradient clipping)
-        if config["grad_norm_clipping"] is not None:
-            self.actor_grads_and_vars = _minimize_and_clip(
-                actor_optimizer,
-                actor_loss,
-                var_list=p_func_vars,
-                clip_val=config["grad_norm_clipping"])
-            self.critic_grads_and_vars = _minimize_and_clip(
-                critic_optimizer,
-                weighted_error,
-                var_list=q_func_vars,
-                clip_val=config["grad_norm_clipping"])
-        else:
-            self.actor_grads_and_vars = actor_optimizer.compute_gradients(
-                actor_loss, var_list=p_func_vars)
-            self.critic_grads_and_vars = critic_optimizer.compute_gradients(
-                weighted_error, var_list=q_func_vars)
-        self.actor_grads_and_vars = [(g, v)
-                                     for (g, v) in self.actor_grads_and_vars
-                                     if g is not None]
-        self.critic_grads_and_vars = [(g, v)
-                                      for (g, v) in self.critic_grads_and_vars
-                                      if g is not None]
-        self.grads_and_vars = (
-            self.actor_grads_and_vars + self.critic_grads_and_vars)
-        self.grads = [g for (g, v) in self.grads_and_vars]
-        self.actor_train_expr = actor_optimizer.apply_gradients(
-            self.actor_grads_and_vars)
-        self.critic_train_expr = critic_optimizer.apply_gradients(
-            self.critic_grads_and_vars)
-
-        # update_target_fn will be called periodically to copy Q network to
-        # target Q network
-        self.tau_value = config.get("tau")
-        self.tau = tf.placeholder(tf.float32, (), name="tau")
-        update_target_expr = []
-        for var, var_target in zip(
-                sorted(q_func_vars, key=lambda v: v.name),
-                sorted(target_q_func_vars, key=lambda v: v.name)):
-            update_target_expr.append(
-                var_target.assign(self.tau * var +
-                                  (1.0 - self.tau) * var_target))
-        for var, var_target in zip(
-                sorted(p_func_vars, key=lambda v: v.name),
-                sorted(target_p_func_vars, key=lambda v: v.name)):
-            update_target_expr.append(
-                var_target.assign(self.tau * var +
-                                  (1.0 - self.tau) * var_target))
-        self.update_target_expr = tf.group(*update_target_expr)
-
-    # support both hard and soft sync
-    def update_target(self, sess, tau=None):
-        return sess.run(
-            self.update_target_expr,
-            feed_dict={self.tau: tau or self.tau_value})
-
-    def act(self, sess, obs, eps, stochastic=True):
-        return sess.run(
-            self.output_actions,
-            feed_dict={
-                self.cur_observations: obs,
-                self.stochastic: stochastic,
-                self.eps: eps
-            })
-
-    def compute_gradients(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
-                          importance_weights):
-        td_err, grads = sess.run(
-            [self.td_error, self.grads],
-            feed_dict={
-                self.obs_t: obs_t,
-                self.act_t: act_t,
-                self.rew_t: rew_t,
-                self.obs_tp1: obs_tp1,
-                self.done_mask: done_mask,
-                self.importance_weights: importance_weights
-            })
-        return td_err, grads
-
-    def compute_td_error(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
-                         importance_weights):
-        td_err = sess.run(
-            self.td_error,
-            feed_dict={
-                self.obs_t: [np.array(ob) for ob in obs_t],
-                self.act_t: act_t,
-                self.rew_t: rew_t,
-                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
-                self.done_mask: done_mask,
-                self.importance_weights: importance_weights
-            })
-        return td_err
-
-    def apply_gradients(self, sess, grads):
-        assert len(grads) == len(self.grads_and_vars)
-        feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)}
-        sess.run(
-            [self.critic_train_expr, self.actor_train_expr],
-            feed_dict=feed_dict)
-
-    def compute_apply(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
-                      importance_weights):
-        td_err, _, _ = sess.run(
-            [self.td_error, self.critic_train_expr, self.actor_train_expr],
-            feed_dict={
-                self.obs_t: obs_t,
-                self.act_t: act_t,
-                self.rew_t: rew_t,
-                self.obs_tp1: obs_tp1,
-                self.done_mask: done_mask,
-                self.importance_weights: importance_weights
-            })
-        return td_err
-
-    def reset_noise(self, sess):
-        sess.run(self.reset_noise_op)
@@ -9,7 +9,7 @@ from ray.rllib.ddpg2.models import DDPGModel
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.optimizers import PolicyEvaluator
 from ray.rllib.utils.filter import NoFilter
-from ray.rllib.utils.process_rollout import process_rollout
+from ray.rllib.utils.process_rollout import compute_advantages
 from ray.rllib.utils.sampler import SyncSampler


@@ -34,9 +34,7 @@ class DDPGEvaluator(PolicyEvaluator):

        # since each sample is one step, no discounting needs to be applied;
        # this does not involve config["gamma"]
-        samples = process_rollout(
-                    rollout, NoFilter(),
-                    gamma=1.0, use_gae=False)
+        samples = compute_advantages(rollout, 0.0, gamma=1.0, use_gae=False)

        return samples

@@ -227,7 +227,7 @@ class DDPGActorCritic():
        self.critic_vars.set_weights(critic_weights)
        self.actor_vars.set_weights(actor_weights)

-    def compute(self, ob):
+    def compute_single_action(self, ob, h, is_training):
        """Returns action, given state."""
        flattened_ob = np.reshape(ob, [-1, np.prod(ob.shape)])
        action = self.sess.run(self.output_action, {self.obs: flattened_ob})
@@ -235,7 +235,10 @@ class DDPGActorCritic():
            action += self.epsilon * self.rand_process.sample()
            if (self.epsilon > 0):
                self.epsilon -= self.config["noise_epsilon"]
-        return action[0], {}
+        return action[0], [], {}

    def value(self, *args):
        return 0
+
+    def get_initial_state(self):
+        """Return the initial state list, which is always empty here."""
+        return []
@@ -9,26 +9,26 @@ from ray.utils import merge_dicts
 APEX_DEFAULT_CONFIG = merge_dicts(
    DQN_CONFIG,
    {
-        'optimizer_class': 'ApexOptimizer',
-        'optimizer_config':
+        "optimizer_class": "ApexOptimizer",
+        "optimizer_config":
            merge_dicts(
-                DQN_CONFIG['optimizer_config'], {
-                    'max_weight_sync_delay': 400,
-                    'num_replay_buffer_shards': 4,
-                    'debug': False
+                DQN_CONFIG["optimizer_config"], {
+                    "max_weight_sync_delay": 400,
+                    "num_replay_buffer_shards": 4,
+                    "debug": False
                }),
-        'n_step': 3,
-        'gpu': True,
-        'num_workers': 32,
-        'buffer_size': 2000000,
-        'learning_starts': 50000,
-        'train_batch_size': 512,
-        'sample_batch_size': 50,
-        'max_weight_sync_delay': 400,
-        'target_network_update_freq': 500000,
-        'timesteps_per_iteration': 25000,
-        'per_worker_exploration': True,
-        'worker_side_prioritization': True,
+        "n_step": 3,
+        "gpu": True,
+        "num_workers": 32,
+        "buffer_size": 2000000,
+        "learning_starts": 50000,
+        "train_batch_size": 512,
+        "sample_batch_size": 50,
+        "max_weight_sync_delay": 400,
+        "target_network_update_freq": 500000,
+        "timesteps_per_iteration": 25000,
+        "per_worker_exploration": True,
+        "worker_side_prioritization": True,
    },
 )

@@ -5,14 +5,13 @@ from __future__ import print_function
 import pickle
 import os

-import numpy as np
-import tensorflow as tf
-
 import ray
 from ray.rllib import optimizers
-from ray.rllib.dqn.dqn_evaluator import DQNEvaluator
+from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
+from ray.rllib.dqn.dqn_policy_graph import DQNPolicyGraph
+from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
+    collect_metrics
 from ray.rllib.agent import Agent
-from ray.tune.result import TrainingResult
 from ray.tune.trial import Resources


@@ -24,101 +23,84 @@ OPTIMIZER_SHARED_CONFIGS = [
 DEFAULT_CONFIG = {
    # === Model ===
    # Whether to use dueling dqn
-    'dueling': True,
+    "dueling": True,
    # Whether to use double dqn
-    'double_q': True,
+    "double_q": True,
    # Hidden layer sizes of the state and action value networks
-    'hiddens': [256],
+    "hiddens": [256],
    # N-step Q learning
-    'n_step': 1,
+    "n_step": 1,
    # Config options to pass to the model constructor
-    'model': {},
+    "model": {},
    # Discount factor for the MDP
-    'gamma': 0.99,
+    "gamma": 0.99,
    # Arguments to pass to the env creator
-    'env_config': {},
+    "env_config": {},

    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_fraction over this number of timesteps scaled by
    # exploration_fraction
-    'schedule_max_timesteps': 100000,
+    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
-    'timesteps_per_iteration': 1000,
+    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
-    'exploration_fraction': 0.1,
+    "exploration_fraction": 0.1,
    # Final value of random action probability
-    'exploration_final_eps': 0.02,
+    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
-    'target_network_update_freq': 500,
-    # Whether to start with random actions instead of noops.
-    'random_starts': True,
+    "target_network_update_freq": 500,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
-    'buffer_size': 50000,
+    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
-    'prioritized_replay': True,
+    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
-    'prioritized_replay_alpha': 0.6,
+    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
-    'prioritized_replay_beta': 0.4,
+    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
-    'prioritized_replay_eps': 1e-6,
+    "prioritized_replay_eps": 1e-6,
    # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
-    'clip_rewards': True,
+    "clip_rewards": True,

    # === Optimization ===
    # Learning rate for adam optimizer
-    'lr': 5e-4,
+    "lr": 5e-4,
    # If not None, clip gradients during optimization at this value
-    'grad_norm_clipping': 40,
+    "grad_norm_clipping": 40,
    # How many steps of the model to sample before learning starts.
-    'learning_starts': 1000,
+    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
-    'sample_batch_size': 4,
+    "sample_batch_size": 4,
    # Size of a batched sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
-    'train_batch_size': 32,
-    # Smooth the current average reward over this many previous episodes.
-    'smoothing_num_episodes': 100,
-
-    # === Tensorflow ===
-    # Arguments to pass to tensorflow
-    'tf_session_args': {
-        "device_count": {"CPU": 2},
-        "log_device_placement": False,
-        "allow_soft_placement": True,
-        "gpu_options": {
-            "allow_growth": True
-        },
-        "inter_op_parallelism_threads": 1,
-        "intra_op_parallelism_threads": 1,
-    },
+    "train_batch_size": 32,

    # === Parallelism ===
    # Whether to use a GPU for local optimization.
-    'gpu': False,
+    "gpu": False,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    'num_workers': 0,
+    # you're using the Async or Ape-X optimizers.
+    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
-    'num_gpus_per_worker': 0,
+    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
-    'num_cpus_per_worker': 1,
+    "num_cpus_per_worker": 1,
    # Optimizer class to use.
-    'optimizer_class': "LocalSyncReplayOptimizer",
+    "optimizer_class": "LocalSyncReplayOptimizer",
    # Config to pass to the optimizer.
-    'optimizer_config': {},
+    "optimizer_config": {},
    # Whether to use a distribution of epsilons across workers for exploration.
-    'per_worker_exploration': False,
+    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
-    'worker_side_prioritization': False
+    "worker_side_prioritization": False
 }


@@ -127,6 +109,7 @@ class DQNAgent(Agent):
    _allow_unknown_subkeys = [
        "model", "optimizer", "tf_session_args", "env_config"]
    _default_config = DEFAULT_CONFIG
+    _policy_graph = DQNPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
@@ -137,16 +120,31 @@ class DQNAgent(Agent):
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
-        self.local_evaluator = DQNEvaluator(
-            self.registry, self.env_creator, self.config, self.logdir, 0)
-        remote_cls = ray.remote(
+        adjusted_batch_size = (
+            self.config["sample_batch_size"] + self.config["n_step"] - 1)
+        self.local_evaluator = CommonPolicyEvaluator(
+            self.env_creator, self._policy_graph,
+            batch_steps=adjusted_batch_size,
+            batch_mode="pack_episodes", preprocessor_pref="deepmind",
+            compress_observations=True,
+            registry=self.registry, env_config=self.config["env_config"],
+            model_config=self.config["model"], policy_config=self.config)
+        remote_cls = CommonPolicyEvaluator.as_remote(
            num_cpus=self.config["num_cpus_per_worker"],
-            num_gpus=self.config["num_gpus_per_worker"])(
-            DQNEvaluator)
+            num_gpus=self.config["num_gpus_per_worker"])
        self.remote_evaluators = [
            remote_cls.remote(
-                self.registry, self.env_creator, self.config, self.logdir,
-                i)
+                self.env_creator, self._policy_graph,
+                batch_steps=adjusted_batch_size,
+                batch_mode="pack_episodes", preprocessor_pref="deepmind",
+                compress_observations=True,
+                registry=self.registry, env_config=self.config["env_config"],
+                model_config=self.config["model"], policy_config=self.config)
+            for _ in range(self.config["num_workers"])]
+
+        self.exploration0 = self._make_exploration_schedule(0)
+        self.explorations = [
+            self._make_exploration_schedule(i)
            for i in range(self.config["num_workers"])]

        for k in OPTIMIZER_SHARED_CONFIGS:
@@ -157,10 +155,25 @@ class DQNAgent(Agent):
            self.config["optimizer_config"], self.local_evaluator,
            self.remote_evaluators)

-        self.saver = tf.train.Saver(max_to_keep=None)
        self.last_target_update_ts = 0
        self.num_target_updates = 0

+    def _make_exploration_schedule(self, worker_index):
+        # Use either a different `eps` per worker, or a linear schedule.
+        if self.config["per_worker_exploration"]:
+            assert self.config["num_workers"] > 1, \
+                "This requires multiple workers"
+            return ConstantSchedule(
+                0.4 ** (
+                    1 + worker_index / float(
+                        self.config["num_workers"] - 1) * 7))
+        return LinearSchedule(
+            schedule_timesteps=int(
+                self.config["exploration_fraction"] *
+                self.config["schedule_max_timesteps"]),
+            initial_p=1.0,
+            final_p=self.config["exploration_final_eps"])
+
    @property
    def global_timestep(self):
        return self.optimizer.num_steps_sampled
@@ -168,7 +181,7 @@ class DQNAgent(Agent):
    def update_target_if_needed(self):
        if self.global_timestep - self.last_target_update_ts > \
                self.config["target_network_update_freq"]:
-            self.local_evaluator.update_target()
+            self.local_evaluator.for_policy(lambda p: p.update_target())
            self.last_target_update_ts = self.global_timestep
            self.num_target_updates += 1

@@ -177,58 +190,25 @@ class DQNAgent(Agent):

        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]):
-
            self.optimizer.step()
            self.update_target_if_needed()

-        self.local_evaluator.set_global_timestep(self.global_timestep)
-        for e in self.remote_evaluators:
-            e.set_global_timestep.remote(self.global_timestep)
+        exp_vals = [self.exploration0.value(self.global_timestep)]
+        self.local_evaluator.for_policy(
+            lambda p: p.set_epsilon(exp_vals[0]))
+        for i, e in enumerate(self.remote_evaluators):
+            exp_val = self.explorations[i].value(self.global_timestep)
+            e.for_policy.remote(lambda p: p.set_epsilon(exp_val))
+            exp_vals.append(exp_val)

-        return self._train_stats(start_timestep)
-
-    def _train_stats(self, start_timestep):
-        if self.remote_evaluators:
-            stats = ray.get([
-                e.stats.remote() for e in self.remote_evaluators])
-        else:
-            stats = self.local_evaluator.stats()
-            if not isinstance(stats, list):
-                stats = [stats]
-
-        mean_100ep_reward = 0.0
-        mean_100ep_length = 0.0
-        num_episodes = 0
-        explorations = []
-
-        if self.config["per_worker_exploration"]:
-            # Return stats from workers with the lowest 20% of exploration
-            test_stats = stats[-int(max(1, len(stats)*0.2)):]
-        else:
-            test_stats = stats
-
-        for s in test_stats:
-            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
-            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
-
-        for s in stats:
-            num_episodes += s["num_episodes"]
-            explorations.append(s["exploration"])
-
-        opt_stats = self.optimizer.stats()
-
-        result = TrainingResult(
-            episode_reward_mean=mean_100ep_reward,
-            episode_len_mean=mean_100ep_length,
-            episodes_total=num_episodes,
-            timesteps_this_iter=self.global_timestep - start_timestep,
+        result = collect_metrics(
+            self.local_evaluator, self.remote_evaluators)
+        return result._replace(
            info=dict({
-                "min_exploration": min(explorations),
-                "max_exploration": max(explorations),
+                "min_exploration": min(exp_vals),
+                "max_exploration": max(exp_vals),
                "num_target_updates": self.num_target_updates,
-            }, **opt_stats))
-
-        return result
+            }, **self.optimizer.stats()))

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
@@ -236,10 +216,8 @@ class DQNAgent(Agent):
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
-        checkpoint_path = self.saver.save(
-            self.local_evaluator.sess,
-            os.path.join(checkpoint_dir, "checkpoint"),
-            global_step=self.iteration)
+        checkpoint_path = os.path.join(
+            checkpoint_dir, "checkpoint-{}".format(self.iteration))
        extra_data = [
            self.local_evaluator.save(),
            ray.get([e.save.remote() for e in self.remote_evaluators]),
@@ -250,7 +228,6 @@ class DQNAgent(Agent):
        return checkpoint_path

    def _restore(self, checkpoint_path):
-        self.saver.restore(self.local_evaluator.sess, checkpoint_path)
        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
        self.local_evaluator.restore(extra_data[0])
        ray.get([
@@ -260,6 +237,9 @@ class DQNAgent(Agent):
        self.num_target_updates = extra_data[3]
        self.last_target_update_ts = extra_data[4]

-    def compute_action(self, observation):
-        return self.local_evaluator.dqn_graph.act(
-            self.local_evaluator.sess, np.array(observation)[None], 0.0)[0]
+    def compute_action(self, observation, state=None):
+        if state is None:
+            state = []
+        return self.local_evaluator.for_policy(
+            lambda p: p.compute_single_action(
+                observation, state, is_training=False)[0])
@@ -1,207 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from gym.spaces import Discrete
-import numpy as np
-import tensorflow as tf
-
-import ray
-from ray.rllib.utils.error import UnsupportedSpaceException
-from ray.rllib.dqn import models
-from ray.rllib.dqn.common.wrappers import wrap_dqn
-from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
-from ray.rllib.optimizers import SampleBatch, PolicyEvaluator
-from ray.rllib.utils.compression import pack
-
-
-def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
-    """Rewrites the given trajectory fragments to encode n-step rewards.
-
-    reward[i] = (
-        reward[i] * gamma**0 +
-        reward[i+1] * gamma**1 +
-        ... +
-        reward[i+n_step-1] * gamma**(n_step-1))
-
-    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.
-
-    If the episode finishes, the reward will be truncated. After this rewrite,
-    all the arrays will be shortened by (n_step - 1).
-    """
-    for i in range(len(rewards) - n_step + 1):
-        if dones[i]:
-            continue  # episode end
-        for j in range(1, n_step):
-            new_obs[i] = new_obs[i + j]
-            rewards[i] += gamma ** j * rewards[i + j]
-            if dones[i + j]:
-                break  # episode end
-    # truncate ends of the trajectory
-    new_len = len(obs) - n_step + 1
-    for arr in [obs, actions, rewards, new_obs, dones]:
-        del arr[new_len:]
-
-
-class DQNEvaluator(PolicyEvaluator):
-    """The DQN Evaluator.
-
-    TODO(rliaw): Support observation/reward filters?"""
-
-    def __init__(self, registry, env_creator, config, logdir, worker_index):
-        env = env_creator(config["env_config"])
-        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
-        self.env = env
-        self.config = config
-
-        if not isinstance(env.action_space, Discrete):
-            raise UnsupportedSpaceException(
-                "Action space {} is not supported for DQN.".format(
-                    env.action_space))
-
-        tf_config = tf.ConfigProto(**config["tf_session_args"])
-        self.sess = tf.Session(config=tf_config)
-        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)
-
-        # Use either a different `eps` per worker, or a linear schedule.
-        if config["per_worker_exploration"]:
-            assert config["num_workers"] > 1, "This requires multiple workers"
-            self.exploration = ConstantSchedule(
-                0.4 ** (
-                    1 + worker_index / float(config["num_workers"] - 1) * 7))
-        else:
-            self.exploration = LinearSchedule(
-                schedule_timesteps=int(
-                    config["exploration_fraction"] *
-                    config["schedule_max_timesteps"]),
-                initial_p=1.0,
-                final_p=config["exploration_final_eps"])
-
-        # Initialize the parameters and copy them to the target network.
-        self.sess.run(tf.global_variables_initializer())
-        self.dqn_graph.update_target(self.sess)
-        self.global_timestep = 0
-        self.local_timestep = 0
-
-        # Note that this encompasses both the Q and target network
-        self.variables = ray.experimental.TensorFlowVariables(
-            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)
-
-        self.episode_rewards = [0.0]
-        self.episode_lengths = [0.0]
-        self.saved_mean_reward = None
-
-        self.obs = self.env.reset()
-
-    def set_global_timestep(self, global_timestep):
-        self.global_timestep = global_timestep
-
-    def update_target(self):
-        self.dqn_graph.update_target(self.sess)
-
-    def sample(self):
-        obs, actions, rewards, new_obs, dones = [], [], [], [], []
-        for _ in range(
-                self.config["sample_batch_size"] + self.config["n_step"] - 1):
-            ob, act, rew, ob1, done = self._step(self.global_timestep)
-            obs.append(ob)
-            actions.append(act)
-            rewards.append(rew)
-            new_obs.append(ob1)
-            dones.append(done)
-
-        # N-step Q adjustments
-        if self.config["n_step"] > 1:
-            # Adjust for steps lost from truncation
-            self.local_timestep -= (self.config["n_step"] - 1)
-            adjust_nstep(
-                self.config["n_step"], self.config["gamma"],
-                obs, actions, rewards, new_obs, dones)
-
-        batch = SampleBatch({
-            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
-            "rewards": rewards,
-            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
-            "weights": np.ones_like(rewards)})
-        assert (batch.count == self.config["sample_batch_size"])
-
-        # Prioritize on the worker side
-        if self.config["worker_side_prioritization"]:
-            td_errors = self.dqn_graph.compute_td_error(
-                self.sess, obs, batch["actions"], batch["rewards"],
-                new_obs, batch["dones"], batch["weights"])
-            new_priorities = (
-                np.abs(td_errors) + self.config["prioritized_replay_eps"])
-            batch.data["weights"] = new_priorities
-
-        return batch
-
-    def compute_gradients(self, samples):
-        td_err, grads = self.dqn_graph.compute_gradients(
-            self.sess, samples["obs"], samples["actions"], samples["rewards"],
-            samples["new_obs"], samples["dones"], samples["weights"])
-        return grads, {"td_error": td_err}
-
-    def apply_gradients(self, grads):
-        self.dqn_graph.apply_gradients(self.sess, grads)
-
-    def compute_apply(self, samples):
-        td_error = self.dqn_graph.compute_apply(
-            self.sess, samples["obs"], samples["actions"], samples["rewards"],
-            samples["new_obs"], samples["dones"], samples["weights"])
-        return {"td_error": td_error}
-
-    def get_weights(self):
-        return self.variables.get_weights()
-
-    def set_weights(self, weights):
-        self.variables.set_weights(weights)
-
-    def _step(self, global_timestep):
-        """Takes a single step, and returns the result of the step."""
-        action = self.dqn_graph.act(
-            self.sess, np.array(self.obs)[None],
-            self.exploration.value(global_timestep))[0]
-        new_obs, rew, done, _ = self.env.step(action)
-        ret = (self.obs, action, rew, new_obs, float(done))
-        self.obs = new_obs
-        self.episode_rewards[-1] += rew
-        self.episode_lengths[-1] += 1
-        if done:
-            self.obs = self.env.reset()
-            self.episode_rewards.append(0.0)
-            self.episode_lengths.append(0.0)
-        self.local_timestep += 1
-        return ret
-
-    def stats(self):
-        n = self.config["smoothing_num_episodes"] + 1
-        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
-        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
-        exploration = self.exploration.value(self.global_timestep)
-        return {
-            "mean_100ep_reward": mean_100ep_reward,
-            "mean_100ep_length": mean_100ep_length,
-            "num_episodes": len(self.episode_rewards),
-            "exploration": exploration,
-            "local_timestep": self.local_timestep,
-        }
-
-    def save(self):
-        return [
-            self.exploration,
-            self.episode_rewards,
-            self.episode_lengths,
-            self.saved_mean_reward,
-            self.obs,
-            self.global_timestep,
-            self.local_timestep]
-
-    def restore(self, data):
-        self.exploration = data[0]
-        self.episode_rewards = data[1]
-        self.episode_lengths = data[2]
-        self.saved_mean_reward = data[3]
-        self.obs = data[4]
-        self.global_timestep = data[5]
-        self.local_timestep = data[6]
@@ -2,13 +2,240 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from gym.spaces import Discrete
 import numpy as np
-
 import tensorflow as tf
 import tensorflow.contrib.layers as layers

 from ray.rllib.models import ModelCatalog
-from ray.rllib.optimizers.multi_gpu_impl import TOWER_SCOPE_NAME
+from ray.rllib.optimizers.sample_batch import SampleBatch
+from ray.rllib.utils.error import UnsupportedSpaceException
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+
+
+Q_SCOPE = "q_func"
+Q_TARGET_SCOPE = "target_q_func"
+
+
+def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
+    """Rewrites the given trajectory fragments to encode n-step rewards.
+
+    reward[i] = (
+        reward[i] * gamma**0 +
+        reward[i+1] * gamma**1 +
+        ... +
+        reward[i+n_step-1] * gamma**(n_step-1))
+
+    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.
+
+    If the episode finishes, the reward will be truncated. After this rewrite,
+    all the arrays will be shortened by (n_step - 1).
+    """
+    for i in range(len(rewards) - n_step + 1):
+        if dones[i]:
+            continue  # episode end
+        for j in range(1, n_step):
+            new_obs[i] = new_obs[i + j]
+            rewards[i] += gamma ** j * rewards[i + j]
+            if dones[i + j]:
+                break  # episode end
+    # truncate ends of the trajectory
+    new_len = len(obs) - n_step + 1
+    for arr in [obs, actions, rewards, new_obs, dones]:
+        del arr[new_len:]
+
+
+class DQNPolicyGraph(TFPolicyGraph):
+    def __init__(self, observation_space, action_space, registry, config):
+        if not isinstance(action_space, Discrete):
+            raise UnsupportedSpaceException(
+                "Action space {} is not supported for DQN.".format(
+                    action_space))
+
+        self.config = config
+        self.cur_epsilon = 1.0
+        num_actions = action_space.n
+
+        # Action inputs
+        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
+        self.eps = tf.placeholder(tf.float32, (), name="eps")
+        self.cur_observations = tf.placeholder(
+            tf.float32, shape=(None,) + observation_space.shape)
+
+        # Action Q network
+        with tf.variable_scope(Q_SCOPE) as scope:
+            q_values = _build_q_network(
+                registry, self.cur_observations, num_actions, config)
+            self.q_func_vars = _scope_vars(scope.name)
+
+        # Action outputs
+        self.output_actions = _build_action_network(
+            q_values,
+            self.cur_observations,
+            num_actions,
+            self.stochastic,
+            self.eps)
+
+        # Replay inputs
+        self.obs_t = tf.placeholder(
+            tf.float32, shape=(None,) + observation_space.shape)
+        self.act_t = tf.placeholder(tf.int32, [None], name="action")
+        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
+        self.obs_tp1 = tf.placeholder(
+            tf.float32, shape=(None,) + observation_space.shape)
+        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
+        self.importance_weights = tf.placeholder(
+            tf.float32, [None], name="weight")
+
+        # q network evaluation
+        with tf.variable_scope(Q_SCOPE, reuse=True):
+            q_t = _build_q_network(
+                registry, self.obs_t, num_actions, config)
+
+        # target q network evaluation
+        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
+            q_tp1 = _build_q_network(
+                registry, self.obs_tp1, num_actions, config)
+            self.target_q_func_vars = _scope_vars(scope.name)
+
+        # q scores for actions which we know were selected in the given state.
+        q_t_selected = tf.reduce_sum(
+            q_t * tf.one_hot(self.act_t, num_actions), 1)
+
+        # compute estimate of best possible value starting from state at t + 1
+        if config["double_q"]:
+            with tf.variable_scope(Q_SCOPE, reuse=True):
+                q_tp1_using_online_net = _build_q_network(
+                    registry, self.obs_tp1, num_actions, config)
+            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
+            q_tp1_best = tf.reduce_sum(
+                q_tp1 * tf.one_hot(
+                    q_tp1_best_using_online_net, num_actions), 1)
+        else:
+            q_tp1_best = tf.reduce_max(q_tp1, 1)
+        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
+
+        # compute RHS of bellman equation
+        q_t_selected_target = (
+            self.rew_t +
+            config["gamma"] ** config["n_step"] * q_tp1_best_masked)
+
+        # compute the error (potentially clipped)
+        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
+        self.loss = tf.reduce_mean(
+            self.importance_weights * _huber_loss(self.td_error))
+
+        # update_target_expr is run periodically (see update_target) to copy
+        # the Q network variables to the target Q network
+        update_target_expr = []
+        for var, var_target in zip(
+            sorted(self.q_func_vars, key=lambda v: v.name),
+                sorted(self.target_q_func_vars, key=lambda v: v.name)):
+            update_target_expr.append(var_target.assign(var))
+        self.update_target_expr = tf.group(*update_target_expr)
+
+        # initialize TFPolicyGraph
+        self.sess = tf.get_default_session()
+        self.loss_inputs = [
+            ("obs", self.obs_t),
+            ("actions", self.act_t),
+            ("rewards", self.rew_t),
+            ("new_obs", self.obs_tp1),
+            ("dones", self.done_mask),
+            ("weights", self.importance_weights),
+        ]
+        self.is_training = tf.placeholder_with_default(True, ())
+        TFPolicyGraph.__init__(
+            self, self.sess, obs_input=self.cur_observations,
+            action_sampler=self.output_actions, loss=self.loss,
+            loss_inputs=self.loss_inputs, is_training=self.is_training)
+        self.sess.run(tf.global_variables_initializer())
+
+    def optimizer(self):
+        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
+
+    def gradients(self, optimizer):
+        if self.config["grad_norm_clipping"] is not None:
+            grads_and_vars = _minimize_and_clip(
+                optimizer, self.loss, var_list=self.q_func_vars,
+                clip_val=self.config["grad_norm_clipping"])
+        else:
+            grads_and_vars = optimizer.compute_gradients(
+                self.loss, var_list=self.q_func_vars)
+        grads_and_vars = [
+            (g, v) for (g, v) in grads_and_vars if g is not None]
+        return grads_and_vars
+
+    def extra_compute_action_feed_dict(self):
+        return {
+            self.stochastic: True,
+            self.eps: self.cur_epsilon,
+        }
+
+    def extra_compute_grad_fetches(self):
+        return {
+            "td_error": self.td_error,
+        }
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+        return _postprocess_dqn(self, sample_batch)
+
+    def compute_td_error(
+            self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
+        td_err = self.sess.run(
+            self.td_error,
+            feed_dict={
+                self.obs_t: [np.array(ob) for ob in obs_t],
+                self.act_t: act_t,
+                self.rew_t: rew_t,
+                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
+                self.done_mask: done_mask,
+                self.importance_weights: importance_weights
+            })
+        return td_err
+
+    def update_target(self):
+        return self.sess.run(self.update_target_expr)
+
+    def set_epsilon(self, epsilon):
+        self.cur_epsilon = epsilon
+
+    def get_state(self):
+        return [TFPolicyGraph.get_state(self), self.cur_epsilon]
+
+    def set_state(self, state):
+        TFPolicyGraph.set_state(self, state[0])
+        self.set_epsilon(state[1])
+
+
+def _postprocess_dqn(policy_graph, sample_batch):
+    obs, actions, rewards, new_obs, dones = [
+        list(x) for x in sample_batch.columns(
+            ["obs", "actions", "rewards", "new_obs", "dones"])]
+
+    # N-step Q adjustments
+    if policy_graph.config["n_step"] > 1:
+        adjust_nstep(
+            policy_graph.config["n_step"], policy_graph.config["gamma"],
+            obs, actions, rewards, new_obs, dones)
+
+    batch = SampleBatch({
+        "obs": obs, "actions": actions, "rewards": rewards,
+        "new_obs": new_obs, "dones": dones,
+        "weights": np.ones_like(rewards)})
+    assert batch.count == policy_graph.config["sample_batch_size"], \
+        (batch.count, policy_graph.config["sample_batch_size"])
+
+    # Prioritize on the worker side
+    if policy_graph.config["worker_side_prioritization"]:
+        td_errors = policy_graph.compute_td_error(
+            batch["obs"], batch["actions"], batch["rewards"],
+            batch["new_obs"], batch["dones"], batch["weights"])
+        new_priorities = (
+            np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
+        batch.data["weights"] = new_priorities
+
+    return batch


 def _build_q_network(registry, inputs, num_actions, config):
@@ -98,205 +325,3 @@ def _scope_vars(scope, trainable_only=False):
        tf.GraphKeys.TRAINABLE_VARIABLES
        if trainable_only else tf.GraphKeys.VARIABLES,
        scope=scope if isinstance(scope, str) else scope.name)
-
-
-class ModelAndLoss(object):
-    """Holds the model and loss function.
-
-    Both graphs are necessary in order for the multi-gpu SGD implementation
-    to create towers on each device.
-    """
-
-    def __init__(
-            self, registry, num_actions, config,
-            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
-        # q network evaluation
-        with tf.variable_scope("q_func", reuse=True):
-            self.q_t = _build_q_network(registry, obs_t, num_actions, config)
-
-        # target q network evalution
-        with tf.variable_scope("target_q_func") as scope:
-            self.q_tp1 = _build_q_network(
-                registry, obs_tp1, num_actions, config)
-            self.target_q_func_vars = _scope_vars(scope.name)
-
-        # q scores for actions which we know were selected in the given state.
-        q_t_selected = tf.reduce_sum(
-            self.q_t * tf.one_hot(act_t, num_actions), 1)
-
-        # compute estimate of best possible value starting from state at t + 1
-        if config["double_q"]:
-            with tf.variable_scope("q_func", reuse=True):
-                q_tp1_using_online_net = _build_q_network(
-                    registry, obs_tp1, num_actions, config)
-            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
-            q_tp1_best = tf.reduce_sum(
-                self.q_tp1 * tf.one_hot(
-                    q_tp1_best_using_online_net, num_actions), 1)
-        else:
-            q_tp1_best = tf.reduce_max(self.q_tp1, 1)
-        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
-
-        # compute RHS of bellman equation
-        q_t_selected_target = (
-            rew_t + config["gamma"] ** config["n_step"] * q_tp1_best_masked)
-
-        # compute the error (potentially clipped)
-        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
-        errors = _huber_loss(self.td_error)
-
-        weighted_error = tf.reduce_mean(importance_weights * errors)
-
-        self.loss = weighted_error
-
-
-class DQNGraph(object):
-    def __init__(self, registry, env, config, logdir):
-        self.env = env
-        num_actions = env.action_space.n
-        optimizer = tf.train.AdamOptimizer(learning_rate=config["lr"])
-
-        # Action inputs
-        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
-        self.eps = tf.placeholder(tf.float32, (), name="eps")
-        self.cur_observations = tf.placeholder(
-            tf.float32, shape=(None,) + env.observation_space.shape)
-
-        # Action Q network
-        q_scope_name = TOWER_SCOPE_NAME + "/q_func"
-        with tf.variable_scope(q_scope_name) as scope:
-            q_values = _build_q_network(
-                registry, self.cur_observations, num_actions, config)
-            q_func_vars = _scope_vars(scope.name)
-
-        # Action outputs
-        self.output_actions = _build_action_network(
-            q_values,
-            self.cur_observations,
-            num_actions,
-            self.stochastic,
-            self.eps)
-
-        # Replay inputs
-        self.obs_t = tf.placeholder(
-            tf.float32, shape=(None,) + env.observation_space.shape)
-        self.act_t = tf.placeholder(tf.int32, [None], name="action")
-        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
-        self.obs_tp1 = tf.placeholder(
-            tf.float32, shape=(None,) + env.observation_space.shape)
-        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
-        self.importance_weights = tf.placeholder(
-            tf.float32, [None], name="weight")
-
-        def build_loss(
-                obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
-            return ModelAndLoss(
-                registry,
-                num_actions, config,
-                obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights)
-
-        self.loss_inputs = [
-            ("obs", self.obs_t),
-            ("actions", self.act_t),
-            ("rewards", self.rew_t),
-            ("new_obs", self.obs_tp1),
-            ("dones", self.done_mask),
-            ("weights", self.importance_weights),
-        ]
-
-        with tf.variable_scope(TOWER_SCOPE_NAME):
-            loss_obj = build_loss(
-                self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
-                self.done_mask, self.importance_weights)
-
-        self.build_loss = build_loss
-
-        weighted_error = loss_obj.loss
-        target_q_func_vars = loss_obj.target_q_func_vars
-        self.q_t = loss_obj.q_t
-        self.q_tp1 = loss_obj.q_tp1
-        self.td_error = loss_obj.td_error
-
-        # compute optimization op (potentially with gradient clipping)
-        if config["grad_norm_clipping"] is not None:
-            self.grads_and_vars = _minimize_and_clip(
-                optimizer, weighted_error, var_list=q_func_vars,
-                clip_val=config["grad_norm_clipping"])
-        else:
-            self.grads_and_vars = optimizer.compute_gradients(
-                weighted_error, var_list=q_func_vars)
-        self.grads_and_vars = [
-            (g, v) for (g, v) in self.grads_and_vars if g is not None]
-        self.grads = [g for (g, v) in self.grads_and_vars]
-        self.train_expr = optimizer.apply_gradients(self.grads_and_vars)
-
-        # update_target_fn will be called periodically to copy Q network to
-        # target Q network
-        update_target_expr = []
-        for var, var_target in zip(
-            sorted(q_func_vars, key=lambda v: v.name),
-                sorted(target_q_func_vars, key=lambda v: v.name)):
-            update_target_expr.append(var_target.assign(var))
-        self.update_target_expr = tf.group(*update_target_expr)
-
-    def update_target(self, sess):
-        return sess.run(self.update_target_expr)
-
-    def act(self, sess, obs, eps, stochastic=True):
-        return sess.run(
-            self.output_actions,
-            feed_dict={
-                self.cur_observations: obs,
-                self.stochastic: stochastic,
-                self.eps: eps,
-            })
-
-    def compute_gradients(
-            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
-            importance_weights):
-        td_err, grads = sess.run(
-            [self.td_error, self.grads],
-            feed_dict={
-                self.obs_t: obs_t,
-                self.act_t: act_t,
-                self.rew_t: rew_t,
-                self.obs_tp1: obs_tp1,
-                self.done_mask: done_mask,
-                self.importance_weights: importance_weights
-            })
-        return td_err, grads
-
-    def compute_td_error(
-            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
-            importance_weights):
-        td_err = sess.run(
-            self.td_error,
-            feed_dict={
-                self.obs_t: [np.array(ob) for ob in obs_t],
-                self.act_t: act_t,
-                self.rew_t: rew_t,
-                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
-                self.done_mask: done_mask,
-                self.importance_weights: importance_weights
-            })
-        return td_err
-
-    def apply_gradients(self, sess, grads):
-        assert len(grads) == len(self.grads_and_vars)
-        feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)}
-        sess.run(self.train_expr, feed_dict=feed_dict)
-
-    def compute_apply(
-            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
-            importance_weights):
-        td_err, _ = sess.run(
-            [self.td_error, self.train_expr],
-            feed_dict={
-                self.obs_t: obs_t,
-                self.act_t: act_t,
-                self.rew_t: rew_t,
-                self.obs_tp1: obs_tp1,
-                self.done_mask: done_mask,
-                self.importance_weights: importance_weights
-            })
-        return td_err
@@ -35,8 +35,8 @@ class LSTM(Model):
            lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
        step_size = tf.shape(self.x)[:1]

-        c_init = np.zeros((1, lstm.state_size.c), np.float32)
-        h_init = np.zeros((1, lstm.state_size.h), np.float32)
+        c_init = np.zeros(lstm.state_size.c, np.float32)
+        h_init = np.zeros(lstm.state_size.h, np.float32)
        self.state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
@@ -7,18 +7,14 @@ import numpy as np
 import torch


-def convert_batch(trajectory, has_features=False):
+def convert_batch(trajectory):
    """Convert trajectory from numpy to PT variable"""
    states = torch.from_numpy(trajectory["obs"]).float()
    acs = torch.from_numpy(trajectory["actions"])
    advs = torch.from_numpy(
        trajectory["advantages"].copy()).float().reshape(-1)
    rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
-    if has_features:
-        features = [torch.from_numpy(f) for f in trajectory["features"]]
-    else:
-        features = trajectory["features"]
-    return states, acs, advs, rs, features
+    return states, acs, advs, rs


 def var_to_np(var):
@@ -43,7 +43,7 @@ class LocalSyncParallelOptimizer(object):
            processed.
        build_loss: Function that takes the specified inputs and returns an
            object with a 'loss' property that is a scalar Tensor. For example,
-            ray.rllib.ppo.ProximalPolicyLoss.
+            ray.rllib.ppo.ProximalPolicyGraph.
        logdir: Directory to place debugging output in.
        grad_norm_clipping: None or int stdev to clip grad norms by
    """
@@ -38,18 +38,24 @@ class PolicyOptimizer(object):

        Args:
            evaluator_cls (class): Python class of the evaluators to create.
-            evaluator_args (list): List of constructor args for the evaluators.
+            evaluator_args (list|dict): Constructor args for the evaluators.
            num_workers (int): Number of remote evaluators to create in
                addition to a local evaluator. This can be zero or greater.
            optimizer_config (dict): Keyword arguments to pass to the
                optimizer class constructor.
        """

-        local_evaluator = evaluator_cls(*evaluator_args)
        remote_cls = ray.remote(**evaluator_resources)(evaluator_cls)
-        remote_evaluators = [
-            remote_cls.remote(*evaluator_args)
-            for _ in range(num_workers)]
+        if isinstance(evaluator_args, list):
+            local_evaluator = evaluator_cls(*evaluator_args)
+            remote_evaluators = [
+                remote_cls.remote(*evaluator_args)
+                for _ in range(num_workers)]
+        else:
+            local_evaluator = evaluator_cls(**evaluator_args)
+            remote_evaluators = [
+                remote_cls.remote(**evaluator_args)
+                for _ in range(num_workers)]
        return cls(optimizer_config, local_evaluator, remote_evaluators)

    def __init__(self, config, local_evaluator, remote_evaluators):
@@ -2,17 +2,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import collections
 import numpy as np


-def arrayify(s):
-    if type(s) in [int, float, str, np.ndarray]:
-        return s
-    elif type(s) is list:
-        # recursive call to convert LazyFrames to arrays
-        return np.array([arrayify(x) for x in s])
-    else:
-        return np.array(s)
+class SampleBatchBuilder(object):
+    """Util to build a SampleBatch incrementally."""
+
+    def __init__(self):
+        self.buffers = collections.defaultdict(list)
+
+    def add_values(self, **values):
+        for k, v in values.items():
+            self.buffers[k].append(v)
+
+    def build(self):
+        return SampleBatch({k: np.array(v) for k, v in self.buffers.items()})


 class SampleBatch(object):
@@ -2,13 +2,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import numpy as np
-
-import ray
-from ray.rllib.optimizers import LocalSyncOptimizer
-from ray.rllib.pg.pg_evaluator import PGEvaluator
 from ray.rllib.agent import Agent
-from ray.tune.result import TrainingResult
+from ray.rllib.optimizers import LocalSyncOptimizer
+from ray.rllib.pg.pg_policy_graph import PGPolicyGraph
+from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
+    collect_metrics
 from ray.tune.trial import Resources


@@ -33,7 +31,6 @@ DEFAULT_CONFIG = {


 class PGAgent(Agent):
-
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
@@ -50,34 +47,28 @@ class PGAgent(Agent):

    def _init(self):
        self.optimizer = LocalSyncOptimizer.make(
-            evaluator_cls=PGEvaluator,
-            evaluator_args=[self.registry, self.env_creator, self.config],
+            evaluator_cls=CommonPolicyEvaluator,
+            evaluator_args={
+                "env_creator": self.env_creator,
+                "policy_graph": PGPolicyGraph,
+                "batch_steps": self.config["batch_size"],
+                "batch_mode": "truncate_episodes",
+                "registry": self.registry,
+                "model_config": self.config["model"],
+                "env_config": self.config["env_config"],
+                "policy_config": self.config,
+            },
            num_workers=self.config["num_workers"],
            optimizer_config=self.config["optimizer"])

    def _train(self):
        self.optimizer.step()
+        return collect_metrics(
+            self.optimizer.local_evaluator, self.optimizer.remote_evaluators)

-        episode_rewards = []
-        episode_lengths = []
-        metric_lists = [a.get_completed_rollout_metrics.remote()
-                        for a in self.optimizer.remote_evaluators]
-        for metrics in metric_lists:
-            for episode in ray.get(metrics):
-                episode_lengths.append(episode.episode_length)
-                episode_rewards.append(episode.episode_reward)
-        avg_reward = np.mean(episode_rewards)
-        avg_length = np.mean(episode_lengths)
-        timesteps = np.sum(episode_lengths)
-
-        result = TrainingResult(
-            episode_reward_mean=avg_reward,
-            episode_len_mean=avg_length,
-            timesteps_this_iter=timesteps,
-            info={})
-
-        return result
-
-    def compute_action(self, obs):
-        action, info = self.optimizer.local_evaluator.policy.compute(obs)
-        return action
+    def compute_action(self, observation, state=None):
+        """Compute one action from the local evaluator's policy graph."""
+        state = [] if state is None else state
+        return self.optimizer.local_evaluator.for_policy(
+            lambda p: p.compute_single_action(
+                observation, state, is_training=False)[0])
@@ -1,56 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.models.catalog import ModelCatalog
-from ray.rllib.optimizers import PolicyEvaluator
-from ray.rllib.pg.policy import PGPolicy
-from ray.rllib.utils.filter import NoFilter
-from ray.rllib.utils.process_rollout import process_rollout
-from ray.rllib.utils.sampler import SyncSampler
-
-
-class PGEvaluator(PolicyEvaluator):
-    """Evaluator for simple policy gradient."""
-
-    def __init__(self, registry, env_creator, config):
-        self.env = ModelCatalog.get_preprocessor_as_wrapper(
-            registry, env_creator(config["env_config"]), config["model"])
-        self.config = config
-
-        self.policy = PGPolicy(registry, self.env.observation_space,
-                               self.env.action_space, config)
-        self.sampler = SyncSampler(
-                        self.env, self.policy, NoFilter(),
-                        config["batch_size"], horizon=config["horizon"])
-
-    def sample(self):
-        rollout = self.sampler.get_data()
-        samples = process_rollout(
-                    rollout, NoFilter(),
-                    gamma=self.config["gamma"], use_gae=False)
-        return samples
-
-    def get_completed_rollout_metrics(self):
-        """Returns metrics on previously completed rollouts.
-
-        Calling this clears the queue of completed rollout metrics.
-        """
-        return self.sampler.get_metrics()
-
-    def compute_gradients(self, samples):
-        """ Returns gradient w.r.t. samples."""
-        gradient, info = self.policy.compute_gradients(samples)
-        return gradient, {}
-
-    def apply_gradients(self, grads):
-        """Applies gradients to evaluator weights."""
-        self.policy.apply_gradients(grads)
-
-    def get_weights(self):
-        """Returns model weights."""
-        return self.policy.get_weights()
-
-    def set_weights(self, weights):
-        """Sets model weights."""
-        return self.policy.set_weights(weights)
@@ -0,0 +1,45 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.utils.process_rollout import compute_advantages
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+
+
+class PGPolicyGraph(TFPolicyGraph):
+    """TensorFlow policy graph for the simple policy gradient (PG) agent."""
+    def __init__(self, obs_space, action_space, registry, config):
+        self.config = config
+
+        # setup policy: obs placeholder -> catalog model -> action dist
+        self.x = tf.placeholder(tf.float32, shape=[None]+list(obs_space.shape))
+        dist_class, self.logit_dim = ModelCatalog.get_action_dist(action_space)
+        self.model = ModelCatalog.get_model(
+            registry, self.x, self.logit_dim, options=self.config["model"])
+        self.dist = dist_class(self.model.outputs)  # logit for each action
+
+        # setup policy loss: -mean(logp(action) * advantage)
+        self.ac = ModelCatalog.get_action_placeholder(action_space)
+        self.adv = tf.placeholder(tf.float32, [None], name="adv")
+        self.loss = -tf.reduce_mean(self.dist.logp(self.ac) * self.adv)
+
+        # init TFPolicyGraph; assumes the caller set a default session
+        self.sess = tf.get_default_session()
+        self.loss_in = [
+            ("obs", self.x),
+            ("actions", self.ac),
+            ("advantages", self.adv),
+        ]
+        self.is_training = tf.placeholder_with_default(True, ())
+        TFPolicyGraph.__init__(
+            self, self.sess, obs_input=self.x,
+            action_sampler=self.dist.sample(), loss=self.loss,
+            loss_inputs=self.loss_in, is_training=self.is_training)
+        self.sess.run(tf.global_variables_initializer())
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+        return compute_advantages(
+            sample_batch, 0.0, self.config["gamma"], use_gae=False)
@@ -1,82 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-import ray
-from ray.rllib.models.catalog import ModelCatalog
-
-
-class PGPolicy():
-
-    other_output = []
-    is_recurrent = False
-
-    def __init__(self, registry, ob_space, ac_space, config):
-        self.config = config
-        self.registry = registry
-        with tf.variable_scope("local"):
-            self._setup_graph(ob_space, ac_space)
-        print("Setting up loss")
-        self._setup_loss(ac_space)
-        self._setup_gradients()
-        self.initialize()
-
-    def _setup_graph(self, ob_space, ac_space):
-        self.x = tf.placeholder(tf.float32, shape=[None]+list(ob_space.shape))
-        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
-        self.model = ModelCatalog.get_model(
-                        self.registry, self.x, self.logit_dim,
-                        options=self.config["model"])
-        self.action_logits = self.model.outputs  # logit for each action
-        self.dist = dist_class(self.action_logits)
-        self.sample = self.dist.sample()
-        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
-                                          tf.get_variable_scope().name)
-
-    def _setup_loss(self, action_space):
-        self.ac = ModelCatalog.get_action_placeholder(action_space)
-        self.adv = tf.placeholder(tf.float32, [None], name="adv")
-
-        log_prob = self.dist.logp(self.ac)
-
-        # policy loss
-        self.loss = -tf.reduce_mean(log_prob * self.adv)
-
-    def _setup_gradients(self):
-        self.grads = tf.gradients(self.loss, self.var_list)
-        grads_and_vars = list(zip(self.grads, self.var_list))
-        opt = tf.train.AdamOptimizer(self.config["lr"])
-        self._apply_gradients = opt.apply_gradients(grads_and_vars)
-
-    def initialize(self):
-        self.sess = tf.Session()
-        self.variables = ray.experimental.TensorFlowVariables(
-                            self.loss, self.sess)
-        self.sess.run(tf.global_variables_initializer())
-
-    def compute_gradients(self, samples):
-        info = {}
-        feed_dict = {
-            self.x: samples["obs"],
-            self.ac: samples["actions"],
-            self.adv: samples["advantages"],
-        }
-        self.grads = [g for g in self.grads if g is not None]
-        grad = self.sess.run(self.grads, feed_dict=feed_dict)
-        return grad, info
-
-    def apply_gradients(self, grads):
-        feed_dict = dict(zip(self.grads, grads))
-        self.sess.run(self._apply_gradients, feed_dict=feed_dict)
-
-    def get_weights(self):
-        return self.variables.get_weights()
-
-    def set_weights(self, weights):
-        self.variables.set_weights(weights)
-
-    def compute(self, ob, *args):
-        action = self.sess.run(self.sample, {self.x: [ob]})
-        return action[0], {}
@@ -7,7 +7,7 @@ import tensorflow as tf
 from ray.rllib.models import ModelCatalog


-class ProximalPolicyLoss(object):
+class ProximalPolicyGraph(object):

    other_output = ["vf_preds", "logprobs"]
    is_recurrent = False
@@ -82,11 +82,14 @@ class ProximalPolicyLoss(object):
            self.policy_results = [
                self.sampler, self.curr_logits, tf.constant("NA")]

-    def compute(self, observation):
+    def compute_single_action(self, observation, features, is_training=False):
        action, logprobs, vf = self.sess.run(
            self.policy_results,
            feed_dict={self.observations: [observation]})
-        return action[0], {"vf_preds": vf[0], "logprobs": logprobs[0]}
+        return action[0], [], {"vf_preds": vf[0], "logprobs": logprobs[0]}
+
+    def get_initial_state(self):
+        return []

    def loss(self):
        return self.loss
@@ -172,7 +172,7 @@ class PPOAgent(Agent):
            batch_index = 0
            num_batches = (
                int(tuples_per_device) // int(model.per_device_batch_size))
-            loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
+            loss, policy_graph, vf_loss, kl, entropy = [], [], [], [], []
            permutation = np.random.permutation(num_batches)
            # Prepare to drop into the debugger
            if self.iteration == config["tf_debug_iteration"]:
@@ -181,26 +181,26 @@ class PPOAgent(Agent):
                full_trace = (
                    i == 0 and self.iteration == 0 and
                    batch_index == config["full_trace_nth_sgd_batch"])
-                batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
+                batch_loss, batch_policy_graph, batch_vf_loss, batch_kl, \
                    batch_entropy = model.run_sgd_minibatch(
                        permutation[batch_index] * model.per_device_batch_size,
                        self.kl_coeff, full_trace,
                        self.file_writer)
                loss.append(batch_loss)
-                policy_loss.append(batch_policy_loss)
+                policy_graph.append(batch_policy_graph)
                vf_loss.append(batch_vf_loss)
                kl.append(batch_kl)
                entropy.append(batch_entropy)
                batch_index += 1
            loss = np.mean(loss)
-            policy_loss = np.mean(policy_loss)
+            policy_graph = np.mean(policy_graph)
            vf_loss = np.mean(vf_loss)
            kl = np.mean(kl)
            entropy = np.mean(entropy)
            sgd_end = time.time()
            print(
                "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
-                    i, loss, policy_loss, vf_loss, kl, entropy))
+                    i, loss, policy_graph, vf_loss, kl, entropy))

            values = []
            if i == config["num_sgd_iter"] - 1:
@@ -299,4 +299,5 @@ class PPOAgent(Agent):
    def compute_action(self, observation):
        observation = self.local_evaluator.obs_filter(
            observation, update=False)
-        return self.local_evaluator.common_policy.compute(observation)[0]
+        return self.local_evaluator.common_policy.compute_single_action(
+            observation, [], False)[0]
@@ -16,8 +16,8 @@ from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer
 from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.sampler import SyncSampler
 from ray.rllib.utils.filter import get_filter, MeanStdFilter
-from ray.rllib.utils.process_rollout import process_rollout
-from ray.rllib.ppo.loss import ProximalPolicyLoss
+from ray.rllib.utils.process_rollout import compute_advantages
+from ray.rllib.ppo.loss import ProximalPolicyGraph


 # TODO(rliaw): Move this onto LocalMultiGPUOptimizer
@@ -86,7 +86,7 @@ class PPOEvaluator(PolicyEvaluator):
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
-            return ProximalPolicyLoss(
+            return ProximalPolicyGraph(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
@@ -190,8 +190,9 @@ class PPOEvaluator(PolicyEvaluator):

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
-            samples = process_rollout(
-                rollout, self.rew_filter, self.config["gamma"],
+            last_r = 0.0  # note: not needed since we don't truncate rollouts
+            samples = compute_advantages(
+                rollout, last_r, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
@@ -17,18 +17,19 @@ def get_mean_action(alg, obs):
    return np.mean(out)


-ray.init()
+ray.init(num_cpus=10)

 CONFIGS = {
-    "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100},
+    "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
+           "num_workers": 2},
    "DQN": {},
-    "DDPG": {"noise_scale": 0.0},
-    "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000},
-    "A3C": {"use_lstm": False},
+    "DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
+    "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
+    "A3C": {"use_lstm": False, "num_workers": 1},
 }


-def test(use_object_store, alg_name):
+def test(use_object_store, alg_name, failures):
    cls = get_agent_class(alg_name)
    if alg_name == "DDPG":
        alg1 = cls(config=CONFIGS[name], env="Pendulum-v0")
@@ -55,12 +56,15 @@ def test(use_object_store, alg_name):
        a1 = get_mean_action(alg1, obs)
        a2 = get_mean_action(alg2, obs)
        print("Checking computed actions", alg1, obs, a1, a2)
-        assert abs(a1 - a2) < .1, (a1, a2)
+        if abs(a1 - a2) > .1:
+            failures.append((alg_name, [a1, a2]))


 if __name__ == "__main__":
+    failures = []
    for use_object_store in [False, True]:
        for name in ["ES", "DQN", "DDPG", "PPO", "A3C"]:
-            test(use_object_store, name)
+            test(use_object_store, name, failures)

+    assert not failures, failures
    print("All checkpoint restore tests passed!")
@@ -0,0 +1,133 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gym
+import time
+import unittest
+
+import ray
+from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
+from ray.rllib.utils.policy_graph import PolicyGraph
+from ray.rllib.utils.process_rollout import compute_advantages
+
+
+class MockPolicyGraph(PolicyGraph):
+    def compute_actions(self, obs_batch, state_batches, is_training=False):
+        return [0] * len(obs_batch), [], {}
+
+    def postprocess_trajectory(self, batch):
+        return compute_advantages(batch, 100.0, 0.9, use_gae=False)
+
+
+class TestCommonPolicyEvaluator(unittest.TestCase):
+    def testBasic(self):
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph)
+        batch = ev.sample()
+        for key in ["obs", "actions", "rewards", "dones", "advantages"]:
+            self.assertIn(key, batch)
+        self.assertGreater(batch["advantages"][0], 1)
+
+    def testPackEpisodes(self):
+        for batch_size in [1, 10, 100, 1000]:
+            ev = CommonPolicyEvaluator(
+                env_creator=lambda _: gym.make("CartPole-v0"),
+                policy_graph=MockPolicyGraph,
+                batch_steps=batch_size,
+                batch_mode="pack_episodes")
+            batch = ev.sample()
+            self.assertEqual(batch.count, batch_size)
+
+    def testTruncateEpisodes(self):
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph,
+            batch_steps=2,
+            batch_mode="truncate_episodes")
+        self.assertEqual(ev.sample().count, 2)
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph,
+            batch_steps=1000,
+            batch_mode="truncate_episodes")
+        # Sample from the second evaluator; the old batch has only 2 steps.
+        self.assertLess(ev.sample().count, 200)
+
+    def testCompleteEpisodes(self):
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph,
+            batch_steps=2,
+            batch_mode="complete_episodes")
+        batch = ev.sample()
+        self.assertGreater(batch.count, 2)
+        self.assertTrue(batch["dones"][-1])
+        batch = ev.sample()
+        self.assertGreater(batch.count, 2)
+        self.assertTrue(batch["dones"][-1])
+
+    def testFilterSync(self):
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph,
+            sample_async=True,
+            observation_filter="ConcurrentMeanStdFilter")
+        time.sleep(2)
+        ev.sample()
+        filters = ev.get_filters(flush_after=True)
+        obs_f = filters["obs_filter"]
+        self.assertNotEqual(obs_f.rs.n, 0)
+        self.assertNotEqual(obs_f.buffer.n, 0)
+
+    def testGetFilters(self):
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph,
+            sample_async=True,
+            observation_filter="ConcurrentMeanStdFilter")
+        self.sample_and_flush(ev)
+        filters = ev.get_filters(flush_after=False)
+        time.sleep(2)
+        filters2 = ev.get_filters(flush_after=False)
+        obs_f = filters["obs_filter"]
+        obs_f2 = filters2["obs_filter"]
+        self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
+        self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
+
+    def testSyncFilter(self):
+        ev = CommonPolicyEvaluator(
+            env_creator=lambda _: gym.make("CartPole-v0"),
+            policy_graph=MockPolicyGraph,
+            sample_async=True,
+            observation_filter="ConcurrentMeanStdFilter")
+        obs_f = self.sample_and_flush(ev)
+
+        # Current State
+        filters = ev.get_filters(flush_after=False)
+        obs_f = filters["obs_filter"]
+
+        self.assertLessEqual(obs_f.buffer.n, 20)
+
+        new_obsf = obs_f.copy()
+        new_obsf.rs._n = 100
+        ev.sync_filters({"obs_filter": new_obsf})
+        filters = ev.get_filters(flush_after=False)
+        obs_f = filters["obs_filter"]
+        self.assertGreaterEqual(obs_f.rs.n, 100)
+        self.assertLessEqual(obs_f.buffer.n, 20)
+
+    def sample_and_flush(self, ev):
+        time.sleep(2)
+        ev.sample()
+        filters = ev.get_filters(flush_after=True)
+        obs_f = filters["obs_filter"]
+        self.assertNotEqual(obs_f.rs.n, 0)
+        self.assertNotEqual(obs_f.buffer.n, 0)
+        return obs_f
+
+
+if __name__ == '__main__':
+    ray.init()
+    unittest.main(verbosity=2)
@@ -3,19 +3,11 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import gym
-import shutil
-import tempfile
-import time

-import ray
-from ray.rllib.a3c import DEFAULT_CONFIG
-from ray.rllib.a3c.a3c_evaluator import A3CEvaluator
-from ray.rllib.dqn.dqn_evaluator import adjust_nstep
-from ray.tune.registry import get_registry
+from ray.rllib.dqn.dqn_policy_graph import adjust_nstep


-class DQNEvaluatorTest(unittest.TestCase):
+class DQNTest(unittest.TestCase):
    def testNStep(self):
        obs = [1, 2, 3, 4, 5, 6, 7]
        actions = ["a", "b", "a", "a", "a", "b", "a"]
@@ -30,70 +22,5 @@ class DQNEvaluatorTest(unittest.TestCase):
        self.assertEqual(dones, [1, 0, 0, 0, 0])


-class A3CEvaluatorTest(unittest.TestCase):
-
-    def setUp(self):
-        ray.init(num_cpus=1)
-        config = DEFAULT_CONFIG.copy()
-        config["num_workers"] = 1
-        config["observation_filter"] = "ConcurrentMeanStdFilter"
-        config["reward_filter"] = "MeanStdFilter"
-        config["batch_size"] = 2
-        self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test")
-        self.e = A3CEvaluator(
-            get_registry(),
-            lambda config: gym.make("CartPole-v0"),
-            config,
-            logdir=self._temp_dir)
-
-    def tearDown(self):
-        ray.worker.cleanup()
-        shutil.rmtree(self._temp_dir)
-
-    def sample_and_flush(self):
-        e = self.e
-        time.sleep(2)
-        self.e.sample()
-        filters = e.get_filters(flush_after=True)
-        obs_f = filters["obs_filter"]
-        rew_f = filters["rew_filter"]
-        self.assertNotEqual(obs_f.rs.n, 0)
-        self.assertNotEqual(obs_f.buffer.n, 0)
-        self.assertNotEqual(rew_f.rs.n, 0)
-        self.assertNotEqual(rew_f.buffer.n, 0)
-        return obs_f, rew_f
-
-    def testGetFilters(self):
-        """Show `flush_after=False` provides does not affect the buffer."""
-        e = self.e
-        self.sample_and_flush()
-        filters = e.get_filters(flush_after=False)
-        obs_f = filters["obs_filter"]
-        filters2 = e.get_filters(flush_after=False)
-        obs_f2 = filters2["obs_filter"]
-        self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
-        self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
-
-    def testSyncFilter(self):
-        """Show that sync_filters rebases own buffer over input"""
-        e = self.e
-        obs_f, _ = self.sample_and_flush()
-
-        # Current State
-        filters = e.get_filters(flush_after=False)
-        obs_f = filters["obs_filter"]
-        rew_f = filters["rew_filter"]
-
-        self.assertLessEqual(obs_f.buffer.n, 20)
-
-        new_obsf = obs_f.copy()
-        new_obsf.rs._n = 100
-        e.sync_filters({"obs_filter": new_obsf, "rew_filter": rew_f})
-        filters = e.get_filters(flush_after=False)
-        obs_f = filters["obs_filter"]
-        self.assertGreaterEqual(obs_f.rs.n, 100)
-        self.assertLessEqual(obs_f.buffer.n, 20)
-
-
 if __name__ == '__main__':
    unittest.main(verbosity=2)
@@ -36,32 +36,6 @@ OBSERVATION_SPACES_TO_TEST = {
        Box(0.0, 1.0, (5,), dtype=np.float32)]),
 }

-# (alg, action_space, obs_space)
-KNOWN_FAILURES = [
-    # TODO(ekl) multiagent support for a3c
-    ("A3C", "implicit_tuple", "atari"),
-    ("A3C", "implicit_tuple", "atari_ram"),
-    ("A3C", "implicit_tuple", "discrete"),
-    ("A3C", "implicit_tuple", "image"),
-    ("A3C", "implicit_tuple", "mixed_tuple"),
-    ("A3C", "implicit_tuple", "simple_tuple"),
-    ("A3C", "implicit_tuple", "vector"),
-    ("A3C", "mixed_tuple", "atari"),
-    ("A3C", "mixed_tuple", "atari_ram"),
-    ("A3C", "mixed_tuple", "discrete"),
-    ("A3C", "mixed_tuple", "image"),
-    ("A3C", "mixed_tuple", "mixed_tuple"),
-    ("A3C", "mixed_tuple", "simple_tuple"),
-    ("A3C", "mixed_tuple", "vector"),
-    ("A3C", "simple_tuple", "atari"),
-    ("A3C", "simple_tuple", "atari_ram"),
-    ("A3C", "simple_tuple", "discrete"),
-    ("A3C", "simple_tuple", "image"),
-    ("A3C", "simple_tuple", "mixed_tuple"),
-    ("A3C", "simple_tuple", "simple_tuple"),
-    ("A3C", "simple_tuple", "vector"),
-]
-

 def make_stub_env(action_space, obs_space):
    class StubEnv(gym.Env):
@@ -135,19 +109,13 @@ class ModelSupportedSpaces(unittest.TestCase):
            {"num_workers": 1, "optimizer": {}},
            stats)
        num_unexpected_errors = 0
-        num_unexpected_success = 0
        for (alg, a_name, o_name), stat in sorted(stats.items()):
-            if stat in ["ok", "unsupported"]:
-                if (alg, a_name, o_name) in KNOWN_FAILURES:
-                    num_unexpected_success += 1
-            else:
-                if (alg, a_name, o_name) not in KNOWN_FAILURES:
-                    num_unexpected_errors += 1
+            if stat not in ["ok", "unsupported"]:
+                num_unexpected_errors += 1
            print(
                alg, "action_space", a_name, "obs_space", o_name,
                "result", stat)
        self.assertEqual(num_unexpected_errors, 0)
-        self.assertEqual(num_unexpected_success, 0)


 if __name__ == "__main__":
@@ -13,7 +13,6 @@ mountaincarcontinuous-ddpg:
        tau: 0.01
        l2_reg: 0.00001
        buffer_size: 50000
-        random_starts: False
        clip_rewards: False
        learning_starts: 1000
        #model:
@@ -6,6 +6,5 @@ pendulum-ddpg:
        episode_reward_mean: -160
    config:
        use_huber: True
-        random_starts: False
        clip_rewards: False
        exploration_fraction: 0.1
@@ -0,0 +1,10 @@
+cartpole-a3c:
+    env: CartPole-v0
+    run: A3C
+    stop:
+        episode_reward_mean: 200
+        time_total_s: 600
+    config:
+        num_workers: 1
+        gamma: 0.95
+        use_pytorch: true
@@ -5,5 +5,5 @@ cartpole-a3c:
        episode_reward_mean: 200
        time_total_s: 600
    config:
-        num_workers: 4
+        num_workers: 1
        gamma: 0.95
@@ -7,4 +7,3 @@ cartpole-dqn:
    config:
        n_step: 3
        gamma: 0.95
-        smoothing_num_episodes: 10
@@ -0,0 +1,8 @@
+cartpole-pg:
+    env: CartPole-v0
+    run: PG
+    stop:
+        episode_reward_mean: 200
+        time_total_s: 300
+    config:
+        num_workers: 1
@@ -6,7 +6,5 @@ pendulum-ddpg:
        time_total_s: 900
    config:
        use_huber: True
-        random_starts: False
        clip_rewards: False
        exploration_fraction: 0.1
-        smoothing_num_episodes: 10
@@ -0,0 +1,278 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+import numpy as np
+import tensorflow as tf
+
+import ray
+from ray.rllib.models import ModelCatalog
+from ray.rllib.optimizers.policy_evaluator import PolicyEvaluator
+from ray.rllib.utils.atari_wrappers import wrap_deepmind
+from ray.rllib.utils.compression import pack
+from ray.rllib.utils.filter import get_filter
+from ray.rllib.utils.sampler import AsyncSampler, SyncSampler
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+from ray.tune.registry import get_registry
+from ray.tune.result import TrainingResult
+
+
+def collect_metrics(local_evaluator, remote_evaluators):
+    """Gathers episode metrics from CommonPolicyEvaluator instances.
+
+    Arguments:
+        local_evaluator: Evaluator living in this process; its
+            `sampler.get_metrics()` is called directly.
+        remote_evaluators (list): Remote evaluator actors; their metrics are
+            fetched through `apply.remote()` and `ray.get()`.
+
+    Returns:
+        TrainingResult: min/max/mean episode reward, mean episode length,
+            number of episodes, and the sum of the episode lengths. The
+            reward and length statistics are NaN if no episode was reported.
+    """
+
+    metric_lists = ray.get(
+        [a.apply.remote(lambda ev: ev.sampler.get_metrics())
+         for a in remote_evaluators])
+    metric_lists.append(local_evaluator.sampler.get_metrics())
+
+    episode_rewards = []
+    episode_lengths = []
+    for metrics in metric_lists:
+        for episode in metrics:
+            episode_lengths.append(episode.episode_length)
+            episode_rewards.append(episode.episode_reward)
+
+    if episode_rewards:
+        min_reward = min(episode_rewards)
+        max_reward = max(episode_rewards)
+        avg_reward = np.mean(episode_rewards)
+        avg_length = np.mean(episode_lengths)
+    else:
+        # np.mean([]) would also produce NaN, but it emits a RuntimeWarning;
+        # report NaN directly so every statistic handles the empty case the
+        # same way.
+        min_reward = max_reward = float('nan')
+        avg_reward = avg_length = float('nan')
+    timesteps = np.sum(episode_lengths)
+
+    return TrainingResult(
+        episode_reward_max=max_reward,
+        episode_reward_min=min_reward,
+        episode_reward_mean=avg_reward,
+        episode_len_mean=avg_length,
+        episodes_total=len(episode_lengths),
+        timesteps_this_iter=timesteps)
+
+
+class CommonPolicyEvaluator(PolicyEvaluator):
+    """Policy evaluator implementation that operates on a rllib.PolicyGraph.
+
+    TODO: vector env
+    TODO: multi-agent
+    TODO: consumer buffering for multi-agent
+    TODO: complete episode batch mode
+
+    Examples:
+        # Create a policy evaluator and using it to collect experiences.
+        >>> evaluator = CommonPolicyEvaluator(
+              env_creator=lambda _: gym.make("CartPole-v0"),
+              policy_graph=PGPolicyGraph)
+        >>> print(evaluator.sample().keys())
+        {"obs": [[...]], "actions": [[...]], "rewards": [[...]],
+         "dones": [[...]], "new_obs": [[...]]}
+
+        # Creating policy evaluators using optimizer_cls.make().
+        >>> optimizer = LocalSyncOptimizer.make(
+              evaluator_cls=CommonPolicyEvaluator,
+              evaluator_args={
+                "env_creator": lambda _: gym.make("CartPole-v0"),
+                "policy_graph": PGPolicyGraph,
+              },
+              num_workers=10)
+        >>> for _ in range(10): optimizer.step()
+    """
+
+    @classmethod
+    def as_remote(cls, num_cpus=None, num_gpus=None):
+        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)
+
+    def __init__(
+            self,
+            env_creator,
+            policy_graph,
+            tf_session_creator=None,
+            batch_steps=100,
+            batch_mode="truncate_episodes",
+            preprocessor_pref="rllib",
+            sample_async=False,
+            compress_observations=False,
+            observation_filter="NoFilter",
+            registry=None,
+            env_config=None,
+            model_config=None,
+            policy_config=None):
+        """Initialize a policy evaluator.
+
+        Arguments:
+            env_creator (func): Function that returns a gym.Env given an
+                env config dict.
+            policy_graph (class): A class implementing rllib.PolicyGraph or
+                rllib.TFPolicyGraph.
+            tf_session_creator (func): A function that returns a TF session.
+                This is optional and only useful with TFPolicyGraph.
+            batch_steps (int): The target number of env transitions to include
+                in each sample batch returned from this evaluator.
+            batch_mode (str): One of the following choices:
+                complete_episodes: each batch will be at least batch_steps
+                    in size, and will include one or more complete episodes.
+                truncate_episodes: each batch will be around batch_steps
+                    in size, and include transitions from one episode only.
+                pack_episodes: each batch will be exactly batch_steps in
+                    size, and may include transitions from multiple episodes.
+            preprocessor_pref (str): Whether to prefer RLlib preprocessors
+                ("rllib") or deepmind ("deepmind") when applicable.
+            sample_async (bool): Whether to compute samples asynchronously in
+                the background, which improves throughput but can cause samples
+                to be slightly off-policy.
+            compress_observations (bool): If true, compress the observations
+                returned.
+            observation_filter (str): Name of observation filter to use.
+            registry (tune.Registry): User-registered objects. Pass in the
+                value from tune.registry.get_registry() if you're having
+                trouble resolving things like custom envs.
+            env_config (dict): Config to pass to the env creator.
+            model_config (dict): Config to use when creating the policy model.
+            policy_config (dict): Config to pass to the policy.
+
+        Raises:
+            ValueError: If batch_mode is not one of the choices listed above.
+            NotImplementedError: If the created env has a `vector_reset`
+                attribute (vector envs are not supported yet).
+        """
+
+        registry = registry or get_registry()
+        env_config = env_config or {}
+        policy_config = policy_config or {}
+        model_config = model_config or {}
+
+        # Validate explicitly instead of with `assert`, which is stripped
+        # when Python runs with -O.
+        if batch_mode not in [
+                "complete_episodes", "truncate_episodes", "pack_episodes"]:
+            raise ValueError(
+                "Unsupported batch mode: {}".format(batch_mode))
+        self.env_creator = env_creator
+        self.policy_graph = policy_graph
+        self.batch_steps = batch_steps
+        self.batch_mode = batch_mode
+        self.compress_observations = compress_observations
+
+        self.env = env_creator(env_config)
+        is_atari = hasattr(self.env.unwrapped, "ale")
+        if is_atari and "custom_preprocessor" not in model_config and \
+                preprocessor_pref == "deepmind":
+            self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80))
+        else:
+            self.env = ModelCatalog.get_preprocessor_as_wrapper(
+                registry, self.env, model_config)
+
+        self.vectorized = hasattr(self.env, "vector_reset")
+
+        if issubclass(policy_graph, TFPolicyGraph):
+            # Build TF policies in their own graph and session.
+            with tf.Graph().as_default():
+                if tf_session_creator:
+                    self.sess = tf_session_creator()
+                else:
+                    self.sess = tf.Session(config=tf.ConfigProto(
+                        gpu_options=tf.GPUOptions(allow_growth=True)))
+                with self.sess.as_default():
+                    policy = policy_graph(
+                        self.env.observation_space, self.env.action_space,
+                        registry, policy_config)
+        else:
+            policy = policy_graph(
+                self.env.observation_space, self.env.action_space,
+                registry, policy_config)
+        self.policy_map = {
+            "default": policy
+        }
+
+        self.obs_filter = get_filter(
+            observation_filter, self.env.observation_space.shape)
+        self.filters = {"obs_filter": self.obs_filter}
+
+        if self.vectorized:
+            raise NotImplementedError("Vector envs not yet supported")
+
+        # Named `pack_episodes` so the module-level `pack` compression helper
+        # is not shadowed inside this method.
+        pack_episodes = batch_mode == "pack_episodes"
+        if batch_mode == "complete_episodes":
+            # Effectively unbounded step count, so batches are only cut at
+            # episode ends. self.batch_steps keeps the caller's value.
+            batch_steps = 999999
+        sampler_cls = AsyncSampler if sample_async else SyncSampler
+        self.sampler = sampler_cls(
+            self.env, self.policy_map["default"], self.obs_filter,
+            batch_steps, pack=pack_episodes)
+        if sample_async:
+            # The async sampler collects in a background thread.
+            self.sampler.start()
+
+    def sample(self):
+        """Evaluate the current policies and return a batch of experiences.
+
+        Return:
+            SampleBatch from evaluating the current policies.
+        """
+
+        batch = self.policy_map["default"].postprocess_trajectory(
+            self.sampler.get_data())
+
+        if self.compress_observations:
+            batch["obs"] = [pack(o) for o in batch["obs"]]
+            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]
+
+        return batch
+
+    def apply(self, func):
+        """Apply the given function to this evaluator instance."""
+
+        return func(self)
+
+    def for_policy(self, func):
+        """Apply the given function to this evaluator's default policy."""
+
+        return func(self.policy_map["default"])
+
+    def sync_filters(self, new_filters):
+        """Changes self's filter to given and rebases any accumulated delta.
+
+        Args:
+            new_filters (dict): Filters with new state to update local copy.
+        """
+        assert all(k in new_filters for k in self.filters)
+        for k in self.filters:
+            self.filters[k].sync(new_filters[k])
+
+    def get_filters(self, flush_after=False):
+        """Returns a snapshot of filters.
+
+        Args:
+            flush_after (bool): Clears the filter buffer state.
+
+        Returns:
+            return_filters (dict): Dict for serializable filters
+        """
+        return_filters = {}
+        for k, f in self.filters.items():
+            return_filters[k] = f.as_serializable()
+            if flush_after:
+                f.clear_buffer()
+        return return_filters
+
+    def get_weights(self):
+        return self.policy_map["default"].get_weights()
+
+    def set_weights(self, weights):
+        return self.policy_map["default"].set_weights(weights)
+
+    def compute_gradients(self, samples):
+        return self.policy_map["default"].compute_gradients(samples)
+
+    def apply_gradients(self, grads):
+        return self.policy_map["default"].apply_gradients(grads)
+
+    def compute_apply(self, samples):
+        grad_fetch, apply_fetch = self.policy_map["default"].compute_apply(
+            samples)
+        return grad_fetch
+
+    def save(self):
+        filters = self.get_filters(flush_after=True)
+        state = self.policy_map["default"].get_state()
+        return pickle.dumps({"filters": filters, "state": state})
+
+    def restore(self, objs):
+        objs = pickle.loads(objs)
+        self.sync_filters(objs["filters"])
+        self.policy_map["default"].set_state(objs["state"])
@@ -0,0 +1,132 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class PolicyGraph(object):
+    """An agent policy and loss, i.e., a TFPolicyGraph or other subclass.
+
+    This object defines how to act in the environment, and also losses used to
+    improve the policy based on its experiences. Note that both policy and
+    loss are defined together for convenience, though the policy itself is
+    logically separate.
+
+    All policies can directly extend PolicyGraph, however TensorFlow users may
+    find TFPolicyGraph simpler to implement. TFPolicyGraph also enables RLlib
+    to apply TensorFlow-specific optimizations such as fusing multiple policy
+    graphs and multi-GPU support.
+    """
+
+    def __init__(self, registry, observation_space, action_space, config):
+        """Initialize the graph.
+
+        NOTE(review): CommonPolicyEvaluator instantiates policy graphs
+        positionally as (observation_space, action_space, registry, config),
+        which does not match the parameter order declared here -- confirm
+        which order subclasses are expected to follow.
+
+        Args:
+            registry (obj): Object registry for user-defined envs, models, etc.
+            observation_space (gym.Space): Observation space of the env.
+            action_space (gym.Space): Action space of the env.
+            config (dict): Policy-specific configuration data.
+        """
+        pass
+
+    def compute_actions(self, obs_batch, state_batches, is_training=False):
+        """Compute actions for the current policy.
+
+        Arguments:
+            obs_batch (np.ndarray): batch of observations
+            state_batches (list): list of RNN state input batches, if any
+            is_training (bool): whether we are training the policy
+
+        Returns:
+            actions (np.ndarray): batch of output actions, with shape like
+                [BATCH_SIZE, ACTION_SHAPE].
+            state_outs (list): list of RNN state output batches, if any, with
+                shape like [STATE_SIZE, BATCH_SIZE].
+            info (dict): dictionary of extra feature batches, if any, with
+                shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}.
+        """
+        raise NotImplementedError
+
+    def compute_single_action(self, obs, state, is_training=False):
+        """Unbatched version of compute_actions.
+
+        Wraps the inputs into batches of size one, calls compute_actions(),
+        and unwraps the first element of each returned batch.
+
+        Arguments:
+            obs (obj): single observation
+            state (list): list of RNN state inputs, if any
+            is_training (bool): whether we are training the policy
+
+        Returns:
+            actions (obj): single action
+            state_outs (list): list of RNN state outputs, if any
+            info (dict): dictionary of extra features, if any
+        """
+
+        [action], state_out, info = self.compute_actions(
+            [obs], [[s] for s in state], is_training)
+        return action, [s[0] for s in state_out], \
+            {k: v[0] for k, v in info.items()}
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+        """Implements algorithm-specific trajectory postprocessing.
+
+        Arguments:
+            sample_batch (SampleBatch): batch of experiences for the policy
+            other_agent_batches (dict): In a multi-agent env, this contains the
+                experience batches seen by other agents.
+
+        Returns:
+            SampleBatch: postprocessed sample batch.
+        """
+        return sample_batch
+
+    def compute_gradients(self, postprocessed_batch):
+        """Computes gradients against a batch of experiences.
+
+        Returns:
+            grads (list): List of gradient output values
+            info (dict): Extra policy-specific values
+        """
+        raise NotImplementedError
+
+    def apply_gradients(self, gradients):
+        """Applies previously computed gradients.
+
+        Returns:
+            info (dict): Extra policy-specific values
+        """
+        raise NotImplementedError
+
+    def get_weights(self):
+        """Returns model weights.
+
+        Returns:
+            weights (obj): Serializable copy or view of model weights
+        """
+        raise NotImplementedError
+
+    def set_weights(self, weights):
+        """Sets model weights.
+
+        Arguments:
+            weights (obj): Serializable copy or view of model weights
+        """
+        raise NotImplementedError
+
+    def get_initial_state(self):
+        """Returns initial RNN state for the current policy."""
+        return []
+
+    def get_state(self):
+        """Saves all local state.
+
+        Returns:
+            state (obj): Serialized local state.
+        """
+        return self.get_weights()
+
+    def set_state(self, state):
+        """Restores all local state.
+
+        Arguments:
+            state (obj): Serialized local state.
+        """
+        self.set_weights(state)
@@ -11,12 +11,12 @@ def discount(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]


-def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
+def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
    """Given a rollout, compute its value targets and the advantage.

    Args:
        rollout (PartialRollout): Partial Rollout Object
-        reward_filter (Filter): Filter for processing advantanges
+        last_r (float): Value estimation for last observation
        gamma (float): Parameter for GAE
        lambda_ (float): Parameter for GAE
        use_gae (bool): Using Generalized Advantage Estamation
@@ -32,21 +32,17 @@ def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):

    if use_gae:
        assert "vf_preds" in rollout, "Values not found!"
-        vpred_t = np.stack(rollout["vf_preds"] +
-                           [np.array(rollout.last_r)]).squeeze()
+        vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])])
        delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
        # This formula for the advantage comes
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj["advantages"] = discount(delta_t, gamma * lambda_)
        traj["value_targets"] = traj["advantages"] + traj["vf_preds"]
    else:
-        rewards_plus_v = np.stack(rollout["rewards"] +
-                                  [np.array(rollout.last_r)]).squeeze()
+        rewards_plus_v = np.concatenate(
+            [rollout["rewards"], np.array([last_r])])
        traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]

-    for i in range(traj["advantages"].shape[0]):
-        traj["advantages"][i] = reward_filter(traj["advantages"][i])
-
    traj["advantages"] = traj["advantages"].copy()

    assert all(val.shape[0] == trajsize for val in traj.values()), \
@@ -2,80 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import six.moves.queue as queue
-import threading
 from collections import namedtuple
 import numpy as np
+import six.moves.queue as queue
+import threading

-
-class PartialRollout(object):
-    """A piece of a complete rollout.
-
-    We run our agent, and process its experience once it has processed enough
-    steps.
-
-    Attributes:
-        data (dict): Stores rollout data. All numpy arrays other than
-            `observations` and `features` will be squeezed.
-        last_r (float): Value of next state. Used for bootstrapping.
-    """
-
-    fields = ["obs", "actions", "rewards", "new_obs", "dones", "features"]
-
-    def __init__(self, extra_fields=None):
-        """Initializers internals. Maintains a `last_r` field
-        in support of partial rollouts, used in bootstrapping advantage
-        estimation.
-
-        Args:
-            extra_fields: Optional field for object to keep track.
-        """
-        if extra_fields:
-            self.fields.extend(extra_fields)
-        self.data = {k: [] for k in self.fields}
-        self.last_r = 0.0
-
-    def add(self, **kwargs):
-        for k, v in kwargs.items():
-            self.data[k] += [v]
-
-    def extend(self, other_rollout):
-        """Extends internal data structure. Assumes other_rollout contains
-        data that occured afterwards."""
-
-        assert not self.is_terminal()
-        assert all(k in other_rollout.fields for k in self.fields)
-        for k, v in other_rollout.data.items():
-            self.data[k].extend(v)
-        self.last_r = other_rollout.last_r
-
-    def is_terminal(self):
-        """Check if terminal.
-
-        Returns:
-            terminal (bool): if rollout has terminated."""
-        return self.data["dones"][-1]
-
-    def __getitem__(self, key):
-        return self.data[key]
-
-    def __setitem__(self, key, item):
-        self.data[key] = item
-
-    def keys(self):
-        return self.data.keys()
-
-    def items(self):
-        return self.data.items()
-
-    def __iter__(self):
-        return self.data.__iter__()
-
-    def __next__(self):
-        return self.data.__next__()
-
-    def __contains__(self, x):
-        return x in self.data
+from ray.rllib.optimizers.sample_batch import SampleBatchBuilder


 CompletedRollout = namedtuple("CompletedRollout",
@@ -92,7 +24,9 @@ class SyncSampler(object):
    thread."""
    _async = False

-    def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None):
+    def __init__(
+            self, env, policy, obs_filter, num_local_steps, horizon=None,
+            pack=False):
        self.num_local_steps = num_local_steps
        self.horizon = horizon
        self.env = env
@@ -100,7 +34,7 @@ class SyncSampler(object):
        self._obs_filter = obs_filter
        self.rollout_provider = _env_runner(self.env, self.policy,
                                            self.num_local_steps, self.horizon,
-                                            self._obs_filter)
+                                            self._obs_filter, pack)
        self.metrics_queue = queue.Queue()

    def get_data(self):
@@ -128,7 +62,9 @@ class AsyncSampler(threading.Thread):
    accumulate and the gradient can be calculated on up to 5 batches."""
    _async = True

-    def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None):
+    def __init__(
+            self, env, policy, obs_filter, num_local_steps, horizon=None,
+            pack=False):
        assert getattr(
            obs_filter, "is_concurrent",
            False), ("Observation Filter must support concurrent updates.")
@@ -142,6 +78,7 @@ class AsyncSampler(threading.Thread):
        self._obs_filter = obs_filter
        self.started = False
        self.daemon = True
+        self.pack = pack

    def run(self):
        self.started = True
@@ -154,7 +91,7 @@ class AsyncSampler(threading.Thread):
    def _run(self):
        rollout_provider = _env_runner(self.env, self.policy,
                                       self.num_local_steps, self.horizon,
-                                       self._obs_filter)
+                                       self._obs_filter, self.pack)
        while True:
            # The timeout variable exists because apparently, if one worker
            # dies, the other workers won't die with it, unless the timeout is
@@ -169,18 +106,18 @@ class AsyncSampler(threading.Thread):
        """Gets currently accumulated data.

        Returns:
-            rollout (PartialRollout): trajectory data (unprocessed)
+            rollout (SampleBatch): trajectory data (unprocessed)
        """
        assert self.started, "Sampler never started running!"
        rollout = self.queue.get(timeout=600.0)
        if isinstance(rollout, BaseException):
            raise rollout
-        while not rollout.is_terminal():
+        while not rollout["dones"][-1]:
            try:
                part = self.queue.get_nowait()
                if isinstance(part, BaseException):
                    raise rollout
-                rollout.extend(part)
+                rollout = rollout.concat(part)
            except queue.Empty:
                break
        return rollout
@@ -195,7 +132,7 @@ class AsyncSampler(threading.Thread):
        return completed


-def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
+def _env_runner(env, policy, num_local_steps, horizon, obs_filter, pack):
    """This implements the logic of the thread runner.

    It continually runs the policy, and as long as the rollout exceeds a
@@ -206,12 +143,16 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
    Args:
        env: Environment generated by env_creator
        policy: Policy used to interact with environment. Also sets fields
-            to be included in `PartialRollout`
-        num_local_steps: Number of steps before `PartialRollout` is yielded.
+            to be included in `SampleBatch`
+        num_local_steps: Number of steps before `SampleBatch` is yielded. Set
+            to infinity to yield complete episodes.
+        horizon: Horizon of the episode.
        obs_filter: Filter used to process observations.
+        pack: Whether to pack multiple episodes into each batch. This
+            guarantees batches will be exactly `num_local_steps` in size.

    Yields:
-        rollout (PartialRollout): Object containing state, action, reward,
+        rollout (SampleBatch): Object containing state, action, reward,
            terminal condition, and other fields as dictated by `policy`.
    """
    last_observation = obs_filter(env.reset())
@@ -221,24 +162,23 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
        print("Warning, no horizon specified, assuming infinite")
    if not horizon:
        horizon = 999999
-    if hasattr(policy, "get_initial_features"):
-        last_features = policy.get_initial_features()
-    else:
-        last_features = []
+    last_features = policy.get_initial_state()
    features = last_features
    length = 0
    rewards = 0
    rollout_number = 0

    while True:
-        terminal_end = False
-        rollout = PartialRollout(extra_fields=policy.other_output)
+        batch_builder = SampleBatchBuilder()

        for _ in range(num_local_steps):
-            action, pi_info = policy.compute(last_observation, *last_features)
-            if policy.is_recurrent:
-                features = pi_info["features"]
-                del pi_info["features"]
+            # Assume batch size one for now
+            action, features, pi_info = policy.compute_single_action(
+                last_observation, last_features, is_training=True)
+            for i, state_value in enumerate(last_features):
+                pi_info["state_in_{}".format(i)] = state_value
+            for i, state_value in enumerate(features):
+                pi_info["state_out_{}".format(i)] = state_value
            observation, reward, terminal, info = env.step(action)
            observation = obs_filter(observation)

@@ -252,12 +192,11 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
                action = np.concatenate(action, axis=0).flatten()

            # Collect the experience.
-            rollout.add(
+            batch_builder.add_values(
                obs=last_observation,
                actions=action,
                rewards=reward,
                dones=terminal,
-                features=last_features,
                new_obs=observation,
                **pi_info)

@@ -265,24 +204,18 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
            last_features = features

            if terminal:
-                terminal_end = True
                yield CompletedRollout(length, rewards)

-                if (length >= horizon
-                        or not env.metadata.get("semantics.autoreset")):
+                if (length >= horizon or
+                        not env.metadata.get("semantics.autoreset")):
                    last_observation = obs_filter(env.reset())
-                    if hasattr(policy, "get_initial_features"):
-                        last_features = policy.get_initial_features()
-                    else:
-                        last_features = []
+                    last_features = policy.get_initial_state()
                    rollout_number += 1
                    length = 0
                    rewards = 0
-                    break
-
-        if not terminal_end:
-            rollout.last_r = policy.value(last_observation, *last_features)
+                    if not pack:
+                        break

        # Once we have enough experience, yield it, and have the ThreadRunner
        # place it on a queue.
-        yield rollout
+        yield batch_builder.build()
@@ -0,0 +1,152 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+import ray
+from ray.rllib.utils.policy_graph import PolicyGraph
+
+
+class TFPolicyGraph(PolicyGraph):
+    """An agent policy and loss implemented in TensorFlow.
+
+    Extending this class enables RLlib to perform TensorFlow specific
+    optimizations on the policy graph, e.g., parallelization across gpus or
+    fusing multiple graphs together in the multi-agent setting.
+
+    All input and output tensors are of shape [BATCH_DIM, ...].
+
+    Examples:
+        >>> policy = TFPolicyGraphSubclass(
+            sess, obs_input, action_sampler, loss, loss_inputs, is_training)
+
+        >>> print(policy.compute_actions([1, 0, 2]))
+        (array([0, 1, 1]), [], {})
+
+        >>> print(policy.postprocess_trajectory(SampleBatch({...})))
+        SampleBatch({"action": ..., "advantages": ..., ...})
+    """
+
+    def __init__(
+            self, sess, obs_input, action_sampler, loss, loss_inputs,
+            is_training, state_inputs=None, state_outputs=None):
+        """Initialize the policy.
+
+        Arguments:
+            sess (Session): TensorFlow session used to run the graph.
+            obs_input (Tensor): input placeholder for observations.
+            action_sampler (Tensor): Tensor for sampling an action.
+            loss (Tensor): scalar policy loss output tensor.
+            loss_inputs (list): a (name, placeholder) tuple for each loss
+                input argument. Each placeholder name must correspond to a
+                SampleBatch column key returned by postprocess_trajectory().
+            is_training (Tensor): input placeholder for whether we are
+                currently training the policy.
+            state_inputs (list): list of RNN state input placeholders, fed
+                with the state batches passed to compute_actions().
+            state_outputs (list): list of RNN state output Tensors, fetched
+                by compute_actions().
+        """
+
+        self._sess = sess
+        self._obs_input = obs_input
+        self._sampler = action_sampler
+        self._loss = loss
+        self._loss_inputs = loss_inputs
+        self._is_training = is_training
+        self._state_inputs = state_inputs or []
+        self._state_outputs = state_outputs or []
+        # optimizer() and gradients() are overridable hooks defined on this
+        # class; they are called here, after the attributes above are set.
+        self._optimizer = self.optimizer()
+        self._grads_and_vars = self.gradients(self._optimizer)
+        self._grads = [g for (g, v) in self._grads_and_vars]
+        self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
+        self._variables = ray.experimental.TensorFlowVariables(
+            self._loss, self._sess)
+
+        # Every RNN state input needs a matching output and initial value.
+        assert len(self._state_inputs) == len(self._state_outputs) == \
+            len(self.get_initial_state())
+
+    def compute_actions(
+            self, obs_batch, state_batches=None, is_training=False):
+        state_batches = state_batches or []
+        assert len(self._state_inputs) == len(state_batches), \
+            (self._state_inputs, state_batches)
+        feed_dict = self.extra_compute_action_feed_dict()
+        feed_dict[self._obs_input] = obs_batch
+        feed_dict[self._is_training] = is_training
+        for ph, value in zip(self._state_inputs, state_batches):
+            feed_dict[ph] = value
+        fetches = self._sess.run(
+            ([self._sampler] + self._state_outputs +
+             [self.extra_compute_action_fetches()]), feed_dict=feed_dict)
+        return fetches[0], fetches[1:-1], fetches[-1]
+
+    def _get_loss_inputs_dict(self, postprocessed_batch):
+        feed_dict = {}
+        for key, ph in self._loss_inputs:
+            # TODO(ekl) fix up handling of RNN inputs so that we can batch
+            # across multiple rollouts
+            if key.startswith("state_in_"):
+                feed_dict[ph] = postprocessed_batch[key][:1]  # in state only
+            else:
+                feed_dict[ph] = postprocessed_batch[key]
+        return feed_dict
+
+    def compute_gradients(self, postprocessed_batch):
+        feed_dict = self.extra_compute_grad_feed_dict()
+        feed_dict[self._is_training] = True
+        feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch))
+        fetches = self._sess.run(
+            [self._grads, self.extra_compute_grad_fetches()],
+            feed_dict=feed_dict)
+        return fetches[0], fetches[1]
+
+    def apply_gradients(self, gradients):
+        """Applies previously computed gradients.
+
+        Arguments:
+            gradients (list): gradient values, one per entry of self._grads
+                and in the same order.
+
+        Returns:
+            info (dict): values fetched for extra_apply_grad_fetches().
+        """
+        assert len(gradients) == len(self._grads), (gradients, self._grads)
+        feed_dict = self.extra_apply_grad_feed_dict()
+        feed_dict[self._is_training] = True
+        # Feed the supplied values in place of the gradient tensors.
+        for ph, value in zip(self._grads, gradients):
+            feed_dict[ph] = value
+        # The session is stored as `self._sess` in __init__; `self.sess` does
+        # not exist on this class and raised AttributeError.
+        fetches = self._sess.run(
+            [self._apply_op, self.extra_apply_grad_fetches()],
+            feed_dict=feed_dict)
+        return fetches[1]
+
+    def compute_apply(self, postprocessed_batch):
+        feed_dict = self.extra_compute_grad_feed_dict()
+        feed_dict.update(self.extra_apply_grad_feed_dict())
+        feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch))
+        feed_dict[self._is_training] = True
+        fetches = self._sess.run(
+            [self._apply_op, self.extra_compute_grad_fetches(),
+             self.extra_apply_grad_fetches()],
+            feed_dict=feed_dict)
+        return fetches[1], fetches[2]
+
+    def get_weights(self):
+        return self._variables.get_flat()
+
+    def set_weights(self, weights):
+        return self._variables.set_flat(weights)
+
+    def extra_compute_action_feed_dict(self):
+        return {}
+
+    def extra_compute_action_fetches(self):
+        return {}  # e.g, value function
+
+    def extra_compute_grad_feed_dict(self):
+        return {}  # e.g, kl_coeff
+
+    def extra_compute_grad_fetches(self):
+        return {}  # e.g, td error
+
+    def extra_apply_grad_feed_dict(self):
+        return {}
+
+    def extra_apply_grad_fetches(self):
+        return {}  # e.g., batch norm updates
+
+    def optimizer(self):
+        return tf.train.AdamOptimizer()
+
+    def gradients(self, optimizer):
+        return optimizer.compute_gradients(self._loss)
@@ -31,6 +31,12 @@ TrainingResult = namedtuple(
        # (Optional) The mean episode reward if applicable.
        "episode_reward_mean",

+        # (Optional) The min episode reward if applicable.
+        "episode_reward_min",
+
+        # (Optional) The max episode reward if applicable.
+        "episode_reward_max",
+
        # (Optional) The mean episode length if applicable.
        "episode_len_mean",