[rllib] Try moving RLlib to top level dir (#5324)

2026-07-04 11:54:54 +08:00 · 2019-08-05 23:25:49 -07:00
parent 384cbfb211
commit 5d7afe8092
309 changed files with 240 additions and 234 deletions
@@ -0,0 +1 @@
+../../rllib
@@ -1,25 +0,0 @@
-RLlib: Scalable Reinforcement Learning
-======================================
-
-RLlib is an open-source library for reinforcement learning that offers both high scalability and a unified API for a variety of applications.
-
-For an overview of RLlib, see the [documentation](http://ray.readthedocs.io/en/latest/rllib.html).
-
-If you've found RLlib useful for your research, you can cite the [paper](https://arxiv.org/abs/1712.09381) as follows:
-
-```
-@inproceedings{liang2018rllib,
-    Author = {Eric Liang and
-              Richard Liaw and
-              Robert Nishihara and
-              Philipp Moritz and
-              Roy Fox and
-              Ken Goldberg and
-              Joseph E. Gonzalez and
-              Michael I. Jordan and
-              Ion Stoica},
-    Title = {{RLlib}: Abstractions for Distributed Reinforcement Learning},
-    Booktitle = {International Conference on Machine Learning ({ICML})},
-    Year = {2018}
-}
-```
@@ -1,65 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-import sys
-
-# Note: do not introduce unnecessary library dependencies here, e.g. gym.
-# This file is imported from the tune module in order to register RLlib agents.
-from ray.tune.registry import register_trainable
-
-from ray.rllib.evaluation.policy_graph import PolicyGraph
-from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
-from ray.rllib.evaluation.rollout_worker import RolloutWorker
-from ray.rllib.env.base_env import BaseEnv
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.rllib.env.vector_env import VectorEnv
-from ray.rllib.env.external_env import ExternalEnv
-from ray.rllib.policy.policy import Policy
-from ray.rllib.policy.tf_policy import TFPolicy
-from ray.rllib.policy.sample_batch import SampleBatch
-
-
-def _setup_logger():
-    logger = logging.getLogger("ray.rllib")
-    handler = logging.StreamHandler()
-    handler.setFormatter(
-        logging.Formatter(
-            "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
-        ))
-    logger.addHandler(handler)
-    logger.propagate = False
-
-    if sys.version_info[0] < 3:
-        logger.warn(
-            "RLlib Python 2 support is deprecated, and will be removed "
-            "in a future release.")
-
-
-def _register_all():
-
-    from ray.rllib.agents.registry import ALGORITHMS
-    from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS
-    for key in list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys(
-    )) + ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]:
-        from ray.rllib.agents.registry import get_agent_class
-        register_trainable(key, get_agent_class(key))
-
-
-_setup_logger()
-_register_all()
-
-__all__ = [
-    "Policy",
-    "PolicyGraph",
-    "TFPolicy",
-    "TFPolicyGraph",
-    "RolloutWorker",
-    "PolicyEvaluator",
-    "SampleBatch",
-    "BaseEnv",
-    "MultiAgentEnv",
-    "VectorEnv",
-    "ExternalEnv",
-]
@@ -1,4 +0,0 @@
-from ray.rllib.agents.trainer import Trainer, with_common_config
-from ray.rllib.agents.agent import Agent
-
-__all__ = ["Agent", "Trainer", "with_common_config"]
@@ -1,10 +0,0 @@
-from ray.rllib.agents.a3c.a3c import A3CTrainer, DEFAULT_CONFIG
-from ray.rllib.agents.a3c.a2c import A2CTrainer
-from ray.rllib.utils import renamed_agent
-
-A2CAgent = renamed_agent(A2CTrainer)
-A3CAgent = renamed_agent(A3CTrainer)
-
-__all__ = [
-    "A2CAgent", "A3CAgent", "A2CTrainer", "A3CTrainer", "DEFAULT_CONFIG"
-]
@@ -1,25 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG as A3C_CONFIG, \
-    validate_config, get_policy_class
-from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.utils import merge_dicts
-
-A2C_DEFAULT_CONFIG = merge_dicts(
-    A3C_CONFIG,
-    {
-        "sample_batch_size": 20,
-        "min_iter_time_s": 10,
-        "sample_async": False,
-    },
-)
-
-A2CTrainer = build_trainer(
-    name="A2C",
-    default_config=A2C_DEFAULT_CONFIG,
-    default_policy=A3CTFPolicy,
-    get_policy_class=get_policy_class,
-    validate_config=validate_config)
@@ -1,67 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.optimizers import AsyncGradientsOptimizer
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # Size of rollout batch
-    "sample_batch_size": 10,
-    # Use PyTorch as backend - no LSTM support
-    "use_pytorch": False,
-    # GAE(gamma) parameter
-    "lambda": 1.0,
-    # Max global norm for each gradient calculated by worker
-    "grad_clip": 40.0,
-    # Learning rate
-    "lr": 0.0001,
-    # Learning rate schedule
-    "lr_schedule": None,
-    # Value Function Loss coefficient
-    "vf_loss_coeff": 0.5,
-    # Entropy coefficient
-    "entropy_coeff": 0.01,
-    # Min time per iteration
-    "min_iter_time_s": 5,
-    # Workers sample async. Note that this increases the effective
-    # sample_batch_size by up to 5x due to async buffering of batches.
-    "sample_async": True,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def get_policy_class(config):
-    if config["use_pytorch"]:
-        from ray.rllib.agents.a3c.a3c_torch_policy import \
-            A3CTorchPolicy
-        return A3CTorchPolicy
-    else:
-        return A3CTFPolicy
-
-
-def validate_config(config):
-    if config["entropy_coeff"] < 0:
-        raise DeprecationWarning("entropy_coeff must be >= 0")
-    if config["sample_async"] and config["use_pytorch"]:
-        raise ValueError(
-            "The sample_async option is not supported with use_pytorch: "
-            "Multithreading can be lead to crashes if used with pytorch.")
-
-
-def make_async_optimizer(workers, config):
-    return AsyncGradientsOptimizer(workers, **config["optimizer"])
-
-
-A3CTrainer = build_trainer(
-    name="A3C",
-    default_config=DEFAULT_CONFIG,
-    default_policy=A3CTFPolicy,
-    get_policy_class=get_policy_class,
-    validate_config=validate_config,
-    make_policy_optimizer=make_async_optimizer)
@@ -1,133 +0,0 @@
-"""Note: Keep in sync with changes to VTraceTFPolicy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils.explained_variance import explained_variance
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.tf_policy import LearningRateSchedule
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-class A3CLoss(object):
-    def __init__(self,
-                 action_dist,
-                 actions,
-                 advantages,
-                 v_target,
-                 vf,
-                 vf_loss_coeff=0.5,
-                 entropy_coeff=0.01):
-        log_prob = action_dist.logp(actions)
-
-        # The "policy gradients" loss
-        self.pi_loss = -tf.reduce_sum(log_prob * advantages)
-
-        delta = vf - v_target
-        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
-        self.entropy = tf.reduce_sum(action_dist.entropy())
-        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
-                           self.entropy * entropy_coeff)
-
-
-def actor_critic_loss(policy, batch_tensors):
-    policy.loss = A3CLoss(
-        policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
-        batch_tensors[Postprocessing.ADVANTAGES],
-        batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
-        policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
-    return policy.loss.total_loss
-
-
-def postprocess_advantages(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    completed = sample_batch[SampleBatch.DONES][-1]
-    if completed:
-        last_r = 0.0
-    else:
-        next_state = []
-        for i in range(len(policy.state_in)):
-            next_state.append([sample_batch["state_out_{}".format(i)][-1]])
-        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
-                               sample_batch[SampleBatch.ACTIONS][-1],
-                               sample_batch[SampleBatch.REWARDS][-1],
-                               *next_state)
-    return compute_advantages(sample_batch, last_r, policy.config["gamma"],
-                              policy.config["lambda"])
-
-
-def add_value_function_fetch(policy):
-    return {SampleBatch.VF_PREDS: policy.vf}
-
-
-class ValueNetworkMixin(object):
-    def __init__(self):
-        self.vf = self.model.value_function()
-
-    def _value(self, ob, prev_action, prev_reward, *args):
-        feed_dict = {
-            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
-            self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action],
-            self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward],
-            self.seq_lens: [1]
-        }
-        assert len(args) == len(self.state_in), \
-            (args, self.state_in)
-        for k, v in zip(self.state_in, args):
-            feed_dict[k] = v
-        vf = self.get_session().run(self.vf, feed_dict)
-        return vf[0]
-
-
-def stats(policy, batch_tensors):
-    return {
-        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
-        "policy_loss": policy.loss.pi_loss,
-        "policy_entropy": policy.loss.entropy,
-        "var_gnorm": tf.global_norm([x for x in policy.var_list]),
-        "vf_loss": policy.loss.vf_loss,
-    }
-
-
-def grad_stats(policy, grads):
-    return {
-        "grad_gnorm": tf.global_norm(grads),
-        "vf_explained_var": explained_variance(
-            policy.get_placeholder(Postprocessing.VALUE_TARGETS), policy.vf),
-    }
-
-
-def clip_gradients(policy, optimizer, loss):
-    grads = tf.gradients(loss, policy.var_list)
-    grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
-    clipped_grads = list(zip(grads, policy.var_list))
-    return clipped_grads
-
-
-def setup_mixins(policy, obs_space, action_space, config):
-    ValueNetworkMixin.__init__(policy)
-    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
-    policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
-                                        tf.get_variable_scope().name)
-
-
-A3CTFPolicy = build_tf_policy(
-    name="A3CTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
-    loss_fn=actor_critic_loss,
-    stats_fn=stats,
-    grad_stats_fn=grad_stats,
-    gradients_fn=clip_gradients,
-    postprocess_fn=postprocess_advantages,
-    extra_action_fetches_fn=add_value_function_fetch,
-    before_loss_init=setup_mixins,
-    mixins=[ValueNetworkMixin, LearningRateSchedule])
@@ -1,91 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-import ray
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.torch_policy_template import build_torch_policy
-
-
-def actor_critic_loss(policy, batch_tensors):
-    logits, _ = policy.model({
-        SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
-    })  # TODO(ekl) seq lens shouldn't be None
-    values = policy.model.value_function()
-    dist = policy.dist_class(logits)
-    log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])
-    policy.entropy = dist.entropy().mean()
-    policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot(
-        log_probs.reshape(-1))
-    policy.value_err = F.mse_loss(
-        values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS])
-    overall_err = sum([
-        policy.pi_err,
-        policy.config["vf_loss_coeff"] * policy.value_err,
-        -policy.config["entropy_coeff"] * policy.entropy,
-    ])
-    return overall_err
-
-
-def loss_and_entropy_stats(policy, batch_tensors):
-    return {
-        "policy_entropy": policy.entropy.item(),
-        "policy_loss": policy.pi_err.item(),
-        "vf_loss": policy.value_err.item(),
-    }
-
-
-def add_advantages(policy,
-                   sample_batch,
-                   other_agent_batches=None,
-                   episode=None):
-    completed = sample_batch[SampleBatch.DONES][-1]
-    if completed:
-        last_r = 0.0
-    else:
-        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1])
-    return compute_advantages(sample_batch, last_r, policy.config["gamma"],
-                              policy.config["lambda"])
-
-
-def model_value_predictions(policy, input_dict, state_batches, model):
-    return {SampleBatch.VF_PREDS: model.value_function().cpu().numpy()}
-
-
-def apply_grad_clipping(policy):
-    info = {}
-    if policy.config["grad_clip"]:
-        total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(),
-                                              policy.config["grad_clip"])
-        info["grad_gnorm"] = total_norm
-    return info
-
-
-def torch_optimizer(policy, config):
-    return torch.optim.Adam(policy.model.parameters(), lr=config["lr"])
-
-
-class ValueNetworkMixin(object):
-    def _value(self, obs):
-        with self.lock:
-            obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device)
-            _ = self.model({"obs": obs}, [], [1])
-            return self.model.value_function().detach().cpu().numpy().squeeze()
-
-
-A3CTorchPolicy = build_torch_policy(
-    name="A3CTorchPolicy",
-    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
-    loss_fn=actor_critic_loss,
-    stats_fn=loss_and_entropy_stats,
-    postprocess_fn=add_advantages,
-    extra_action_out_fn=model_value_predictions,
-    extra_grad_process_fn=apply_grad_clipping,
-    optimizer_fn=torch_optimizer,
-    mixins=[ValueNetworkMixin])
@@ -1,8 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.trainer import Trainer
-from ray.rllib.utils import renamed_agent
-
-Agent = renamed_agent(Trainer)
@@ -1,6 +0,0 @@
-from ray.rllib.agents.ars.ars import (ARSTrainer, DEFAULT_CONFIG)
-from ray.rllib.utils import renamed_agent
-
-ARSAgent = renamed_agent(ARSTrainer)
-
-__all__ = ["ARSAgent", "ARSTrainer", "DEFAULT_CONFIG"]
@@ -1,340 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter and from
-# https://github.com/modestyachts/ARS
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import namedtuple
-import logging
-import numpy as np
-import time
-
-import ray
-from ray.rllib.agents import Trainer, with_common_config
-
-from ray.rllib.agents.ars import optimizers
-from ray.rllib.agents.ars import policies
-from ray.rllib.agents.ars import utils
-from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils.memory import ray_get_and_free
-from ray.rllib.utils import FilterManager
-
-logger = logging.getLogger(__name__)
-
-Result = namedtuple("Result", [
-    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
-    "eval_returns", "eval_lengths"
-])
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    "noise_stdev": 0.02,  # std deviation of parameter noise
-    "num_rollouts": 32,  # number of perturbs to try
-    "rollouts_used": 32,  # number of perturbs to keep in gradient estimate
-    "num_workers": 2,
-    "sgd_stepsize": 0.01,  # sgd step-size
-    "observation_filter": "MeanStdFilter",
-    "noise_size": 250000000,
-    "eval_prob": 0.03,  # probability of evaluating the parameter rewards
-    "report_length": 10,  # how many of the last rewards we average over
-    "offset": 0,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-@ray.remote
-def create_shared_noise(count):
-    """Create a large array of noise to be shared by all workers."""
-    seed = 123
-    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
-    return noise
-
-
-class SharedNoiseTable(object):
-    def __init__(self, noise):
-        self.noise = noise
-        assert self.noise.dtype == np.float32
-
-    def get(self, i, dim):
-        return self.noise[i:i + dim]
-
-    def sample_index(self, dim):
-        return np.random.randint(0, len(self.noise) - dim + 1)
-
-    def get_delta(self, dim):
-        idx = self.sample_index(dim)
-        return idx, self.get(idx, dim)
-
-
-@ray.remote
-class Worker(object):
-    def __init__(self, config, env_creator, noise, min_task_runtime=0.2):
-        self.min_task_runtime = min_task_runtime
-        self.config = config
-        self.noise = SharedNoiseTable(noise)
-
-        self.env = env_creator(config["env_config"])
-        from ray.rllib import models
-        self.preprocessor = models.ModelCatalog.get_preprocessor(self.env)
-
-        self.sess = utils.make_session(single_threaded=True)
-        self.policy = policies.GenericPolicy(
-            self.sess, self.env.action_space, self.env.observation_space,
-            self.preprocessor, config["observation_filter"], config["model"])
-
-    @property
-    def filters(self):
-        return {DEFAULT_POLICY_ID: self.policy.get_filter()}
-
-    def sync_filters(self, new_filters):
-        for k in self.filters:
-            self.filters[k].sync(new_filters[k])
-
-    def get_filters(self, flush_after=False):
-        return_filters = {}
-        for k, f in self.filters.items():
-            return_filters[k] = f.as_serializable()
-            if flush_after:
-                f.clear_buffer()
-        return return_filters
-
-    def rollout(self, timestep_limit, add_noise=False):
-        rollout_rewards, rollout_length = policies.rollout(
-            self.policy,
-            self.env,
-            timestep_limit=timestep_limit,
-            add_noise=add_noise,
-            offset=self.config["offset"])
-        return rollout_rewards, rollout_length
-
-    def do_rollouts(self, params, timestep_limit=None):
-        # Set the network weights.
-        self.policy.set_weights(params)
-
-        noise_indices, returns, sign_returns, lengths = [], [], [], []
-        eval_returns, eval_lengths = [], []
-
-        # Perform some rollouts with noise.
-        while (len(noise_indices) == 0):
-            if np.random.uniform() < self.config["eval_prob"]:
-                # Do an evaluation run with no perturbation.
-                self.policy.set_weights(params)
-                rewards, length = self.rollout(timestep_limit, add_noise=False)
-                eval_returns.append(rewards.sum())
-                eval_lengths.append(length)
-            else:
-                # Do a regular run with parameter perturbations.
-                noise_index = self.noise.sample_index(self.policy.num_params)
-
-                perturbation = self.config["noise_stdev"] * self.noise.get(
-                    noise_index, self.policy.num_params)
-
-                # These two sampling steps could be done in parallel on
-                # different actors letting us update twice as frequently.
-                self.policy.set_weights(params + perturbation)
-                rewards_pos, lengths_pos = self.rollout(timestep_limit)
-
-                self.policy.set_weights(params - perturbation)
-                rewards_neg, lengths_neg = self.rollout(timestep_limit)
-
-                noise_indices.append(noise_index)
-                returns.append([rewards_pos.sum(), rewards_neg.sum()])
-                sign_returns.append(
-                    [np.sign(rewards_pos).sum(),
-                     np.sign(rewards_neg).sum()])
-                lengths.append([lengths_pos, lengths_neg])
-
-        return Result(
-            noise_indices=noise_indices,
-            noisy_returns=returns,
-            sign_noisy_returns=sign_returns,
-            noisy_lengths=lengths,
-            eval_returns=eval_returns,
-            eval_lengths=eval_lengths)
-
-
-class ARSTrainer(Trainer):
-    """Large-scale implementation of Augmented Random Search in Ray."""
-
-    _name = "ARS"
-    _default_config = DEFAULT_CONFIG
-
-    @override(Trainer)
-    def _init(self, config, env_creator):
-        env = env_creator(config["env_config"])
-        from ray.rllib import models
-        preprocessor = models.ModelCatalog.get_preprocessor(env)
-
-        self.sess = utils.make_session(single_threaded=False)
-        self.policy = policies.GenericPolicy(
-            self.sess, env.action_space, env.observation_space, preprocessor,
-            config["observation_filter"], config["model"])
-        self.optimizer = optimizers.SGD(self.policy, config["sgd_stepsize"])
-
-        self.rollouts_used = config["rollouts_used"]
-        self.num_rollouts = config["num_rollouts"]
-        self.report_length = config["report_length"]
-
-        # Create the shared noise table.
-        logger.info("Creating shared noise table.")
-        noise_id = create_shared_noise.remote(config["noise_size"])
-        self.noise = SharedNoiseTable(ray.get(noise_id))
-
-        # Create the actors.
-        logger.info("Creating actors.")
-        self.workers = [
-            Worker.remote(config, env_creator, noise_id)
-            for _ in range(config["num_workers"])
-        ]
-
-        self.episodes_so_far = 0
-        self.reward_list = []
-        self.tstart = time.time()
-
-    @override(Trainer)
-    def _train(self):
-        config = self.config
-
-        theta = self.policy.get_weights()
-        assert theta.dtype == np.float32
-
-        # Put the current policy weights in the object store.
-        theta_id = ray.put(theta)
-        # Use the actors to do rollouts, note that we pass in the ID of the
-        # policy weights.
-        results, num_episodes, num_timesteps = self._collect_results(
-            theta_id, config["num_rollouts"])
-
-        all_noise_indices = []
-        all_training_returns = []
-        all_training_lengths = []
-        all_eval_returns = []
-        all_eval_lengths = []
-
-        # Loop over the results.
-        for result in results:
-            all_eval_returns += result.eval_returns
-            all_eval_lengths += result.eval_lengths
-
-            all_noise_indices += result.noise_indices
-            all_training_returns += result.noisy_returns
-            all_training_lengths += result.noisy_lengths
-
-        assert len(all_eval_returns) == len(all_eval_lengths)
-        assert (len(all_noise_indices) == len(all_training_returns) ==
-                len(all_training_lengths))
-
-        self.episodes_so_far += num_episodes
-
-        # Assemble the results.
-        eval_returns = np.array(all_eval_returns)
-        eval_lengths = np.array(all_eval_lengths)
-        noise_indices = np.array(all_noise_indices)
-        noisy_returns = np.array(all_training_returns)
-        noisy_lengths = np.array(all_training_lengths)
-
-        # keep only the best returns
-        # select top performing directions if rollouts_used < num_rollouts
-        max_rewards = np.max(noisy_returns, axis=1)
-        if self.rollouts_used > self.num_rollouts:
-            self.rollouts_used = self.num_rollouts
-
-        percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
-        idx = np.arange(max_rewards.size)[
-            max_rewards >= np.percentile(max_rewards, percentile)]
-        noise_idx = noise_indices[idx]
-        noisy_returns = noisy_returns[idx, :]
-
-        # Compute and take a step.
-        g, count = utils.batched_weighted_sum(
-            noisy_returns[:, 0] - noisy_returns[:, 1],
-            (self.noise.get(index, self.policy.num_params)
-             for index in noise_idx),
-            batch_size=min(500, noisy_returns[:, 0].size))
-        g /= noise_idx.size
-        # scale the returns by their standard deviation
-        if not np.isclose(np.std(noisy_returns), 0.0):
-            g /= np.std(noisy_returns)
-        assert (g.shape == (self.policy.num_params, )
-                and g.dtype == np.float32)
-        # Compute the new weights theta.
-        theta, update_ratio = self.optimizer.update(-g)
-        # Set the new weights in the local copy of the policy.
-        self.policy.set_weights(theta)
-        # update the reward list
-        if len(all_eval_returns) > 0:
-            self.reward_list.append(eval_returns.mean())
-
-        # Now sync the filters
-        FilterManager.synchronize({
-            DEFAULT_POLICY_ID: self.policy.get_filter()
-        }, self.workers)
-
-        info = {
-            "weights_norm": np.square(theta).sum(),
-            "weights_std": np.std(theta),
-            "grad_norm": np.square(g).sum(),
-            "update_ratio": update_ratio,
-            "episodes_this_iter": noisy_lengths.size,
-            "episodes_so_far": self.episodes_so_far,
-        }
-        result = dict(
-            episode_reward_mean=np.mean(
-                self.reward_list[-self.report_length:]),
-            episode_len_mean=eval_lengths.mean(),
-            timesteps_this_iter=noisy_lengths.sum(),
-            info=info)
-
-        return result
-
-    @override(Trainer)
-    def _stop(self):
-        # workaround for https://github.com/ray-project/ray/issues/1516
-        for w in self.workers:
-            w.__ray_terminate__.remote()
-
-    @override(Trainer)
-    def compute_action(self, observation):
-        return self.policy.compute(observation, update=True)[0]
-
-    def _collect_results(self, theta_id, min_episodes):
-        num_episodes, num_timesteps = 0, 0
-        results = []
-        while num_episodes < min_episodes:
-            logger.debug(
-                "Collected {} episodes {} timesteps so far this iter".format(
-                    num_episodes, num_timesteps))
-            rollout_ids = [
-                worker.do_rollouts.remote(theta_id) for worker in self.workers
-            ]
-            # Get the results of the rollouts.
-            for result in ray_get_and_free(rollout_ids):
-                results.append(result)
-                # Update the number of episodes and the number of timesteps
-                # keeping in mind that result.noisy_lengths is a list of lists,
-                # where the inner lists have length 2.
-                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
-                num_timesteps += sum(
-                    sum(pair) for pair in result.noisy_lengths)
-
-        return results, num_episodes, num_timesteps
-
-    def __getstate__(self):
-        return {
-            "weights": self.policy.get_weights(),
-            "filter": self.policy.get_filter(),
-            "episodes_so_far": self.episodes_so_far,
-        }
-
-    def __setstate__(self, state):
-        self.episodes_so_far = state["episodes_so_far"]
-        self.policy.set_weights(state["weights"])
-        self.policy.set_filter(state["filter"])
-        FilterManager.synchronize({
-            DEFAULT_POLICY_ID: self.policy.get_filter()
-        }, self.workers)
@@ -1,57 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-
-class Optimizer(object):
-    def __init__(self, policy):
-        self.policy = policy
-        self.dim = policy.num_params
-        self.t = 0
-
-    def update(self, globalg):
-        self.t += 1
-        step = self._compute_step(globalg)
-        theta = self.policy.get_weights()
-        ratio = np.linalg.norm(step) / np.linalg.norm(theta)
-        return theta + step, ratio
-
-    def _compute_step(self, globalg):
-        raise NotImplementedError
-
-
-class SGD(Optimizer):
-    def __init__(self, policy, stepsize, momentum=0.0):
-        Optimizer.__init__(self, policy)
-        self.v = np.zeros(self.dim, dtype=np.float32)
-        self.stepsize, self.momentum = stepsize, momentum
-
-    def _compute_step(self, globalg):
-        self.v = self.momentum * self.v + (1. - self.momentum) * globalg
-        step = -self.stepsize * self.v
-        return step
-
-
-class Adam(Optimizer):
-    def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999,
-                 epsilon=1e-08):
-        Optimizer.__init__(self, policy)
-        self.stepsize = stepsize
-        self.beta1 = beta1
-        self.beta2 = beta2
-        self.epsilon = epsilon
-        self.m = np.zeros(self.dim, dtype=np.float32)
-        self.v = np.zeros(self.dim, dtype=np.float32)
-
-    def _compute_step(self, globalg):
-        a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
-                             (1 - self.beta1**self.t))
-        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
-        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
-        step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
-        return step
@@ -1,115 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gym
-import numpy as np
-
-import ray
-import ray.experimental.tf_utils
-from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
-from ray.rllib.utils.filter import get_filter
-from ray.rllib.models import ModelCatalog
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0):
-    """Do a rollout.
-
-    If add_noise is True, the rollout will take noisy actions with
-    noise drawn from that stream. Otherwise, no action noise will be added.
-
-    Parameters
-    ----------
-    policy: tf object
-        policy from which to draw actions
-    env: GymEnv
-        environment from which to draw rewards, done, and next state
-    timestep_limit: int, optional
-        steps after which to end the rollout
-    add_noise: bool, optional
-        indicates whether exploratory action noise should be added
-    offset: int, optional
-        value to subtract from the reward. For example, survival bonus
-        from humanoid
-    """
-    env_timestep_limit = env.spec.max_episode_steps
-    timestep_limit = (env_timestep_limit if timestep_limit is None else min(
-        timestep_limit, env_timestep_limit))
-    rews = []
-    t = 0
-    observation = env.reset()
-    for _ in range(timestep_limit or 999999):
-        ac = policy.compute(observation, add_noise=add_noise, update=True)[0]
-        observation, rew, done, _ = env.step(ac)
-        rew -= np.abs(offset)
-        rews.append(rew)
-        t += 1
-        if done:
-            break
-    rews = np.array(rews, dtype=np.float32)
-    return rews, t
-
-
-class GenericPolicy(object):
-    def __init__(self,
-                 sess,
-                 action_space,
-                 obs_space,
-                 preprocessor,
-                 observation_filter,
-                 model_config,
-                 action_noise_std=0.0):
-        self.sess = sess
-        self.action_space = action_space
-        self.action_noise_std = action_noise_std
-        self.preprocessor = preprocessor
-        self.observation_filter = get_filter(observation_filter,
-                                             self.preprocessor.shape)
-        self.inputs = tf.placeholder(tf.float32,
-                                     [None] + list(self.preprocessor.shape))
-
-        # Policy network.
-        dist_class, dist_dim = ModelCatalog.get_action_dist(
-            action_space, model_config, dist_type="deterministic")
-
-        model = ModelCatalog.get_model({
-            "obs": self.inputs
-        }, obs_space, action_space, dist_dim, model_config)
-        dist = dist_class(model.outputs)
-        self.sampler = dist.sample()
-
-        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
-            model.outputs, self.sess)
-
-        self.num_params = sum(
-            np.prod(variable.shape.as_list())
-            for _, variable in self.variables.variables.items())
-        self.sess.run(tf.global_variables_initializer())
-
-    def compute(self, observation, add_noise=False, update=True):
-        observation = self.preprocessor.transform(observation)
-        observation = self.observation_filter(observation[None], update=update)
-        action = self.sess.run(
-            self.sampler, feed_dict={self.inputs: observation})
-        action = _unbatch_tuple_actions(action)
-        if add_noise and isinstance(self.action_space, gym.spaces.Box):
-            action += np.random.randn(*action.shape) * self.action_noise_std
-        return action
-
-    def set_weights(self, x):
-        self.variables.set_flat(x)
-
-    def set_filter(self, obs_filter):
-        self.observation_filter = obs_filter
-
-    def get_filter(self):
-        return self.observation_filter
-
-    def get_weights(self):
-        return self.variables.get_flat()
@@ -1,63 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-def compute_ranks(x):
-    """Returns ranks in [0, len(x))
-
-    Note: This is different from scipy.stats.rankdata, which returns ranks in
-    [1, len(x)].
-    """
-    assert x.ndim == 1
-    ranks = np.empty(len(x), dtype=int)
-    ranks[x.argsort()] = np.arange(len(x))
-    return ranks
-
-
-def compute_centered_ranks(x):
-    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
-    y /= (x.size - 1)
-    y -= 0.5
-    return y
-
-
-def make_session(single_threaded):
-    if not single_threaded:
-        return tf.Session()
-    return tf.Session(
-        config=tf.ConfigProto(
-            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
-
-
-def itergroups(items, group_size):
-    assert group_size >= 1
-    group = []
-    for x in items:
-        group.append(x)
-        if len(group) == group_size:
-            yield tuple(group)
-            del group[:]
-    if group:
-        yield tuple(group)
-
-
-def batched_weighted_sum(weights, vecs, batch_size):
-    total = 0
-    num_items_summed = 0
-    for batch_weights, batch_vecs in zip(
-            itergroups(weights, batch_size), itergroups(vecs, batch_size)):
-        assert len(batch_weights) == len(batch_vecs) <= batch_size
-        total += np.dot(
-            np.asarray(batch_weights, dtype=np.float32),
-            np.asarray(batch_vecs, dtype=np.float32))
-        num_items_summed += len(batch_weights)
-    return total, num_items_summed
@@ -1 +0,0 @@
-Implementation of deep deterministic policy gradients (https://arxiv.org/abs/1509.02971), including an Ape-X variant.
@@ -1,16 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
-from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
-from ray.rllib.agents.ddpg.td3 import TD3Trainer
-from ray.rllib.utils import renamed_agent
-
-ApexDDPGAgent = renamed_agent(ApexDDPGTrainer)
-DDPGAgent = renamed_agent(DDPGTrainer)
-
-__all__ = [
-    "DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
-    "TD3Trainer", "DEFAULT_CONFIG"
-]
@@ -1,37 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
-from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
-    DEFAULT_CONFIG as DDPG_CONFIG
-from ray.rllib.utils import merge_dicts
-
-APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
-    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
-    {
-        "optimizer": merge_dicts(
-            DDPG_CONFIG["optimizer"], {
-                "max_weight_sync_delay": 400,
-                "num_replay_buffer_shards": 4,
-                "debug": False
-            }),
-        "n_step": 3,
-        "num_gpus": 0,
-        "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
-        "train_batch_size": 512,
-        "sample_batch_size": 50,
-        "target_network_update_freq": 500000,
-        "timesteps_per_iteration": 25000,
-        "per_worker_exploration": True,
-        "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
-    },
-)
-
-ApexDDPGTrainer = DDPGTrainer.with_updates(
-    name="APEX_DDPG",
-    default_config=APEX_DDPG_DEFAULT_CONFIG,
-    **APEX_TRAINER_PROPERTIES)
@@ -1,222 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer, \
-    update_worker_explorations
-from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
-from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
-    # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
-    # In addition to settings below, you can use "exploration_noise_type" and
-    # "exploration_gauss_act_noise" to get IID Gaussian exploration noise
-    # instead of OU exploration noise.
-    # twin Q-net
-    "twin_q": False,
-    # delayed policy update
-    "policy_delay": 1,
-    # target policy smoothing
-    # (this also replaces OU exploration noise with IID Gaussian exploration
-    # noise, for now)
-    "smooth_target_policy": False,
-    # gaussian stddev of target action noise for smoothing
-    "target_noise": 0.2,
-    # target noise limit (bound)
-    "target_noise_clip": 0.5,
-
-    # === Evaluation ===
-    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
-    # The evaluation stats will be reported under the "evaluation" metric key.
-    # Note that evaluation is currently not parallelized, and that for Ape-X
-    # metrics are already only reported for the lowest epsilon workers.
-    "evaluation_interval": None,
-    # Number of episodes to run per evaluation period.
-    "evaluation_num_episodes": 10,
-
-    # === Model ===
-    # Apply a state preprocessor with spec given by the "model" config option
-    # (like other RL algorithms). This is mostly useful if you have a weird
-    # observation shape, like an image. Auto-enabled if a custom model is set.
-    "use_state_preprocessor": False,
-    # Postprocess the policy network model output with these hidden layers. If
-    # use_state_preprocessor is False, then these will be the *only* hidden
-    # layers in the network.
-    "actor_hiddens": [400, 300],
-    # Hidden layers activation of the postprocessing stage of the policy
-    # network
-    "actor_hidden_activation": "relu",
-    # Postprocess the critic network model output with these hidden layers;
-    # again, if use_state_preprocessor is True, then the state will be
-    # preprocessed by the model specified with the "model" config option first.
-    "critic_hiddens": [400, 300],
-    # Hidden layers activation of the postprocessing state of the critic.
-    "critic_hidden_activation": "relu",
-    # N-step Q learning
-    "n_step": 1,
-
-    # === Exploration ===
-    # Turns on annealing schedule for exploration noise. Exploration is
-    # annealed from 1.0 to exploration_final_eps over schedule_max_timesteps
-    # scaled by exploration_fraction. Original DDPG and TD3 papers do not
-    # anneal noise, so this is False by default.
-    "exploration_should_anneal": False,
-    # Max num timesteps for annealing schedules.
-    "schedule_max_timesteps": 100000,
-    # Number of env steps to optimize for before returning
-    "timesteps_per_iteration": 1000,
-    # Fraction of entire training period over which the exploration rate is
-    # annealed
-    "exploration_fraction": 0.1,
-    # Final scaling multiplier for action noise (initial is 1.0)
-    "exploration_final_scale": 0.02,
-    # valid values: "ou" (time-correlated, like original DDPG paper),
-    # "gaussian" (IID, like TD3 paper)
-    "exploration_noise_type": "ou",
-    # OU-noise scale; this can be used to scale down magnitude of OU noise
-    # before adding to actions (requires "exploration_noise_type" to be "ou")
-    "exploration_ou_noise_scale": 0.1,
-    # theta for OU
-    "exploration_ou_theta": 0.15,
-    # sigma for OU
-    "exploration_ou_sigma": 0.2,
-    # gaussian stddev of act noise for exploration (requires
-    # "exploration_noise_type" to be "gaussian")
-    "exploration_gaussian_sigma": 0.1,
-    # If True parameter space noise will be used for exploration
-    # See https://blog.openai.com/better-exploration-with-parameter-noise/
-    "parameter_noise": False,
-    # Until this many timesteps have elapsed, the agent's policy will be
-    # ignored & it will instead take uniform random actions. Can be used in
-    # conjunction with learning_starts (which controls when the first
-    # optimization step happens) to decrease dependence of exploration &
-    # optimization on initial policy parameters. Note that this will be
-    # disabled when the action noise scale is set to 0 (e.g during evaluation).
-    "pure_exploration_steps": 1000,
-    # Extra configuration that disables exploration.
-    "evaluation_config": {
-        "exploration_fraction": 0,
-        "exploration_final_eps": 0,
-    },
-
-    # === Replay buffer ===
-    # Size of the replay buffer. Note that if async_updates is set, then
-    # each worker will have a replay buffer of this size.
-    "buffer_size": 50000,
-    # If True prioritized replay buffer will be used.
-    "prioritized_replay": True,
-    # Alpha parameter for prioritized replay buffer.
-    "prioritized_replay_alpha": 0.6,
-    # Beta parameter for sampling from prioritized replay buffer.
-    "prioritized_replay_beta": 0.4,
-    # Fraction of entire training period over which the beta parameter is
-    # annealed
-    "beta_annealing_fraction": 0.2,
-    # Final value of beta
-    "final_prioritized_replay_beta": 0.4,
-    # Epsilon to add to the TD errors when updating priorities.
-    "prioritized_replay_eps": 1e-6,
-    # Whether to LZ4 compress observations
-    "compress_observations": False,
-
-    # === Optimization ===
-    # Learning rate for the critic (Q-function) optimizer.
-    "critic_lr": 1e-3,
-    # Learning rate for the actor (policy) optimizer.
-    "actor_lr": 1e-3,
-    # Update the target network every `target_network_update_freq` steps.
-    "target_network_update_freq": 0,
-    # Update the target by \tau * policy + (1-\tau) * target_policy
-    "tau": 0.002,
-    # If True, use huber loss instead of squared loss for critic network
-    # Conventionally, no need to clip gradients if using a huber loss
-    "use_huber": False,
-    # Threshold of a huber loss
-    "huber_threshold": 1.0,
-    # Weights for L2 regularization
-    "l2_reg": 1e-6,
-    # If not None, clip gradients during optimization at this value
-    "grad_norm_clipping": None,
-    # How many steps of the model to sample before learning starts.
-    "learning_starts": 1500,
-    # Update the replay buffer with this many samples at once. Note that this
-    # setting applies per-worker if num_workers > 1.
-    "sample_batch_size": 1,
-    # Size of a batched sampled from replay buffer for training. Note that
-    # if async_updates is set, then each worker returns gradients for a
-    # batch of this size.
-    "train_batch_size": 256,
-
-    # === Parallelism ===
-    # Number of workers for collecting samples with. This only makes sense
-    # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    "num_workers": 0,
-    # Whether to use a distribution of epsilons across workers for exploration.
-    "per_worker_exploration": False,
-    # Whether to compute priorities on workers.
-    "worker_side_prioritization": False,
-    # Prevent iterations from going lower than this time span
-    "min_iter_time_s": 1,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def make_exploration_schedule(config, worker_index):
-    # Modification of DQN's schedule to take into account
-    # `exploration_ou_noise_scale`
-    if config["per_worker_exploration"]:
-        assert config["num_workers"] > 1, "This requires multiple workers"
-        if worker_index >= 0:
-            # Exploration constants from the Ape-X paper
-            max_index = float(config["num_workers"] - 1)
-            exponent = 1 + worker_index / max_index * 7
-            return ConstantSchedule(0.4**exponent)
-        else:
-            # local ev should have zero exploration so that eval rollouts
-            # run properly
-            return ConstantSchedule(0.0)
-    elif config["exploration_should_anneal"]:
-        return LinearSchedule(
-            schedule_timesteps=int(config["exploration_fraction"] *
-                                   config["schedule_max_timesteps"]),
-            initial_p=1.0,
-            final_p=config["exploration_final_scale"])
-    else:
-        # *always* add exploration noise
-        return ConstantSchedule(1.0)
-
-
-def setup_ddpg_exploration(trainer):
-    trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
-    trainer.explorations = [
-        make_exploration_schedule(trainer.config, i)
-        for i in range(trainer.config["num_workers"])
-    ]
-
-
-def add_pure_exploration_phase(trainer):
-    global_timestep = trainer.optimizer.num_steps_sampled
-    pure_expl_steps = trainer.config["pure_exploration_steps"]
-    if pure_expl_steps:
-        # tell workers whether they should do pure exploration
-        only_explore = global_timestep < pure_expl_steps
-        trainer.workers.local_worker().foreach_trainable_policy(
-            lambda p, _: p.set_pure_exploration_phase(only_explore))
-        for e in trainer.workers.remote_workers():
-            e.foreach_trainable_policy.remote(
-                lambda p, _: p.set_pure_exploration_phase(only_explore))
-    update_worker_explorations(trainer)
-
-
-DDPGTrainer = GenericOffPolicyTrainer.with_updates(
-    name="DDPG",
-    default_config=DEFAULT_CONFIG,
-    default_policy=DDPGTFPolicy,
-    before_init=setup_ddpg_exploration,
-    before_train_step=add_pure_exploration_phase)
@@ -1,246 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-class DDPGModel(TFModelV2):
-    """Extension of standard TFModel for DDPG.
-
-    Data flow:
-        obs -> forward() -> model_out
-        model_out -> get_policy_output() -> pi(s)
-        model_out, actions -> get_q_values() -> Q(s, a)
-        model_out, actions -> get_twin_q_values() -> Q_twin(s, a)
-
-    Note that this class by itself is not a valid model unless you
-    implement forward() in a subclass."""
-
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 actor_hidden_activation="relu",
-                 actor_hiddens=(400, 300),
-                 critic_hidden_activation="relu",
-                 critic_hiddens=(400, 300),
-                 parameter_noise=False,
-                 twin_q=False,
-                 exploration_ou_sigma=0.2):
-        """Initialize variables of this model.
-
-        Extra model kwargs:
-            actor_hidden_activation (str): activation for actor network
-            actor_hiddens (list): hidden layers sizes for actor network
-            critic_hidden_activation (str): activation for critic network
-            critic_hiddens (list): hidden layers sizes for critic network
-            parameter_noise (bool): use param noise exploration
-            twin_q (bool): build twin Q networks
-            exploration_ou_sigma (float): ou noise sigma for exploration
-
-        Note that the core layers for forward() are not defined here, this
-        only defines the layers for the output heads. Those layers for
-        forward() should be defined in subclasses of DDPGModel.
-        """
-
-        super(DDPGModel, self).__init__(obs_space, action_space, num_outputs,
-                                        model_config, name)
-        self.exploration_ou_sigma = exploration_ou_sigma
-
-        self.action_dim = np.product(action_space.shape)
-        self.model_out = tf.keras.layers.Input(
-            shape=(num_outputs, ), name="model_out")
-        self.actions = tf.keras.layers.Input(
-            shape=(self.action_dim, ), name="actions")
-
-        def build_action_net(action_out):
-            activation = getattr(tf.nn, actor_hidden_activation)
-            i = 0
-            for hidden in actor_hiddens:
-                if parameter_noise:
-                    import tensorflow.contrib.layers as layers
-                    action_out = layers.fully_connected(
-                        action_out,
-                        num_outputs=hidden,
-                        activation_fn=activation,
-                        normalizer_fn=layers.layer_norm)
-                else:
-                    action_out = tf.layers.dense(
-                        action_out,
-                        units=hidden,
-                        activation=activation,
-                        name="action_hidden_{}".format(i))
-                i += 1
-            return tf.layers.dense(
-                action_out,
-                units=self.action_dim,
-                activation=None,
-                name="action_out")
-
-        action_scope = name + "/action_net"
-
-        # TODO(ekl) use keras layers instead of variable scopes
-        def build_action_net_scope(model_out):
-            with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
-                return build_action_net(model_out)
-
-        pi_out = tf.keras.layers.Lambda(build_action_net_scope)(self.model_out)
-        self.action_net = tf.keras.Model(self.model_out, pi_out)
-        self.register_variables(self.action_net.variables)
-
-        # Noise vars for P network except for layer normalization vars
-        if parameter_noise:
-            with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
-                self._build_parameter_noise([
-                    var for var in self.action_net.variables
-                    if "LayerNorm" not in var.name
-                ])
-
-        def build_q_net(name, model_out, actions):
-            q_out = tf.keras.layers.Concatenate(axis=1)([model_out, actions])
-            activation = getattr(tf.nn, critic_hidden_activation)
-            for i, n in enumerate(critic_hiddens):
-                q_out = tf.keras.layers.Dense(
-                    n,
-                    name="{}_hidden_{}".format(name, i),
-                    activation=activation)(q_out)
-            q_out = tf.keras.layers.Dense(
-                1, activation=None, name="{}_out".format(name))(q_out)
-            return tf.keras.Model([model_out, actions], q_out)
-
-        self.q_net = build_q_net("q", self.model_out, self.actions)
-        self.register_variables(self.q_net.variables)
-
-        if twin_q:
-            self.twin_q_net = build_q_net("twin_q", self.model_out,
-                                          self.actions)
-            self.register_variables(self.twin_q_net.variables)
-        else:
-            self.twin_q_net = None
-
-    def forward(self, input_dict, state, seq_lens):
-        """This generates the model_out tensor input.
-
-        You must implement this as documented in modelv2.py."""
-        raise NotImplementedError
-
-    def get_policy_output(self, model_out):
-        """Return the (unscaled) output of the policy network.
-
-        This returns the unscaled outputs of pi(s).
-
-        Arguments:
-            model_out (Tensor): obs embeddings from the model layers, of shape
-                [BATCH_SIZE, num_outputs].
-
-        Returns:
-            tensor of shape [BATCH_SIZE, action_dim] with range [-inf, inf].
-        """
-        return self.action_net(model_out)
-
-    def get_q_values(self, model_out, actions):
-        """Return the Q estimates for the most recent forward pass.
-
-        This implements Q(s, a).
-
-        Arguments:
-            model_out (Tensor): obs embeddings from the model layers, of shape
-                [BATCH_SIZE, num_outputs].
-            actions (Tensor): action values that correspond with the most
-                recent batch of observations passed through forward(), of shape
-                [BATCH_SIZE, action_dim].
-
-        Returns:
-            tensor of shape [BATCH_SIZE].
-        """
-        return self.q_net([model_out, actions])
-
-    def get_twin_q_values(self, model_out, actions):
-        """Same as get_q_values but using the twin Q net.
-
-        This implements the twin Q(s, a).
-
-        Arguments:
-            model_out (Tensor): obs embeddings from the model layers, of shape
-                [BATCH_SIZE, num_outputs].
-            actions (Tensor): action values that correspond with the most
-                recent batch of observations passed through forward(), of shape
-                [BATCH_SIZE, action_dim].
-
-        Returns:
-            tensor of shape [BATCH_SIZE].
-        """
-        return self.twin_q_net([model_out, actions])
-
-    def policy_variables(self):
-        """Return the list of variables for the policy net."""
-
-        return list(self.action_net.variables)
-
-    def q_variables(self):
-        """Return the list of variables for Q / twin Q nets."""
-
-        return self.q_net.variables + (self.twin_q_net.variables
-                                       if self.twin_q_net else [])
-
-    def update_action_noise(self, session, distance_in_action_space,
-                            exploration_ou_sigma, cur_noise_scale):
-        """Update the model action noise settings.
-
-        This is called internally by the DDPG policy."""
-
-        self.pi_distance = distance_in_action_space
-        if (distance_in_action_space < exploration_ou_sigma * cur_noise_scale):
-            # multiplying the sampled OU noise by noise scale is
-            # equivalent to multiplying the sigma of OU by noise scale
-            self.parameter_noise_sigma_val *= 1.01
-        else:
-            self.parameter_noise_sigma_val /= 1.01
-        self.parameter_noise_sigma.load(
-            self.parameter_noise_sigma_val, session=session)
-
-    def _build_parameter_noise(self, pnet_params):
-        assert pnet_params
-        self.parameter_noise_sigma_val = self.exploration_ou_sigma
-        self.parameter_noise_sigma = tf.get_variable(
-            initializer=tf.constant_initializer(
-                self.parameter_noise_sigma_val),
-            name="parameter_noise_sigma",
-            shape=(),
-            trainable=False,
-            dtype=tf.float32)
-        self.parameter_noise = []
-        # No need to add any noise on LayerNorm parameters
-        for var in pnet_params:
-            noise_var = tf.get_variable(
-                name=var.name.split(":")[0] + "_noise",
-                shape=var.shape,
-                initializer=tf.constant_initializer(.0),
-                trainable=False)
-            self.parameter_noise.append(noise_var)
-        remove_noise_ops = list()
-        for var, var_noise in zip(pnet_params, self.parameter_noise):
-            remove_noise_ops.append(tf.assign_add(var, -var_noise))
-        self.remove_noise_op = tf.group(*tuple(remove_noise_ops))
-        generate_noise_ops = list()
-        for var_noise in self.parameter_noise:
-            generate_noise_ops.append(
-                tf.assign(
-                    var_noise,
-                    tf.random_normal(
-                        shape=var_noise.shape,
-                        stddev=self.parameter_noise_sigma)))
-        with tf.control_dependencies(generate_noise_ops):
-            add_noise_ops = list()
-            for var, var_noise in zip(pnet_params, self.parameter_noise):
-                add_noise_ops.append(tf.assign_add(var, var_noise))
-            self.add_noise_op = tf.group(*tuple(add_noise_ops))
-        self.pi_distance = None
@@ -1,507 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from gym.spaces import Box
-import numpy as np
-import logging
-
-import ray
-import ray.experimental.tf_utils
-from ray.rllib.agents.ddpg.ddpg_model import DDPGModel
-from ray.rllib.agents.ddpg.noop_model import NoopModel
-from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.models import ModelCatalog
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils.error import UnsupportedSpaceException
-from ray.rllib.policy.policy import Policy
-from ray.rllib.policy.tf_policy import TFPolicy
-from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip
-
-tf = try_import_tf()
-logger = logging.getLogger(__name__)
-
-
-def build_ddpg_model(policy, obs_space, action_space, config):
-    if config["model"]["custom_model"]:
-        logger.warning(
-            "Setting use_state_preprocessor=True since a custom model "
-            "was specified.")
-        config["use_state_preprocessor"] = True
-    if not isinstance(action_space, Box):
-        raise UnsupportedSpaceException(
-            "Action space {} is not supported for DDPG.".format(action_space))
-    if len(action_space.shape) > 1:
-        raise UnsupportedSpaceException(
-            "Action space has multiple dimensions "
-            "{}. ".format(action_space.shape) +
-            "Consider reshaping this into a single dimension, "
-            "using a Tuple action space, or the multi-agent API.")
-
-    if config["use_state_preprocessor"]:
-        default_model = None  # catalog decides
-        num_outputs = 256  # arbitrary
-        config["model"]["no_final_linear"] = True
-    else:
-        default_model = NoopModel
-        num_outputs = int(np.product(obs_space.shape))
-
-    policy.model = ModelCatalog.get_model_v2(
-        obs_space,
-        action_space,
-        num_outputs,
-        config["model"],
-        framework="tf",
-        model_interface=DDPGModel,
-        default_model=default_model,
-        name="ddpg_model",
-        actor_hidden_activation=config["actor_hidden_activation"],
-        actor_hiddens=config["actor_hiddens"],
-        critic_hidden_activation=config["critic_hidden_activation"],
-        critic_hiddens=config["critic_hiddens"],
-        parameter_noise=config["parameter_noise"],
-        twin_q=config["twin_q"])
-
-    policy.target_model = ModelCatalog.get_model_v2(
-        obs_space,
-        action_space,
-        num_outputs,
-        config["model"],
-        framework="tf",
-        model_interface=DDPGModel,
-        default_model=default_model,
-        name="target_ddpg_model",
-        actor_hidden_activation=config["actor_hidden_activation"],
-        actor_hiddens=config["actor_hiddens"],
-        critic_hidden_activation=config["critic_hidden_activation"],
-        critic_hiddens=config["critic_hiddens"],
-        parameter_noise=config["parameter_noise"],
-        twin_q=config["twin_q"])
-
-    return policy.model
-
-
-def postprocess_trajectory(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    if policy.config["parameter_noise"]:
-        policy.adjust_param_noise_sigma(sample_batch)
-    return _postprocess_dqn(policy, sample_batch)
-
-
-def exploration_setting_inputs(policy):
-    return {
-        policy.stochastic: True,
-        policy.noise_scale: policy.cur_noise_scale,
-        policy.pure_exploration_phase: policy.cur_pure_exploration_phase,
-    }
-
-
-def build_action_output(policy, model, input_dict, obs_space, action_space,
-                        config):
-    model_out, _ = model({
-        "obs": input_dict[SampleBatch.CUR_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-    action_out = model.get_policy_output(model_out)
-
-    # Use sigmoid to scale to [0,1], but also double magnitude of input to
-    # emulate behaviour of tanh activation used in DDPG and TD3 papers.
-    sigmoid_out = tf.nn.sigmoid(2 * action_out)
-    # Rescale to actual env policy scale
-    # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
-    # get same dims)
-    action_range = (action_space.high - action_space.low)[None]
-    low_action = action_space.low[None]
-    deterministic_actions = action_range * sigmoid_out + low_action
-
-    noise_type = config["exploration_noise_type"]
-    action_low = action_space.low
-    action_high = action_space.high
-    action_range = action_space.high - action_low
-
-    def compute_stochastic_actions():
-        def make_noisy_actions():
-            # shape of deterministic_actions is [None, dim_action]
-            if noise_type == "gaussian":
-                # add IID Gaussian noise for exploration, TD3-style
-                normal_sample = policy.noise_scale * tf.random_normal(
-                    tf.shape(deterministic_actions),
-                    stddev=config["exploration_gaussian_sigma"])
-                stochastic_actions = tf.clip_by_value(
-                    deterministic_actions + normal_sample,
-                    action_low * tf.ones_like(deterministic_actions),
-                    action_high * tf.ones_like(deterministic_actions))
-            elif noise_type == "ou":
-                # add OU noise for exploration, DDPG-style
-                zero_acts = action_low.size * [.0]
-                exploration_sample = tf.get_variable(
-                    name="ornstein_uhlenbeck",
-                    dtype=tf.float32,
-                    initializer=zero_acts,
-                    trainable=False)
-                normal_sample = tf.random_normal(
-                    shape=[action_low.size], mean=0.0, stddev=1.0)
-                ou_new = config["exploration_ou_theta"] \
-                    * -exploration_sample \
-                    + config["exploration_ou_sigma"] * normal_sample
-                exploration_value = tf.assign_add(exploration_sample, ou_new)
-                base_scale = config["exploration_ou_noise_scale"]
-                noise = policy.noise_scale * base_scale \
-                    * exploration_value * action_range
-                stochastic_actions = tf.clip_by_value(
-                    deterministic_actions + noise,
-                    action_low * tf.ones_like(deterministic_actions),
-                    action_high * tf.ones_like(deterministic_actions))
-            else:
-                raise ValueError(
-                    "Unknown noise type '%s' (try 'ou' or 'gaussian')" %
-                    noise_type)
-            return stochastic_actions
-
-        def make_uniform_random_actions():
-            # pure random exploration option
-            uniform_random_actions = tf.random_uniform(
-                tf.shape(deterministic_actions))
-            # rescale uniform random actions according to action range
-            tf_range = tf.constant(action_range[None], dtype="float32")
-            tf_low = tf.constant(action_low[None], dtype="float32")
-            uniform_random_actions = uniform_random_actions * tf_range \
-                + tf_low
-            return uniform_random_actions
-
-        stochastic_actions = tf.cond(
-            # need to condition on noise_scale > 0 because zeroing
-            # noise_scale is how a worker signals no noise should be used
-            # (this is ugly and should be fixed by adding an "eval_mode"
-            # config flag or something)
-            tf.logical_and(policy.pure_exploration_phase,
-                           policy.noise_scale > 0),
-            true_fn=make_uniform_random_actions,
-            false_fn=make_noisy_actions)
-        return stochastic_actions
-
-    enable_stochastic = tf.logical_and(policy.stochastic,
-                                       not config["parameter_noise"])
-    actions = tf.cond(enable_stochastic, compute_stochastic_actions,
-                      lambda: deterministic_actions)
-    policy.output_actions = actions
-    return actions, None
-
-
-def actor_critic_loss(policy, batch_tensors):
-    """Build the combined actor + critic loss for DDPG / TD3.
-
-    The critic(s) are regressed towards an n-step Bellman target computed
-    with the target network (optionally with target policy smoothing and the
-    minimum over twin Q heads). The actor loss is the negated mean Q-value
-    of the policy's own (noise-free) action.
-
-    Arguments:
-        policy: policy object providing `model`, `target_model`, `config`
-            and `action_space`.
-        batch_tensors: mapping from SampleBatch column names (and
-            PRIO_WEIGHTS) to the input tensors of the train batch.
-
-    Returns:
-        actor_loss + critic_loss. As side effects `q_t`, `td_error`,
-        `actor_loss` and `critic_loss` are stored on `policy`.
-    """
-    model_out_t, _ = policy.model({
-        "obs": batch_tensors[SampleBatch.CUR_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    model_out_tp1, _ = policy.model({
-        "obs": batch_tensors[SampleBatch.NEXT_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    target_model_out_tp1, _ = policy.target_model({
-        "obs": batch_tensors[SampleBatch.NEXT_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    policy_t = policy.model.get_policy_output(model_out_t)
-    # NOTE(review): the next-state action comes from the online model's
-    # policy head, not from target_model -- confirm this is intended.
-    policy_tp1 = policy.model.get_policy_output(model_out_tp1)
-
-    if policy.config["smooth_target_policy"]:
-        # target policy smoothing: add clipped Gaussian noise to the
-        # next-state action, then clip to the action space bounds
-        target_noise_clip = policy.config["target_noise_clip"]
-        clipped_normal_sample = tf.clip_by_value(
-            tf.random_normal(
-                tf.shape(policy_tp1), stddev=policy.config["target_noise"]),
-            -target_noise_clip, target_noise_clip)
-        policy_tp1_smoothed = tf.clip_by_value(
-            policy_tp1 + clipped_normal_sample,
-            policy.action_space.low * tf.ones_like(policy_tp1),
-            policy.action_space.high * tf.ones_like(policy_tp1))
-    else:
-        policy_tp1_smoothed = policy_tp1
-
-    # q network evaluation
-    q_t = policy.model.get_q_values(model_out_t,
-                                    batch_tensors[SampleBatch.ACTIONS])
-    if policy.config["twin_q"]:
-        twin_q_t = policy.model.get_twin_q_values(
-            model_out_t, batch_tensors[SampleBatch.ACTIONS])
-
-    # Q-values for current policy (no noise) in given current state
-    q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t)
-
-    # target q network evaluation
-    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
-                                             policy_tp1_smoothed)
-    if policy.config["twin_q"]:
-        twin_q_tp1 = policy.target_model.get_twin_q_values(
-            target_model_out_tp1, policy_tp1_smoothed)
-
-    # drop the trailing axis of the Q outputs
-    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
-    if policy.config["twin_q"]:
-        twin_q_t_selected = tf.squeeze(
-            twin_q_t, axis=len(twin_q_t.shape) - 1)
-        # use the smaller of the two target Q estimates
-        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
-
-    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
-    # zero out the bootstrap term where the DONES column is set
-    q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES],
-                                       tf.float32)) * q_tp1_best
-
-    # compute RHS of bellman equation
-    q_t_selected_target = tf.stop_gradient(
-        batch_tensors[SampleBatch.REWARDS] +
-        policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)
-
-    # compute the error (potentially clipped)
-    if policy.config["twin_q"]:
-        td_error = q_t_selected - q_t_selected_target
-        twin_td_error = twin_q_t_selected - q_t_selected_target
-        # Each critic is penalized for its OWN TD error only. Summing the
-        # two errors before taking the loss would regress the first critic
-        # on the second critic's error as well.
-        if policy.config["use_huber"]:
-            errors = huber_loss(td_error, policy.config["huber_threshold"]) \
-                + huber_loss(twin_td_error, policy.config["huber_threshold"])
-        else:
-            errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(twin_td_error)
-        # The TD error stored on the policy remains the sum of both
-        # critics' errors.
-        td_error = td_error + twin_td_error
-    else:
-        td_error = q_t_selected - q_t_selected_target
-        if policy.config["use_huber"]:
-            errors = huber_loss(td_error, policy.config["huber_threshold"])
-        else:
-            errors = 0.5 * tf.square(td_error)
-
-    critic_loss = policy.model.custom_loss(
-        tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors)
-    actor_loss = -tf.reduce_mean(q_t_det_policy)
-
-    if policy.config["l2_reg"] is not None:
-        # L2 penalty on all variables whose name does not contain "bias"
-        for var in policy.model.policy_variables():
-            if "bias" not in var.name:
-                actor_loss += policy.config["l2_reg"] * tf.nn.l2_loss(var)
-        for var in policy.model.q_variables():
-            if "bias" not in var.name:
-                critic_loss += policy.config["l2_reg"] * tf.nn.l2_loss(var)
-
-    # save for stats function
-    policy.q_t = q_t
-    policy.td_error = td_error
-    policy.actor_loss = actor_loss
-    policy.critic_loss = critic_loss
-
-    # in a custom apply op we handle the losses separately, but return them
-    # combined in one loss for now
-    return actor_loss + critic_loss
-
-
-def gradients(policy, optimizer, loss):
-    """Compute the actor and critic gradients with their own optimizers.
-
-    The `optimizer` and `loss` arguments are not used; the optimizers and
-    losses stored on `policy` are used instead.
-
-    Returns:
-        List of (gradient, variable) pairs, actor pairs first, with all
-        pairs whose gradient is None dropped. The two filtered lists are
-        also stored on `policy` for the apply-gradients step.
-    """
-    clip_val = policy.config["grad_norm_clipping"]
-
-    def compute(opt, objective, variables):
-        # clip gradient norms only if a clipping value is configured
-        if clip_val is not None:
-            return minimize_and_clip(
-                opt, objective, var_list=variables, clip_val=clip_val)
-        return opt.compute_gradients(objective, var_list=variables)
-
-    actor_pairs = compute(policy._actor_optimizer, policy.actor_loss,
-                          policy.model.policy_variables())
-    critic_pairs = compute(policy._critic_optimizer, policy.critic_loss,
-                           policy.model.q_variables())
-
-    # save these for later use in build_apply_op
-    policy._actor_grads_and_vars = [(grad, var) for grad, var in actor_pairs
-                                    if grad is not None]
-    policy._critic_grads_and_vars = [(grad, var)
-                                     for grad, var in critic_pairs
-                                     if grad is not None]
-    return policy._actor_grads_and_vars + policy._critic_grads_and_vars
-
-
-def apply_gradients(policy, optimizer, grads_and_vars):
-    """Build the op that applies the saved actor and critic gradients.
-
-    The critic gradients are applied on every call; the actor gradients
-    only when `global_step` is a multiple of config["policy_delay"]. The
-    `optimizer` and `grads_and_vars` arguments are not used; the optimizers
-    and gradient lists stored on `policy` are used instead.
-    """
-    # for policy gradient, update policy net one time v.s.
-    # update critic net `policy_delay` time(s)
-    step_in_cycle = tf.mod(policy.global_step, policy.config["policy_delay"])
-    is_actor_step = tf.equal(step_in_cycle, 0)
-
-    def apply_actor_grads():
-        return policy._actor_optimizer.apply_gradients(
-            policy._actor_grads_and_vars)
-
-    actor_op = tf.cond(
-        is_actor_step, true_fn=apply_actor_grads, false_fn=tf.no_op)
-    critic_op = policy._critic_optimizer.apply_gradients(
-        policy._critic_grads_and_vars)
-
-    # increment global step & apply ops
-    increment_step = tf.assign_add(policy.global_step, 1)
-    with tf.control_dependencies([increment_step]):
-        return tf.group(actor_op, critic_op)
-
-
-def stats(policy, batch_tensors):
-    """Return scalar summary tensors of the values saved by the loss.
-
-    `batch_tensors` is not used.
-    """
-    summary = {
-        key: tf.reduce_mean(getattr(policy, key))
-        for key in ("td_error", "actor_loss", "critic_loss")
-    }
-    q_values = policy.q_t
-    summary["mean_q"] = tf.reduce_mean(q_values)
-    summary["max_q"] = tf.reduce_max(q_values)
-    summary["min_q"] = tf.reduce_min(q_values)
-    return summary
-
-
-class ExplorationStateMixin(object):
-    """Exploration settings of the policy and their feed placeholders.
-
-    The current noise scale and pure-exploration flag are kept as Python
-    attributes and are included in the policy state.
-    """
-
-    def __init__(self, obs_space, action_space, config):
-        # Python-side values, saved / restored by get_state / set_state
-        self.cur_noise_scale = 1.0
-        self.cur_pure_exploration_phase = False
-        # scalar graph inputs
-        self.stochastic = tf.placeholder(
-            tf.bool, shape=(), name="stochastic")
-        self.noise_scale = tf.placeholder(
-            tf.float32, shape=(), name="noise_scale")
-        self.pure_exploration_phase = tf.placeholder(
-            tf.bool, shape=(), name="pure_exploration_phase")
-
-    def add_parameter_noise(self):
-        """Run the model's add-noise op if parameter noise is enabled."""
-        if not self.config["parameter_noise"]:
-            return
-        self.get_session().run(self.model.add_noise_op)
-
-    def adjust_param_noise_sigma(self, sample_batch):
-        """Adjust the sigma of the parameter space noise.
-
-        Removes the noise, recomputes the actions for the batch
-        observations and passes the RMS distance between those and the
-        batch's (noisy) actions to the model's `update_action_noise`.
-        """
-        obs_column, action_column = sample_batch.columns(
-            [SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
-        states = list(obs_column)
-        noisy_actions = list(action_column)
-
-        self.get_session().run(self.model.remove_noise_op)
-        # all exploration inputs are switched off for the clean actions
-        clean_feed = {
-            self.get_placeholder(SampleBatch.CUR_OBS): states,
-            self.stochastic: False,
-            self.noise_scale: .0,
-            self.pure_exploration_phase: False,
-        }
-        clean_actions = self.get_session().run(
-            self.output_actions, feed_dict=clean_feed)
-
-        squared_diff = np.square(clean_actions - noisy_actions)
-        distance_in_action_space = np.sqrt(np.mean(squared_diff))
-        self.model.update_action_noise(
-            self.get_session(), distance_in_action_space,
-            self.config["exploration_ou_sigma"], self.cur_noise_scale)
-
-    def set_epsilon(self, epsilon):
-        """Store `epsilon` as the current noise scale."""
-        # set_epsilon is called by optimizer to anneal exploration as
-        # necessary, and to turn it off during evaluation. The "epsilon" part
-        # is a carry-over from DQN, which uses epsilon-greedy exploration
-        # rather than adding action noise to the output of a policy network.
-        self.cur_noise_scale = epsilon
-
-    def set_pure_exploration_phase(self, pure_exploration_phase):
-        """Store the pure-exploration flag."""
-        self.cur_pure_exploration_phase = pure_exploration_phase
-
-    @override(Policy)
-    def get_state(self):
-        # [TFPolicy state, noise scale, pure exploration flag]
-        base_state = TFPolicy.get_state(self)
-        return [
-            base_state, self.cur_noise_scale, self.cur_pure_exploration_phase
-        ]
-
-    @override(Policy)
-    def set_state(self, state):
-        # inverse of get_state: same three-element layout
-        TFPolicy.set_state(self, state[0])
-        self.set_epsilon(state[1])
-        self.set_pure_exploration_phase(state[2])
-
-
-class TargetNetworkMixin(object):
-    """Builds and runs the op that syncs the target model with the model.
-
-    Each target variable is updated as
-    `tau * var + (1 - tau) * var_target`, so tau=1.0 is a hard copy and
-    smaller values give a soft update.
-    """
-
-    def __init__(self, config):
-        # update_target() will be called periodically to copy Q network to
-        # target Q network
-        self.tau_value = config.get("tau")
-        self.tau = tf.placeholder(tf.float32, (), name="tau")
-        update_target_expr = []
-        model_vars = self.model.trainable_variables()
-        target_model_vars = self.target_model.trainable_variables()
-        # variables are paired by position, so both lists must line up
-        assert len(model_vars) == len(target_model_vars), \
-            (model_vars, target_model_vars)
-        for var, var_target in zip(model_vars, target_model_vars):
-            update_target_expr.append(
-                var_target.assign(self.tau * var +
-                                  (1.0 - self.tau) * var_target))
-            logger.debug("Update target op {}".format(var_target))
-        self.update_target_expr = tf.group(*update_target_expr)
-
-        # Hard initial update
-        self.update_target(tau=1.0)
-
-    # support both hard and soft sync
-    def update_target(self, tau=None):
-        """Run the target update op.
-
-        Arguments:
-            tau (float): interpolation factor fed to the update op. If None,
-                the "tau" value from the config is used. An explicit 0.0 is
-                fed as given (it is not replaced by the configured value).
-
-        Returns:
-            The result of running the update op in the policy's session.
-        """
-        if tau is None:
-            tau = self.tau_value
-        return self.get_session().run(
-            self.update_target_expr, feed_dict={self.tau: tau})
-
-
-class ActorCriticOptimizerMixin(object):
-    """Creates the global step and the separate actor / critic optimizers."""
-
-    def __init__(self, config):
-        # create global step for counting the number of update operations
-        self.global_step = tf.train.get_or_create_global_step()
-
-        # use separate optimizers for actor & critic, each an Adam
-        # optimizer with its own learning rate from the config
-        self._actor_optimizer, self._critic_optimizer = [
-            tf.train.AdamOptimizer(learning_rate=config[lr_key])
-            for lr_key in ("actor_lr", "critic_lr")
-        ]
-
-
-class ComputeTDErrorMixin(object):
-    """Adds a method for evaluating the TD error of a given batch."""
-
-    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
-                         importance_weights):
-        """Evaluate the policy's `td_error` tensor on the given batch.
-
-        Returns zeros shaped like `rew_t` while the loss has not been
-        initialized yet.
-        """
-        if not self.loss_initialized():
-            return np.zeros_like(rew_t)
-
-        session = self.get_session()
-        placeholder_for = self.get_placeholder
-        # every observation is converted to an array before feeding
-        feed = {
-            placeholder_for(SampleBatch.CUR_OBS): [
-                np.array(obs) for obs in obs_t
-            ],
-            placeholder_for(SampleBatch.ACTIONS): act_t,
-            placeholder_for(SampleBatch.REWARDS): rew_t,
-            placeholder_for(SampleBatch.NEXT_OBS): [
-                np.array(obs) for obs in obs_tp1
-            ],
-            placeholder_for(SampleBatch.DONES): done_mask,
-            placeholder_for(PRIO_WEIGHTS): importance_weights,
-        }
-        return session.run(self.td_error, feed_dict=feed)
-
-
-def setup_early_mixins(policy, obs_space, action_space, config):
-    """Initialize the exploration-state and optimizer mixins on `policy`."""
-    early_mixins = (
-        (ExplorationStateMixin, (obs_space, action_space, config)),
-        (ActorCriticOptimizerMixin, (config, )),
-    )
-    for mixin_cls, init_args in early_mixins:
-        mixin_cls.__init__(policy, *init_args)
-
-
-def setup_late_mixins(policy, obs_space, action_space, config):
-    """Initialize the target-network mixin on `policy`.
-
-    `obs_space` and `action_space` are not used.
-    """
-    init_target_network = TargetNetworkMixin.__init__
-    init_target_network(policy, config)
-
-
-# Assemble the DDPG policy class by passing the model builder, loss, stats,
-# gradient and apply-gradient functions plus the mixins to build_tf_policy.
-# The mixins' __init__ methods are invoked explicitly through the
-# before_init / after_init hooks (setup_early_mixins / setup_late_mixins).
-DDPGTFPolicy = build_tf_policy(
-    name="DDPGTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
-    make_model=build_ddpg_model,
-    postprocess_fn=postprocess_trajectory,
-    extra_action_feed_fn=exploration_setting_inputs,
-    action_sampler_fn=build_action_output,
-    loss_fn=actor_critic_loss,
-    stats_fn=stats,
-    gradients_fn=gradients,
-    apply_gradients_fn=apply_gradients,
-    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
-    mixins=[
-        TargetNetworkMixin, ExplorationStateMixin, ActorCriticOptimizerMixin,
-        ComputeTDErrorMixin
-    ],
-    before_init=setup_early_mixins,
-    after_init=setup_late_mixins,
-    obs_include_prev_action_reward=False)
@@ -1,20 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.models import Model
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-class NoopModel(Model):
-    """Pass-through model: reshapes the obs to [-1, num_outputs].
-
-    This is the model used if use_state_preprocessor=False."""
-
-    @override(Model)
-    def _build_layers_v2(self, input_dict, num_outputs, options):
-        # the same tensor is returned for both outputs
-        flat_obs = tf.reshape(input_dict["obs"], [-1, num_outputs])
-        return flat_obs, flat_obs
@@ -1,57 +0,0 @@
-"""A more stable successor to TD3.
-
-By default, this uses a near-identical configuration to that reported in the
-TD3 paper.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
-    DEFAULT_CONFIG as DDPG_CONFIG
-from ray.rllib.utils import merge_dicts
-
-# TD3 defaults: the DDPG default config with the entries below merged over
-# it.
-TD3_DEFAULT_CONFIG = merge_dicts(
-    DDPG_CONFIG,
-    {
-        # largest changes: twin Q functions, delayed policy updates, and target
-        # smoothing
-        "twin_q": True,
-        "policy_delay": 2,
-        "smooth_target_policy": True,
-        "target_noise": 0.2,
-        "target_noise_clip": 0.5,
-
-        # other changes & things we want to keep fixed: IID Gaussian
-        # exploration noise, larger actor learning rate, no l2 regularisation,
-        # no Huber loss, etc.
-        "exploration_should_anneal": False,
-        "exploration_noise_type": "gaussian",
-        "exploration_gaussian_sigma": 0.1,
-        "learning_starts": 10000,
-        "pure_exploration_steps": 10000,
-        "actor_hiddens": [400, 300],
-        "critic_hiddens": [400, 300],
-        "n_step": 1,
-        "gamma": 0.99,
-        "actor_lr": 1e-3,
-        "critic_lr": 1e-3,
-        "l2_reg": 0.0,
-        "tau": 5e-3,
-        "train_batch_size": 100,
-        "use_huber": False,
-        "target_network_update_freq": 0,
-        "num_workers": 0,
-        "num_gpus_per_worker": 0,
-        "per_worker_exploration": False,
-        "worker_side_prioritization": False,
-        "buffer_size": 1000000,
-        "prioritized_replay": False,
-        "clip_rewards": False,
-        "use_state_preprocessor": False,
-    },
-)
-
-# TD3 is the DDPG trainer with a different name and default config.
-TD3Trainer = DDPGTrainer.with_updates(
-    name="TD3", default_config=TD3_DEFAULT_CONFIG)
@@ -1 +0,0 @@
-Code in this package is adapted from https://github.com/openai/baselines/tree/master/baselines/deepq.
@@ -1,15 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.dqn.apex import ApexTrainer
-from ray.rllib.agents.dqn.dqn import DQNTrainer, SimpleQTrainer, DEFAULT_CONFIG
-from ray.rllib.utils import renamed_agent
-
-# Aliases under the older "*Agent" names, produced by renamed_agent from the
-# trainer classes (presumably kept for backward compatibility -- confirm
-# against renamed_agent).
-DQNAgent = renamed_agent(DQNTrainer)
-ApexAgent = renamed_agent(ApexTrainer)
-
-# public names of this package
-__all__ = [
-    "DQNAgent", "ApexAgent", "ApexTrainer", "DQNTrainer", "DEFAULT_CONFIG",
-    "SimpleQTrainer"
-]
@@ -1,84 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.dqn.dqn import DQNTrainer, DEFAULT_CONFIG as DQN_CONFIG
-from ray.rllib.optimizers import AsyncReplayOptimizer
-from ray.rllib.utils import merge_dicts
-
-# yapf: disable
-# __sphinx_doc_begin__
-# Ape-X defaults: the DQN default config with the entries below merged over
-# it. The nested "optimizer" dict is itself merged over DQN's optimizer
-# settings.
-APEX_DEFAULT_CONFIG = merge_dicts(
-    DQN_CONFIG,  # see also the options in dqn.py, which are also supported
-    {
-        "optimizer": merge_dicts(
-            DQN_CONFIG["optimizer"], {
-                "max_weight_sync_delay": 400,
-                "num_replay_buffer_shards": 4,
-                "debug": False
-            }),
-        "n_step": 3,
-        "num_gpus": 1,
-        "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
-        "train_batch_size": 512,
-        "sample_batch_size": 50,
-        "target_network_update_freq": 500000,
-        "timesteps_per_iteration": 25000,
-        "per_worker_exploration": True,
-        "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
-    },
-)
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def defer_make_workers(trainer, env_creator, policy, config):
-    """Create the trainer's workers with zero remote workers for now."""
-    # Hack to work around https://github.com/ray-project/ray/issues/2541
-    # The workers will be created later, after the optimizer is created
-    num_initial_workers = 0
-    return trainer._make_workers(env_creator, policy, config,
-                                 num_initial_workers)
-
-
-def make_async_optimizer(workers, config):
-    """Create the AsyncReplayOptimizer, then add the remote workers.
-
-    Expects `workers` to have no remote workers yet; they are added after
-    the optimizer has been constructed and are then handed to it.
-    """
-    assert len(workers.remote_workers()) == 0
-
-    # optimizer settings, overlaid with the prioritized replay options that
-    # are present in the top-level config
-    replay_keys = ("prioritized_replay", "prioritized_replay_alpha",
-                   "prioritized_replay_beta", "prioritized_replay_eps")
-    optimizer_kwargs = config["optimizer"].copy()
-    optimizer_kwargs.update(
-        {key: config[key]
-         for key in replay_keys if key in config})
-
-    optimizer = AsyncReplayOptimizer(
-        workers,
-        learning_starts=config["learning_starts"],
-        buffer_size=config["buffer_size"],
-        train_batch_size=config["train_batch_size"],
-        sample_batch_size=config["sample_batch_size"],
-        **optimizer_kwargs)
-
-    workers.add_workers(config["num_workers"])
-    optimizer._set_workers(workers.remote_workers())
-    return optimizer
-
-
-def update_target_based_on_num_steps_trained(trainer, fetches):
-    """Update the target networks once enough steps have been trained.
-
-    The update runs when the number of steps trained since the last target
-    update exceeds config["target_network_update_freq"]. `fetches` is not
-    used.
-    """
-    # Ape-X updates based on num steps trained, not sampled
-    steps_since_update = (trainer.optimizer.num_steps_trained -
-                          trainer.state["last_target_update_ts"])
-    if not steps_since_update > trainer.config["target_network_update_freq"]:
-        return
-
-    local_worker = trainer.workers.local_worker()
-    local_worker.foreach_trainable_policy(lambda p, _: p.update_target())
-    trainer.state["last_target_update_ts"] = (
-        trainer.optimizer.num_steps_trained)
-    trainer.state["num_target_updates"] += 1
-
-
-# Overrides that turn the DQN trainer into Ape-X: deferred worker creation,
-# the async replay optimizer, and target updates driven by the number of
-# steps trained.
-APEX_TRAINER_PROPERTIES = {
-    "make_workers": defer_make_workers,
-    "make_policy_optimizer": make_async_optimizer,
-    "after_optimizer_step": update_target_based_on_num_steps_trained,
-}
-
-# Ape-X is the DQN trainer with the overrides and default config above.
-ApexTrainer = DQNTrainer.with_updates(
-    name="APEX", default_config=APEX_DEFAULT_CONFIG, **APEX_TRAINER_PROPERTIES)
@@ -1,261 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-class DistributionalQModel(TFModelV2):
-    """Extension of standard TFModel to provide distributional Q values.
-
-    It also supports options for noisy nets and parameter space noise.
-
-    Data flow:
-        obs -> forward() -> model_out
-        model_out -> get_q_value_distributions() -> Q(s, a) atoms
-        model_out -> get_state_value() -> V(s)
-
-    Note that this class by itself is not a valid model unless you
-    implement forward() in a subclass."""
-
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 q_hiddens=(256, ),
-                 dueling=False,
-                 num_atoms=1,
-                 use_noisy=False,
-                 v_min=-10.0,
-                 v_max=10.0,
-                 sigma0=0.5,
-                 parameter_noise=False):
-        """Initialize variables of this model.
-
-        Extra model kwargs:
-            q_hiddens (list): defines size of hidden layers for the q head.
-                These will be used to postprocess the model output for the
-                purposes of computing Q values.
-            dueling (bool): whether to build the state value head for
-                dueling DQN
-            num_atoms (int): if >1, enables distributional DQN
-            use_noisy (bool): use noisy nets
-            v_min (float): min value support for distributional DQN
-            v_max (float): max value support for distributional DQN
-            sigma0 (float): initial value of noisy nets
-            parameter_noise (bool): enable layer norm for param noise
-
-        Note that the core layers for forward() are not defined here, this
-        only defines the layers for the Q head. Those layers for forward()
-        should be defined in subclasses of DistributionalQModel.
-        """
-
-        super(DistributionalQModel, self).__init__(
-            obs_space, action_space, num_outputs, model_config, name)
-
-        # setup the Q head output (i.e., model for get_q_values)
-        self.model_out = tf.keras.layers.Input(
-            shape=(num_outputs, ), name="model_out")
-
-        def build_action_value(model_out):
-            # Builds the Q head: optional hidden layers, then one output
-            # per (action, atom) pair.
-            if q_hiddens:
-                action_out = model_out
-                for i in range(len(q_hiddens)):
-                    if use_noisy:
-                        action_out = self._noisy_layer(
-                            "hidden_%d" % i, action_out, q_hiddens[i], sigma0)
-                    elif parameter_noise:
-                        # dense layer followed by layer normalization
-                        import tensorflow.contrib.layers as layers
-                        action_out = layers.fully_connected(
-                            action_out,
-                            num_outputs=q_hiddens[i],
-                            activation_fn=tf.nn.relu,
-                            normalizer_fn=layers.layer_norm)
-                    else:
-                        action_out = tf.layers.dense(
-                            action_out,
-                            units=q_hiddens[i],
-                            activation=tf.nn.relu,
-                            name="hidden_%d" % i)
-            else:
-                # Avoid postprocessing the outputs. This enables custom models
-                # to be used for parametric action DQN.
-                action_out = model_out
-            if use_noisy:
-                action_scores = self._noisy_layer(
-                    "output",
-                    action_out,
-                    self.action_space.n * num_atoms,
-                    sigma0,
-                    non_linear=False)
-            elif q_hiddens:
-                action_scores = tf.layers.dense(
-                    action_out,
-                    units=self.action_space.n * num_atoms,
-                    activation=None)
-            else:
-                # no hidden layers and no noisy nets: model_out is used as
-                # the action scores directly
-                action_scores = model_out
-            if num_atoms > 1:
-                # Distributional Q-learning uses a discrete support z
-                # to represent the action value distribution
-                z = tf.range(num_atoms, dtype=tf.float32)
-                # num_atoms evenly spaced values from v_min to v_max
-                z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
-                support_logits_per_action = tf.reshape(
-                    tensor=action_scores,
-                    shape=(-1, self.action_space.n, num_atoms))
-                support_prob_per_action = tf.nn.softmax(
-                    logits=support_logits_per_action)
-                # Q-value = sum over atoms of support value * probability
-                action_scores = tf.reduce_sum(
-                    input_tensor=z * support_prob_per_action, axis=-1)
-                logits = support_logits_per_action
-                dist = support_prob_per_action
-                return [
-                    action_scores, z, support_logits_per_action, logits, dist
-                ]
-            else:
-                # non-distributional case: logits and dist are all-ones
-                # tensors with a trailing axis of size 1
-                logits = tf.expand_dims(tf.ones_like(action_scores), -1)
-                dist = tf.expand_dims(tf.ones_like(action_scores), -1)
-                return [action_scores, logits, dist]
-
-        def build_state_score(model_out):
-            # Builds the state value head with num_atoms outputs.
-            # NOTE: unlike build_action_value there is no truthiness check
-            # on q_hiddens here, so q_hiddens must support len().
-            state_out = model_out
-            for i in range(len(q_hiddens)):
-                if use_noisy:
-                    state_out = self._noisy_layer("dueling_hidden_%d" % i,
-                                                  state_out, q_hiddens[i],
-                                                  sigma0)
-                elif parameter_noise:
-                    state_out = tf.contrib.layers.fully_connected(
-                        state_out,
-                        num_outputs=q_hiddens[i],
-                        activation_fn=tf.nn.relu,
-                        normalizer_fn=tf.contrib.layers.layer_norm)
-                else:
-                    state_out = tf.layers.dense(
-                        state_out, units=q_hiddens[i], activation=tf.nn.relu)
-            if use_noisy:
-                state_score = self._noisy_layer(
-                    "dueling_output",
-                    state_out,
-                    num_atoms,
-                    sigma0,
-                    non_linear=False)
-            else:
-                state_score = tf.layers.dense(
-                    state_out, units=num_atoms, activation=None)
-            return state_score
-
-        # The two wrappers below build the heads inside variable scopes
-        # prefixed with the model name, with AUTO_REUSE enabled.
-        def build_action_value_in_scope(model_out):
-            with tf.variable_scope(
-                    name + "/action_value", reuse=tf.AUTO_REUSE):
-                return build_action_value(model_out)
-
-        def build_state_score_in_scope(model_out):
-            with tf.variable_scope(name + "/state_value", reuse=tf.AUTO_REUSE):
-                return build_state_score(model_out)
-
-        # Wrap the Q head in a Keras model and register its variables.
-        q_out = tf.keras.layers.Lambda(build_action_value_in_scope)(
-            self.model_out)
-        self.q_value_head = tf.keras.Model(self.model_out, q_out)
-        self.register_variables(self.q_value_head.variables)
-
-        # The state value head only exists when dueling is enabled.
-        if dueling:
-            state_out = tf.keras.layers.Lambda(build_state_score_in_scope)(
-                self.model_out)
-            self.state_value_head = tf.keras.Model(self.model_out, state_out)
-            self.register_variables(self.state_value_head.variables)
-
-    def forward(self, input_dict, state, seq_lens):
-        """This generates the model_out tensor input.
-
-        You must implement this as documented in modelv2.py."""
-        raise NotImplementedError
-
-    def get_q_value_distributions(self, model_out):
-        """Returns distributional values for Q(s, a) given a state embedding.
-
-        Override this in your custom model to customize the Q output head.
-
-        Arguments:
-            model_out (Tensor): embedding from the model layers
-
-        Returns:
-            (action_scores, logits, dist) if num_atoms == 1, otherwise
-            (action_scores, z, support_logits_per_action, logits, dist)
-        """
-
-        return self.q_value_head(model_out)
-
-    def get_state_value(self, model_out):
-        """Returns the state value prediction for the given state embedding.
-
-        The state value head is only created when the model was constructed
-        with dueling=True."""
-
-        return self.state_value_head(model_out)
-
-    def _noisy_layer(self,
-                     prefix,
-                     action_in,
-                     out_size,
-                     sigma0,
-                     non_linear=True):
-        r"""Build a noisy dense layer on top of `action_in`.
-
-        a common dense layer: y = w^{T}x + b
-        a noisy layer: y = (w + \epsilon_w*\sigma_w)^{T}x +
-            (b+\epsilon_b*\sigma_b)
-        where \epsilon are random variables sampled from factorized normal
-        distributions and \sigma are trainable variables which are expected to
-        vanish along the training procedure
-
-        Arguments:
-            prefix (str): name prefix for the four variables of the layer
-            action_in (Tensor): layer input; its second dimension is used as
-                the input size
-            out_size (int): number of output units
-            sigma0 (float): scales the initial value of the bias noise sigma
-            non_linear (bool): if True, apply ReLU to the output
-        """
-        import tensorflow.contrib.layers as layers
-
-        in_size = int(action_in.shape[1])
-
-        # factorized noise: one noise vector per input and per output unit,
-        # combined via an outer product for the weight noise
-        epsilon_in = tf.random_normal(shape=[in_size])
-        epsilon_out = tf.random_normal(shape=[out_size])
-        epsilon_in = self._f_epsilon(epsilon_in)
-        epsilon_out = self._f_epsilon(epsilon_out)
-        epsilon_w = tf.matmul(
-            a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
-        epsilon_b = epsilon_out
-        sigma_w = tf.get_variable(
-            name=prefix + "_sigma_w",
-            shape=[in_size, out_size],
-            dtype=tf.float32,
-            initializer=tf.random_uniform_initializer(
-                minval=-1.0 / np.sqrt(float(in_size)),
-                maxval=1.0 / np.sqrt(float(in_size))))
-        # TF noise generation can be unreliable on GPU
-        # If generating the noise on the CPU,
-        # lowering sigma0 to 0.1 may be helpful
-        sigma_b = tf.get_variable(
-            name=prefix + "_sigma_b",
-            shape=[out_size],
-            dtype=tf.float32,  # 0.5~GPU, 0.1~CPU
-            initializer=tf.constant_initializer(
-                sigma0 / np.sqrt(float(in_size))))
-
-        # the noise-free weights and bias of the layer
-        w = tf.get_variable(
-            name=prefix + "_fc_w",
-            shape=[in_size, out_size],
-            dtype=tf.float32,
-            initializer=layers.xavier_initializer())
-        b = tf.get_variable(
-            name=prefix + "_fc_b",
-            shape=[out_size],
-            dtype=tf.float32,
-            initializer=tf.zeros_initializer())
-
-        action_activation = tf.nn.xw_plus_b(action_in, w + sigma_w * epsilon_w,
-                                            b + sigma_b * epsilon_b)
-
-        if not non_linear:
-            return action_activation
-        return tf.nn.relu(action_activation)
-
-    def _f_epsilon(self, x):
-        """Return sign(x) * sqrt(|x|), applied to the sampled noise."""
-        return tf.sign(x) * tf.sqrt(tf.abs(x))
@@ -1,300 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-
-from ray import tune
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
-from ray.rllib.agents.dqn.simple_q_policy import SimpleQPolicy
-from ray.rllib.optimizers import SyncReplayOptimizer
-from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
-
-logger = logging.getLogger(__name__)
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # === Model ===
-    # Number of atoms for representing the distribution of return. When
-    # this is greater than 1, distributional Q-learning is used.
-    # The discrete supports are bounded by v_min and v_max.
-    "num_atoms": 1,
-    "v_min": -10.0,
-    "v_max": 10.0,
-    # Whether to use noisy network
-    "noisy": False,
-    # Controls the initial value of the noisy-net sigma parameters
-    "sigma0": 0.5,
-    # Whether to use dueling dqn
-    "dueling": True,
-    # Whether to use double dqn
-    "double_q": True,
-    # Postprocess model outputs with these hidden layers to compute the
-    # state and action values. See also the model config in catalog.py.
-    "hiddens": [256],
-    # N-step Q learning
-    "n_step": 1,
-
-    # === Exploration ===
-    # Max num timesteps for annealing schedules. Exploration is annealed from
-    # 1.0 to exploration_final_eps over this number of timesteps scaled by
-    # exploration_fraction
-    "schedule_max_timesteps": 100000,
-    # Minimum env steps to optimize for per train call. This value does
-    # not affect learning, only the length of iterations.
-    "timesteps_per_iteration": 1000,
-    # Fraction of entire training period over which the exploration rate is
-    # annealed
-    "exploration_fraction": 0.1,
-    # Final value of random action probability
-    "exploration_final_eps": 0.02,
-    # Update the target network every `target_network_update_freq` steps.
-    "target_network_update_freq": 500,
-    # Use softmax for sampling actions. Required for off policy estimation.
-    "soft_q": False,
-    # Softmax temperature. Q values are divided by this value prior to softmax.
-    # Softmax approaches argmax as the temperature drops to zero.
-    "softmax_temp": 1.0,
-    # If True parameter space noise will be used for exploration
-    # See https://blog.openai.com/better-exploration-with-parameter-noise/
-    "parameter_noise": False,
-    # Extra configuration that disables exploration.
-    "evaluation_config": {
-        "exploration_fraction": 0,
-        "exploration_final_eps": 0,
-    },
-
-    # === Replay buffer ===
-    # Size of the replay buffer. Note that if async_updates is set, then
-    # each worker will have a replay buffer of this size.
-    "buffer_size": 50000,
-    # If True prioritized replay buffer will be used.
-    "prioritized_replay": True,
-    # Alpha parameter for prioritized replay buffer.
-    "prioritized_replay_alpha": 0.6,
-    # Beta parameter for sampling from prioritized replay buffer.
-    "prioritized_replay_beta": 0.4,
-    # Fraction of entire training period over which the beta parameter is
-    # annealed
-    "beta_annealing_fraction": 0.2,
-    # Final value of beta
-    "final_prioritized_replay_beta": 0.4,
-    # Epsilon to add to the TD errors when updating priorities.
-    "prioritized_replay_eps": 1e-6,
-    # Whether to LZ4 compress observations
-    "compress_observations": True,
-
-    # === Optimization ===
-    # Learning rate for adam optimizer
-    "lr": 5e-4,
-    # Learning rate schedule
-    "lr_schedule": None,
-    # Adam epsilon hyper parameter
-    "adam_epsilon": 1e-8,
-    # If not None, clip gradients during optimization at this value
-    "grad_norm_clipping": 40,
-    # How many steps of the model to sample before learning starts.
-    "learning_starts": 1000,
-    # Update the replay buffer with this many samples at once. Note that
-    # this setting applies per-worker if num_workers > 1.
-    "sample_batch_size": 4,
-    # Size of a batch sampled from the replay buffer for training. Note that
-    # if async_updates is set, then each worker returns gradients for a
-    # batch of this size.
-    "train_batch_size": 32,
-
-    # === Parallelism ===
-    # Number of workers for collecting samples with. This only makes sense
-    # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    "num_workers": 0,
-    # Whether to use a distribution of epsilons across workers for exploration.
-    "per_worker_exploration": False,
-    # Whether to compute priorities on workers.
-    "worker_side_prioritization": False,
-    # Prevent iterations from going lower than this time span
-    "min_iter_time_s": 1,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def make_optimizer(workers, config):
-    """Build the SyncReplayOptimizer used by the off-policy trainers.
-
-    Args:
-        workers: worker set handed straight to SyncReplayOptimizer.
-        config (dict): trainer config; the replay-buffer and batch-size
-            settings are read from it by name.
-
-    Returns:
-        The constructed SyncReplayOptimizer.
-    """
-    return SyncReplayOptimizer(
-        workers,
-        learning_starts=config["learning_starts"],
-        buffer_size=config["buffer_size"],
-        prioritized_replay=config["prioritized_replay"],
-        prioritized_replay_alpha=config["prioritized_replay_alpha"],
-        prioritized_replay_beta=config["prioritized_replay_beta"],
-        schedule_max_timesteps=config["schedule_max_timesteps"],
-        beta_annealing_fraction=config["beta_annealing_fraction"],
-        final_prioritized_replay_beta=config["final_prioritized_replay_beta"],
-        prioritized_replay_eps=config["prioritized_replay_eps"],
-        train_batch_size=config["train_batch_size"],
-        sample_batch_size=config["sample_batch_size"],
-        # Any extra keyword arguments come from the "optimizer" sub-config.
-        **config["optimizer"])
-
-
-def check_config_and_setup_param_noise(config):
-    """Update the config based on settings.
-
-    Rewrites sample_batch_size to take into account n_step truncation, and also
-    adds the necessary callbacks to support parameter space noise exploration.
-    """
-
-    # Update effective batch size to include n-step
-    adjusted_batch_size = max(config["sample_batch_size"],
-                              config.get("n_step", 1))
-    config["sample_batch_size"] = adjusted_batch_size
-
-    if config.get("parameter_noise", False):
-        if config["batch_mode"] != "complete_episodes":
-            raise ValueError("Exploration with parameter space noise requires "
-                             "batch_mode to be complete_episodes.")
-        if config.get("noisy", False):
-            raise ValueError(
-                "Exploration with parameter space noise and noisy network "
-                "cannot be used at the same time.")
-        if config["callbacks"]["on_episode_start"]:
-            start_callback = config["callbacks"]["on_episode_start"]
-        else:
-            start_callback = None
-
-        def on_episode_start(info):
-            # as a callback function to sample and pose parameter space
-            # noise on the parameters of network
-            policies = info["policy"]
-            for pi in policies.values():
-                pi.add_parameter_noise()
-            if start_callback:
-                start_callback(info)
-
-        config["callbacks"]["on_episode_start"] = tune.function(
-            on_episode_start)
-        if config["callbacks"]["on_episode_end"]:
-            end_callback = config["callbacks"]["on_episode_end"]
-        else:
-            end_callback = None
-
-        def on_episode_end(info):
-            # as a callback function to monitor the distance
-            # between noisy policy and original policy
-            policies = info["policy"]
-            episode = info["episode"]
-            episode.custom_metrics["policy_distance"] = policies[
-                DEFAULT_POLICY_ID].model.pi_distance
-            if end_callback:
-                end_callback(info)
-
-        config["callbacks"]["on_episode_end"] = tune.function(on_episode_end)
-
-
-def get_initial_state(config):
-    return {
-        "last_target_update_ts": 0,
-        "num_target_updates": 0,
-    }
-
-
-def make_exploration_schedule(config, worker_index):
-    # Use either a different `eps` per worker, or a linear schedule.
-    if config["per_worker_exploration"]:
-        assert config["num_workers"] > 1, \
-            "This requires multiple workers"
-        if worker_index >= 0:
-            # Exploration constants from the Ape-X paper
-            exponent = (
-                1 + worker_index / float(config["num_workers"] - 1) * 7)
-            return ConstantSchedule(0.4**exponent)
-        else:
-            # local ev should have zero exploration so that eval rollouts
-            # run properly
-            return ConstantSchedule(0.0)
-    return LinearSchedule(
-        schedule_timesteps=int(
-            config["exploration_fraction"] * config["schedule_max_timesteps"]),
-        initial_p=1.0,
-        final_p=config["exploration_final_eps"])
-
-
-def setup_exploration(trainer):
-    trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
-    trainer.explorations = [
-        make_exploration_schedule(trainer.config, i)
-        for i in range(trainer.config["num_workers"])
-    ]
-
-
-def update_worker_explorations(trainer):
-    global_timestep = trainer.optimizer.num_steps_sampled
-    exp_vals = [trainer.exploration0.value(global_timestep)]
-    trainer.workers.local_worker().foreach_trainable_policy(
-        lambda p, _: p.set_epsilon(exp_vals[0]))
-    for i, e in enumerate(trainer.workers.remote_workers()):
-        exp_val = trainer.explorations[i].value(global_timestep)
-        e.foreach_trainable_policy.remote(lambda p, _: p.set_epsilon(exp_val))
-        exp_vals.append(exp_val)
-    trainer.train_start_timestep = global_timestep
-    trainer.cur_exp_vals = exp_vals
-
-
-def add_trainer_metrics(trainer, result):
-    global_timestep = trainer.optimizer.num_steps_sampled
-    result.update(
-        timesteps_this_iter=global_timestep - trainer.train_start_timestep,
-        info=dict({
-            "min_exploration": min(trainer.cur_exp_vals),
-            "max_exploration": max(trainer.cur_exp_vals),
-            "num_target_updates": trainer.state["num_target_updates"],
-        }, **trainer.optimizer.stats()))
-
-
-def update_target_if_needed(trainer, fetches):
-    global_timestep = trainer.optimizer.num_steps_sampled
-    if global_timestep - trainer.state["last_target_update_ts"] > \
-            trainer.config["target_network_update_freq"]:
-        trainer.workers.local_worker().foreach_trainable_policy(
-            lambda p, _: p.update_target())
-        trainer.state["last_target_update_ts"] = global_timestep
-        trainer.state["num_target_updates"] += 1
-
-
-def collect_metrics(trainer):
-    if trainer.config["per_worker_exploration"]:
-        # Only collect metrics from the third of workers with lowest eps
-        result = trainer.collect_metrics(
-            selected_workers=trainer.workers.remote_workers()[
-                -len(trainer.workers.remote_workers()) // 3:])
-    else:
-        result = trainer.collect_metrics()
-    return result
-
-
-def disable_exploration(trainer):
-    trainer.evaluation_workers.local_worker().foreach_policy(
-        lambda p, _: p.set_epsilon(0))
-
-
-# Generic off-policy trainer template. It has no default policy of its own
-# (default_policy=None); the trainers below supply one via with_updates().
-GenericOffPolicyTrainer = build_trainer(
-    name="GenericOffPolicyAlgorithm",
-    default_policy=None,
-    default_config=DEFAULT_CONFIG,
-    validate_config=check_config_and_setup_param_noise,
-    get_initial_state=get_initial_state,
-    make_policy_optimizer=make_optimizer,
-    before_init=setup_exploration,
-    before_train_step=update_worker_explorations,
-    after_optimizer_step=update_target_if_needed,
-    after_train_result=add_trainer_metrics,
-    collect_metrics_fn=collect_metrics,
-    before_evaluate_fn=disable_exploration)
-
-# DQN: the generic trainer with DQNTFPolicy as its default policy.
-DQNTrainer = GenericOffPolicyTrainer.with_updates(
-    name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)
-
-# Simple Q: same as DQNTrainer except that SimpleQPolicy is the default policy.
-SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQPolicy)
@@ -1,504 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from gym.spaces import Discrete
-import numpy as np
-from scipy.stats import entropy
-
-import ray
-from ray.rllib.agents.dqn.distributional_q_model import DistributionalQModel
-from ray.rllib.agents.dqn.simple_q_policy import ExplorationStateMixin, \
-    TargetNetworkMixin
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.models import ModelCatalog
-from ray.rllib.models.tf.tf_action_dist import Categorical
-from ray.rllib.utils.error import UnsupportedSpaceException
-from ray.rllib.policy.tf_policy import LearningRateSchedule
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.utils.tf_ops import huber_loss, reduce_mean_ignore_inf, \
-    minimize_and_clip
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-# Names given to the online and the target Q model when they are created
-# through ModelCatalog.get_model_v2().
-Q_SCOPE = "q_func"
-Q_TARGET_SCOPE = "target_q_func"
-
-# Importance sampling weights for prioritized replay
-PRIO_WEIGHTS = "weights"
-
-
-class QLoss(object):
-    """Builds the DQN loss tensors.
-
-    After construction the object exposes:
-        td_error: per-sample error (the cross entropy in the distributional
-            case, otherwise Q(s, a) minus the stopped-gradient target).
-        loss: scalar mean of the importance-weighted per-sample loss.
-        stats: dict of scalar summary tensors.
-
-    With num_atoms > 1 the distributional branch is used; otherwise the
-    standard Huber-loss TD branch is used.
-    """
-
-    def __init__(self,
-                 q_t_selected,
-                 q_logits_t_selected,
-                 q_tp1_best,
-                 q_dist_tp1_best,
-                 importance_weights,
-                 rewards,
-                 done_mask,
-                 gamma=0.99,
-                 n_step=1,
-                 num_atoms=1,
-                 v_min=-10.0,
-                 v_max=10.0):
-
-        if num_atoms > 1:
-            # Distributional Q-learning which corresponds to an entropy loss
-
-            # Support of the return distribution: num_atoms evenly spaced
-            # values from v_min to v_max.
-            z = tf.range(num_atoms, dtype=tf.float32)
-            z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
-
-            # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
-            r_tau = tf.expand_dims(
-                rewards, -1) + gamma**n_step * tf.expand_dims(
-                    1.0 - done_mask, -1) * tf.expand_dims(z, 0)
-            r_tau = tf.clip_by_value(r_tau, v_min, v_max)
-            # Fractional atom index of each shifted support value, and its
-            # lower / upper integer neighbours.
-            b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
-            lb = tf.floor(b)
-            ub = tf.ceil(b)
-            # This check is indispensable and is missed in most
-            # implementations: when b happens to be an integer, lb == ub, so
-            # pr_j(s', a*) would be discarded because (ub-b) == (b-lb) == 0.
-            floor_equal_ceil = tf.to_float(tf.less(ub - lb, 0.5))
-
-            l_project = tf.one_hot(
-                tf.cast(lb, dtype=tf.int32),
-                num_atoms)  # (batch_size, num_atoms, num_atoms)
-            u_project = tf.one_hot(
-                tf.cast(ub, dtype=tf.int32),
-                num_atoms)  # (batch_size, num_atoms, num_atoms)
-            # Split each atom's probability mass between its two neighbours;
-            # floor_equal_ceil sends the whole mass to lb when lb == ub.
-            ml_delta = q_dist_tp1_best * (ub - b + floor_equal_ceil)
-            mu_delta = q_dist_tp1_best * (b - lb)
-            ml_delta = tf.reduce_sum(
-                l_project * tf.expand_dims(ml_delta, -1), axis=1)
-            mu_delta = tf.reduce_sum(
-                u_project * tf.expand_dims(mu_delta, -1), axis=1)
-            # Projected target distribution over the support.
-            m = ml_delta + mu_delta
-
-            # Rainbow paper claims that using this cross entropy loss for
-            # priority is robust and insensitive to `prioritized_replay_alpha`
-            self.td_error = tf.nn.softmax_cross_entropy_with_logits(
-                labels=m, logits=q_logits_t_selected)
-            self.loss = tf.reduce_mean(self.td_error * importance_weights)
-            self.stats = {
-                # TODO: better Q stats for dist dqn
-                "mean_td_error": tf.reduce_mean(self.td_error),
-            }
-        else:
-            # Zero out the bootstrap value where done_mask is 1.
-            q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
-
-            # compute RHS of bellman equation
-            q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
-
-            # compute the error (potentially clipped)
-            self.td_error = (
-                q_t_selected - tf.stop_gradient(q_t_selected_target))
-            self.loss = tf.reduce_mean(
-                importance_weights * huber_loss(self.td_error))
-            self.stats = {
-                "mean_q": tf.reduce_mean(q_t_selected),
-                "min_q": tf.reduce_min(q_t_selected),
-                "max_q": tf.reduce_max(q_t_selected),
-                "mean_td_error": tf.reduce_mean(self.td_error),
-            }
-
-
-class QValuePolicy(object):
-    """Builds the action-selection tensors from Q values.
-
-    Exposes `action` and `action_prob`. With softmax=True the action is
-    sampled from a Categorical over q_values / softmax_temp and action_prob
-    is the probability of the sampled action. Otherwise the action is
-    epsilon-greedy (when `stochastic` is true) or greedy, and action_prob
-    is None.
-    """
-
-    def __init__(self, q_values, observations, num_actions, stochastic, eps,
-                 softmax, softmax_temp):
-        if softmax:
-            action_dist = Categorical(q_values / softmax_temp)
-            self.action = action_dist.sample()
-            self.action_prob = action_dist.sampled_action_prob()
-            return
-
-        deterministic_actions = tf.argmax(q_values, axis=1)
-        batch_size = tf.shape(observations)[0]
-
-        # Special case masked out actions (q_value ~= -inf) so that we don't
-        # even consider them for exploration.
-        random_valid_action_logits = tf.where(
-            tf.equal(q_values, tf.float32.min),
-            tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
-        random_actions = tf.squeeze(
-            tf.multinomial(random_valid_action_logits, 1), axis=1)
-
-        # Per-row choice: take the random action where a uniform draw is
-        # below eps, otherwise the greedy one.
-        chose_random = tf.random_uniform(
-            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
-        stochastic_actions = tf.where(chose_random, random_actions,
-                                      deterministic_actions)
-        self.action = tf.cond(stochastic, lambda: stochastic_actions,
-                              lambda: deterministic_actions)
-        self.action_prob = None
-
-
-class ComputeTDErrorMixin(object):
-    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
-                         importance_weights):
-        if not self.loss_initialized():
-            return np.zeros_like(rew_t)
-
-        td_err = self.get_session().run(
-            self.q_loss.td_error,
-            feed_dict={
-                self.get_placeholder(SampleBatch.CUR_OBS): [
-                    np.array(ob) for ob in obs_t
-                ],
-                self.get_placeholder(SampleBatch.ACTIONS): act_t,
-                self.get_placeholder(SampleBatch.REWARDS): rew_t,
-                self.get_placeholder(SampleBatch.NEXT_OBS): [
-                    np.array(ob) for ob in obs_tp1
-                ],
-                self.get_placeholder(SampleBatch.DONES): done_mask,
-                self.get_placeholder(PRIO_WEIGHTS): importance_weights,
-            })
-        return td_err
-
-
-def postprocess_trajectory(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    """Postprocess a sampled batch; adapts parameter noise if enabled.
-
-    When parameter_noise is on, the noise is removed from the network, the
-    distance between the noisy and the clean action distribution is stored
-    in policy.pi_distance, and the noise sigma is scaled by 1.01 up or
-    down. The batch is then passed through _postprocess_dqn().
-    other_agent_batches and episode are not used.
-    """
-    if policy.config["parameter_noise"]:
-        # adjust the sigma of parameter space noise
-        states = [list(x) for x in sample_batch.columns(["obs"])][0]
-
-        # The order matters: evaluate with noise, remove the noise, then
-        # evaluate again on the clean parameters.
-        noisy_action_distribution = policy.get_session().run(
-            policy.action_probs, feed_dict={policy.cur_observations: states})
-        policy.get_session().run(policy.remove_noise_op)
-        clean_action_distribution = policy.get_session().run(
-            policy.action_probs, feed_dict={policy.cur_observations: states})
-        # Mean over the batch of scipy.stats.entropy(clean, noisy).
-        distance_in_action_space = np.mean(
-            entropy(clean_action_distribution.T, noisy_action_distribution.T))
-        policy.pi_distance = distance_in_action_space
-        # Grow sigma while the distance is below the epsilon-derived
-        # threshold, shrink it otherwise.
-        if (distance_in_action_space <
-                -np.log(1 - policy.cur_epsilon +
-                        policy.cur_epsilon / policy.num_actions)):
-            policy.parameter_noise_sigma_val *= 1.01
-        else:
-            policy.parameter_noise_sigma_val /= 1.01
-        policy.parameter_noise_sigma.load(
-            policy.parameter_noise_sigma_val, session=policy.get_session())
-
-    return _postprocess_dqn(policy, sample_batch)
-
-
-def build_q_model(policy, obs_space, action_space, config):
-
-    if not isinstance(action_space, Discrete):
-        raise UnsupportedSpaceException(
-            "Action space {} is not supported for DQN.".format(action_space))
-
-    if config["hiddens"]:
-        # try to infer the last layer size, otherwise fall back to 256
-        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
-        config["model"]["no_final_linear"] = True
-    else:
-        num_outputs = action_space.n
-
-    policy.q_model = ModelCatalog.get_model_v2(
-        obs_space,
-        action_space,
-        num_outputs,
-        config["model"],
-        framework="tf",
-        model_interface=DistributionalQModel,
-        name=Q_SCOPE,
-        num_atoms=config["num_atoms"],
-        q_hiddens=config["hiddens"],
-        dueling=config["dueling"],
-        use_noisy=config["noisy"],
-        v_min=config["v_min"],
-        v_max=config["v_max"],
-        sigma0=config["sigma0"],
-        parameter_noise=config["parameter_noise"])
-
-    policy.target_q_model = ModelCatalog.get_model_v2(
-        obs_space,
-        action_space,
-        num_outputs,
-        config["model"],
-        framework="tf",
-        model_interface=DistributionalQModel,
-        name=Q_TARGET_SCOPE,
-        num_atoms=config["num_atoms"],
-        q_hiddens=config["hiddens"],
-        dueling=config["dueling"],
-        use_noisy=config["noisy"],
-        v_min=config["v_min"],
-        v_max=config["v_max"],
-        sigma0=config["sigma0"],
-        parameter_noise=config["parameter_noise"])
-
-    return policy.q_model
-
-
-def build_q_networks(policy, q_model, input_dict, obs_space, action_space,
-                     config):
-    """Build the action-sampling part of the graph.
-
-    Stores q_values, q_func_vars, output_actions and action_prob on the
-    policy (plus the parameter-noise ops and action_probs when
-    parameter_noise is enabled).
-
-    Returns:
-        Tuple of (policy.output_actions, policy.action_prob).
-    """
-
-    # Action Q network
-    q_values, q_logits, q_dist = _compute_q_values(
-        policy, q_model, input_dict[SampleBatch.CUR_OBS], obs_space,
-        action_space)
-    policy.q_values = q_values
-    policy.q_func_vars = q_model.variables()
-
-    # Noise vars for Q network except for layer normalization vars
-    if config["parameter_noise"]:
-        _build_parameter_noise(
-            policy,
-            [var for var in policy.q_func_vars if "LayerNorm" not in var.name])
-        policy.action_probs = tf.nn.softmax(policy.q_values)
-
-    # Action outputs
-    qvp = QValuePolicy(q_values, input_dict[SampleBatch.CUR_OBS],
-                       action_space.n, policy.stochastic, policy.eps,
-                       config["soft_q"], config["softmax_temp"])
-    policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob
-
-    return policy.output_actions, policy.action_prob
-
-
-def _build_parameter_noise(policy, pnet_params):
-    """Create the variables and ops for parameter space noise.
-
-    For every variable in pnet_params a non-trainable "<name>_noise"
-    variable is created. The following attributes are set on the policy:
-    parameter_noise_sigma_val, parameter_noise_sigma, parameter_noise,
-    remove_noise_op, add_noise_op and pi_distance (initially None).
-    """
-    policy.parameter_noise_sigma_val = 1.0
-    policy.parameter_noise_sigma = tf.get_variable(
-        initializer=tf.constant_initializer(policy.parameter_noise_sigma_val),
-        name="parameter_noise_sigma",
-        shape=(),
-        trainable=False,
-        dtype=tf.float32)
-    policy.parameter_noise = list()
-    # No need to add any noise on LayerNorm parameters
-    for var in pnet_params:
-        noise_var = tf.get_variable(
-            name=var.name.split(":")[0] + "_noise",
-            shape=var.shape,
-            initializer=tf.constant_initializer(.0),
-            trainable=False)
-        policy.parameter_noise.append(noise_var)
-    # remove_noise_op subtracts the stored noise from each parameter.
-    remove_noise_ops = list()
-    for var, var_noise in zip(pnet_params, policy.parameter_noise):
-        remove_noise_ops.append(tf.assign_add(var, -var_noise))
-    policy.remove_noise_op = tf.group(*tuple(remove_noise_ops))
-    # Ops that overwrite each noise variable with a fresh normal sample
-    # whose stddev is the current sigma.
-    generate_noise_ops = list()
-    for var_noise in policy.parameter_noise:
-        generate_noise_ops.append(
-            tf.assign(
-                var_noise,
-                tf.random_normal(
-                    shape=var_noise.shape,
-                    stddev=policy.parameter_noise_sigma)))
-    # add_noise_op is created under a control dependency on the generate
-    # ops, so fresh noise is sampled before it is added to the parameters.
-    with tf.control_dependencies(generate_noise_ops):
-        add_noise_ops = list()
-        for var, var_noise in zip(pnet_params, policy.parameter_noise):
-            add_noise_ops.append(tf.assign_add(var, var_noise))
-        policy.add_noise_op = tf.group(*tuple(add_noise_ops))
-    policy.pi_distance = None
-
-
-def build_q_losses(policy, batch_tensors):
-    """Build the Q loss for a training batch.
-
-    Stores the QLoss object as policy.q_loss and the target model's
-    variables as policy.target_q_func_vars.
-
-    Returns:
-        The scalar loss tensor policy.q_loss.loss.
-    """
-    config = policy.config
-    # q network evaluation
-    q_t, q_logits_t, q_dist_t = _compute_q_values(
-        policy, policy.q_model, batch_tensors[SampleBatch.CUR_OBS],
-        policy.observation_space, policy.action_space)
-
-    # target q network evaluation
-    q_tp1, q_logits_tp1, q_dist_tp1 = _compute_q_values(
-        policy, policy.target_q_model, batch_tensors[SampleBatch.NEXT_OBS],
-        policy.observation_space, policy.action_space)
-    policy.target_q_func_vars = policy.target_q_model.variables()
-
-    # q scores for actions which we know were selected in the given state.
-    one_hot_selection = tf.one_hot(
-        tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32),
-        policy.action_space.n)
-    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
-    q_logits_t_selected = tf.reduce_sum(
-        q_logits_t * tf.expand_dims(one_hot_selection, -1), 1)
-
-    # compute estimate of best possible value starting from state at t + 1
-    if config["double_q"]:
-        # Double Q: the online network picks the best next action and the
-        # target network's values are used for that action.
-        q_tp1_using_online_net, q_logits_tp1_using_online_net, \
-            q_dist_tp1_using_online_net = _compute_q_values(
-                policy, policy.q_model,
-                batch_tensors[SampleBatch.NEXT_OBS],
-                policy.observation_space, policy.action_space)
-        q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
-        q_tp1_best_one_hot_selection = tf.one_hot(q_tp1_best_using_online_net,
-                                                  policy.action_space.n)
-        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
-        q_dist_tp1_best = tf.reduce_sum(
-            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)
-    else:
-        # Plain Q: the target network both picks and evaluates the action.
-        q_tp1_best_one_hot_selection = tf.one_hot(
-            tf.argmax(q_tp1, 1), policy.action_space.n)
-        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
-        q_dist_tp1_best = tf.reduce_sum(
-            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)
-
-    policy.q_loss = QLoss(
-        q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best,
-        batch_tensors[PRIO_WEIGHTS], batch_tensors[SampleBatch.REWARDS],
-        tf.cast(batch_tensors[SampleBatch.DONES],
-                tf.float32), config["gamma"], config["n_step"],
-        config["num_atoms"], config["v_min"], config["v_max"])
-
-    return policy.q_loss.loss
-
-
-def adam_optimizer(policy, config):
-    return tf.train.AdamOptimizer(
-        learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"])
-
-
-def clip_gradients(policy, optimizer, loss):
-    if policy.config["grad_norm_clipping"] is not None:
-        grads_and_vars = minimize_and_clip(
-            optimizer,
-            loss,
-            var_list=policy.q_func_vars,
-            clip_val=policy.config["grad_norm_clipping"])
-    else:
-        grads_and_vars = optimizer.compute_gradients(
-            loss, var_list=policy.q_func_vars)
-    grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
-    return grads_and_vars
-
-
-def exploration_setting_inputs(policy):
-    return {
-        policy.stochastic: True,
-        policy.eps: policy.cur_epsilon,
-    }
-
-
-def build_q_stats(policy, batch_tensors):
-    return dict({
-        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
-    }, **policy.q_loss.stats)
-
-
-def setup_early_mixins(policy, obs_space, action_space, config):
-    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
-    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
-
-
-def setup_late_mixins(policy, obs_space, action_space, config):
-    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
-
-
-def _compute_q_values(policy, model, obs, obs_space, action_space):
-    """Run the model on obs and assemble Q values from its heads.
-
-    Returns:
-        Tuple (value, logits, dist). value is the per-action Q value; with
-        dueling it combines the state value with the mean-centered action
-        scores. logits and dist come from
-        model.get_q_value_distributions(), and are replaced by the dueling
-        support logits / probabilities when num_atoms > 1 and dueling is on.
-
-    obs_space and action_space are not used.
-    """
-    config = policy.config
-    model_out, state = model({
-        "obs": obs,
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    # The distributional head returns two extra values: the support z and
-    # the per-action support logits.
-    if config["num_atoms"] > 1:
-        (action_scores, z, support_logits_per_action, logits,
-         dist) = model.get_q_value_distributions(model_out)
-    else:
-        (action_scores, logits,
-         dist) = model.get_q_value_distributions(model_out)
-
-    if config["dueling"]:
-        state_score = model.get_state_value(model_out)
-        if config["num_atoms"] > 1:
-            # Center the support logits over axis 1, add the state score,
-            # then take the expectation over the support z.
-            support_logits_per_action_mean = tf.reduce_mean(
-                support_logits_per_action, 1)
-            support_logits_per_action_centered = (
-                support_logits_per_action - tf.expand_dims(
-                    support_logits_per_action_mean, 1))
-            support_logits_per_action = tf.expand_dims(
-                state_score, 1) + support_logits_per_action_centered
-            support_prob_per_action = tf.nn.softmax(
-                logits=support_logits_per_action)
-            value = tf.reduce_sum(
-                input_tensor=z * support_prob_per_action, axis=-1)
-            logits = support_logits_per_action
-            dist = support_prob_per_action
-        else:
-            # Center the action scores (mean ignores infinite entries) and
-            # add the state value.
-            action_scores_mean = reduce_mean_ignore_inf(action_scores, 1)
-            action_scores_centered = action_scores - tf.expand_dims(
-                action_scores_mean, 1)
-            value = state_score + action_scores_centered
-    else:
-        value = action_scores
-
-    return value, logits, dist
-
-
-def _adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
-    """Rewrites the given trajectory fragments to encode n-step rewards.
-
-    reward[i] = (
-        reward[i] * gamma**0 +
-        reward[i+1] * gamma**1 +
-        ... +
-        reward[i+n_step-1] * gamma**(n_step-1))
-
-    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.
-
-    At the end of the trajectory, n is truncated to fit in the traj length.
-    """
-
-    assert not any(dones[:-1]), "Unexpected done in middle of trajectory"
-
-    traj_length = len(rewards)
-    for i in range(traj_length):
-        for j in range(1, n_step):
-            if i + j < traj_length:
-                new_obs[i] = new_obs[i + j]
-                dones[i] = dones[i + j]
-                rewards[i] += gamma**j * rewards[i + j]
-
-
-def _postprocess_dqn(policy, batch):
-    # N-step Q adjustments
-    if policy.config["n_step"] > 1:
-        _adjust_nstep(policy.config["n_step"], policy.config["gamma"],
-                      batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
-                      batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
-                      batch[SampleBatch.DONES])
-
-    if PRIO_WEIGHTS not in batch:
-        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])
-
-    # Prioritize on the worker side
-    if batch.count > 0 and policy.config["worker_side_prioritization"]:
-        td_errors = policy.compute_td_error(
-            batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
-            batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
-            batch[SampleBatch.DONES], batch[PRIO_WEIGHTS])
-        new_priorities = (
-            np.abs(td_errors) + policy.config["prioritized_replay_eps"])
-        batch.data[PRIO_WEIGHTS] = new_priorities
-
-    return batch
-
-
-# The DQN TensorFlow policy, assembled from the functions and mixins defined
-# in this module.
-DQNTFPolicy = build_tf_policy(
-    name="DQNTFPolicy",
-    # Resolved lazily, when the lambda is called, through the full module path.
-    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
-    make_model=build_q_model,
-    action_sampler_fn=build_q_networks,
-    loss_fn=build_q_losses,
-    stats_fn=build_q_stats,
-    postprocess_fn=postprocess_trajectory,
-    optimizer_fn=adam_optimizer,
-    gradients_fn=clip_gradients,
-    extra_action_feed_fn=exploration_setting_inputs,
-    # Q values are fetched alongside the sampled actions.
-    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
-    # The TD error is fetched alongside the learner results.
-    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
-    before_init=setup_early_mixins,
-    after_init=setup_late_mixins,
-    obs_include_prev_action_reward=False,
-    mixins=[
-        ExplorationStateMixin,
-        TargetNetworkMixin,
-        ComputeTDErrorMixin,
-        LearningRateSchedule,
-    ])
@@ -1,82 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-class SimpleQModel(TFModelV2):
-    """Extension of standard TFModel to provide Q values.
-
-    Data flow:
-        obs -> forward() -> model_out
-        model_out -> get_q_values() -> Q(s, a)
-
-    Note that this class by itself is not a valid model unless you
-    implement forward() in a subclass."""
-
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 q_hiddens=(256, )):
-        """Initialize variables of this model.
-
-        Extra model kwargs:
-            q_hiddens (list): defines size of hidden layers for the q head.
-                These will be used to postprocess the model output for the
-                purposes of computing Q values. If empty, no Q head layers
-                are created and the model output is used as the Q values.
-
-        Note that the core layers for forward() are not defined here, this
-        only defines the layers for the Q head. Those layers for forward()
-        should be defined in subclasses of SimpleQModel.
-        """
-
-        super(SimpleQModel, self).__init__(obs_space, action_space,
-                                           num_outputs, model_config, name)
-
-        # setup the Q head output (i.e., model for get_q_values)
-        # Keras input that stands in for the tensor produced by forward().
-        self.model_out = tf.keras.layers.Input(
-            shape=(num_outputs, ), name="model_out")
-
-        if q_hiddens:
-            # One ReLU layer per entry of q_hiddens, followed by a linear
-            # layer with one output per action.
-            last_layer = self.model_out
-            for i, n in enumerate(q_hiddens):
-                last_layer = tf.keras.layers.Dense(
-                    n, name="q_hidden_{}".format(i),
-                    activation=tf.nn.relu)(last_layer)
-            q_out = tf.keras.layers.Dense(
-                action_space.n, activation=None, name="q_out")(last_layer)
-        else:
-            # No Q head: the head is the identity, so the Q values have
-            # num_outputs entries rather than action_space.n.
-            q_out = self.model_out
-
-        self.q_value_head = tf.keras.Model(self.model_out, q_out)
-        # Make the head's variables part of this model's registered variables.
-        self.register_variables(self.q_value_head.variables)
-
-    @override(ModelV2)
-    def forward(self, input_dict, state, seq_lens):
-        """This generates the model_out tensor input.
-
-        You must implement this as documented in modelv2.py."""
-        raise NotImplementedError
-
-    def get_q_values(self, model_out):
-        """Returns Q(s, a) given a feature tensor for the state.
-
-        Override this in your custom model to customize the Q output head.
-
-        Arguments:
-            model_out (Tensor): embedding from the model layers
-
-        Returns:
-            action scores Q(s, a) for each action, shape [None, action_space.n]
-        """
-
-        return self.q_value_head(model_out)
@@ -1,214 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-"""Basic example of a DQN policy without any optimizations."""
-
-from gym.spaces import Discrete
-import logging
-
-import ray
-from ray.rllib.agents.dqn.simple_q_model import SimpleQModel
-from ray.rllib.policy.policy import Policy
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.models import ModelCatalog
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils.error import UnsupportedSpaceException
-from ray.rllib.policy.tf_policy import TFPolicy
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.tf_ops import huber_loss
-
-tf = try_import_tf()
-logger = logging.getLogger(__name__)
-
-Q_SCOPE = "q_func"
-Q_TARGET_SCOPE = "target_q_func"
-
-
-class ExplorationStateMixin(object):
-    """Mixin holding the epsilon-greedy exploration state of a policy.
-
-    Creates the ``stochastic`` and ``eps`` placeholders and stores the
-    current epsilon so that it is saved and restored together with the
-    policy state.
-    """
-
-    def __init__(self, obs_space, action_space, config):
-        # Initial epsilon; updated through set_epsilon().
-        self.cur_epsilon = 1.0
-        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
-        self.eps = tf.placeholder(tf.float32, (), name="eps")
-
-    def add_parameter_noise(self):
-        """Run the parameter-noise op if ``parameter_noise`` is enabled."""
-        # NOTE(review): add_noise_op is not created by this mixin; it has to
-        # be provided by a policy that enables parameter noise.
-        if self.config["parameter_noise"]:
-            # Go through get_session(), the same accessor that
-            # TargetNetworkMixin.update_target uses on this policy, instead
-            # of reaching for a ``sess`` attribute directly.
-            self.get_session().run(self.add_noise_op)
-
-    def set_epsilon(self, epsilon):
-        """Set the epsilon used for subsequent action sampling."""
-        self.cur_epsilon = epsilon
-
-    @override(Policy)
-    def get_state(self):
-        """Return ``[TFPolicy state, current epsilon]``."""
-        return [TFPolicy.get_state(self), self.cur_epsilon]
-
-    @override(Policy)
-    def set_state(self, state):
-        """Restore a state produced by get_state()."""
-        TFPolicy.set_state(self, state[0])
-        self.set_epsilon(state[1])
-
-
-class TargetNetworkMixin(object):
-    """Mixin that builds and runs the op copying Q weights to the target."""
-
-    def __init__(self, obs_space, action_space, config):
-        # Pair every Q network variable with its target network counterpart
-        # and group the assignments into a single op that update_target()
-        # can run.
-        online_vars = self.q_func_vars
-        target_vars = self.target_q_func_vars
-        assert len(online_vars) == len(target_vars), \
-            (online_vars, target_vars)
-
-        assign_ops = []
-        for source_var, target_var in zip(online_vars, target_vars):
-            assign_ops.append(target_var.assign(source_var))
-            logger.debug("Update target op {}".format(target_var))
-        self.update_target_expr = tf.group(*assign_ops)
-
-    def update_target(self):
-        """Copy the Q network variables into the target Q network."""
-        session = self.get_session()
-        return session.run(self.update_target_expr)
-
-
-def build_q_models(policy, obs_space, action_space, config):
-    """Create the Q model and the target Q model for ``policy``.
-
-    Both models are built by ModelCatalog.get_model_v2 with SimpleQModel as
-    the model interface and are stored on the policy as ``q_model`` and
-    ``target_q_model``. When ``config["hiddens"]`` is non-empty the base
-    model is asked for 256 outputs and ``no_final_linear`` is switched on in
-    ``config["model"]``; otherwise it is asked for one output per action.
-
-    Returns:
-        The Q model (``policy.q_model``).
-
-    Raises:
-        UnsupportedSpaceException: if ``action_space`` is not Discrete.
-    """
-    if not isinstance(action_space, Discrete):
-        raise UnsupportedSpaceException(
-            "Action space {} is not supported for DQN.".format(action_space))
-
-    q_hiddens = config["hiddens"]
-    if q_hiddens:
-        num_outputs = 256
-        # Note: this writes into the caller's model config.
-        config["model"]["no_final_linear"] = True
-    else:
-        num_outputs = action_space.n
-
-    def _make_model(scope_name):
-        # The two networks differ only in their name.
-        return ModelCatalog.get_model_v2(
-            obs_space,
-            action_space,
-            num_outputs,
-            config["model"],
-            framework="tf",
-            name=scope_name,
-            model_interface=SimpleQModel,
-            q_hiddens=q_hiddens)
-
-    # Keep the original build order: Q model first, target model second.
-    policy.q_model = _make_model(Q_SCOPE)
-    policy.target_q_model = _make_model(Q_TARGET_SCOPE)
-
-    return policy.q_model
-
-
-def build_action_sampler(policy, q_model, input_dict, obs_space, action_space,
-                         config):
-    """Build the epsilon-greedy action sampling ops for ``policy``.
-
-    As side effects, stores the Q-value tensor on ``policy.q_values`` and the
-    Q model's variables on ``policy.q_func_vars``.
-
-    Returns:
-        Tuple ``(action, action_prob)``; ``action_prob`` is always None.
-    """
-
-    # Action Q network
-    q_values = _compute_q_values(policy, q_model,
-                                 input_dict[SampleBatch.CUR_OBS], obs_space,
-                                 action_space)
-    policy.q_values = q_values
-    policy.q_func_vars = q_model.variables()
-
-    # Action outputs
-    deterministic_actions = tf.argmax(q_values, axis=1)
-    batch_size = tf.shape(input_dict[SampleBatch.CUR_OBS])[0]
-
-    # Special case masked out actions (q_value ~= -inf) so that we don't
-    # even consider them for exploration.
-    # Actions whose Q value equals float32.min keep that value as their
-    # logit; all other actions get the same logit of one.
-    random_valid_action_logits = tf.where(
-        tf.equal(q_values, tf.float32.min),
-        tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
-    random_actions = tf.squeeze(
-        tf.multinomial(random_valid_action_logits, 1), axis=1)
-
-    # Per batch row, take the random action when a uniform draw in [0, 1)
-    # falls below policy.eps, otherwise the greedy one.
-    chose_random = tf.random_uniform(
-        tf.stack([batch_size]), minval=0, maxval=1,
-        dtype=tf.float32) < policy.eps
-    stochastic_actions = tf.where(chose_random, random_actions,
-                                  deterministic_actions)
-    # policy.stochastic switches between epsilon-greedy and purely greedy.
-    action = tf.cond(policy.stochastic, lambda: stochastic_actions,
-                     lambda: deterministic_actions)
-    action_prob = None
-
-    return action, action_prob
-
-
-def build_q_losses(policy, batch_tensors):
-    """Build the one-step Q-learning loss for ``policy``.
-
-    The TD target is ``reward + gamma * (1 - done) * max_a Q_target(s', a)``
-    and the loss is the mean Huber loss of the TD error. As side effects,
-    stores the target model's variables on ``policy.target_q_func_vars`` and
-    the TD error tensor on ``policy.td_error``.
-
-    Returns:
-        The scalar loss tensor.
-    """
-    # q network evaluation
-    q_t = _compute_q_values(policy, policy.q_model,
-                            batch_tensors[SampleBatch.CUR_OBS],
-                            policy.observation_space, policy.action_space)
-
-    # target q network evaluation
-    q_tp1 = _compute_q_values(policy, policy.target_q_model,
-                              batch_tensors[SampleBatch.NEXT_OBS],
-                              policy.observation_space, policy.action_space)
-    policy.target_q_func_vars = policy.target_q_model.variables()
-
-    # q scores for actions which we know were selected in the given state.
-    one_hot_selection = tf.one_hot(
-        tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32),
-        policy.action_space.n)
-    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
-
-    # compute estimate of best possible value starting from state at t + 1
-    dones = tf.cast(batch_tensors[SampleBatch.DONES], tf.float32)
-    q_tp1_best_one_hot_selection = tf.one_hot(
-        tf.argmax(q_tp1, 1), policy.action_space.n)
-    q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
-    # Terminal transitions contribute no bootstrapped value.
-    q_tp1_best_masked = (1.0 - dones) * q_tp1_best
-
-    # compute RHS of bellman equation
-    q_t_selected_target = (batch_tensors[SampleBatch.REWARDS] +
-                           policy.config["gamma"] * q_tp1_best_masked)
-
-    # compute the error (potentially clipped)
-    # stop_gradient keeps the target out of the gradient computation.
-    td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
-    loss = tf.reduce_mean(huber_loss(td_error))
-
-    # save TD error as an attribute for outside access
-    policy.td_error = td_error
-
-    return loss
-
-
-def _compute_q_values(policy, model, obs, obs_space, action_space):
-    """Run ``model`` on ``obs`` and return its Q-value output.
-
-    ``obs_space`` and ``action_space`` are accepted but not used here.
-    """
-    is_training = policy._get_is_training_placeholder()
-    features, _ = model({"obs": obs, "is_training": is_training}, [], None)
-    return model.get_q_values(features)
-
-
-def exploration_setting_inputs(policy):
-    """Return the feed-dict entries for the exploration placeholders.
-
-    ``stochastic`` is always fed True; ``eps`` is fed the policy's current
-    epsilon.
-    """
-    feed = {}
-    feed[policy.stochastic] = True
-    feed[policy.eps] = policy.cur_epsilon
-    return feed
-
-
-def setup_early_mixins(policy, obs_space, action_space, config):
-    """Initialize ExplorationStateMixin on ``policy``.
-
-    Registered as the ``before_init`` hook of SimpleQPolicy.
-    """
-    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
-
-
-def setup_late_mixins(policy, obs_space, action_space, config):
-    """Initialize TargetNetworkMixin on ``policy``.
-
-    Registered as the ``after_init`` hook of SimpleQPolicy. The mixin's
-    constructor reads ``q_func_vars`` and ``target_q_func_vars``, which are
-    set by build_action_sampler and build_q_losses respectively.
-    """
-    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
-
-
-# Simple Q-learning policy class assembled by build_tf_policy from the model,
-# sampler and loss builders defined in this file. It shares DQN's default
-# config, which is resolved lazily (inside the lambda).
-SimpleQPolicy = build_tf_policy(
-    name="SimpleQPolicy",
-    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
-    make_model=build_q_models,
-    action_sampler_fn=build_action_sampler,
-    loss_fn=build_q_losses,
-    extra_action_feed_fn=exploration_setting_inputs,
-    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
-    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
-    before_init=setup_early_mixins,
-    after_init=setup_late_mixins,
-    obs_include_prev_action_reward=False,
-    mixins=[
-        ExplorationStateMixin,
-        TargetNetworkMixin,
-    ])
@@ -1,6 +0,0 @@
-from ray.rllib.agents.es.es import (ESTrainer, DEFAULT_CONFIG)
-from ray.rllib.utils import renamed_agent
-
-# NOTE(review): presumably a backward-compatible alias kept for the old
-# "Agent" name of ESTrainer - confirm against renamed_agent.
-ESAgent = renamed_agent(ESTrainer)
-
-__all__ = ["ESAgent", "ESTrainer", "DEFAULT_CONFIG"]
@@ -1,337 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import namedtuple
-import logging
-import numpy as np
-import time
-
-import ray
-from ray.rllib.agents import Trainer, with_common_config
-
-from ray.rllib.agents.es import optimizers
-from ray.rllib.agents.es import policies
-from ray.rllib.agents.es import utils
-from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils.memory import ray_get_and_free
-from ray.rllib.utils import FilterManager
-
-logger = logging.getLogger(__name__)
-
-# Output of one Worker.do_rollouts call. The noisy_* fields hold one entry
-# per sampled noise index, each a [positive, negative] pair for the two
-# mirrored perturbations; the eval_* fields hold one entry per unperturbed
-# evaluation rollout.
-Result = namedtuple("Result", [
-    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
-    "eval_returns", "eval_lengths"
-])
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # Coefficient of the L2 term (l2_coeff * theta) added to the update.
-    "l2_coeff": 0.005,
-    # Scale applied to the shared noise when perturbing the weights.
-    "noise_stdev": 0.02,
-    # Minimum number of episodes to collect per training iteration.
-    "episodes_per_batch": 1000,
-    # Minimum number of timesteps to collect per training iteration.
-    "train_batch_size": 10000,
-    # Probability that a worker rollout is an unperturbed evaluation run.
-    "eval_prob": 0.003,
-    # How returns are processed; only "centered_rank" is implemented.
-    "return_proc_mode": "centered_rank",
-    # Number of rollout Worker actors to create.
-    "num_workers": 10,
-    # Step size handed to the Adam optimizer.
-    "stepsize": 0.01,
-    # Observation filter used by the policy.
-    "observation_filter": "MeanStdFilter",
-    # Number of float32 entries in the shared noise table.
-    "noise_size": 250000000,
-    # Number of most recent evaluation reward means averaged into the
-    # reported episode_reward_mean.
-    "report_length": 10,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-@ray.remote
-def create_shared_noise(count):
-    """Create a large array of noise to be shared by all workers.
-
-    Returns ``count`` float32 samples drawn from a standard normal
-    generator with a fixed seed, so every call yields the same table.
-    """
-    rng = np.random.RandomState(123)
-    return rng.randn(count).astype(np.float32)
-
-
-class SharedNoiseTable(object):
-    """Thin wrapper around a flat float32 noise array."""
-
-    def __init__(self, noise):
-        self.noise = noise
-        assert noise.dtype == np.float32
-
-    def get(self, i, dim):
-        """Return the ``dim`` noise values starting at index ``i``."""
-        start, stop = i, i + dim
-        return self.noise[start:stop]
-
-    def sample_index(self, dim):
-        """Draw a random start index that leaves room for ``dim`` values."""
-        num_valid_starts = len(self.noise) - dim + 1
-        return np.random.randint(0, num_valid_starts)
-
-
-@ray.remote
-class Worker(object):
-    """Ray actor that evaluates perturbed copies of the policy in its own env.
-
-    Each worker owns an environment, a single-threaded TF session, a
-    GenericPolicy and a view of the shared noise table.
-    """
-
-    def __init__(self,
-                 config,
-                 policy_params,
-                 env_creator,
-                 noise,
-                 min_task_runtime=0.2):
-        """Set up the env, preprocessor, session and policy.
-
-        Args:
-            config: trainer config dict.
-            policy_params: extra keyword arguments for GenericPolicy.
-            env_creator: callable taking ``config["env_config"]`` and
-                returning the environment.
-            noise: the shared noise array (wrapped in SharedNoiseTable).
-            min_task_runtime: minimum wall-clock seconds that one
-                do_rollouts call keeps sampling.
-        """
-        self.min_task_runtime = min_task_runtime
-        self.config = config
-        self.policy_params = policy_params
-        self.noise = SharedNoiseTable(noise)
-
-        self.env = env_creator(config["env_config"])
-        from ray.rllib import models
-        self.preprocessor = models.ModelCatalog.get_preprocessor(
-            self.env, config["model"])
-
-        self.sess = utils.make_session(single_threaded=True)
-        self.policy = policies.GenericPolicy(
-            self.sess, self.env.action_space, self.env.observation_space,
-            self.preprocessor, config["observation_filter"], config["model"],
-            **policy_params)
-
-    @property
-    def filters(self):
-        """Dict mapping the default policy id to the policy's filter.
-
-        A new dict is built on every access; the filter object inside it is
-        the one held by the policy.
-        """
-        return {DEFAULT_POLICY_ID: self.policy.get_filter()}
-
-    def sync_filters(self, new_filters):
-        """Sync each local filter with the matching entry of new_filters."""
-        for k in self.filters:
-            self.filters[k].sync(new_filters[k])
-
-    def get_filters(self, flush_after=False):
-        """Return serializable copies of the filters.
-
-        If ``flush_after`` is true, each filter's buffer is cleared after it
-        has been copied.
-        """
-        return_filters = {}
-        for k, f in self.filters.items():
-            return_filters[k] = f.as_serializable()
-            if flush_after:
-                f.clear_buffer()
-        return return_filters
-
-    def rollout(self, timestep_limit, add_noise=True):
-        """Run one episode and return ``(rewards, length)``."""
-        rollout_rewards, rollout_length = policies.rollout(
-            self.policy,
-            self.env,
-            timestep_limit=timestep_limit,
-            add_noise=add_noise)
-        return rollout_rewards, rollout_length
-
-    def do_rollouts(self, params, timestep_limit=None):
-        """Sample rollouts around ``params`` and return them as a Result.
-
-        Keeps sampling until at least one perturbed pair has been collected
-        and ``min_task_runtime`` seconds have elapsed. Each iteration is
-        either an unperturbed evaluation rollout (with probability
-        ``eval_prob``) or a pair of rollouts with weights
-        ``params + perturbation`` and ``params - perturbation``.
-        """
-        # Set the network weights.
-        self.policy.set_weights(params)
-
-        noise_indices, returns, sign_returns, lengths = [], [], [], []
-        eval_returns, eval_lengths = [], []
-
-        # Perform some rollouts with noise.
-        task_tstart = time.time()
-        while (len(noise_indices) == 0
-               or time.time() - task_tstart < self.min_task_runtime):
-
-            if np.random.uniform() < self.config["eval_prob"]:
-                # Do an evaluation run with no perturbation.
-                self.policy.set_weights(params)
-                rewards, length = self.rollout(timestep_limit, add_noise=False)
-                eval_returns.append(rewards.sum())
-                eval_lengths.append(length)
-            else:
-                # Do a regular run with parameter perturbations.
-                noise_index = self.noise.sample_index(self.policy.num_params)
-
-                perturbation = self.config["noise_stdev"] * self.noise.get(
-                    noise_index, self.policy.num_params)
-
-                # These two sampling steps could be done in parallel on
-                # different actors letting us update twice as frequently.
-                self.policy.set_weights(params + perturbation)
-                rewards_pos, lengths_pos = self.rollout(timestep_limit)
-
-                self.policy.set_weights(params - perturbation)
-                rewards_neg, lengths_neg = self.rollout(timestep_limit)
-
-                # Only the noise index is recorded; the perturbation itself
-                # can be re-read from the shared noise table.
-                noise_indices.append(noise_index)
-                returns.append([rewards_pos.sum(), rewards_neg.sum()])
-                sign_returns.append(
-                    [np.sign(rewards_pos).sum(),
-                     np.sign(rewards_neg).sum()])
-                lengths.append([lengths_pos, lengths_neg])
-
-        return Result(
-            noise_indices=noise_indices,
-            noisy_returns=returns,
-            sign_noisy_returns=sign_returns,
-            noisy_lengths=lengths,
-            eval_returns=eval_returns,
-            eval_lengths=eval_lengths)
-
-
-class ESTrainer(Trainer):
-    """Large-scale implementation of Evolution Strategies in Ray."""
-
-    _name = "ES"
-    _default_config = DEFAULT_CONFIG
-
-    @override(Trainer)
-    def _init(self, config, env_creator):
-        """Build the local policy, optimizer, noise table and worker actors."""
-        policy_params = {"action_noise_std": 0.01}
-
-        env = env_creator(config["env_config"])
-        from ray.rllib import models
-        # Pass the model config exactly like Worker.__init__ does, so that
-        # the local policy and the worker policies are built with the same
-        # preprocessor (their flat weight vectors must have equal length).
-        preprocessor = models.ModelCatalog.get_preprocessor(
-            env, config["model"])
-
-        self.sess = utils.make_session(single_threaded=False)
-        self.policy = policies.GenericPolicy(
-            self.sess, env.action_space, env.observation_space, preprocessor,
-            config["observation_filter"], config["model"], **policy_params)
-        self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
-        self.report_length = config["report_length"]
-
-        # Create the shared noise table.
-        logger.info("Creating shared noise table.")
-        noise_id = create_shared_noise.remote(config["noise_size"])
-        self.noise = SharedNoiseTable(ray.get(noise_id))
-
-        # Create the actors.
-        # The workers receive the object ID of the noise table rather than
-        # the array that was fetched above.
-        logger.info("Creating actors.")
-        self._workers = [
-            Worker.remote(config, policy_params, env_creator, noise_id)
-            for _ in range(config["num_workers"])
-        ]
-
-        self.episodes_so_far = 0
-        self.reward_list = []
-        self.tstart = time.time()
-
-    @override(Trainer)
-    def _train(self):
-        """Run one ES iteration and return the result dict.
-
-        Collects rollouts from the workers, turns the mirrored returns into
-        a gradient estimate, applies one optimizer step to the local policy
-        and synchronizes the observation filters with the workers.
-        """
-        config = self.config
-
-        theta = self.policy.get_weights()
-        assert theta.dtype == np.float32
-
-        # Put the current policy weights in the object store.
-        theta_id = ray.put(theta)
-        # Use the actors to do rollouts, note that we pass in the ID of the
-        # policy weights.
-        results, num_episodes, num_timesteps = self._collect_results(
-            theta_id, config["episodes_per_batch"], config["train_batch_size"])
-
-        all_noise_indices = []
-        all_training_returns = []
-        all_training_lengths = []
-        all_eval_returns = []
-        all_eval_lengths = []
-
-        # Loop over the results.
-        for result in results:
-            all_eval_returns += result.eval_returns
-            all_eval_lengths += result.eval_lengths
-
-            all_noise_indices += result.noise_indices
-            all_training_returns += result.noisy_returns
-            all_training_lengths += result.noisy_lengths
-
-        assert len(all_eval_returns) == len(all_eval_lengths)
-        assert (len(all_noise_indices) == len(all_training_returns) ==
-                len(all_training_lengths))
-
-        self.episodes_so_far += num_episodes
-
-        # Assemble the results.
-        eval_returns = np.array(all_eval_returns)
-        eval_lengths = np.array(all_eval_lengths)
-        noise_indices = np.array(all_noise_indices)
-        noisy_returns = np.array(all_training_returns)
-        noisy_lengths = np.array(all_training_lengths)
-
-        # Process the returns.
-        if config["return_proc_mode"] == "centered_rank":
-            proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
-        else:
-            raise NotImplementedError(config["return_proc_mode"])
-
-        # Compute and take a step.
-        # Each noise vector is weighted by the difference between the
-        # processed returns of its positive and negative perturbation.
-        g, count = utils.batched_weighted_sum(
-            proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
-            (self.noise.get(index, self.policy.num_params)
-             for index in noise_indices),
-            batch_size=500)
-        g /= noisy_returns.size
-        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
-                and count == len(noise_indices))
-        # Compute the new weights theta.
-        theta, update_ratio = self.optimizer.update(-g +
-                                                    config["l2_coeff"] * theta)
-        # Set the new weights in the local copy of the policy.
-        self.policy.set_weights(theta)
-        # Store the rewards
-        if len(all_eval_returns) > 0:
-            self.reward_list.append(np.mean(eval_returns))
-
-        # Now sync the filters
-        FilterManager.synchronize({
-            DEFAULT_POLICY_ID: self.policy.get_filter()
-        }, self._workers)
-
-        info = {
-            "weights_norm": np.square(theta).sum(),
-            "grad_norm": np.square(g).sum(),
-            "update_ratio": update_ratio,
-            "episodes_this_iter": noisy_lengths.size,
-            "episodes_so_far": self.episodes_so_far,
-        }
-
-        # Reported reward: mean of the last report_length evaluation means.
-        reward_mean = np.mean(self.reward_list[-self.report_length:])
-        result = dict(
-            episode_reward_mean=reward_mean,
-            episode_len_mean=eval_lengths.mean(),
-            timesteps_this_iter=noisy_lengths.sum(),
-            info=info)
-
-        return result
-
-    @override(Trainer)
-    def compute_action(self, observation):
-        """Return the policy's action for one observation.
-
-        The observation filter is applied without being updated.
-        """
-        return self.policy.compute(observation, update=False)[0]
-
-    @override(Trainer)
-    def _stop(self):
-        """Terminate all worker actors."""
-        # workaround for https://github.com/ray-project/ray/issues/1516
-        for w in self._workers:
-            w.__ray_terminate__.remote()
-
-    def _collect_results(self, theta_id, min_episodes, min_timesteps):
-        """Request rollouts until both minimums are reached.
-
-        Every round asks each worker for one do_rollouts task.
-
-        Returns:
-            Tuple ``(results, num_episodes, num_timesteps)`` where the
-            counts cover the perturbed rollouts only.
-        """
-        num_episodes, num_timesteps = 0, 0
-        results = []
-        while num_episodes < min_episodes or num_timesteps < min_timesteps:
-            logger.info(
-                "Collected {} episodes {} timesteps so far this iter".format(
-                    num_episodes, num_timesteps))
-            rollout_ids = [
-                worker.do_rollouts.remote(theta_id) for worker in self._workers
-            ]
-            # Get the results of the rollouts.
-            for result in ray_get_and_free(rollout_ids):
-                results.append(result)
-                # Update the number of episodes and the number of timesteps
-                # keeping in mind that result.noisy_lengths is a list of lists,
-                # where the inner lists have length 2.
-                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
-                num_timesteps += sum(
-                    sum(pair) for pair in result.noisy_lengths)
-
-        return results, num_episodes, num_timesteps
-
-    def __getstate__(self):
-        """Return the weights, filter and episode count for checkpointing."""
-        return {
-            "weights": self.policy.get_weights(),
-            "filter": self.policy.get_filter(),
-            "episodes_so_far": self.episodes_so_far,
-        }
-
-    def __setstate__(self, state):
-        """Restore a state from __getstate__ and re-sync worker filters."""
-        self.episodes_so_far = state["episodes_so_far"]
-        self.policy.set_weights(state["weights"])
-        self.policy.set_filter(state["filter"])
-        FilterManager.synchronize({
-            DEFAULT_POLICY_ID: self.policy.get_filter()
-        }, self._workers)
@@ -1,56 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-
-class Optimizer(object):
-    """Base class turning a gradient estimate into new flat weights.
-
-    Subclasses implement _compute_step().
-    """
-
-    def __init__(self, pi):
-        self.pi = pi
-        self.dim = pi.num_params
-        self.t = 0
-
-    def update(self, globalg):
-        """Advance one step.
-
-        Returns:
-            Tuple ``(new_weights, ratio)`` where ``ratio`` is the norm of
-            the step divided by the norm of the current weights.
-        """
-        self.t += 1
-        delta = self._compute_step(globalg)
-        weights = self.pi.get_weights()
-        step_norm = np.linalg.norm(delta)
-        weight_norm = np.linalg.norm(weights)
-        return weights + delta, step_norm / weight_norm
-
-    def _compute_step(self, globalg):
-        """Return the weight delta for gradient ``globalg`` (abstract)."""
-        raise NotImplementedError
-
-
-class SGD(Optimizer):
-    """Gradient descent with an exponential moving average of gradients."""
-
-    def __init__(self, pi, stepsize, momentum=0.9):
-        Optimizer.__init__(self, pi)
-        self.stepsize = stepsize
-        self.momentum = momentum
-        self.v = np.zeros(self.dim, dtype=np.float32)
-
-    def _compute_step(self, globalg):
-        # Blend the previous average with the new gradient, then step
-        # against it.
-        averaged = self.momentum * self.v + (1. - self.momentum) * globalg
-        self.v = averaged
-        return -self.stepsize * averaged
-
-
-class Adam(Optimizer):
-    """Adam update rule over the flat weight vector."""
-
-    def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
-        Optimizer.__init__(self, pi)
-        self.stepsize = stepsize
-        self.beta1, self.beta2 = beta1, beta2
-        self.epsilon = epsilon
-        # First and second moment estimates.
-        self.m = np.zeros(self.dim, dtype=np.float32)
-        self.v = np.zeros(self.dim, dtype=np.float32)
-
-    def _compute_step(self, globalg):
-        b1, b2 = self.beta1, self.beta2
-        # Step size with the bias correction for the current step count t.
-        bias_correction = np.sqrt(1 - b2**self.t) / (1 - b1**self.t)
-        a = self.stepsize * bias_correction
-        self.m = b1 * self.m + (1 - b1) * globalg
-        self.v = b2 * self.v + (1 - b2) * (globalg * globalg)
-        return -a * self.m / (np.sqrt(self.v) + self.epsilon)
@@ -1,93 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gym
-import numpy as np
-
-import ray
-import ray.experimental.tf_utils
-from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
-from ray.rllib.models import ModelCatalog
-from ray.rllib.utils.filter import get_filter
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-def rollout(policy, env, timestep_limit=None, add_noise=False):
-    """Do a rollout.
-
-    Args:
-        policy: object whose ``compute(observation, add_noise=...)`` returns
-            a sequence with the action to take as its first element.
-        env: environment exposing ``reset()`` and ``step(action)``.
-        timestep_limit: optional cap on the number of steps. When the
-            environment's spec declares ``max_episode_steps``, the smaller
-            of the two limits is used.
-        add_noise: forwarded to ``policy.compute``; if True the policy is
-            asked to take noisy actions, otherwise no action noise is added.
-
-    Returns:
-        Tuple ``(rewards, length)``: a float32 array with the reward of
-        every step and the number of steps taken.
-    """
-    # The env may have no spec, or a spec without a step limit. Treat both
-    # as "no environment-imposed limit" instead of failing on the attribute
-    # access or on min(int, None).
-    env_spec = getattr(env, "spec", None)
-    env_timestep_limit = getattr(env_spec, "max_episode_steps", None)
-    if timestep_limit is None:
-        timestep_limit = env_timestep_limit
-    elif env_timestep_limit is not None:
-        timestep_limit = min(timestep_limit, env_timestep_limit)
-
-    rews = []
-    observation = env.reset()
-    # A missing (or zero) limit falls back to a large fixed step budget.
-    for _ in range(timestep_limit or 999999):
-        ac = policy.compute(observation, add_noise=add_noise)[0]
-        observation, rew, done, _ = env.step(ac)
-        rews.append(rew)
-        if done:
-            break
-    return np.array(rews, dtype=np.float32), len(rews)
-
-
-class GenericPolicy(object):
-    """TF policy exposing its weights as one flat vector.
-
-    Observations are preprocessed and filtered before being fed to the
-    model; the action is sampled from a "deterministic" action distribution
-    built on the model outputs.
-    """
-
-    def __init__(self, sess, action_space, obs_space, preprocessor,
-                 observation_filter, model_options, action_noise_std):
-        """Build the policy network in the current default graph.
-
-        Args:
-            sess: TF session used for all computations.
-            action_space: action space of the environment.
-            obs_space: observation space of the environment.
-            preprocessor: object providing ``shape`` and ``transform()``.
-            observation_filter: filter specification passed to get_filter.
-            model_options: model config passed to the ModelCatalog.
-            action_noise_std: scale of the Gaussian noise that compute()
-                can add to actions.
-        """
-        self.sess = sess
-        self.action_space = action_space
-        self.action_noise_std = action_noise_std
-        self.preprocessor = preprocessor
-        self.observation_filter = get_filter(observation_filter,
-                                             self.preprocessor.shape)
-        # Batch of preprocessed observations.
-        self.inputs = tf.placeholder(tf.float32,
-                                     [None] + list(self.preprocessor.shape))
-
-        # Policy network.
-        dist_class, dist_dim = ModelCatalog.get_action_dist(
-            self.action_space, model_options, dist_type="deterministic")
-        model = ModelCatalog.get_model({
-            "obs": self.inputs
-        }, obs_space, action_space, dist_dim, model_options)
-        dist = dist_class(model.outputs)
-        self.sampler = dist.sample()
-
-        # Helper used by set_weights()/get_weights() to write and read the
-        # variables as a single flat array.
-        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
-            model.outputs, self.sess)
-
-        # Total number of scalar parameters across all tracked variables.
-        self.num_params = sum(
-            np.prod(variable.shape.as_list())
-            for _, variable in self.variables.variables.items())
-        self.sess.run(tf.global_variables_initializer())
-
-    def compute(self, observation, add_noise=False, update=True):
-        """Compute the action for a single observation.
-
-        Args:
-            observation: one raw environment observation.
-            add_noise: if True and the action space is a Box, Gaussian noise
-                scaled by ``action_noise_std`` is added to the action.
-            update: forwarded to the observation filter.
-
-        Returns:
-            The output of _unbatch_tuple_actions for the sampled batch, with
-            noise added where applicable.
-        """
-        observation = self.preprocessor.transform(observation)
-        # observation[None] adds a leading batch dimension of size one.
-        observation = self.observation_filter(observation[None], update=update)
-        action = self.sess.run(
-            self.sampler, feed_dict={self.inputs: observation})
-        action = _unbatch_tuple_actions(action)
-        if add_noise and isinstance(self.action_space, gym.spaces.Box):
-            action += np.random.randn(*action.shape) * self.action_noise_std
-        return action
-
-    def set_weights(self, x):
-        """Load the flat weight vector ``x`` into the network variables."""
-        self.variables.set_flat(x)
-
-    def get_weights(self):
-        """Return the network variables as one flat vector."""
-        return self.variables.get_flat()
-
-    def get_filter(self):
-        """Return the observation filter object."""
-        return self.observation_filter
-
-    def set_filter(self, observation_filter):
-        """Replace the observation filter object."""
-        self.observation_filter = observation_filter
@@ -1,63 +0,0 @@
-# Code in this file is copied and adapted from
-# https://github.com/openai/evolution-strategies-starter.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-def compute_ranks(x):
-    """Returns ranks in [0, len(x))
-
-    Note: This is different from scipy.stats.rankdata, which returns ranks in
-    [1, len(x)].
-    """
-    assert x.ndim == 1
-    count = len(x)
-    sort_order = x.argsort()
-    ranks = np.empty(count, dtype=int)
-    # The element at sorted position k receives rank k.
-    ranks[sort_order] = np.arange(count)
-    return ranks
-
-
-def compute_centered_ranks(x):
-    """Return the ranks of ``x`` rescaled to the interval [-0.5, 0.5].
-
-    Ranks are computed over all elements of ``x`` and the result has the
-    shape of ``x`` and dtype float32. An input with fewer than two elements
-    has no spread to rescale, so its ranks are returned as zeros instead of
-    dividing by ``x.size - 1 == 0``.
-    """
-    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
-    if x.size > 1:
-        y /= (x.size - 1)
-        y -= 0.5
-    return y
-
-
-def make_session(single_threaded):
-    """Create a TF session, optionally restricted to a single thread.
-
-    With ``single_threaded`` set, both the inter-op and the intra-op
-    parallelism are limited to one thread.
-    """
-    if single_threaded:
-        one_thread = tf.ConfigProto(
-            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
-        return tf.Session(config=one_thread)
-    return tf.Session()
-
-
-def itergroups(items, group_size):
-    """Yield consecutive tuples of up to ``group_size`` items.
-
-    Every tuple but possibly the last has exactly ``group_size`` elements;
-    nothing is yielded for an empty input.
-    """
-    assert group_size >= 1
-    pending = []
-    for item in items:
-        pending.append(item)
-        if len(pending) < group_size:
-            continue
-        yield tuple(pending)
-        pending = []
-    # Emit the shorter trailing group, if any.
-    if pending:
-        yield tuple(pending)
-
-
-def batched_weighted_sum(weights, vecs, batch_size):
-    """Compute ``sum_i weights[i] * vecs[i]`` in chunks of ``batch_size``.
-
-    Returns:
-        Tuple ``(total, num_items_summed)``. ``total`` stays the integer 0
-        when there is nothing to sum.
-    """
-    running_total = 0
-    items_seen = 0
-    weight_chunks = itergroups(weights, batch_size)
-    vec_chunks = itergroups(vecs, batch_size)
-    for weight_chunk, vec_chunk in zip(weight_chunks, vec_chunks):
-        assert len(weight_chunk) == len(vec_chunk) <= batch_size
-        weight_arr = np.asarray(weight_chunk, dtype=np.float32)
-        vec_arr = np.asarray(vec_chunk, dtype=np.float32)
-        running_total += np.dot(weight_arr, vec_arr)
-        items_seen += len(weight_chunk)
-    return running_total, items_seen
@@ -1,6 +0,0 @@
-from ray.rllib.agents.impala.impala import ImpalaTrainer, DEFAULT_CONFIG
-from ray.rllib.utils import renamed_agent
-
-# NOTE(review): presumably a backward-compatible alias kept for the old
-# "Agent" name of ImpalaTrainer - confirm against renamed_agent.
-ImpalaAgent = renamed_agent(ImpalaTrainer)
-
-__all__ = ["ImpalaAgent", "ImpalaTrainer", "DEFAULT_CONFIG"]
@@ -1,164 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
-from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy
-from ray.rllib.agents.trainer import Trainer, with_common_config
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.optimizers import AsyncSamplesOptimizer
-from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator
-from ray.rllib.utils.annotations import override
-from ray.tune.trainable import Trainable
-from ray.tune.resources import Resources
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # V-trace params (see vtrace.py).
-    "vtrace": True,
-    "vtrace_clip_rho_threshold": 1.0,
-    "vtrace_clip_pg_rho_threshold": 1.0,
-
-    # System params.
-    #
-    # == Overview of data flow in IMPALA ==
-    # 1. Policy evaluation in parallel across `num_workers` actors produces
-    #    batches of size `sample_batch_size * num_envs_per_worker`.
-    # 2. If enabled, the replay buffer stores and produces batches of size
-    #    `sample_batch_size * num_envs_per_worker`.
-    # 3. If enabled, the minibatch ring buffer stores and replays batches of
-    #    size `train_batch_size` up to `num_sgd_iter` times per batch.
-    # 4. The learner thread executes data parallel SGD across `num_gpus` GPUs
-    #    on batches of size `train_batch_size`.
-    #
-    "sample_batch_size": 50,
-    "train_batch_size": 500,
-    "min_iter_time_s": 10,
-    "num_workers": 2,
-    # number of GPUs the learner should use.
-    "num_gpus": 1,
-    # set >1 to load data into GPUs in parallel. Increases GPU memory usage
-    # proportionally with the number of buffers.
-    "num_data_loader_buffers": 1,
-    # how many train batches should be retained for minibatching. This conf
-    # only has an effect if `num_sgd_iter > 1`.
-    "minibatch_buffer_size": 1,
-    # number of passes to make over each train batch
-    "num_sgd_iter": 1,
-    # set >0 to enable experience replay. Saved samples will be replayed with
-    # a p:1 proportion to new data samples.
-    "replay_proportion": 0.0,
-    # number of sample batches to store for replay. The number of transitions
-    # saved total will be (replay_buffer_num_slots * sample_batch_size).
-    "replay_buffer_num_slots": 0,
-    # max queue size for train batches feeding into the learner
-    "learner_queue_size": 16,
-    # wait for train batches to be available in minibatch buffer queue
-    # this many seconds. This may need to be increased e.g. when training
-    # with a slow environment
-    "learner_queue_timeout": 300,
-    # level of queuing for sampling.
-    "max_sample_requests_in_flight_per_worker": 2,
-    # max number of workers to broadcast one set of weights to
-    "broadcast_interval": 1,
-    # use intermediate actors for multi-level aggregation. This can make sense
-    # if ingesting >2GB/s of samples, or if the data requires decompression.
-    "num_aggregation_workers": 0,
-
-    # Learning params.
-    "grad_clip": 40.0,
-    # either "adam" or "rmsprop"
-    "opt_type": "adam",
-    "lr": 0.0005,
-    "lr_schedule": None,
-    # rmsprop considered
-    "decay": 0.99,
-    "momentum": 0.0,
-    "epsilon": 0.1,
-    # balancing the three losses
-    "vf_loss_coeff": 0.5,
-    "entropy_coeff": 0.01,
-    "entropy_coeff_schedule": None,
-
-    # use fake (infinite speed) sampler for testing
-    "_fake_sampler": False,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def choose_policy(config):
-    if config["vtrace"]:
-        return VTraceTFPolicy
-    else:
-        return A3CTFPolicy
-
-
-def validate_config(config):
-    if config["entropy_coeff"] < 0:
-        raise DeprecationWarning("entropy_coeff must be >= 0")
-
-
-def defer_make_workers(trainer, env_creator, policy, config):
-    # Defer worker creation to after the optimizer has been created.
-    return trainer._make_workers(env_creator, policy, config, 0)
-
-
-def make_aggregators_and_optimizer(workers, config):
-    if config["num_aggregation_workers"] > 0:
-        # Create co-located aggregator actors first for placement pref
-        aggregators = TreeAggregator.precreate_aggregators(
-            config["num_aggregation_workers"])
-    else:
-        aggregators = None
-    workers.add_workers(config["num_workers"])
-
-    optimizer = AsyncSamplesOptimizer(
-        workers,
-        lr=config["lr"],
-        num_gpus=config["num_gpus"],
-        sample_batch_size=config["sample_batch_size"],
-        train_batch_size=config["train_batch_size"],
-        replay_buffer_num_slots=config["replay_buffer_num_slots"],
-        replay_proportion=config["replay_proportion"],
-        num_data_loader_buffers=config["num_data_loader_buffers"],
-        max_sample_requests_in_flight_per_worker=config[
-            "max_sample_requests_in_flight_per_worker"],
-        broadcast_interval=config["broadcast_interval"],
-        num_sgd_iter=config["num_sgd_iter"],
-        minibatch_buffer_size=config["minibatch_buffer_size"],
-        num_aggregation_workers=config["num_aggregation_workers"],
-        learner_queue_size=config["learner_queue_size"],
-        learner_queue_timeout=config["learner_queue_timeout"],
-        **config["optimizer"])
-
-    if aggregators:
-        # Assign the pre-created aggregators to the optimizer
-        optimizer.aggregator.init(aggregators)
-    return optimizer
-
-
-class OverrideDefaultResourceRequest(object):
-    @classmethod
-    @override(Trainable)
-    def default_resource_request(cls, config):
-        cf = dict(cls._default_config, **config)
-        Trainer._validate_config(cf)
-        return Resources(
-            cpu=cf["num_cpus_for_driver"],
-            gpu=cf["num_gpus"],
-            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
-            cf["num_aggregation_workers"],
-            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
-
-
-ImpalaTrainer = build_trainer(
-    name="IMPALA",
-    default_config=DEFAULT_CONFIG,
-    default_policy=VTraceTFPolicy,
-    validate_config=validate_config,
-    get_policy_class=choose_policy,
-    make_workers=defer_make_workers,
-    make_policy_optimizer=make_aggregators_and_optimizer,
-    mixins=[OverrideDefaultResourceRequest])
@@ -1,409 +0,0 @@
-# Copyright 2018 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Functions to compute V-trace off-policy actor critic targets.
-
-For details and theory see:
-
-"IMPALA: Scalable Distributed Deep-RL with
-Importance Weighted Actor-Learner Architectures"
-by Espeholt, Soyer, Munos et al.
-
-See https://arxiv.org/abs/1802.01561 for the full paper.
-
-In addition to the original paper's code, changes have been made
-to support MultiDiscrete action spaces. behaviour_policy_logits,
-target_policy_logits and actions parameters in the entry point
-multi_from_logits method accepts lists of tensors instead of just
-tensors.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from ray.rllib.models.tf.tf_action_dist import Categorical
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-VTraceFromLogitsReturns = collections.namedtuple("VTraceFromLogitsReturns", [
-    "vs", "pg_advantages", "log_rhos", "behaviour_action_log_probs",
-    "target_action_log_probs"
-])
-
-VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages")
-
-
-def log_probs_from_logits_and_actions(policy_logits,
-                                      actions,
-                                      dist_class=Categorical):
-    return multi_log_probs_from_logits_and_actions([policy_logits], [actions],
-                                                   dist_class)[0]
-
-
-def multi_log_probs_from_logits_and_actions(policy_logits, actions,
-                                            dist_class):
-    """Computes action log-probs from policy logits and actions.
-
-  In the notation used throughout documentation and comments, T refers to the
-  time dimension ranging from 0 to T-1. B refers to the batch size and
-  ACTION_SPACE refers to the list of numbers each representing a number of
-  actions.
-
-  Args:
-    policy_logits: A list with length of ACTION_SPACE of float32
-      tensors of shapes
-      [T, B, ACTION_SPACE[0]],
-      ...,
-      [T, B, ACTION_SPACE[-1]]
-      with un-normalized log-probabilities parameterizing a softmax policy.
-    actions: A list with length of ACTION_SPACE of
-      tensors of shapes
-      [T, B, ...],
-      ...,
-      [T, B, ...]
-      with actions.
-
-  Returns:
-    A list with length of ACTION_SPACE of float32
-      tensors of shapes
-      [T, B],
-      ...,
-      [T, B]
-      corresponding to the sampling log probability
-      of the chosen action w.r.t. the policy.
-  """
-
-    log_probs = []
-    for i in range(len(policy_logits)):
-        p_shape = tf.shape(policy_logits[i])
-        a_shape = tf.shape(actions[i])
-        policy_logits_flat = tf.reshape(policy_logits[i],
-                                        tf.concat([[-1], p_shape[2:]], axis=0))
-        actions_flat = tf.reshape(actions[i],
-                                  tf.concat([[-1], a_shape[2:]], axis=0))
-        log_probs.append(
-            tf.reshape(
-                dist_class(policy_logits_flat).logp(actions_flat),
-                a_shape[:2]))
-
-    return log_probs
-
-
-def from_logits(behaviour_policy_logits,
-                target_policy_logits,
-                actions,
-                discounts,
-                rewards,
-                values,
-                bootstrap_value,
-                dist_class=Categorical,
-                clip_rho_threshold=1.0,
-                clip_pg_rho_threshold=1.0,
-                name="vtrace_from_logits"):
-    """multi_from_logits wrapper used only for tests"""
-
-    res = multi_from_logits(
-        [behaviour_policy_logits], [target_policy_logits], [actions],
-        discounts,
-        rewards,
-        values,
-        bootstrap_value,
-        dist_class,
-        clip_rho_threshold=clip_rho_threshold,
-        clip_pg_rho_threshold=clip_pg_rho_threshold,
-        name=name)
-
-    return VTraceFromLogitsReturns(
-        vs=res.vs,
-        pg_advantages=res.pg_advantages,
-        log_rhos=res.log_rhos,
-        behaviour_action_log_probs=tf.squeeze(
-            res.behaviour_action_log_probs, axis=0),
-        target_action_log_probs=tf.squeeze(
-            res.target_action_log_probs, axis=0),
-    )
-
-
-def multi_from_logits(behaviour_policy_logits,
-                      target_policy_logits,
-                      actions,
-                      discounts,
-                      rewards,
-                      values,
-                      bootstrap_value,
-                      dist_class,
-                      clip_rho_threshold=1.0,
-                      clip_pg_rho_threshold=1.0,
-                      name="vtrace_from_logits"):
-    r"""V-trace for softmax policies.
-
-  Calculates V-trace actor critic targets for softmax polices as described in
-
-  "IMPALA: Scalable Distributed Deep-RL with
-  Importance Weighted Actor-Learner Architectures"
-  by Espeholt, Soyer, Munos et al.
-
-  Target policy refers to the policy we are interested in improving and
-  behaviour policy refers to the policy that generated the given
-  rewards and actions.
-
-  In the notation used throughout documentation and comments, T refers to the
-  time dimension ranging from 0 to T-1. B refers to the batch size and
-  ACTION_SPACE refers to the list of numbers each representing a number of
-  actions.
-
-  Args:
-    behaviour_policy_logits: A list with length of ACTION_SPACE of float32
-      tensors of shapes
-      [T, B, ACTION_SPACE[0]],
-      ...,
-      [T, B, ACTION_SPACE[-1]]
-      with un-normalized log-probabilities parameterizing the softmax behaviour
-      policy.
-    target_policy_logits: A list with length of ACTION_SPACE of float32
-      tensors of shapes
-      [T, B, ACTION_SPACE[0]],
-      ...,
-      [T, B, ACTION_SPACE[-1]]
-      with un-normalized log-probabilities parameterizing the softmax target
-      policy.
-    actions: A list with length of ACTION_SPACE of
-      tensors of shapes
-      [T, B, ...],
-      ...,
-      [T, B, ...]
-      with actions sampled from the behaviour policy.
-    discounts: A float32 tensor of shape [T, B] with the discount encountered
-      when following the behaviour policy.
-    rewards: A float32 tensor of shape [T, B] with the rewards generated by
-      following the behaviour policy.
-    values: A float32 tensor of shape [T, B] with the value function estimates
-      wrt. the target policy.
-    bootstrap_value: A float32 of shape [B] with the value function estimate at
-      time T.
-    dist_class: action distribution class for the logits.
-    clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
-      importance weights (rho) when calculating the baseline targets (vs).
-      rho^bar in the paper.
-    clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
-      on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
-    name: The name scope that all V-trace operations will be created in.
-
-  Returns:
-    A `VTraceFromLogitsReturns` namedtuple with the following fields:
-      vs: A float32 tensor of shape [T, B]. Can be used as target to train a
-          baseline (V(x_t) - vs_t)^2.
-      pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an
-        estimate of the advantage in the calculation of policy gradients.
-      log_rhos: A float32 tensor of shape [T, B] containing the log importance
-        sampling weights (log rhos).
-      behaviour_action_log_probs: A float32 tensor of shape [T, B] containing
-        behaviour policy action log probabilities (log \mu(a_t)).
-      target_action_log_probs: A float32 tensor of shape [T, B] containing
-        target policy action probabilities (log \pi(a_t)).
-  """
-
-    for i in range(len(behaviour_policy_logits)):
-        behaviour_policy_logits[i] = tf.convert_to_tensor(
-            behaviour_policy_logits[i], dtype=tf.float32)
-        target_policy_logits[i] = tf.convert_to_tensor(
-            target_policy_logits[i], dtype=tf.float32)
-
-        # Make sure tensor ranks are as expected.
-        # The rest will be checked by from_action_log_probs.
-        behaviour_policy_logits[i].shape.assert_has_rank(3)
-        target_policy_logits[i].shape.assert_has_rank(3)
-
-    with tf.name_scope(
-            name,
-            values=[
-                behaviour_policy_logits, target_policy_logits, actions,
-                discounts, rewards, values, bootstrap_value
-            ]):
-        target_action_log_probs = multi_log_probs_from_logits_and_actions(
-            target_policy_logits, actions, dist_class)
-        behaviour_action_log_probs = multi_log_probs_from_logits_and_actions(
-            behaviour_policy_logits, actions, dist_class)
-
-        log_rhos = get_log_rhos(target_action_log_probs,
-                                behaviour_action_log_probs)
-
-        vtrace_returns = from_importance_weights(
-            log_rhos=log_rhos,
-            discounts=discounts,
-            rewards=rewards,
-            values=values,
-            bootstrap_value=bootstrap_value,
-            clip_rho_threshold=clip_rho_threshold,
-            clip_pg_rho_threshold=clip_pg_rho_threshold)
-
-        return VTraceFromLogitsReturns(
-            log_rhos=log_rhos,
-            behaviour_action_log_probs=behaviour_action_log_probs,
-            target_action_log_probs=target_action_log_probs,
-            **vtrace_returns._asdict())
-
-
-def from_importance_weights(log_rhos,
-                            discounts,
-                            rewards,
-                            values,
-                            bootstrap_value,
-                            clip_rho_threshold=1.0,
-                            clip_pg_rho_threshold=1.0,
-                            name="vtrace_from_importance_weights"):
-    r"""V-trace from log importance weights.
-
-  Calculates V-trace actor critic targets as described in
-
-  "IMPALA: Scalable Distributed Deep-RL with
-  Importance Weighted Actor-Learner Architectures"
-  by Espeholt, Soyer, Munos et al.
-
-  In the notation used throughout documentation and comments, T refers to the
-  time dimension ranging from 0 to T-1. B refers to the batch size. This code
-  also supports the case where all tensors have the same number of additional
-  dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C],
-  `bootstrap_value` is [B, C].
-
-  Args:
-    log_rhos: A float32 tensor of shape [T, B] representing the
-      log importance sampling weights, i.e.
-      log(target_policy(a) / behaviour_policy(a)). V-trace performs operations
-      on rhos in log-space for numerical stability.
-    discounts: A float32 tensor of shape [T, B] with discounts encountered when
-      following the behaviour policy.
-    rewards: A float32 tensor of shape [T, B] containing rewards generated by
-      following the behaviour policy.
-    values: A float32 tensor of shape [T, B] with the value function estimates
-      wrt. the target policy.
-    bootstrap_value: A float32 of shape [B] with the value function estimate at
-      time T.
-    clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
-      importance weights (rho) when calculating the baseline targets (vs).
-      rho^bar in the paper. If None, no clipping is applied.
-    clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
-      on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). If
-      None, no clipping is applied.
-    name: The name scope that all V-trace operations will be created in.
-
-  Returns:
-    A VTraceReturns namedtuple (vs, pg_advantages) where:
-      vs: A float32 tensor of shape [T, B]. Can be used as target to
-        train a baseline (V(x_t) - vs_t)^2.
-      pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
-        advantage in the calculation of policy gradients.
-  """
-    log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
-    discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
-    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
-    values = tf.convert_to_tensor(values, dtype=tf.float32)
-    bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
-    if clip_rho_threshold is not None:
-        clip_rho_threshold = tf.convert_to_tensor(
-            clip_rho_threshold, dtype=tf.float32)
-    if clip_pg_rho_threshold is not None:
-        clip_pg_rho_threshold = tf.convert_to_tensor(
-            clip_pg_rho_threshold, dtype=tf.float32)
-
-    # Make sure tensor ranks are consistent.
-    rho_rank = log_rhos.shape.ndims  # Usually 2.
-    values.shape.assert_has_rank(rho_rank)
-    bootstrap_value.shape.assert_has_rank(rho_rank - 1)
-    discounts.shape.assert_has_rank(rho_rank)
-    rewards.shape.assert_has_rank(rho_rank)
-    if clip_rho_threshold is not None:
-        clip_rho_threshold.shape.assert_has_rank(0)
-    if clip_pg_rho_threshold is not None:
-        clip_pg_rho_threshold.shape.assert_has_rank(0)
-
-    with tf.name_scope(
-            name,
-            values=[log_rhos, discounts, rewards, values, bootstrap_value]):
-        rhos = tf.exp(log_rhos)
-        if clip_rho_threshold is not None:
-            clipped_rhos = tf.minimum(
-                clip_rho_threshold, rhos, name="clipped_rhos")
-
-            tf.summary.histogram("clipped_rhos_1000", tf.minimum(1000.0, rhos))
-            tf.summary.scalar(
-                "num_of_clipped_rhos",
-                tf.reduce_sum(
-                    tf.cast(
-                        tf.equal(clipped_rhos, clip_rho_threshold), tf.int32)))
-            tf.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
-        else:
-            clipped_rhos = rhos
-
-        cs = tf.minimum(1.0, rhos, name="cs")
-        # Append bootstrapped value to get [v1, ..., v_t+1]
-        values_t_plus_1 = tf.concat(
-            [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
-        deltas = clipped_rhos * (
-            rewards + discounts * values_t_plus_1 - values)
-
-        # All sequences are reversed, computation starts from the back.
-        sequences = (
-            tf.reverse(discounts, axis=[0]),
-            tf.reverse(cs, axis=[0]),
-            tf.reverse(deltas, axis=[0]),
-        )
-
-        # V-trace vs are calculated through a scan from the back to the
-        # beginning of the given trajectory.
-        def scanfunc(acc, sequence_item):
-            discount_t, c_t, delta_t = sequence_item
-            return delta_t + discount_t * c_t * acc
-
-        initial_values = tf.zeros_like(bootstrap_value)
-        vs_minus_v_xs = tf.scan(
-            fn=scanfunc,
-            elems=sequences,
-            initializer=initial_values,
-            parallel_iterations=1,
-            back_prop=False,
-            name="scan")
-        # Reverse the results back to original order.
-        vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0], name="vs_minus_v_xs")
-
-        # Add V(x_s) to get v_s.
-        vs = tf.add(vs_minus_v_xs, values, name="vs")
-
-        # Advantage for policy gradient.
-        vs_t_plus_1 = tf.concat(
-            [vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
-        if clip_pg_rho_threshold is not None:
-            clipped_pg_rhos = tf.minimum(
-                clip_pg_rho_threshold, rhos, name="clipped_pg_rhos")
-        else:
-            clipped_pg_rhos = rhos
-        pg_advantages = (
-            clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))
-
-        # Make sure no gradients backpropagated through the returned values.
-        return VTraceReturns(
-            vs=tf.stop_gradient(vs),
-            pg_advantages=tf.stop_gradient(pg_advantages))
-
-
-def get_log_rhos(target_action_log_probs, behaviour_action_log_probs):
-    """With the selected log_probs for multi-discrete actions of behaviour
-    and target policies we compute the log_rhos for calculating the vtrace."""
-    t = tf.stack(target_action_log_probs)
-    b = tf.stack(behaviour_action_log_probs)
-    log_rhos = tf.reduce_sum(t - b, axis=0)
-    return log_rhos
@@ -1,303 +0,0 @@
-"""Adapted from A3CTFPolicy to add V-trace.
-
-Keep in sync with changes to A3CTFPolicy and VtraceSurrogatePolicy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import logging
-import gym
-
-import ray
-from ray.rllib.agents.impala import vtrace
-from ray.rllib.models.tf.tf_action_dist import Categorical
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.tf_policy import LearningRateSchedule, \
-    EntropyCoeffSchedule
-from ray.rllib.utils.explained_variance import explained_variance
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-logger = logging.getLogger(__name__)
-
-BEHAVIOUR_LOGITS = "behaviour_logits"
-
-
-class VTraceLoss(object):
-    def __init__(self,
-                 actions,
-                 actions_logp,
-                 actions_entropy,
-                 dones,
-                 behaviour_logits,
-                 target_logits,
-                 discount,
-                 rewards,
-                 values,
-                 bootstrap_value,
-                 dist_class,
-                 valid_mask,
-                 vf_loss_coeff=0.5,
-                 entropy_coeff=0.01,
-                 clip_rho_threshold=1.0,
-                 clip_pg_rho_threshold=1.0):
-        """Policy gradient loss with vtrace importance weighting.
-
-        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
-        batch_size. The reason we need to know `B` is for V-trace to properly
-        handle episode cut boundaries.
-
-        Args:
-            actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
-            actions_logp: A float32 tensor of shape [T, B].
-            actions_entropy: A float32 tensor of shape [T, B].
-            dones: A bool tensor of shape [T, B].
-            behaviour_logits: A list with length of ACTION_SPACE of float32
-                tensors of shapes
-                [T, B, ACTION_SPACE[0]],
-                ...,
-                [T, B, ACTION_SPACE[-1]]
-            target_logits: A list with length of ACTION_SPACE of float32
-                tensors of shapes
-                [T, B, ACTION_SPACE[0]],
-                ...,
-                [T, B, ACTION_SPACE[-1]]
-            discount: A float32 scalar.
-            rewards: A float32 tensor of shape [T, B].
-            values: A float32 tensor of shape [T, B].
-            bootstrap_value: A float32 tensor of shape [B].
-            dist_class: action distribution class for logits.
-            valid_mask: A bool tensor of valid RNN input elements (#2992).
-        """
-
-        # Compute vtrace on the CPU for better perf.
-        with tf.device("/cpu:0"):
-            self.vtrace_returns = vtrace.multi_from_logits(
-                behaviour_policy_logits=behaviour_logits,
-                target_policy_logits=target_logits,
-                actions=tf.unstack(actions, axis=2),
-                discounts=tf.to_float(~dones) * discount,
-                rewards=rewards,
-                values=values,
-                bootstrap_value=bootstrap_value,
-                dist_class=dist_class,
-                clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
-                clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
-                                              tf.float32))
-            self.value_targets = self.vtrace_returns.vs
-
-        # The policy gradients loss
-        self.pi_loss = -tf.reduce_sum(
-            tf.boolean_mask(actions_logp * self.vtrace_returns.pg_advantages,
-                            valid_mask))
-
-        # The baseline loss
-        delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
-        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
-
-        # The entropy loss
-        self.entropy = tf.reduce_sum(
-            tf.boolean_mask(actions_entropy, valid_mask))
-
-        # The summed weighted loss
-        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
-                           self.entropy * entropy_coeff)
-
-
-def _make_time_major(policy, tensor, drop_last=False):
-    """Swaps batch and trajectory axis.
-
-    Arguments:
-        policy: Policy reference
-        tensor: A tensor or list of tensors to reshape.
-        drop_last: A bool indicating whether to drop the last
-        trajectory item.
-
-    Returns:
-        res: A tensor with swapped axes or a list of tensors with
-        swapped axes.
-    """
-    if isinstance(tensor, list):
-        return [_make_time_major(policy, t, drop_last) for t in tensor]
-
-    if policy.state_in:
-        B = tf.shape(policy.seq_lens)[0]
-        T = tf.shape(tensor)[0] // B
-    else:
-        # Important: chop the tensor into batches at known episode cut
-        # boundaries. TODO(ekl) this is kind of a hack
-        T = policy.config["sample_batch_size"]
-        B = tf.shape(tensor)[0] // T
-    rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
-
-    # swap B and T axes
-    res = tf.transpose(
-        rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))
-
-    if drop_last:
-        return res[:-1]
-    return res
-
-
-def build_vtrace_loss(policy, batch_tensors):
-    if isinstance(policy.action_space, gym.spaces.Discrete):
-        is_multidiscrete = False
-        output_hidden_shape = [policy.action_space.n]
-    elif isinstance(policy.action_space,
-                    gym.spaces.multi_discrete.MultiDiscrete):
-        is_multidiscrete = True
-        output_hidden_shape = policy.action_space.nvec.astype(np.int32)
-    else:
-        is_multidiscrete = False
-        output_hidden_shape = 1
-
-    def make_time_major(*args, **kw):
-        return _make_time_major(policy, *args, **kw)
-
-    actions = batch_tensors[SampleBatch.ACTIONS]
-    dones = batch_tensors[SampleBatch.DONES]
-    rewards = batch_tensors[SampleBatch.REWARDS]
-    behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS]
-    unpacked_behaviour_logits = tf.split(
-        behaviour_logits, output_hidden_shape, axis=1)
-    unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1)
-    action_dist = policy.action_dist
-    values = policy.value_function
-
-    if policy.state_in:
-        max_seq_len = tf.reduce_max(policy.seq_lens) - 1
-        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
-        mask = tf.reshape(mask, [-1])
-    else:
-        mask = tf.ones_like(rewards)
-
-    # Prepare actions for loss
-    loss_actions = actions if is_multidiscrete else tf.expand_dims(
-        actions, axis=1)
-
-    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
-    policy.loss = VTraceLoss(
-        actions=make_time_major(loss_actions, drop_last=True),
-        actions_logp=make_time_major(
-            action_dist.logp(actions), drop_last=True),
-        actions_entropy=make_time_major(
-            action_dist.multi_entropy(), drop_last=True),
-        dones=make_time_major(dones, drop_last=True),
-        behaviour_logits=make_time_major(
-            unpacked_behaviour_logits, drop_last=True),
-        target_logits=make_time_major(unpacked_outputs, drop_last=True),
-        discount=policy.config["gamma"],
-        rewards=make_time_major(rewards, drop_last=True),
-        values=make_time_major(values, drop_last=True),
-        bootstrap_value=make_time_major(values)[-1],
-        dist_class=Categorical if is_multidiscrete else policy.dist_class,
-        valid_mask=make_time_major(mask, drop_last=True),
-        vf_loss_coeff=policy.config["vf_loss_coeff"],
-        entropy_coeff=policy.entropy_coeff,
-        clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
-        clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])
-
-    return policy.loss.total_loss
-
-
-def stats(policy, batch_tensors):
-    values_batched = _make_time_major(
-        policy, policy.value_function, drop_last=policy.config["vtrace"])
-
-    return {
-        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
-        "policy_loss": policy.loss.pi_loss,
-        "entropy": policy.loss.entropy,
-        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
-        "var_gnorm": tf.global_norm(policy.var_list),
-        "vf_loss": policy.loss.vf_loss,
-        "vf_explained_var": explained_variance(
-            tf.reshape(policy.loss.value_targets, [-1]),
-            tf.reshape(values_batched, [-1])),
-    }
-
-
-def grad_stats(policy, grads):
-    return {
-        "grad_gnorm": tf.global_norm(grads),
-    }
-
-
-def postprocess_trajectory(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    # not used, so save some bandwidth
-    del sample_batch.data[SampleBatch.NEXT_OBS]
-    return sample_batch
-
-
-def add_behaviour_logits(policy):
-    return {BEHAVIOUR_LOGITS: policy.model_out}
-
-
-def validate_config(policy, obs_space, action_space, config):
-    if config["vtrace"]:
-        assert config["batch_mode"] == "truncate_episodes", \
-            "Must use `truncate_episodes` batch mode with V-trace."
-
-
-def choose_optimizer(policy, config):
-    if policy.config["opt_type"] == "adam":
-        return tf.train.AdamOptimizer(policy.cur_lr)
-    else:
-        return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"],
-                                         config["momentum"], config["epsilon"])
-
-
-def clip_gradients(policy, optimizer, loss):
-    grads = tf.gradients(loss, policy.var_list)
-    policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
-    clipped_grads = list(zip(policy.grads, policy.var_list))
-    return clipped_grads
-
-
-class ValueNetworkMixin(object):
-    def __init__(self):
-        self.value_function = self.model.value_function()
-        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
-                                          tf.get_variable_scope().name)
-
-    def value(self, ob, *args):
-        feed_dict = {
-            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
-            self.seq_lens: [1]
-        }
-        assert len(args) == len(self.state_in), \
-            (args, self.state_in)
-        for k, v in zip(self.state_in, args):
-            feed_dict[k] = v
-        vf = self.get_session().run(self.value_function, feed_dict)
-        return vf[0]
-
-
-def setup_mixins(policy, obs_space, action_space, config):
-    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
-    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
-                                  config["entropy_coeff_schedule"])
-    ValueNetworkMixin.__init__(policy)
-
-
-VTraceTFPolicy = build_tf_policy(
-    name="VTraceTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
-    loss_fn=build_vtrace_loss,
-    stats_fn=stats,
-    grad_stats_fn=grad_stats,
-    postprocess_fn=postprocess_trajectory,
-    optimizer_fn=choose_optimizer,
-    gradients_fn=clip_gradients,
-    extra_action_fetches_fn=add_behaviour_logits,
-    before_init=validate_config,
-    before_loss_init=setup_mixins,
-    mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
-    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
@@ -1,270 +0,0 @@
-# Copyright 2018 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tests for V-trace.
-
-For details and theory see:
-
-"IMPALA: Scalable Distributed Deep-RL with
-Importance Weighted Actor-Learner Architectures"
-by Espeholt, Soyer, Munos et al.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-import vtrace
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-def _shaped_arange(*shape):
-    """Runs np.arange, converts to float and reshapes."""
-    return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)
-
-
-def _softmax(logits):
-    """Applies softmax non-linearity on inputs."""
-    return np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
-
-
-def _ground_truth_calculation(discounts, log_rhos, rewards, values,
-                              bootstrap_value, clip_rho_threshold,
-                              clip_pg_rho_threshold):
-    """Calculates the ground truth for V-trace in Python/Numpy."""
-    vs = []
-    seq_len = len(discounts)
-    rhos = np.exp(log_rhos)
-    cs = np.minimum(rhos, 1.0)
-    clipped_rhos = rhos
-    if clip_rho_threshold:
-        clipped_rhos = np.minimum(rhos, clip_rho_threshold)
-    clipped_pg_rhos = rhos
-    if clip_pg_rho_threshold:
-        clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
-
-    # This is a very inefficient way to calculate the V-trace ground truth.
-    # We calculate it this way because it is close to the mathematical notation
-    # of
-    # V-trace.
-    # v_s = V(x_s)
-    #       + \sum^{T-1}_{t=s} \gamma^{t-s}
-    #         * \prod_{i=s}^{t-1} c_i
-    #         * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t))
-    # Note that when we take the product over c_i, we write `s:t` as the
-    # notation
-    # of the paper is inclusive of the `t-1`, but Python is exclusive.
-    # Also note that np.prod([]) == 1.
-    values_t_plus_1 = np.concatenate(
-        [values, bootstrap_value[None, :]], axis=0)
-    for s in range(seq_len):
-        v_s = np.copy(values[s])  # Very important copy.
-        for t in range(s, seq_len):
-            v_s += (np.prod(discounts[s:t], axis=0) * np.prod(cs[s:t], axis=0)
-                    * clipped_rhos[t] * (rewards[t] + discounts[t] *
-                                         values_t_plus_1[t + 1] - values[t]))
-        vs.append(v_s)
-    vs = np.stack(vs, axis=0)
-    pg_advantages = (clipped_pg_rhos * (rewards + discounts * np.concatenate(
-        [vs[1:], bootstrap_value[None, :]], axis=0) - values))
-
-    return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages)
-
-
-class LogProbsFromLogitsAndActionsTest(tf.test.TestCase,
-                                       parameterized.TestCase):
-    @parameterized.named_parameters(("Batch1", 1), ("Batch2", 2))
-    def test_log_probs_from_logits_and_actions(self, batch_size):
-        """Tests log_probs_from_logits_and_actions."""
-        seq_len = 7
-        num_actions = 3
-
-        policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
-        actions = np.random.randint(
-            0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32)
-
-        action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
-            policy_logits, actions)
-
-        # Ground Truth
-        # Using broadcasting to create a mask that indexes action logits
-        action_index_mask = actions[..., None] == np.arange(num_actions)
-
-        def index_with_mask(array, mask):
-            return array[mask].reshape(*array.shape[:-1])
-
-        # Note: Normally log(softmax) is not a good idea because it's not
-        # numerically stable. However, in this test we have well-behaved
-        # values.
-        ground_truth_v = index_with_mask(
-            np.log(_softmax(policy_logits)), action_index_mask)
-
-        with self.test_session() as session:
-            self.assertAllClose(ground_truth_v,
-                                session.run(action_log_probs_tensor))
-
-
-class VtraceTest(tf.test.TestCase, parameterized.TestCase):
-    @parameterized.named_parameters(("Batch1", 1), ("Batch5", 5))
-    def test_vtrace(self, batch_size):
-        """Tests V-trace against ground truth data calculated in python."""
-        seq_len = 5
-
-        # Create log_rhos such that rho will span from near-zero to above the
-        # clipping thresholds. In particular, calculate log_rhos in
-        # [-2.5, 2.5),
-        # so that rho is in approx [0.08, 12.2).
-        log_rhos = _shaped_arange(seq_len, batch_size) / (batch_size * seq_len)
-        log_rhos = 5 * (log_rhos - 0.5)  # [0.0, 1.0) -> [-2.5, 2.5).
-        values = {
-            "log_rhos": log_rhos,
-            # T, B where B_i: [0.9 / (i+1)] * T
-            "discounts": np.array([[0.9 / (b + 1) for b in range(batch_size)]
-                                   for _ in range(seq_len)]),
-            "rewards": _shaped_arange(seq_len, batch_size),
-            "values": _shaped_arange(seq_len, batch_size) / batch_size,
-            "bootstrap_value": _shaped_arange(batch_size) + 1.0,
-            "clip_rho_threshold": 3.7,
-            "clip_pg_rho_threshold": 2.2,
-        }
-
-        output = vtrace.from_importance_weights(**values)
-
-        with self.test_session() as session:
-            output_v = session.run(output)
-
-        ground_truth_v = _ground_truth_calculation(**values)
-        for a, b in zip(ground_truth_v, output_v):
-            self.assertAllClose(a, b)
-
-    @parameterized.named_parameters(("Batch1", 1), ("Batch2", 2))
-    def test_vtrace_from_logits(self, batch_size):
-        """Tests V-trace calculated from logits."""
-        seq_len = 5
-        num_actions = 3
-        clip_rho_threshold = None  # No clipping.
-        clip_pg_rho_threshold = None  # No clipping.
-
-        # Intentionally leaving shapes unspecified to test if V-trace can
-        # deal with that.
-        placeholders = {
-            # T, B, NUM_ACTIONS
-            "behaviour_policy_logits": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, None]),
-            # T, B, NUM_ACTIONS
-            "target_policy_logits": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, None]),
-            "actions": tf.placeholder(dtype=tf.int32, shape=[None, None]),
-            "discounts": tf.placeholder(dtype=tf.float32, shape=[None, None]),
-            "rewards": tf.placeholder(dtype=tf.float32, shape=[None, None]),
-            "values": tf.placeholder(dtype=tf.float32, shape=[None, None]),
-            "bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None]),
-        }
-
-        from_logits_output = vtrace.from_logits(
-            clip_rho_threshold=clip_rho_threshold,
-            clip_pg_rho_threshold=clip_pg_rho_threshold,
-            **placeholders)
-
-        target_log_probs = vtrace.log_probs_from_logits_and_actions(
-            placeholders["target_policy_logits"], placeholders["actions"])
-        behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
-            placeholders["behaviour_policy_logits"], placeholders["actions"])
-        log_rhos = target_log_probs - behaviour_log_probs
-        ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)
-
-        values = {
-            "behaviour_policy_logits": _shaped_arange(seq_len, batch_size,
-                                                      num_actions),
-            "target_policy_logits": _shaped_arange(seq_len, batch_size,
-                                                   num_actions),
-            "actions": np.random.randint(
-                0, num_actions - 1, size=(seq_len, batch_size)),
-            "discounts": np.array(  # T, B where B_i: [0.9 / (i+1)] * T
-                [[0.9 / (b + 1) for b in range(batch_size)]
-                 for _ in range(seq_len)]),
-            "rewards": _shaped_arange(seq_len, batch_size),
-            "values": _shaped_arange(seq_len, batch_size) / batch_size,
-            "bootstrap_value": _shaped_arange(batch_size) + 1.0,  # B
-        }
-
-        feed_dict = {placeholders[k]: v for k, v in values.items()}
-        with self.test_session() as session:
-            from_logits_output_v = session.run(
-                from_logits_output, feed_dict=feed_dict)
-            (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
-             ground_truth_target_action_log_probs) = session.run(
-                 ground_truth, feed_dict=feed_dict)
-
-        # Calculate V-trace using the ground truth logits.
-        from_iw = vtrace.from_importance_weights(
-            log_rhos=ground_truth_log_rhos,
-            discounts=values["discounts"],
-            rewards=values["rewards"],
-            values=values["values"],
-            bootstrap_value=values["bootstrap_value"],
-            clip_rho_threshold=clip_rho_threshold,
-            clip_pg_rho_threshold=clip_pg_rho_threshold)
-
-        with self.test_session() as session:
-            from_iw_v = session.run(from_iw)
-
-        self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
-        self.assertAllClose(from_iw_v.pg_advantages,
-                            from_logits_output_v.pg_advantages)
-        self.assertAllClose(ground_truth_behaviour_action_log_probs,
-                            from_logits_output_v.behaviour_action_log_probs)
-        self.assertAllClose(ground_truth_target_action_log_probs,
-                            from_logits_output_v.target_action_log_probs)
-        self.assertAllClose(ground_truth_log_rhos,
-                            from_logits_output_v.log_rhos)
-
-    def test_higher_rank_inputs_for_importance_weights(self):
-        """Checks support for additional dimensions in inputs."""
-        placeholders = {
-            "log_rhos": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, 1]),
-            "discounts": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, 1]),
-            "rewards": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, 42]),
-            "values": tf.placeholder(dtype=tf.float32, shape=[None, None, 42]),
-            "bootstrap_value": tf.placeholder(
-                dtype=tf.float32, shape=[None, 42])
-        }
-        output = vtrace.from_importance_weights(**placeholders)
-        self.assertEqual(output.vs.shape.as_list()[-1], 42)
-
-    def test_inconsistent_rank_inputs_for_importance_weights(self):
-        """Test one of many possible errors in shape of inputs."""
-        placeholders = {
-            "log_rhos": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, 1]),
-            "discounts": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, 1]),
-            "rewards": tf.placeholder(
-                dtype=tf.float32, shape=[None, None, 42]),
-            "values": tf.placeholder(dtype=tf.float32, shape=[None, None, 42]),
-            # Should be [None, 42].
-            "bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None])
-        }
-        with self.assertRaisesRegexp(ValueError, "must have rank 2"):
-            vtrace.from_importance_weights(**placeholders)
-
-
-if __name__ == "__main__":
-    tf.test.main()
@@ -1,7 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.marwil.marwil import MARWILTrainer, DEFAULT_CONFIG
-
-__all__ = ["MARWILTrainer", "DEFAULT_CONFIG"]
@@ -1,55 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.agents.marwil.marwil_policy import MARWILPolicy
-from ray.rllib.optimizers import SyncBatchReplayOptimizer
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # You should override this to point to an offline dataset (see agent.py).
-    "input": "sampler",
-    # Use importance sampling estimators for reward
-    "input_evaluation": ["is", "wis"],
-
-    # Scaling of advantages in exponential terms
-    # When beta is 0, MARWIL is reduced to imitation learning
-    "beta": 1.0,
-    # Balancing value estimation loss and policy optimization loss
-    "vf_coeff": 1.0,
-    # Whether to calculate cumulative rewards
-    "postprocess_inputs": True,
-    # Whether to rollout "complete_episodes" or "truncate_episodes"
-    "batch_mode": "complete_episodes",
-    # Learning rate for adam optimizer
-    "lr": 1e-4,
-    # Number of timesteps collected for each SGD round
-    "train_batch_size": 2000,
-    # Number of steps max to keep in the batch replay buffer
-    "replay_buffer_size": 100000,
-    # Number of steps to read before learning starts
-    "learning_starts": 0,
-    # === Parallelism ===
-    "num_workers": 0,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def make_optimizer(workers, config):
-    return SyncBatchReplayOptimizer(
-        workers,
-        learning_starts=config["learning_starts"],
-        buffer_size=config["replay_buffer_size"],
-        train_batch_size=config["train_batch_size"],
-    )
-
-
-MARWILTrainer = build_trainer(
-    name="MARWIL",
-    default_config=DEFAULT_CONFIG,
-    default_policy=MARWILPolicy,
-    make_policy_optimizer=make_optimizer)
@@ -1,175 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-from ray.rllib.models import ModelCatalog
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
-from ray.rllib.utils.annotations import override
-from ray.rllib.policy.policy import Policy
-from ray.rllib.policy.tf_policy import TFPolicy
-from ray.rllib.utils.explained_variance import explained_variance
-from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.tf_ops import scope_vars
-
-tf = try_import_tf()
-
-POLICY_SCOPE = "p_func"
-VALUE_SCOPE = "v_func"
-
-
-class ValueLoss(object):
-    def __init__(self, state_values, cumulative_rewards):
-        self.loss = 0.5 * tf.reduce_mean(
-            tf.square(state_values - cumulative_rewards))
-
-
-class ReweightedImitationLoss(object):
-    def __init__(self, state_values, cumulative_rewards, logits, actions,
-                 action_space, beta):
-        ma_adv_norm = tf.get_variable(
-            name="moving_average_of_advantage_norm",
-            dtype=tf.float32,
-            initializer=100.0,
-            trainable=False)
-        # advantage estimation
-        adv = cumulative_rewards - state_values
-        # update averaged advantage norm
-        update_adv_norm = tf.assign_add(
-            ref=ma_adv_norm,
-            value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))
-
-        # exponentially weighted advantages
-        with tf.control_dependencies([update_adv_norm]):
-            exp_advs = tf.exp(
-                beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))
-
-        # log\pi_\theta(a|s)
-        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
-        action_dist = dist_cls(logits)
-        logprobs = action_dist.logp(actions)
-
-        self.loss = -1.0 * tf.reduce_mean(
-            tf.stop_gradient(exp_advs) * logprobs)
-
-
-class MARWILPostprocessing(object):
-    """Adds the advantages field to the trajectory."""
-
-    @override(Policy)
-    def postprocess_trajectory(self,
-                               sample_batch,
-                               other_agent_batches=None,
-                               episode=None):
-        completed = sample_batch["dones"][-1]
-        if completed:
-            last_r = 0.0
-        else:
-            raise NotImplementedError(
-                "last done mask in a batch should be True. "
-                "For now, we only support reading experience batches produced "
-                "with batch_mode='complete_episodes'.",
-                len(sample_batch[SampleBatch.DONES]),
-                sample_batch[SampleBatch.DONES][-1])
-        batch = compute_advantages(
-            sample_batch, last_r, gamma=self.config["gamma"], use_gae=False)
-        return batch
-
-
-class MARWILPolicy(MARWILPostprocessing, TFPolicy):
-    def __init__(self, observation_space, action_space, config):
-        config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
-        self.config = config
-
-        dist_cls, logit_dim = ModelCatalog.get_action_dist(
-            action_space, self.config["model"])
-
-        # Action inputs
-        self.obs_t = tf.placeholder(
-            tf.float32, shape=(None, ) + observation_space.shape)
-        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
-        prev_rewards_ph = tf.placeholder(
-            tf.float32, [None], name="prev_reward")
-
-        with tf.variable_scope(POLICY_SCOPE) as scope:
-            self.model = ModelCatalog.get_model({
-                "obs": self.obs_t,
-                "prev_actions": prev_actions_ph,
-                "prev_rewards": prev_rewards_ph,
-                "is_training": self._get_is_training_placeholder(),
-            }, observation_space, action_space, logit_dim,
-                                                self.config["model"])
-            logits = self.model.outputs
-            self.p_func_vars = scope_vars(scope.name)
-
-        # Action outputs
-        action_dist = dist_cls(logits)
-        self.output_actions = action_dist.sample()
-
-        # Training inputs
-        self.act_t = ModelCatalog.get_action_placeholder(action_space)
-        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")
-
-        # v network evaluation
-        with tf.variable_scope(VALUE_SCOPE) as scope:
-            state_values = self.model.value_function()
-            self.v_func_vars = scope_vars(scope.name)
-        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
-        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
-                                              logits, self.act_t, action_space)
-
-        # which kind of objective to optimize
-        objective = (
-            self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
-        self.explained_variance = tf.reduce_mean(
-            explained_variance(self.cum_rew_t, state_values))
-
-        # initialize TFPolicy
-        self.sess = tf.get_default_session()
-        self.loss_inputs = [
-            (SampleBatch.CUR_OBS, self.obs_t),
-            (SampleBatch.ACTIONS, self.act_t),
-            (Postprocessing.ADVANTAGES, self.cum_rew_t),
-        ]
-        TFPolicy.__init__(
-            self,
-            observation_space,
-            action_space,
-            self.sess,
-            obs_input=self.obs_t,
-            action_sampler=self.output_actions,
-            action_prob=action_dist.sampled_action_prob(),
-            loss=objective,
-            model=self.model,
-            loss_inputs=self.loss_inputs,
-            state_inputs=self.model.state_in,
-            state_outputs=self.model.state_out,
-            prev_action_input=prev_actions_ph,
-            prev_reward_input=prev_rewards_ph)
-        self.sess.run(tf.global_variables_initializer())
-
-        self.stats_fetches = {
-            "total_loss": objective,
-            "vf_explained_var": self.explained_variance,
-            "policy_loss": self.p_loss.loss,
-            "vf_loss": self.v_loss.loss
-        }
-
-    def _build_value_loss(self, state_values, cum_rwds):
-        return ValueLoss(state_values, cum_rwds)
-
-    def _build_policy_loss(self, state_values, cum_rwds, logits, actions,
-                           action_space):
-        return ReweightedImitationLoss(state_values, cum_rwds, logits, actions,
-                                       action_space, self.config["beta"])
-
-    @override(TFPolicy)
-    def extra_compute_grad_fetches(self):
-        return {LEARNER_STATS_KEY: self.stats_fetches}
-
-    @override(Policy)
-    def get_initial_state(self):
-        return self.model.state_init
@@ -1,128 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import pickle
-import numpy as np
-
-from ray.tune import result as tune_result
-from ray.rllib.agents.trainer import Trainer, with_common_config
-
-
-class _MockTrainer(Trainer):
-    """Mock trainer for use in tests"""
-
-    _name = "MockTrainer"
-    _default_config = with_common_config({
-        "mock_error": False,
-        "persistent_error": False,
-        "test_variable": 1,
-        "num_workers": 0,
-        "user_checkpoint_freq": 0,
-    })
-
-    @classmethod
-    def default_resource_request(cls, config):
-        return None
-
-    def _init(self, config, env_creator):
-        self.info = None
-        self.restored = False
-
-    def _train(self):
-        if self.config["mock_error"] and self.iteration == 1 \
-                and (self.config["persistent_error"] or not self.restored):
-            raise Exception("mock error")
-        result = dict(
-            episode_reward_mean=10,
-            episode_len_mean=10,
-            timesteps_this_iter=10,
-            info={})
-        if self.config["user_checkpoint_freq"] > 0 and self.iteration > 0:
-            if self.iteration % self.config["user_checkpoint_freq"] == 0:
-                result.update({tune_result.SHOULD_CHECKPOINT: True})
-        return result
-
-    def _save(self, checkpoint_dir):
-        path = os.path.join(checkpoint_dir, "mock_agent.pkl")
-        with open(path, "wb") as f:
-            pickle.dump(self.info, f)
-        return path
-
-    def _restore(self, checkpoint_path):
-        with open(checkpoint_path, "rb") as f:
-            info = pickle.load(f)
-        self.info = info
-        self.restored = True
-
-    def _register_if_needed(self, env_object):
-        pass
-
-    def set_info(self, info):
-        self.info = info
-        return info
-
-    def get_info(self):
-        return self.info
-
-
-class _SigmoidFakeData(_MockTrainer):
-    """Trainer that returns sigmoid learning curves.
-
-    This can be helpful for evaluating early stopping algorithms."""
-
-    _name = "SigmoidFakeData"
-    _default_config = with_common_config({
-        "width": 100,
-        "height": 100,
-        "offset": 0,
-        "iter_time": 10,
-        "iter_timesteps": 1,
-        "num_workers": 0,
-    })
-
-    def _train(self):
-        i = max(0, self.iteration - self.config["offset"])
-        v = np.tanh(float(i) / self.config["width"])
-        v *= self.config["height"]
-        return dict(
-            episode_reward_mean=v,
-            episode_len_mean=v,
-            timesteps_this_iter=self.config["iter_timesteps"],
-            time_this_iter_s=self.config["iter_time"],
-            info={})
-
-
-class _ParameterTuningTrainer(_MockTrainer):
-
-    _name = "ParameterTuningTrainer"
-    _default_config = with_common_config({
-        "reward_amt": 10,
-        "dummy_param": 10,
-        "dummy_param2": 15,
-        "iter_time": 10,
-        "iter_timesteps": 1,
-        "num_workers": 0,
-    })
-
-    def _train(self):
-        return dict(
-            episode_reward_mean=self.config["reward_amt"] * self.iteration,
-            episode_len_mean=self.config["reward_amt"],
-            timesteps_this_iter=self.config["iter_timesteps"],
-            time_this_iter_s=self.config["iter_time"],
-            info={})
-
-
-def _agent_import_failed(trace):
-    """Returns dummy agent class for if PyTorch etc. is not installed."""
-
-    class _AgentImportFailed(Trainer):
-        _name = "AgentImportFailed"
-        _default_config = with_common_config({})
-
-        def _setup(self, config):
-            raise ImportError(trace)
-
-    return _AgentImportFailed
@@ -1,6 +0,0 @@
-from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG
-from ray.rllib.utils import renamed_agent
-
-PGAgent = renamed_agent(PGTrainer)
-
-__all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"]
@@ -1,35 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # No remote workers by default
-    "num_workers": 0,
-    # Learning rate
-    "lr": 0.0004,
-    # Use PyTorch as backend
-    "use_pytorch": False,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def get_policy_class(config):
-    if config["use_pytorch"]:
-        from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
-        return PGTorchPolicy
-    else:
-        return PGTFPolicy
-
-
-PGTrainer = build_trainer(
-    name="PG",
-    default_config=DEFAULT_CONFIG,
-    default_policy=PGTFPolicy,
-    get_policy_class=get_policy_class)
@@ -1,35 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-# The basic policy gradients loss
-def policy_gradient_loss(policy, batch_tensors):
-    actions = batch_tensors[SampleBatch.ACTIONS]
-    advantages = batch_tensors[Postprocessing.ADVANTAGES]
-    return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)
-
-
-# This adds the "advantages" column to the sample batch.
-def postprocess_advantages(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    return compute_advantages(
-        sample_batch, 0.0, policy.config["gamma"], use_gae=False)
-
-
-PGTFPolicy = build_tf_policy(
-    name="PGTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
-    postprocess_fn=postprocess_advantages,
-    loss_fn=policy_gradient_loss)
@@ -1,42 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.torch_policy_template import build_torch_policy
-
-
-def pg_torch_loss(policy, batch_tensors):
-    logits, _ = policy.model({
-        SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
-    })
-    action_dist = policy.dist_class(logits)
-    log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS])
-    # save the error in the policy object
-    policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot(
-        log_probs.reshape(-1))
-    return policy.pi_err
-
-
-def postprocess_advantages(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    return compute_advantages(
-        sample_batch, 0.0, policy.config["gamma"], use_gae=False)
-
-
-def pg_loss_stats(policy, batch_tensors):
-    # the error is recorded when computing the loss
-    return {"policy_loss": policy.pi_err.item()}
-
-
-PGTorchPolicy = build_torch_policy(
-    name="PGTorchPolicy",
-    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
-    loss_fn=pg_torch_loss,
-    stats_fn=pg_loss_stats,
-    postprocess_fn=postprocess_advantages)
@@ -1,7 +0,0 @@
-from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG
-from ray.rllib.agents.ppo.appo import APPOTrainer
-from ray.rllib.utils import renamed_agent
-
-PPOAgent = renamed_agent(PPOTrainer)
-
-__all__ = ["PPOAgent", "APPOTrainer", "PPOTrainer", "DEFAULT_CONFIG"]
@@ -1,91 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.ppo.appo_policy import AsyncPPOTFPolicy
-from ray.rllib.agents.trainer import with_base_config
-from ray.rllib.agents.ppo.ppo import update_kl
-from ray.rllib.agents import impala
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
-    # Whether to use V-trace weighted advantages. If false, PPO GAE advantages
-    # will be used instead.
-    "vtrace": False,
-
-    # == These two options only apply if vtrace: False ==
-    # If true, use the Generalized Advantage Estimator (GAE)
-    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
-    "use_gae": True,
-    # GAE(lambda) parameter
-    "lambda": 1.0,
-
-    # == PPO surrogate loss options ==
-    "clip_param": 0.4,
-
-    # == PPO KL Loss options ==
-    "use_kl_loss": False,
-    "kl_coeff": 1.0,
-    "kl_target": 0.01,
-
-    # == IMPALA optimizer params (see documentation in impala.py) ==
-    "sample_batch_size": 50,
-    "train_batch_size": 500,
-    "min_iter_time_s": 10,
-    "num_workers": 2,
-    "num_gpus": 0,
-    "num_data_loader_buffers": 1,
-    "minibatch_buffer_size": 1,
-    "num_sgd_iter": 1,
-    "replay_proportion": 0.0,
-    "replay_buffer_num_slots": 100,
-    "learner_queue_size": 16,
-    "learner_queue_timeout": 300,
-    "max_sample_requests_in_flight_per_worker": 2,
-    "broadcast_interval": 1,
-    "grad_clip": 40.0,
-    "opt_type": "adam",
-    "lr": 0.0005,
-    "lr_schedule": None,
-    "decay": 0.99,
-    "momentum": 0.0,
-    "epsilon": 0.1,
-    "vf_loss_coeff": 0.5,
-    "entropy_coeff": 0.01,
-    "entropy_coeff_schedule": None,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def update_target_and_kl(trainer, fetches):
-    # Update the KL coeff depending on how many steps LearnerThread has stepped
-    # through
-    learner_steps = trainer.optimizer.learner.num_steps
-    if learner_steps >= trainer.target_update_frequency:
-
-        # Update Target Network
-        trainer.optimizer.learner.num_steps = 0
-        trainer.workers.local_worker().foreach_trainable_policy(
-            lambda p, _: p.update_target())
-
-        # Also update KL Coeff
-        if trainer.config["use_kl_loss"]:
-            update_kl(trainer, trainer.optimizer.learner.stats)
-
-
-def initialize_target(trainer):
-    trainer.workers.local_worker().foreach_trainable_policy(
-        lambda p, _: p.update_target())
-    trainer.target_update_frequency = trainer.config["num_sgd_iter"] \
-        * trainer.config["minibatch_buffer_size"]
-
-
-APPOTrainer = impala.ImpalaTrainer.with_updates(
-    name="APPO",
-    default_config=DEFAULT_CONFIG,
-    default_policy=AsyncPPOTFPolicy,
-    get_policy_class=lambda _: AsyncPPOTFPolicy,
-    after_init=initialize_target,
-    after_optimizer_step=update_target_and_kl)
@@ -1,440 +0,0 @@
-"""Adapted from VTraceTFPolicy to use the PPO surrogate loss.
-
-Keep in sync with changes to VTraceTFPolicy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import logging
-import gym
-
-from ray.rllib.agents.impala import vtrace
-from ray.rllib.agents.impala.vtrace_policy import _make_time_major, \
-        BEHAVIOUR_LOGITS, clip_gradients, \
-        validate_config, choose_optimizer, ValueNetworkMixin
-from ray.rllib.evaluation.postprocessing import Postprocessing
-from ray.rllib.models.tf.tf_action_dist import Categorical
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.evaluation.postprocessing import compute_advantages
-from ray.rllib.utils import try_import_tf
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.tf_policy import LearningRateSchedule
-from ray.rllib.agents.ppo.ppo_policy import KLCoeffMixin
-from ray.rllib.models import ModelCatalog
-from ray.rllib.utils.explained_variance import explained_variance
-
-tf = try_import_tf()
-
-POLICY_SCOPE = "func"
-TARGET_POLICY_SCOPE = "target_func"
-
-logger = logging.getLogger(__name__)
-
-
-class PPOSurrogateLoss(object):
-    """Loss used when V-trace is disabled.
-
-    Arguments:
-        prev_actions_logp: A float32 tensor of shape [T, B].
-        actions_logp: A float32 tensor of shape [T, B].
-        action_kl: A float32 tensor of shape [T, B].
-        actions_entropy: A float32 tensor of shape [T, B].
-        values: A float32 tensor of shape [T, B].
-        valid_mask: A bool tensor of valid RNN input elements (#2992).
-        advantages: A float32 tensor of shape [T, B].
-        value_targets: A float32 tensor of shape [T, B].
-        vf_loss_coeff (float): Coefficient of the value function loss.
-        entropy_coeff (float): Coefficient of the entropy regularizer.
-        clip_param (float): Clip parameter.
-        cur_kl_coeff (float): Coefficient for KL loss.
-        use_kl_loss (bool): If true, use KL loss.
-    """
-
-    def __init__(self,
-                 prev_actions_logp,
-                 actions_logp,
-                 action_kl,
-                 actions_entropy,
-                 values,
-                 valid_mask,
-                 advantages,
-                 value_targets,
-                 vf_loss_coeff=0.5,
-                 entropy_coeff=0.01,
-                 clip_param=0.3,
-                 cur_kl_coeff=None,
-                 use_kl_loss=False):
-        def reduce_mean_valid(t):
-            return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
-
-        logp_ratio = tf.exp(actions_logp - prev_actions_logp)
-
-        surrogate_loss = tf.minimum(
-            advantages * logp_ratio,
-            advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
-                                          1 + clip_param))
-
-        self.mean_kl = reduce_mean_valid(action_kl)
-        self.pi_loss = -reduce_mean_valid(surrogate_loss)
-
-        # The baseline loss
-        delta = values - value_targets
-        self.value_targets = value_targets
-        self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
-
-        # The entropy loss
-        self.entropy = reduce_mean_valid(actions_entropy)
-
-        # The summed weighted loss
-        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
-                           self.entropy * entropy_coeff)
-
-        # Optional additional KL Loss
-        if use_kl_loss:
-            self.total_loss += cur_kl_coeff * self.mean_kl
-
-
-class VTraceSurrogateLoss(object):
-    def __init__(self,
-                 actions,
-                 prev_actions_logp,
-                 actions_logp,
-                 old_policy_actions_logp,
-                 action_kl,
-                 actions_entropy,
-                 dones,
-                 behaviour_logits,
-                 old_policy_behaviour_logits,
-                 target_logits,
-                 discount,
-                 rewards,
-                 values,
-                 bootstrap_value,
-                 dist_class,
-                 valid_mask,
-                 vf_loss_coeff=0.5,
-                 entropy_coeff=0.01,
-                 clip_rho_threshold=1.0,
-                 clip_pg_rho_threshold=1.0,
-                 clip_param=0.3,
-                 cur_kl_coeff=None,
-                 use_kl_loss=False):
-        """APPO Loss, with IS modifications and V-trace for Advantage Estimation
-
-        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
-        batch_size. The reason we need to know `B` is for V-trace to properly
-        handle episode cut boundaries.
-
-        Arguments:
-            actions: An int|float32 tensor of shape [T, B, logit_dim].
-            prev_actions_logp: A float32 tensor of shape [T, B].
-            actions_logp: A float32 tensor of shape [T, B].
-            old_policy_actions_logp: A float32 tensor of shape [T, B].
-            action_kl: A float32 tensor of shape [T, B].
-            actions_entropy: A float32 tensor of shape [T, B].
-            dones: A bool tensor of shape [T, B].
-            behaviour_logits: A float32 tensor of shape [T, B, logit_dim].
-            old_policy_behaviour_logits: A float32 tensor of shape
-            [T, B, logit_dim].
-            target_logits: A float32 tensor of shape [T, B, logit_dim].
-            discount: A float32 scalar.
-            rewards: A float32 tensor of shape [T, B].
-            values: A float32 tensor of shape [T, B].
-            bootstrap_value: A float32 tensor of shape [B].
-            dist_class: action distribution class for logits.
-            valid_mask: A bool tensor of valid RNN input elements (#2992).
-            vf_loss_coeff (float): Coefficient of the value function loss.
-            entropy_coeff (float): Coefficient of the entropy regularizer.
-            clip_param (float): Clip parameter.
-            cur_kl_coeff (float): Coefficient for KL loss.
-            use_kl_loss (bool): If true, use KL loss.
-        """
-
-        def reduce_mean_valid(t):
-            return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
-
-        # Compute vtrace on the CPU for better perf.
-        with tf.device("/cpu:0"):
-            self.vtrace_returns = vtrace.multi_from_logits(
-                behaviour_policy_logits=behaviour_logits,
-                target_policy_logits=old_policy_behaviour_logits,
-                actions=tf.unstack(actions, axis=2),
-                discounts=tf.to_float(~dones) * discount,
-                rewards=rewards,
-                values=values,
-                bootstrap_value=bootstrap_value,
-                dist_class=dist_class,
-                clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
-                clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
-                                              tf.float32))
-
-        self.is_ratio = tf.clip_by_value(
-            tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
-        logp_ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)
-
-        advantages = self.vtrace_returns.pg_advantages
-        surrogate_loss = tf.minimum(
-            advantages * logp_ratio,
-            advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
-                                          1 + clip_param))
-
-        self.mean_kl = reduce_mean_valid(action_kl)
-        self.pi_loss = -reduce_mean_valid(surrogate_loss)
-
-        # The baseline loss
-        delta = values - self.vtrace_returns.vs
-        self.value_targets = self.vtrace_returns.vs
-        self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
-
-        # The entropy loss
-        self.entropy = reduce_mean_valid(actions_entropy)
-
-        # The summed weighted loss
-        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
-                           self.entropy * entropy_coeff)
-
-        # Optional additional KL Loss
-        if use_kl_loss:
-            self.total_loss += cur_kl_coeff * self.mean_kl
-
-
-def build_appo_model(policy, obs_space, action_space, config):
-    policy.model = ModelCatalog.get_model_v2(
-        obs_space,
-        action_space,
-        policy.logit_dim,
-        config["model"],
-        name=POLICY_SCOPE,
-        framework="tf")
-
-    policy.target_model = ModelCatalog.get_model_v2(
-        obs_space,
-        action_space,
-        policy.logit_dim,
-        config["model"],
-        name=TARGET_POLICY_SCOPE,
-        framework="tf")
-
-    return policy.model
-
-
-def build_appo_surrogate_loss(policy, batch_tensors):
-    if isinstance(policy.action_space, gym.spaces.Discrete):
-        is_multidiscrete = False
-        output_hidden_shape = [policy.action_space.n]
-    elif isinstance(policy.action_space,
-                    gym.spaces.multi_discrete.MultiDiscrete):
-        is_multidiscrete = True
-        output_hidden_shape = policy.action_space.nvec.astype(np.int32)
-    else:
-        is_multidiscrete = False
-        output_hidden_shape = 1
-
-    def make_time_major(*args, **kw):
-        return _make_time_major(policy, *args, **kw)
-
-    actions = batch_tensors[SampleBatch.ACTIONS]
-    dones = batch_tensors[SampleBatch.DONES]
-    rewards = batch_tensors[SampleBatch.REWARDS]
-
-    behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS]
-
-    policy.target_model_out, _ = policy.target_model(
-        policy.input_dict, policy.state_in, policy.seq_lens)
-    old_policy_behaviour_logits = tf.stop_gradient(policy.target_model_out)
-
-    unpacked_behaviour_logits = tf.split(
-        behaviour_logits, output_hidden_shape, axis=1)
-    unpacked_old_policy_behaviour_logits = tf.split(
-        old_policy_behaviour_logits, output_hidden_shape, axis=1)
-    unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1)
-    action_dist = policy.action_dist
-    old_policy_action_dist = policy.dist_class(old_policy_behaviour_logits)
-    prev_action_dist = policy.dist_class(behaviour_logits)
-    values = policy.value_function
-
-    policy.model_vars = policy.model.variables()
-    policy.target_model_vars = policy.target_model.variables()
-
-    if policy.state_in:
-        max_seq_len = tf.reduce_max(policy.seq_lens) - 1
-        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
-        mask = tf.reshape(mask, [-1])
-    else:
-        mask = tf.ones_like(rewards)
-
-    if policy.config["vtrace"]:
-        logger.info("Using V-Trace surrogate loss (vtrace=True)")
-
-        # Prepare actions for loss
-        loss_actions = actions if is_multidiscrete else tf.expand_dims(
-            actions, axis=1)
-
-        # Prepare KL for Loss
-        mean_kl = make_time_major(
-            old_policy_action_dist.multi_kl(action_dist), drop_last=True)
-
-        policy.loss = VTraceSurrogateLoss(
-            actions=make_time_major(loss_actions, drop_last=True),
-            prev_actions_logp=make_time_major(
-                prev_action_dist.logp(actions), drop_last=True),
-            actions_logp=make_time_major(
-                action_dist.logp(actions), drop_last=True),
-            old_policy_actions_logp=make_time_major(
-                old_policy_action_dist.logp(actions), drop_last=True),
-            action_kl=tf.reduce_mean(mean_kl, axis=0)
-            if is_multidiscrete else mean_kl,
-            actions_entropy=make_time_major(
-                action_dist.multi_entropy(), drop_last=True),
-            dones=make_time_major(dones, drop_last=True),
-            behaviour_logits=make_time_major(
-                unpacked_behaviour_logits, drop_last=True),
-            old_policy_behaviour_logits=make_time_major(
-                unpacked_old_policy_behaviour_logits, drop_last=True),
-            target_logits=make_time_major(unpacked_outputs, drop_last=True),
-            discount=policy.config["gamma"],
-            rewards=make_time_major(rewards, drop_last=True),
-            values=make_time_major(values, drop_last=True),
-            bootstrap_value=make_time_major(values)[-1],
-            dist_class=Categorical if is_multidiscrete else policy.dist_class,
-            valid_mask=make_time_major(mask, drop_last=True),
-            vf_loss_coeff=policy.config["vf_loss_coeff"],
-            entropy_coeff=policy.config["entropy_coeff"],
-            clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
-            clip_pg_rho_threshold=policy.config[
-                "vtrace_clip_pg_rho_threshold"],
-            clip_param=policy.config["clip_param"],
-            cur_kl_coeff=policy.kl_coeff,
-            use_kl_loss=policy.config["use_kl_loss"])
-    else:
-        logger.info("Using PPO surrogate loss (vtrace=False)")
-
-        # Prepare KL for Loss
-        mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist))
-
-        policy.loss = PPOSurrogateLoss(
-            prev_actions_logp=make_time_major(prev_action_dist.logp(actions)),
-            actions_logp=make_time_major(action_dist.logp(actions)),
-            action_kl=tf.reduce_mean(mean_kl, axis=0)
-            if is_multidiscrete else mean_kl,
-            actions_entropy=make_time_major(action_dist.multi_entropy()),
-            values=make_time_major(values),
-            valid_mask=make_time_major(mask),
-            advantages=make_time_major(
-                batch_tensors[Postprocessing.ADVANTAGES]),
-            value_targets=make_time_major(
-                batch_tensors[Postprocessing.VALUE_TARGETS]),
-            vf_loss_coeff=policy.config["vf_loss_coeff"],
-            entropy_coeff=policy.config["entropy_coeff"],
-            clip_param=policy.config["clip_param"],
-            cur_kl_coeff=policy.kl_coeff,
-            use_kl_loss=policy.config["use_kl_loss"])
-
-    return policy.loss.total_loss
-
-
-def stats(policy, batch_tensors):
-    values_batched = _make_time_major(
-        policy, policy.value_function, drop_last=policy.config["vtrace"])
-
-    stats_dict = {
-        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
-        "policy_loss": policy.loss.pi_loss,
-        "entropy": policy.loss.entropy,
-        "var_gnorm": tf.global_norm(policy.var_list),
-        "vf_loss": policy.loss.vf_loss,
-        "vf_explained_var": explained_variance(
-            tf.reshape(policy.loss.value_targets, [-1]),
-            tf.reshape(values_batched, [-1])),
-    }
-
-    if policy.config["vtrace"]:
-        is_stat_mean, is_stat_var = tf.nn.moments(policy.loss.is_ratio, [0, 1])
-        stats_dict.update({"mean_IS": is_stat_mean})
-        stats_dict.update({"var_IS": is_stat_var})
-
-    if policy.config["use_kl_loss"]:
-        stats_dict.update({"kl": policy.loss.mean_kl})
-        stats_dict.update({"KL_Coeff": policy.kl_coeff})
-
-    return stats_dict
-
-
-def postprocess_trajectory(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    if not policy.config["vtrace"]:
-        completed = sample_batch["dones"][-1]
-        if completed:
-            last_r = 0.0
-        else:
-            next_state = []
-            for i in range(len(policy.state_in)):
-                next_state.append([sample_batch["state_out_{}".format(i)][-1]])
-            last_r = policy.value(sample_batch["new_obs"][-1], *next_state)
-        batch = compute_advantages(
-            sample_batch,
-            last_r,
-            policy.config["gamma"],
-            policy.config["lambda"],
-            use_gae=policy.config["use_gae"])
-    else:
-        batch = sample_batch
-    del batch.data["new_obs"]  # not used, so save some bandwidth
-    return batch
-
-
-def add_values_and_logits(policy):
-    out = {BEHAVIOUR_LOGITS: policy.model_out}
-    if not policy.config["vtrace"]:
-        out[SampleBatch.VF_PREDS] = policy.value_function
-    return out
-
-
-class TargetNetworkMixin(object):
-    def __init__(self, obs_space, action_space, config):
-        """Target Network is updated by the master learner every
-        trainer.update_target_frequency steps. All worker batches
-        are importance sampled w.r. to the target network to ensure
-        a more stable pi_old in PPO.
-        """
-        assign_ops = []
-        assert len(self.model_vars) == len(self.target_model_vars)
-        for var, var_target in zip(self.model_vars, self.target_model_vars):
-            assign_ops.append(var_target.assign(var))
-        self.update_target_network = tf.group(*assign_ops)
-
-    def update_target(self):
-        return self.get_session().run(self.update_target_network)
-
-
-def setup_mixins(policy, obs_space, action_space, config):
-    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
-    KLCoeffMixin.__init__(policy, config)
-    ValueNetworkMixin.__init__(policy)
-
-
-def setup_late_mixins(policy, obs_space, action_space, config):
-    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
-
-
-AsyncPPOTFPolicy = build_tf_policy(
-    name="AsyncPPOTFPolicy",
-    make_model=build_appo_model,
-    loss_fn=build_appo_surrogate_loss,
-    stats_fn=stats,
-    postprocess_fn=postprocess_trajectory,
-    optimizer_fn=choose_optimizer,
-    gradients_fn=clip_gradients,
-    extra_action_fetches_fn=add_values_and_logits,
-    before_init=validate_config,
-    before_loss_init=setup_mixins,
-    after_init=setup_late_mixins,
-    mixins=[
-        LearningRateSchedule, KLCoeffMixin, TargetNetworkMixin,
-        ValueNetworkMixin
-    ],
-    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
@@ -1,154 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-
-from ray.rllib.agents import with_common_config
-from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
-from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer
-
-logger = logging.getLogger(__name__)
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # If true, use the Generalized Advantage Estimator (GAE)
-    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
-    "use_gae": True,
-    # GAE(lambda) parameter
-    "lambda": 1.0,
-    # Initial coefficient for KL divergence
-    "kl_coeff": 0.2,
-    # Size of batches collected from each worker
-    "sample_batch_size": 200,
-    # Number of timesteps collected for each SGD round
-    "train_batch_size": 4000,
-    # Total SGD batch size across all devices for SGD
-    "sgd_minibatch_size": 128,
-    # Whether to shuffle sequences in the batch when training (recommended)
-    "shuffle_sequences": True,
-    # Number of SGD iterations in each outer loop
-    "num_sgd_iter": 30,
-    # Stepsize of SGD
-    "lr": 5e-5,
-    # Learning rate schedule
-    "lr_schedule": None,
-    # Share layers for value function. If you set this to True, it's important
-    # to tune vf_loss_coeff.
-    "vf_share_layers": False,
-    # Coefficient of the value function loss. It's important to tune this if
-    # you set vf_share_layers: True
-    "vf_loss_coeff": 1.0,
-    # Coefficient of the entropy regularizer
-    "entropy_coeff": 0.0,
-    # Decay schedule for the entropy regularizer
-    "entropy_coeff_schedule": None,
-    # PPO clip parameter
-    "clip_param": 0.3,
-    # Clip param for the value function. Note that this is sensitive to the
-    # scale of the rewards. If your expected V is large, increase this.
-    "vf_clip_param": 10.0,
-    # If specified, clip the global norm of gradients by this amount
-    "grad_clip": None,
-    # Target value for KL divergence
-    "kl_target": 0.01,
-    # Whether to rollout "complete_episodes" or "truncate_episodes"
-    "batch_mode": "truncate_episodes",
-    # Which observation filter to apply to the observation
-    "observation_filter": "NoFilter",
-    # Uses the sync samples optimizer instead of the multi-gpu one. This does
-    # not support minibatches.
-    "simple_optimizer": False,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def choose_policy_optimizer(workers, config):
-    if config["simple_optimizer"]:
-        return SyncSamplesOptimizer(
-            workers,
-            num_sgd_iter=config["num_sgd_iter"],
-            train_batch_size=config["train_batch_size"])
-
-    return LocalMultiGPUOptimizer(
-        workers,
-        sgd_batch_size=config["sgd_minibatch_size"],
-        num_sgd_iter=config["num_sgd_iter"],
-        num_gpus=config["num_gpus"],
-        sample_batch_size=config["sample_batch_size"],
-        num_envs_per_worker=config["num_envs_per_worker"],
-        train_batch_size=config["train_batch_size"],
-        standardize_fields=["advantages"],
-        shuffle_sequences=config["shuffle_sequences"])
-
-
-def update_kl(trainer, fetches):
-    if "kl" in fetches:
-        # single-agent
-        trainer.workers.local_worker().for_policy(
-            lambda pi: pi.update_kl(fetches["kl"]))
-    else:
-
-        def update(pi, pi_id):
-            if pi_id in fetches:
-                pi.update_kl(fetches[pi_id]["kl"])
-            else:
-                logger.debug("No data for {}, not updating kl".format(pi_id))
-
-        # multi-agent
-        trainer.workers.local_worker().foreach_trainable_policy(update)
-
-
-def warn_about_bad_reward_scales(trainer, result):
-    # Warn about bad clipping configs
-    if trainer.config["vf_clip_param"] <= 0:
-        rew_scale = float("inf")
-    elif result["policy_reward_mean"]:
-        rew_scale = 0  # punt on handling multiagent case
-    else:
-        rew_scale = round(
-            abs(result["episode_reward_mean"]) /
-            trainer.config["vf_clip_param"], 0)
-    if rew_scale > 200:
-        logger.warning(
-            "The magnitude of your environment rewards are more than "
-            "{}x the scale of `vf_clip_param`. ".format(rew_scale) +
-            "This means that it will take more than "
-            "{} iterations for your value ".format(rew_scale) +
-            "function to converge. If this is not intended, consider "
-            "increasing `vf_clip_param`.")
-
-
-def validate_config(config):
-    if config["entropy_coeff"] < 0:
-        raise DeprecationWarning("entropy_coeff must be >= 0")
-    if config["sgd_minibatch_size"] > config["train_batch_size"]:
-        raise ValueError(
-            "Minibatch size {} must be <= train batch size {}.".format(
-                config["sgd_minibatch_size"], config["train_batch_size"]))
-    if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
-        raise ValueError(
-            "Episode truncation is not supported without a value "
-            "function. Consider setting batch_mode=complete_episodes.")
-    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
-        logger.info(
-            "In multi-agent mode, policies will be optimized sequentially "
-            "by the multi-GPU optimizer. Consider setting "
-            "simple_optimizer=True if this doesn't work for you.")
-    if config["simple_optimizer"]:
-        logger.warning(
-            "Using the simple non-minibatch optimizer. This will greatly "
-            "reduce performance, consider simple_optimizer=False.")
-
-
-PPOTrainer = build_trainer(
-    name="PPO",
-    default_config=DEFAULT_CONFIG,
-    default_policy=PPOTFPolicy,
-    make_policy_optimizer=choose_policy_optimizer,
-    validate_config=validate_config,
-    after_optimizer_step=update_kl,
-    after_train_result=warn_about_bad_reward_scales)
@@ -1,270 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-
-import ray
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tf_policy import LearningRateSchedule, \
-    EntropyCoeffSchedule
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.models.catalog import ModelCatalog
-from ray.rllib.utils.explained_variance import explained_variance
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-logger = logging.getLogger(__name__)
-
-# Frozen logits of the policy that computed the action
-BEHAVIOUR_LOGITS = "behaviour_logits"
-
-
-class PPOLoss(object):
-    def __init__(self,
-                 action_space,
-                 value_targets,
-                 advantages,
-                 actions,
-                 logits,
-                 vf_preds,
-                 curr_action_dist,
-                 value_fn,
-                 cur_kl_coeff,
-                 valid_mask,
-                 entropy_coeff=0,
-                 clip_param=0.1,
-                 vf_clip_param=0.1,
-                 vf_loss_coeff=1.0,
-                 use_gae=True):
-        """Constructs the loss for Proximal Policy Objective.
-
-        Arguments:
-            action_space: Environment observation space specification.
-            value_targets (Placeholder): Placeholder for target values; used
-                for GAE.
-            actions (Placeholder): Placeholder for actions taken
-                from previous model evaluation.
-            advantages (Placeholder): Placeholder for calculated advantages
-                from previous model evaluation.
-            logits (Placeholder): Placeholder for logits output from
-                previous model evaluation.
-            vf_preds (Placeholder): Placeholder for value function output
-                from previous model evaluation.
-            curr_action_dist (ActionDistribution): ActionDistribution
-                of the current model.
-            value_fn (Tensor): Current value function output Tensor.
-            cur_kl_coeff (Variable): Variable holding the current PPO KL
-                coefficient.
-            valid_mask (Tensor): A bool mask of valid input elements (#2992).
-            entropy_coeff (float): Coefficient of the entropy regularizer.
-            clip_param (float): Clip parameter
-            vf_clip_param (float): Clip parameter for the value function
-            vf_loss_coeff (float): Coefficient of the value function loss
-            use_gae (bool): If true, use the Generalized Advantage Estimator.
-        """
-
-        def reduce_mean_valid(t):
-            return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
-
-        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
-        prev_dist = dist_cls(logits)
-        # Make loss functions.
-        logp_ratio = tf.exp(
-            curr_action_dist.logp(actions) - prev_dist.logp(actions))
-        action_kl = prev_dist.kl(curr_action_dist)
-        self.mean_kl = reduce_mean_valid(action_kl)
-
-        curr_entropy = curr_action_dist.entropy()
-        self.mean_entropy = reduce_mean_valid(curr_entropy)
-
-        surrogate_loss = tf.minimum(
-            advantages * logp_ratio,
-            advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
-                                          1 + clip_param))
-        self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)
-
-        if use_gae:
-            vf_loss1 = tf.square(value_fn - value_targets)
-            vf_clipped = vf_preds + tf.clip_by_value(
-                value_fn - vf_preds, -vf_clip_param, vf_clip_param)
-            vf_loss2 = tf.square(vf_clipped - value_targets)
-            vf_loss = tf.maximum(vf_loss1, vf_loss2)
-            self.mean_vf_loss = reduce_mean_valid(vf_loss)
-            loss = reduce_mean_valid(
-                -surrogate_loss + cur_kl_coeff * action_kl +
-                vf_loss_coeff * vf_loss - entropy_coeff * curr_entropy)
-        else:
-            self.mean_vf_loss = tf.constant(0.0)
-            loss = reduce_mean_valid(-surrogate_loss +
-                                     cur_kl_coeff * action_kl -
-                                     entropy_coeff * curr_entropy)
-        self.loss = loss
-
-
-def ppo_surrogate_loss(policy, batch_tensors):
-    if policy.state_in:
-        max_seq_len = tf.reduce_max(policy.seq_lens)
-        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
-        mask = tf.reshape(mask, [-1])
-    else:
-        mask = tf.ones_like(
-            batch_tensors[Postprocessing.ADVANTAGES], dtype=tf.bool)
-
-    policy.loss_obj = PPOLoss(
-        policy.action_space,
-        batch_tensors[Postprocessing.VALUE_TARGETS],
-        batch_tensors[Postprocessing.ADVANTAGES],
-        batch_tensors[SampleBatch.ACTIONS],
-        batch_tensors[BEHAVIOUR_LOGITS],
-        batch_tensors[SampleBatch.VF_PREDS],
-        policy.action_dist,
-        policy.value_function,
-        policy.kl_coeff,
-        mask,
-        entropy_coeff=policy.entropy_coeff,
-        clip_param=policy.config["clip_param"],
-        vf_clip_param=policy.config["vf_clip_param"],
-        vf_loss_coeff=policy.config["vf_loss_coeff"],
-        use_gae=policy.config["use_gae"])
-
-    return policy.loss_obj.loss
-
-
-def kl_and_loss_stats(policy, batch_tensors):
-    return {
-        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
-        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
-        "total_loss": policy.loss_obj.loss,
-        "policy_loss": policy.loss_obj.mean_policy_loss,
-        "vf_loss": policy.loss_obj.mean_vf_loss,
-        "vf_explained_var": explained_variance(
-            batch_tensors[Postprocessing.VALUE_TARGETS],
-            policy.value_function),
-        "kl": policy.loss_obj.mean_kl,
-        "entropy": policy.loss_obj.mean_entropy,
-        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
-    }
-
-
-def vf_preds_and_logits_fetches(policy):
-    """Adds value function and logits outputs to experience batches."""
-    return {
-        SampleBatch.VF_PREDS: policy.value_function,
-        BEHAVIOUR_LOGITS: policy.model_out,
-    }
-
-
-def postprocess_ppo_gae(policy,
-                        sample_batch,
-                        other_agent_batches=None,
-                        episode=None):
-    """Adds the policy logits, VF preds, and advantages to the trajectory."""
-
-    completed = sample_batch["dones"][-1]
-    if completed:
-        last_r = 0.0
-    else:
-        next_state = []
-        for i in range(len(policy.state_in)):
-            next_state.append([sample_batch["state_out_{}".format(i)][-1]])
-        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
-                               sample_batch[SampleBatch.ACTIONS][-1],
-                               sample_batch[SampleBatch.REWARDS][-1],
-                               *next_state)
-    batch = compute_advantages(
-        sample_batch,
-        last_r,
-        policy.config["gamma"],
-        policy.config["lambda"],
-        use_gae=policy.config["use_gae"])
-    return batch
-
-
-def clip_gradients(policy, optimizer, loss):
-    if policy.config["grad_clip"] is not None:
-        policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
-                                            tf.get_variable_scope().name)
-        grads = tf.gradients(loss, policy.var_list)
-        policy.grads, _ = tf.clip_by_global_norm(grads,
-                                                 policy.config["grad_clip"])
-        clipped_grads = list(zip(policy.grads, policy.var_list))
-        return clipped_grads
-    else:
-        return optimizer.compute_gradients(
-            loss, colocate_gradients_with_ops=True)
-
-
-class KLCoeffMixin(object):
-    def __init__(self, config):
-        # KL Coefficient
-        self.kl_coeff_val = config["kl_coeff"]
-        self.kl_target = config["kl_target"]
-        self.kl_coeff = tf.get_variable(
-            initializer=tf.constant_initializer(self.kl_coeff_val),
-            name="kl_coeff",
-            shape=(),
-            trainable=False,
-            dtype=tf.float32)
-
-    def update_kl(self, sampled_kl):
-        if sampled_kl > 2.0 * self.kl_target:
-            self.kl_coeff_val *= 1.5
-        elif sampled_kl < 0.5 * self.kl_target:
-            self.kl_coeff_val *= 0.5
-        self.kl_coeff.load(self.kl_coeff_val, session=self.get_session())
-        return self.kl_coeff_val
-
-
-class ValueNetworkMixin(object):
-    def __init__(self, obs_space, action_space, config):
-        if config["use_gae"]:
-            self.value_function = self.model.value_function()
-        else:
-            self.value_function = tf.zeros(
-                shape=tf.shape(self.get_placeholder(SampleBatch.CUR_OBS))[:1])
-
-    def _value(self, ob, prev_action, prev_reward, *args):
-        feed_dict = {
-            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
-            self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action],
-            self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward],
-            self.seq_lens: [1]
-        }
-        assert len(args) == len(self.state_in), (args, self.state_in)
-        for k, v in zip(self.state_in, args):
-            feed_dict[k] = v
-        vf = self.get_session().run(self.value_function, feed_dict)
-        return vf[0]
-
-
-def setup_config(policy, obs_space, action_space, config):
-    # auto set the model option for layer sharing
-    config["model"]["vf_share_layers"] = config["vf_share_layers"]
-
-
-def setup_mixins(policy, obs_space, action_space, config):
-    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
-    KLCoeffMixin.__init__(policy, config)
-    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
-                                  config["entropy_coeff_schedule"])
-    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
-
-
-PPOTFPolicy = build_tf_policy(
-    name="PPOTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
-    loss_fn=ppo_surrogate_loss,
-    stats_fn=kl_and_loss_stats,
-    extra_action_fetches_fn=vf_preds_and_logits_fetches,
-    postprocess_fn=postprocess_ppo_gae,
-    gradients_fn=clip_gradients,
-    before_init=setup_config,
-    before_loss_init=setup_mixins,
-    mixins=[
-        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
-        ValueNetworkMixin
-    ])
@@ -1,64 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from numpy.testing import assert_allclose
-
-from ray.rllib.models.tf.tf_action_dist import Categorical
-from ray.rllib.agents.ppo.utils import flatten, concatenate
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-# TODO(ekl): move to rllib/models dir
-class DistributionsTest(unittest.TestCase):
-    def testCategorical(self):
-        num_samples = 100000
-        logits = tf.placeholder(tf.float32, shape=(None, 10))
-        z = 8 * (np.random.rand(10) - 0.5)
-        data = np.tile(z, (num_samples, 1))
-        c = Categorical(logits)
-        sample_op = c.sample()
-        sess = tf.Session()
-        sess.run(tf.global_variables_initializer())
-        samples = sess.run(sample_op, feed_dict={logits: data})
-        counts = np.zeros(10)
-        for sample in samples:
-            counts[sample] += 1.0
-        probs = np.exp(z) / np.sum(np.exp(z))
-        self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)
-
-
-class UtilsTest(unittest.TestCase):
-    def testFlatten(self):
-        d = {
-            "s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
-            "a": np.array([[[5], [-5]], [[6], [-6]]])
-        }
-        flat = flatten(d.copy(), start=0, stop=2)
-        assert_allclose(d["s"][0][0][:], flat["s"][0][:])
-        assert_allclose(d["s"][0][1][:], flat["s"][1][:])
-        assert_allclose(d["s"][1][0][:], flat["s"][2][:])
-        assert_allclose(d["s"][1][1][:], flat["s"][3][:])
-        assert_allclose(d["a"][0][0], flat["a"][0])
-        assert_allclose(d["a"][0][1], flat["a"][1])
-        assert_allclose(d["a"][1][0], flat["a"][2])
-        assert_allclose(d["a"][1][1], flat["a"][3])
-
-    def testConcatenate(self):
-        d1 = {"s": np.array([0, 1]), "a": np.array([2, 3])}
-        d2 = {"s": np.array([4, 5]), "a": np.array([6, 7])}
-        d = concatenate([d1, d2])
-        assert_allclose(d["s"], np.array([0, 1, 4, 5]))
-        assert_allclose(d["a"], np.array([2, 3, 6, 7]))
-
-        D = concatenate([d])
-        assert_allclose(D["s"], np.array([0, 1, 4, 5]))
-        assert_allclose(D["a"], np.array([2, 3, 6, 7]))
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
@@ -1,36 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-
-def flatten(weights, start=0, stop=2):
-    """This methods reshapes all values in a dictionary.
-
-    The indices from start to stop will be flattened into a single index.
-
-    Args:
-        weights: A dictionary mapping keys to numpy arrays.
-        start: The starting index.
-        stop: The ending index.
-    """
-    for key, val in weights.items():
-        new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:]
-        weights[key] = val.reshape(new_shape)
-    return weights
-
-
-def concatenate(weights_list):
-    keys = weights_list[0].keys()
-    result = {}
-    for key in keys:
-        result[key] = np.concatenate([l[key] for l in weights_list])
-    return result
-
-
-def shuffle(trajectory):
-    permutation = np.random.permutation(trajectory["actions"].shape[0])
-    for key, val in trajectory.items():
-        trajectory[key] = val[permutation]
-    return trajectory
@@ -1 +0,0 @@
-Code in this package is adapted from https://github.com/oxwhirl/pymarl.
@@ -1,8 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.qmix.qmix import QMixTrainer, DEFAULT_CONFIG
-from ray.rllib.agents.qmix.apex import ApexQMixTrainer
-
-__all__ = ["QMixTrainer", "ApexQMixTrainer", "DEFAULT_CONFIG"]
@@ -1,39 +0,0 @@
-"""Experimental: scalable Ape-X variant of QMIX"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
-from ray.rllib.agents.qmix.qmix import QMixTrainer, \
-    DEFAULT_CONFIG as QMIX_CONFIG
-from ray.rllib.utils import merge_dicts
-
-APEX_QMIX_DEFAULT_CONFIG = merge_dicts(
-    QMIX_CONFIG,  # see also the options in qmix.py, which are also supported
-    {
-        "optimizer": merge_dicts(
-            QMIX_CONFIG["optimizer"],
-            {
-                "max_weight_sync_delay": 400,
-                "num_replay_buffer_shards": 4,
-                "batch_replay": True,  # required for RNN. Disables prio.
-                "debug": False
-            }),
-        "num_gpus": 0,
-        "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
-        "train_batch_size": 512,
-        "sample_batch_size": 50,
-        "target_network_update_freq": 500000,
-        "timesteps_per_iteration": 25000,
-        "per_worker_exploration": True,
-        "min_iter_time_s": 30,
-    },
-)
-
-ApexQMixTrainer = QMixTrainer.with_updates(
-    name="APEX_QMIX",
-    default_config=APEX_QMIX_DEFAULT_CONFIG,
-    **APEX_TRAINER_PROPERTIES)
@@ -1,64 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import torch as th
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-
-
-class VDNMixer(nn.Module):
-    def __init__(self):
-        super(VDNMixer, self).__init__()
-
-    def forward(self, agent_qs, batch):
-        return th.sum(agent_qs, dim=2, keepdim=True)
-
-
-class QMixer(nn.Module):
-    def __init__(self, n_agents, state_shape, mixing_embed_dim):
-        super(QMixer, self).__init__()
-
-        self.n_agents = n_agents
-        self.embed_dim = mixing_embed_dim
-        self.state_dim = int(np.prod(state_shape))
-
-        self.hyper_w_1 = nn.Linear(self.state_dim,
-                                   self.embed_dim * self.n_agents)
-        self.hyper_w_final = nn.Linear(self.state_dim, self.embed_dim)
-
-        # State dependent bias for hidden layer
-        self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim)
-
-        # V(s) instead of a bias for the last layers
-        self.V = nn.Sequential(
-            nn.Linear(self.state_dim, self.embed_dim), nn.ReLU(),
-            nn.Linear(self.embed_dim, 1))
-
-    def forward(self, agent_qs, states):
-        """Forward pass for the mixer.
-
-        Arguments:
-            agent_qs: Tensor of shape [B, T, n_agents, n_actions]
-            states: Tensor of shape [B, T, state_dim]
-        """
-        bs = agent_qs.size(0)
-        states = states.reshape(-1, self.state_dim)
-        agent_qs = agent_qs.view(-1, 1, self.n_agents)
-        # First layer
-        w1 = th.abs(self.hyper_w_1(states))
-        b1 = self.hyper_b_1(states)
-        w1 = w1.view(-1, self.n_agents, self.embed_dim)
-        b1 = b1.view(-1, 1, self.embed_dim)
-        hidden = F.elu(th.bmm(agent_qs, w1) + b1)
-        # Second layer
-        w_final = th.abs(self.hyper_w_final(states))
-        w_final = w_final.view(-1, self.embed_dim, 1)
-        # State-dependent bias
-        v = self.V(states).view(-1, 1, 1)
-        # Compute final output
-        y = th.bmm(hidden, w_final) + v
-        # Reshape and return
-        q_tot = y.view(bs, -1, 1)
-        return q_tot
@@ -1,42 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from torch import nn
-import torch.nn.functional as F
-
-from ray.rllib.models.preprocessors import get_preprocessor
-from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
-from ray.rllib.utils.annotations import override
-
-
-class RNNModel(TorchModelV2, nn.Module):
-    """The default RNN model for QMIX."""
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
-                              model_config, name)
-        nn.Module.__init__(self)
-        self.obs_size = _get_size(obs_space)
-        self.rnn_hidden_dim = model_config["lstm_cell_size"]
-        self.fc1 = nn.Linear(self.obs_size, self.rnn_hidden_dim)
-        self.rnn = nn.GRUCell(self.rnn_hidden_dim, self.rnn_hidden_dim)
-        self.fc2 = nn.Linear(self.rnn_hidden_dim, num_outputs)
-
-    @override(TorchModelV2)
-    def get_initial_state(self):
-        # make hidden states on same device as model
-        return [self.fc1.weight.new(1, self.rnn_hidden_dim).zero_().squeeze(0)]
-
-    @override(TorchModelV2)
-    def forward(self, input_dict, hidden_state, seq_lens):
-        x = F.relu(self.fc1(input_dict["obs_flat"].float()))
-        h_in = hidden_state[0].reshape(-1, self.rnn_hidden_dim)
-        h = self.rnn(x, h_in)
-        q = self.fc2(h)
-        return q, [h]
-
-
-def _get_size(obs_space):
-    return get_preprocessor(obs_space)(obs_space).size
@@ -1,104 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
-from ray.rllib.agents.qmix.qmix_policy import QMixTorchPolicy
-from ray.rllib.optimizers import SyncBatchReplayOptimizer
-
-# yapf: disable
-# __sphinx_doc_begin__
-DEFAULT_CONFIG = with_common_config({
-    # === QMix ===
-    # Mixing network. Either "qmix", "vdn", or None
-    "mixer": "qmix",
-    # Size of the mixing network embedding
-    "mixing_embed_dim": 32,
-    # Whether to use Double_Q learning
-    "double_q": True,
-    # Optimize over complete episodes by default.
-    "batch_mode": "complete_episodes",
-
-    # === Evaluation ===
-    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
-    # The evaluation stats will be reported under the "evaluation" metric key.
-    # Note that evaluation is currently not parallelized, and that for Ape-X
-    # metrics are already only reported for the lowest epsilon workers.
-    "evaluation_interval": None,
-    # Number of episodes to run per evaluation period.
-    "evaluation_num_episodes": 10,
-
-    # === Exploration ===
-    # Max num timesteps for annealing schedules. Exploration is annealed from
-    # 1.0 to exploration_fraction over this number of timesteps scaled by
-    # exploration_fraction
-    "schedule_max_timesteps": 100000,
-    # Number of env steps to optimize for before returning
-    "timesteps_per_iteration": 1000,
-    # Fraction of entire training period over which the exploration rate is
-    # annealed
-    "exploration_fraction": 0.1,
-    # Final value of random action probability
-    "exploration_final_eps": 0.02,
-    # Update the target network every `target_network_update_freq` steps.
-    "target_network_update_freq": 500,
-
-    # === Replay buffer ===
-    # Size of the replay buffer in steps.
-    "buffer_size": 10000,
-
-    # === Optimization ===
-    # Learning rate for adam optimizer
-    "lr": 0.0005,
-    # RMSProp alpha
-    "optim_alpha": 0.99,
-    # RMSProp epsilon
-    "optim_eps": 0.00001,
-    # If not None, clip gradients during optimization at this value
-    "grad_norm_clipping": 10,
-    # How many steps of the model to sample before learning starts.
-    "learning_starts": 1000,
-    # Update the replay buffer with this many samples at once. Note that
-    # this setting applies per-worker if num_workers > 1.
-    "sample_batch_size": 4,
-    # Size of a batched sampled from replay buffer for training. Note that
-    # if async_updates is set, then each worker returns gradients for a
-    # batch of this size.
-    "train_batch_size": 32,
-
-    # === Parallelism ===
-    # Number of workers for collecting samples with. This only makes sense
-    # to increase if your environment is particularly slow to sample, or if
-    # you"re using the Async or Ape-X optimizers.
-    "num_workers": 0,
-    # Whether to use a distribution of epsilons across workers for exploration.
-    "per_worker_exploration": False,
-    # Whether to compute priorities on workers.
-    "worker_side_prioritization": False,
-    # Prevent iterations from going lower than this time span
-    "min_iter_time_s": 1,
-
-    # === Model ===
-    "model": {
-        "lstm_cell_size": 64,
-        "max_seq_len": 999999,
-    },
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-
-def make_sync_batch_optimizer(workers, config):
-    return SyncBatchReplayOptimizer(
-        workers,
-        learning_starts=config["learning_starts"],
-        buffer_size=config["buffer_size"],
-        train_batch_size=config["train_batch_size"])
-
-
-QMixTrainer = GenericOffPolicyTrainer.with_updates(
-    name="QMIX",
-    default_config=DEFAULT_CONFIG,
-    default_policy=QMixTorchPolicy,
-    make_policy_optimizer=make_sync_batch_optimizer)
@@ -1,450 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from gym.spaces import Tuple, Discrete, Dict
-import logging
-import numpy as np
-import torch as th
-import torch.nn as nn
-from torch.optim import RMSprop
-from torch.distributions import Categorical
-
-import ray
-from ray.rllib.agents.qmix.mixers import VDNMixer, QMixer
-from ray.rllib.agents.qmix.model import RNNModel, _get_size
-from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
-from ray.rllib.policy.policy import Policy, TupleActions
-from ray.rllib.policy.rnn_sequencing import chop_into_sequences
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.models.catalog import ModelCatalog
-from ray.rllib.models.model import _unpack_obs
-from ray.rllib.env.constants import GROUP_REWARDS
-from ray.rllib.utils.annotations import override
-
-logger = logging.getLogger(__name__)
-
-
-class QMixLoss(nn.Module):
-    def __init__(self,
-                 model,
-                 target_model,
-                 mixer,
-                 target_mixer,
-                 n_agents,
-                 n_actions,
-                 double_q=True,
-                 gamma=0.99):
-        nn.Module.__init__(self)
-        self.model = model
-        self.target_model = target_model
-        self.mixer = mixer
-        self.target_mixer = target_mixer
-        self.n_agents = n_agents
-        self.n_actions = n_actions
-        self.double_q = double_q
-        self.gamma = gamma
-
-    def forward(self, rewards, actions, terminated, mask, obs, next_obs,
-                action_mask, next_action_mask):
-        """Forward pass of the loss.
-
-        Arguments:
-            rewards: Tensor of shape [B, T, n_agents]
-            actions: Tensor of shape [B, T, n_agents]
-            terminated: Tensor of shape [B, T, n_agents]
-            mask: Tensor of shape [B, T, n_agents]
-            obs: Tensor of shape [B, T, n_agents, obs_size]
-            next_obs: Tensor of shape [B, T, n_agents, obs_size]
-            action_mask: Tensor of shape [B, T, n_agents, n_actions]
-            next_action_mask: Tensor of shape [B, T, n_agents, n_actions]
-        """
-
-        B, T = obs.size(0), obs.size(1)
-
-        # Calculate estimated Q-Values
-        mac_out = []
-        h = [
-            s.expand([B, self.n_agents, -1])
-            for s in self.model.get_initial_state()
-        ]
-        for t in range(T):
-            q, h = _mac(self.model, obs[:, t], h)
-            mac_out.append(q)
-        mac_out = th.stack(mac_out, dim=1)  # Concat over time
-
-        # Pick the Q-Values for the actions taken -> [B * n_agents, T]
-        chosen_action_qvals = th.gather(
-            mac_out, dim=3, index=actions.unsqueeze(3)).squeeze(3)
-
-        # Calculate the Q-Values necessary for the target
-        target_mac_out = []
-        target_h = [
-            s.expand([B, self.n_agents, -1])
-            for s in self.target_model.get_initial_state()
-        ]
-        for t in range(T):
-            target_q, target_h = _mac(self.target_model, next_obs[:, t],
-                                      target_h)
-            target_mac_out.append(target_q)
-        target_mac_out = th.stack(target_mac_out, dim=1)  # Concat across time
-
-        # Mask out unavailable actions
-        ignore_action = (next_action_mask == 0) & (mask == 1).unsqueeze(-1)
-        target_mac_out[ignore_action] = -np.inf
-
-        # Max over target Q-Values
-        if self.double_q:
-            # Get actions that maximise live Q (for double q-learning)
-            ignore_action = (action_mask == 0) & (mask == 1).unsqueeze(-1)
-            mac_out = mac_out.clone()  # issue 4742
-            mac_out[ignore_action] = -np.inf
-            cur_max_actions = mac_out.max(dim=3, keepdim=True)[1]
-            target_max_qvals = th.gather(target_mac_out, 3,
-                                         cur_max_actions).squeeze(3)
-        else:
-            target_max_qvals = target_mac_out.max(dim=3)[0]
-
-        assert target_max_qvals.min().item() != -np.inf, \
-            "target_max_qvals contains a masked action; \
-            there may be a state with no valid actions."
-
-        # Mix
-        if self.mixer is not None:
-            # TODO(ekl) add support for handling global state? This is just
-            # treating the stacked agent obs as the state.
-            chosen_action_qvals = self.mixer(chosen_action_qvals, obs)
-            target_max_qvals = self.target_mixer(target_max_qvals, next_obs)
-
-        # Calculate 1-step Q-Learning targets
-        targets = rewards + self.gamma * (1 - terminated) * target_max_qvals
-
-        # Td-error
-        td_error = (chosen_action_qvals - targets.detach())
-
-        mask = mask.expand_as(td_error)
-
-        # 0-out the targets that came from padded data
-        masked_td_error = td_error * mask
-
-        # Normal L2 loss, take mean over actual data
-        loss = (masked_td_error**2).sum() / mask.sum()
-        return loss, mask, masked_td_error, chosen_action_qvals, targets
-
-
-class QMixTorchPolicy(Policy):
-    """QMix impl. Assumes homogeneous agents for now.
-
-    You must use MultiAgentEnv.with_agent_groups() to group agents
-    together for QMix. This creates the proper Tuple obs/action spaces and
-    populates the '_group_rewards' info field.
-
-    Action masking: to specify an action mask for individual agents, use a
-    dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}.
-    The mask space must be `Box(0, 1, (n_actions,))`.
-    """
-
-    def __init__(self, obs_space, action_space, config):
-        _validate(obs_space, action_space)
-        config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config)
-        self.config = config
-        self.observation_space = obs_space
-        self.action_space = action_space
-        self.n_agents = len(obs_space.original_space.spaces)
-        self.n_actions = action_space.spaces[0].n
-        self.h_size = config["model"]["lstm_cell_size"]
-
-        agent_obs_space = obs_space.original_space.spaces[0]
-        if isinstance(agent_obs_space, Dict):
-            space_keys = set(agent_obs_space.spaces.keys())
-            if space_keys != {"obs", "action_mask"}:
-                raise ValueError(
-                    "Dict obs space for agent must have keyset "
-                    "['obs', 'action_mask'], got {}".format(space_keys))
-            mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape)
-            if mask_shape != (self.n_actions, ):
-                raise ValueError("Action mask shape must be {}, got {}".format(
-                    (self.n_actions, ), mask_shape))
-            self.has_action_mask = True
-            self.obs_size = _get_size(agent_obs_space.spaces["obs"])
-            # The real agent obs space is nested inside the dict
-            agent_obs_space = agent_obs_space.spaces["obs"]
-        else:
-            self.has_action_mask = False
-            self.obs_size = _get_size(agent_obs_space)
-
-        self.model = ModelCatalog.get_model_v2(
-            agent_obs_space,
-            action_space.spaces[0],
-            self.n_actions,
-            config["model"],
-            framework="torch",
-            name="model",
-            default_model=RNNModel)
-
-        self.target_model = ModelCatalog.get_model_v2(
-            agent_obs_space,
-            action_space.spaces[0],
-            self.n_actions,
-            config["model"],
-            framework="torch",
-            name="target_model",
-            default_model=RNNModel)
-
-        # Setup the mixer network.
-        # The global state is just the stacked agent observations for now.
-        self.state_shape = [self.obs_size, self.n_agents]
-        if config["mixer"] is None:
-            self.mixer = None
-            self.target_mixer = None
-        elif config["mixer"] == "qmix":
-            self.mixer = QMixer(self.n_agents, self.state_shape,
-                                config["mixing_embed_dim"])
-            self.target_mixer = QMixer(self.n_agents, self.state_shape,
-                                       config["mixing_embed_dim"])
-        elif config["mixer"] == "vdn":
-            self.mixer = VDNMixer()
-            self.target_mixer = VDNMixer()
-        else:
-            raise ValueError("Unknown mixer type {}".format(config["mixer"]))
-
-        self.cur_epsilon = 1.0
-        self.update_target()  # initial sync
-
-        # Setup optimizer
-        self.params = list(self.model.parameters())
-        if self.mixer:
-            self.params += list(self.mixer.parameters())
-        self.loss = QMixLoss(self.model, self.target_model, self.mixer,
-                             self.target_mixer, self.n_agents, self.n_actions,
-                             self.config["double_q"], self.config["gamma"])
-        self.optimiser = RMSprop(
-            params=self.params,
-            lr=config["lr"],
-            alpha=config["optim_alpha"],
-            eps=config["optim_eps"])
-
-    @override(Policy)
-    def compute_actions(self,
-                        obs_batch,
-                        state_batches=None,
-                        prev_action_batch=None,
-                        prev_reward_batch=None,
-                        info_batch=None,
-                        episodes=None,
-                        **kwargs):
-        obs_batch, action_mask = self._unpack_observation(obs_batch)
-
-        # Compute actions
-        with th.no_grad():
-            q_values, hiddens = _mac(
-                self.model, th.from_numpy(obs_batch),
-                [th.from_numpy(np.array(s)) for s in state_batches])
-            avail = th.from_numpy(action_mask).float()
-            masked_q_values = q_values.clone()
-            masked_q_values[avail == 0.0] = -float("inf")
-            # epsilon-greedy action selector
-            random_numbers = th.rand_like(q_values[:, :, 0])
-            pick_random = (random_numbers < self.cur_epsilon).long()
-            random_actions = Categorical(avail).sample().long()
-            actions = (pick_random * random_actions +
-                       (1 - pick_random) * masked_q_values.max(dim=2)[1])
-            actions = actions.numpy()
-            hiddens = [s.numpy() for s in hiddens]
-
-        return TupleActions(list(actions.transpose([1, 0]))), hiddens, {}
-
-    @override(Policy)
-    def learn_on_batch(self, samples):
-        obs_batch, action_mask = self._unpack_observation(
-            samples[SampleBatch.CUR_OBS])
-        next_obs_batch, next_action_mask = self._unpack_observation(
-            samples[SampleBatch.NEXT_OBS])
-        group_rewards = self._get_group_rewards(samples[SampleBatch.INFOS])
-
-        # These will be padded to shape [B * T, ...]
-        [rew, action_mask, next_action_mask, act, dones, obs, next_obs], \
-            initial_states, seq_lens = \
-            chop_into_sequences(
-                samples[SampleBatch.EPS_ID],
-                samples[SampleBatch.UNROLL_ID],
-                samples[SampleBatch.AGENT_INDEX], [
-                    group_rewards, action_mask, next_action_mask,
-                    samples[SampleBatch.ACTIONS], samples[SampleBatch.DONES],
-                    obs_batch, next_obs_batch
-                ],
-                [samples["state_in_{}".format(k)]
-                 for k in range(len(self.get_initial_state()))],
-                max_seq_len=self.config["model"]["max_seq_len"],
-                dynamic_max=True)
-        B, T = len(seq_lens), max(seq_lens)
-
-        def to_batches(arr):
-            new_shape = [B, T] + list(arr.shape[1:])
-            return th.from_numpy(np.reshape(arr, new_shape))
-
-        rewards = to_batches(rew).float()
-        actions = to_batches(act).long()
-        obs = to_batches(obs).reshape([B, T, self.n_agents,
-                                       self.obs_size]).float()
-        action_mask = to_batches(action_mask)
-        next_obs = to_batches(next_obs).reshape(
-            [B, T, self.n_agents, self.obs_size]).float()
-        next_action_mask = to_batches(next_action_mask)
-
-        # TODO(ekl) this treats group termination as individual termination
-        terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand(
-            B, T, self.n_agents)
-
-        # Create mask for where index is < unpadded sequence length
-        filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) <
-                  np.expand_dims(seq_lens, 1)).astype(np.float32)
-        mask = th.from_numpy(filled).unsqueeze(2).expand(B, T, self.n_agents)
-
-        # Compute loss
-        loss_out, mask, masked_td_error, chosen_action_qvals, targets = \
-            self.loss(rewards, actions, terminated, mask, obs,
-                      next_obs, action_mask, next_action_mask)
-
-        # Optimise
-        self.optimiser.zero_grad()
-        loss_out.backward()
-        grad_norm = th.nn.utils.clip_grad_norm_(
-            self.params, self.config["grad_norm_clipping"])
-        self.optimiser.step()
-
-        mask_elems = mask.sum().item()
-        stats = {
-            "loss": loss_out.item(),
-            "grad_norm": grad_norm
-            if isinstance(grad_norm, float) else grad_norm.item(),
-            "td_error_abs": masked_td_error.abs().sum().item() / mask_elems,
-            "q_taken_mean": (chosen_action_qvals * mask).sum().item() /
-            mask_elems,
-            "target_mean": (targets * mask).sum().item() / mask_elems,
-        }
-        return {LEARNER_STATS_KEY: stats}
-
-    @override(Policy)
-    def get_initial_state(self):
-        return [
-            s.expand([self.n_agents, -1]).numpy()
-            for s in self.model.get_initial_state()
-        ]
-
-    @override(Policy)
-    def get_weights(self):
-        return {"model": self.model.state_dict()}
-
-    @override(Policy)
-    def set_weights(self, weights):
-        self.model.load_state_dict(weights["model"])
-
-    @override(Policy)
-    def get_state(self):
-        return {
-            "model": self.model.state_dict(),
-            "target_model": self.target_model.state_dict(),
-            "mixer": self.mixer.state_dict() if self.mixer else None,
-            "target_mixer": self.target_mixer.state_dict()
-            if self.mixer else None,
-            "cur_epsilon": self.cur_epsilon,
-        }
-
-    @override(Policy)
-    def set_state(self, state):
-        self.model.load_state_dict(state["model"])
-        self.target_model.load_state_dict(state["target_model"])
-        if state["mixer"] is not None:
-            self.mixer.load_state_dict(state["mixer"])
-            self.target_mixer.load_state_dict(state["target_mixer"])
-        self.set_epsilon(state["cur_epsilon"])
-        self.update_target()
-
-    def update_target(self):
-        self.target_model.load_state_dict(self.model.state_dict())
-        if self.mixer is not None:
-            self.target_mixer.load_state_dict(self.mixer.state_dict())
-        logger.debug("Updated target networks")
-
-    def set_epsilon(self, epsilon):
-        self.cur_epsilon = epsilon
-
-    def _get_group_rewards(self, info_batch):
-        group_rewards = np.array([
-            info.get(GROUP_REWARDS, [0.0] * self.n_agents)
-            for info in info_batch
-        ])
-        return group_rewards
-
-    def _unpack_observation(self, obs_batch):
-        """Unpacks the action mask / tuple obs from agent grouping.
-
-        Returns:
-            obs (Tensor): flattened obs tensor of shape [B, n_agents, obs_size]
-            mask (Tensor): action mask, if any
-        """
-        unpacked = _unpack_obs(
-            np.array(obs_batch),
-            self.observation_space.original_space,
-            tensorlib=np)
-        if self.has_action_mask:
-            obs = np.concatenate(
-                [o["obs"] for o in unpacked],
-                axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size])
-            action_mask = np.concatenate(
-                [o["action_mask"] for o in unpacked], axis=1).reshape(
-                    [len(obs_batch), self.n_agents, self.n_actions])
-        else:
-            obs = np.concatenate(
-                unpacked,
-                axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size])
-            action_mask = np.ones(
-                [len(obs_batch), self.n_agents, self.n_actions])
-        return obs, action_mask
-
-
-def _validate(obs_space, action_space):
-    if not hasattr(obs_space, "original_space") or \
-            not isinstance(obs_space.original_space, Tuple):
-        raise ValueError("Obs space must be a Tuple, got {}. Use ".format(
-            obs_space) + "MultiAgentEnv.with_agent_groups() to group related "
-                         "agents for QMix.")
-    if not isinstance(action_space, Tuple):
-        raise ValueError(
-            "Action space must be a Tuple, got {}. ".format(action_space) +
-            "Use MultiAgentEnv.with_agent_groups() to group related "
-            "agents for QMix.")
-    if not isinstance(action_space.spaces[0], Discrete):
-        raise ValueError(
-            "QMix requires a discrete action space, got {}".format(
-                action_space.spaces[0]))
-    if len({str(x) for x in obs_space.original_space.spaces}) > 1:
-        raise ValueError(
-            "Implementation limitation: observations of grouped agents "
-            "must be homogeneous, got {}".format(
-                obs_space.original_space.spaces))
-    if len({str(x) for x in action_space.spaces}) > 1:
-        raise ValueError(
-            "Implementation limitation: action space of grouped agents "
-            "must be homogeneous, got {}".format(action_space.spaces))
-
-
-def _mac(model, obs, h):
-    """Forward pass of the multi-agent controller.
-
-    Arguments:
-        model: TorchModelV2 class
-        obs: Tensor of shape [B, n_agents, obs_size]
-        h: List of tensors of shape [B, n_agents, h_size]
-
-    Returns:
-        q_vals: Tensor of shape [B, n_agents, n_actions]
-        h: Tensor of shape [B, n_agents, h_size]
-    """
-    B, n_agents = obs.size(0), obs.size(1)
-    obs_flat = obs.reshape([B * n_agents, -1])
-    h_flat = [s.reshape([B * n_agents, -1]) for s in h]
-    q_flat, h_flat = model({"obs": obs_flat}, h_flat, None)
-    return q_flat.reshape(
-        [B, n_agents, -1]), [s.reshape([B, n_agents, -1]) for s in h_flat]
@@ -1,152 +0,0 @@
-"""Registry of algorithm names for `rllib train --run=<alg_name>`"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import traceback
-
-from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS
-
-
-def _import_sac():
-    """Import the sac agent package on demand and return SACTrainer."""
-    from ray.rllib.agents import sac as sac_pkg
-    trainer_cls = sac_pkg.SACTrainer
-    return trainer_cls
-
-
-def _import_appo():
-    """Import the ppo agent package on demand and return APPOTrainer."""
-    from ray.rllib.agents import ppo as ppo_pkg
-    trainer_cls = ppo_pkg.APPOTrainer
-    return trainer_cls
-
-
-def _import_qmix():
-    """Import the qmix agent package on demand and return QMixTrainer."""
-    from ray.rllib.agents import qmix as qmix_pkg
-    trainer_cls = qmix_pkg.QMixTrainer
-    return trainer_cls
-
-
-def _import_apex_qmix():
-    """Import the qmix agent package on demand and return ApexQMixTrainer."""
-    from ray.rllib.agents import qmix as qmix_pkg
-    trainer_cls = qmix_pkg.ApexQMixTrainer
-    return trainer_cls
-
-
-def _import_ddpg():
-    """Import the ddpg agent package on demand and return DDPGTrainer."""
-    from ray.rllib.agents import ddpg as ddpg_pkg
-    trainer_cls = ddpg_pkg.DDPGTrainer
-    return trainer_cls
-
-
-def _import_apex_ddpg():
-    """Import the ddpg agent package on demand and return ApexDDPGTrainer."""
-    from ray.rllib.agents import ddpg as ddpg_pkg
-    trainer_cls = ddpg_pkg.ApexDDPGTrainer
-    return trainer_cls
-
-
-def _import_td3():
-    """Import the ddpg agent package on demand and return TD3Trainer."""
-    from ray.rllib.agents import ddpg as ddpg_pkg
-    trainer_cls = ddpg_pkg.TD3Trainer
-    return trainer_cls
-
-
-def _import_ppo():
-    """Import the ppo agent package on demand and return PPOTrainer."""
-    from ray.rllib.agents import ppo as ppo_pkg
-    trainer_cls = ppo_pkg.PPOTrainer
-    return trainer_cls
-
-
-def _import_es():
-    """Import the es agent package on demand and return ESTrainer."""
-    from ray.rllib.agents import es as es_pkg
-    trainer_cls = es_pkg.ESTrainer
-    return trainer_cls
-
-
-def _import_ars():
-    """Import the ars agent package on demand and return ARSTrainer."""
-    from ray.rllib.agents import ars as ars_pkg
-    trainer_cls = ars_pkg.ARSTrainer
-    return trainer_cls
-
-
-def _import_dqn():
-    """Import the dqn agent package on demand and return DQNTrainer."""
-    from ray.rllib.agents import dqn as dqn_pkg
-    trainer_cls = dqn_pkg.DQNTrainer
-    return trainer_cls
-
-
-def _import_simple_q():
-    """Import the dqn agent package on demand and return SimpleQTrainer."""
-    from ray.rllib.agents import dqn as dqn_pkg
-    trainer_cls = dqn_pkg.SimpleQTrainer
-    return trainer_cls
-
-
-def _import_apex():
-    """Import the dqn agent package on demand and return ApexTrainer."""
-    from ray.rllib.agents import dqn as dqn_pkg
-    trainer_cls = dqn_pkg.ApexTrainer
-    return trainer_cls
-
-
-def _import_a3c():
-    """Import the a3c agent package on demand and return A3CTrainer."""
-    from ray.rllib.agents import a3c as a3c_pkg
-    trainer_cls = a3c_pkg.A3CTrainer
-    return trainer_cls
-
-
-def _import_a2c():
-    """Import the a3c agent package on demand and return A2CTrainer."""
-    from ray.rllib.agents import a3c as a3c_pkg
-    trainer_cls = a3c_pkg.A2CTrainer
-    return trainer_cls
-
-
-def _import_pg():
-    """Import the pg agent package on demand and return PGTrainer."""
-    from ray.rllib.agents import pg as pg_pkg
-    trainer_cls = pg_pkg.PGTrainer
-    return trainer_cls
-
-
-def _import_impala():
-    """Import the impala agent package on demand and return ImpalaTrainer."""
-    from ray.rllib.agents import impala as impala_pkg
-    trainer_cls = impala_pkg.ImpalaTrainer
-    return trainer_cls
-
-
-def _import_marwil():
-    """Import the marwil agent package on demand and return MARWILTrainer."""
-    from ray.rllib.agents import marwil as marwil_pkg
-    trainer_cls = marwil_pkg.MARWILTrainer
-    return trainer_cls
-
-
-# Maps each built-in algorithm name to a zero-argument loader function.
-# The values are the `_import_*` functions themselves (not their results),
-# so an agent package is only imported when `_get_agent_class` calls the
-# loader for the requested name.
-ALGORITHMS = {
-    "SAC": _import_sac,
-    "DDPG": _import_ddpg,
-    "APEX_DDPG": _import_apex_ddpg,
-    "TD3": _import_td3,
-    "PPO": _import_ppo,
-    "ES": _import_es,
-    "ARS": _import_ars,
-    "DQN": _import_dqn,
-    "SimpleQ": _import_simple_q,
-    "APEX": _import_apex,
-    "A3C": _import_a3c,
-    "A2C": _import_a2c,
-    "PG": _import_pg,
-    "IMPALA": _import_impala,
-    "QMIX": _import_qmix,
-    "APEX_QMIX": _import_apex_qmix,
-    "APPO": _import_appo,
-    "MARWIL": _import_marwil,
-}
-
-
-def get_agent_class(alg):
-    """Look up the agent class registered under the name `alg`.
-
-    If resolving the class raises ImportError, the formatted traceback is
-    handed to `_agent_import_failed` and its result is returned instead of
-    propagating the error. Any other exception propagates unchanged.
-    """
-    try:
-        agent_cls = _get_agent_class(alg)
-    except ImportError:
-        from ray.rllib.agents.mock import _agent_import_failed
-        import_error_trace = traceback.format_exc()
-        agent_cls = _agent_import_failed(import_error_trace)
-    return agent_cls
-
-
-def _get_agent_class(alg):
-    """Resolve `alg` to an agent class, importing its module on demand.
-
-    Lookup order: built-in ALGORITHMS, CONTRIBUTED_ALGORITHMS, then the
-    special names "script", "__fake", "__sigmoid_fake_data" and
-    "__parameter_tuning".
-
-    Raises:
-        Exception: if `alg` matches none of the above.
-    """
-    # Registered loaders are called here, which triggers the actual import.
-    if alg in ALGORITHMS:
-        loader = ALGORITHMS[alg]
-        return loader()
-    if alg in CONTRIBUTED_ALGORITHMS:
-        loader = CONTRIBUTED_ALGORITHMS[alg]
-        return loader()
-
-    if alg == "script":
-        from ray.tune import script_runner
-        return script_runner.ScriptRunner
-
-    # Mock trainers, addressed by reserved double-underscore names.
-    if alg == "__fake":
-        from ray.rllib.agents.mock import _MockTrainer
-        return _MockTrainer
-    if alg == "__sigmoid_fake_data":
-        from ray.rllib.agents.mock import _SigmoidFakeData
-        return _SigmoidFakeData
-    if alg == "__parameter_tuning":
-        from ray.rllib.agents.mock import _ParameterTuningTrainer
-        return _ParameterTuningTrainer
-
-    raise Exception(("Unknown algorithm {}.").format(alg))
@@ -1 +0,0 @@
-Implementation of Soft Actor-Critic (https://arxiv.org/abs/1812.05905).
@@ -1,13 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.sac.sac import SACTrainer, DEFAULT_CONFIG
-from ray.rllib.utils import renamed_agent
-
-# Alias for SACTrainer under the older "Agent" name.
-# NOTE(review): renamed_agent() lives in ray.rllib.utils and is not shown
-# here; presumably it returns a deprecation wrapper around the class --
-# confirm.
-SACAgent = renamed_agent(SACTrainer)
-
-# Names exported by `from ray.rllib.agents.sac import *`. SACAgent is not
-# listed, so it is only reachable by explicit attribute access / import.
-__all__ = [
-    "SACTrainer",
-    "DEFAULT_CONFIG",
-]
@@ -1,119 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.agents.trainer import with_common_config
-from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
-from ray.rllib.agents.sac.sac_policy import SACTFPolicy
-
-# Names of config keys related to the replay buffer / batch sizes.
-# NOTE(review): this list is not referenced anywhere else in this module;
-# presumably these are the keys shared with the policy optimizer's own
-# config -- confirm against the trainer that consumes it.
-OPTIMIZER_SHARED_CONFIGS = [
-    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
-    "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
-    "train_batch_size", "learning_starts"
-]
-
-# yapf: disable
-# __sphinx_doc_begin__
-# Default SAC settings, passed through with_common_config().
-DEFAULT_CONFIG = with_common_config({
-    # === Model ===
-    # Whether to build a second ("twin") Q network next to the main one.
-    "twin_q": True,
-    # If False, the SAC policy feeds the observation through a NoopModel;
-    # if True, the model catalog picks the preprocessing model.
-    "use_state_preprocessor": False,
-    # NOTE(review): this key is not read by the SAC policy code in this
-    # package; presumably informational only -- confirm.
-    "policy": "GaussianLatentSpacePolicy",
-    # RLlib model options for the Q function
-    "Q_model": {
-        "hidden_activation": "relu",
-        "hidden_layer_sizes": (256, 256),
-    },
-    # RLlib model options for the policy function
-    "policy_model": {
-        "hidden_activation": "relu",
-        "hidden_layer_sizes": (256, 256),
-    },
-
-    # === Learning ===
-    # Update the target by \tau * policy + (1-\tau) * target_policy
-    "tau": 5e-3,
-    # Target entropy lower bound. This is the inverse of reward scale,
-    # and will be optimized automatically. With "auto", the SAC policy
-    # loss uses -prod(action_space.shape) as the target.
-    "target_entropy": "auto",
-    # Disable setting done=True at end of episode.
-    "no_done_at_end": True,
-    # N-step target updates. The SAC policy loss currently asserts that
-    # this is 1.
-    "n_step": 1,
-
-    # === Evaluation ===
-    # The evaluation stats will be reported under the "evaluation" metric key.
-    "evaluation_interval": 1,
-    # Number of episodes to run per evaluation period.
-    "evaluation_num_episodes": 1,
-    # Extra configuration that disables exploration.
-    "evaluation_config": {
-        "exploration_enabled": False,
-    },
-
-    # === Exploration ===
-    # Number of env steps to optimize for before returning
-    "timesteps_per_iteration": 1000,
-    # Fed to the policy's "stochastic" placeholder when computing actions:
-    # True selects sampled actions, False the deterministic ones.
-    "exploration_enabled": True,
-
-    # === Replay buffer ===
-    # Size of the replay buffer. Note that if async_updates is set, then
-    # each worker will have a replay buffer of this size.
-    "buffer_size": int(1e6),
-    # If True prioritized replay buffer will be used.
-    # TODO(hartikainen): Make sure this works or remove the option.
-    "prioritized_replay": False,
-    "prioritized_replay_alpha": 0.6,
-    "prioritized_replay_beta": 0.4,
-    "prioritized_replay_eps": 1e-6,
-    "beta_annealing_fraction": 0.2,
-    "final_prioritized_replay_beta": 0.4,
-    "compress_observations": False,
-
-    # === Optimization ===
-    # Learning rates of the three separate Adam optimizers (policy,
-    # Q function(s), entropy coefficient).
-    "optimization": {
-        "actor_learning_rate": 3e-4,
-        "critic_learning_rate": 3e-4,
-        "entropy_learning_rate": 3e-4,
-    },
-    # If not None, clip gradients during optimization at this value
-    "grad_norm_clipping": None,
-    # How many steps of the model to sample before learning starts.
-    "learning_starts": 1500,
-    # Update the replay buffer with this many samples at once. Note that this
-    # setting applies per-worker if num_workers > 1.
-    "sample_batch_size": 1,
-    # Size of a batch sampled from the replay buffer for training. Note that
-    # if async_updates is set, then each worker returns gradients for a
-    # batch of this size.
-    "train_batch_size": 256,
-    # Update the target network every `target_network_update_freq` steps.
-    "target_network_update_freq": 0,
-
-    # === Parallelism ===
-    # Whether to use a GPU for local optimization.
-    "num_gpus": 0,
-    # Number of workers for collecting samples with. This only makes sense
-    # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    "num_workers": 0,
-    # Number of GPUs to allocate per worker (none if 0).
-    "num_gpus_per_worker": 0,
-    # Number of CPUs to allocate per worker.
-    "num_cpus_per_worker": 1,
-    # Whether to compute priorities on workers.
-    "worker_side_prioritization": False,
-    # Prevent iterations from going lower than this time span
-    "min_iter_time_s": 1,
-
-    # TODO(ekl) these are unused; remove them from sac config
-    "per_worker_exploration": False,
-    "exploration_fraction": 0.1,
-    "schedule_max_timesteps": 100000,
-    "exploration_final_eps": 0.02,
-})
-# __sphinx_doc_end__
-# yapf: enable
-
-# The SAC trainer is the generic off-policy (DQN-style) trainer with only
-# its name, default config and default policy class overridden.
-SACTrainer = GenericOffPolicyTrainer.with_updates(
-    name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)
@@ -1,232 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.utils import try_import_tf, try_import_tfp
-
-# TensorFlow / TensorFlow Probability module handles obtained through the
-# RLlib import helpers. SACModel checks `tfp is None` and raises
-# ImportError in that case, so a missing tfp does not fail at import time.
-tf = try_import_tf()
-tfp = try_import_tfp()
-
-# (min, max) bounds the policy head's log-scale output is clipped to
-# before the action distribution is built.
-SCALE_DIAG_MIN_MAX = (-20, 2)
-
-
-def SquashBijector():
-    """Build and return a new tanh bijector instance.
-
-    The bijector class is declared inside this function because its base
-    class is looked up on `tfp`, which is only touched when this function
-    is called. Inside the body the name `SquashBijector` refers to the
-    nested class, not to this factory function.
-    """
-    # lazy def since it depends on tfp
-    class SquashBijector(tfp.bijectors.Bijector):
-        # Elementwise squashing transform: forward is tanh, inverse atanh.
-        def __init__(self, validate_args=False, name="tanh"):
-            super(SquashBijector, self).__init__(
-                forward_min_event_ndims=0,
-                validate_args=validate_args,
-                name=name)
-
-        def _forward(self, x):
-            return tf.nn.tanh(x)
-
-        def _inverse(self, y):
-            return tf.atanh(y)
-
-        def _forward_log_det_jacobian(self, x):
-            # log|d tanh(x)/dx| = log(1 - tanh(x)^2), rewritten as
-            # 2 * (log 2 - x - softplus(-2x)) so that 1 - tanh(x)^2 is
-            # never evaluated directly.
-            return 2. * (np.log(2.) - x - tf.nn.softplus(-2. * x))
-
-    return SquashBijector()
-
-
-class SACModel(TFModelV2):
-    """Extension of standard TFModel for SAC.
-
-    Data flow:
-        obs -> forward() -> model_out
-        model_out -> get_policy_output() -> pi(s)
-        model_out, actions -> get_q_values() -> Q(s, a)
-        model_out, actions -> get_twin_q_values() -> Q_twin(s, a)
-
-    Besides the output heads, the model owns the `log_alpha` variable and
-    the derived `alpha = exp(log_alpha)` tensor.
-
-    Note that this class by itself is not a valid model unless you
-    implement forward() in a subclass."""
-
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 actor_hidden_activation="relu",
-                 actor_hiddens=(256, 256),
-                 critic_hidden_activation="relu",
-                 critic_hiddens=(256, 256),
-                 twin_q=False):
-        """Initialize variables of this model.
-
-        Extra model kwargs:
-            actor_hidden_activation (str): activation for actor network;
-                looked up by name on `tf.nn`
-            actor_hiddens (list): hidden layers sizes for actor network
-            critic_hidden_activation (str): activation for critic network;
-                looked up by name on `tf.nn`
-            critic_hiddens (list): hidden layers sizes for critic network
-            twin_q (bool): build twin Q networks
-
-        Raises:
-            ImportError: if tensorflow-probability could not be imported.
-
-        Note that the core layers for forward() are not defined here, this
-        only defines the layers for the output heads. Those layers for
-        forward() should be defined in subclasses of SACModel.
-        """
-
-        if tfp is None:
-            raise ImportError("tensorflow-probability package not found")
-
-        super(SACModel, self).__init__(obs_space, action_space, num_outputs,
-                                       model_config, name)
-
-        # Symbolic inputs that the policy and Q heads are built on.
-        self.action_dim = np.product(action_space.shape)
-        self.model_out = tf.keras.layers.Input(
-            shape=(num_outputs, ), name="model_out")
-        self.actions = tf.keras.layers.Input(
-            shape=(self.action_dim, ), name="actions")
-
-        # Policy head: hidden layers followed by a linear layer that emits
-        # the distribution parameters.
-        # NOTE(review): the output width is MultivariateNormalTriL's
-        # params_size(action_dim), yet the result is split into two equal
-        # halves below and named shift / log_scale_diag. Those only line
-        # up if params_size(action_dim) == 2 * action_dim, which looks
-        # true for action_dim == 1 only -- confirm behavior for
-        # multi-dimensional action spaces.
-        shift_and_log_scale_diag = tf.keras.Sequential([
-            tf.keras.layers.Dense(
-                units=hidden,
-                activation=getattr(tf.nn, actor_hidden_activation),
-                name="action_hidden_{}".format(i))
-            for i, hidden in enumerate(actor_hiddens)
-        ] + [
-            tf.keras.layers.Dense(
-                units=tfp.layers.MultivariateNormalTriL.params_size(
-                    self.action_dim),
-                activation=None,
-                name="action_out")
-        ])(self.model_out)
-
-        shift, log_scale_diag = tf.keras.layers.Lambda(
-            lambda shift_and_log_scale_diag: tf.split(
-                shift_and_log_scale_diag,
-                num_or_size_splits=2,
-                axis=-1)
-        )(shift_and_log_scale_diag)
-
-        # Only the second half is clipped (to SCALE_DIAG_MIN_MAX); the two
-        # halves are then concatenated back in the original order.
-        log_scale_diag = tf.keras.layers.Lambda(
-            lambda log_sd: tf.clip_by_value(log_sd, *SCALE_DIAG_MIN_MAX))(
-                log_scale_diag)
-
-        shift_and_log_scale_diag = tf.keras.layers.Concatenate(axis=-1)(
-            [shift, log_scale_diag])
-
-        raw_action_distribution = tfp.layers.MultivariateNormalTriL(
-            self.action_dim)(shift_and_log_scale_diag)
-
-        # The tanh bijector is applied to the raw distribution; see
-        # get_policy_output() for how the result is used.
-        action_distribution = tfp.layers.DistributionLambda(
-            make_distribution_fn=SquashBijector())(raw_action_distribution)
-
-        # TODO(hartikainen): Remove the unnecessary Model call here
-        self.action_distribution_model = tf.keras.Model(
-            self.model_out, action_distribution)
-
-        self.register_variables(self.action_distribution_model.variables)
-
-        def build_q_net(name, observations, actions):
-            # Q head: concat(observations, actions) -> hidden layers ->
-            # a single linear output unit.
-            q_net = tf.keras.Sequential([
-                tf.keras.layers.Concatenate(axis=1),
-            ] + [
-                tf.keras.layers.Dense(
-                    units=units,
-                    activation=getattr(tf.nn, critic_hidden_activation),
-                    name="{}_hidden_{}".format(name, i))
-                for i, units in enumerate(critic_hiddens)
-            ] + [
-                tf.keras.layers.Dense(
-                    units=1, activation=None, name="{}_out".format(name))
-            ])
-
-            # TODO(hartikainen): Remove the unnecessary Model call here
-            q_net = tf.keras.Model([observations, actions],
-                                   q_net([observations, actions]))
-            return q_net
-
-        self.q_net = build_q_net("q", self.model_out, self.actions)
-        self.register_variables(self.q_net.variables)
-
-        # The twin network is optional; self.twin_q_net stays None when it
-        # is disabled.
-        if twin_q:
-            self.twin_q_net = build_q_net("twin_q", self.model_out,
-                                          self.actions)
-            self.register_variables(self.twin_q_net.variables)
-        else:
-            self.twin_q_net = None
-
-        # Entropy coefficient, stored in log space. `alpha` is a tensor
-        # derived from the variable, not a variable itself.
-        self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha")
-        self.alpha = tf.exp(self.log_alpha)
-
-        self.register_variables([self.log_alpha])
-
-    def forward(self, input_dict, state, seq_lens):
-        """This generates the model_out tensor input.
-
-        You must implement this as documented in modelv2.py."""
-        raise NotImplementedError
-
-    def get_policy_output(self, model_out, deterministic=False):
-        """Return actions from the policy head and their log-probabilities.
-
-        Arguments:
-            model_out (Tensor): obs embeddings from the model layers, of shape
-                [BATCH_SIZE, num_outputs].
-            deterministic (bool): if True, return the bijector applied to
-                the mean of the underlying distribution instead of a
-                sample.
-
-        Returns:
-            Tuple (actions, log_pis). `actions` has passed through the
-            squashing bijector and has not been rescaled to the env's
-            action bounds. `log_pis` is the log-probability of the sampled
-            actions, or None when deterministic=True.
-        """
-        action_distribution = self.action_distribution_model(model_out)
-        if deterministic:
-            actions = action_distribution.bijector(
-                action_distribution.distribution.mean())
-            log_pis = None
-        else:
-            actions = action_distribution.sample()
-            log_pis = action_distribution.log_prob(actions)
-
-        return actions, log_pis
-
-    def get_q_values(self, model_out, actions):
-        """Return the Q estimates for the given embeddings and actions.
-
-        This implements Q(s, a).
-
-        Arguments:
-            model_out (Tensor): obs embeddings from the model layers, of shape
-                [BATCH_SIZE, num_outputs].
-            actions (Tensor): action values that correspond with the
-                observations behind `model_out`, of shape
-                [BATCH_SIZE, action_dim].
-
-        Returns:
-            tensor of shape [BATCH_SIZE, 1] (the Q net ends in a single
-            output unit).
-        """
-        return self.q_net([model_out, actions])
-
-    def get_twin_q_values(self, model_out, actions):
-        """Same as get_q_values but using the twin Q net.
-
-        This implements the twin Q(s, a). Only usable when the model was
-        built with twin_q=True; otherwise self.twin_q_net is None.
-
-        Arguments:
-            model_out (Tensor): obs embeddings from the model layers, of shape
-                [BATCH_SIZE, num_outputs].
-            actions (Tensor): action values that correspond with the
-                observations behind `model_out`, of shape
-                [BATCH_SIZE, action_dim].
-
-        Returns:
-            tensor of shape [BATCH_SIZE, 1].
-        """
-        return self.twin_q_net([model_out, actions])
-
-    def policy_variables(self):
-        """Return the list of variables for the policy net."""
-
-        return list(self.action_distribution_model.variables)
-
-    def q_variables(self):
-        """Return the list of variables for Q / twin Q nets."""
-
-        return self.q_net.variables + (self.twin_q_net.variables
-                                       if self.twin_q_net else [])
@@ -1,367 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from gym.spaces import Box
-import numpy as np
-import logging
-
-import ray
-import ray.experimental.tf_utils
-from ray.rllib.agents.sac.sac_model import SACModel
-from ray.rllib.agents.ddpg.noop_model import NoopModel
-from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.models import ModelCatalog
-from ray.rllib.utils.error import UnsupportedSpaceException
-from ray.rllib.utils import try_import_tf, try_import_tfp
-from ray.rllib.utils.tf_ops import minimize_and_clip
-
-# TensorFlow / TensorFlow Probability module handles obtained through the
-# RLlib import helpers, plus this module's logger.
-# NOTE(review): presumably the helpers return None when the package is not
-# installed -- confirm against try_import_tf / try_import_tfp.
-tf = try_import_tf()
-tfp = try_import_tfp()
-logger = logging.getLogger(__name__)
-
-
-def build_sac_model(policy, obs_space, action_space, config):
-    """Create the SAC model and its target copy and attach both to `policy`.
-
-    Side effects on `config`: a custom model forces
-    config["use_state_preprocessor"] = True, and when the state
-    preprocessor is used config["model"]["no_final_linear"] is set to True.
-
-    Raises:
-        UnsupportedSpaceException: if the action space is not a Box, or is
-            a Box with more than one dimension.
-
-    Returns:
-        The (non-target) model, which is also stored as policy.model; the
-        target copy is stored as policy.target_model.
-    """
-    model_config = config["model"]
-    if model_config["custom_model"]:
-        logger.warning(
-            "Setting use_state_preprocessor=True since a custom model "
-            "was specified.")
-        config["use_state_preprocessor"] = True
-
-    if not isinstance(action_space, Box):
-        raise UnsupportedSpaceException(
-            "Action space {} is not supported for SAC.".format(action_space))
-    if len(action_space.shape) > 1:
-        raise UnsupportedSpaceException(
-            "Action space has multiple dimensions {}. Consider reshaping "
-            "this into a single dimension, using a Tuple action space, or "
-            "the multi-agent API.".format(action_space.shape))
-
-    if not config["use_state_preprocessor"]:
-        # Observations are passed through as-is (flattened size).
-        default_model = NoopModel
-        num_outputs = int(np.product(obs_space.shape))
-    else:
-        default_model = None  # catalog decides
-        num_outputs = 256  # arbitrary
-        model_config["no_final_linear"] = True
-
-    def make_model(model_name):
-        # Both networks are built from identical settings; only the name
-        # differs.
-        return ModelCatalog.get_model_v2(
-            obs_space,
-            action_space,
-            num_outputs,
-            model_config,
-            framework="tf",
-            model_interface=SACModel,
-            default_model=default_model,
-            name=model_name,
-            actor_hidden_activation=config["policy_model"][
-                "hidden_activation"],
-            actor_hiddens=config["policy_model"]["hidden_layer_sizes"],
-            critic_hidden_activation=config["Q_model"]["hidden_activation"],
-            critic_hiddens=config["Q_model"]["hidden_layer_sizes"],
-            twin_q=config["twin_q"])
-
-    policy.model = make_model("sac_model")
-    policy.target_model = make_model("target_sac_model")
-
-    return policy.model
-
-
-def postprocess_trajectory(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    """Postprocess a sampled batch with the DQN postprocessing routine.
-
-    `other_agent_batches` and `episode` are accepted for interface
-    compatibility and are not used.
-    """
-    processed_batch = _postprocess_dqn(policy, sample_batch)
-    return processed_batch
-
-
-def exploration_setting_inputs(policy):
-    """Return the extra feed dict used when computing actions.
-
-    It maps the policy's `stochastic` placeholder to the
-    "exploration_enabled" config value.
-    """
-    stochastic_placeholder = policy.stochastic
-    exploration_enabled = policy.config["exploration_enabled"]
-    return {stochastic_placeholder: exploration_enabled}
-
-
-def build_action_output(policy, model, input_dict, obs_space, action_space,
-                        config):
-    """Build the action-sampling ops for the SAC policy.
-
-    Both a sampled and a deterministic action are built; the
-    `policy.stochastic` placeholder selects between them at run time.
-    The selected action tensor is also stored as `policy.output_actions`.
-
-    Returns:
-        Tuple (actions, action_probabilities). Despite its name, the
-        second element holds the log-probabilities (`log_pis`) of the
-        sampled actions, or zeros of the same shape when the deterministic
-        branch is selected.
-    """
-    model_out, _ = model({
-        "obs": input_dict[SampleBatch.CUR_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    def unsquash_actions(actions):
-        # NOTE(review): `actions` comes from get_policy_output(), whose
-        # distribution already applies a tanh bijector; applying a sigmoid
-        # on top looks like a second squashing that would keep the result
-        # away from the action bounds -- confirm this is intended.
-        # Use sigmoid to scale to [0,1], but also double magnitude of input to
-        # emulate behaviour of tanh activation used in SAC and TD3 papers.
-        sigmoid_out = tf.nn.sigmoid(2 * actions)
-        # Rescale to actual env policy scale
-        # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
-        # get same dims)
-        action_range = (action_space.high - action_space.low)[None]
-        low_action = action_space.low[None]
-        unsquashed_actions = action_range * sigmoid_out + low_action
-
-        return unsquashed_actions
-
-    # Note: the heads are taken from policy.model, not from the `model`
-    # argument used for the forward pass above.
-    squashed_stochastic_actions, log_pis = policy.model.get_policy_output(
-        model_out, deterministic=False)
-    stochastic_actions = unsquash_actions(squashed_stochastic_actions)
-    squashed_deterministic_actions, _ = policy.model.get_policy_output(
-        model_out, deterministic=True)
-    deterministic_actions = unsquash_actions(squashed_deterministic_actions)
-
-    actions = tf.cond(policy.stochastic, lambda: stochastic_actions,
-                      lambda: deterministic_actions)
-
-    action_probabilities = tf.cond(policy.stochastic, lambda: log_pis,
-                                   lambda: tf.zeros_like(log_pis))
-    policy.output_actions = actions
-    return actions, action_probabilities
-
-
-def actor_critic_loss(policy, batch_tensors):
-    """Build the combined SAC loss (actor + critic + entropy coefficient).
-
-    Arguments:
-        policy: the policy being built. `policy.model`,
-            `policy.target_model`, `policy.config` and
-            `policy.action_space` are read; `q_t`, `td_error`,
-            `actor_loss`, `critic_loss` and `alpha_loss` are stored on it
-            for the stats and gradients functions.
-        batch_tensors: mapping of batch tensors, indexed by the SampleBatch
-            column names and PRIO_WEIGHTS.
-
-    Returns:
-        Scalar tensor actor_loss + critic_loss + alpha_loss. The three
-        parts are differentiated separately in `gradients()`.
-    """
-    model_out_t, _ = policy.model({
-        "obs": batch_tensors[SampleBatch.CUR_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    model_out_tp1, _ = policy.model({
-        "obs": batch_tensors[SampleBatch.NEXT_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-
-    target_model_out_tp1, _ = policy.target_model({
-        "obs": batch_tensors[SampleBatch.NEXT_OBS],
-        "is_training": policy._get_is_training_placeholder(),
-    }, [], None)
-    # TODO(hartikainen): figure actions and log pis
-    policy_t, log_pis_t = policy.model.get_policy_output(model_out_t)
-    policy_tp1, log_pis_tp1 = policy.model.get_policy_output(model_out_tp1)
-
-    log_alpha = policy.model.log_alpha
-    alpha = policy.model.alpha
-
-    # q network evaluation
-    q_t = policy.model.get_q_values(model_out_t,
-                                    batch_tensors[SampleBatch.ACTIONS])
-    if policy.config["twin_q"]:
-        twin_q_t = policy.model.get_twin_q_values(
-            model_out_t, batch_tensors[SampleBatch.ACTIONS])
-
-    # Q-values of actions sampled from the current policy in the current
-    # state (used by the actor loss)
-    q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t)
-
-    # target q network evaluation
-    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, policy_tp1)
-    if policy.config["twin_q"]:
-        twin_q_tp1 = policy.target_model.get_twin_q_values(
-            target_model_out_tp1, policy_tp1)
-
-    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
-    if policy.config["twin_q"]:
-        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
-        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
-
-    # Soft value of the next state: the entropy term must use the
-    # log-probability of the *next-state* action (policy_tp1) that the
-    # target Q was evaluated on, not the current-state one.
-    q_tp1 -= tf.expand_dims(alpha * log_pis_tp1, 1)
-
-    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
-    q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES],
-                                       tf.float32)) * q_tp1_best
-
-    assert policy.config["n_step"] == 1, "TODO(hartikainen) n_step > 1"
-
-    # compute RHS of bellman equation
-    q_t_selected_target = tf.stop_gradient(
-        batch_tensors[SampleBatch.REWARDS] +
-        policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)
-
-    # compute the error (potentially clipped)
-    if policy.config["twin_q"]:
-        # Keep the two per-network errors apart: each Q network is
-        # regressed on its own squared error. Only the reported td_error
-        # is the sum of both.
-        base_td_error = q_t_selected - q_t_selected_target
-        twin_td_error = twin_q_t_selected - q_t_selected_target
-        td_error = base_td_error + twin_td_error
-        errors = 0.5 * (tf.square(base_td_error) + tf.square(twin_td_error))
-    else:
-        td_error = q_t_selected - q_t_selected_target
-        errors = 0.5 * tf.square(td_error)
-
-    critic_loss = policy.model.custom_loss(
-        tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors)
-
-    # Drop Q's trailing unit axis before combining it with the per-sample
-    # log-probs; otherwise [B] - [B, 1] broadcasts to a [B, B] matrix
-    # (same mean, but needless work).
-    q_t_policy_selected = tf.squeeze(
-        q_t_det_policy, axis=len(q_t_det_policy.shape) - 1)
-    actor_loss = tf.reduce_mean(alpha * log_pis_t - q_t_policy_selected)
-
-    target_entropy = (-np.prod(policy.action_space.shape)
-                      if policy.config["target_entropy"] == "auto" else
-                      policy.config["target_entropy"])
-    alpha_loss = -tf.reduce_mean(
-        log_alpha * tf.stop_gradient(log_pis_t + target_entropy))
-
-    # save for stats function
-    policy.q_t = q_t
-    policy.td_error = td_error
-    policy.actor_loss = actor_loss
-    policy.critic_loss = critic_loss
-    policy.alpha_loss = alpha_loss
-
-    # in a custom apply op we handle the losses separately, but return them
-    # combined in one loss for now
-    return actor_loss + critic_loss + alpha_loss
-
-
-def gradients(policy, optimizer, loss):
-    """Compute separate gradients for the actor, critic and alpha losses.
-
-    Each loss is differentiated only w.r.t. its own variables, using its
-    own optimizer: the policy variables for the actor loss, the Q / twin Q
-    variables for the critic loss, and the `log_alpha` variable for the
-    entropy-coefficient loss. (`model.alpha` is exp(log_alpha), a derived
-    tensor that alpha_loss does not depend on, so it cannot serve as the
-    differentiation target.)
-
-    Arguments:
-        policy: policy holding the losses saved by `actor_critic_loss` and
-            the three optimizers.
-        optimizer: unused; the per-loss optimizers on `policy` are used.
-        loss: unused; the individual losses saved on `policy` are used.
-
-    Returns:
-        List of (gradient, variable) pairs for all three losses, with
-        pairs whose gradient is None removed.
-    """
-    alpha_vars = [policy.model.log_alpha]
-    clip_val = policy.config["grad_norm_clipping"]
-
-    if clip_val is not None:
-        actor_grads_and_vars = minimize_and_clip(
-            policy._actor_optimizer,
-            policy.actor_loss,
-            var_list=policy.model.policy_variables(),
-            clip_val=clip_val)
-        critic_grads_and_vars = minimize_and_clip(
-            policy._critic_optimizer,
-            policy.critic_loss,
-            var_list=policy.model.q_variables(),
-            clip_val=clip_val)
-        alpha_grads_and_vars = minimize_and_clip(
-            policy._alpha_optimizer,
-            policy.alpha_loss,
-            var_list=alpha_vars,
-            clip_val=clip_val)
-    else:
-        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
-            policy.actor_loss, var_list=policy.model.policy_variables())
-        critic_grads_and_vars = policy._critic_optimizer.compute_gradients(
-            policy.critic_loss, var_list=policy.model.q_variables())
-        alpha_grads_and_vars = policy._alpha_optimizer.compute_gradients(
-            policy.alpha_loss, var_list=alpha_vars)
-
-    # save these for later use in build_apply_op
-    policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
-                                    if g is not None]
-    policy._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
-                                     if g is not None]
-    policy._alpha_grads_and_vars = [(g, v) for (g, v) in alpha_grads_and_vars
-                                    if g is not None]
-    grads_and_vars = (
-        policy._actor_grads_and_vars + policy._critic_grads_and_vars +
-        policy._alpha_grads_and_vars)
-    return grads_and_vars
-
-
-def stats(policy, batch_tensors):
-    """Return the training stats tensors reported for SAC.
-
-    All values are reductions of tensors that the loss function stored on
-    `policy`; `batch_tensors` is not used.
-    """
-    q_values = policy.q_t
-
-    td_error_mean = tf.reduce_mean(policy.td_error)
-    actor_loss_mean = tf.reduce_mean(policy.actor_loss)
-    critic_loss_mean = tf.reduce_mean(policy.critic_loss)
-    q_mean = tf.reduce_mean(q_values)
-    q_max = tf.reduce_max(q_values)
-    q_min = tf.reduce_min(q_values)
-
-    return {
-        "td_error": td_error_mean,
-        "actor_loss": actor_loss_mean,
-        "critic_loss": critic_loss_mean,
-        "mean_q": q_mean,
-        "max_q": q_max,
-        "min_q": q_min,
-    }
-
-
-class ExplorationStateMixin(object):
-    """Adds the scalar boolean `stochastic` placeholder to the policy."""
-
-    def __init__(self, obs_space, action_space, config):
-        # The spaces and config are accepted but not needed here.
-        self.stochastic = tf.placeholder(
-            dtype=tf.bool, shape=(), name="stochastic")
-
-    def set_epsilon(self, epsilon):
-        """Accept an epsilon value and ignore it (no-op)."""
-        return None
-
-
-class TargetNetworkMixin(object):
-    """Adds soft / hard target-network synchronization to the policy.
-
-    Expects `self.model`, `self.target_model` and `get_session()` to be
-    available on the object it is mixed into.
-    """
-
-    def __init__(self, config):
-        # update_target_fn will be called periodically to copy Q network to
-        # target Q network
-        self.tau_value = config.get("tau")
-        self.tau = tf.placeholder(tf.float32, (), name="tau")
-        update_target_expr = []
-        model_vars = self.model.trainable_variables()
-        target_model_vars = self.target_model.trainable_variables()
-        assert len(model_vars) == len(target_model_vars), \
-            (model_vars, target_model_vars)
-        # One assign op per variable pair:
-        # target <- tau * online + (1 - tau) * target
-        for var, var_target in zip(model_vars, target_model_vars):
-            update_target_expr.append(
-                var_target.assign(self.tau * var +
-                                  (1.0 - self.tau) * var_target))
-            logger.debug("Update target op {}".format(var_target))
-        self.update_target_expr = tf.group(*update_target_expr)
-
-        # Hard initial update
-        self.update_target(tau=1.0)
-
-    # support both hard and soft sync
-    def update_target(self, tau=None):
-        """Run the target update with mixing factor `tau`.
-
-        Arguments:
-            tau (float): mixing factor fed to the update op; 1.0 copies
-                the online weights, 0.0 leaves the target untouched. When
-                None, the configured "tau" value is used.
-
-        Returns:
-            Whatever the session returns for running the update op.
-        """
-        # Compare against None explicitly so that an explicit tau=0.0 is
-        # honored instead of being replaced by the configured value.
-        if tau is None:
-            tau = self.tau_value
-        return self.get_session().run(
-            self.update_target_expr, feed_dict={self.tau: tau})
-
-
-class ActorCriticOptimizerMixin(object):
-    """Creates the global step and one Adam optimizer per SAC loss."""
-
-    def __init__(self, config):
-        # Global step for counting the number of update operations.
-        self.global_step = tf.train.get_or_create_global_step()
-
-        # Actor, critic and entropy coefficient each get their own
-        # optimizer, with learning rates from config["optimization"].
-        learning_rates = config["optimization"]
-        actor_lr = learning_rates["actor_learning_rate"]
-        self._actor_optimizer = tf.train.AdamOptimizer(learning_rate=actor_lr)
-        critic_lr = learning_rates["critic_learning_rate"]
-        self._critic_optimizer = tf.train.AdamOptimizer(
-            learning_rate=critic_lr)
-        entropy_lr = learning_rates["entropy_learning_rate"]
-        self._alpha_optimizer = tf.train.AdamOptimizer(
-            learning_rate=entropy_lr)
-
-
-class ComputeTDErrorMixin(object):
-    """Adds TD-error evaluation on externally supplied transitions."""
-
-    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
-                         importance_weights):
-        """Evaluate the policy's `td_error` tensor for the given batch.
-
-        Returns zeros shaped like `rew_t` while the loss has not been
-        initialized yet.
-        """
-        if not self.loss_initialized():
-            return np.zeros_like(rew_t)
-
-        placeholder_for = self.get_placeholder
-        # Observations are converted element by element to numpy arrays.
-        feed_dict = {
-            placeholder_for(SampleBatch.CUR_OBS): list(map(np.array, obs_t)),
-            placeholder_for(SampleBatch.ACTIONS): act_t,
-            placeholder_for(SampleBatch.REWARDS): rew_t,
-            placeholder_for(SampleBatch.NEXT_OBS): list(
-                map(np.array, obs_tp1)),
-            placeholder_for(SampleBatch.DONES): done_mask,
-            placeholder_for(PRIO_WEIGHTS): importance_weights,
-        }
-        return self.get_session().run(self.td_error, feed_dict=feed_dict)
-
-
-def setup_early_mixins(policy, obs_space, action_space, config):
-    """Initialize the exploration-state and optimizer mixins on `policy`."""
-    ExplorationStateMixin.__init__(
-        policy,
-        obs_space=obs_space,
-        action_space=action_space,
-        config=config)
-    ActorCriticOptimizerMixin.__init__(policy, config=config)
-
-
-def setup_late_mixins(policy, obs_space, action_space, config):
-    """Initialize the target-network mixin on `policy`.
-
-    `obs_space` and `action_space` are accepted but not used.
-    """
-    TargetNetworkMixin.__init__(policy, config=config)
-
-
-# The SAC TF policy class, assembled from the functions and mixins defined
-# above via the policy template. The default config is looked up lazily
-# through the fully qualified module path rather than imported by name.
-# NOTE(review): `gradients` stores per-loss gradient lists "for later use
-# in build_apply_op", but no apply-gradients hook is passed here; how the
-# returned gradients are applied is therefore up to the template's default
-# -- confirm.
-SACTFPolicy = build_tf_policy(
-    name="SACTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
-    make_model=build_sac_model,
-    postprocess_fn=postprocess_trajectory,
-    extra_action_feed_fn=exploration_setting_inputs,
-    action_sampler_fn=build_action_output,
-    loss_fn=actor_critic_loss,
-    stats_fn=stats,
-    gradients_fn=gradients,
-    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
-    mixins=[
-        TargetNetworkMixin, ExplorationStateMixin, ActorCriticOptimizerMixin,
-        ComputeTDErrorMixin
-    ],
-    before_init=setup_early_mixins,
-    after_init=setup_late_mixins,
-    obs_include_prev_action_reward=False)
@@ -1,797 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from datetime import datetime
-import copy
-import logging
-import os
-import pickle
-import six
-import time
-import tempfile
-
-import ray
-from ray.exceptions import RayError
-from ray.rllib.models import MODEL_DEFAULTS
-from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.evaluation.metrics import collect_metrics
-from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
-from ray.rllib.evaluation.worker_set import WorkerSet
-from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI
-from ray.rllib.utils import FilterManager, deep_update, merge_dicts
-from ray.rllib.utils.memory import ray_get_and_free
-from ray.rllib.utils import try_import_tf
-from ray.tune.registry import ENV_CREATOR, register_env, _global_registry
-from ray.tune.trainable import Trainable
-from ray.tune.trial import ExportFormat
-from ray.tune.resources import Resources
-from ray.tune.logger import UnifiedLogger
-from ray.tune.result import DEFAULT_RESULTS_DIR
-
-# TensorFlow module handle. Code below tests its truthiness before use
-# (presumably falsy when TensorFlow is not installed -- TODO confirm).
-tf = try_import_tf()
-
-# Module-level logger, named after this module.
-logger = logging.getLogger(__name__)
-
-# Max number of times to retry a worker failure. We shouldn't try too many
-# times in a row since that would indicate a persistent cluster issue.
-# Trainer.train() makes at most 1 + MAX_WORKER_FAILURE_RETRIES attempts.
-MAX_WORKER_FAILURE_RETRIES = 3
-
-# yapf: disable
-# __sphinx_doc_begin__
-# Default settings shared by all trainers. with_common_config() overlays
-# algorithm-specific settings on top of a deep copy of this dict.
-COMMON_CONFIG = {
-    # === Debugging ===
-    # Whether to write episode stats and videos to the agent log dir
-    "monitor": False,
-    # Set the ray.rllib.* log level for the agent process and its workers.
-    # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
-    # periodically print out summaries of relevant internal dataflow (this is
-    # also printed out once at startup at the INFO level).
-    "log_level": "INFO",
-    # Callbacks that will be run during various phases of training. These all
-    # take a single "info" dict as an argument. For episode callbacks, custom
-    # metrics can be attached to the episode by updating the episode object's
-    # custom metrics dict (see examples/custom_metrics_and_callbacks.py). You
-    # may also mutate the passed in batch data in your callback.
-    "callbacks": {
-        "on_episode_start": None,     # arg: {"env": .., "episode": ...}
-        "on_episode_step": None,      # arg: {"env": .., "episode": ...}
-        "on_episode_end": None,       # arg: {"env": .., "episode": ...}
-        "on_sample_end": None,        # arg: {"samples": .., "worker": ...}
-        "on_train_result": None,      # arg: {"trainer": ..., "result": ...}
-        "on_postprocess_traj": None,  # arg: {
-                                      #   "agent_id": ..., "episode": ...,
-                                      #   "pre_batch": (before processing),
-                                      #   "post_batch": (after processing),
-                                      #   "all_pre_batches": (other agent ids),
-                                      # }
-    },
-    # Whether to attempt to continue training if a worker crashes.
-    "ignore_worker_failures": False,
-    # Log system resource metrics to results.
-    "log_sys_usage": True,
-
-    # === Policy ===
-    # Arguments to pass to model. See models/catalog.py for a full list of the
-    # available model options.
-    "model": MODEL_DEFAULTS,
-    # Arguments to pass to the policy optimizer. These vary by optimizer.
-    "optimizer": {},
-
-    # === Environment ===
-    # Discount factor of the MDP
-    "gamma": 0.99,
-    # Number of steps after which the episode is forced to terminate. Defaults
-    # to `env.spec.max_episode_steps` (if present) for Gym envs.
-    "horizon": None,
-    # Calculate rewards but don't reset the environment when the horizon is
-    # hit. This allows value estimation and RNN state to span across logical
-    # episodes denoted by horizon. This only has an effect if horizon != inf.
-    "soft_horizon": False,
-    # Don't set 'done' at the end of the episode. Note that you still need to
-    # set this if soft_horizon=True, unless your env is actually running
-    # forever without returning done=True.
-    "no_done_at_end": False,
-    # Arguments to pass to the env creator
-    "env_config": {},
-    # Environment name can also be passed via config
-    "env": None,
-    # Whether to clip rewards prior to experience postprocessing. Setting to
-    # None means clip for Atari only.
-    "clip_rewards": None,
-    # Whether to np.clip() actions to the action space low/high range spec.
-    "clip_actions": True,
-    # Whether to use rllib or deepmind preprocessors by default
-    "preprocessor_pref": "deepmind",
-    # The default learning rate
-    "lr": 0.0001,
-
-    # === Evaluation ===
-    # Evaluate every `evaluation_interval` training iterations.
-    # The evaluation stats will be reported under the "evaluation" metric key.
-    # Note that evaluation is currently not parallelized, and that for Ape-X
-    # metrics are already only reported for the lowest epsilon workers.
-    "evaluation_interval": None,
-    # Number of episodes to run per evaluation period.
-    "evaluation_num_episodes": 10,
-    # Extra arguments to pass to evaluation workers.
-    # Typical usage is to pass extra args to evaluation env creator
-    # and to disable exploration by computing deterministic actions
-    # TODO(kismuz): implement determ. actions and include relevant keys hints
-    "evaluation_config": {},
-
-    # === Resources ===
-    # Number of actors used for parallelism
-    "num_workers": 2,
-    # Number of GPUs to allocate to the driver. Note that not all algorithms
-    # can take advantage of driver GPUs. This can be fractional (e.g., 0.3
-    # GPUs).
-    "num_gpus": 0,
-    # Number of CPUs to allocate per worker.
-    "num_cpus_per_worker": 1,
-    # Number of GPUs to allocate per worker. This can be fractional.
-    "num_gpus_per_worker": 0,
-    # Any custom resources to allocate per worker.
-    "custom_resources_per_worker": {},
-    # Number of CPUs to allocate for the driver. Note: this only takes effect
-    # when running in Tune.
-    "num_cpus_for_driver": 1,
-
-    # === Execution ===
-    # Number of environments to evaluate vectorwise per worker.
-    "num_envs_per_worker": 1,
-    # Default sample batch size (unroll length). Batches of this size are
-    # collected from workers until train_batch_size is met. When using
-    # multiple envs per worker, this is multiplied by num_envs_per_worker.
-    "sample_batch_size": 200,
-    # Training batch size, if applicable. Should be >= sample_batch_size.
-    # Sample batches will be concatenated together to this size for training.
-    "train_batch_size": 200,
-    # Whether to rollout "complete_episodes" or "truncate_episodes"
-    "batch_mode": "truncate_episodes",
-    # Use a background thread for sampling (slightly off-policy, usually not
-    # advisable to turn on unless your env specifically requires it)
-    "sample_async": False,
-    # Element-wise observation filter, either "NoFilter" or "MeanStdFilter"
-    "observation_filter": "NoFilter",
-    # Whether to synchronize the statistics of remote filters.
-    "synchronize_filters": True,
-    # Configure TF for single-process operation by default
-    "tf_session_args": {
-        # note: overridden by `local_tf_session_args`
-        "intra_op_parallelism_threads": 2,
-        "inter_op_parallelism_threads": 2,
-        "gpu_options": {
-            "allow_growth": True,
-        },
-        "log_device_placement": False,
-        "device_count": {
-            "CPU": 1
-        },
-        "allow_soft_placement": True,  # required by PPO multi-gpu
-    },
-    # Override the following tf session args on the local worker
-    "local_tf_session_args": {
-        # Allow a higher level of parallelism by default, but not unlimited
-        # since that can cause crashes with many concurrent drivers.
-        "intra_op_parallelism_threads": 8,
-        "inter_op_parallelism_threads": 8,
-    },
-    # Whether to LZ4 compress individual observations
-    "compress_observations": False,
-    # Wait for metric batches for at most this many seconds. Those that
-    # have not returned in time will be collected in the next iteration.
-    "collect_metrics_timeout": 180,
-    # Smooth metrics over this many episodes.
-    "metrics_smoothing_episodes": 100,
-    # If using num_envs_per_worker > 1, whether to create those new envs in
-    # remote processes instead of in the same worker. This adds overheads, but
-    # can make sense if your envs can take much time to step / reset
-    # (e.g., for StarCraft). Use this cautiously; overheads are significant.
-    "remote_worker_envs": False,
-    # Timeout that remote workers are waiting when polling environments.
-    # 0 (continue when at least one env is ready) is a reasonable default,
-    # but optimal value could be obtained by measuring your environment
-    # step / reset and model inference perf.
-    "remote_env_batch_wait_ms": 0,
-    # Minimum time per iteration
-    "min_iter_time_s": 0,
-    # Minimum env steps to optimize for per train call. This value does
-    # not affect learning, only the length of iterations.
-    "timesteps_per_iteration": 0,
-    # This argument, in conjunction with worker_index, sets the random seed of
-    # each worker, so that identically configured trials will have identical
-    # results. This makes experiments reproducible.
-    "seed": None,
-
-    # === Offline Datasets ===
-    # Specify how to generate experiences:
-    #  - "sampler": generate experiences via online simulation (default)
-    #  - a local directory or file glob expression (e.g., "/tmp/*.json")
-    #  - a list of individual file paths/URIs (e.g., ["/tmp/1.json",
-    #    "s3://bucket/2.json"])
-    #  - a dict with string keys and sampling probabilities as values (e.g.,
-    #    {"sampler": 0.4, "/tmp/*.json": 0.4, "s3://bucket/expert.json": 0.2}).
-    #  - a function that returns a rllib.offline.InputReader
-    "input": "sampler",
-    # Specify how to evaluate the current policy. This only has an effect when
-    # reading offline experiences. Available options:
-    #  - "wis": the weighted step-wise importance sampling estimator.
-    #  - "is": the step-wise importance sampling estimator.
-    #  - "simulation": run the environment in the background, but use
-    #    this data for evaluation only and not for learning.
-    "input_evaluation": ["is", "wis"],
-    # Whether to run postprocess_trajectory() on the trajectory fragments from
-    # offline inputs. Note that postprocessing will be done using the *current*
-    # policy, not the *behaviour* policy, which is typically undesirable for
-    # on-policy algorithms.
-    "postprocess_inputs": False,
-    # If positive, input batches will be shuffled via a sliding window buffer
-    # of this number of batches. Use this if the input data is not in random
-    # enough order. Input is delayed until the shuffle buffer is filled.
-    "shuffle_buffer_size": 0,
-    # Specify where experiences should be saved:
-    #  - None: don't save any experiences
-    #  - "logdir" to save to the agent log dir
-    #  - a path/URI to save to a custom output directory (e.g., "s3://bucket/")
-    #  - a function that returns a rllib.offline.OutputWriter
-    "output": None,
-    # What sample batch columns to LZ4 compress in the output data.
-    "output_compress_columns": ["obs", "new_obs"],
-    # Max output file size before rolling over to a new file.
-    "output_max_file_size": 64 * 1024 * 1024,
-
-    # === Multiagent ===
-    "multiagent": {
-        # Map from policy ids to tuples of (policy_cls, obs_space,
-        # act_space, config). See rollout_worker.py for more info.
-        "policies": {},
-        # Function mapping agent ids to policy ids.
-        "policy_mapping_fn": None,
-        # Optional whitelist of policies to train, or None for all policies.
-        "policies_to_train": None,
-    },
-}
-# __sphinx_doc_end__
-# yapf: enable
-
-
-@DeveloperAPI
-def with_common_config(extra_config):
-    """Overlay `extra_config` on a copy of the shared COMMON_CONFIG defaults.
-
-    Thin wrapper around with_base_config() that uses COMMON_CONFIG as the
-    base dict.
-    """
-    merged = with_base_config(COMMON_CONFIG, extra_config)
-    return merged
-
-
-def with_base_config(base_config, extra_config):
-    """Return a deep copy of `base_config` updated with `extra_config`.
-
-    The update is shallow: a top-level key present in `extra_config`
-    replaces the base value wholesale, including nested dicts. Neither
-    input is modified.
-    """
-    merged = copy.deepcopy(base_config)
-    # Top-level override only; nested dicts are not merged key by key.
-    merged.update(extra_config)
-    return merged
-
-
-@PublicAPI
-class Trainer(Trainable):
-    """A trainer coordinates the optimization of one or more RL policies.
-
-    All RLlib trainers extend this base class, e.g., the A3CTrainer implements
-    the A3C algorithm for single and multi-agent training.
-
-    Trainer objects retain internal model state between calls to train(), so
-    you should create a new trainer instance for each training session.
-
-    Attributes:
-        env_creator (func): Function that creates a new training env.
-        config (obj): Algorithm-specific configuration data.
-        logdir (str): Directory in which training outputs should be placed.
-    """
-
-    # Both settings are forwarded to deep_update() when the user config is
-    # merged with the class default config in _setup().
-    _allow_unknown_configs = False
-    _allow_unknown_subkeys = [
-        "tf_session_args", "env_config", "model", "optimizer", "multiagent",
-        "custom_resources_per_worker", "evaluation_config"
-    ]
-
-    @PublicAPI
-    def __init__(self, config=None, env=None, logger_creator=None):
-        """Initialize an RLLib trainer.
-
-        Args:
-            config (dict): Algorithm-specific configuration data.
-            env (str): Name of the environment to use. Note that this can also
-                be specified as the `env` key in config.
-            logger_creator (func): Function that creates a ray.tune.Logger
-                object. If unspecified, a default logger is created.
-        """
-
-        config = config or {}
-
-        # Vars to synchronize to workers on each train call
-        self.global_vars = {"timestep": 0}
-
-        # Trainers allow env ids to be passed directly to the constructor.
-        self._env_id = self._register_if_needed(env or config.get("env"))
-
-        # Create a default logger creator if no logger_creator is specified
-        if logger_creator is None:
-            timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
-            logdir_prefix = "{}_{}_{}".format(self._name, self._env_id,
-                                              timestr)
-
-            def default_logger_creator(config):
-                """Creates a Unified logger with a default logdir prefix
-                containing the agent name and the env id
-                """
-                if not os.path.exists(DEFAULT_RESULTS_DIR):
-                    os.makedirs(DEFAULT_RESULTS_DIR)
-                logdir = tempfile.mkdtemp(
-                    prefix=logdir_prefix, dir=DEFAULT_RESULTS_DIR)
-                return UnifiedLogger(config, logdir, None)
-
-            logger_creator = default_logger_creator
-
-        Trainable.__init__(self, config, logger_creator)
-
-    @classmethod
-    @override(Trainable)
-    def default_resource_request(cls, config):
-        """Return the Resources needed by the driver and all workers.
-
-        The given config is overlaid on the class default config and
-        validated before the resource counts are read from it.
-        """
-        cf = dict(cls._default_config, **config)
-        Trainer._validate_config(cf)
-        # TODO(ekl): add custom resources here once tune supports them
-        return Resources(
-            cpu=cf["num_cpus_for_driver"],
-            gpu=cf["num_gpus"],
-            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
-            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
-
-    @override(Trainable)
-    @PublicAPI
-    def train(self):
-        """Overrides super.train to synchronize global vars.
-
-        Also retries after worker failures (when `ignore_worker_failures`
-        is set), synchronizes observation filters, and periodically merges
-        evaluation metrics into the result.
-
-        Raises:
-            RuntimeError: if no attempt produced a result.
-        """
-
-        if self._has_policy_optimizer():
-            self.global_vars["timestep"] = self.optimizer.num_steps_sampled
-            self.optimizer.workers.local_worker().set_global_vars(
-                self.global_vars)
-            for w in self.optimizer.workers.remote_workers():
-                w.set_global_vars.remote(self.global_vars)
-            logger.debug("updated global vars: {}".format(self.global_vars))
-
-        result = None
-        for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
-            try:
-                result = Trainable.train(self)
-            except RayError:
-                if self.config["ignore_worker_failures"]:
-                    logger.exception(
-                        "Error in train call, attempting to recover")
-                    self._try_recover()
-                else:
-                    logger.info(
-                        "Worker crashed during call to train(). To attempt to "
-                        "continue training without the failed worker, set "
-                        "`'ignore_worker_failures': True`.")
-                    # Bare raise keeps the original traceback intact.
-                    raise
-            except Exception:
-                time.sleep(0.5)  # allow logs messages to propagate
-                raise
-            else:
-                break
-        if result is None:
-            raise RuntimeError("Failed to recover from worker crash")
-
-        if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
-                and hasattr(self, "workers")
-                and isinstance(self.workers, WorkerSet)):
-            FilterManager.synchronize(
-                self.workers.local_worker().filters,
-                self.workers.remote_workers(),
-                update_remote=self.config["synchronize_filters"])
-            logger.debug("synchronized filters: {}".format(
-                self.workers.local_worker().filters))
-
-        if self._has_policy_optimizer():
-            result["num_healthy_workers"] = len(
-                self.optimizer.workers.remote_workers())
-
-        if self.config["evaluation_interval"]:
-            if self._iteration % self.config["evaluation_interval"] == 0:
-                evaluation_metrics = self._evaluate()
-                assert isinstance(evaluation_metrics, dict), \
-                    "_evaluate() needs to return a dict."
-                result.update(evaluation_metrics)
-
-        return result
-
-    @override(Trainable)
-    def _log_result(self, result):
-        """Invoke the `on_train_result` callback, then log the result."""
-        if self.config["callbacks"].get("on_train_result"):
-            self.config["callbacks"]["on_train_result"]({
-                "trainer": self,
-                "result": result,
-            })
-        # log after the callback is invoked, so that the user has a chance
-        # to mutate the result
-        Trainable._log_result(self, result)
-
-    @override(Trainable)
-    def _setup(self, config):
-        """Resolve the env creator, merge and validate config, run _init().
-
-        When `evaluation_interval` is set, this also creates the evaluation
-        workers and runs one initial evaluation.
-        """
-        env = self._env_id
-        if env:
-            config["env"] = env
-            if _global_registry.contains(ENV_CREATOR, env):
-                self.env_creator = _global_registry.get(ENV_CREATOR, env)
-            else:
-                import gym  # soft dependency
-                self.env_creator = lambda env_config: gym.make(env)
-        else:
-            self.env_creator = lambda env_config: None
-
-        # Merge the supplied config with the class default
-        merged_config = copy.deepcopy(self._default_config)
-        merged_config = deep_update(merged_config, config,
-                                    self._allow_unknown_configs,
-                                    self._allow_unknown_subkeys)
-        self.raw_user_config = config
-        self.config = merged_config
-        Trainer._validate_config(self.config)
-        if self.config.get("log_level"):
-            logging.getLogger("ray.rllib").setLevel(self.config["log_level"])
-
-        def get_scope():
-            if tf:
-                return tf.Graph().as_default()
-            else:
-                # Fake a no-op scope. os.devnull is the portable spelling
-                # of the null device; the `with` below closes the handle.
-                return open(os.devnull)
-
-        with get_scope():
-            self._init(self.config, self.env_creator)
-
-            # Evaluation related
-            if self.config.get("evaluation_interval"):
-                # Update env_config with evaluation settings:
-                extra_config = copy.deepcopy(self.config["evaluation_config"])
-                extra_config.update({
-                    "batch_mode": "complete_episodes",
-                    "batch_steps": 1,
-                })
-                logger.debug(
-                    "using evaluation_config: {}".format(extra_config))
-                self.evaluation_workers = self._make_workers(
-                    self.env_creator,
-                    self._policy,
-                    merge_dicts(self.config, extra_config),
-                    num_workers=0)
-                self.evaluation_metrics = self._evaluate()
-
-    @override(Trainable)
-    def _stop(self):
-        """Stop the workers and the optimizer, if they were created."""
-        if hasattr(self, "workers"):
-            self.workers.stop()
-        if hasattr(self, "optimizer"):
-            self.optimizer.stop()
-
-    @override(Trainable)
-    def _save(self, checkpoint_dir):
-        """Pickle the trainer state into `checkpoint_dir`.
-
-        Returns:
-            Path of the written `checkpoint-<iteration>` file.
-        """
-        checkpoint_path = os.path.join(checkpoint_dir,
-                                       "checkpoint-{}".format(self.iteration))
-        # Context manager guarantees the file is flushed and closed.
-        with open(checkpoint_path, "wb") as f:
-            pickle.dump(self.__getstate__(), f)
-        return checkpoint_path
-
-    @override(Trainable)
-    def _restore(self, checkpoint_path):
-        """Load trainer state from a file written by _save().
-
-        NOTE: unpickling can execute arbitrary code; only restore
-        checkpoints from trusted sources.
-        """
-        with open(checkpoint_path, "rb") as f:
-            extra_data = pickle.load(f)
-        self.__setstate__(extra_data)
-
-    @DeveloperAPI
-    def _make_workers(self, env_creator, policy, config, num_workers):
-        """Create a WorkerSet that logs to this trainer's logdir."""
-        return WorkerSet(
-            env_creator,
-            policy,
-            config,
-            num_workers=num_workers,
-            logdir=self.logdir)
-
-    @DeveloperAPI
-    def _init(self, config, env_creator):
-        """Subclasses should override this for custom initialization."""
-
-        raise NotImplementedError
-
-    @DeveloperAPI
-    def _evaluate(self):
-        """Evaluates current policy under `evaluation_config` settings.
-
-        Note that this default implementation does not do anything beyond
-        merging evaluation_config with the normal trainer config.
-
-        Returns:
-            A dict with the collected metrics under the "evaluation" key.
-
-        Raises:
-            ValueError: if `evaluation_config` is empty.
-        """
-
-        if not self.config["evaluation_config"]:
-            raise ValueError(
-                "No evaluation_config specified. It doesn't make sense "
-                "to enable evaluation without specifying any config "
-                "overrides, since the results will be the "
-                "same as reported during normal policy evaluation.")
-
-        logger.info("Evaluating current policy for {} episodes".format(
-            self.config["evaluation_num_episodes"]))
-        self._before_evaluate()
-        # Copy the training worker's state into the evaluation worker.
-        self.evaluation_workers.local_worker().restore(
-            self.workers.local_worker().save())
-        for _ in range(self.config["evaluation_num_episodes"]):
-            self.evaluation_workers.local_worker().sample()
-
-        metrics = collect_metrics(self.evaluation_workers.local_worker())
-        return {"evaluation": metrics}
-
-    @DeveloperAPI
-    def _before_evaluate(self):
-        """Pre-evaluation callback."""
-        pass
-
-    @PublicAPI
-    def compute_action(self,
-                       observation,
-                       state=None,
-                       prev_action=None,
-                       prev_reward=None,
-                       info=None,
-                       policy_id=DEFAULT_POLICY_ID,
-                       full_fetch=False):
-        """Computes an action for the specified policy.
-
-        Note that you can also access the policy object through
-        self.get_policy(policy_id) and call compute_actions() on it directly.
-
-        Arguments:
-            observation (obj): observation from the environment.
-            state (list): RNN hidden state, if any. If state is not None,
-                          then all of compute_single_action(...) is returned
-                          (computed action, rnn state, logits dictionary).
-                          Otherwise compute_single_action(...)[0] is
-                          returned (computed action).
-            prev_action (obj): previous action value, if any
-            prev_reward (int): previous reward, if any
-            info (dict): info object, if any
-            policy_id (str): policy to query (only applies to multi-agent).
-            full_fetch (bool): whether to return extra action fetch results.
-                This is always set to true if RNN state is specified.
-
-        Returns:
-            Just the computed action if full_fetch=False, or the full output
-            of policy.compute_actions() otherwise.
-        """
-
-        if state is None:
-            state = []
-        preprocessed = self.workers.local_worker().preprocessors[
-            policy_id].transform(observation)
-        filtered_obs = self.workers.local_worker().filters[policy_id](
-            preprocessed, update=False)
-        result = self.get_policy(policy_id).compute_single_action(
-            filtered_obs,
-            state,
-            prev_action,
-            prev_reward,
-            info,
-            clip_actions=self.config["clip_actions"])
-        # A non-empty RNN state always implies the full fetch.
-        if state or full_fetch:
-            return result
-        return result[0]  # backwards compatibility
-
-    @property
-    def _name(self):
-        """Subclasses should override this to declare their name."""
-
-        raise NotImplementedError
-
-    @property
-    def _default_config(self):
-        """Subclasses should override this to declare their default config."""
-
-        raise NotImplementedError
-
-    @PublicAPI
-    def get_policy(self, policy_id=DEFAULT_POLICY_ID):
-        """Return policy for the specified id, or None.
-
-        Arguments:
-            policy_id (str): id of policy to return.
-        """
-
-        return self.workers.local_worker().get_policy(policy_id)
-
-    @PublicAPI
-    def get_weights(self, policies=None):
-        """Return a dictionary of policy ids to weights.
-
-        Arguments:
-            policies (list): Optional list of policies to return weights for,
-                or None for all policies.
-        """
-        return self.workers.local_worker().get_weights(policies)
-
-    @PublicAPI
-    def set_weights(self, weights):
-        """Set policy weights by policy id.
-
-        Arguments:
-            weights (dict): Map of policy ids to weights to set.
-        """
-        self.workers.local_worker().set_weights(weights)
-
-    @DeveloperAPI
-    def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID):
-        """Export policy model with given policy_id to local directory.
-
-        Arguments:
-            export_dir (string): Writable local directory.
-            policy_id (string): Optional policy id to export.
-
-        Example:
-            >>> trainer = MyTrainer()
-            >>> for _ in range(10):
-            >>>     trainer.train()
-            >>> trainer.export_policy_model("/tmp/export_dir")
-        """
-        self.workers.local_worker().export_policy_model(export_dir, policy_id)
-
-    @DeveloperAPI
-    def export_policy_checkpoint(self,
-                                 export_dir,
-                                 filename_prefix="model",
-                                 policy_id=DEFAULT_POLICY_ID):
-        """Export tensorflow policy model checkpoint to local directory.
-
-        Arguments:
-            export_dir (string): Writable local directory.
-            filename_prefix (string): file name prefix of checkpoint files.
-            policy_id (string): Optional policy id to export.
-
-        Example:
-            >>> trainer = MyTrainer()
-            >>> for _ in range(10):
-            >>>     trainer.train()
-            >>> trainer.export_policy_checkpoint("/tmp/export_dir")
-        """
-        self.workers.local_worker().export_policy_checkpoint(
-            export_dir, filename_prefix, policy_id)
-
-    @DeveloperAPI
-    def collect_metrics(self, selected_workers=None):
-        """Collects metrics from the remote workers of this agent.
-
-        This is the same data as returned by a call to train().
-        """
-        return self.optimizer.collect_metrics(
-            self.config["collect_metrics_timeout"],
-            min_history=self.config["metrics_smoothing_episodes"],
-            selected_workers=selected_workers)
-
-    @classmethod
-    def resource_help(cls, config):
-        """Return a help message on adjusting resource requests."""
-        return ("\n\nYou can adjust the resource requests of RLlib agents by "
-                "setting `num_workers`, `num_gpus`, and other configs. See "
-                "the DEFAULT_CONFIG defined by each agent for more info.\n\n"
-                "The config of this agent is: {}".format(config))
-
-    @staticmethod
-    def _validate_config(config):
-        """Check `config` for renamed, deprecated, or malformed settings.
-
-        The legacy `multiagent.policy_graphs` key is moved to
-        `multiagent.policies` in place, with a warning.
-
-        Raises:
-            ValueError: if a deprecated GPU key is present or if
-                `input_evaluation` is not a list.
-        """
-        if "policy_graphs" in config["multiagent"]:
-            logger.warning(
-                "The `policy_graphs` config has been renamed to `policies`.")
-            # Backwards compatibility
-            config["multiagent"]["policies"] = config["multiagent"][
-                "policy_graphs"]
-            del config["multiagent"]["policy_graphs"]
-        if "gpu" in config:
-            raise ValueError(
-                "The `gpu` config is deprecated, please use `num_gpus=0|1` "
-                "instead.")
-        if "gpu_fraction" in config:
-            raise ValueError(
-                "The `gpu_fraction` config is deprecated, please use "
-                "`num_gpus=<fraction>` instead.")
-        if "use_gpu_for_workers" in config:
-            raise ValueError(
-                "The `use_gpu_for_workers` config is deprecated, please use "
-                "`num_gpus_per_worker=1` instead.")
-        # isinstance() also accepts list subclasses, unlike a type() compare.
-        if not isinstance(config["input_evaluation"], list):
-            raise ValueError(
-                "`input_evaluation` must be a list of strings, got {}".format(
-                    config["input_evaluation"]))
-
-    def _try_recover(self):
-        """Try to identify and blacklist any unhealthy workers.
-
-        This method is called after an unexpected remote error is encountered
-        from a worker. It issues check requests to all current workers and
-        blacklists any that respond with error. If no healthy workers remain,
-        an error is raised.
-        """
-
-        if not self._has_policy_optimizer():
-            raise NotImplementedError(
-                "Recovery is not supported for this algorithm")
-
-        logger.info("Health checking all workers...")
-        checks = []
-        for ev in self.optimizer.workers.remote_workers():
-            _, obj_id = ev.sample_with_count.remote()
-            checks.append(obj_id)
-
-        healthy_workers = []
-        for i, obj_id in enumerate(checks):
-            w = self.optimizer.workers.remote_workers()[i]
-            try:
-                ray_get_and_free(obj_id)
-                healthy_workers.append(w)
-                logger.info("Worker {} looks healthy".format(i + 1))
-            except RayError:
-                logger.exception("Blacklisting worker {}".format(i + 1))
-                try:
-                    w.__ray_terminate__.remote()
-                except Exception:
-                    logger.exception("Error terminating unhealthy worker")
-
-        if len(healthy_workers) < 1:
-            raise RuntimeError(
-                "Not enough healthy workers remain to continue.")
-
-        self.optimizer.reset(healthy_workers)
-
-    def _has_policy_optimizer(self):
-        """Return True if `self.optimizer` exists and is a PolicyOptimizer."""
-        return hasattr(self, "optimizer") and isinstance(
-            self.optimizer, PolicyOptimizer)
-
-    @override(Trainable)
-    def _export_model(self, export_formats, export_dir):
-        """Export the default policy in each of the requested formats.
-
-        Returns:
-            Dict mapping each exported format to its output path.
-        """
-        ExportFormat.validate(export_formats)
-        exported = {}
-        if ExportFormat.CHECKPOINT in export_formats:
-            path = os.path.join(export_dir, ExportFormat.CHECKPOINT)
-            self.export_policy_checkpoint(path)
-            exported[ExportFormat.CHECKPOINT] = path
-        if ExportFormat.MODEL in export_formats:
-            path = os.path.join(export_dir, ExportFormat.MODEL)
-            self.export_policy_model(path)
-            exported[ExportFormat.MODEL] = path
-        return exported
-
-    def __getstate__(self):
-        """Return the local worker and optimizer state, where available."""
-        state = {}
-        if hasattr(self, "workers"):
-            state["worker"] = self.workers.local_worker().save()
-        if hasattr(self, "optimizer") and hasattr(self.optimizer, "save"):
-            state["optimizer"] = self.optimizer.save()
-        return state
-
-    def __setstate__(self, state):
-        """Restore worker and optimizer state produced by __getstate__().
-
-        The worker state is applied to the local worker and to every
-        remote worker.
-        """
-        if "worker" in state:
-            self.workers.local_worker().restore(state["worker"])
-            # Put the state in the object store once; all remote workers
-            # are handed the same reference.
-            remote_state = ray.put(state["worker"])
-            for r in self.workers.remote_workers():
-                r.restore.remote(remote_state)
-        if "optimizer" in state:
-            self.optimizer.restore(state["optimizer"])
-
-    def _register_if_needed(self, env_object):
-        """Return an env id for `env_object`, registering classes by name.
-
-        Strings are returned unchanged. A class is registered under its
-        `__name__` with a creator that instantiates it from the env config.
-
-        Raises:
-            ValueError: if `env_object` is neither a string nor a class.
-        """
-        if isinstance(env_object, six.string_types):
-            return env_object
-        elif isinstance(env_object, type):
-            name = env_object.__name__
-            register_env(name, lambda config: env_object(config))
-            return name
-        raise ValueError(
-            "{} is an invalid env specification. ".format(env_object) +
-            "You can specify a custom env as either a class "
-            "(e.g., YourEnvCls) or a registered env id (e.g., \"your_env\").")
@@ -1,174 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
-from ray.rllib.optimizers import SyncSamplesOptimizer
-from ray.rllib.utils import add_mixins
-from ray.rllib.utils.annotations import override, DeveloperAPI
-
-
-@DeveloperAPI
-def build_trainer(name,
-                  default_policy,
-                  default_config=None,
-                  validate_config=None,
-                  get_initial_state=None,
-                  get_policy_class=None,
-                  before_init=None,
-                  make_workers=None,
-                  make_policy_optimizer=None,
-                  after_init=None,
-                  before_train_step=None,
-                  after_optimizer_step=None,
-                  after_train_result=None,
-                  collect_metrics_fn=None,
-                  before_evaluate_fn=None,
-                  mixins=None):
-    """Helper function for defining a custom trainer.
-
-    Functions will be run in this order to initialize the trainer:
-        1. Config setup: validate_config, get_initial_state, get_policy
-        2. Worker setup: before_init, make_workers, make_policy_optimizer
-        3. Post setup: after_init
-
-    Arguments:
-        name (str): name of the trainer (e.g., "PPO")
-        default_policy (cls): the default Policy class to use
-        default_config (dict): the default config dict of the algorithm,
-            otherwises uses the Trainer default config
-        validate_config (func): optional callback that checks a given config
-            for correctness. It may mutate the config as needed.
-        get_initial_state (func): optional function that returns the initial
-            state dict given the trainer instance as an argument. The state
-            dict must be serializable so that it can be checkpointed, and will
-            be available as the `trainer.state` variable.
-        get_policy_class (func): optional callback that takes a config and
-            returns the policy class to override the default with
-        before_init (func): optional function to run at the start of trainer
-            init that takes the trainer instance as argument
-        make_workers (func): override the method that creates rollout workers.
-            This takes in (trainer, env_creator, policy, config) as args.
-        make_policy_optimizer (func): optional function that returns a
-            PolicyOptimizer instance given (WorkerSet, config)
-        after_init (func): optional function to run at the end of trainer init
-            that takes the trainer instance as argument
-        before_train_step (func): optional callback to run before each train()
-            call. It takes the trainer instance as an argument.
-        after_optimizer_step (func): optional callback to run after each
-            step() call to the policy optimizer. It takes the trainer instance
-            and the policy gradient fetches as arguments.
-        after_train_result (func): optional callback to run at the end of each
-            train() call. It takes the trainer instance and result dict as
-            arguments, and may mutate the result dict as needed.
-        collect_metrics_fn (func): override the method used to collect metrics.
-            It takes the trainer instance as argumnt.
-        before_evaluate_fn (func): callback to run before evaluation. This
-            takes the trainer instance as argument.
-        mixins (list): list of any class mixins for the returned trainer class.
-            These mixins will be applied in order and will have higher
-            precedence than the Trainer class
-
-    Returns:
-        a Trainer instance that uses the specified args.
-    """
-
-    original_kwargs = locals().copy()
-    base = add_mixins(Trainer, mixins)
-
-    class trainer_cls(base):
-        _name = name
-        _default_config = default_config or COMMON_CONFIG
-        _policy = default_policy
-
-        def __init__(self, config=None, env=None, logger_creator=None):
-            Trainer.__init__(self, config, env, logger_creator)
-
-        def _init(self, config, env_creator):
-            if validate_config:
-                validate_config(config)
-            if get_initial_state:
-                self.state = get_initial_state(self)
-            else:
-                self.state = {}
-            if get_policy_class is None:
-                policy = default_policy
-            else:
-                policy = get_policy_class(config)
-            if before_init:
-                before_init(self)
-            if make_workers:
-                self.workers = make_workers(self, env_creator, policy, config)
-            else:
-                self.workers = self._make_workers(env_creator, policy, config,
-                                                  self.config["num_workers"])
-            if make_policy_optimizer:
-                self.optimizer = make_policy_optimizer(self.workers, config)
-            else:
-                optimizer_config = dict(
-                    config["optimizer"],
-                    **{"train_batch_size": config["train_batch_size"]})
-                self.optimizer = SyncSamplesOptimizer(self.workers,
-                                                      **optimizer_config)
-            if after_init:
-                after_init(self)
-
-        @override(Trainer)
-        def _train(self):
-            if before_train_step:
-                before_train_step(self)
-            prev_steps = self.optimizer.num_steps_sampled
-
-            start = time.time()
-            while True:
-                fetches = self.optimizer.step()
-                if after_optimizer_step:
-                    after_optimizer_step(self, fetches)
-                if (time.time() - start >= self.config["min_iter_time_s"]
-                        and self.optimizer.num_steps_sampled - prev_steps >=
-                        self.config["timesteps_per_iteration"]):
-                    break
-
-            if collect_metrics_fn:
-                res = collect_metrics_fn(self)
-            else:
-                res = self.collect_metrics()
-            res.update(
-                timesteps_this_iter=self.optimizer.num_steps_sampled -
-                prev_steps,
-                info=res.get("info", {}))
-
-            if after_train_result:
-                after_train_result(self, res)
-            return res
-
-        @override(Trainer)
-        def _before_evaluate(self):
-            if before_evaluate_fn:
-                before_evaluate_fn(self)
-
-        def __getstate__(self):
-            state = Trainer.__getstate__(self)
-            state["trainer_state"] = self.state.copy()
-            return state
-
-        def __setstate__(self, state):
-            Trainer.__setstate__(self, state)
-            self.state = state["trainer_state"].copy()
-
-    @staticmethod
-    def with_updates(**overrides):
-        """Build a copy of this trainer with the specified overrides.
-
-        Arguments:
-            overrides (dict): use this to override any of the arguments
-                originally passed to build_trainer() for this policy.
-        """
-        return build_trainer(**dict(original_kwargs, **overrides))
-
-    trainer_cls.with_updates = with_updates
-    trainer_cls.__name__ = name
-    trainer_cls.__qualname__ = name
-    return trainer_cls
@@ -1,141 +0,0 @@
-{
-    // The version of the config file format.  Do not change, unless
-    // you know what you are doing.
-    "version": 1,
-
-    // The name of the project being benchmarked
-    "project": "rllib",
-
-    // The project's homepage
-    "project_url": "http://rllib.io",
-
-    // The URL or local path of the source code repository for the
-    // project being benchmarked
-    "repo": "../../../",
-
-    // List of branches to benchmark. If not provided, defaults to "master"
-    // (for git) or "default" (for mercurial).
-    "branches": ["master"], // for git
-    // "branches": ["default"],    // for mercurial
-
-    // The DVCS being used.  If not set, it will be automatically
-    // determined from "repo" by looking at the protocol in the URL
-    // (if remote), or by looking for special directories, such as
-    // ".git" (if local).
-    "dvcs": "git",
-
-    // The tool to use to create environments.  May be "conda",
-    // "virtualenv" or other value depending on the plugins in use.
-    // If missing or the empty string, the tool will be automatically
-    // determined by looking for tools on the PATH environment
-    // variable.
-    "environment_type": "conda",
-
-    // timeout in seconds for installing any dependencies in environment
-    // defaults to 10 min
-    //"install_timeout": 600,
-
-    // the base URL to show a commit for the project.
-    "show_commit_url": "http://github.com/ray-project/ray/commit/",
-
-    // The Pythons you'd like to test against.  If not provided, defaults
-    // to the current version of Python used to run `asv`.
-    "pythons": ["3.6"],
-
-    // The matrix of dependencies to test.  Each key is the name of a
-    // package (in PyPI) and the values are version numbers.  An empty
-    // list or empty string indicates to just test against the default
-    // (latest) version. null indicates that the package is to not be
-    // installed. If the package to be tested is only available from
-    // PyPi, and the 'environment_type' is conda, then you can preface
-    // the package name by 'pip+', and the package will be installed via
-    // pip (with all the conda available packages installed first,
-    // followed by the pip installed packages).
-    //
-    // "matrix": {
-    //     "numpy": ["1.6", "1.7"],
-    //     "six": ["", null],        // test with and without six installed
-    //     "pip+emcee": [""],   // emcee is only available for install with pip.
-    // },
-
-    // Combinations of libraries/python versions can be excluded/included
-    // from the set to test. Each entry is a dictionary containing additional
-    // key-value pairs to include/exclude.
-    //
-    // An exclude entry excludes entries where all values match. The
-    // values are regexps that should match the whole string.
-    //
-    // An include entry adds an environment. Only the packages listed
-    // are installed. The 'python' key is required. The exclude rules
-    // do not apply to includes.
-    //
-    // In addition to package names, the following keys are available:
-    //
-    // - python
-    //     Python version, as in the *pythons* variable above.
-    // - environment_type
-    //     Environment type, as above.
-    // - sys_platform
-    //     Platform, as in sys.platform. Possible values for the common
-    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
-    //
-    // "exclude": [
-    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
-    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
-    // ],
-    //
-    // "include": [
-    //     // additional env for python2.7
-    //     {"python": "2.7", "numpy": "1.8"},
-    //     // additional env if run on windows+conda
-    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
-    // ],
-
-    // The directory (relative to the current directory) that benchmarks are
-    // stored in.  If not provided, defaults to "benchmarks"
-    "benchmark_dir": "tuned_examples/regression_tests",
-
-    // The directory (relative to the current directory) to cache the Python
-    // environments in.  If not provided, defaults to "env"
-    // "env_dir": "env",
-
-    // The directory (relative to the current directory) that raw benchmark
-    // results are stored in.  If not provided, defaults to "results".
-    "results_dir": "RLLIB_RESULTS",
-
-    // The directory (relative to the current directory) that the html tree
-    // should be written to.  If not provided, defaults to "html".
-    // "html_dir": "html",
-
-    // The number of characters to retain in the commit hashes.
-    // "hash_length": 8,
-
-    // `asv` will cache wheels of the recent builds in each
-    // environment, making them faster to install next time.  This is
-    // number of builds to keep, per environment.
-    // "wheel_cache_size": 0
-
-    // The commits after which the regression search in `asv publish`
-    // should start looking for regressions. Dictionary whose keys are
-    // regexps matching to benchmark names, and values corresponding to
-    // the commit (exclusive) after which to start looking for
-    // regressions.  The default is to start from the first commit
-    // with results. If the commit is `null`, regression detection is
-    // skipped for the matching benchmark.
-    //
-    // "regressions_first_commits": {
-    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
-    //    "another_benchmark": null,   // Skip regression detection altogether
-    // }
-
-    // The thresholds for relative change in results, after which `asv
-    // publish` starts reporting regressions. Dictionary of the same
-    // form as in ``regressions_first_commits``, with values
-    // indicating the thresholds.  If multiple entries match, the
-    // maximum is taken. If no entry matches, the default is 5%.
-    //
-    // "regressions_thresholds": {
-    //    "some_benchmark": 0.01,     // Threshold of 1%
-    //    "another_benchmark": 0.5,   // Threshold of 50%
-    // }
-}
@@ -1,3 +0,0 @@
-Contributed algorithms, which can be run via ``rllib train --run=contrib/<alg_name>``
-
-See https://ray.readthedocs.io/en/latest/rllib-dev.html for guidelines.
@@ -1,52 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from ray.rllib.agents.trainer import Trainer, with_common_config
-from ray.rllib.utils.annotations import override
-
-
-# yapf: disable
-# __sphinx_doc_begin__
-class RandomAgent(Trainer):
-    """Policy that takes random actions and never learns."""
-
-    _name = "RandomAgent"
-    _default_config = with_common_config({
-        "rollouts_per_iteration": 10,
-    })
-
-    @override(Trainer)
-    def _init(self, config, env_creator):
-        self.env = env_creator(config["env_config"])
-
-    @override(Trainer)
-    def _train(self):
-        rewards = []
-        steps = 0
-        for _ in range(self.config["rollouts_per_iteration"]):
-            obs = self.env.reset()
-            done = False
-            reward = 0.0
-            while not done:
-                action = self.env.action_space.sample()
-                obs, r, done, info = self.env.step(action)
-                reward += r
-                steps += 1
-            rewards.append(reward)
-        return {
-            "episode_reward_mean": np.mean(rewards),
-            "timesteps_this_iter": steps,
-        }
-# __sphinx_doc_end__
-# don't enable yapf after, it's buggy here
-
-
-if __name__ == "__main__":
-    trainer = RandomAgent(
-        env="CartPole-v0", config={"rollouts_per_iteration": 10})
-    result = trainer.train()
-    assert result["episode_reward_mean"] > 10, result
-    print("Test: OK")
@@ -1,15 +0,0 @@
-"""Registry of algorithm names for `rllib train --run=<alg_name>`"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-def _import_random_agent():
-    from ray.rllib.contrib.random_agent.random_agent import RandomAgent
-    return RandomAgent
-
-
-CONTRIBUTED_ALGORITHMS = {
-    "contrib/RandomAgent": _import_random_agent,
-}
@@ -1,11 +0,0 @@
-from ray.rllib.env.base_env import BaseEnv
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.rllib.env.external_env import ExternalEnv
-from ray.rllib.env.serving_env import ServingEnv
-from ray.rllib.env.vector_env import VectorEnv
-from ray.rllib.env.env_context import EnvContext
-
-__all__ = [
-    "BaseEnv", "MultiAgentEnv", "ExternalEnv", "VectorEnv", "ServingEnv",
-    "EnvContext"
-]
@@ -1,291 +0,0 @@
-import numpy as np
-from collections import deque
-import gym
-from gym import spaces
-import cv2
-cv2.ocl.setUseOpenCL(False)
-
-
-def is_atari(env):
-    if (hasattr(env.observation_space, "shape")
-            and env.observation_space.shape is not None
-            and len(env.observation_space.shape) <= 2):
-        return False
-    return hasattr(env, "unwrapped") and hasattr(env.unwrapped, "ale")
-
-
-def get_wrapper_by_cls(env, cls):
-    """Returns the gym env wrapper of the given class, or None."""
-    currentenv = env
-    while True:
-        if isinstance(currentenv, cls):
-            return currentenv
-        elif isinstance(currentenv, gym.Wrapper):
-            currentenv = currentenv.env
-        else:
-            return None
-
-
-class MonitorEnv(gym.Wrapper):
-    def __init__(self, env=None):
-        """Record episodes stats prior to EpisodicLifeEnv, etc."""
-        gym.Wrapper.__init__(self, env)
-        self._current_reward = None
-        self._num_steps = None
-        self._total_steps = None
-        self._episode_rewards = []
-        self._episode_lengths = []
-        self._num_episodes = 0
-        self._num_returned = 0
-
-    def reset(self, **kwargs):
-        obs = self.env.reset(**kwargs)
-
-        if self._total_steps is None:
-            self._total_steps = sum(self._episode_lengths)
-
-        if self._current_reward is not None:
-            self._episode_rewards.append(self._current_reward)
-            self._episode_lengths.append(self._num_steps)
-            self._num_episodes += 1
-
-        self._current_reward = 0
-        self._num_steps = 0
-
-        return obs
-
-    def step(self, action):
-        obs, rew, done, info = self.env.step(action)
-        self._current_reward += rew
-        self._num_steps += 1
-        self._total_steps += 1
-        return (obs, rew, done, info)
-
-    def get_episode_rewards(self):
-        return self._episode_rewards
-
-    def get_episode_lengths(self):
-        return self._episode_lengths
-
-    def get_total_steps(self):
-        return self._total_steps
-
-    def next_episode_results(self):
-        for i in range(self._num_returned, len(self._episode_rewards)):
-            yield (self._episode_rewards[i], self._episode_lengths[i])
-        self._num_returned = len(self._episode_rewards)
-
-
-class NoopResetEnv(gym.Wrapper):
-    def __init__(self, env, noop_max=30):
-        """Sample initial states by taking random number of no-ops on reset.
-        No-op is assumed to be action 0.
-        """
-        gym.Wrapper.__init__(self, env)
-        self.noop_max = noop_max
-        self.override_num_noops = None
-        self.noop_action = 0
-        assert env.unwrapped.get_action_meanings()[0] == "NOOP"
-
-    def reset(self, **kwargs):
-        """ Do no-op action for a number of steps in [1, noop_max]."""
-        self.env.reset(**kwargs)
-        if self.override_num_noops is not None:
-            noops = self.override_num_noops
-        else:
-            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
-        assert noops > 0
-        obs = None
-        for _ in range(noops):
-            obs, _, done, _ = self.env.step(self.noop_action)
-            if done:
-                obs = self.env.reset(**kwargs)
-        return obs
-
-    def step(self, ac):
-        return self.env.step(ac)
-
-
-class ClipRewardEnv(gym.RewardWrapper):
-    def __init__(self, env):
-        gym.RewardWrapper.__init__(self, env)
-
-    def reward(self, reward):
-        """Bin reward to {+1, 0, -1} by its sign."""
-        return np.sign(reward)
-
-
-class FireResetEnv(gym.Wrapper):
-    def __init__(self, env):
-        """Take action on reset.
-
-        For environments that are fixed until firing."""
-        gym.Wrapper.__init__(self, env)
-        assert env.unwrapped.get_action_meanings()[1] == "FIRE"
-        assert len(env.unwrapped.get_action_meanings()) >= 3
-
-    def reset(self, **kwargs):
-        self.env.reset(**kwargs)
-        obs, _, done, _ = self.env.step(1)
-        if done:
-            self.env.reset(**kwargs)
-        obs, _, done, _ = self.env.step(2)
-        if done:
-            self.env.reset(**kwargs)
-        return obs
-
-    def step(self, ac):
-        return self.env.step(ac)
-
-
-class EpisodicLifeEnv(gym.Wrapper):
-    def __init__(self, env):
-        """Make end-of-life == end-of-episode, but only reset on true game over.
-        Done by DeepMind for the DQN and co. since it helps value estimation.
-        """
-        gym.Wrapper.__init__(self, env)
-        self.lives = 0
-        self.was_real_done = True
-
-    def step(self, action):
-        obs, reward, done, info = self.env.step(action)
-        self.was_real_done = done
-        # check current lives, make loss of life terminal,
-        # then update lives to handle bonus lives
-        lives = self.env.unwrapped.ale.lives()
-        if lives < self.lives and lives > 0:
-            # for Qbert sometimes we stay in lives == 0 condtion for a few fr
-            # so its important to keep lives > 0, so that we only reset once
-            # the environment advertises done.
-            done = True
-        self.lives = lives
-        return obs, reward, done, info
-
-    def reset(self, **kwargs):
-        """Reset only when lives are exhausted.
-        This way all states are still reachable even though lives are episodic,
-        and the learner need not know about any of this behind-the-scenes.
-        """
-        if self.was_real_done:
-            obs = self.env.reset(**kwargs)
-        else:
-            # no-op step to advance from terminal/lost life state
-            obs, _, _, _ = self.env.step(0)
-        self.lives = self.env.unwrapped.ale.lives()
-        return obs
-
-
-class MaxAndSkipEnv(gym.Wrapper):
-    def __init__(self, env, skip=4):
-        """Return only every `skip`-th frame"""
-        gym.Wrapper.__init__(self, env)
-        # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = np.zeros(
-            (2, ) + env.observation_space.shape, dtype=np.uint8)
-        self._skip = skip
-
-    def step(self, action):
-        """Repeat action, sum reward, and max over last observations."""
-        total_reward = 0.0
-        done = None
-        for i in range(self._skip):
-            obs, reward, done, info = self.env.step(action)
-            if i == self._skip - 2:
-                self._obs_buffer[0] = obs
-            if i == self._skip - 1:
-                self._obs_buffer[1] = obs
-            total_reward += reward
-            if done:
-                break
-        # Note that the observation on the done=True frame
-        # doesn't matter
-        max_frame = self._obs_buffer.max(axis=0)
-
-        return max_frame, total_reward, done, info
-
-    def reset(self, **kwargs):
-        return self.env.reset(**kwargs)
-
-
-class WarpFrame(gym.ObservationWrapper):
-    def __init__(self, env, dim):
-        """Warp frames to the specified size (dim x dim)."""
-        gym.ObservationWrapper.__init__(self, env)
-        self.width = dim
-        self.height = dim
-        self.observation_space = spaces.Box(
-            low=0,
-            high=255,
-            shape=(self.height, self.width, 1),
-            dtype=np.uint8)
-
-    def observation(self, frame):
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
-        frame = cv2.resize(
-            frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
-        return frame[:, :, None]
-
-
-class FrameStack(gym.Wrapper):
-    def __init__(self, env, k):
-        """Stack k last frames."""
-        gym.Wrapper.__init__(self, env)
-        self.k = k
-        self.frames = deque([], maxlen=k)
-        shp = env.observation_space.shape
-        self.observation_space = spaces.Box(
-            low=0,
-            high=255,
-            shape=(shp[0], shp[1], shp[2] * k),
-            dtype=env.observation_space.dtype)
-
-    def reset(self):
-        ob = self.env.reset()
-        for _ in range(self.k):
-            self.frames.append(ob)
-        return self._get_ob()
-
-    def step(self, action):
-        ob, reward, done, info = self.env.step(action)
-        self.frames.append(ob)
-        return self._get_ob(), reward, done, info
-
-    def _get_ob(self):
-        assert len(self.frames) == self.k
-        return np.concatenate(self.frames, axis=2)
-
-
-class ScaledFloatFrame(gym.ObservationWrapper):
-    def __init__(self, env):
-        gym.ObservationWrapper.__init__(self, env)
-        self.observation_space = gym.spaces.Box(
-            low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
-
-    def observation(self, observation):
-        # careful! This undoes the memory optimization, use
-        # with smaller replay buffers only.
-        return np.array(observation).astype(np.float32) / 255.0
-
-
-def wrap_deepmind(env, dim=84, framestack=True):
-    """Configure environment for DeepMind-style Atari.
-
-    Note that we assume reward clipping is done outside the wrapper.
-
-    Args:
-        dim (int): Dimension to resize observations to (dim x dim).
-        framestack (bool): Whether to framestack observations.
-    """
-    env = MonitorEnv(env)
-    env = NoopResetEnv(env, noop_max=30)
-    if "NoFrameskip" in env.spec.id:
-        env = MaxAndSkipEnv(env, skip=4)
-    env = EpisodicLifeEnv(env)
-    if "FIRE" in env.unwrapped.get_action_meanings():
-        env = FireResetEnv(env)
-    env = WarpFrame(env, dim)
-    # env = ScaledFloatFrame(env)  # TODO: use for dqn?
-    # env = ClipRewardEnv(env)  # reward clipping is handled by policy eval
-    if framestack:
-        env = FrameStack(env, 4)
-    return env
@@ -1,451 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.env.external_env import ExternalEnv
-from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
-from ray.rllib.env.vector_env import VectorEnv
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.rllib.utils.annotations import override, PublicAPI
-
-ASYNC_RESET_RETURN = "async_reset_return"
-
-
-@PublicAPI
-class BaseEnv(object):
-    """The lowest-level env interface used by RLlib for sampling.
-
-    BaseEnv models multiple agents executing asynchronously in multiple
-    environments. A call to poll() returns observations from ready agents
-    keyed by their environment and agent ids, and actions for those agents
-    can be sent back via send_actions().
-
-    All other env types can be adapted to BaseEnv. RLlib handles these
-    conversions internally in RolloutWorker, for example:
-
-        gym.Env => rllib.VectorEnv => rllib.BaseEnv
-        rllib.MultiAgentEnv => rllib.BaseEnv
-        rllib.ExternalEnv => rllib.BaseEnv
-
-    Attributes:
-        action_space (gym.Space): Action space. This must be defined for
-            single-agent envs. Multi-agent envs can set this to None.
-        observation_space (gym.Space): Observation space. This must be defined
-            for single-agent envs. Multi-agent envs can set this to None.
-
-    Examples:
-        >>> env = MyBaseEnv()
-        >>> obs, rewards, dones, infos, off_policy_actions = env.poll()
-        >>> print(obs)
-        {
-            "env_0": {
-                "car_0": [2.4, 1.6],
-                "car_1": [3.4, -3.2],
-            },
-            "env_1": {
-                "car_0": [8.0, 4.1],
-            },
-            "env_2": {
-                "car_0": [2.3, 3.3],
-                "car_1": [1.4, -0.2],
-                "car_3": [1.2, 0.1],
-            },
-        }
-        >>> env.send_actions(
-            actions={
-                "env_0": {
-                    "car_0": 0,
-                    "car_1": 1,
-                }, ...
-            })
-        >>> obs, rewards, dones, infos, off_policy_actions = env.poll()
-        >>> print(obs)
-        {
-            "env_0": {
-                "car_0": [4.1, 1.7],
-                "car_1": [3.2, -4.2],
-            }, ...
-        }
-        >>> print(dones)
-        {
-            "env_0": {
-                "__all__": False,
-                "car_0": False,
-                "car_1": True,
-            }, ...
-        }
-    """
-
-    @staticmethod
-    def to_base_env(env,
-                    make_env=None,
-                    num_envs=1,
-                    remote_envs=False,
-                    remote_env_batch_wait_ms=0):
-        """Wraps any env type as needed to expose the async interface."""
-
-        from ray.rllib.env.remote_vector_env import RemoteVectorEnv
-        if remote_envs and num_envs == 1:
-            raise ValueError(
-                "Remote envs only make sense to use if num_envs > 1 "
-                "(i.e. vectorization is enabled).")
-
-        if not isinstance(env, BaseEnv):
-            if isinstance(env, MultiAgentEnv):
-                if remote_envs:
-                    env = RemoteVectorEnv(
-                        make_env,
-                        num_envs,
-                        multiagent=True,
-                        remote_env_batch_wait_ms=remote_env_batch_wait_ms)
-                else:
-                    env = _MultiAgentEnvToBaseEnv(
-                        make_env=make_env,
-                        existing_envs=[env],
-                        num_envs=num_envs)
-            elif isinstance(env, ExternalMultiAgentEnv):
-                if num_envs != 1:
-                    raise ValueError(
-                        "ExternalMultiAgentEnv does not currently support "
-                        "num_envs > 1.")
-                env = _ExternalEnvToBaseEnv(env, multiagent=True)
-            elif isinstance(env, ExternalEnv):
-                if num_envs != 1:
-                    raise ValueError(
-                        "ExternalEnv does not currently support num_envs > 1.")
-                env = _ExternalEnvToBaseEnv(env)
-            elif isinstance(env, VectorEnv):
-                env = _VectorEnvToBaseEnv(env)
-            else:
-                if remote_envs:
-                    env = RemoteVectorEnv(
-                        make_env,
-                        num_envs,
-                        multiagent=False,
-                        remote_env_batch_wait_ms=remote_env_batch_wait_ms)
-                else:
-                    env = VectorEnv.wrap(
-                        make_env=make_env,
-                        existing_envs=[env],
-                        num_envs=num_envs,
-                        action_space=env.action_space,
-                        observation_space=env.observation_space)
-                    env = _VectorEnvToBaseEnv(env)
-        assert isinstance(env, BaseEnv), env
-        return env
-
-    @PublicAPI
-    def poll(self):
-        """Returns observations from ready agents.
-
-        The returns are two-level dicts mapping from env_id to a dict of
-        agent_id to values. The number of agents and envs can vary over time.
-
-        Returns
-        -------
-            obs (dict): New observations for each ready agent.
-            rewards (dict): Reward values for each ready agent. If the
-                episode is just started, the value will be None.
-            dones (dict): Done values for each ready agent. The special key
-                "__all__" is used to indicate env termination.
-            infos (dict): Info values for each ready agent.
-            off_policy_actions (dict): Agents may take off-policy actions. When
-                that happens, there will be an entry in this dict that contains
-                the taken action. There is no need to send_actions() for agents
-                that have already chosen off-policy actions.
-
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def send_actions(self, action_dict):
-        """Called to send actions back to running agents in this env.
-
-        Actions should be sent for each ready agent that returned observations
-        in the previous poll() call.
-
-        Arguments:
-            action_dict (dict): Actions values keyed by env_id and agent_id.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def try_reset(self, env_id):
-        """Attempt to reset the env with the given id.
-
-        If the environment does not support synchronous reset, None can be
-        returned here.
-
-        Returns:
-            obs (dict|None): Resetted observation or None if not supported.
-        """
-        return None
-
-    @PublicAPI
-    def get_unwrapped(self):
-        """Return a reference to the underlying gym envs, if any.
-
-        Returns:
-            envs (list): Underlying gym envs or [].
-        """
-        return []
-
-    @PublicAPI
-    def stop(self):
-        """Releases all resources used."""
-
-        for env in self.get_unwrapped():
-            if hasattr(env, "close"):
-                env.close()
-
-
-# Fixed agent identifier when there is only the single agent in the env
-_DUMMY_AGENT_ID = "agent0"
-
-
-def _with_dummy_agent_id(env_id_to_values, dummy_id=_DUMMY_AGENT_ID):
-    return {k: {dummy_id: v} for (k, v) in env_id_to_values.items()}
-
-
-class _ExternalEnvToBaseEnv(BaseEnv):
-    """Internal adapter of ExternalEnv to BaseEnv."""
-
-    def __init__(self, external_env, preprocessor=None, multiagent=False):
-        self.external_env = external_env
-        self.prep = preprocessor
-        self.multiagent = multiagent
-        self.action_space = external_env.action_space
-        if preprocessor:
-            self.observation_space = preprocessor.observation_space
-        else:
-            self.observation_space = external_env.observation_space
-        external_env.start()
-
-    @override(BaseEnv)
-    def poll(self):
-        with self.external_env._results_avail_condition:
-            results = self._poll()
-            while len(results[0]) == 0:
-                self.external_env._results_avail_condition.wait()
-                results = self._poll()
-                if not self.external_env.isAlive():
-                    raise Exception("Serving thread has stopped.")
-        limit = self.external_env._max_concurrent_episodes
-        assert len(results[0]) < limit, \
-            ("Too many concurrent episodes, were some leaked? This "
-             "ExternalEnv was created with max_concurrent={}".format(limit))
-        return results
-
-    @override(BaseEnv)
-    def send_actions(self, action_dict):
-        if self.multiagent:
-            for env_id, actions in action_dict.items():
-                self.external_env._episodes[env_id].action_queue.put(actions)
-        else:
-            for env_id, action in action_dict.items():
-                self.external_env._episodes[env_id].action_queue.put(
-                    action[_DUMMY_AGENT_ID])
-
-    def _poll(self):
-        all_obs, all_rewards, all_dones, all_infos = {}, {}, {}, {}
-        off_policy_actions = {}
-        for eid, episode in self.external_env._episodes.copy().items():
-            data = episode.get_data()
-            cur_done = episode.cur_done_dict[
-                "__all__"] if self.multiagent else episode.cur_done
-            if cur_done:
-                del self.external_env._episodes[eid]
-            if data:
-                if self.prep:
-                    all_obs[eid] = self.prep.transform(data["obs"])
-                else:
-                    all_obs[eid] = data["obs"]
-                all_rewards[eid] = data["reward"]
-                all_dones[eid] = data["done"]
-                all_infos[eid] = data["info"]
-                if "off_policy_action" in data:
-                    off_policy_actions[eid] = data["off_policy_action"]
-        if self.multiagent:
-            # ensure a consistent set of keys
-            # rely on all_obs having all possible keys for now
-            for eid, eid_dict in all_obs.items():
-                for agent_id in eid_dict.keys():
-
-                    def fix(d, zero_val):
-                        if agent_id not in d[eid]:
-                            d[eid][agent_id] = zero_val
-
-                    fix(all_rewards, 0.0)
-                    fix(all_dones, False)
-                    fix(all_infos, {})
-            return (all_obs, all_rewards, all_dones, all_infos,
-                    off_policy_actions)
-        else:
-            return _with_dummy_agent_id(all_obs), \
-                _with_dummy_agent_id(all_rewards), \
-                _with_dummy_agent_id(all_dones, "__all__"), \
-                _with_dummy_agent_id(all_infos), \
-                _with_dummy_agent_id(off_policy_actions)
-
-
-class _VectorEnvToBaseEnv(BaseEnv):
-    """Internal adapter of VectorEnv to BaseEnv.
-
-    We assume the caller will always send the full vector of actions in each
-    call to send_actions(), and that they call reset_at() on all completed
-    environments before calling send_actions().
-    """
-
-    def __init__(self, vector_env):
-        self.vector_env = vector_env
-        self.action_space = vector_env.action_space
-        self.observation_space = vector_env.observation_space
-        self.num_envs = vector_env.num_envs
-        self.new_obs = None  # lazily initialized
-        self.cur_rewards = [None for _ in range(self.num_envs)]
-        self.cur_dones = [False for _ in range(self.num_envs)]
-        self.cur_infos = [None for _ in range(self.num_envs)]
-
-    @override(BaseEnv)
-    def poll(self):
-        if self.new_obs is None:
-            self.new_obs = self.vector_env.vector_reset()
-        new_obs = dict(enumerate(self.new_obs))
-        rewards = dict(enumerate(self.cur_rewards))
-        dones = dict(enumerate(self.cur_dones))
-        infos = dict(enumerate(self.cur_infos))
-        self.new_obs = []
-        self.cur_rewards = []
-        self.cur_dones = []
-        self.cur_infos = []
-        return _with_dummy_agent_id(new_obs), \
-            _with_dummy_agent_id(rewards), \
-            _with_dummy_agent_id(dones, "__all__"), \
-            _with_dummy_agent_id(infos), {}
-
-    @override(BaseEnv)
-    def send_actions(self, action_dict):
-        action_vector = [None] * self.num_envs
-        for i in range(self.num_envs):
-            action_vector[i] = action_dict[i][_DUMMY_AGENT_ID]
-        self.new_obs, self.cur_rewards, self.cur_dones, self.cur_infos = \
-            self.vector_env.vector_step(action_vector)
-
-    @override(BaseEnv)
-    def try_reset(self, env_id):
-        return {_DUMMY_AGENT_ID: self.vector_env.reset_at(env_id)}
-
-    @override(BaseEnv)
-    def get_unwrapped(self):
-        return self.vector_env.get_unwrapped()
-
-
-class _MultiAgentEnvToBaseEnv(BaseEnv):
-    """Internal adapter of MultiAgentEnv to BaseEnv.
-
-    This also supports vectorization if num_envs > 1.
-    """
-
-    def __init__(self, make_env, existing_envs, num_envs):
-        """Wrap existing multi-agent envs.
-
-        Arguments:
-            make_env (func|None): Factory that produces a new multiagent env.
-                Must be defined if the number of existing envs is less than
-                num_envs.
-            existing_envs (list): List of existing multiagent envs.
-            num_envs (int): Desired num multiagent envs to keep total.
-        """
-        self.make_env = make_env
-        self.envs = existing_envs
-        self.num_envs = num_envs
-        self.dones = set()
-        while len(self.envs) < self.num_envs:
-            self.envs.append(self.make_env(len(self.envs)))
-        for env in self.envs:
-            assert isinstance(env, MultiAgentEnv)
-        self.env_states = [_MultiAgentEnvState(env) for env in self.envs]
-
-    @override(BaseEnv)
-    def poll(self):
-        obs, rewards, dones, infos = {}, {}, {}, {}
-        for i, env_state in enumerate(self.env_states):
-            obs[i], rewards[i], dones[i], infos[i] = env_state.poll()
-        return obs, rewards, dones, infos, {}
-
-    @override(BaseEnv)
-    def send_actions(self, action_dict):
-        for env_id, agent_dict in action_dict.items():
-            if env_id in self.dones:
-                raise ValueError("Env {} is already done".format(env_id))
-            env = self.envs[env_id]
-            obs, rewards, dones, infos = env.step(agent_dict)
-            assert isinstance(obs, dict), "Not a multi-agent obs"
-            assert isinstance(rewards, dict), "Not a multi-agent reward"
-            assert isinstance(dones, dict), "Not a multi-agent return"
-            assert isinstance(infos, dict), "Not a multi-agent info"
-            if set(obs.keys()) != set(rewards.keys()):
-                raise ValueError(
-                    "Key set for obs and rewards must be the same: "
-                    "{} vs {}".format(obs.keys(), rewards.keys()))
-            if set(infos).difference(set(obs)):
-                raise ValueError("Key set for infos must be a subset of obs: "
-                                 "{} vs {}".format(infos.keys(), obs.keys()))
-            if "__all__" not in dones:
-                raise ValueError(
-                    "In multi-agent environments, '__all__': True|False must "
-                    "be included in the 'done' dict: got {}.".format(dones))
-            if dones["__all__"]:
-                self.dones.add(env_id)
-            self.env_states[env_id].observe(obs, rewards, dones, infos)
-
-    @override(BaseEnv)
-    def try_reset(self, env_id):
-        obs = self.env_states[env_id].reset()
-        assert isinstance(obs, dict), "Not a multi-agent obs"
-        if obs is not None and env_id in self.dones:
-            self.dones.remove(env_id)
-        return obs
-
-    @override(BaseEnv)
-    def get_unwrapped(self):
-        return [state.env for state in self.env_states]
-
-
-class _MultiAgentEnvState(object):
-    def __init__(self, env):
-        assert isinstance(env, MultiAgentEnv)
-        self.env = env
-        self.initialized = False
-
-    def poll(self):
-        if not self.initialized:
-            self.reset()
-            self.initialized = True
-        obs, rew, dones, info = (self.last_obs, self.last_rewards,
-                                 self.last_dones, self.last_infos)
-        self.last_obs = {}
-        self.last_rewards = {}
-        self.last_dones = {"__all__": False}
-        self.last_infos = {}
-        return obs, rew, dones, info
-
-    def observe(self, obs, rewards, dones, infos):
-        self.last_obs = obs
-        self.last_rewards = rewards
-        self.last_dones = dones
-        self.last_infos = infos
-
-    def reset(self):
-        self.last_obs = self.env.reset()
-        self.last_rewards = {
-            agent_id: None
-            for agent_id in self.last_obs.keys()
-        }
-        self.last_dones = {
-            agent_id: False
-            for agent_id in self.last_obs.keys()
-        }
-        self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()}
-        self.last_dones["__all__"] = False
-        return self.last_obs
@@ -1,19 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# info key for the individual rewards of an agent, for example:
-# info: {
-#   group_1: {
-#      _group_rewards: [5, -1, 1],  # 3 agents in this group
-#   }
-# }
-GROUP_REWARDS = "_group_rewards"
-
-# info key for the individual infos of an agent, for example:
-# info: {
-#   group_1: {
-#      _group_infos: [{"foo": ...}, {}],  # 2 agents in this group
-#   }
-# }
-GROUP_INFO = "_group_info"
@@ -1,42 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.utils.annotations import PublicAPI
-
-
-@PublicAPI
-class EnvContext(dict):
-    """Wraps env configurations to include extra rllib metadata.
-
-    These attributes can be used to parameterize environments per process.
-    For example, one might use `worker_index` to control which data file an
-    environment reads in on initialization.
-
-    RLlib auto-sets these attributes when constructing registered envs.
-
-    Attributes:
-        worker_index (int): When there are multiple workers created, this
-            uniquely identifies the worker the env is created in.
-        vector_index (int): When there are multiple envs per worker, this
-            uniquely identifies the env index within the worker.
-        remote (bool): Whether environment should be remote or not.
-    """
-
-    def __init__(self, env_config, worker_index, vector_index=0, remote=False):
-        dict.__init__(self, env_config)
-        self.worker_index = worker_index
-        self.vector_index = vector_index
-        self.remote = remote
-
-    def copy_with_overrides(self,
-                            env_config=None,
-                            worker_index=None,
-                            vector_index=None,
-                            remote=None):
-        return EnvContext(
-            env_config if env_config is not None else self,
-            worker_index if worker_index is not None else self.worker_index,
-            vector_index if vector_index is not None else self.vector_index,
-            remote if remote is not None else self.remote,
-        )
@@ -1,272 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from six.moves import queue
-import threading
-import uuid
-
-from ray.rllib.utils.annotations import PublicAPI
-
-
-@PublicAPI
-class ExternalEnv(threading.Thread):
-    """An environment that interfaces with external agents.
-
-    Unlike simulator envs, control is inverted. The environment queries the
-    policy to obtain actions and logs observations and rewards for training.
-    This is in contrast to gym.Env, where the algorithm drives the simulation
-    through env.step() calls.
-
-    You can use ExternalEnv as the backend for policy serving (by serving HTTP
-    requests in the run loop), for ingesting offline logs data (by reading
-    offline transitions in the run loop), or other custom use cases not easily
-    expressed through gym.Env.
-
-    ExternalEnv supports both on-policy actions (through self.get_action()),
-    and off-policy actions (through self.log_action()).
-
-    This env is thread-safe, but individual episodes must be executed serially.
-
-    Attributes:
-        action_space (gym.Space): Action space.
-        observation_space (gym.Space): Observation space.
-
-    Examples:
-        >>> register_env("my_env", lambda config: YourExternalEnv(config))
-        >>> trainer = DQNTrainer(env="my_env")
-        >>> while True:
-              print(trainer.train())
-    """
-
-    @PublicAPI
-    def __init__(self, action_space, observation_space, max_concurrent=100):
-        """Initialize an external env.
-
-        ExternalEnv subclasses must call this during their __init__.
-
-        Arguments:
-            action_space (gym.Space): Action space of the env.
-            observation_space (gym.Space): Observation space of the env.
-            max_concurrent (int): Max number of active episodes to allow at
-                once. Exceeding this limit raises an error.
-        """
-
-        threading.Thread.__init__(self)
-        self.daemon = True
-        self.action_space = action_space
-        self.observation_space = observation_space
-        self._episodes = {}
-        self._finished = set()
-        self._results_avail_condition = threading.Condition()
-        self._max_concurrent_episodes = max_concurrent
-
-    @PublicAPI
-    def run(self):
-        """Override this to implement the run loop.
-
-        Your loop should continuously:
-            1. Call self.start_episode(episode_id)
-            2. Call self.get_action(episode_id, obs)
-                    -or-
-                    self.log_action(episode_id, obs, action)
-            3. Call self.log_returns(episode_id, reward)
-            4. Call self.end_episode(episode_id, obs)
-            5. Wait if nothing to do.
-
-        Multiple episodes may be started at the same time.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def start_episode(self, episode_id=None, training_enabled=True):
-        """Record the start of an episode.
-
-        Arguments:
-            episode_id (str): Unique string id for the episode or None for
-                it to be auto-assigned.
-            training_enabled (bool): Whether to use experiences for this
-                episode to improve the policy.
-
-        Returns:
-            episode_id (str): Unique string id for the episode.
-        """
-
-        if episode_id is None:
-            episode_id = uuid.uuid4().hex
-
-        if episode_id in self._finished:
-            raise ValueError(
-                "Episode {} has already completed.".format(episode_id))
-
-        if episode_id in self._episodes:
-            raise ValueError(
-                "Episode {} is already started".format(episode_id))
-
-        self._episodes[episode_id] = _ExternalEnvEpisode(
-            episode_id, self._results_avail_condition, training_enabled)
-
-        return episode_id
-
-    @PublicAPI
-    def get_action(self, episode_id, observation):
-        """Record an observation and get the on-policy action.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            observation (obj): Current environment observation.
-
-        Returns:
-            action (obj): Action from the env action space.
-        """
-
-        episode = self._get(episode_id)
-        return episode.wait_for_action(observation)
-
-    @PublicAPI
-    def log_action(self, episode_id, observation, action):
-        """Record an observation and (off-policy) action taken.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            observation (obj): Current environment observation.
-            action (obj): Action for the observation.
-        """
-
-        episode = self._get(episode_id)
-        episode.log_action(observation, action)
-
-    @PublicAPI
-    def log_returns(self, episode_id, reward, info=None):
-        """Record returns from the environment.
-
-        The reward will be attributed to the previous action taken by the
-        episode. Rewards accumulate until the next action. If no reward is
-        logged before the next action, a reward of 0.0 is assumed.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            reward (float): Reward from the environment.
-            info (dict): Optional info dict.
-        """
-
-        episode = self._get(episode_id)
-        episode.cur_reward += reward
-        if info:
-            episode.cur_info = info or {}
-
-    @PublicAPI
-    def end_episode(self, episode_id, observation):
-        """Record the end of an episode.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            observation (obj): Current environment observation.
-        """
-
-        episode = self._get(episode_id)
-        self._finished.add(episode.episode_id)
-        episode.done(observation)
-
-    def _get(self, episode_id):
-        """Get a started episode or raise an error."""
-
-        if episode_id in self._finished:
-            raise ValueError(
-                "Episode {} has already completed.".format(episode_id))
-
-        if episode_id not in self._episodes:
-            raise ValueError("Episode {} not found.".format(episode_id))
-
-        return self._episodes[episode_id]
-
-
-class _ExternalEnvEpisode(object):
-    """Tracked state for each active episode."""
-
-    def __init__(self,
-                 episode_id,
-                 results_avail_condition,
-                 training_enabled,
-                 multiagent=False):
-        self.episode_id = episode_id
-        self.results_avail_condition = results_avail_condition
-        self.training_enabled = training_enabled
-        self.multiagent = multiagent
-        self.data_queue = queue.Queue()
-        self.action_queue = queue.Queue()
-        if multiagent:
-            self.new_observation_dict = None
-            self.new_action_dict = None
-            self.cur_reward_dict = {}
-            self.cur_done_dict = {"__all__": False}
-            self.cur_info_dict = {}
-        else:
-            self.new_observation = None
-            self.new_action = None
-            self.cur_reward = 0.0
-            self.cur_done = False
-            self.cur_info = {}
-
-    def get_data(self):
-        if self.data_queue.empty():
-            return None
-        return self.data_queue.get_nowait()
-
-    def log_action(self, observation, action):
-        if self.multiagent:
-            self.new_observation_dict = observation
-            self.new_action_dict = action
-        else:
-            self.new_observation = observation
-            self.new_action = action
-        self._send()
-        self.action_queue.get(True, timeout=60.0)
-
-    def wait_for_action(self, observation):
-        if self.multiagent:
-            self.new_observation_dict = observation
-        else:
-            self.new_observation = observation
-        self._send()
-        return self.action_queue.get(True, timeout=60.0)
-
-    def done(self, observation):
-        if self.multiagent:
-            self.new_observation_dict = observation
-            self.cur_done_dict = {"__all__": True}
-        else:
-            self.new_observation = observation
-            self.cur_done = True
-        self._send()
-
-    def _send(self):
-        if self.multiagent:
-            item = {
-                "obs": self.new_observation_dict,
-                "reward": self.cur_reward_dict,
-                "done": self.cur_done_dict,
-                "info": self.cur_info_dict,
-            }
-            if self.new_action_dict is not None:
-                item["off_policy_action"] = self.new_action_dict
-            self.new_observation_dict = None
-            self.new_action_dict = None
-            self.cur_reward_dict = {}
-        else:
-            item = {
-                "obs": self.new_observation,
-                "reward": self.cur_reward,
-                "done": self.cur_done,
-                "info": self.cur_info,
-            }
-            if self.new_action is not None:
-                item["off_policy_action"] = self.new_action
-            self.new_observation = None
-            self.new_action = None
-            self.cur_reward = 0.0
-        if not self.training_enabled:
-            item["info"]["training_enabled"] = False
-        with self.results_avail_condition:
-            self.data_queue.put_nowait(item)
-            self.results_avail_condition.notify()
@@ -1,149 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import uuid
-
-from ray.rllib.utils.annotations import override, PublicAPI
-from ray.rllib.env.external_env import ExternalEnv, _ExternalEnvEpisode
-
-
-@PublicAPI
-class ExternalMultiAgentEnv(ExternalEnv):
-    """This is the multi-agent version of ExternalEnv."""
-
-    @PublicAPI
-    def __init__(self, action_space, observation_space, max_concurrent=100):
-        """Initialize a multi-agent external env.
-
-        ExternalMultiAgentEnv subclasses must call this during their __init__.
-
-        Arguments:
-            action_space (gym.Space): Action space of the env.
-            observation_space (gym.Space): Observation space of the env.
-            max_concurrent (int): Max number of active episodes to allow at
-                once. Exceeding this limit raises an error.
-        """
-        ExternalEnv.__init__(self, action_space, observation_space,
-                             max_concurrent)
-
-        # we require to know all agents' spaces
-        if isinstance(self.action_space, dict) or isinstance(
-                self.observation_space, dict):
-            if not (self.action_space.keys() == self.observation_space.keys()):
-                raise ValueError("Agent ids disagree for action space and obs "
-                                 "space dict: {} {}".format(
-                                     self.action_space.keys(),
-                                     self.observation_space.keys()))
-
-    @PublicAPI
-    def run(self):
-        """Override this to implement the multi-agent run loop.
-
-        Your loop should continuously:
-            1. Call self.start_episode(episode_id)
-            2. Call self.get_action(episode_id, obs_dict)
-                    -or-
-                    self.log_action(episode_id, obs_dict, action_dict)
-            3. Call self.log_returns(episode_id, reward_dict)
-            4. Call self.end_episode(episode_id, obs_dict)
-            5. Wait if nothing to do.
-
-        Multiple episodes may be started at the same time.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    @override(ExternalEnv)
-    def start_episode(self, episode_id=None, training_enabled=True):
-        if episode_id is None:
-            episode_id = uuid.uuid4().hex
-
-        if episode_id in self._finished:
-            raise ValueError(
-                "Episode {} has already completed.".format(episode_id))
-
-        if episode_id in self._episodes:
-            raise ValueError(
-                "Episode {} is already started".format(episode_id))
-
-        self._episodes[episode_id] = _ExternalEnvEpisode(
-            episode_id,
-            self._results_avail_condition,
-            training_enabled,
-            multiagent=True)
-
-        return episode_id
-
-    @PublicAPI
-    @override(ExternalEnv)
-    def get_action(self, episode_id, observation_dict):
-        """Record an observation and get the on-policy action.
-        observation_dict is expected to contain the observation
-        of all agents acting in this episode step.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            observation_dict (dict): Current environment observation.
-
-        Returns:
-            action (dict): Action from the env action space.
-        """
-
-        episode = self._get(episode_id)
-        return episode.wait_for_action(observation_dict)
-
-    @PublicAPI
-    @override(ExternalEnv)
-    def log_action(self, episode_id, observation_dict, action_dict):
-        """Record an observation and (off-policy) action taken.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            observation_dict (dict): Current environment observation.
-            action_dict (dict): Action for the observation.
-        """
-
-        episode = self._get(episode_id)
-        episode.log_action(observation_dict, action_dict)
-
-    @PublicAPI
-    @override(ExternalEnv)
-    def log_returns(self, episode_id, reward_dict, info_dict=None):
-        """Record returns from the environment.
-
-        The reward will be attributed to the previous action taken by the
-        episode. Rewards accumulate until the next action. If no reward is
-        logged before the next action, a reward of 0.0 is assumed.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            reward_dict (dict): Reward from the environment agents.
-            info (dict): Optional info dict.
-        """
-
-        episode = self._get(episode_id)
-
-        # accumulate reward by agent
-        # for existing agents, we want to add the reward up
-        for agent, rew in reward_dict.items():
-            if agent in episode.cur_reward_dict:
-                episode.cur_reward_dict[agent] += rew
-            else:
-                episode.cur_reward_dict[agent] = rew
-        if info_dict:
-            episode.cur_info_dict = info_dict or {}
-
-    @PublicAPI
-    @override(ExternalEnv)
-    def end_episode(self, episode_id, observation_dict):
-        """Record the end of an episode.
-
-        Arguments:
-            episode_id (str): Episode id returned from start_episode().
-            observation_dict (dict): Current environment observation.
-        """
-
-        episode = self._get(episode_id)
-        self._finished.add(episode.episode_id)
-        episode.done(observation_dict)
@@ -1,107 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import OrderedDict
-
-from ray.rllib.env.constants import GROUP_REWARDS, GROUP_INFO
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-
-
-# TODO(ekl) we should add some unit tests for this
-class _GroupAgentsWrapper(MultiAgentEnv):
-    """Wraps a MultiAgentEnv environment with agents grouped as specified.
-
-    See multi_agent_env.py for the specification of groups.
-
-    This API is experimental.
-    """
-
-    def __init__(self, env, groups, obs_space=None, act_space=None):
-        """Wrap an existing multi-agent env to group agents together.
-
-        See MultiAgentEnv.with_agent_groups() for usage info.
-
-        Arguments:
-            env (MultiAgentEnv): env to wrap
-            groups (dict): Grouping spec as documented in MultiAgentEnv
-            obs_space (Space): Optional observation space for the grouped
-                env. Must be a tuple space.
-            act_space (Space): Optional action space for the grouped env.
-                Must be a tuple space.
-        """
-
-        self.env = env
-        self.groups = groups
-        self.agent_id_to_group = {}
-        for group_id, agent_ids in groups.items():
-            for agent_id in agent_ids:
-                if agent_id in self.agent_id_to_group:
-                    raise ValueError(
-                        "Agent id {} is in multiple groups".format(
-                            agent_id, groups))
-                self.agent_id_to_group[agent_id] = group_id
-        if obs_space is not None:
-            self.observation_space = obs_space
-        if act_space is not None:
-            self.action_space = act_space
-
-    def reset(self):
-        obs = self.env.reset()
-        return self._group_items(obs)
-
-    def step(self, action_dict):
-        # Ungroup and send actions
-        action_dict = self._ungroup_items(action_dict)
-        obs, rewards, dones, infos = self.env.step(action_dict)
-
-        # Apply grouping transforms to the env outputs
-        obs = self._group_items(obs)
-        rewards = self._group_items(
-            rewards, agg_fn=lambda gvals: list(gvals.values()))
-        dones = self._group_items(
-            dones, agg_fn=lambda gvals: all(gvals.values()))
-        infos = self._group_items(
-            infos, agg_fn=lambda gvals: {GROUP_INFO: list(gvals.values())})
-
-        # Aggregate rewards, but preserve the original values in infos
-        for agent_id, rew in rewards.items():
-            if isinstance(rew, list):
-                rewards[agent_id] = sum(rew)
-                if agent_id not in infos:
-                    infos[agent_id] = {}
-                infos[agent_id][GROUP_REWARDS] = rew
-
-        return obs, rewards, dones, infos
-
-    def _ungroup_items(self, items):
-        out = {}
-        for agent_id, value in items.items():
-            if agent_id in self.groups:
-                assert len(value) == len(self.groups[agent_id]), \
-                    (agent_id, value, self.groups)
-                for a, v in zip(self.groups[agent_id], value):
-                    out[a] = v
-            else:
-                out[agent_id] = value
-        return out
-
-    def _group_items(self, items, agg_fn=lambda gvals: list(gvals.values())):
-        grouped_items = {}
-        for agent_id, item in items.items():
-            if agent_id in self.agent_id_to_group:
-                group_id = self.agent_id_to_group[agent_id]
-                if group_id in grouped_items:
-                    continue  # already added
-                group_out = OrderedDict()
-                for a in self.groups[group_id]:
-                    if a in items:
-                        group_out[a] = items[a]
-                    else:
-                        raise ValueError(
-                            "Missing member of group {}: {}: {}".format(
-                                group_id, a, items))
-                grouped_items[group_id] = agg_fn(group_out)
-            else:
-                grouped_items[agent_id] = item
-        return grouped_items
@@ -1,114 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.utils.annotations import PublicAPI
-
-
-@PublicAPI
-class MultiAgentEnv(object):
-    """An environment that hosts multiple independent agents.
-
-    Agents are identified by (string) agent ids. Note that these "agents" here
-    are not to be confused with RLlib agents.
-
-    Examples:
-        >>> env = MyMultiAgentEnv()
-        >>> obs = env.reset()
-        >>> print(obs)
-        {
-            "car_0": [2.4, 1.6],
-            "car_1": [3.4, -3.2],
-            "traffic_light_1": [0, 3, 5, 1],
-        }
-        >>> obs, rewards, dones, infos = env.step(
-            action_dict={
-                "car_0": 1, "car_1": 0, "traffic_light_1": 2,
-            })
-        >>> print(rewards)
-        {
-            "car_0": 3,
-            "car_1": -1,
-            "traffic_light_1": 0,
-        }
-        >>> print(dones)
-        {
-            "car_0": False,    # car_0 is still running
-            "car_1": True,     # car_1 is done
-            "__all__": False,  # the env is not done
-        }
-        >>> print(infos)
-        {
-            "car_0": {},  # info for car_0
-            "car_1": {},  # info for car_1
-        }
-    """
-
-    @PublicAPI
-    def reset(self):
-        """Resets the env and returns observations from ready agents.
-
-        Returns:
-            obs (dict): New observations for each ready agent.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def step(self, action_dict):
-        """Returns observations from ready agents.
-
-        The returns are dicts mapping from agent_id strings to values. The
-        number of agents in the env can vary over time.
-
-        Returns
-        -------
-            obs (dict): New observations for each ready agent.
-            rewards (dict): Reward values for each ready agent. If the
-                episode is just started, the value will be None.
-            dones (dict): Done values for each ready agent. The special key
-                "__all__" (required) is used to indicate env termination.
-            infos (dict): Optional info values for each agent id.
-        """
-        raise NotImplementedError
-
-# yapf: disable
-# __grouping_doc_begin__
-    @PublicAPI
-    def with_agent_groups(self, groups, obs_space=None, act_space=None):
-        """Convenience method for grouping together agents in this env.
-
-        An agent group is a list of agent ids that are mapped to a single
-        logical agent. All agents of the group must act at the same time in the
-        environment. The grouped agent exposes Tuple action and observation
-        spaces that are the concatenated action and obs spaces of the
-        individual agents.
-
-        The rewards of all the agents in a group are summed. The individual
-        agent rewards are available under the "individual_rewards" key of the
-        group info return.
-
-        Agent grouping is required to leverage algorithms such as Q-Mix.
-
-        This API is experimental.
-
-        Arguments:
-            groups (dict): Mapping from group id to a list of the agent ids
-                of group members. If an agent id is not present in any group
-                value, it will be left ungrouped.
-            obs_space (Space): Optional observation space for the grouped
-                env. Must be a tuple space.
-            act_space (Space): Optional action space for the grouped env.
-                Must be a tuple space.
-
-        Examples:
-            >>> env = YourMultiAgentEnv(...)
-            >>> grouped_env = env.with_agent_groups(env, {
-            ...   "group1": ["agent1", "agent2", "agent3"],
-            ...   "group2": ["agent4", "agent5"],
-            ... })
-        """
-
-        from ray.rllib.env.group_agents_wrapper import _GroupAgentsWrapper
-        return _GroupAgentsWrapper(self, groups, obs_space, act_space)
-# __grouping_doc_end__
-# yapf: enable
@@ -1,130 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-
-import ray
-from ray.rllib.env.base_env import BaseEnv, _DUMMY_AGENT_ID, ASYNC_RESET_RETURN
-from ray.rllib.utils.memory import ray_get_and_free
-
-logger = logging.getLogger(__name__)
-
-
-class RemoteVectorEnv(BaseEnv):
-    """Vector env that executes envs in remote workers.
-
-    This provides dynamic batching of inference as observations are returned
-    from the remote simulator actors. Both single and multi-agent child envs
-    are supported, and envs can be stepped synchronously or async.
-    """
-
-    def __init__(self, make_env, num_envs, multiagent,
-                 remote_env_batch_wait_ms):
-        self.make_local_env = make_env
-        self.num_envs = num_envs
-        self.multiagent = multiagent
-        self.poll_timeout = remote_env_batch_wait_ms / 1000
-
-        self.actors = None  # lazy init
-        self.pending = None  # lazy init
-
-    def poll(self):
-        if self.actors is None:
-
-            def make_remote_env(i):
-                logger.info("Launching env {} in remote actor".format(i))
-                if self.multiagent:
-                    return _RemoteMultiAgentEnv.remote(self.make_local_env, i)
-                else:
-                    return _RemoteSingleAgentEnv.remote(self.make_local_env, i)
-
-            self.actors = [make_remote_env(i) for i in range(self.num_envs)]
-
-        if self.pending is None:
-            self.pending = {a.reset.remote(): a for a in self.actors}
-
-        # each keyed by env_id in [0, num_remote_envs)
-        obs, rewards, dones, infos = {}, {}, {}, {}
-        ready = []
-
-        # Wait for at least 1 env to be ready here
-        while not ready:
-            ready, _ = ray.wait(
-                list(self.pending),
-                num_returns=len(self.pending),
-                timeout=self.poll_timeout)
-
-        # Get and return observations for each of the ready envs
-        env_ids = set()
-        for obj_id in ready:
-            actor = self.pending.pop(obj_id)
-            env_id = self.actors.index(actor)
-            env_ids.add(env_id)
-            ob, rew, done, info = ray_get_and_free(obj_id)
-            obs[env_id] = ob
-            rewards[env_id] = rew
-            dones[env_id] = done
-            infos[env_id] = info
-
-        logger.debug("Got obs batch for actors {}".format(env_ids))
-        return obs, rewards, dones, infos, {}
-
-    def send_actions(self, action_dict):
-        for env_id, actions in action_dict.items():
-            actor = self.actors[env_id]
-            obj_id = actor.step.remote(actions)
-            self.pending[obj_id] = actor
-
-    def try_reset(self, env_id):
-        actor = self.actors[env_id]
-        obj_id = actor.reset.remote()
-        self.pending[obj_id] = actor
-        return ASYNC_RESET_RETURN
-
-    def stop(self):
-        if self.actors is not None:
-            for actor in self.actors:
-                actor.__ray_terminate__.remote()
-
-
-@ray.remote(num_cpus=0)
-class _RemoteMultiAgentEnv(object):
-    """Wrapper class for making a multi-agent env a remote actor."""
-
-    def __init__(self, make_env, i):
-        self.env = make_env(i)
-
-    def reset(self):
-        obs = self.env.reset()
-        # each keyed by agent_id in the env
-        rew = {agent_id: 0 for agent_id in obs.keys()}
-        info = {agent_id: {} for agent_id in obs.keys()}
-        done = {"__all__": False}
-        return obs, rew, done, info
-
-    def step(self, action_dict):
-        return self.env.step(action_dict)
-
-
-@ray.remote(num_cpus=0)
-class _RemoteSingleAgentEnv(object):
-    """Wrapper class for making a gym env a remote actor."""
-
-    def __init__(self, make_env, i):
-        self.env = make_env(i)
-
-    def reset(self):
-        obs = {_DUMMY_AGENT_ID: self.env.reset()}
-        rew = {agent_id: 0 for agent_id in obs.keys()}
-        info = {agent_id: {} for agent_id in obs.keys()}
-        done = {"__all__": False}
-        return obs, rew, done, info
-
-    def step(self, action):
-        obs, rew, done, info = self.env.step(action[_DUMMY_AGENT_ID])
-        obs, rew, done, info = [{
-            _DUMMY_AGENT_ID: x
-        } for x in [obs, rew, done, info]]
-        done["__all__"] = done[_DUMMY_AGENT_ID]
-        return obs, rew, done, info
@@ -1,8 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.env.external_env import ExternalEnv
-
-# renamed to ExternalEnv in 0.6
-ServingEnv = ExternalEnv
@@ -1,126 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-import numpy as np
-
-from ray.rllib.utils.annotations import override, PublicAPI
-
-logger = logging.getLogger(__name__)
-
-
-@PublicAPI
-class VectorEnv(object):
-    """An environment that supports batch evaluation.
-
-    Subclasses must define the following attributes:
-
-    Attributes:
-        action_space (gym.Space): Action space of individual envs.
-        observation_space (gym.Space): Observation space of individual envs.
-        num_envs (int): Number of envs in this vector env.
-    """
-
-    @staticmethod
-    def wrap(make_env=None,
-             existing_envs=None,
-             num_envs=1,
-             action_space=None,
-             observation_space=None):
-        return _VectorizedGymEnv(make_env, existing_envs or [], num_envs,
-                                 action_space, observation_space)
-
-    @PublicAPI
-    def vector_reset(self):
-        """Resets all environments.
-
-        Returns:
-            obs (list): Vector of observations from each environment.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def reset_at(self, index):
-        """Resets a single environment.
-
-        Returns:
-            obs (obj): Observations from the resetted environment.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def vector_step(self, actions):
-        """Vectorized step.
-
-        Arguments:
-            actions (list): Actions for each env.
-
-        Returns:
-            obs (list): New observations for each env.
-            rewards (list): Reward values for each env.
-            dones (list): Done values for each env.
-            infos (list): Info values for each env.
-        """
-        raise NotImplementedError
-
-    @PublicAPI
-    def get_unwrapped(self):
-        """Returns the underlying env instances."""
-        raise NotImplementedError
-
-
-class _VectorizedGymEnv(VectorEnv):
-    """Internal wrapper for gym envs to implement VectorEnv.
-
-    Arguments:
-        make_env (func|None): Factory that produces a new gym env. Must be
-            defined if the number of existing envs is less than num_envs.
-        existing_envs (list): List of existing gym envs.
-        num_envs (int): Desired num gym envs to keep total.
-    """
-
-    def __init__(self,
-                 make_env,
-                 existing_envs,
-                 num_envs,
-                 action_space=None,
-                 observation_space=None):
-        self.make_env = make_env
-        self.envs = existing_envs
-        self.num_envs = num_envs
-        while len(self.envs) < self.num_envs:
-            self.envs.append(self.make_env(len(self.envs)))
-        self.action_space = action_space or self.envs[0].action_space
-        self.observation_space = observation_space or \
-            self.envs[0].observation_space
-
-    @override(VectorEnv)
-    def vector_reset(self):
-        return [e.reset() for e in self.envs]
-
-    @override(VectorEnv)
-    def reset_at(self, index):
-        return self.envs[index].reset()
-
-    @override(VectorEnv)
-    def vector_step(self, actions):
-        obs_batch, rew_batch, done_batch, info_batch = [], [], [], []
-        for i in range(self.num_envs):
-            obs, r, done, info = self.envs[i].step(actions[i])
-            if not np.isscalar(r) or not np.isreal(r) or not np.isfinite(r):
-                raise ValueError(
-                    "Reward should be finite scalar, got {} ({})".format(
-                        r, type(r)))
-            if type(info) is not dict:
-                raise ValueError("Info should be a dict, got {} ({})".format(
-                    info, type(info)))
-            obs_batch.append(obs)
-            rew_batch.append(r)
-            done_batch.append(done)
-            info_batch.append(info)
-        return obs_batch, rew_batch, done_batch, info_batch
-
-    @override(VectorEnv)
-    def get_unwrapped(self):
-        return self.envs
@@ -1,31 +0,0 @@
-from ray.rllib.evaluation.episode import MultiAgentEpisode
-from ray.rllib.evaluation.rollout_worker import RolloutWorker
-from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator
-from ray.rllib.evaluation.interface import EvaluatorInterface
-from ray.rllib.evaluation.policy_graph import PolicyGraph
-from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
-from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph
-from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch
-from ray.rllib.evaluation.sample_batch_builder import (
-    SampleBatchBuilder, MultiAgentSampleBatchBuilder)
-from ray.rllib.evaluation.sampler import SyncSampler, AsyncSampler
-from ray.rllib.evaluation.postprocessing import compute_advantages
-from ray.rllib.evaluation.metrics import collect_metrics
-
-__all__ = [
-    "EvaluatorInterface",
-    "RolloutWorker",
-    "PolicyGraph",
-    "TFPolicyGraph",
-    "TorchPolicyGraph",
-    "SampleBatch",
-    "MultiAgentBatch",
-    "SampleBatchBuilder",
-    "MultiAgentSampleBatchBuilder",
-    "SyncSampler",
-    "AsyncSampler",
-    "compute_advantages",
-    "collect_metrics",
-    "MultiAgentEpisode",
-    "PolicyEvaluator",
-]
@@ -1,201 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import defaultdict
-import random
-
-import numpy as np
-
-from ray.rllib.env.base_env import _DUMMY_AGENT_ID
-from ray.rllib.utils.annotations import DeveloperAPI
-
-
-@DeveloperAPI
-class MultiAgentEpisode(object):
-    """Tracks the current state of a (possibly multi-agent) episode.
-
-    Attributes:
-        new_batch_builder (func): Create a new MultiAgentSampleBatchBuilder.
-        add_extra_batch (func): Return a built MultiAgentBatch to the sampler.
-        batch_builder (obj): Batch builder for the current episode.
-        total_reward (float): Summed reward across all agents in this episode.
-        length (int): Length of this episode.
-        episode_id (int): Unique id identifying this trajectory.
-        agent_rewards (dict): Summed rewards broken down by agent.
-        custom_metrics (dict): Dict where the you can add custom metrics.
-        user_data (dict): Dict that you can use for temporary storage.
-
-    Use case 1: Model-based rollouts in multi-agent:
-        A custom compute_actions() function in a policy can inspect the
-        current episode state and perform a number of rollouts based on the
-        policies and state of other agents in the environment.
-
-    Use case 2: Returning extra rollouts data.
-        The model rollouts can be returned back to the sampler by calling:
-
-        >>> batch = episode.new_batch_builder()
-        >>> for each transition:
-               batch.add_values(...)  # see sampler for usage
-        >>> episode.extra_batches.add(batch.build_and_reset())
-    """
-
-    def __init__(self, policies, policy_mapping_fn, batch_builder_factory,
-                 extra_batch_callback):
-        self.new_batch_builder = batch_builder_factory
-        self.add_extra_batch = extra_batch_callback
-        self.batch_builder = batch_builder_factory()
-        self.total_reward = 0.0
-        self.length = 0
-        self.episode_id = random.randrange(2e9)
-        self.agent_rewards = defaultdict(float)
-        self.custom_metrics = {}
-        self.user_data = {}
-        self._policies = policies
-        self._policy_mapping_fn = policy_mapping_fn
-        self._next_agent_index = 0
-        self._agent_to_index = {}
-        self._agent_to_policy = {}
-        self._agent_to_rnn_state = {}
-        self._agent_to_last_obs = {}
-        self._agent_to_last_raw_obs = {}
-        self._agent_to_last_info = {}
-        self._agent_to_last_action = {}
-        self._agent_to_last_pi_info = {}
-        self._agent_to_prev_action = {}
-        self._agent_reward_history = defaultdict(list)
-
-    @DeveloperAPI
-    def soft_reset(self):
-        """Clears rewards and metrics, but retains RNN and other state.
-
-        This is used to carry state across multiple logical episodes in the
-        same env (i.e., if `soft_horizon` is set).
-        """
-        self.length = 0
-        self.episode_id = random.randrange(2e9)
-        self.total_reward = 0.0
-        self.agent_rewards = defaultdict(float)
-        self._agent_reward_history = defaultdict(list)
-
-    @DeveloperAPI
-    def policy_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the policy for the specified agent.
-
-        If the agent is new, the policy mapping fn will be called to bind the
-        agent to a policy for the duration of the episode.
-        """
-
-        if agent_id not in self._agent_to_policy:
-            self._agent_to_policy[agent_id] = self._policy_mapping_fn(agent_id)
-        return self._agent_to_policy[agent_id]
-
-    @DeveloperAPI
-    def last_observation_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the last observation for the specified agent."""
-
-        return self._agent_to_last_obs.get(agent_id)
-
-    @DeveloperAPI
-    def last_raw_obs_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the last un-preprocessed obs for the specified agent."""
-
-        return self._agent_to_last_raw_obs.get(agent_id)
-
-    @DeveloperAPI
-    def last_info_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the last info for the specified agent."""
-
-        return self._agent_to_last_info.get(agent_id)
-
-    @DeveloperAPI
-    def last_action_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the last action for the specified agent, or zeros."""
-
-        if agent_id in self._agent_to_last_action:
-            return _flatten_action(self._agent_to_last_action[agent_id])
-        else:
-            policy = self._policies[self.policy_for(agent_id)]
-            flat = _flatten_action(policy.action_space.sample())
-            return np.zeros_like(flat)
-
-    @DeveloperAPI
-    def prev_action_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the previous action for the specified agent."""
-
-        if agent_id in self._agent_to_prev_action:
-            return _flatten_action(self._agent_to_prev_action[agent_id])
-        else:
-            # We're at t=0, so return all zeros.
-            return np.zeros_like(self.last_action_for(agent_id))
-
-    @DeveloperAPI
-    def prev_reward_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the previous reward for the specified agent."""
-
-        history = self._agent_reward_history[agent_id]
-        if len(history) >= 2:
-            return history[-2]
-        else:
-            # We're at t=0, so there is no previous reward, just return zero.
-            return 0.0
-
-    @DeveloperAPI
-    def rnn_state_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the last RNN state for the specified agent."""
-
-        if agent_id not in self._agent_to_rnn_state:
-            policy = self._policies[self.policy_for(agent_id)]
-            self._agent_to_rnn_state[agent_id] = policy.get_initial_state()
-        return self._agent_to_rnn_state[agent_id]
-
-    @DeveloperAPI
-    def last_pi_info_for(self, agent_id=_DUMMY_AGENT_ID):
-        """Returns the last info object for the specified agent."""
-
-        return self._agent_to_last_pi_info[agent_id]
-
-    def _add_agent_rewards(self, reward_dict):
-        for agent_id, reward in reward_dict.items():
-            if reward is not None:
-                self.agent_rewards[agent_id,
-                                   self.policy_for(agent_id)] += reward
-                self.total_reward += reward
-                self._agent_reward_history[agent_id].append(reward)
-
-    def _set_rnn_state(self, agent_id, rnn_state):
-        self._agent_to_rnn_state[agent_id] = rnn_state
-
-    def _set_last_observation(self, agent_id, obs):
-        self._agent_to_last_obs[agent_id] = obs
-
-    def _set_last_raw_obs(self, agent_id, obs):
-        self._agent_to_last_raw_obs[agent_id] = obs
-
-    def _set_last_info(self, agent_id, info):
-        self._agent_to_last_info[agent_id] = info
-
-    def _set_last_action(self, agent_id, action):
-        if agent_id in self._agent_to_last_action:
-            self._agent_to_prev_action[agent_id] = \
-                self._agent_to_last_action[agent_id]
-        self._agent_to_last_action[agent_id] = action
-
-    def _set_last_pi_info(self, agent_id, pi_info):
-        self._agent_to_last_pi_info[agent_id] = pi_info
-
-    def _agent_index(self, agent_id):
-        if agent_id not in self._agent_to_index:
-            self._agent_to_index[agent_id] = self._next_agent_index
-            self._next_agent_index += 1
-        return self._agent_to_index[agent_id]
-
-
-def _flatten_action(action):
-    # Concatenate tuple actions
-    if isinstance(action, list) or isinstance(action, tuple):
-        expanded = []
-        for a in action:
-            expanded.append(np.reshape(a, [-1]))
-        action = np.concatenate(expanded, axis=0).flatten()
-    return action
@@ -1,128 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from ray.rllib.utils.annotations import DeveloperAPI
-
-
-@DeveloperAPI
-class EvaluatorInterface(object):
-    """This is the interface between policy optimizers and policy evaluation.
-
-    See also: RolloutWorker
-    """
-
-    @DeveloperAPI
-    def sample(self):
-        """Returns a batch of experience sampled from this evaluator.
-
-        This method must be implemented by subclasses.
-
-        Returns:
-            SampleBatch|MultiAgentBatch: A columnar batch of experiences
-            (e.g., tensors), or a multi-agent batch.
-
-        Examples:
-            >>> print(ev.sample())
-            SampleBatch({"obs": [1, 2, 3], "action": [0, 1, 0], ...})
-        """
-
-        raise NotImplementedError
-
-    @DeveloperAPI
-    def learn_on_batch(self, samples):
-        """Update policies based on the given batch.
-
-        This is the equivalent to apply_gradients(compute_gradients(samples)),
-        but can be optimized to avoid pulling gradients into CPU memory.
-
-        Either this or the combination of compute/apply grads must be
-        implemented by subclasses.
-
-        Returns:
-            info: dictionary of extra metadata from compute_gradients().
-
-        Examples:
-            >>> batch = ev.sample()
-            >>> ev.learn_on_batch(samples)
-        """
-
-        grads, info = self.compute_gradients(samples)
-        self.apply_gradients(grads)
-        return info
-
-    @DeveloperAPI
-    def compute_gradients(self, samples):
-        """Returns a gradient computed w.r.t the specified samples.
-
-        Either this or learn_on_batch() must be implemented by subclasses.
-
-        Returns:
-            (grads, info): A list of gradients that can be applied on a
-            compatible evaluator. In the multi-agent case, returns a dict
-            of gradients keyed by policy ids. An info dictionary of
-            extra metadata is also returned.
-
-        Examples:
-            >>> batch = ev.sample()
-            >>> grads, info = ev2.compute_gradients(samples)
-        """
-
-        raise NotImplementedError
-
-    @DeveloperAPI
-    def apply_gradients(self, grads):
-        """Applies the given gradients to this evaluator's weights.
-
-        Either this or learn_on_batch() must be implemented by subclasses.
-
-        Examples:
-            >>> samples = ev1.sample()
-            >>> grads, info = ev2.compute_gradients(samples)
-            >>> ev1.apply_gradients(grads)
-        """
-
-        raise NotImplementedError
-
-    @DeveloperAPI
-    def get_weights(self):
-        """Returns the model weights of this Evaluator.
-
-        This method must be implemented by subclasses.
-
-        Returns:
-            object: weights that can be set on a compatible evaluator.
-            info: dictionary of extra metadata.
-
-        Examples:
-            >>> weights = ev1.get_weights()
-        """
-
-        raise NotImplementedError
-
-    @DeveloperAPI
-    def set_weights(self, weights):
-        """Sets the model weights of this Evaluator.
-
-        This method must be implemented by subclasses.
-
-        Examples:
-            >>> weights = ev1.get_weights()
-            >>> ev2.set_weights(weights)
-        """
-
-        raise NotImplementedError
-
-    @DeveloperAPI
-    def get_host(self):
-        """Returns the hostname of the process running this evaluator."""
-
-        return os.uname()[1]
-
-    @DeveloperAPI
-    def apply(self, func, *args):
-        """Apply the given function to this evaluator instance."""
-
-        return func(self, *args)
@@ -1,173 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging
-import numpy as np
-import collections
-
-import ray
-from ray.rllib.evaluation.rollout_metrics import RolloutMetrics
-from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.offline.off_policy_estimator import OffPolicyEstimate
-from ray.rllib.policy.policy import LEARNER_STATS_KEY
-from ray.rllib.utils.annotations import DeveloperAPI
-from ray.rllib.utils.memory import ray_get_and_free
-
-logger = logging.getLogger(__name__)
-
-
-@DeveloperAPI
-def get_learner_stats(grad_info):
-    """Return optimization stats reported from the policy.
-
-    Example:
-        >>> grad_info = evaluator.learn_on_batch(samples)
-        >>> print(get_stats(grad_info))
-        {"vf_loss": ..., "policy_loss": ...}
-    """
-
-    if LEARNER_STATS_KEY in grad_info:
-        return grad_info[LEARNER_STATS_KEY]
-
-    multiagent_stats = {}
-    for k, v in grad_info.items():
-        if type(v) is dict:
-            if LEARNER_STATS_KEY in v:
-                multiagent_stats[k] = v[LEARNER_STATS_KEY]
-
-    return multiagent_stats
-
-
-@DeveloperAPI
-def collect_metrics(local_worker=None,
-                    remote_workers=[],
-                    to_be_collected=[],
-                    timeout_seconds=180):
-    """Gathers episode metrics from RolloutWorker instances."""
-
-    episodes, to_be_collected = collect_episodes(
-        local_worker,
-        remote_workers,
-        to_be_collected,
-        timeout_seconds=timeout_seconds)
-    metrics = summarize_episodes(episodes, episodes)
-    return metrics
-
-
-@DeveloperAPI
-def collect_episodes(local_worker=None,
-                     remote_workers=[],
-                     to_be_collected=[],
-                     timeout_seconds=180):
-    """Gathers new episodes metrics tuples from the given evaluators."""
-
-    if remote_workers:
-        pending = [
-            a.apply.remote(lambda ev: ev.get_metrics()) for a in remote_workers
-        ] + to_be_collected
-        collected, to_be_collected = ray.wait(
-            pending, num_returns=len(pending), timeout=timeout_seconds * 1.0)
-        if pending and len(collected) == 0:
-            logger.warning(
-                "WARNING: collected no metrics in {} seconds".format(
-                    timeout_seconds))
-        metric_lists = ray_get_and_free(collected)
-    else:
-        metric_lists = []
-
-    if local_worker:
-        metric_lists.append(local_worker.get_metrics())
-    episodes = []
-    for metrics in metric_lists:
-        episodes.extend(metrics)
-    return episodes, to_be_collected
-
-
-@DeveloperAPI
-def summarize_episodes(episodes, new_episodes):
-    """Summarizes a set of episode metrics tuples.
-
-    Arguments:
-        episodes: smoothed set of episodes including historical ones
-        new_episodes: just the new episodes in this iteration
-    """
-
-    episodes, estimates = _partition(episodes)
-    new_episodes, _ = _partition(new_episodes)
-
-    episode_rewards = []
-    episode_lengths = []
-    policy_rewards = collections.defaultdict(list)
-    custom_metrics = collections.defaultdict(list)
-    perf_stats = collections.defaultdict(list)
-    for episode in episodes:
-        episode_lengths.append(episode.episode_length)
-        episode_rewards.append(episode.episode_reward)
-        for k, v in episode.custom_metrics.items():
-            custom_metrics[k].append(v)
-        for k, v in episode.perf_stats.items():
-            perf_stats[k].append(v)
-        for (_, policy_id), reward in episode.agent_rewards.items():
-            if policy_id != DEFAULT_POLICY_ID:
-                policy_rewards[policy_id].append(reward)
-    if episode_rewards:
-        min_reward = min(episode_rewards)
-        max_reward = max(episode_rewards)
-    else:
-        min_reward = float("nan")
-        max_reward = float("nan")
-    avg_reward = np.mean(episode_rewards)
-    avg_length = np.mean(episode_lengths)
-
-    for policy_id, rewards in policy_rewards.copy().items():
-        policy_rewards[policy_id] = np.mean(rewards)
-
-    for k, v_list in custom_metrics.copy().items():
-        custom_metrics[k + "_mean"] = np.mean(v_list)
-        filt = [v for v in v_list if not np.isnan(v)]
-        if filt:
-            custom_metrics[k + "_min"] = np.min(filt)
-            custom_metrics[k + "_max"] = np.max(filt)
-        else:
-            custom_metrics[k + "_min"] = float("nan")
-            custom_metrics[k + "_max"] = float("nan")
-        del custom_metrics[k]
-
-    for k, v_list in perf_stats.copy().items():
-        perf_stats[k] = np.mean(v_list)
-
-    estimators = collections.defaultdict(lambda: collections.defaultdict(list))
-    for e in estimates:
-        acc = estimators[e.estimator_name]
-        for k, v in e.metrics.items():
-            acc[k].append(v)
-    for name, metrics in estimators.items():
-        for k, v_list in metrics.items():
-            metrics[k] = np.mean(v_list)
-        estimators[name] = dict(metrics)
-
-    return dict(
-        episode_reward_max=max_reward,
-        episode_reward_min=min_reward,
-        episode_reward_mean=avg_reward,
-        episode_len_mean=avg_length,
-        episodes_this_iter=len(new_episodes),
-        policy_reward_mean=dict(policy_rewards),
-        custom_metrics=dict(custom_metrics),
-        sampler_perf=dict(perf_stats),
-        off_policy_estimator=dict(estimators))
-
-
-def _partition(episodes):
-    """Divides metrics data into true rollouts vs off-policy estimates."""
-
-    rollouts, estimates = [], []
-    for e in episodes:
-        if isinstance(e, RolloutMetrics):
-            rollouts.append(e)
-        elif isinstance(e, OffPolicyEstimate):
-            estimates.append(e)
-        else:
-            raise ValueError("Unknown metric type: {}".format(e))
-    return rollouts, estimates
@@ -1,9 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.utils import renamed_class
-from ray.rllib.evaluation import RolloutWorker
-
-PolicyEvaluator = renamed_class(
-    RolloutWorker, old_name="rllib.evaluation.PolicyEvaluator")
@@ -1,8 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.policy.policy import Policy
-from ray.rllib.utils import renamed_class
-
-PolicyGraph = renamed_class(Policy, old_name="PolicyGraph")
@@ -1,70 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import scipy.signal
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils.annotations import DeveloperAPI
-
-
-def discount(x, gamma):
-    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
-
-
-class Postprocessing(object):
-    """Constant definitions for postprocessing."""
-
-    ADVANTAGES = "advantages"
-    VALUE_TARGETS = "value_targets"
-
-
-@DeveloperAPI
-def compute_advantages(rollout, last_r, gamma=0.9, lambda_=1.0, use_gae=True):
-    """Given a rollout, compute its value targets and the advantage.
-
-    Args:
-        rollout (SampleBatch): SampleBatch of a single trajectory
-        last_r (float): Value estimation for last observation
-        gamma (float): Discount factor.
-        lambda_ (float): Parameter for GAE
-        use_gae (bool): Using Generalized Advantage Estamation
-
-    Returns:
-        SampleBatch (SampleBatch): Object with experience from rollout and
-            processed rewards.
-    """
-
-    traj = {}
-    trajsize = len(rollout[SampleBatch.ACTIONS])
-    for key in rollout:
-        traj[key] = np.stack(rollout[key])
-
-    if use_gae:
-        assert SampleBatch.VF_PREDS in rollout, "Values not found!"
-        vpred_t = np.concatenate(
-            [rollout[SampleBatch.VF_PREDS],
-             np.array([last_r])])
-        delta_t = (
-            traj[SampleBatch.REWARDS] + gamma * vpred_t[1:] - vpred_t[:-1])
-        # This formula for the advantage comes
-        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
-        traj[Postprocessing.ADVANTAGES] = discount(delta_t, gamma * lambda_)
-        traj[Postprocessing.VALUE_TARGETS] = (
-            traj[Postprocessing.ADVANTAGES] +
-            traj[SampleBatch.VF_PREDS]).copy().astype(np.float32)
-    else:
-        rewards_plus_v = np.concatenate(
-            [rollout[SampleBatch.REWARDS],
-             np.array([last_r])])
-        traj[Postprocessing.ADVANTAGES] = discount(rewards_plus_v, gamma)[:-1]
-        # TODO(ekl): support using a critic without GAE
-        traj[Postprocessing.VALUE_TARGETS] = np.zeros_like(
-            traj[Postprocessing.ADVANTAGES])
-
-    traj[Postprocessing.ADVANTAGES] = traj[
-        Postprocessing.ADVANTAGES].copy().astype(np.float32)
-
-    assert all(val.shape[0] == trajsize for val in traj.values()), \
-        "Rollout stacked incorrectly!"
-    return SampleBatch(traj)
@@ -1,11 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-# Define this in its own file, see #5125
-RolloutMetrics = collections.namedtuple("RolloutMetrics", [
-    "episode_length", "episode_reward", "agent_rewards", "custom_metrics",
-    "perf_stats"
-])
@@ -1,819 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import numpy as np
-import gym
-import logging
-import pickle
-
-import ray
-from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari
-from ray.rllib.env.base_env import BaseEnv
-from ray.rllib.env.env_context import EnvContext
-from ray.rllib.env.external_env import ExternalEnv
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
-from ray.rllib.env.vector_env import VectorEnv
-from ray.rllib.evaluation.interface import EvaluatorInterface
-from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler
-from ray.rllib.policy.sample_batch import MultiAgentBatch, DEFAULT_POLICY_ID
-from ray.rllib.policy.policy import Policy
-from ray.rllib.policy.tf_policy import TFPolicy
-from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader
-from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator
-from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator
-from ray.rllib.models import ModelCatalog
-from ray.rllib.models.preprocessors import NoPreprocessor
-from ray.rllib.utils import merge_dicts
-from ray.rllib.utils.annotations import override, DeveloperAPI
-from ray.rllib.utils.debug import disable_log_once_globally, log_once, \
-    summarize, enable_periodic_logging
-from ray.rllib.utils.filter import get_filter
-from ray.rllib.utils.tf_run_builder import TFRunBuilder
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-logger = logging.getLogger(__name__)
-
-# Handle to the current rollout worker, which will be set to the most recently
-# created RolloutWorker in this process. This can be helpful to access in
-# custom env or policy classes for debugging or advanced use cases.
-_global_worker = None
-
-
-@DeveloperAPI
-def get_global_worker():
-    """Returns a handle to the active rollout worker in this process."""
-
-    global _global_worker
-    return _global_worker
-
-
-@DeveloperAPI
-class RolloutWorker(EvaluatorInterface):
-    """Common experience collection class.
-
-    This class wraps a policy instance and an environment class to
-    collect experiences from the environment. You can create many replicas of
-    this class as Ray actors to scale RL training.
-
-    This class supports vectorized and multi-agent policy evaluation (e.g.,
-    VectorEnv, MultiAgentEnv, etc.)
-
-    Examples:
-        >>> # Create a rollout worker and using it to collect experiences.
-        >>> worker = RolloutWorker(
-        ...   env_creator=lambda _: gym.make("CartPole-v0"),
-        ...   policy=PGTFPolicy)
-        >>> print(worker.sample())
-        SampleBatch({
-            "obs": [[...]], "actions": [[...]], "rewards": [[...]],
-            "dones": [[...]], "new_obs": [[...]]})
-
-        >>> # Creating a multi-agent rollout worker
-        >>> worker = RolloutWorker(
-        ...   env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25),
-        ...   policies={
-        ...       # Use an ensemble of two policies for car agents
-        ...       "car_policy1":
-        ...         (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}),
-        ...       "car_policy2":
-        ...         (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}),
-        ...       # Use a single shared policy for all traffic lights
-        ...       "traffic_light_policy":
-        ...         (PGTFPolicy, Box(...), Discrete(...), {}),
-        ...   },
-        ...   policy_mapping_fn=lambda agent_id:
-        ...     random.choice(["car_policy1", "car_policy2"])
-        ...     if agent_id.startswith("car_") else "traffic_light_policy")
-        >>> print(worker.sample())
-        MultiAgentBatch({
-            "car_policy1": SampleBatch(...),
-            "car_policy2": SampleBatch(...),
-            "traffic_light_policy": SampleBatch(...)})
-    """
-
-    @DeveloperAPI
-    @classmethod
-    def as_remote(cls, num_cpus=None, num_gpus=None, resources=None):
-        return ray.remote(
-            num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)(cls)
-
-    @DeveloperAPI
-    def __init__(self,
-                 env_creator,
-                 policy,
-                 policy_mapping_fn=None,
-                 policies_to_train=None,
-                 tf_session_creator=None,
-                 batch_steps=100,
-                 batch_mode="truncate_episodes",
-                 episode_horizon=None,
-                 preprocessor_pref="deepmind",
-                 sample_async=False,
-                 compress_observations=False,
-                 num_envs=1,
-                 observation_filter="NoFilter",
-                 clip_rewards=None,
-                 clip_actions=True,
-                 env_config=None,
-                 model_config=None,
-                 policy_config=None,
-                 worker_index=0,
-                 monitor_path=None,
-                 log_dir=None,
-                 log_level=None,
-                 callbacks=None,
-                 input_creator=lambda ioctx: ioctx.default_sampler_input(),
-                 input_evaluation=frozenset([]),
-                 output_creator=lambda ioctx: NoopOutput(),
-                 remote_worker_envs=False,
-                 remote_env_batch_wait_ms=0,
-                 soft_horizon=False,
-                 no_done_at_end=False,
-                 seed=None,
-                 _fake_sampler=False):
-        """Initialize a rollout worker.
-
-        Arguments:
-            env_creator (func): Function that returns a gym.Env given an
-                EnvContext wrapped configuration.
-            policy (class|dict): Either a class implementing
-                Policy, or a dictionary of policy id strings to
-                (Policy, obs_space, action_space, config) tuples. If a
-                dict is specified, then we are in multi-agent mode and a
-                policy_mapping_fn should also be set.
-            policy_mapping_fn (func): A function that maps agent ids to
-                policy ids in multi-agent mode. This function will be called
-                each time a new agent appears in an episode, to bind that agent
-                to a policy for the duration of the episode.
-            policies_to_train (list): Optional whitelist of policies to train,
-                or None for all policies.
-            tf_session_creator (func): A function that returns a TF session.
-                This is optional and only useful with TFPolicy.
-            batch_steps (int): The target number of env transitions to include
-                in each sample batch returned from this worker.
-            batch_mode (str): One of the following batch modes:
-                "truncate_episodes": Each call to sample() will return a batch
-                    of at most `batch_steps * num_envs` in size. The batch will
-                    be exactly `batch_steps * num_envs` in size if
-                    postprocessing does not change batch sizes. Episodes may be
-                    truncated in order to meet this size requirement.
-                "complete_episodes": Each call to sample() will return a batch
-                    of at least `batch_steps * num_envs` in size. Episodes will
-                    not be truncated, but multiple episodes may be packed
-                    within one batch to meet the batch size. Note that when
-                    `num_envs > 1`, episode steps will be buffered until the
-                    episode completes, and hence batches may contain
-                    significant amounts of off-policy data.
-            episode_horizon (int): Whether to stop episodes at this horizon.
-            preprocessor_pref (str): Whether to prefer RLlib preprocessors
-                ("rllib") or deepmind ("deepmind") when applicable.
-            sample_async (bool): Whether to compute samples asynchronously in
-                the background, which improves throughput but can cause samples
-                to be slightly off-policy.
-            compress_observations (bool): If true, compress the observations.
-                They can be decompressed with rllib/utils/compression.
-            num_envs (int): If more than one, will create multiple envs
-                and vectorize the computation of actions. This has no effect if
-                if the env already implements VectorEnv.
-            observation_filter (str): Name of observation filter to use.
-            clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
-                experience postprocessing. Setting to None means clip for Atari
-                only.
-            clip_actions (bool): Whether to clip action values to the range
-                specified by the policy action space.
-            env_config (dict): Config to pass to the env creator.
-            model_config (dict): Config to use when creating the policy model.
-            policy_config (dict): Config to pass to the policy. In the
-                multi-agent case, this config will be merged with the
-                per-policy configs specified by `policy`.
-            worker_index (int): For remote workers, this should be set to a
-                non-zero and unique value. This index is passed to created envs
-                through EnvContext so that envs can be configured per worker.
-            monitor_path (str): Write out episode stats and videos to this
-                directory if specified.
-            log_dir (str): Directory where logs can be placed.
-            log_level (str): Set the root log level on creation.
-            callbacks (dict): Dict of custom debug callbacks.
-            input_creator (func): Function that returns an InputReader object
-                for loading previous generated experiences.
-            input_evaluation (list): How to evaluate the policy performance.
-                This only makes sense to set when the input is reading offline
-                data. The possible values include:
-                  - "is": the step-wise importance sampling estimator.
-                  - "wis": the weighted step-wise is estimator.
-                  - "simulation": run the environment in the background, but
-                    use this data for evaluation only and never for learning.
-            output_creator (func): Function that returns an OutputWriter object
-                for saving generated experiences.
-            remote_worker_envs (bool): If using num_envs > 1, whether to create
-                those new envs in remote processes instead of in the current
-                process. This adds overheads, but can make sense if your envs
-            remote_env_batch_wait_ms (float): Timeout that remote workers
-                are waiting when polling environments. 0 (continue when at
-                least one env is ready) is a reasonable default, but optimal
-                value could be obtained by measuring your environment
-                step / reset and model inference perf.
-            soft_horizon (bool): Calculate rewards but don't reset the
-                environment when the horizon is hit.
-            no_done_at_end (bool): Ignore the done=True at the end of the
-                episode and instead record done=False.
-            seed (int): Set the seed of both np and tf to this value to
-                to ensure each remote worker has unique exploration behavior.
-            _fake_sampler (bool): Use a fake (inf speed) sampler for testing.
-        """
-
-        global _global_worker
-        _global_worker = self
-
-        if log_level:
-            logging.getLogger("ray.rllib").setLevel(log_level)
-
-        if worker_index > 1:
-            disable_log_once_globally()  # only need 1 worker to log
-        elif log_level == "DEBUG":
-            enable_periodic_logging()
-
-        env_context = EnvContext(env_config or {}, worker_index)
-        policy_config = policy_config or {}
-        self.policy_config = policy_config
-        self.callbacks = callbacks or {}
-        self.worker_index = worker_index
-        model_config = model_config or {}
-        policy_mapping_fn = (policy_mapping_fn
-                             or (lambda agent_id: DEFAULT_POLICY_ID))
-        if not callable(policy_mapping_fn):
-            raise ValueError(
-                "Policy mapping function not callable. If you're using Tune, "
-                "make sure to escape the function with tune.function() "
-                "to prevent it from being evaluated as an expression.")
-        self.env_creator = env_creator
-        self.sample_batch_size = batch_steps * num_envs
-        self.batch_mode = batch_mode
-        self.compress_observations = compress_observations
-        self.preprocessing_enabled = True
-        self.last_batch = None
-        self._fake_sampler = _fake_sampler
-
-        self.env = _validate_env(env_creator(env_context))
-        if isinstance(self.env, MultiAgentEnv) or \
-                isinstance(self.env, BaseEnv):
-
-            def wrap(env):
-                return env  # we can't auto-wrap these env types
-        elif is_atari(self.env) and \
-                not model_config.get("custom_preprocessor") and \
-                preprocessor_pref == "deepmind":
-
-            # Deepmind wrappers already handle all preprocessing
-            self.preprocessing_enabled = False
-
-            if clip_rewards is None:
-                clip_rewards = True
-
-            def wrap(env):
-                env = wrap_deepmind(
-                    env,
-                    dim=model_config.get("dim"),
-                    framestack=model_config.get("framestack"))
-                if monitor_path:
-                    env = gym.wrappers.Monitor(env, monitor_path, resume=True)
-                return env
-        else:
-
-            def wrap(env):
-                if monitor_path:
-                    env = gym.wrappers.Monitor(env, monitor_path, resume=True)
-                return env
-
-        self.env = wrap(self.env)
-
-        def make_env(vector_index):
-            return wrap(
-                env_creator(
-                    env_context.copy_with_overrides(
-                        vector_index=vector_index, remote=remote_worker_envs)))
-
-        self.tf_sess = None
-        policy_dict = _validate_and_canonicalize(policy, self.env)
-        self.policies_to_train = policies_to_train or list(policy_dict.keys())
-        # set numpy and python seed
-        if seed is not None:
-            np.random.seed(seed)
-            random.seed(seed)
-            if not hasattr(self.env, "seed"):
-                raise ValueError("Env doesn't support env.seed(): {}".format(
-                    self.env))
-            self.env.seed(seed)
-            try:
-                import torch
-                torch.manual_seed(seed)
-            except ImportError:
-                logger.info("Could not seed torch")
-        if _has_tensorflow_graph(policy_dict):
-            if (ray.is_initialized()
-                    and ray.worker._mode() != ray.worker.LOCAL_MODE
-                    and not ray.get_gpu_ids()):
-                logger.debug("Creating policy evaluation worker {}".format(
-                    worker_index) +
-                             " on CPU (please ignore any CUDA init errors)")
-            if not tf:
-                raise ImportError("Could not import tensorflow")
-            with tf.Graph().as_default():
-                if tf_session_creator:
-                    self.tf_sess = tf_session_creator()
-                else:
-                    self.tf_sess = tf.Session(
-                        config=tf.ConfigProto(
-                            gpu_options=tf.GPUOptions(allow_growth=True)))
-                with self.tf_sess.as_default():
-                    # set graph-level seed
-                    if seed is not None:
-                        tf.set_random_seed(seed)
-                    self.policy_map, self.preprocessors = \
-                        self._build_policy_map(policy_dict, policy_config)
-        else:
-            self.policy_map, self.preprocessors = self._build_policy_map(
-                policy_dict, policy_config)
-
-        self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
-        if self.multiagent:
-            if not ((isinstance(self.env, MultiAgentEnv)
-                     or isinstance(self.env, ExternalMultiAgentEnv))
-                    or isinstance(self.env, BaseEnv)):
-                raise ValueError(
-                    "Have multiple policies {}, but the env ".format(
-                        self.policy_map) +
-                    "{} is not a subclass of BaseEnv, MultiAgentEnv or "
-                    "ExternalMultiAgentEnv?".format(self.env))
-
-        self.filters = {
-            policy_id: get_filter(observation_filter,
-                                  policy.observation_space.shape)
-            for (policy_id, policy) in self.policy_map.items()
-        }
-        if self.worker_index == 0:
-            logger.info("Built filter map: {}".format(self.filters))
-
-        # Always use vector env for consistency even if num_envs = 1
-        self.async_env = BaseEnv.to_base_env(
-            self.env,
-            make_env=make_env,
-            num_envs=num_envs,
-            remote_envs=remote_worker_envs,
-            remote_env_batch_wait_ms=remote_env_batch_wait_ms)
-        self.num_envs = num_envs
-
-        if self.batch_mode == "truncate_episodes":
-            unroll_length = batch_steps
-            pack_episodes = True
-        elif self.batch_mode == "complete_episodes":
-            unroll_length = float("inf")  # never cut episodes
-            pack_episodes = False  # sampler will return 1 episode per poll
-        else:
-            raise ValueError("Unsupported batch mode: {}".format(
-                self.batch_mode))
-
-        self.io_context = IOContext(log_dir, policy_config, worker_index, self)
-        self.reward_estimators = []
-        for method in input_evaluation:
-            if method == "simulation":
-                logger.warning(
-                    "Requested 'simulation' input evaluation method: "
-                    "will discard all sampler outputs and keep only metrics.")
-                sample_async = True
-            elif method == "is":
-                ise = ImportanceSamplingEstimator.create(self.io_context)
-                self.reward_estimators.append(ise)
-            elif method == "wis":
-                wise = WeightedImportanceSamplingEstimator.create(
-                    self.io_context)
-                self.reward_estimators.append(wise)
-            else:
-                raise ValueError(
-                    "Unknown evaluation method: {}".format(method))
-
-        if sample_async:
-            self.sampler = AsyncSampler(
-                self.async_env,
-                self.policy_map,
-                policy_mapping_fn,
-                self.preprocessors,
-                self.filters,
-                clip_rewards,
-                unroll_length,
-                self.callbacks,
-                horizon=episode_horizon,
-                pack=pack_episodes,
-                tf_sess=self.tf_sess,
-                clip_actions=clip_actions,
-                blackhole_outputs="simulation" in input_evaluation,
-                soft_horizon=soft_horizon,
-                no_done_at_end=no_done_at_end)
-            self.sampler.start()
-        else:
-            self.sampler = SyncSampler(
-                self.async_env,
-                self.policy_map,
-                policy_mapping_fn,
-                self.preprocessors,
-                self.filters,
-                clip_rewards,
-                unroll_length,
-                self.callbacks,
-                horizon=episode_horizon,
-                pack=pack_episodes,
-                tf_sess=self.tf_sess,
-                clip_actions=clip_actions,
-                soft_horizon=soft_horizon,
-                no_done_at_end=no_done_at_end)
-
-        self.input_reader = input_creator(self.io_context)
-        assert isinstance(self.input_reader, InputReader), self.input_reader
-        self.output_writer = output_creator(self.io_context)
-        assert isinstance(self.output_writer, OutputWriter), self.output_writer
-
-        logger.debug(
-            "Created rollout worker with env {} ({}), policies {}".format(
-                self.async_env, self.env, self.policy_map))
-
-    @override(EvaluatorInterface)
-    def sample(self):
-        """Evaluate the current policies and return a batch of experiences.
-
-        Return:
-            SampleBatch|MultiAgentBatch from evaluating the current policies.
-        """
-
-        if self._fake_sampler and self.last_batch is not None:
-            return self.last_batch
-
-        if log_once("sample_start"):
-            logger.info("Generating sample batch of size {}".format(
-                self.sample_batch_size))
-
-        batches = [self.input_reader.next()]
-        steps_so_far = batches[0].count
-
-        # In truncate_episodes mode, never pull more than 1 batch per env.
-        # This avoids over-running the target batch size.
-        if self.batch_mode == "truncate_episodes":
-            max_batches = self.num_envs
-        else:
-            max_batches = float("inf")
-
-        while steps_so_far < self.sample_batch_size and len(
-                batches) < max_batches:
-            batch = self.input_reader.next()
-            steps_so_far += batch.count
-            batches.append(batch)
-        batch = batches[0].concat_samples(batches)
-
-        if self.callbacks.get("on_sample_end"):
-            self.callbacks["on_sample_end"]({"worker": self, "samples": batch})
-
-        # Always do writes prior to compression for consistency and to allow
-        # for better compression inside the writer.
-        self.output_writer.write(batch)
-
-        # Do off-policy estimation if needed
-        if self.reward_estimators:
-            for sub_batch in batch.split_by_episode():
-                for estimator in self.reward_estimators:
-                    estimator.process(sub_batch)
-
-        if log_once("sample_end"):
-            logger.info("Completed sample batch:\n\n{}\n".format(
-                summarize(batch)))
-
-        if self.compress_observations == "bulk":
-            batch.compress(bulk=True)
-        elif self.compress_observations:
-            batch.compress()
-
-        if self._fake_sampler:
-            self.last_batch = batch
-        return batch
-
-    @DeveloperAPI
-    @ray.method(num_return_vals=2)
-    def sample_with_count(self):
-        """Same as sample() but returns the count as a separate future."""
-        batch = self.sample()
-        return batch, batch.count
-
-    @override(EvaluatorInterface)
-    def get_weights(self, policies=None):
-        if policies is None:
-            policies = self.policy_map.keys()
-        return {
-            pid: policy.get_weights()
-            for pid, policy in self.policy_map.items() if pid in policies
-        }
-
-    @override(EvaluatorInterface)
-    def set_weights(self, weights):
-        for pid, w in weights.items():
-            self.policy_map[pid].set_weights(w)
-
-    @override(EvaluatorInterface)
-    def compute_gradients(self, samples):
-        if log_once("compute_gradients"):
-            logger.info("Compute gradients on:\n\n{}\n".format(
-                summarize(samples)))
-        if isinstance(samples, MultiAgentBatch):
-            grad_out, info_out = {}, {}
-            if self.tf_sess is not None:
-                builder = TFRunBuilder(self.tf_sess, "compute_gradients")
-                for pid, batch in samples.policy_batches.items():
-                    if pid not in self.policies_to_train:
-                        continue
-                    grad_out[pid], info_out[pid] = (
-                        self.policy_map[pid]._build_compute_gradients(
-                            builder, batch))
-                grad_out = {k: builder.get(v) for k, v in grad_out.items()}
-                info_out = {k: builder.get(v) for k, v in info_out.items()}
-            else:
-                for pid, batch in samples.policy_batches.items():
-                    if pid not in self.policies_to_train:
-                        continue
-                    grad_out[pid], info_out[pid] = (
-                        self.policy_map[pid].compute_gradients(batch))
-        else:
-            grad_out, info_out = (
-                self.policy_map[DEFAULT_POLICY_ID].compute_gradients(samples))
-        info_out["batch_count"] = samples.count
-        if log_once("grad_out"):
-            logger.info("Compute grad info:\n\n{}\n".format(
-                summarize(info_out)))
-        return grad_out, info_out
-
-    @override(EvaluatorInterface)
-    def apply_gradients(self, grads):
-        if log_once("apply_gradients"):
-            logger.info("Apply gradients:\n\n{}\n".format(summarize(grads)))
-        if isinstance(grads, dict):
-            if self.tf_sess is not None:
-                builder = TFRunBuilder(self.tf_sess, "apply_gradients")
-                outputs = {
-                    pid: self.policy_map[pid]._build_apply_gradients(
-                        builder, grad)
-                    for pid, grad in grads.items()
-                }
-                return {k: builder.get(v) for k, v in outputs.items()}
-            else:
-                return {
-                    pid: self.policy_map[pid].apply_gradients(g)
-                    for pid, g in grads.items()
-                }
-        else:
-            return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)
-
-    @override(EvaluatorInterface)
-    def learn_on_batch(self, samples):
-        if log_once("learn_on_batch"):
-            logger.info(
-                "Training on concatenated sample batches:\n\n{}\n".format(
-                    summarize(samples)))
-        if isinstance(samples, MultiAgentBatch):
-            info_out = {}
-            to_fetch = {}
-            if self.tf_sess is not None:
-                builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
-            else:
-                builder = None
-            for pid, batch in samples.policy_batches.items():
-                if pid not in self.policies_to_train:
-                    continue
-                policy = self.policy_map[pid]
-                if builder and hasattr(policy, "_build_learn_on_batch"):
-                    to_fetch[pid] = policy._build_learn_on_batch(
-                        builder, batch)
-                else:
-                    info_out[pid] = policy.learn_on_batch(batch)
-            info_out.update({k: builder.get(v) for k, v in to_fetch.items()})
-        else:
-            info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch(
-                samples)
-        if log_once("learn_out"):
-            logger.info("Training output:\n\n{}\n".format(summarize(info_out)))
-        return info_out
-
-    @DeveloperAPI
-    def get_metrics(self):
-        """Returns a list of new RolloutMetric objects from evaluation."""
-
-        out = self.sampler.get_metrics()
-        for m in self.reward_estimators:
-            out.extend(m.get_metrics())
-        return out
-
-    @DeveloperAPI
-    def foreach_env(self, func):
-        """Apply the given function to each underlying env instance."""
-
-        envs = self.async_env.get_unwrapped()
-        if not envs:
-            return [func(self.async_env)]
-        else:
-            return [func(e) for e in envs]
-
-    @DeveloperAPI
-    def get_policy(self, policy_id=DEFAULT_POLICY_ID):
-        """Return policy for the specified id, or None.
-
-        Arguments:
-            policy_id (str): id of policy to return.
-        """
-
-        return self.policy_map.get(policy_id)
-
-    @DeveloperAPI
-    def for_policy(self, func, policy_id=DEFAULT_POLICY_ID):
-        """Apply the given function to the specified policy."""
-
-        return func(self.policy_map[policy_id])
-
-    @DeveloperAPI
-    def foreach_policy(self, func):
-        """Apply the given function to each (policy, policy_id) tuple."""
-
-        return [func(policy, pid) for pid, policy in self.policy_map.items()]
-
-    @DeveloperAPI
-    def foreach_trainable_policy(self, func):
-        """Apply the given function to each (policy, policy_id) tuple.
-
-        This only applies func to policies in `self.policies_to_train`."""
-
-        return [
-            func(policy, pid) for pid, policy in self.policy_map.items()
-            if pid in self.policies_to_train
-        ]
-
-    @DeveloperAPI
-    def sync_filters(self, new_filters):
-        """Changes self's filter to given and rebases any accumulated delta.
-
-        Args:
-            new_filters (dict): Filters with new state to update local copy.
-        """
-        assert all(k in new_filters for k in self.filters)
-        for k in self.filters:
-            self.filters[k].sync(new_filters[k])
-
-    @DeveloperAPI
-    def get_filters(self, flush_after=False):
-        """Returns a snapshot of filters.
-
-        Args:
-            flush_after (bool): Clears the filter buffer state.
-
-        Returns:
-            return_filters (dict): Dict for serializable filters
-        """
-        return_filters = {}
-        for k, f in self.filters.items():
-            return_filters[k] = f.as_serializable()
-            if flush_after:
-                f.clear_buffer()
-        return return_filters
-
-    @DeveloperAPI
-    def save(self):
-        filters = self.get_filters(flush_after=True)
-        state = {
-            pid: self.policy_map[pid].get_state()
-            for pid in self.policy_map
-        }
-        return pickle.dumps({"filters": filters, "state": state})
-
-    @DeveloperAPI
-    def restore(self, objs):
-        objs = pickle.loads(objs)
-        self.sync_filters(objs["filters"])
-        for pid, state in objs["state"].items():
-            self.policy_map[pid].set_state(state)
-
-    @DeveloperAPI
-    def set_global_vars(self, global_vars):
-        self.foreach_policy(lambda p, _: p.on_global_var_update(global_vars))
-
-    @DeveloperAPI
-    def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID):
-        self.policy_map[policy_id].export_model(export_dir)
-
-    @DeveloperAPI
-    def export_policy_checkpoint(self,
-                                 export_dir,
-                                 filename_prefix="model",
-                                 policy_id=DEFAULT_POLICY_ID):
-        self.policy_map[policy_id].export_checkpoint(export_dir,
-                                                     filename_prefix)
-
-    @DeveloperAPI
-    def stop(self):
-        self.async_env.stop()
-
-    def _build_policy_map(self, policy_dict, policy_config):
-        policy_map = {}
-        preprocessors = {}
-        for name, (cls, obs_space, act_space,
-                   conf) in sorted(policy_dict.items()):
-            logger.debug("Creating policy for {}".format(name))
-            merged_conf = merge_dicts(policy_config, conf)
-            if self.preprocessing_enabled:
-                preprocessor = ModelCatalog.get_preprocessor_for_space(
-                    obs_space, merged_conf.get("model"))
-                preprocessors[name] = preprocessor
-                obs_space = preprocessor.observation_space
-            else:
-                preprocessors[name] = NoPreprocessor(obs_space)
-            if isinstance(obs_space, gym.spaces.Dict) or \
-                    isinstance(obs_space, gym.spaces.Tuple):
-                raise ValueError(
-                    "Found raw Tuple|Dict space as input to policy. "
-                    "Please preprocess these observations with a "
-                    "Tuple|DictFlatteningPreprocessor.")
-            if tf:
-                with tf.variable_scope(name):
-                    policy_map[name] = cls(obs_space, act_space, merged_conf)
-            else:
-                policy_map[name] = cls(obs_space, act_space, merged_conf)
-        if self.worker_index == 0:
-            logger.info("Built policy map: {}".format(policy_map))
-            logger.info("Built preprocessor map: {}".format(preprocessors))
-        return policy_map, preprocessors
-
-    def __del__(self):
-        if hasattr(self, "sampler") and isinstance(self.sampler, AsyncSampler):
-            self.sampler.shutdown = True
-
-
-def _validate_and_canonicalize(policy, env):
-    if isinstance(policy, dict):
-        _validate_multiagent_config(policy)
-        return policy
-    elif not issubclass(policy, Policy):
-        raise ValueError("policy must be a rllib.Policy class")
-    else:
-        if (isinstance(env, MultiAgentEnv)
-                and not hasattr(env, "observation_space")):
-            raise ValueError(
-                "MultiAgentEnv must have observation_space defined if run "
-                "in a single-agent configuration.")
-        return {
-            DEFAULT_POLICY_ID: (policy, env.observation_space,
-                                env.action_space, {})
-        }
-
-
-def _validate_multiagent_config(policy, allow_none_graph=False):
-    for k, v in policy.items():
-        if not isinstance(k, str):
-            raise ValueError("policy keys must be strs, got {}".format(
-                type(k)))
-        if not isinstance(v, (tuple, list)) or len(v) != 4:
-            raise ValueError(
-                "policy values must be tuples/lists of "
-                "(cls or None, obs_space, action_space, config), got {}".
-                format(v))
-        if allow_none_graph and v[0] is None:
-            pass
-        elif not issubclass(v[0], Policy):
-            raise ValueError("policy tuple value 0 must be a rllib.Policy "
-                             "class or None, got {}".format(v[0]))
-        if not isinstance(v[1], gym.Space):
-            raise ValueError(
-                "policy tuple value 1 (observation_space) must be a "
-                "gym.Space, got {}".format(type(v[1])))
-        if not isinstance(v[2], gym.Space):
-            raise ValueError("policy tuple value 2 (action_space) must be a "
-                             "gym.Space, got {}".format(type(v[2])))
-        if not isinstance(v[3], dict):
-            raise ValueError("policy tuple value 3 (config) must be a dict, "
-                             "got {}".format(type(v[3])))
-
-
-def _validate_env(env):
-    # allow this as a special case (assumed gym.Env)
-    if hasattr(env, "observation_space") and hasattr(env, "action_space"):
-        return env
-
-    allowed_types = [gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv, BaseEnv]
-    if not any(isinstance(env, tpe) for tpe in allowed_types):
-        raise ValueError(
-            "Returned env should be an instance of gym.Env, MultiAgentEnv, "
-            "ExternalEnv, VectorEnv, or BaseEnv. The provided env creator "
-            "function returned {} ({}).".format(env, type(env)))
-    return env
-
-
-def _has_tensorflow_graph(policy_dict):
-    for policy, _, _, _ in policy_dict.values():
-        if issubclass(policy, TFPolicy):
-            return True
-    return False
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`Implementation of deep deterministic policy gradients (https://arxiv.org/abs/1509.02971), including an Ape-X variant.`
				`@@ -1 +0,0 @@`
				`Code in this package is adapted from https://github.com/openai/baselines/tree/master/baselines/deepq.`
				`@@ -1 +0,0 @@`
				`Implementation of Soft Actor-Critic (https://arxiv.org/abs/1812.05905.pdf).`