[rllib] format with yapf (#2427)

* initial yapf
* manual fix yapf bugs
2026-06-28 03:34:48 +08:00 · 2018-07-19 15:30:36 -07:00
parent 24eb140e07
commit d01dc9e22d
86 changed files with 1276 additions and 978 deletions
@@ -17,9 +17,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch


 def _register_all():
-    for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
-                "APEX_DDPG", "__fake", "__sigmoid_fake_data",
-                "__parameter_tuning"]:
+    for key in [
+            "PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", "APEX_DDPG",
+            "__fake", "__sigmoid_fake_data", "__parameter_tuning"
+    ]:
        from ray.rllib.agents.agent import get_agent_class
        register_trainable(key, get_agent_class(key))

@@ -27,6 +28,12 @@ def _register_all():
 _register_all()

 __all__ = [
-    "PolicyGraph", "TFPolicyGraph", "PolicyEvaluator", "SampleBatch",
-    "AsyncVectorEnv", "MultiAgentEnv", "VectorEnv", "ServingEnv",
+    "PolicyGraph",
+    "TFPolicyGraph",
+    "PolicyEvaluator",
+    "SampleBatch",
+    "AsyncVectorEnv",
+    "MultiAgentEnv",
+    "VectorEnv",
+    "ServingEnv",
 ]
@@ -92,15 +92,15 @@ class A3CAgent(Agent):
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_gpus": 1 if self.config["use_gpu_for_workers"] else 0})
-        self.optimizer = AsyncGradientsOptimizer(
-            self.local_evaluator, self.remote_evaluators,
-            self.config["optimizer"])
+        self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
+                                                 self.remote_evaluators,
+                                                 self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
-        FilterManager.synchronize(
-            self.local_evaluator.filters, self.remote_evaluators)
+        FilterManager.synchronize(self.local_evaluator.filters,
+                                  self.remote_evaluators)
        result = self.optimizer.collect_metrics()
        result = result._replace(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
@@ -14,19 +14,23 @@ from ray.rllib.models.catalog import ModelCatalog


 class A3CLoss(object):
-    def __init__(
-            self, action_dist, actions, advantages, v_target, vf,
-            vf_loss_coeff=0.5, entropy_coeff=-0.01):
+    def __init__(self,
+                 action_dist,
+                 actions,
+                 advantages,
+                 v_target,
+                 vf,
+                 vf_loss_coeff=0.5,
+                 entropy_coeff=-0.01):
        log_prob = action_dist.logp(actions)

        # The "policy gradients" loss
-        self.pi_loss = - tf.reduce_sum(log_prob * advantages)
+        self.pi_loss = -tf.reduce_sum(log_prob * advantages)

        delta = vf - v_target
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
        self.entropy = tf.reduce_sum(action_dist.entropy())
-        self.total_loss = (self.pi_loss +
-                           self.vf_loss * vf_loss_coeff +
+        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                           self.entropy * entropy_coeff)


@@ -41,8 +45,8 @@ class A3CPolicyGraph(TFPolicyGraph):
            tf.float32, [None] + list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
-        self.model = ModelCatalog.get_model(
-            self.observations, logit_dim, self.config["model"])
+        self.model = ModelCatalog.get_model(self.observations, logit_dim,
+                                            self.config["model"])
        action_dist = dist_class(self.model.outputs)
        self.vf = tf.reshape(
            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
@@ -62,9 +66,9 @@ class A3CPolicyGraph(TFPolicyGraph):
                    action_space))
        advantages = tf.placeholder(tf.float32, [None], name="advantages")
        v_target = tf.placeholder(tf.float32, [None], name="v_target")
-        self.loss = A3CLoss(
-            action_dist, actions, advantages, v_target, self.vf,
-            self.config["vf_loss_coeff"], self.config["entropy_coeff"])
+        self.loss = A3CLoss(action_dist, actions, advantages, v_target,
+                            self.vf, self.config["vf_loss_coeff"],
+                            self.config["entropy_coeff"])

        # Initialize TFPolicyGraph
        loss_in = [
@@ -76,10 +80,16 @@ class A3CPolicyGraph(TFPolicyGraph):
        self.state_in = self.model.state_in
        self.state_out = self.model.state_out
        TFPolicyGraph.__init__(
-            self, observation_space, action_space, self.sess,
-            obs_input=self.observations, action_sampler=action_dist.sample(),
-            loss=self.loss.total_loss, loss_inputs=loss_in,
-            state_inputs=self.state_in, state_outputs=self.state_out,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
+            obs_input=self.observations,
+            action_sampler=action_dist.sample(),
+            loss=self.loss.total_loss,
+            loss_inputs=loss_in,
+            state_inputs=self.state_in,
+            state_outputs=self.state_out,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"])

@@ -132,5 +142,5 @@ class A3CPolicyGraph(TFPolicyGraph):
            for i in range(len(self.state_in)):
                next_state.append([sample_batch["state_out_{}".format(i)][-1]])
            last_r = self.value(sample_batch["new_obs"][-1], *next_state)
-        return compute_advantages(
-            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
+        return compute_advantages(sample_batch, last_r, self.config["gamma"],
+                                  self.config["lambda"])
@@ -46,20 +46,21 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
            action_space, self.config["model"])
        self.model = ModelCatalog.get_torch_model(
            obs_space.shape, self.logit_dim, self.config["model"])
-        loss = A3CLoss(
-            self.model, self.config["vf_loss_coeff"],
-            self.config["entropy_coeff"])
+        loss = A3CLoss(self.model, self.config["vf_loss_coeff"],
+                       self.config["entropy_coeff"])
        TorchPolicyGraph.__init__(
-            self, obs_space, action_space, self.model, loss,
-            loss_inputs=[
-                "obs", "actions", "advantages", "value_targets"])
+            self,
+            obs_space,
+            action_space,
+            self.model,
+            loss,
+            loss_inputs=["obs", "actions", "advantages", "value_targets"])

    def extra_action_out(self, model_out):
        return {"vf_preds": var_to_np(model_out[1])}

    def optimizer(self):
-        return torch.optim.Adam(
-            self.model.parameters(), lr=self.config["lr"])
+        return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        completed = sample_batch["dones"][-1]
@@ -67,8 +68,8 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
            last_r = 0.0
        else:
            last_r = self._value(sample_batch["new_obs"][-1])
-        return compute_advantages(
-            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
+        return compute_advantages(sample_batch, last_r, self.config["gamma"],
+                                  self.config["lambda"])

    def _value(self, obs):
        with self.lock:
@@ -47,7 +47,9 @@ COMMON_CONFIG = {
            "allow_growth": True,
        },
        "log_device_placement": False,
-        "device_count": {"CPU": 1},
+        "device_count": {
+            "CPU": 1
+        },
        "allow_soft_placement": True,  # required by PPO multi-gpu
    },
    # Whether to LZ4 compress observations
@@ -86,8 +88,7 @@ def _deep_update(original, new_dict, new_keys_allowed, whitelist):
    for k, value in new_dict.items():
        if k not in original and k != "env":
            if not new_keys_allowed:
-                raise Exception(
-                    "Unknown config parameter `{}` ".format(k))
+                raise Exception("Unknown config parameter `{}` ".format(k))
        if type(original.get(k)) is dict:
            if k in whitelist:
                _deep_update(original[k], value, True, [])
@@ -112,22 +113,24 @@ class Agent(Trainable):

    _allow_unknown_configs = False
    _allow_unknown_subkeys = [
-        "tf_session_args", "env_config", "model", "optimizer", "multiagent"]
+        "tf_session_args", "env_config", "model", "optimizer", "multiagent"
+    ]

    def make_local_evaluator(self, env_creator, policy_graph):
        """Convenience method to return configured local evaluator."""

-        return self._make_evaluator(
-            PolicyEvaluator, env_creator, policy_graph, 0)
+        return self._make_evaluator(PolicyEvaluator, env_creator, policy_graph,
+                                    0)

-    def make_remote_evaluators(
-            self, env_creator, policy_graph, count, remote_args):
+    def make_remote_evaluators(self, env_creator, policy_graph, count,
+                               remote_args):
        """Convenience method to return a number of remote evaluators."""

        cls = PolicyEvaluator.as_remote(**remote_args).remote
        return [
-            self._make_evaluator(cls, env_creator, policy_graph, i+1)
-            for i in range(count)]
+            self._make_evaluator(cls, env_creator, policy_graph, i + 1)
+            for i in range(count)
+        ]

    def _make_evaluator(self, cls, env_creator, policy_graph, worker_index):
        config = self.config
@@ -140,8 +143,8 @@ class Agent(Trainable):
            env_creator,
            self.config["multiagent"]["policy_graphs"] or policy_graph,
            policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
-            tf_session_creator=(
-                session_creator if config["tf_session_args"] else None),
+            tf_session_creator=(session_creator
+                                if config["tf_session_args"] else None),
            batch_steps=config["sample_batch_size"],
            batch_mode=config["batch_mode"],
            episode_horizon=config["horizon"],
@@ -157,14 +160,12 @@ class Agent(Trainable):

    @classmethod
    def resource_help(cls, config):
-        return (
-            "\n\nYou can adjust the resource requests of RLlib agents by "
-            "setting `num_workers` and other configs. See the "
-            "DEFAULT_CONFIG defined by each agent for more info.\n\n"
-            "The config of this agent is: " + json.dumps(config))
+        return ("\n\nYou can adjust the resource requests of RLlib agents by "
+                "setting `num_workers` and other configs. See the "
+                "DEFAULT_CONFIG defined by each agent for more info.\n\n"
+                "The config of this agent is: " + json.dumps(config))

-    def __init__(
-            self, config=None, env=None, logger_creator=None):
+    def __init__(self, config=None, env=None, logger_creator=None):
        """Initialize an RLLib agent.

        Args:
@@ -235,8 +236,8 @@ class Agent(Trainable):
        obs = self.local_evaluator.filters["default"](
            observation, update=False)
        return self.local_evaluator.for_policy(
-            lambda p: p.compute_single_action(
-                obs, state, is_training=False)[0])
+            lambda p: p.compute_single_action(obs, state, is_training=False)[0]
+        )


 class _MockAgent(Agent):
@@ -257,8 +258,10 @@ class _MockAgent(Agent):
                and (self.config["persistent_error"] or not self.restored):
            raise Exception("mock error")
        return TrainingResult(
-            episode_reward_mean=10, episode_len_mean=10,
-            timesteps_this_iter=10, info={})
+            episode_reward_mean=10,
+            episode_len_mean=10,
+            timesteps_this_iter=10,
+            info={})

    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "mock_agent.pkl")
@@ -299,9 +302,11 @@ class _SigmoidFakeData(_MockAgent):
        v = np.tanh(float(i) / self.config["width"])
        v *= self.config["height"]
        return TrainingResult(
-            episode_reward_mean=v, episode_len_mean=v,
+            episode_reward_mean=v,
+            episode_len_mean=v,
            timesteps_this_iter=self.config["iter_timesteps"],
-            time_this_iter_s=self.config["iter_time"], info={})
+            time_this_iter_s=self.config["iter_time"],
+            info={})


 class _ParameterTuningAgent(_MockAgent):
@@ -320,7 +325,8 @@ class _ParameterTuningAgent(_MockAgent):
            episode_reward_mean=self.config["reward_amt"] * self.iteration,
            episode_len_mean=self.config["reward_amt"],
            timesteps_this_iter=self.config["iter_timesteps"],
-            time_this_iter_s=self.config["iter_time"], info={})
+            time_this_iter_s=self.config["iter_time"],
+            info={})


 def get_agent_class(alg):
@@ -363,5 +369,4 @@ def get_agent_class(alg):
    elif alg == "__parameter_tuning":
        return _ParameterTuningAgent
    else:
-        raise Exception(
-            ("Unknown algorithm {}.").format(alg))
+        raise Exception(("Unknown algorithm {}.").format(alg))
@@ -57,28 +57,31 @@ class BCAgent(Agent):
        else:
            num_gpus_per_worker = 0
        return Resources(
-            cpu=1, gpu=cf["gpu"] and 1 or 0,
+            cpu=1,
+            gpu=cf["gpu"] and 1 or 0,
            extra_cpu=cf["num_workers"],
            extra_gpu=num_gpus_per_worker * cf["num_workers"])

    def _init(self):
-        self.local_evaluator = BCEvaluator(
-            self.env_creator, self.config, self.logdir)
+        self.local_evaluator = BCEvaluator(self.env_creator, self.config,
+                                           self.logdir)
        if self.config["use_gpu_for_workers"]:
            remote_cls = GPURemoteBCEvaluator
        else:
            remote_cls = RemoteBCEvaluator
        self.remote_evaluators = [
            remote_cls.remote(self.env_creator, self.config, self.logdir)
-            for _ in range(self.config["num_workers"])]
-        self.optimizer = AsyncGradientsOptimizer(
-            self.local_evaluator, self.remote_evaluators,
-            self.config["optimizer"])
+            for _ in range(self.config["num_workers"])
+        ]
+        self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
+                                                 self.remote_evaluators,
+                                                 self.config["optimizer"])

    def _train(self):
        self.optimizer.step()
-        metric_lists = [re.get_metrics.remote() for re in
-                        self.remote_evaluators]
+        metric_lists = [
+            re.get_metrics.remote() for re in self.remote_evaluators
+        ]
        total_samples = 0
        total_loss = 0
        for metrics in metric_lists:
@@ -14,8 +14,8 @@ from ray.rllib.models import ModelCatalog

 class BCEvaluator(EvaluatorInterface):
    def __init__(self, env_creator, config, logdir):
-        env = ModelCatalog.get_preprocessor_as_wrapper(env_creator(
-            config["env_config"]), config["model"])
+        env = ModelCatalog.get_preprocessor_as_wrapper(
+            env_creator(config["env_config"]), config["model"])
        self.dataset = ExperienceDataset(config["dataset_path"])
        self.policy = BCPolicy(env.observation_space, env.action_space, config)
        self.config = config
@@ -27,8 +27,10 @@ class BCEvaluator(EvaluatorInterface):

    def compute_gradients(self, samples):
        gradient, info = self.policy.compute_gradients(samples)
-        self.metrics_queue.put(
-            {"num_samples": info["num_samples"], "loss": info["loss"]})
+        self.metrics_queue.put({
+            "num_samples": info["num_samples"],
+            "loss": info["loss"]
+        })
        return gradient, {}

    def apply_gradients(self, grads):
@@ -42,8 +44,7 @@ class BCEvaluator(EvaluatorInterface):

    def save(self):
        weights = self.get_weights()
-        return pickle.dumps({
-            "weights": weights})
+        return pickle.dumps({"weights": weights})

    def restore(self, objs):
        objs = pickle.loads(objs)
@@ -21,8 +21,9 @@ class ExperienceDataset(object):
            elements.
          The file must be available on each machine used by a BCEvaluator.
        """
-        self._dataset = list(itertools.chain.from_iterable(
-            pickle.load(open(dataset_path, "rb"))))
+        self._dataset = list(
+            itertools.chain.from_iterable(
+                pickle.load(open(dataset_path, "rb"))))

    def sample(self, batch_size):
        indexes = np.random.choice(len(self._dataset), batch_size)
@@ -23,8 +23,8 @@ class BCPolicy(object):
        self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(
            ac_space, self.config["model"])
-        self._model = ModelCatalog.get_model(
-            self.x, self.logit_dim, self.config["model"])
+        self._model = ModelCatalog.get_model(self.x, self.logit_dim,
+                                             self.config["model"])
        self.logits = self._model.outputs
        self.curr_dist = dist_class(self.logits)
        self.sample = self.curr_dist.sample()
@@ -33,17 +33,16 @@ class BCPolicy(object):

    def setup_loss(self, action_space):
        if isinstance(action_space, gym.spaces.Box):
-            self.ac = tf.placeholder(tf.float32,
-                                     [None] + list(action_space.shape),
-                                     name="ac")
+            self.ac = tf.placeholder(
+                tf.float32, [None] + list(action_space.shape), name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            self.ac = tf.placeholder(tf.int64, [None], name="ac")
        else:
-            raise NotImplementedError(
-                "action space" + str(type(action_space)) +
-                "currently not supported")
+            raise NotImplementedError("action space" +
+                                      str(type(action_space)) +
+                                      "currently not supported")
        log_prob = self.curr_dist.logp(self.ac)
-        self.pi_loss = - tf.reduce_sum(log_prob)
+        self.pi_loss = -tf.reduce_sum(log_prob)
        self.loss = self.pi_loss

    def setup_gradients(self):
@@ -62,11 +61,14 @@ class BCPolicy(object):
            self.summary_op = tf.summary.merge_all()

        # TODO(rliaw): Can consider exposing these parameters
-        self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
-            intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
-            gpu_options=tf.GPUOptions(allow_growth=True)))
-        self.variables = ray.experimental.TensorFlowVariables(self.loss,
-                                                              self.sess)
+        self.sess = tf.Session(
+            graph=self.g,
+            config=tf.ConfigProto(
+                intra_op_parallelism_threads=1,
+                inter_op_parallelism_threads=2,
+                gpu_options=tf.GPUOptions(allow_growth=True)))
+        self.variables = ray.experimental.TensorFlowVariables(
+            self.loss, self.sess)
        self.sess.run(tf.global_variables_initializer())

    def compute_gradients(self, samples):
@@ -82,15 +84,14 @@ class BCPolicy(object):
                [self.loss, self.grads, self.summary_op], feed_dict=feed_dict)
            info["summary"] = summ
        else:
-            loss, grad = self.sess.run([self.loss, self.grads],
-                                       feed_dict=feed_dict)
+            loss, grad = self.sess.run(
+                [self.loss, self.grads], feed_dict=feed_dict)
        info["num_samples"] = len(samples)
        info["loss"] = loss
        return grad, info

    def apply_gradients(self, grads):
-        feed_dict = {self.grads[i]: grads[i]
-                     for i in range(len(grads))}
+        feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))}
        self.sess.run(self._apply_gradients, feed_dict=feed_dict)

    def get_weights(self):
@@ -9,13 +9,12 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,
    {
        "optimizer_class": "AsyncSamplesOptimizer",
-        "optimizer":
-            merge_dicts(
-                DDPG_CONFIG["optimizer"], {
-                    "max_weight_sync_delay": 400,
-                    "num_replay_buffer_shards": 4,
-                    "debug": False
-                }),
+        "optimizer": merge_dicts(
+            DDPG_CONFIG["optimizer"], {
+                "max_weight_sync_delay": 400,
+                "num_replay_buffer_shards": 4,
+                "debug": False
+            }),
        "n_step": 3,
        "num_workers": 32,
        "buffer_size": 2000000,
@@ -118,9 +118,9 @@ class DDPGAgent(DQNAgent):
        if self.config["per_worker_exploration"]:
            assert self.config["num_workers"] > 1, \
                "This requires multiple workers"
-            return ConstantSchedule(
-                self.config["noise_scale"] * 0.4 **
-                (1 + worker_index / float(self.config["num_workers"] - 1) * 7))
+            exponent = (
+                1 + worker_index / float(self.config["num_workers"] - 1) * 7)
+            return ConstantSchedule(self.config["noise_scale"] * 0.4**exponent)
        else:
            return LinearSchedule(
                schedule_timesteps=int(self.config["exploration_fraction"] *
@@ -14,7 +14,6 @@ from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph

-
 A_SCOPE = "a_func"
 P_SCOPE = "p_func"
 P_TARGET_SCOPE = "target_p_func"
@@ -26,8 +25,8 @@ class PNetwork(object):
    """Maps an observations (i.e., state) to an action where each entry takes
    value from (0, 1) due to the sigmoid function."""

-    def __init__(
-            self, model, dim_actions, hiddens=[64, 64], activation="relu"):
+    def __init__(self, model, dim_actions, hiddens=[64, 64],
+                 activation="relu"):
        action_out = model.last_layer
        activation = tf.nn.__dict__[activation]
        for hidden in hiddens:
@@ -44,9 +43,14 @@ class ActionNetwork(object):
    for training, thus ignoring the batch_size issue when constructing a
    stochastic action."""

-    def __init__(
-            self, p_values, low_action, high_action, stochastic, eps,
-            theta=0.15, sigma=0.2):
+    def __init__(self,
+                 p_values,
+                 low_action,
+                 high_action,
+                 stochastic,
+                 eps,
+                 theta=0.15,
+                 sigma=0.2):

        # shape is [None, dim_action]
        deterministic_actions = (
@@ -65,15 +69,16 @@ class ActionNetwork(object):
        stochastic_actions = deterministic_actions + eps * (
            high_action - low_action) * exploration_value

-        self.actions = tf.cond(
-            stochastic, lambda: stochastic_actions,
-            lambda: deterministic_actions)
+        self.actions = tf.cond(stochastic, lambda: stochastic_actions,
+                               lambda: deterministic_actions)


 class QNetwork(object):
-    def __init__(
-            self, model, action_inputs,
-            hiddens=[64, 64], activation="relu"):
+    def __init__(self,
+                 model,
+                 action_inputs,
+                 hiddens=[64, 64],
+                 activation="relu"):
        q_out = tf.concat([model.last_layer, action_inputs], axis=1)
        activation = tf.nn.__dict__[activation]
        for hidden in hiddens:
@@ -84,14 +89,21 @@ class QNetwork(object):


 class ActorCriticLoss(object):
-    def __init__(
-            self, q_t, q_tp1, q_tp0, importance_weights, rewards, done_mask,
-            gamma=0.99, n_step=1, use_huber=False, huber_threshold=1.0):
+    def __init__(self,
+                 q_t,
+                 q_tp1,
+                 q_tp0,
+                 importance_weights,
+                 rewards,
+                 done_mask,
+                 gamma=0.99,
+                 n_step=1,
+                 use_huber=False,
+                 huber_threshold=1.0):

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)

-        q_tp1_best = tf.squeeze(
-            input=q_tp1, axis=len(q_tp1.shape) - 1)
+        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
@@ -131,27 +143,20 @@ class DDPGPolicyGraph(TFPolicyGraph):

        def _build_q_network(obs, actions):
            return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]),
-                actions,
+                ModelCatalog.get_model(obs, 1, config["model"]), actions,
                config["critic_hiddens"],
                config["critic_hidden_activation"]).value

        def _build_p_network(obs):
            return PNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]),
-                dim_actions,
+                ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
                config["actor_hiddens"],
                config["actor_hidden_activation"]).action_scores

        def _build_action_network(p_values, stochastic, eps):
-            return ActionNetwork(
-                p_values,
-                low_action,
-                high_action,
-                stochastic,
-                eps,
-                config["exploration_theta"],
-                config["exploration_sigma"]).actions
+            return ActionNetwork(p_values, low_action, high_action, stochastic,
+                                 eps, config["exploration_theta"],
+                                 config["exploration_sigma"]).actions

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -263,9 +268,13 @@ class DDPGPolicyGraph(TFPolicyGraph):
            ("weights", self.importance_weights),
        ]
        TFPolicyGraph.__init__(
-            self, observation_space, action_space, self.sess,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
            obs_input=self.cur_observations,
-            action_sampler=self.output_actions, loss=self.loss.total_loss,
+            action_sampler=self.output_actions,
+            loss=self.loss.total_loss,
            loss_inputs=self.loss_inputs)
        self.sess.run(tf.global_variables_initializer())

@@ -294,10 +303,10 @@ class DDPGPolicyGraph(TFPolicyGraph):
                self.loss.actor_loss, var_list=self.p_func_vars)
            critic_grads_and_vars = self.critic_optimizer.compute_gradients(
                self.loss.critic_loss, var_list=self.q_func_vars)
-        actor_grads_and_vars = [
-            (g, v) for (g, v) in actor_grads_and_vars if g is not None]
-        critic_grads_and_vars = [
-            (g, v) for (g, v) in critic_grads_and_vars if g is not None]
+        actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
+                                if g is not None]
+        critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
+                                 if g is not None]
        grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
        return grads_and_vars

@@ -10,13 +10,12 @@ APEX_DEFAULT_CONFIG = merge_dicts(
    DQN_CONFIG,
    {
        "optimizer_class": "AsyncSamplesOptimizer",
-        "optimizer":
-            merge_dicts(
-                DQN_CONFIG["optimizer"], {
-                    "max_weight_sync_delay": 400,
-                    "num_replay_buffer_shards": 4,
-                    "debug": False
-                }),
+        "optimizer": merge_dicts(
+            DQN_CONFIG["optimizer"], {
+                "max_weight_sync_delay": 400,
+                "num_replay_buffer_shards": 4,
+                "debug": False
+            }),
        "n_step": 3,
        "gpu": True,
        "num_workers": 32,
@@ -13,11 +13,11 @@ from ray.rllib.evaluation.metrics import collect_metrics
 from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
 from ray.tune.trial import Resources

-
 OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
    "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
-    "train_batch_size", "learning_starts", "clip_rewards"]
+    "train_batch_size", "learning_starts", "clip_rewards"
+]

 DEFAULT_CONFIG = with_common_config({
    # === Model ===
@@ -110,7 +110,8 @@ class DQNAgent(Agent):
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(
-            cpu=1, gpu=cf["gpu"] and 1 or 0,
+            cpu=1,
+            gpu=cf["gpu"] and 1 or 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

@@ -123,7 +124,8 @@ class DQNAgent(Agent):
        self.exploration0 = self._make_exploration_schedule(0)
        self.explorations = [
            self._make_exploration_schedule(i)
-            for i in range(self.config["num_workers"])]
+            for i in range(self.config["num_workers"])
+        ]

        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
@@ -132,9 +134,10 @@ class DQNAgent(Agent):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
-            self.env_creator, self._policy_graph, self.config["num_workers"],
-            {"num_cpus": self.config["num_cpus_per_worker"],
-             "num_gpus": self.config["num_gpus_per_worker"]})
+            self.env_creator, self._policy_graph, self.config["num_workers"], {
+                "num_cpus": self.config["num_cpus_per_worker"],
+                "num_gpus": self.config["num_gpus_per_worker"]
+            })
        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])
@@ -147,14 +150,12 @@ class DQNAgent(Agent):
        if self.config["per_worker_exploration"]:
            assert self.config["num_workers"] > 1, \
                "This requires multiple workers"
-            return ConstantSchedule(
-                0.4 ** (
-                    1 + worker_index / float(
-                        self.config["num_workers"] - 1) * 7))
+            exponent = (
+                1 + worker_index / float(self.config["num_workers"] - 1) * 7)
+            return ConstantSchedule(0.4**exponent)
        return LinearSchedule(
-            schedule_timesteps=int(
-                self.config["exploration_fraction"] *
-                self.config["schedule_max_timesteps"]),
+            schedule_timesteps=int(self.config["exploration_fraction"] *
+                                   self.config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=self.config["exploration_final_eps"])

@@ -191,8 +192,8 @@ class DQNAgent(Agent):
                self.local_evaluator,
                self.remote_evaluators[-len(self.remote_evaluators) // 3:])
        else:
-            result = collect_metrics(
-                self.local_evaluator, self.remote_evaluators)
+            result = collect_metrics(self.local_evaluator,
+                                     self.remote_evaluators)

        return result._replace(
            timesteps_this_iter=self.global_timestep - start_timestep,
@@ -208,14 +209,14 @@ class DQNAgent(Agent):
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
-        checkpoint_path = os.path.join(
-            checkpoint_dir, "checkpoint-{}".format(self.iteration))
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "checkpoint-{}".format(self.iteration))
        extra_data = [
            self.local_evaluator.save(),
            ray.get([e.save.remote() for e in self.remote_evaluators]),
-            self.optimizer.save(),
-            self.num_target_updates,
-            self.last_target_update_ts]
+            self.optimizer.save(), self.num_target_updates,
+            self.last_target_update_ts
+        ]
        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
        return checkpoint_path

@@ -223,8 +224,9 @@ class DQNAgent(Agent):
        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
        self.local_evaluator.restore(extra_data[0])
        ray.get([
-            e.restore.remote(d) for (d, e)
-            in zip(extra_data[1], self.remote_evaluators)])
+            e.restore.remote(d)
+            for (d, e) in zip(extra_data[1], self.remote_evaluators)
+        ])
        self.optimizer.restore(extra_data[2])
        self.num_target_updates = extra_data[3]
        self.last_target_update_ts = extra_data[4]
@@ -13,7 +13,6 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
 from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph

-
 Q_SCOPE = "q_func"
 Q_TARGET_SCOPE = "target_q_func"

@@ -33,7 +32,8 @@ class QNetwork(object):
                state_out = model.last_layer
                for hidden in hiddens:
                    state_out = layers.fully_connected(
-                        state_out, num_outputs=hidden,
+                        state_out,
+                        num_outputs=hidden,
                        activation_fn=tf.nn.relu)
                state_score = layers.fully_connected(
                    state_out, num_outputs=1, activation_fn=None)
@@ -50,26 +50,32 @@ class QValuePolicy(object):
        deterministic_actions = tf.argmax(q_values, axis=1)
        batch_size = tf.shape(observations)[0]
        random_actions = tf.random_uniform(
-            tf.stack([batch_size]), minval=0, maxval=num_actions,
+            tf.stack([batch_size]),
+            minval=0,
+            maxval=num_actions,
            dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
-        stochastic_actions = tf.where(
-            chose_random, random_actions, deterministic_actions)
-        self.action = tf.cond(
-            stochastic, lambda: stochastic_actions,
-            lambda: deterministic_actions)
+        stochastic_actions = tf.where(chose_random, random_actions,
+                                      deterministic_actions)
+        self.action = tf.cond(stochastic, lambda: stochastic_actions,
+                              lambda: deterministic_actions)


 class QLoss(object):
-    def __init__(
-            self, q_t_selected, q_tp1_best, importance_weights, rewards,
-            done_mask, gamma=0.99, n_step=1):
+    def __init__(self,
+                 q_t_selected,
+                 q_tp1_best,
+                 importance_weights,
+                 rewards,
+                 done_mask,
+                 gamma=0.99,
+                 n_step=1):

        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
-        q_t_selected_target = rewards + gamma ** n_step * q_tp1_best_masked
+        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
@@ -91,14 +97,14 @@ class DQNPolicyGraph(TFPolicyGraph):

        def _build_q_network(obs):
            return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]),
-                num_actions, config["dueling"], config["hiddens"]).value
+                ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
+                config["dueling"], config["hiddens"]).value

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
-            tf.float32, shape=(None,) + observation_space.shape)
+            tf.float32, shape=(None, ) + observation_space.shape)

        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
@@ -106,20 +112,17 @@ class DQNPolicyGraph(TFPolicyGraph):
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
-        self.output_actions = QValuePolicy(
-            q_values,
-            self.cur_observations,
-            num_actions,
-            self.stochastic,
-            self.eps).action
+        self.output_actions = QValuePolicy(q_values, self.cur_observations,
+                                           num_actions, self.stochastic,
+                                           self.eps).action

        # Replay inputs
        self.obs_t = tf.placeholder(
-            tf.float32, shape=(None,) + observation_space.shape)
+            tf.float32, shape=(None, ) + observation_space.shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
-            tf.float32, shape=(None,) + observation_space.shape)
+            tf.float32, shape=(None, ) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")
@@ -134,8 +137,8 @@ class DQNPolicyGraph(TFPolicyGraph):
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
-        q_t_selected = tf.reduce_sum(
-            q_t * tf.one_hot(self.act_t, num_actions), 1)
+        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
+                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
@@ -143,20 +146,20 @@ class DQNPolicyGraph(TFPolicyGraph):
                q_tp1_using_online_net = _build_q_network(self.obs_tp1)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
-                q_tp1 * tf.one_hot(
-                    q_tp1_best_using_online_net, num_actions), 1)
+                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
+                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

-        self.loss = QLoss(
-            q_t_selected, q_tp1_best, self.importance_weights,
-            self.rew_t, self.done_mask, config["gamma"], config["n_step"])
+        self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
+                          self.rew_t, self.done_mask, config["gamma"],
+                          config["n_step"])

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        update_target_expr = []
        for var, var_target in zip(
-            sorted(self.q_func_vars, key=lambda v: v.name),
+                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(self.target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        self.update_target_expr = tf.group(*update_target_expr)
@@ -172,9 +175,13 @@ class DQNPolicyGraph(TFPolicyGraph):
            ("weights", self.importance_weights),
        ]
        TFPolicyGraph.__init__(
-            self, observation_space, action_space, self.sess,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
            obs_input=self.cur_observations,
-            action_sampler=self.output_actions, loss=self.loss.loss,
+            action_sampler=self.output_actions,
+            loss=self.loss.loss,
            loss_inputs=self.loss_inputs)
        self.sess.run(tf.global_variables_initializer())

@@ -184,13 +191,14 @@ class DQNPolicyGraph(TFPolicyGraph):
    def gradients(self, optimizer):
        if self.config["grad_norm_clipping"] is not None:
            grads_and_vars = _minimize_and_clip(
-                optimizer, self.loss.loss, var_list=self.q_func_vars,
+                optimizer,
+                self.loss.loss,
+                var_list=self.q_func_vars,
                clip_val=self.config["grad_norm_clipping"])
        else:
            grads_and_vars = optimizer.compute_gradients(
                self.loss.loss, var_list=self.q_func_vars)
-        grads_and_vars = [
-            (g, v) for (g, v) in grads_and_vars if g is not None]
+        grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
        return grads_and_vars

    def extra_compute_action_feed_dict(self):
@@ -207,8 +215,8 @@ class DQNPolicyGraph(TFPolicyGraph):
    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        return _postprocess_dqn(self, sample_batch)

-    def compute_td_error(
-            self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
+    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
+                         importance_weights):
        td_err = self.sess.run(
            self.loss.td_error,
            feed_dict={
@@ -254,7 +262,7 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
            continue  # episode end
        for j in range(1, n_step):
            new_obs[i] = new_obs[i + j]
-            rewards[i] += gamma ** j * rewards[i + j]
+            rewards[i] += gamma**j * rewards[i + j]
            if dones[i + j]:
                break  # episode end
    # truncate ends of the trajectory
@@ -266,24 +274,29 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
 def _postprocess_dqn(policy_graph, sample_batch):
    obs, actions, rewards, new_obs, dones = [
        list(x) for x in sample_batch.columns(
-            ["obs", "actions", "rewards", "new_obs", "dones"])]
+            ["obs", "actions", "rewards", "new_obs", "dones"])
+    ]

    # N-step Q adjustments
    if policy_graph.config["n_step"] > 1:
-        adjust_nstep(
-            policy_graph.config["n_step"], policy_graph.config["gamma"],
-            obs, actions, rewards, new_obs, dones)
+        adjust_nstep(policy_graph.config["n_step"],
+                     policy_graph.config["gamma"], obs, actions, rewards,
+                     new_obs, dones)

    batch = SampleBatch({
-        "obs": obs, "actions": actions, "rewards": rewards,
-        "new_obs": new_obs, "dones": dones,
-        "weights": np.ones_like(rewards)})
+        "obs": obs,
+        "actions": actions,
+        "rewards": rewards,
+        "new_obs": new_obs,
+        "dones": dones,
+        "weights": np.ones_like(rewards)
+    })

    # Prioritize on the worker side
    if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
        td_errors = policy_graph.compute_td_error(
-            batch["obs"], batch["actions"], batch["rewards"],
-            batch["new_obs"], batch["dones"], batch["weights"])
+            batch["obs"], batch["actions"], batch["rewards"], batch["new_obs"],
+            batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities
@@ -295,8 +308,7 @@ def _huber_loss(x, delta=1.0):
    """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
    return tf.where(
        tf.abs(x) < delta,
-        tf.square(x) * 0.5,
-        delta * (tf.abs(x) - 0.5 * delta))
+        tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))


 def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
@@ -20,13 +20,11 @@ from ray.rllib.agents.es import policies
 from ray.rllib.agents.es import tabular_logger as tlogger
 from ray.rllib.agents.es import utils

-
 Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
 ])

-
 DEFAULT_CONFIG = {
    'l2_coeff': 0.005,
    'noise_stdev': 0.02,
@@ -64,7 +62,11 @@ class SharedNoiseTable(object):

@ray.remote
 class Worker(object):
-    def __init__(self, config, policy_params, env_creator, noise,
+    def __init__(self,
+                 config,
+                 policy_params,
+                 env_creator,
+                 noise,
                 min_task_runtime=0.2):
        self.min_task_runtime = min_task_runtime
        self.config = config
@@ -82,7 +84,9 @@ class Worker(object):

    def rollout(self, timestep_limit, add_noise=True):
        rollout_rewards, rollout_length = policies.rollout(
-            self.policy, self.env, timestep_limit=timestep_limit,
+            self.policy,
+            self.env,
+            timestep_limit=timestep_limit,
            add_noise=add_noise)
        return rollout_rewards, rollout_length

@@ -95,8 +99,8 @@ class Worker(object):

        # Perform some rollouts with noise.
        task_tstart = time.time()
-        while (len(noise_indices) == 0 or
-               time.time() - task_tstart < self.min_task_runtime):
+        while (len(noise_indices) == 0
+               or time.time() - task_tstart < self.min_task_runtime):

            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
@@ -122,7 +126,8 @@ class Worker(object):
                noise_indices.append(noise_index)
                returns.append([rewards_pos.sum(), rewards_neg.sum()])
                sign_returns.append(
-                    [np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()])
+                    [np.sign(rewards_pos).sum(),
+                     np.sign(rewards_neg).sum()])
                lengths.append([lengths_pos, lengths_neg])

        return Result(
@@ -146,9 +151,7 @@ class ESAgent(Agent):
        return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])

    def _init(self):
-        policy_params = {
-            "action_noise_std": 0.01
-        }
+        policy_params = {"action_noise_std": 0.01}

        env = self.env_creator(self.config["env_config"])
        from ray.rllib import models
@@ -168,9 +171,9 @@ class ESAgent(Agent):
        # Create the actors.
        print("Creating actors.")
        self.workers = [
-            Worker.remote(
-                self.config, policy_params, self.env_creator, noise_id)
-            for _ in range(self.config["num_workers"])]
+            Worker.remote(self.config, policy_params, self.env_creator,
+                          noise_id) for _ in range(self.config["num_workers"])
+        ]

        self.episodes_so_far = 0
        self.timesteps_so_far = 0
@@ -180,21 +183,20 @@ class ESAgent(Agent):
        num_episodes, num_timesteps = 0, 0
        results = []
        while num_episodes < min_episodes or num_timesteps < min_timesteps:
-            print(
-                "Collected {} episodes {} timesteps so far this iter".format(
-                    num_episodes, num_timesteps))
-            rollout_ids = [worker.do_rollouts.remote(theta_id)
-                           for worker in self.workers]
+            print("Collected {} episodes {} timesteps so far this iter".format(
+                num_episodes, num_timesteps))
+            rollout_ids = [
+                worker.do_rollouts.remote(theta_id) for worker in self.workers
+            ]
            # Get the results of the rollouts.
            for result in ray.get(rollout_ids):
                results.append(result)
                # Update the number of episodes and the number of timesteps
                # keeping in mind that result.noisy_lengths is a list of lists,
                # where the inner lists have length 2.
-                num_episodes += sum(len(pair) for pair
-                                    in result.noisy_lengths)
-                num_timesteps += sum(sum(pair) for pair
-                                     in result.noisy_lengths)
+                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
+                num_timesteps += sum(
+                    sum(pair) for pair in result.noisy_lengths)
        return results, num_episodes, num_timesteps

    def _train(self):
@@ -209,8 +211,7 @@ class ESAgent(Agent):
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
-            theta_id,
-            config["episodes_per_batch"],
+            theta_id, config["episodes_per_batch"],
            config["timesteps_per_batch"])

        all_noise_indices = []
@@ -255,13 +256,11 @@ class ESAgent(Agent):
             for index in noise_indices),
            batch_size=500)
        g /= noisy_returns.size
-        assert (
-            g.shape == (self.policy.num_params,) and
-            g.dtype == np.float32 and
-            count == len(noise_indices))
+        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
+                and count == len(noise_indices))
        # Compute the new weights theta.
-        theta, update_ratio = self.optimizer.update(
-            -g + config["l2_coeff"] * theta)
+        theta, update_ratio = self.optimizer.update(-g +
+                                                    config["l2_coeff"] * theta)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)

@@ -313,13 +312,10 @@ class ESAgent(Agent):
            w.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
-        checkpoint_path = os.path.join(
-            checkpoint_dir, "checkpoint-{}".format(self.iteration))
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "checkpoint-{}".format(self.iteration))
        weights = self.policy.get_weights()
-        objects = [
-            weights,
-            self.episodes_so_far,
-            self.timesteps_so_far]
+        objects = [weights, self.episodes_so_far, self.timesteps_so_far]
        pickle.dump(objects, open(checkpoint_path, "wb"))
        return checkpoint_path

@@ -48,8 +48,8 @@ class Adam(Optimizer):
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
-        a = self.stepsize * (np.sqrt(1 - self.beta2 ** self.t) /
-                             (1 - self.beta1 ** self.t))
+        a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
+                             (1 - self.beta1**self.t))
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
@@ -21,8 +21,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
    noise drawn from that stream. Otherwise, no action noise will be added.
    """
    env_timestep_limit = env.spec.max_episode_steps
-    timestep_limit = (env_timestep_limit if timestep_limit is None
-                      else min(timestep_limit, env_timestep_limit))
+    timestep_limit = (env_timestep_limit if timestep_limit is None else min(
+        timestep_limit, env_timestep_limit))
    rews = []
    t = 0
    observation = env.reset()
@@ -38,16 +38,16 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):


 class GenericPolicy(object):
-    def __init__(self, sess, action_space, preprocessor,
-                 observation_filter, action_noise_std):
+    def __init__(self, sess, action_space, preprocessor, observation_filter,
+                 action_noise_std):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
-        self.observation_filter = get_filter(
-            observation_filter, self.preprocessor.shape)
-        self.inputs = tf.placeholder(
-            tf.float32, [None] + list(self.preprocessor.shape))
+        self.observation_filter = get_filter(observation_filter,
+                                             self.preprocessor.shape)
+        self.inputs = tf.placeholder(tf.float32,
+                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -59,16 +59,16 @@ class GenericPolicy(object):
        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

-        self.num_params = sum(np.prod(variable.shape.as_list())
-                              for _, variable
-                              in self.variables.variables.items())
+        self.num_params = sum(
+            np.prod(variable.shape.as_list())
+            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())

    def compute(self, observation, add_noise=False, update=True):
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
-        action = self.sess.run(self.sampler,
-                               feed_dict={self.inputs: observation})
+        action = self.sess.run(
+            self.sampler, feed_dict={self.inputs: observation})
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action
@@ -25,6 +25,7 @@ DISABLED = 50

 class TbWriter(object):
    """Based on SummaryWriter, but changed to allow for a different prefix."""
+
    def __init__(self, dir, prefix):
        self.dir = dir
        # Start at 1, because EvWriter automatically generates an object with
@@ -34,9 +35,10 @@ class TbWriter(object):
            compat.as_bytes(os.path.join(dir, prefix)))

    def write_values(self, key2val):
-        summary = tf.Summary(value=[tf.Summary.Value(tag=k,
-                                                     simple_value=float(v))
-                                    for (k, v) in key2val.items()])
+        summary = tf.Summary(value=[
+            tf.Summary.Value(tag=k, simple_value=float(v))
+            for (k, v) in key2val.items()
+        ])
        event = event_pb2.Event(wall_time=time.time(), summary=summary)
        event.step = self.step
        self.evwriter.WriteEvent(event)
@@ -46,6 +48,7 @@ class TbWriter(object):
    def close(self):
        self.evwriter.Close()

+
 # API


@@ -126,6 +129,7 @@ def get_expt_dir():
    sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir()\n")
    return get_dir()

+
 # Backend


@@ -167,8 +171,8 @@ class _Logger(object):
        # Write to all text outputs
        self._write_text("-" * (keywidth + valwidth + 7), "\n")
        for (key, val) in key2str.items():
-            self._write_text("| ", key, " " * (keywidth - len(key)),
-                             " | ", val, " " * (valwidth - len(val)), " |\n")
+            self._write_text("| ", key, " " * (keywidth - len(key)), " | ",
+                             val, " " * (valwidth - len(val)), " |\n")
        self._write_text("-" * (keywidth + valwidth + 7), "\n")
        for f in self.text_outputs:
            try:
@@ -202,7 +206,7 @@ class _Logger(object):
    # Misc

    def _do_log(self, *args):
-        self._write_text(*args + ('\n',))
+        self._write_text(*args + ('\n', ))
        for f in self.text_outputs:
            try:
                f.flush()
@@ -31,8 +31,9 @@ def compute_centered_ranks(x):
 def make_session(single_threaded):
    if not single_threaded:
        return tf.Session()
-    return tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
-                                            intra_op_parallelism_threads=1))
+    return tf.Session(
+        config=tf.ConfigProto(
+            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))


 def itergroups(items, group_size):
@@ -50,10 +51,11 @@ def itergroups(items, group_size):
 def batched_weighted_sum(weights, vecs, batch_size):
    total = 0
    num_items_summed = 0
-    for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
-                                         itergroups(vecs, batch_size)):
+    for batch_weights, batch_vecs in zip(
+            itergroups(weights, batch_size), itergroups(vecs, batch_size)):
        assert len(batch_weights) == len(batch_vecs) <= batch_size
-        total += np.dot(np.asarray(batch_weights, dtype=np.float32),
-                        np.asarray(batch_vecs, dtype=np.float32))
+        total += np.dot(
+            np.asarray(batch_weights, dtype=np.float32),
+            np.asarray(batch_vecs, dtype=np.float32))
        num_items_summed += len(batch_weights)
    return total, num_items_summed
@@ -7,7 +7,6 @@ from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
 from ray.rllib.optimizers import SyncSamplesOptimizer
 from ray.tune.trial import Resources

-
 DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
@@ -43,9 +42,9 @@ class PGAgent(Agent):
            self.env_creator, PGPolicyGraph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, PGPolicyGraph, self.config["num_workers"], {})
-        self.optimizer = SyncSamplesOptimizer(
-            self.local_evaluator, self.remote_evaluators,
-            self.config["optimizer"])
+        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
+                                              self.remote_evaluators,
+                                              self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
@@ -42,9 +42,15 @@ class PGPolicyGraph(TFPolicyGraph):
        ]

        TFPolicyGraph.__init__(
-            self, obs_space, action_space, sess, obs_input=obs,
-            action_sampler=action_dist.sample(), loss=loss,
-            loss_inputs=loss_in, state_inputs=self.model.state_in,
+            self,
+            obs_space,
+            action_space,
+            sess,
+            obs_input=obs,
+            action_sampler=action_dist.sample(),
+            loss=loss,
+            loss_inputs=loss_in,
+            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])
@@ -77,28 +77,30 @@ class PPOAgent(Agent):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, PPOPolicyGraph)
        self.remote_evaluators = self.make_remote_evaluators(
-            self.env_creator, PPOPolicyGraph, self.config["num_workers"],
-            {"num_cpus": self.config["num_cpus_per_worker"],
-             "num_gpus": self.config["num_gpus_per_worker"]})
+            self.env_creator, PPOPolicyGraph, self.config["num_workers"], {
+                "num_cpus": self.config["num_cpus_per_worker"],
+                "num_gpus": self.config["num_gpus_per_worker"]
+            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators,
                {"num_sgd_iter": self.config["num_sgd_iter"]})
        else:
            self.optimizer = LocalMultiGPUOptimizer(
-                self.local_evaluator, self.remote_evaluators,
-                {"sgd_batch_size": self.config["sgd_batchsize"],
-                 "sgd_stepsize": self.config["sgd_stepsize"],
-                 "num_sgd_iter": self.config["num_sgd_iter"],
-                 "timesteps_per_batch": self.config["timesteps_per_batch"],
-                 "standardize_fields": ["advantages"]})
+                self.local_evaluator, self.remote_evaluators, {
+                    "sgd_batch_size": self.config["sgd_batchsize"],
+                    "sgd_stepsize": self.config["sgd_stepsize"],
+                    "num_sgd_iter": self.config["num_sgd_iter"],
+                    "timesteps_per_batch": self.config["timesteps_per_batch"],
+                    "standardize_fields": ["advantages"]
+                })

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
-        FilterManager.synchronize(
-            self.local_evaluator.filters, self.remote_evaluators)
+        FilterManager.synchronize(self.local_evaluator.filters,
+                                  self.remote_evaluators)
        res = self.optimizer.collect_metrics()
        res = res._replace(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
@@ -115,9 +117,7 @@ class PPOAgent(Agent):
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
-        extra_data = [
-            self.local_evaluator.save(),
-            agent_state]
+        extra_data = [self.local_evaluator.save(), agent_state]
        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
        return checkpoint_path

@@ -126,4 +126,5 @@ class PPOAgent(Agent):
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            a.restore.remote(o)
-                for (a, o) in zip(self.remote_evaluators, extra_data[1])])
+            for (a, o) in zip(self.remote_evaluators, extra_data[1])
+        ])
@@ -10,10 +10,20 @@ from ray.rllib.models.catalog import ModelCatalog


 class PPOLoss(object):
-    def __init__(
-            self, action_space, value_targets, advantages, actions, logits,
-            vf_preds, curr_action_dist, value_fn, cur_kl_coeff,
-            entropy_coeff=0, clip_param=0.1, vf_loss_coeff=1.0, use_gae=True):
+    def __init__(self,
+                 action_space,
+                 value_targets,
+                 advantages,
+                 actions,
+                 logits,
+                 vf_preds,
+                 curr_action_dist,
+                 value_fn,
+                 cur_kl_coeff,
+                 entropy_coeff=0,
+                 clip_param=0.1,
+                 vf_loss_coeff=1.0,
+                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.

        Arguments:
@@ -51,31 +61,33 @@ class PPOLoss(object):

        surrogate_loss = tf.minimum(
            advantages * logp_ratio,
-            advantages * tf.clip_by_value(
-                logp_ratio, 1 - clip_param, 1 + clip_param))
+            advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
+                                          1 + clip_param))
        self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)

        if use_gae:
            vf_loss1 = tf.square(value_fn - value_targets)
-            vf_clipped = vf_preds + tf.clip_by_value(
-                value_fn - vf_preds, -clip_param, clip_param)
+            vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
+                                                     -clip_param, clip_param)
            vf_loss2 = tf.square(vf_clipped - value_targets)
            vf_loss = tf.maximum(vf_loss1, vf_loss2)
            self.mean_vf_loss = tf.reduce_mean(vf_loss)
-            loss = tf.reduce_mean(
-                -surrogate_loss + cur_kl_coeff*action_kl +
-                vf_loss_coeff*vf_loss - entropy_coeff*curr_entropy)
+            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
+                                  vf_loss_coeff * vf_loss -
+                                  entropy_coeff * curr_entropy)
        else:
            self.mean_vf_loss = tf.constant(0.0)
-            loss = tf.reduce_mean(
-                -surrogate_loss + cur_kl_coeff*action_kl -
-                entropy_coeff*curr_entropy)
+            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
+                                  entropy_coeff * curr_entropy)
        self.loss = loss


 class PPOPolicyGraph(TFPolicyGraph):
-    def __init__(self, observation_space, action_space,
-                 config, existing_inputs=None):
+    def __init__(self,
+                 observation_space,
+                 action_space,
+                 config,
+                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
@@ -98,16 +110,18 @@ class PPOPolicyGraph(TFPolicyGraph):
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(
-                tf.float32, name="obs", shape=(None,)+observation_space.shape)
+                tf.float32,
+                name="obs",
+                shape=(None, ) + observation_space.shape)
            adv_ph = tf.placeholder(
-                tf.float32, name="advantages", shape=(None,))
+                tf.float32, name="advantages", shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            logits_ph = tf.placeholder(
                tf.float32, name="logits", shape=(None, logit_dim))
            vf_preds_ph = tf.placeholder(
-                tf.float32, name="vf_preds", shape=(None,))
+                tf.float32, name="vf_preds", shape=(None, ))
            value_targets_ph = tf.placeholder(
-                tf.float32, name="value_targets", shape=(None,))
+                tf.float32, name="value_targets", shape=(None, ))
            existing_state_in = None
            existing_seq_lens = None

@@ -120,13 +134,19 @@ class PPOPolicyGraph(TFPolicyGraph):
            ("vf_preds", vf_preds_ph),
        ]
        self.model = ModelCatalog.get_model(
-            obs_ph, logit_dim, self.config["model"],
-            state_in=existing_state_in, seq_lens=existing_seq_lens)
+            obs_ph,
+            logit_dim,
+            self.config["model"],
+            state_in=existing_state_in,
+            seq_lens=existing_seq_lens)

        # KL Coefficient
        self.kl_coeff = tf.get_variable(
            initializer=tf.constant_initializer(self.kl_coeff_val),
-            name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)
+            name="kl_coeff",
+            shape=(),
+            trainable=False,
+            dtype=tf.float32)

        self.logits = self.model.outputs
        curr_action_dist = dist_cls(self.logits)
@@ -146,20 +166,32 @@ class PPOPolicyGraph(TFPolicyGraph):
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

        self.loss_obj = PPOLoss(
-            action_space, value_targets_ph, adv_ph, act_ph,
-            logits_ph, vf_preds_ph,
-            curr_action_dist, self.value_function, self.kl_coeff,
+            action_space,
+            value_targets_ph,
+            adv_ph,
+            act_ph,
+            logits_ph,
+            vf_preds_ph,
+            curr_action_dist,
+            self.value_function,
+            self.kl_coeff,
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"],
            vf_loss_coeff=self.config["kl_target"],
            use_gae=self.config["use_gae"])

        TFPolicyGraph.__init__(
-            self, observation_space, action_space,
-            self.sess, obs_input=obs_ph,
-            action_sampler=self.sampler, loss=self.loss_obj.loss,
-            loss_inputs=self.loss_in, state_inputs=self.model.state_in,
-            state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
+            obs_input=obs_ph,
+            action_sampler=self.sampler,
+            loss=self.loss_obj.loss,
+            loss_inputs=self.loss_in,
+            state_inputs=self.model.state_in,
+            state_outputs=self.model.state_out,
+            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
@@ -167,7 +199,9 @@ class PPOPolicyGraph(TFPolicyGraph):
    def copy(self, existing_inputs):
        """Creates a copy of self using existing input placeholders."""
        return PPOPolicyGraph(
-            None, self.action_space, self.config,
+            None,
+            self.action_space,
+            self.config,
            existing_inputs=existing_inputs)

    def extra_compute_action_fetches(self):
@@ -193,8 +227,11 @@ class PPOPolicyGraph(TFPolicyGraph):
    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        last_r = 0.0
        batch = compute_advantages(
-            sample_batch, last_r, self.config["gamma"],
-            self.config["lambda"], use_gae=self.config["use_gae"])
+            sample_batch,
+            last_r,
+            self.config["gamma"],
+            self.config["lambda"],
+            use_gae=self.config["use_gae"])
        return batch

    def optimizer(self):
@@ -13,7 +13,6 @@ from ray.rllib.agents.ppo.utils import flatten, concatenate

 # TODO(ekl): move to rllib/models dir
 class DistributionsTest(unittest.TestCase):
-
    def testCategorical(self):
        num_samples = 100000
        logits = tf.placeholder(tf.float32, shape=(None, 10))
@@ -32,10 +31,11 @@ class DistributionsTest(unittest.TestCase):


 class UtilsTest(unittest.TestCase):
-
    def testFlatten(self):
-        d = {"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
-             "a": np.array([[[5], [-5]], [[6], [-6]]])}
+        d = {
+            "s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
+            "a": np.array([[[5], [-5]], [[6], [-6]]])
+        }
        flat = flatten(d.copy(), start=0, stop=2)
        assert_allclose(d["s"][0][0][:], flat["s"][0][:])
        assert_allclose(d["s"][0][1][:], flat["s"][1][:])
@@ -16,7 +16,7 @@ def flatten(weights, start=0, stop=2):
        stop: The ending index.
    """
    for key, val in weights.items():
-        new_shape = val.shape[0:start] + (-1,) + val.shape[stop:]
+        new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:]
        weights[key] = val.reshape(new_shape)
    return weights

@@ -286,8 +286,8 @@ class _MultiAgentEnvState(object):
        self.reset()

    def poll(self):
-        obs, rew, dones, info = (
-            self.last_obs, self.last_rewards, self.last_dones, self.last_infos)
+        obs, rew, dones, info = (self.last_obs, self.last_rewards,
+                                 self.last_dones, self.last_infos)
        self.last_obs = {}
        self.last_rewards = {}
        self.last_dones = {"__all__": False}
@@ -303,10 +303,13 @@ class _MultiAgentEnvState(object):
    def reset(self):
        self.last_obs = self.env.reset()
        self.last_rewards = {
-            agent_id: None for agent_id in self.last_obs.keys()}
+            agent_id: None
+            for agent_id in self.last_obs.keys()
+        }
        self.last_dones = {
-            agent_id: False for agent_id in self.last_obs.keys()}
-        self.last_infos = {
-            agent_id: {} for agent_id in self.last_obs.keys()}
+            agent_id: False
+            for agent_id in self.last_obs.keys()
+        }
+        self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()}
        self.last_dones["__all__"] = False
        return self.last_obs
@@ -28,8 +28,7 @@ class NoopResetEnv(gym.Wrapper):
        if self.override_num_noops is not None:
            noops = self.override_num_noops
        else:
-            noops = self.unwrapped.np_random.randint(
-                1, self.noop_max + 1)
+            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
        assert noops > 0
        obs = None
        for _ in range(noops):
@@ -121,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros(
-            (2,)+env.observation_space.shape, dtype=np.uint8)
+            (2, ) + env.observation_space.shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
@@ -71,8 +71,7 @@ class _VectorizedGymEnv(VectorEnv):
        self.envs = existing_envs
        self.num_envs = num_envs
        if make_env and num_envs > 1:
-            self.resetter = _AsyncResetter(
-                make_env, int(self.num_envs ** 0.5))
+            self.resetter = _AsyncResetter(make_env, int(self.num_envs**0.5))
        else:
            self.resetter = _SimpleResetter(make_env)
        while len(self.envs) < self.num_envs:
@@ -15,9 +15,10 @@ def collect_metrics(local_evaluator, remote_evaluators=[]):
    episode_rewards = []
    episode_lengths = []
    policy_rewards = collections.defaultdict(list)
-    metric_lists = ray.get(
-        [a.apply.remote(lambda ev: ev.sampler.get_metrics())
-         for a in remote_evaluators])
+    metric_lists = ray.get([
+        a.apply.remote(lambda ev: ev.sampler.get_metrics())
+        for a in remote_evaluators
+    ])
    metric_lists.append(local_evaluator.sampler.get_metrics())
    for metrics in metric_lists:
        for episode in metrics:
@@ -82,24 +82,23 @@ class PolicyEvaluator(EvaluatorInterface):
    def as_remote(cls, num_cpus=None, num_gpus=None):
        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)

-    def __init__(
-            self,
-            env_creator,
-            policy_graph,
-            policy_mapping_fn=None,
-            tf_session_creator=None,
-            batch_steps=100,
-            batch_mode="truncate_episodes",
-            episode_horizon=None,
-            preprocessor_pref="rllib",
-            sample_async=False,
-            compress_observations=False,
-            num_envs=1,
-            observation_filter="NoFilter",
-            env_config=None,
-            model_config=None,
-            policy_config=None,
-            worker_index=0):
+    def __init__(self,
+                 env_creator,
+                 policy_graph,
+                 policy_mapping_fn=None,
+                 tf_session_creator=None,
+                 batch_steps=100,
+                 batch_mode="truncate_episodes",
+                 episode_horizon=None,
+                 preprocessor_pref="rllib",
+                 sample_async=False,
+                 compress_observations=False,
+                 num_envs=1,
+                 observation_filter="NoFilter",
+                 env_config=None,
+                 model_config=None,
+                 policy_config=None,
+                 worker_index=0):
        """Initialize a policy evaluator.

        Arguments:
@@ -157,8 +156,8 @@ class PolicyEvaluator(EvaluatorInterface):
        policy_config = policy_config or {}
        self.policy_config = policy_config
        model_config = model_config or {}
-        policy_mapping_fn = (
-            policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID))
+        policy_mapping_fn = (policy_mapping_fn
+                             or (lambda agent_id: DEFAULT_POLICY_ID))
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
@@ -170,17 +169,21 @@ class PolicyEvaluator(EvaluatorInterface):
                isinstance(self.env, ServingEnv) or \
                isinstance(self.env, MultiAgentEnv) or \
                isinstance(self.env, AsyncVectorEnv):
+
            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":
+
            def wrap(env):
                return wrap_deepmind(env, dim=model_config.get("dim", 80))
        else:
+
            def wrap(env):
                return ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)
+
        self.env = wrap(self.env)

        def make_env():
@@ -193,20 +196,21 @@ class PolicyEvaluator(EvaluatorInterface):
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
-                    self.tf_sess = tf.Session(config=tf.ConfigProto(
-                        gpu_options=tf.GPUOptions(allow_growth=True)))
+                    self.tf_sess = tf.Session(
+                        config=tf.ConfigProto(
+                            gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    self.policy_map = self._build_policy_map(
                        policy_dict, policy_config)
        else:
-            self.policy_map = self._build_policy_map(
-                policy_dict, policy_config)
+            self.policy_map = self._build_policy_map(policy_dict,
+                                                     policy_config)

        self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID)

        self.filters = {
-            policy_id: get_filter(
-                observation_filter, policy.observation_space.shape)
+            policy_id: get_filter(observation_filter,
+                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }

@@ -226,24 +230,34 @@ class PolicyEvaluator(EvaluatorInterface):
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
-            raise ValueError(
-                "Unsupported batch mode: {}".format(self.batch_mode))
+            raise ValueError("Unsupported batch mode: {}".format(
+                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(
-                self.async_env, self.policy_map, policy_mapping_fn,
-                self.filters, batch_steps, horizon=episode_horizon,
-                pack=pack_episodes, tf_sess=self.tf_sess)
+                self.async_env,
+                self.policy_map,
+                policy_mapping_fn,
+                self.filters,
+                batch_steps,
+                horizon=episode_horizon,
+                pack=pack_episodes,
+                tf_sess=self.tf_sess)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(
-                self.async_env, self.policy_map, policy_mapping_fn,
-                self.filters, batch_steps, horizon=episode_horizon,
-                pack=pack_episodes, tf_sess=self.tf_sess)
+                self.async_env,
+                self.policy_map,
+                policy_mapping_fn,
+                self.filters,
+                batch_steps,
+                horizon=episode_horizon,
+                pack=pack_episodes,
+                tf_sess=self.tf_sess)

    def _build_policy_map(self, policy_dict, policy_config):
        policy_map = {}
-        for name, (cls, obs_space, act_space, conf) in sorted(
-                policy_dict.items()):
+        for name, (cls, obs_space, act_space,
+                   conf) in sorted(policy_dict.items()):
            merged_conf = policy_config.copy()
            merged_conf.update(conf)
            with tf.variable_scope(name):
@@ -315,7 +329,8 @@ class PolicyEvaluator(EvaluatorInterface):
    def get_weights(self):
        return {
            pid: policy.get_weights()
-            for pid, policy in self.policy_map.items()}
+            for pid, policy in self.policy_map.items()
+        }

    def set_weights(self, weights):
        for pid, w in weights.items():
@@ -351,9 +366,7 @@ class PolicyEvaluator(EvaluatorInterface):
                        builder, grad)
                    for pid, grad in grads.items()
                }
-                return {
-                    k: builder.get(v) for k, v in outputs.items()
-                }
+                return {k: builder.get(v) for k, v in outputs.items()}
            else:
                return {
                    pid: self.policy_map[pid].apply_gradients(g)
@@ -428,8 +441,9 @@ def _validate_and_canonicalize(policy_graph, env):
        raise ValueError("policy_graph must be a rllib.PolicyGraph class")
    else:
        return {
-            DEFAULT_POLICY_ID: (
-                policy_graph, env.observation_space, env.action_space, {})}
+            DEFAULT_POLICY_ID: (policy_graph, env.observation_space,
+                                env.action_space, {})
+        }


 def _has_tensorflow_graph(policy_dict):
@@ -45,7 +45,8 @@ class SampleBatchBuilder(object):
        """Returns a sample batch including all previously added values."""

        batch = SampleBatch(
-            {k: to_float_array(v) for k, v in self.buffers.items()})
+            {k: to_float_array(v)
+             for k, v in self.buffers.items()})
        self.buffers.clear()
        self.count = 0
        return batch
@@ -69,7 +70,9 @@ class MultiAgentSampleBatchBuilder(object):

        self.policy_map = policy_map
        self.policy_builders = {
-            k: SampleBatchBuilder() for k in policy_map.keys()}
+            k: SampleBatchBuilder()
+            for k in policy_map.keys()
+        }
        self.agent_builders = {}
        self.agent_to_policy = {}
        self.count = 0  # increment this manually
@@ -12,12 +12,11 @@ from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \
 from ray.rllib.env.async_vector_env import AsyncVectorEnv
 from ray.rllib.utils.tf_run_builder import TFRunBuilder

-
 RolloutMetrics = namedtuple(
    "RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"])

-PolicyEvalData = namedtuple(
-    "PolicyEvalData", ["env_id", "agent_id", "obs", "rnn_state"])
+PolicyEvalData = namedtuple("PolicyEvalData",
+                            ["env_id", "agent_id", "obs", "rnn_state"])


 class SyncSampler(object):
@@ -29,9 +28,15 @@ class SyncSampler(object):
    This class provides data on invocation, rather than on a separate
    thread."""

-    def __init__(
-            self, env, policies, policy_mapping_fn, obs_filters,
-            num_local_steps, horizon=None, pack=False, tf_sess=None):
+    def __init__(self,
+                 env,
+                 policies,
+                 policy_mapping_fn,
+                 obs_filters,
+                 num_local_steps,
+                 horizon=None,
+                 pack=False,
+                 tf_sess=None):
        self.async_vector_env = AsyncVectorEnv.wrap_async(env)
        self.num_local_steps = num_local_steps
        self.horizon = horizon
@@ -68,9 +73,15 @@ class AsyncSampler(threading.Thread):
    Note that batch_size is only a unit of measure here. Batches can
    accumulate and the gradient can be calculated on up to 5 batches."""

-    def __init__(
-            self, env, policies, policy_mapping_fn, obs_filters,
-            num_local_steps, horizon=None, pack=False, tf_sess=None):
+    def __init__(self,
+                 env,
+                 policies,
+                 policy_mapping_fn,
+                 obs_filters,
+                 num_local_steps,
+                 horizon=None,
+                 pack=False,
+                 tf_sess=None):
        for _, f in obs_filters.items():
            assert getattr(f, "is_concurrent", False), \
                "Observation Filter must support concurrent updates."
@@ -142,9 +153,14 @@ class AsyncSampler(threading.Thread):
        return completed


-def _env_runner(
-        async_vector_env, policies, policy_mapping_fn, num_local_steps,
-        horizon, obs_filters, pack, tf_sess=None):
+def _env_runner(async_vector_env,
+                policies,
+                policy_mapping_fn,
+                num_local_steps,
+                horizon,
+                obs_filters,
+                pack,
+                tf_sess=None):
    """This implements the common experience collection logic.

    Args:
@@ -186,9 +202,11 @@ def _env_runner(
        else:
            return MultiAgentSampleBatchBuilder(policies)

-    active_episodes = defaultdict(
-        lambda: _MultiAgentEpisode(
-            policies, policy_mapping_fn, get_batch_builder))
+    def new_episode():
+        return _MultiAgentEpisode(policies, policy_mapping_fn,
+                                  get_batch_builder)
+
+    active_episodes = defaultdict(new_episode)

    while True:
        # Get observations from all ready agents
@@ -213,9 +231,8 @@ def _env_runner(
            # Check episode termination conditions
            if dones[env_id]["__all__"] or episode.length >= horizon:
                all_done = True
-                yield RolloutMetrics(
-                    episode.length, episode.total_reward,
-                    dict(episode.agent_rewards))
+                yield RolloutMetrics(episode.length, episode.total_reward,
+                                     dict(episode.agent_rewards))
            else:
                all_done = False
                # At least send an empty dict if not done
@@ -228,9 +245,8 @@ def _env_runner(
                agent_done = bool(all_done or dones[env_id].get(agent_id))
                if not agent_done:
                    to_eval[policy_id].append(
-                        PolicyEvalData(
-                            env_id, agent_id, filtered_obs,
-                            episode.rnn_state_for(agent_id)))
+                        PolicyEvalData(env_id, agent_id, filtered_obs,
+                                       episode.rnn_state_for(agent_id)))

                last_observation = episode.last_observation_for(agent_id)
                episode.set_last_observation(agent_id, filtered_obs)
@@ -274,13 +290,12 @@ def _env_runner(
                    episode = active_episodes[env_id]
                    for agent_id, raw_obs in resetted_obs.items():
                        policy_id = episode.policy_for(agent_id)
-                        filtered_obs = _get_or_raise(
-                            obs_filters, policy_id)(raw_obs)
+                        filtered_obs = _get_or_raise(obs_filters,
+                                                     policy_id)(raw_obs)
                        episode.set_last_observation(agent_id, filtered_obs)
                        to_eval[policy_id].append(
-                            PolicyEvalData(
-                                env_id, agent_id, filtered_obs,
-                                episode.rnn_state_for(agent_id)))
+                            PolicyEvalData(env_id, agent_id, filtered_obs,
+                                           episode.rnn_state_for(agent_id)))

        # Batch eval policy actions if possible
        if tf_sess:
@@ -295,7 +310,8 @@ def _env_runner(
            policy = _get_or_raise(policies, policy_id)
            if builder:
                eval_results[policy_id] = policy.build_compute_actions(
-                    builder, [t.obs for t in eval_data], rnn_in,
+                    builder, [t.obs for t in eval_data],
+                    rnn_in,
                    is_training=True)
            else:
                eval_results[policy_id] = policy.compute_actions(
@@ -319,7 +335,8 @@ def _env_runner(
                episode = active_episodes[env_id]
                episode.set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
                episode.set_last_pi_info(
-                    agent_id, {k: v[i] for k, v in pi_info_cols.items()})
+                    agent_id, {k: v[i]
+                               for k, v in pi_info_cols.items()})
                if env_id in off_policy_actions and \
                        agent_id in off_policy_actions[env_id]:
                    episode.set_last_action(
@@ -334,8 +351,7 @@ def _env_runner(

 def _to_column_format(rnn_state_rows):
    num_cols = len(rnn_state_rows[0])
-    return [
-        [row[i] for row in rnn_state_rows] for i in range(num_cols)]
+    return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]


 def _get_or_raise(mapping, policy_id):
@@ -363,8 +379,8 @@ class _MultiAgentEpisode(object):
    def add_agent_rewards(self, reward_dict):
        for agent_id, reward in reward_dict.items():
            if reward is not None:
-                self.agent_rewards[
-                    agent_id, self.policy_for(agent_id)] += reward
+                self.agent_rewards[agent_id,
+                                   self.policy_for(agent_id)] += reward
                self.total_reward += reward

    def policy_for(self, agent_id):
@@ -35,10 +35,18 @@ class TFPolicyGraph(PolicyGraph):
        SampleBatch({"action": ..., "advantages": ..., ...})
    """

-    def __init__(
-            self, observation_space, action_space, sess, obs_input,
-            action_sampler, loss, loss_inputs, state_inputs=None,
-            state_outputs=None, seq_lens=None, max_seq_len=20):
+    def __init__(self,
+                 observation_space,
+                 action_space,
+                 sess,
+                 obs_input,
+                 action_sampler,
+                 loss,
+                 loss_inputs,
+                 state_inputs=None,
+                 state_outputs=None,
+                 seq_lens=None,
+                 max_seq_len=20):
        """Initialize the policy graph.

        Arguments:
@@ -78,9 +86,9 @@ class TFPolicyGraph(PolicyGraph):
        self._seq_lens = seq_lens
        self._max_seq_len = max_seq_len
        self._optimizer = self.optimizer()
-        self._grads_and_vars = [
-            (g, v) for (g, v) in self.gradients(self._optimizer)
-            if g is not None]
+        self._grads_and_vars = [(g, v)
+                                for (g, v) in self.gradients(self._optimizer)
+                                if g is not None]
        self._grads = [g for (g, v) in self._grads_and_vars]
        self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
        self._variables = ray.experimental.TensorFlowVariables(
@@ -92,8 +100,11 @@ class TFPolicyGraph(PolicyGraph):
        if self._state_inputs:
            assert self._seq_lens is not None

-    def build_compute_actions(
-            self, builder, obs_batch, state_batches=None, is_training=False):
+    def build_compute_actions(self,
+                              builder,
+                              obs_batch,
+                              state_batches=None,
+                              is_training=False):
        state_batches = state_batches or []
        assert len(self._state_inputs) == len(state_batches), \
            (self._state_inputs, state_batches)
@@ -103,16 +114,15 @@ class TFPolicyGraph(PolicyGraph):
            builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
        builder.add_feed_dict({self._is_training: is_training})
        builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
-        fetches = builder.add_fetches(
-            [self._sampler] + self._state_outputs +
-            [self.extra_compute_action_fetches()])
+        fetches = builder.add_fetches([self._sampler] + self._state_outputs +
+                                      [self.extra_compute_action_fetches()])
        return fetches[0], fetches[1:-1], fetches[-1]

-    def compute_actions(
-            self, obs_batch, state_batches=None, is_training=False):
+    def compute_actions(self, obs_batch, state_batches=None,
+                        is_training=False):
        builder = TFRunBuilder(self._sess, "compute_actions")
-        fetches = self.build_compute_actions(
-            builder, obs_batch, state_batches, is_training)
+        fetches = self.build_compute_actions(builder, obs_batch, state_batches,
+                                             is_training)
        return builder.get(fetches)

    def _get_loss_inputs_dict(self, batch):
@@ -127,12 +137,11 @@ class TFPolicyGraph(PolicyGraph):
        # RNN case
        feature_keys = [k for k, v in self._loss_inputs]
        state_keys = [
-            "state_in_{}".format(i) for i in range(len(self._state_inputs))]
+            "state_in_{}".format(i) for i in range(len(self._state_inputs))
+        ]
        feature_sequences, initial_states, seq_lens = chop_into_sequences(
-            batch["t"],
-            [batch[k] for k in feature_keys],
-            [batch[k] for k in state_keys],
-            self._max_seq_len)
+            batch["t"], [batch[k] for k in feature_keys],
+            [batch[k] for k in state_keys], self._max_seq_len)
        for k, v in zip(feature_keys, feature_sequences):
            feed_dict[self._loss_input_dict[k]] = v
        for k, v in zip(state_keys, initial_states):
@@ -172,9 +181,11 @@ class TFPolicyGraph(PolicyGraph):
        builder.add_feed_dict(self.extra_apply_grad_feed_dict())
        builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch))
        builder.add_feed_dict({self._is_training: True})
-        fetches = builder.add_fetches(
-            [self._apply_op, self.extra_compute_grad_fetches(),
-             self.extra_apply_grad_fetches()])
+        fetches = builder.add_fetches([
+            self._apply_op,
+            self.extra_compute_grad_fetches(),
+            self.extra_apply_grad_fetches()
+        ])
        return fetches[1], fetches[2]

    def compute_apply(self, postprocessed_batch):
@@ -27,8 +27,8 @@ class TorchPolicyGraph(PolicyGraph):
            This is necessary when using the async sampler.
    """

-    def __init__(
-            self, observation_space, action_space, model, loss, loss_inputs):
+    def __init__(self, observation_space, action_space, model, loss,
+                 loss_inputs):
        """Build a policy graph from policy and loss torch modules.

        Note that module inputs will be CPU tensors. The model and loss modules
@@ -67,8 +67,8 @@ class TorchPolicyGraph(PolicyGraph):
        """Custom PyTorch optimizer to use."""
        return torch.optim.Adam(self._model.parameters())

-    def compute_actions(
-            self, obs_batch, state_batches=None, is_training=False):
+    def compute_actions(self, obs_batch, state_batches=None,
+                        is_training=False):
        if state_batches:
            raise NotImplementedError("Torch RNN support")
        with self.lock:
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
    global env_version_num

    register(
-      id=env_name,
-      entry_point=(
-        "ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
-        "MultiAgentMountainCarEnv"),
-      max_episode_steps=200,
-      kwargs={}
-    )
+        id=env_name,
+        entry_point=(
+            "ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
+            "MultiAgentMountainCarEnv"),
+        max_episode_steps=200,
+        kwargs={})


 def create_env(env_config):
@@ -48,10 +47,12 @@ if __name__ == '__main__':
    config["horizon"] = horizon
    config["use_gae"] = False
    config["model"].update({"fcnet_hiddens": [256, 256]})
-    options = {"multiagent_obs_shapes": [2, 2],
-               "multiagent_act_shapes": [1, 1],
-               "multiagent_shared_model": False,
-               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
+    options = {
+        "multiagent_obs_shapes": [2, 2],
+        "multiagent_act_shapes": [1, 1],
+        "multiagent_shared_model": False,
+        "multiagent_fcnet_hiddens": [[32, 32]] * 2
+    }
    config["model"].update({"custom_options": options})
    alg = ppo.PPOAgent(env=env_name, config=config)
    for i in range(1):
@@ -2,7 +2,6 @@ from math import cos
 from gym.spaces import Box, Tuple, Discrete
 import numpy as np
 from gym.envs.classic_control.mountain_car import MountainCarEnv
-
 """
 Multiagent mountain car that sums and then
 averages its actions to produce the velocity
@@ -22,8 +21,8 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
        self.viewer = None

        self.action_space = [Discrete(3) for _ in range(2)]
-        self.observation_space = Tuple([
-            Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
+        self.observation_space = Tuple(
+            [Box(self.low, self.high, dtype=np.float32) for _ in range(2)])

        self.seed()
        self.reset()
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
    global env_version_num

    register(
-      id=env_name,
-      entry_point=(
-        "ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
-        "MultiAgentPendulumEnv"),
-      max_episode_steps=100,
-      kwargs={}
-    )
+        id=env_name,
+        entry_point=(
+            "ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
+            "MultiAgentPendulumEnv"),
+        max_episode_steps=100,
+        kwargs={})


 def create_env(env_config):
@@ -49,10 +48,12 @@ if __name__ == '__main__':
    config["horizon"] = horizon
    config["use_gae"] = True
    config["model"].update({"fcnet_hiddens": [256, 256]})
-    options = {"multiagent_obs_shapes": [3, 3],
-               "multiagent_act_shapes": [1, 1],
-               "multiagent_shared_model": True,
-               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
+    options = {
+        "multiagent_obs_shapes": [3, 3],
+        "multiagent_act_shapes": [1, 1],
+        "multiagent_shared_model": True,
+        "multiagent_fcnet_hiddens": [[32, 32]] * 2
+    }
    config["model"].update({"custom_options": options})
    alg = ppo.PPOAgent(env=env_name, config=config)
    for i in range(1):
@@ -2,7 +2,6 @@ from gym.spaces import Box, Tuple
 from gym.utils import seeding
 from gym.envs.classic_control.pendulum import PendulumEnv
 import numpy as np
-
 """
 Multiagent pendulum that sums its torques to generate an action
 """
@@ -10,8 +9,8 @@ import numpy as np

 class MultiAgentPendulumEnv(PendulumEnv):
    metadata = {
-      'render.modes': ['human', 'rgb_array'],
-      'video.frames_per_second': 30
+        'render.modes': ['human', 'rgb_array'],
+        'video.frames_per_second': 30
    }

    def __init__(self):
@@ -21,13 +20,14 @@ class MultiAgentPendulumEnv(PendulumEnv):
        self.viewer = None

        high = np.array([1., 1., self.max_speed])
-        self.action_space = [Box(low=-self.max_torque / 2,
-                                 high=self.max_torque / 2,
-                                 shape=(1,),
-                                 dtype=np.float32)
-                             for _ in range(2)]
-        self.observation_space = Tuple([
-            Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
+        self.action_space = [
+            Box(low=-self.max_torque / 2,
+                high=self.max_torque / 2,
+                shape=(1, ),
+                dtype=np.float32) for _ in range(2)
+        ]
+        self.observation_space = Tuple(
+            [Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])

        self.seed()

@@ -49,8 +49,8 @@ class MultiAgentPendulumEnv(PendulumEnv):
        costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
            .001 * (summed_u ** 2)

-        newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
-                            3. / (m * length ** 2) * summed_u) * dt
+        newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + 3. /
+                            (m * length**2) * summed_u) * dt
        newth = th + newthdot * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)

@@ -65,8 +65,10 @@ class MultiAgentPendulumEnv(PendulumEnv):

    def _get_obs(self):
        theta, thetadot = self.state
-        return [np.array([np.cos(theta), np.sin(theta), thetadot])
-                for _ in range(2)]
+        return [
+            np.array([np.cos(theta), np.sin(theta), thetadot])
+            for _ in range(2)
+        ]

    def angle_normalize(self, x):
        return (((x + np.pi) % (2 * np.pi)) - np.pi)
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """Simple example of setting up a multi-agent policy mapping.

 Control the number of agents and policies via --num-agents and --num-policies.
@@ -24,14 +23,12 @@ from ray.rllib.test.test_multi_agent_env import MultiCartpole
 from ray.tune.logger import pretty_print
 from ray.tune.registry import register_env

-
 parser = argparse.ArgumentParser()

 parser.add_argument("--num-agents", type=int, default=4)
 parser.add_argument("--num-policies", type=int, default=2)
 parser.add_argument("--num-iters", type=int, default=20)

-
 if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()
@@ -51,7 +48,8 @@ if __name__ == "__main__":

    # Setup PG with an ensemble of `num_policies` different policy graphs
    policy_graphs = {
-        "policy_{}".format(i): gen_policy() for i in range(args.num_policies)
+        "policy_{}".format(i): gen_policy()
+        for i in range(args.num_policies)
    }
    policy_ids = list(policy_graphs.keys())

@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """Example of querying a policy server. Copy this file for your use case.

 To try this out, in two separate shells run:
@@ -14,18 +13,19 @@ import gym

 from ray.rllib.utils.policy_client import PolicyClient

-
 parser = argparse.ArgumentParser()
 parser.add_argument(
    "--no-train", action="store_true", help="Whether to disable training.")
 parser.add_argument(
-    "--off-policy", action="store_true",
+    "--off-policy",
+    action="store_true",
    help="Whether to take random instead of on-policy actions.")
 parser.add_argument(
-    "--stop-at-reward", type=int, default=9999,
+    "--stop-at-reward",
+    type=int,
+    default=9999,
    help="Stop once the specified reward is reached.")

-
 if __name__ == "__main__":
    args = parser.parse_args()
    env = gym.make("CartPole-v0")
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """Example of running a policy server. Copy this file for your use case.

 To try this out, in two separate shells run:
@@ -26,12 +25,12 @@ CHECKPOINT_FILE = "last_checkpoint.out"

 class CartpoleServing(ServingEnv):
    def __init__(self):
-        ServingEnv.__init__(
-            self, spaces.Discrete(2), spaces.Box(low=-10, high=10, shape=(4,)))
+        ServingEnv.__init__(self, spaces.Discrete(2),
+                            spaces.Box(low=-10, high=10, shape=(4, )))

    def run(self):
-        print("Starting policy server at {}:{}".format(
-            SERVER_ADDRESS, SERVER_PORT))
+        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
+                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()

@@ -42,14 +41,16 @@ if __name__ == "__main__":

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
-    dqn = DQNAgent(env="srv", config={
-        # Use a single process to avoid needing to set up a load balancer
-        "num_workers": 0,
-        # Configure the agent to run short iterations for debugging
-        "exploration_fraction": 0.01,
-        "learning_starts": 100,
-        "timesteps_per_iteration": 200,
-    })
+    dqn = DQNAgent(
+        env="srv",
+        config={
+            # Use a single process to avoid needing to set up a load balancer
+            "num_workers": 0,
+            # Configure the agent to run short iterations for debugging
+            "exploration_fraction": 0.01,
+            "learning_starts": 100,
+            "timesteps_per_iteration": 200,
+        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
@@ -6,7 +6,7 @@ from ray.rllib.models.preprocessors import Preprocessor
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.lstm import LSTM

-
-__all__ = ["ActionDistribution", "Categorical",
-           "DiagGaussian", "Deterministic", "ModelCatalog", "Model",
-           "Preprocessor", "FullyConnectedNetwork", "LSTM"]
+__all__ = [
+    "ActionDistribution", "Categorical", "DiagGaussian", "Deterministic",
+    "ModelCatalog", "Model", "Preprocessor", "FullyConnectedNetwork", "LSTM"
+]
@@ -42,25 +42,25 @@ class Categorical(ActionDistribution):
            logits=self.inputs, labels=x)

    def entropy(self):
-        a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
-                                         keepdims=True)
+        a0 = self.inputs - tf.reduce_max(
+            self.inputs, reduction_indices=[1], keepdims=True)
        ea0 = tf.exp(a0)
        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])

    def kl(self, other):
-        a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
-                                         keepdims=True)
-        a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1],
-                                          keepdims=True)
+        a0 = self.inputs - tf.reduce_max(
+            self.inputs, reduction_indices=[1], keepdims=True)
+        a1 = other.inputs - tf.reduce_max(
+            other.inputs, reduction_indices=[1], keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
        z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
        p0 = ea0 / z0
-        return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)),
-                             reduction_indices=[1])
+        return tf.reduce_sum(
+            p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])

    def sample(self):
        return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)
@@ -90,22 +90,23 @@ class DiagGaussian(ActionDistribution):
        self.std = tf.exp(log_std)

    def logp(self, x):
-        return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
-                                     reduction_indices=[1]) -
+        return (-0.5 * tf.reduce_sum(
+            tf.square((x - self.mean) / self.std), reduction_indices=[1]) -
                0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
                tf.reduce_sum(self.log_std, reduction_indices=[1]))

    def kl(self, other):
        assert isinstance(other, DiagGaussian)
-        return tf.reduce_sum(other.log_std - self.log_std +
-                             (tf.square(self.std) +
-                              tf.square(self.mean - other.mean)) /
-                             (2.0 * tf.square(other.std)) - 0.5,
-                             reduction_indices=[1])
+        return tf.reduce_sum(
+            other.log_std - self.log_std +
+            (tf.square(self.std) + tf.square(self.mean - other.mean)) /
+            (2.0 * tf.square(other.std)) - 0.5,
+            reduction_indices=[1])

    def entropy(self):
-        return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
-                             reduction_indices=[1])
+        return tf.reduce_sum(
+            self.log_std + .5 * np.log(2.0 * np.pi * np.e),
+            reduction_indices=[1])

    def sample(self):
        out = self.mean + self.std * tf.random_normal(tf.shape(self.mean))
@@ -158,6 +159,7 @@ class MultiActionDistribution(ActionDistribution):
    Args:
        inputs (Tensor list): A list of tensors from which to compute samples.
    """
+
    def __init__(self, inputs, action_space, child_distributions):
        # you actually have to instantiate the child distributions
        self.reshaper = Reshaper(action_space.spaces)
@@ -174,23 +176,25 @@ class MultiActionDistribution(ActionDistribution):
            # Remove extra categorical dimension
            if isinstance(distribution, Categorical):
                split_list[i] = tf.squeeze(split_list[i], axis=-1)
-        log_list = np.asarray([distribution.logp(split_x) for
-                              distribution, split_x in
-                               zip(self.child_distributions, split_list)])
+        log_list = np.asarray([
+            distribution.logp(split_x) for distribution, split_x in zip(
+                self.child_distributions, split_list)
+        ])
        return np.sum(log_list)

    def kl(self, other):
        """The KL-divergence between two action distributions."""
-        kl_list = np.asarray([distribution.kl(other_distribution) for
-                              distribution, other_distribution in
-                              zip(self.child_distributions,
-                                  other.child_distributions)])
+        kl_list = np.asarray([
+            distribution.kl(other_distribution)
+            for distribution, other_distribution in zip(
+                self.child_distributions, other.child_distributions)
+        ])
        return np.sum(kl_list)

    def entropy(self):
        """The entropy of the action distribution."""
-        entropy_list = np.array([s.entropy() for s in
-                                 self.child_distributions])
+        entropy_list = np.array(
+            [s.entropy() for s in self.child_distributions])
        return np.sum(entropy_list)

    def sample(self):
@@ -19,7 +19,6 @@ from ray.rllib.models.visionnet import VisionNetwork
 from ray.rllib.models.lstm import LSTM
 from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork

-
 MODEL_CONFIGS = [
    # === Built-in options ===
    "conv_filters",  # Filter configuration
@@ -30,11 +29,9 @@ MODEL_CONFIGS = [
    "grayscale",  # Converts ATARI frame to 1 Channel Grayscale image
    "zero_mean",  # Changes frame to range from [-1, 1] if true
    "extra_frameskip",  # (int) for number of frames to skip
-
    "free_log_std",  # Documented in ray.rllib.models.Model
    "channel_major",  # Pytorch conv requires images to be channel-major
    "squash_to_range",  # Whether to squash the action output to space range
-
    "use_lstm",  # Whether to wrap the model with a LSTM
    "max_seq_len",  # Max seq len for training the LSTM, defaults to 20
    "lstm_cell_size",  # Size of the LSTM cell
@@ -81,8 +78,8 @@ class ModelCatalog(object):
            if dist_type is None:
                dist = DiagGaussian
                if config.get("squash_to_range"):
-                    dist = squash_to_range(
-                        dist, action_space.low, action_space.high)
+                    dist = squash_to_range(dist, action_space.low,
+                                           action_space.high)
                return dist, action_space.shape[0] * 2
            elif dist_type == 'deterministic':
                return Deterministic, action_space.shape[0]
@@ -95,12 +92,13 @@ class ModelCatalog(object):
                dist, action_size = ModelCatalog.get_action_dist(action)
                child_dist.append(dist)
                size += action_size
-            return partial(MultiActionDistribution,
-                           child_distributions=child_dist,
-                           action_space=action_space), size
+            return partial(
+                MultiActionDistribution,
+                child_distributions=child_dist,
+                action_space=action_space), size

-        raise NotImplementedError(
-            "Unsupported args: {} {}".format(action_space, dist_type))
+        raise NotImplementedError("Unsupported args: {} {}".format(
+            action_space, dist_type))

    @staticmethod
    def get_action_placeholder(action_space):
@@ -120,7 +118,7 @@ class ModelCatalog(object):
            return tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]), name="action")
        elif isinstance(action_space, gym.spaces.Discrete):
-            return tf.placeholder(tf.int64, shape=(None,), name="action")
+            return tf.placeholder(tf.int64, shape=(None, ), name="action")
        elif isinstance(action_space, gym.spaces.Tuple):
            size = 0
            all_discrete = True
@@ -131,15 +129,19 @@ class ModelCatalog(object):
                    all_discrete = False
                    size += np.product(action_space.spaces[i].shape)
            return tf.placeholder(
-                tf.int64 if all_discrete else tf.float32, shape=(None, size),
+                tf.int64 if all_discrete else tf.float32,
+                shape=(None, size),
                name="action")
        else:
            raise NotImplementedError("action space {}"
                                      " not supported".format(action_space))

    @staticmethod
-    def get_model(
-            inputs, num_outputs, options=None, state_in=None, seq_lens=None):
+    def get_model(inputs,
+                  num_outputs,
+                  options=None,
+                  state_in=None,
+                  seq_lens=None):
        """Returns a suitable model conforming to given input and output specs.

        Args:
@@ -154,12 +156,12 @@ class ModelCatalog(object):
        """

        options = options or {}
-        model = ModelCatalog._get_model(
-            inputs, num_outputs, options, state_in, seq_lens)
+        model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
+                                        seq_lens)

        if options.get("use_lstm"):
-            model = LSTM(
-                model.last_layer, num_outputs, options, state_in, seq_lens)
+            model = LSTM(model.last_layer, num_outputs, options, state_in,
+                         seq_lens)

        return model

@@ -169,16 +171,20 @@ class ModelCatalog(object):
            model = options["custom_model"]
            print("Using custom model {}".format(model))
            return _global_registry.get(RLLIB_MODEL, model)(
-                inputs, num_outputs, options,
-                state_in=state_in, seq_lens=seq_lens)
+                inputs,
+                num_outputs,
+                options,
+                state_in=state_in,
+                seq_lens=seq_lens)

        obs_rank = len(inputs.shape) - 1

        # num_outputs > 1 used to avoid hitting this with the value function
-        if isinstance(options.get("custom_options", {}).get(
-          "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
-            return MultiAgentFullyConnectedNetwork(
-                inputs, num_outputs, options)
+        if isinstance(
+                options.get("custom_options", {}).get(
+                    "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
+            return MultiAgentFullyConnectedNetwork(inputs, num_outputs,
+                                                   options)

        if obs_rank > 1:
            return VisionNetwork(inputs, num_outputs, options)
@@ -198,10 +204,10 @@ class ModelCatalog(object):
        Returns:
            model (Model): Neural network model.
        """
-        from ray.rllib.models.pytorch.fcnet import (
-            FullyConnectedNetwork as PyTorchFCNet)
-        from ray.rllib.models.pytorch.visionnet import (
-            VisionNetwork as PyTorchVisionNet)
+        from ray.rllib.models.pytorch.fcnet import (FullyConnectedNetwork as
+                                                    PyTorchFCNet)
+        from ray.rllib.models.pytorch.visionnet import (VisionNetwork as
+                                                        PyTorchVisionNet)

        if "custom_model" in options:
            model = options["custom_model"]
@@ -232,9 +238,8 @@ class ModelCatalog(object):
        """
        for k in options.keys():
            if k not in MODEL_CONFIGS:
-                raise Exception(
-                    "Unknown config key `{}`, all keys: {}".format(
-                        k, MODEL_CONFIGS))
+                raise Exception("Unknown config key `{}`, all keys: {}".format(
+                    k, MODEL_CONFIGS))

        if "custom_preprocessor" in options:
            preprocessor = options["custom_preprocessor"]
@@ -271,8 +276,8 @@ class ModelCatalog(object):
            preprocessor_name (str): Name to register the preprocessor under.
            preprocessor_class (type): Python class of the preprocessor.
        """
-        _global_registry.register(
-            RLLIB_PREPROCESSOR, preprocessor_name, preprocessor_class)
+        _global_registry.register(RLLIB_PREPROCESSOR, preprocessor_name,
+                                  preprocessor_class)

    @staticmethod
    def register_custom_model(model_name, model_class):
@@ -22,14 +22,17 @@ class FullyConnectedNetwork(Model):
            for size in hiddens:
                label = "fc{}".format(i)
                last_layer = slim.fully_connected(
-                    last_layer, size,
+                    last_layer,
+                    size,
                    weights_initializer=normc_initializer(1.0),
                    activation_fn=activation,
                    scope=label)
                i += 1
            label = "fc_out"
            output = slim.fully_connected(
-                last_layer, num_outputs,
+                last_layer,
+                num_outputs,
                weights_initializer=normc_initializer(0.01),
-                activation_fn=None, scope=label)
+                activation_fn=None,
+                scope=label)
            return output, last_layer
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """LSTM support for RLlib.

 The main trick here is that we add the time dimension at the last moment.
@@ -14,7 +13,6 @@ See the add_time_dimension() and chop_into_sequences() functions below for
 more info.
 """

-
 import numpy as np
 import tensorflow as tf
 import tensorflow.contrib.rnn as rnn
@@ -46,14 +44,13 @@ def add_time_dimension(padded_inputs, seq_lens):

    # Dynamically reshape the padded batch to introduce a time dimension.
    new_batch_size = padded_batch_size // max_seq_len
-    new_shape = (
-        [new_batch_size, max_seq_len] +
-        padded_inputs.get_shape().as_list()[1:])
+    new_shape = ([new_batch_size, max_seq_len] +
+                 padded_inputs.get_shape().as_list()[1:])
    return tf.reshape(padded_inputs, new_shape)


-def chop_into_sequences(
-        time_column, feature_columns, state_columns, max_seq_len):
+def chop_into_sequences(time_column, feature_columns, state_columns,
+                        max_seq_len):
    """Truncate and pad experiences into fixed-length sequences.

    Arguments:
@@ -106,7 +103,7 @@ def chop_into_sequences(
    feature_sequences = []
    for f in feature_columns:
        f = np.array(f)
-        f_pad = np.zeros((len(seq_lens) * max_seq_len,) + np.shape(f)[1:])
+        f_pad = np.zeros((len(seq_lens) * max_seq_len, ) + np.shape(f)[1:])
        seq_base = 0
        i = 0
        for l in seq_lens:
@@ -152,7 +149,8 @@ class LSTM(Model):
            lstm = rnn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
        self.state_init = [
            np.zeros(lstm.state_size.c, np.float32),
-            np.zeros(lstm.state_size.h, np.float32)]
+            np.zeros(lstm.state_size.h, np.float32)
+        ]

        # Setup LSTM inputs
        if self.state_in:
@@ -170,12 +168,15 @@ class LSTM(Model):
        else:
            state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in)
        lstm_out, lstm_state = tf.nn.dynamic_rnn(
-            lstm, last_layer, initial_state=state_in,
-            sequence_length=self.seq_lens, time_major=False)
+            lstm,
+            last_layer,
+            initial_state=state_in,
+            sequence_length=self.seq_lens,
+            time_major=False)
        self.state_out = list(lstm_state)

        # Compute outputs
        last_layer = tf.reshape(lstm_out, [-1, cell_size])
-        logits = linear(
-            last_layer, num_outputs, "action", normc_initializer(0.01))
+        logits = linear(last_layer, num_outputs, "action",
+                        normc_initializer(0.01))
        return logits, last_layer
@@ -11,6 +11,7 @@ def normc_initializer(std=1.0):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
+
    return _initializer


@@ -18,12 +19,20 @@ def get_activation_fn(name):
    return getattr(tf.nn, name)


-def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
-           dtype=tf.float32, collections=None):
+def conv2d(x,
+           num_filters,
+           name,
+           filter_size=(3, 3),
+           stride=(1, 1),
+           pad="SAME",
+           dtype=tf.float32,
+           collections=None):
    with tf.variable_scope(name):
        stride_shape = [1, stride[0], stride[1], 1]
-        filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]),
-                        num_filters]
+        filter_shape = [
+            filter_size[0], filter_size[1],
+            int(x.get_shape()[3]), num_filters
+        ]

        # There are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit.
@@ -34,20 +43,24 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
        # Initialize weights with random weights.
        w_bound = np.sqrt(6 / (fan_in + fan_out))

-        w = tf.get_variable("W", filter_shape, dtype,
-                            tf.random_uniform_initializer(-w_bound, w_bound),
-                            collections=collections)
-        b = tf.get_variable("b", [1, 1, 1, num_filters],
-                            initializer=tf.constant_initializer(0.0),
-                            collections=collections)
+        w = tf.get_variable(
+            "W",
+            filter_shape,
+            dtype,
+            tf.random_uniform_initializer(-w_bound, w_bound),
+            collections=collections)
+        b = tf.get_variable(
+            "b", [1, 1, 1, num_filters],
+            initializer=tf.constant_initializer(0.0),
+            collections=collections)
        return tf.nn.conv2d(x, w, stride_shape, pad) + b


 def linear(x, size, name, initializer=None, bias_init=0):
-    w = tf.get_variable(name + "/w", [x.get_shape()[1], size],
-                        initializer=initializer)
-    b = tf.get_variable(name + "/b", [size],
-                        initializer=tf.constant_initializer(bias_init))
+    w = tf.get_variable(
+        name + "/w", [x.get_shape()[1], size], initializer=initializer)
+    b = tf.get_variable(
+        name + "/b", [size], initializer=tf.constant_initializer(bias_init))
    return tf.matmul(x, w) + b


@@ -37,8 +37,12 @@ class Model(object):
    a scale parameter (like a standard deviation).
    """

-    def __init__(
-            self, inputs, num_outputs, options, state_in=None, seq_lens=None):
+    def __init__(self,
+                 inputs,
+                 num_outputs,
+                 options,
+                 state_in=None,
+                 seq_lens=None):
        self.inputs = inputs

        # Default attribute values for the non-RNN case
@@ -57,8 +61,10 @@ class Model(object):
        self.outputs, self.last_layer = self._build_layers(
            inputs, num_outputs, options)
        if options.get("free_log_std", False):
-            log_std = tf.get_variable(name="log_std", shape=[num_outputs],
-                                      initializer=tf.zeros_initializer)
+            log_std = tf.get_variable(
+                name="log_std",
+                shape=[num_outputs],
+                initializer=tf.zeros_initializer)
            self.outputs = tf.concat(
                [self.outputs, 0.0 * self.outputs + log_std], 1)

@@ -23,7 +23,7 @@ class MultiAgentFullyConnectedNetwork(Model):

        custom_options = options["custom_options"]
        hiddens = custom_options.get("multiagent_fcnet_hiddens",
-                                     [[256, 256]]*1)
+                                     [[256, 256]] * 1)

        # check for a shared model
        shared_model = custom_options.get("multiagent_shared_model", 0)
@@ -35,8 +35,8 @@ class MultiAgentFullyConnectedNetwork(Model):
                sub_options = options.copy()
                sub_options.update({"fcnet_hiddens": hiddens[i]})
                # TODO(ev) make this support arbitrary networks
-                fcnet = FullyConnectedNetwork(
-                    split_inputs[i], int(num_actions[i]), sub_options)
+                fcnet = FullyConnectedNetwork(split_inputs[i],
+                                              int(num_actions[i]), sub_options)
                output = fcnet.outputs
                outputs.append(output)
        overall_output = tf.concat(outputs, axis=1)
@@ -6,7 +6,7 @@ import numpy as np
 import gym

 ATARI_OBS_SHAPE = (210, 160, 3)
-ATARI_RAM_OBS_SHAPE = (128,)
+ATARI_RAM_OBS_SHAPE = (128, )


 class Preprocessor(object):
@@ -70,7 +70,7 @@ class AtariPixelPreprocessor(Preprocessor):

 class AtariRamPreprocessor(Preprocessor):
    def _init(self):
-        self.shape = (128,)
+        self.shape = (128, )

    def transform(self, observation):
        return (observation - 128) / 128
@@ -78,7 +78,7 @@ class AtariRamPreprocessor(Preprocessor):

 class OneHotPreprocessor(Preprocessor):
    def _init(self):
-        self.shape = (self._obs_space.n,)
+        self.shape = (self._obs_space.n, )

    def transform(self, observation):
        arr = np.zeros(self._obs_space.n)
@@ -111,13 +111,14 @@ class TupleFlatteningPreprocessor(Preprocessor):
            preprocessor = get_preprocessor(space)(space, self._options)
            self.preprocessors.append(preprocessor)
            size += np.product(preprocessor.shape)
-        self.shape = (size,)
+        self.shape = (size, )

    def transform(self, observation):
        assert len(observation) == len(self.preprocessors), observation
        return np.concatenate([
            np.reshape(p.transform(o), [np.product(p.shape)])
-            for (o, p) in zip(observation, self.preprocessors)])
+            for (o, p) in zip(observation, self.preprocessors)
+        ])


 def get_preprocessor(space):
@@ -22,14 +22,27 @@ class VisionNetwork(Model):
        with tf.name_scope("vision_net"):
            for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
                inputs = slim.conv2d(
-                    inputs, out_size, kernel, stride,
-                    activation_fn=activation, scope="conv{}".format(i))
+                    inputs,
+                    out_size,
+                    kernel,
+                    stride,
+                    activation_fn=activation,
+                    scope="conv{}".format(i))
            out_size, kernel, stride = filters[-1]
            fc1 = slim.conv2d(
-                inputs, out_size, kernel, stride,
-                activation_fn=activation, padding="VALID", scope="fc1")
-            fc2 = slim.conv2d(fc1, num_outputs, [1, 1], activation_fn=None,
-                              normalizer_fn=None, scope="fc2")
+                inputs,
+                out_size,
+                kernel,
+                stride,
+                activation_fn=activation,
+                padding="VALID",
+                scope="fc1")
+            fc2 = slim.conv2d(
+                fc1,
+                num_outputs, [1, 1],
+                activation_fn=None,
+                normalizer_fn=None,
+                scope="fc2")
            return flatten(fc2), flatten(fc1)


@@ -6,7 +6,6 @@ from ray.rllib.optimizers.sync_samples_optimizer import SyncSamplesOptimizer
 from ray.rllib.optimizers.sync_replay_optimizer import SyncReplayOptimizer
 from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer

-
 __all__ = [
    "PolicyOptimizer", "AsyncSamplesOptimizer", "AsyncGradientsOptimizer",
    "SyncSamplesOptimizer", "SyncReplayOptimizer", "LocalMultiGPUOptimizer"
@@ -14,6 +14,7 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
    evaluators, sending updated weights back as needed. This pipelines the
    gradient computations on the remote workers.
    """
+
    def _init(self, grads_per_step=100):
        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
@@ -55,8 +56,9 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
                    num_gradients += 1

    def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "wait_time_ms": round(1000 * self.wait_timer.mean, 3),
-            "apply_time_ms": round(1000 * self.apply_timer.mean, 3),
-            "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "wait_time_ms": round(1000 * self.wait_timer.mean, 3),
+                "apply_time_ms": round(1000 * self.apply_timer.mean, 3),
+                "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
+            })
@@ -22,7 +22,6 @@ from ray.rllib.utils.actors import TaskPool, create_colocated
 from ray.rllib.utils.timer import TimerStat
 from ray.rllib.utils.window_stat import WindowStat

-
 SAMPLE_QUEUE_DEPTH = 2
 REPLAY_QUEUE_DEPTH = 4
 LEARNER_QUEUE_MAX_SIZE = 16
@@ -35,10 +34,10 @@ class ReplayActor(object):
    Ray actors are single-threaded, so for scalability multiple replay actors
    may be created to increase parallelism."""

-    def __init__(
-            self, num_shards, learning_starts, buffer_size, train_batch_size,
-            prioritized_replay_alpha, prioritized_replay_beta,
-            prioritized_replay_eps, clip_rewards):
+    def __init__(self, num_shards, learning_starts, buffer_size,
+                 train_batch_size, prioritized_replay_alpha,
+                 prioritized_replay_beta, prioritized_replay_eps,
+                 clip_rewards):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
@@ -46,7 +45,8 @@ class ReplayActor(object):
        self.prioritized_replay_eps = prioritized_replay_eps

        self.replay_buffer = PrioritizedReplayBuffer(
-            self.buffer_size, alpha=prioritized_replay_alpha,
+            self.buffer_size,
+            alpha=prioritized_replay_alpha,
            clip_rewards=clip_rewards)

        # Metrics
@@ -60,38 +60,39 @@ class ReplayActor(object):
    def add_batch(self, batch):
        with self.add_batch_timer:
            for row in batch.rows():
-                self.replay_buffer.add(
-                    row["obs"], row["actions"], row["rewards"], row["new_obs"],
-                    row["dones"], row["weights"])
+                self.replay_buffer.add(row["obs"], row["actions"],
+                                       row["rewards"], row["new_obs"],
+                                       row["dones"], row["weights"])

    def replay(self):
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

-            (obses_t, actions, rewards, obses_tp1,
-                dones, weights, batch_indexes) = self.replay_buffer.sample(
-                    self.train_batch_size,
-                    beta=self.prioritized_replay_beta)
+            (obses_t, actions, rewards, obses_tp1, dones, weights,
+             batch_indexes) = self.replay_buffer.sample(
+                 self.train_batch_size, beta=self.prioritized_replay_beta)

            batch = SampleBatch({
-                "obs": obses_t, "actions": actions, "rewards": rewards,
-                "new_obs": obses_tp1, "dones": dones, "weights": weights,
-                "batch_indexes": batch_indexes})
+                "obs": obses_t,
+                "actions": actions,
+                "rewards": rewards,
+                "new_obs": obses_tp1,
+                "dones": dones,
+                "weights": weights,
+                "batch_indexes": batch_indexes
+            })
            return batch

    def update_priorities(self, batch_indexes, td_errors):
        with self.update_priorities_timer:
-            new_priorities = (
-                np.abs(td_errors) + self.prioritized_replay_eps)
+            new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch_indexes, new_priorities)

    def stats(self):
        stat = {
-            "add_batch_time_ms": round(
-                1000 * self.add_batch_timer.mean, 3),
-            "replay_time_ms": round(
-                1000 * self.replay_timer.mean, 3),
+            "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
+            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms": round(
                1000 * self.update_priorities_timer.mean, 3),
        }
@@ -145,13 +146,19 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
    "td_error" array in the info return of compute_gradients(). This error
    term will be used for sample prioritization."""

-    def _init(
-            self, learning_starts=1000, buffer_size=10000,
-            prioritized_replay=True, prioritized_replay_alpha=0.6,
-            prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
-            train_batch_size=512, sample_batch_size=50,
-            num_replay_buffer_shards=1, max_weight_sync_delay=400,
-            clip_rewards=True, debug=False):
+    def _init(self,
+              learning_starts=1000,
+              buffer_size=10000,
+              prioritized_replay=True,
+              prioritized_replay_alpha=0.6,
+              prioritized_replay_beta=0.4,
+              prioritized_replay_eps=1e-6,
+              train_batch_size=512,
+              sample_batch_size=50,
+              num_replay_buffer_shards=1,
+              max_weight_sync_delay=400,
+              clip_rewards=True,
+              debug=False):

        self.debug = debug
        self.replay_starts = learning_starts
@@ -164,18 +171,21 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
        self.learner = LearnerThread(self.local_evaluator)
        self.learner.start()

-        self.replay_actors = create_colocated(
-            ReplayActor,
-            [num_replay_buffer_shards, learning_starts, buffer_size,
-             train_batch_size, prioritized_replay_alpha,
-             prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
-            num_replay_buffer_shards)
+        self.replay_actors = create_colocated(ReplayActor, [
+            num_replay_buffer_shards, learning_starts, buffer_size,
+            train_batch_size, prioritized_replay_alpha,
+            prioritized_replay_beta, prioritized_replay_eps, clip_rewards
+        ], num_replay_buffer_shards)
        assert len(self.remote_evaluators) > 0

        # Stats
-        self.timers = {k: TimerStat() for k in [
-            "put_weights", "get_samples", "enqueue", "sample_processing",
-            "replay_processing", "update_priorities", "train", "sample"]}
+        self.timers = {
+            k: TimerStat()
+            for k in [
+                "put_weights", "get_samples", "enqueue", "sample_processing",
+                "replay_processing", "update_priorities", "train", "sample"
+            ]
+        }
        self.num_weight_syncs = 0
        self.learning_started = False

@@ -221,8 +231,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
                sample_timesteps += self.sample_batch_size

                # Send the data to the replay buffer
-                random.choice(self.replay_actors).add_batch.remote(
-                    sample_batch)
+                random.choice(
+                    self.replay_actors).add_batch.remote(sample_batch)

                # Update weights if needed
                self.steps_since_update[ev] += self.sample_batch_size
@@ -268,8 +278,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
        timing["learner_dequeue_time_ms"] = round(
            1000 * self.learner.queue_timer.mean, 3)
        stats = {
-            "sample_throughput": round(
-                self.timers["sample"].mean_throughput, 3),
+            "sample_throughput": round(self.timers["sample"].mean_throughput,
+                                       3),
            "train_throughput": round(self.timers["train"].mean_throughput, 3),
            "num_weight_syncs": self.num_weight_syncs,
        }
@@ -6,7 +6,6 @@ from collections import namedtuple

 import tensorflow as tf

-
 # Variable scope in which created variables will be placed under
 TOWER_SCOPE_NAME = "tower"

@@ -47,8 +46,14 @@ class LocalSyncParallelOptimizer(object):
        grad_norm_clipping: None or int stdev to clip grad norms by
    """

-    def __init__(self, optimizer, devices, input_placeholders, rnn_inputs,
-                 per_device_batch_size, build_graph, logdir,
+    def __init__(self,
+                 optimizer,
+                 devices,
+                 input_placeholders,
+                 rnn_inputs,
+                 per_device_batch_size,
+                 build_graph,
+                 logdir,
                 grad_norm_clipping=None):
        # TODO(rliaw): remove logdir
        self.optimizer = optimizer
@@ -78,8 +83,8 @@ class LocalSyncParallelOptimizer(object):
        self._towers = []
        for device, device_placeholders in zip(self.devices, data_splits):
            self._towers.append(
-                self._setup_device(
-                    device, device_placeholders, len(input_placeholders)))
+                self._setup_device(device, device_placeholders,
+                                   len(input_placeholders)))

        avg = average_gradients([t.grads for t in self._towers])
        if grad_norm_clipping:
@@ -119,14 +124,10 @@ class LocalSyncParallelOptimizer(object):
            assert len(state_inputs[0]) * seq_len == len(inputs[0])
            # Make sure the shorter state inputs arrays are evenly divisible
            state_inputs = [
-                make_divisible_by(arr, self.batch_size)
-                for arr in state_inputs
+                make_divisible_by(arr, self.batch_size) for arr in state_inputs
            ]
            # Then truncate the data inputs to match
-            inputs = [
-                arr[:len(state_inputs[0]) * seq_len]
-                for arr in inputs
-            ]
+            inputs = [arr[:len(state_inputs[0]) * seq_len] for arr in inputs]
            assert len(state_inputs[0]) * seq_len == len(inputs[0])
            assert len(state_inputs[0]) % self.batch_size == 0
            for ph, arr in zip(self.loss_inputs, inputs + state_inputs):
@@ -138,8 +139,7 @@ class LocalSyncParallelOptimizer(object):
                feed_dict[ph] = truncated_arr
                truncated_len = len(truncated_arr)

-        sess.run(
-            [t.init_op for t in self._towers], feed_dict=feed_dict)
+        sess.run([t.init_op for t in self._towers], feed_dict=feed_dict)

        tuples_per_device = truncated_len / len(self.devices)
        assert tuples_per_device > 0, \
@@ -198,7 +198,9 @@ class LocalSyncParallelOptimizer(object):
                device_input_slices = []
                for i, ph in enumerate(device_input_placeholders):
                    current_batch = tf.Variable(
-                        ph, trainable=False, validate_shape=False,
+                        ph,
+                        trainable=False,
+                        validate_shape=False,
                        collections=[])
                    device_input_batches.append(current_batch)
                    if i < num_data_in:
@@ -210,18 +212,17 @@ class LocalSyncParallelOptimizer(object):
                    current_slice = tf.slice(
                        current_batch,
                        ([self._batch_index // scale * granularity] +
-                            [0] * len(ph.shape[1:])),
+                         [0] * len(ph.shape[1:])),
                        ([self.per_device_batch_size // scale * granularity] +
-                            [-1] * len(ph.shape[1:])))
+                         [-1] * len(ph.shape[1:])))
                    current_slice.set_shape(ph.shape)
                    device_input_slices.append(current_slice)
                graph_obj = self.build_graph(device_input_slices)
                device_grads = graph_obj.gradients(self.optimizer)
            return Tower(
-                tf.group(*[batch.initializer
-                           for batch in device_input_batches]),
-                device_grads,
-                graph_obj)
+                tf.group(
+                    *[batch.initializer for batch in device_input_batches]),
+                device_grads, graph_obj)


 # Each tower is a copy of the loss graph pinned to a specific device.
@@ -30,8 +30,12 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
    may result in unexpected behavior.
    """

-    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
-              timesteps_per_batch=1024, standardize_fields=[]):
+    def _init(self,
+              sgd_batch_size=128,
+              sgd_stepsize=5e-5,
+              num_sgd_iter=10,
+              timesteps_per_batch=1024,
+              standardize_fields=[]):
        self.batch_size = sgd_batch_size
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
@@ -41,8 +45,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
-        self.batch_size = int(
-                sgd_batch_size / len(self.devices)) * len(self.devices)
+        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
+            self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
@@ -70,16 +74,15 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
                with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
                    if self.policy._state_inputs:
                        rnn_inputs = self.policy._state_inputs + [
-                            self.policy._seq_lens]
+                            self.policy._seq_lens
+                        ]
                    else:
                        rnn_inputs = []
                    self.par_opt = LocalSyncParallelOptimizer(
-                        tf.train.AdamOptimizer(self.sgd_stepsize),
-                        self.devices,
-                        [v for _, v in self.policy.loss_inputs()],
-                        rnn_inputs,
-                        self.per_device_batch_size,
-                        self.policy.copy,
+                        tf.train.AdamOptimizer(
+                            self.sgd_stepsize), self.devices,
+                        [v for _, v in self.policy.loss_inputs()], rnn_inputs,
+                        self.per_device_batch_size, self.policy.copy,
                        os.getcwd())

                self.sess = self.local_evaluator.tf_sess
@@ -117,8 +120,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
            else:
                state_keys = []
            tuples_per_device = self.par_opt.load_data(
-                self.sess,
-                [tuples[k] for k in data_keys],
+                self.sess, [tuples[k] for k in data_keys],
                [tuples[k] for k in state_keys])

        with self.grad_timer:
@@ -141,12 +143,14 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
        return _averaged(iter_extra_fetches)

    def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
-            "load_time_ms": round(1000 * self.load_timer.mean, 3),
-            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
-            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
+                "load_time_ms": round(1000 * self.load_timer.mean, 3),
+                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
+                "update_time_ms": round(1000 * self.update_weights_timer.mean,
+                                        3),
+            })


 def _averaged(kv):
@@ -103,9 +103,10 @@ class PolicyOptimizer(object):
        """

        local_result = [func(self.local_evaluator, 0)]
-        remote_results = ray.get(
-            [ev.apply.remote(func, i + 1)
-             for i, ev in enumerate(self.remote_evaluators)])
+        remote_results = ray.get([
+            ev.apply.remote(func, i + 1)
+            for i, ev in enumerate(self.remote_evaluators)
+        ])
        return local_result + remote_results

    def collect_metrics(self):
@@ -90,8 +90,10 @@ class ReplayBuffer(object):
          done_mask[i] = 1 if executing act_batch[i] resulted in
          the end of an episode and 0 otherwise.
        """
-        idxes = [random.randint(0, len(self._storage) - 1)
-                 for _ in range(batch_size)]
+        idxes = [
+            random.randint(0,
+                           len(self._storage) - 1) for _ in range(batch_size)
+        ]
        self._num_sampled += batch_size
        return self._encode_sample(idxes)

@@ -142,12 +144,12 @@ class PrioritizedReplayBuffer(ReplayBuffer):
            reward = np.sign(reward)

        idx = self._next_idx
-        super(PrioritizedReplayBuffer, self).add(
-            obs_t, action, reward, obs_tp1, done, weight)
+        super(PrioritizedReplayBuffer, self).add(obs_t, action, reward,
+                                                 obs_tp1, done, weight)
        if weight is None:
            weight = self._max_priority
-        self._it_sum[idx] = weight ** self._alpha
-        self._it_min[idx] = weight ** self._alpha
+        self._it_sum[idx] = weight**self._alpha
+        self._it_min[idx] = weight**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
@@ -202,11 +204,11 @@ class PrioritizedReplayBuffer(ReplayBuffer):

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
-        max_weight = (p_min * len(self._storage)) ** (-beta)
+        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
-            weight = (p_sample * len(self._storage)) ** (-beta)
+            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
@@ -231,10 +233,10 @@ class PrioritizedReplayBuffer(ReplayBuffer):
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
-            delta = priority ** self._alpha - self._it_sum[idx]
+            delta = priority**self._alpha - self._it_sum[idx]
            self._prio_change_stats.push(delta)
-            self._it_sum[idx] = priority ** self._alpha
-            self._it_min[idx] = priority ** self._alpha
+            self._it_sum[idx] = priority**self._alpha
+            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)

@@ -54,8 +54,7 @@ class SegmentTree(object):
                return self._operation(
                    self._reduce_helper(start, mid, 2 * node, node_start, mid),
                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1,
-                                        node_end)
-                )
+                                        node_end))

    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
@@ -89,9 +88,8 @@ class SegmentTree(object):
        self._value[idx] = val
        idx //= 2
        while idx >= 1:
-            self._value[idx] = self._operation(
-                self._value[2 * idx],
-                self._value[2 * idx + 1])
+            self._value[idx] = self._operation(self._value[2 * idx],
+                                               self._value[2 * idx + 1])
            idx //= 2

    def __getitem__(self, idx):
@@ -102,9 +100,7 @@ class SegmentTree(object):
 class SumSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super(SumSegmentTree, self).__init__(
-            capacity=capacity,
-            operation=operator.add,
-            neutral_element=0.0)
+            capacity=capacity, operation=operator.add, neutral_element=0.0)

    def sum(self, start=0, end=None):
        """Returns arr[start] + ... + arr[end]"""
@@ -142,9 +138,7 @@ class SumSegmentTree(SegmentTree):
 class MinSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super(MinSegmentTree, self).__init__(
-            capacity=capacity,
-            operation=min,
-            neutral_element=float('inf'))
+            capacity=capacity, operation=min, neutral_element=float('inf'))

    def min(self, start=0, end=None):
        """Returns min(arr[start], ...,  arr[end])"""
@@ -23,11 +23,16 @@ class SyncReplayOptimizer(PolicyOptimizer):
    "td_error" array in the info return of compute_gradients(). This error
    term will be used for sample prioritization."""

-    def _init(
-            self, learning_starts=1000, buffer_size=10000,
-            prioritized_replay=True, prioritized_replay_alpha=0.6,
-            prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
-            train_batch_size=32, sample_batch_size=4, clip_rewards=True):
+    def _init(self,
+              learning_starts=1000,
+              buffer_size=10000,
+              prioritized_replay=True,
+              prioritized_replay_alpha=0.6,
+              prioritized_replay_beta=0.4,
+              prioritized_replay_eps=1e-6,
+              train_batch_size=32,
+              sample_batch_size=4,
+              clip_rewards=True):

        self.replay_starts = learning_starts
        self.prioritized_replay_beta = prioritized_replay_beta
@@ -43,11 +48,14 @@ class SyncReplayOptimizer(PolicyOptimizer):

        # Set up replay buffer
        if prioritized_replay:
+
            def new_buffer():
                return PrioritizedReplayBuffer(
-                    buffer_size, alpha=prioritized_replay_alpha,
+                    buffer_size,
+                    alpha=prioritized_replay_alpha,
                    clip_rewards=clip_rewards)
        else:
+
            def new_buffer():
                return ReplayBuffer(buffer_size, clip_rewards)

@@ -72,17 +80,19 @@ class SyncReplayOptimizer(PolicyOptimizer):

            # Handle everything as if multiagent
            if isinstance(batch, SampleBatch):
-                batch = MultiAgentBatch(
-                    {DEFAULT_POLICY_ID: batch}, batch.count)
+                batch = MultiAgentBatch({
+                    DEFAULT_POLICY_ID: batch
+                }, batch.count)

            for policy_id, s in batch.policy_batches.items():
                for row in s.rows():
                    if "weights" not in row:
                        row["weights"] = np.ones_like(row["rewards"])
                    self.replay_buffers[policy_id].add(
-                        pack_if_needed(row["obs"]), row["actions"],
-                        row["rewards"], pack_if_needed(row["new_obs"]),
-                        row["dones"], row["weights"])
+                        pack_if_needed(row["obs"]),
+                        row["actions"], row["rewards"],
+                        pack_if_needed(row["new_obs"]), row["dones"],
+                        row["weights"])

        if self.num_steps_sampled >= self.replay_starts:
            self._optimize()
@@ -112,27 +122,35 @@ class SyncReplayOptimizer(PolicyOptimizer):
        with self.replay_timer:
            for policy_id, replay_buffer in self.replay_buffers.items():
                if isinstance(replay_buffer, PrioritizedReplayBuffer):
-                    (obses_t, actions, rewards, obses_tp1,
-                        dones, weights, batch_indexes) = replay_buffer.sample(
-                            self.train_batch_size,
-                            beta=self.prioritized_replay_beta)
+                    (obses_t, actions, rewards, obses_tp1, dones, weights,
+                     batch_indexes) = replay_buffer.sample(
+                         self.train_batch_size,
+                         beta=self.prioritized_replay_beta)
                else:
                    (obses_t, actions, rewards, obses_tp1,
-                        dones) = replay_buffer.sample(self.train_batch_size)
+                     dones) = replay_buffer.sample(self.train_batch_size)
                    weights = np.ones_like(rewards)
-                    batch_indexes = - np.ones_like(rewards)
+                    batch_indexes = -np.ones_like(rewards)
            samples[policy_id] = SampleBatch({
-                "obs": obses_t, "actions": actions, "rewards": rewards,
-                "new_obs": obses_tp1, "dones": dones, "weights": weights,
-                "batch_indexes": batch_indexes})
+                "obs": obses_t,
+                "actions": actions,
+                "rewards": rewards,
+                "new_obs": obses_tp1,
+                "dones": dones,
+                "weights": weights,
+                "batch_indexes": batch_indexes
+            })
        return MultiAgentBatch(samples, self.train_batch_size)

    def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
-            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
-            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
-            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
-            "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
-            "opt_samples": round(self.grad_timer.mean_units_processed, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
+                "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
+                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
+                "update_time_ms": round(1000 * self.update_weights_timer.mean,
+                                        3),
+                "opt_peak_throughput": round(self.grad_timer.mean_throughput,
+                                             3),
+                "opt_samples": round(self.grad_timer.mean_units_processed, 3),
+            })
@@ -51,10 +51,13 @@ class SyncSamplesOptimizer(PolicyOptimizer):
        return fetches

    def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
-            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
-            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
-            "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
-            "opt_samples": round(self.grad_timer.mean_units_processed, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
+                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
+                "update_time_ms": round(1000 * self.update_weights_timer.mean,
+                                        3),
+                "opt_peak_throughput": round(self.grad_timer.mean_throughput,
+                                             3),
+                "opt_samples": round(self.grad_timer.mean_units_processed, 3),
+            })
@@ -15,7 +15,6 @@ from ray.rllib.agents.agent import get_agent_class
 from ray.rllib.agents.dqn.common.wrappers import wrap_dqn
 from ray.rllib.models import ModelCatalog

-
 EXAMPLE_USAGE = """
 Example Usage via RLlib CLI:
    rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN
@@ -32,30 +31,37 @@ def create_parser(parser_creator=None):
    parser = parser_creator(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Roll out a reinforcement learning agent "
-                    "given a checkpoint.", epilog=EXAMPLE_USAGE)
+        "given a checkpoint.",
+        epilog=EXAMPLE_USAGE)

    parser.add_argument(
        "checkpoint", type=str, help="Checkpoint from which to roll out.")
    required_named = parser.add_argument_group("required named arguments")
    required_named.add_argument(
-        "--run", type=str, required=True,
+        "--run",
+        type=str,
+        required=True,
        help="The algorithm or model to train. This may refer to the name "
-             "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
-             "user-defined trainable function or class registered in the "
-             "tune registry.")
+        "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
+        "user-defined trainable function or class registered in the "
+        "tune registry.")
    required_named.add_argument(
        "--env", type=str, help="The gym environment to use.")
    parser.add_argument(
-        "--no-render", default=False, action="store_const", const=True,
+        "--no-render",
+        default=False,
+        action="store_const",
+        const=True,
        help="Surpress rendering of the environment.")
    parser.add_argument(
        "--steps", default=None, help="Number of steps to roll out.")
+    parser.add_argument("--out", default=None, help="Output filename.")
    parser.add_argument(
-        "--out", default=None, help="Output filename.")
-    parser.add_argument(
-        "--config", default="{}", type=json.loads,
+        "--config",
+        default="{}",
+        type=json.loads,
        help="Algorithm-specific configuration (e.g. env, hyperparams). "
-             "Surpresses loading of configuration from checkpoint.")
+        "Surpresses loading of configuration from checkpoint.")
    return parser


@@ -9,7 +9,6 @@ import argparse
 from ray.rllib import train
 from ray.rllib import rollout

-
 EXAMPLE_USAGE = """
 Example usage for training:
    rllib train --run DQN --env CartPole-v0
@@ -15,16 +15,17 @@ class _MockEvaluator(object):
        self._sample_count = sample_count
        self.obs_filter = MeanStdFilter(())
        self.rew_filter = MeanStdFilter(())
-        self.filters = {"obs_filter": self.obs_filter,
-                        "rew_filter": self.rew_filter}
+        self.filters = {
+            "obs_filter": self.obs_filter,
+            "rew_filter": self.rew_filter
+        }

    def sample(self):
        samples_dict = {"observations": [], "rewards": []}
        for i in range(self._sample_count):
            samples_dict["observations"].append(
                self.obs_filter(np.random.randn()))
-            samples_dict["rewards"].append(
-                self.rew_filter(np.random.randn()))
+            samples_dict["rewards"].append(self.rew_filter(np.random.randn()))
        return SampleBatch(samples_dict)

    def compute_gradients(self, samples):
@@ -8,8 +8,8 @@ import ray

 from ray.rllib.models import ModelCatalog
 from ray.rllib.models.model import Model
-from ray.rllib.models.preprocessors import (
-    NoPreprocessor, OneHotPreprocessor, Preprocessor)
+from ray.rllib.models.preprocessors import (NoPreprocessor, OneHotPreprocessor,
+                                            Preprocessor)
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.visionnet import VisionNetwork

@@ -44,9 +44,11 @@ class ModelCatalogTest(unittest.TestCase):
        class TupleEnv(object):
            def __init__(self):
                self.observation_space = Tuple(
-                    [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
+                    [Discrete(5),
+                     Box(0, 1, shape=(3, ), dtype=np.float32)])
+
        p1 = ModelCatalog.get_preprocessor(TupleEnv())
-        self.assertEqual(p1.shape, (8,))
+        self.assertEqual(p1.shape, (8, ))
        self.assertEqual(
            list(p1.transform((0, [1, 2, 3]))),
            [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
@@ -20,12 +20,24 @@ def get_mean_action(alg, obs):
 ray.init(num_cpus=10)

 CONFIGS = {
-    "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
-           "num_workers": 2},
+    "ES": {
+        "episodes_per_batch": 10,
+        "timesteps_per_batch": 100,
+        "num_workers": 2
+    },
    "DQN": {},
-    "DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
-    "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
-    "A3C": {"num_workers": 1},
+    "DDPG": {
+        "noise_scale": 0.0,
+        "timesteps_per_iteration": 100
+    },
+    "PPO": {
+        "num_sgd_iter": 5,
+        "timesteps_per_batch": 1000,
+        "num_workers": 2
+    },
+    "A3C": {
+        "num_workers": 1
+    },
 }


@@ -13,7 +13,7 @@ from ray.rllib.test.mock_evaluator import _MockEvaluator

 class RunningStatTest(unittest.TestCase):
    def testRunningStat(self):
-        for shp in ((), (3,), (3, 4)):
+        for shp in ((), (3, ), (3, 4)):
            li = []
            rs = RunningStat(shp)
            for _ in range(5):
@@ -22,12 +22,12 @@ class RunningStatTest(unittest.TestCase):
                li.append(val)
                m = np.mean(li, axis=0)
                self.assertTrue(np.allclose(rs.mean, m))
-                v = (np.square(m) if (len(li) == 1)
-                     else np.var(li, ddof=1, axis=0))
+                v = (np.square(m)
+                     if (len(li) == 1) else np.var(li, ddof=1, axis=0))
                self.assertTrue(np.allclose(rs.var, v))

    def testCombiningStat(self):
-        for shape in [(), (3,), (3, 4)]:
+        for shape in [(), (3, ), (3, 4)]:
            li = []
            rs1 = RunningStat(shape)
            rs2 = RunningStat(shape)
@@ -48,7 +48,7 @@ class RunningStatTest(unittest.TestCase):

 class MSFTest(unittest.TestCase):
    def testBasic(self):
-        for shape in [(), (3,), (3, 4, 4)]:
+        for shape in [(), (3, ), (3, 4, 4)]:
            filt = MeanStdFilter(shape)
            for i in range(5):
                filt(np.ones(shape))
@@ -93,8 +93,10 @@ class FilterManagerTest(unittest.TestCase):
        remote_e = RemoteEvaluator.remote(sample_count=10)
        remote_e.sample.remote()

-        FilterManager.synchronize(
-            {"obs_filter": filt1, "rew_filter": filt1.copy()}, [remote_e])
+        FilterManager.synchronize({
+            "obs_filter": filt1,
+            "rew_filter": filt1.copy()
+        }, [remote_e])

        filters = ray.get(remote_e.get_filters.remote())
        obs_f = filters["obs_filter"]
@@ -10,22 +10,15 @@ from ray.rllib.models.lstm import chop_into_sequences
 class LSTMUtilsTest(unittest.TestCase):
    def testBasic(self):
        t = [1, 2, 3, 1, 2, 3, 4, 5]
-        f = [
-            [101, 102, 103, 201, 202, 203, 204, 205],
-            [[101], [102], [103], [201], [202], [203], [204], [205]]
-        ]
+        f = [[101, 102, 103, 201, 202, 203, 204, 205],
+             [[101], [102], [103], [201], [202], [203], [204], [205]]]
        s = [[209, 208, 207, 109, 108, 107, 106, 105]]
        f_pad, s_init, seq_lens = chop_into_sequences(t, f, s, 4)
-        self.assertEqual(
-            [f.tolist() for f in f_pad],
-            [
-                [101, 102, 103, 0,
-                 201, 202, 203, 204,
-                 205, 0, 0, 0],
-                [[101], [102], [103], [0],
-                 [201], [202], [203], [204],
-                 [205], [0], [0], [0]],
-            ])
+        self.assertEqual([f.tolist() for f in f_pad], [
+            [101, 102, 103, 0, 201, 202, 203, 204, 205, 0, 0, 0],
+            [[101], [102], [103], [0], [201], [202], [203], [204], [205], [0],
+             [0], [0]],
+        ])
        self.assertEqual([s.tolist() for s in s_init], [[209, 109, 105]])
        self.assertEqual(seq_lens.tolist(), [3, 4, 1])

@@ -129,12 +129,21 @@ class TestMultiAgentEnv(unittest.TestCase):
            obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
            self.assertEqual(obs, {0: 0, 1: 0, 2: 0, 3: 0})
            self.assertEqual(rew, {0: 1, 1: 1, 2: 1, 3: 1})
-            self.assertEqual(
-                done,
-                {0: False, 1: False, 2: False, 3: False, "__all__": False})
+            self.assertEqual(done, {
+                0: False,
+                1: False,
+                2: False,
+                3: False,
+                "__all__": False
+            })
        obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
-        self.assertEqual(
-            done, {0: True, 1: True, 2: True, 3: True, "__all__": True})
+        self.assertEqual(done, {
+            0: True,
+            1: True,
+            2: True,
+            3: True,
+            "__all__": True
+        })

    def testRoundRobinMock(self):
        env = RoundRobinMultiAgent(2)
@@ -156,24 +165,51 @@ class TestMultiAgentEnv(unittest.TestCase):
        self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
        self.assertEqual(rew, {0: {0: None, 1: None}, 1: {0: None, 1: None}})
        self.assertEqual(
-            dones,
-            {0: {0: False, 1: False, "__all__": False},
-             1: {0: False, 1: False, "__all__": False}})
+            dones, {
+                0: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                },
+                1: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                }
+            })
        for _ in range(24):
            env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
            obs, rew, dones, _, _ = env.poll()
            self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
            self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
            self.assertEqual(
-                dones,
-                {0: {0: False, 1: False, "__all__": False},
-                 1: {0: False, 1: False, "__all__": False}})
+                dones, {
+                    0: {
+                        0: False,
+                        1: False,
+                        "__all__": False
+                    },
+                    1: {
+                        0: False,
+                        1: False,
+                        "__all__": False
+                    }
+                })
        env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
        obs, rew, dones, _, _ = env.poll()
        self.assertEqual(
-            dones,
-            {0: {0: True, 1: True, "__all__": True},
-             1: {0: True, 1: True, "__all__": True}})
+            dones, {
+                0: {
+                    0: True,
+                    1: True,
+                    "__all__": True
+                },
+                1: {
+                    0: True,
+                    1: True,
+                    "__all__": True
+                }
+            })

        # Reset processing
        self.assertRaises(
@@ -186,9 +222,18 @@ class TestMultiAgentEnv(unittest.TestCase):
        self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
        self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
        self.assertEqual(
-            dones,
-            {0: {0: False, 1: False, "__all__": False},
-             1: {0: False, 1: False, "__all__": False}})
+            dones, {
+                0: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                },
+                1: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                }
+            })

    def testVectorizeRoundRobin(self):
        env = _MultiAgentEnvToAsync(lambda: RoundRobinMultiAgent(2), [], 2)
@@ -217,9 +262,8 @@ class TestMultiAgentEnv(unittest.TestCase):
        self.assertEqual(batch.count, 50)
        self.assertEqual(batch.policy_batches["p0"].count, 150)
        self.assertEqual(batch.policy_batches["p1"].count, 100)
-        self.assertEqual(
-            batch.policy_batches["p0"]["t"].tolist(),
-            list(range(25)) * 6)
+        self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
+                         list(range(25)) * 6)

    def testMultiAgentSampleRoundRobin(self):
        act_space = gym.spaces.Discrete(2)
@@ -236,21 +280,16 @@ class TestMultiAgentEnv(unittest.TestCase):
        # since we round robin introduce agents into the env, some of the env
        # steps don't count as proper transitions
        self.assertEqual(batch.policy_batches["p0"].count, 42)
-        self.assertEqual(
-            batch.policy_batches["p0"]["obs"].tolist()[:10],
-            [0, 1, 2, 3, 4] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["new_obs"].tolist()[:10],
-            [1, 2, 3, 4, 5] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["rewards"].tolist()[:10],
-            [100, 100, 100, 100, 0] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["dones"].tolist()[:10],
-            [False, False, False, False, True] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["t"].tolist()[:10],
-            [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
+        self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10],
+                         [0, 1, 2, 3, 4] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10],
+                         [1, 2, 3, 4, 5] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
+                         [100, 100, 100, 100, 0] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
+                         [False, False, False, False, True] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
+                         [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])

    def testTrainMultiCartpoleSinglePolicy(self):
        n = 10
@@ -289,11 +328,17 @@ class TestMultiAgentEnv(unittest.TestCase):
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:
-            remote_evs = [PolicyEvaluator.as_remote().remote(
-                env_creator=lambda _: MultiCartpole(n),
-                policy_graph=policies,
-                policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
-                batch_steps=50)]
+
+            def policy_mapper(agent_id):
+                return ["p1", "p2"][agent_id % 2]
+
+            remote_evs = [
+                PolicyEvaluator.as_remote().remote(
+                    env_creator=lambda _: MultiCartpole(n),
+                    policy_graph=policies,
+                    policy_mapping_fn=policy_mapper,
+                    batch_steps=50)
+            ]
        else:
            remote_evs = []
        optimizer = optimizer_cls(ev, remote_evs, {})
@@ -330,8 +375,8 @@ class TestMultiAgentEnv(unittest.TestCase):
        obs_space = env.observation_space
        policies = {}
        for i in range(20):
-            policies["pg_{}".format(i)] = (
-                PGPolicyGraph, obs_space, act_space, {})
+            policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
+                                           {})
        policy_ids = list(policies.keys())
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
@@ -21,8 +21,8 @@ class AsyncOptimizerTest(unittest.TestCase):
        local = _MockEvaluator()
        remotes = ray.remote(_MockEvaluator)
        remote_evaluators = [remotes.remote() for i in range(5)]
-        test_optimizer = AsyncGradientsOptimizer(
-            local, remote_evaluators, {"grads_per_step": 10})
+        test_optimizer = AsyncGradientsOptimizer(local, remote_evaluators,
+                                                 {"grads_per_step": 10})
        test_optimizer.step()
        self.assertTrue(all(local.get_weights() == 0))

@@ -66,8 +66,7 @@ class MockEnv2(gym.Env):

 class MockVectorEnv(VectorEnv):
    def __init__(self, episode_length, num_envs):
-        self.envs = [
-            MockEnv(episode_length) for _ in range(num_envs)]
+        self.envs = [MockEnv(episode_length) for _ in range(num_envs)]
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)
        self.num_envs = num_envs
@@ -102,7 +101,10 @@ class TestPolicyEvaluator(unittest.TestCase):
    def testQueryEvaluators(self):
        register_env("test", lambda _: gym.make("CartPole-v0"))
        pg = PGAgent(
-            env="test", config={"num_workers": 2, "sample_batch_size": 5})
+            env="test", config={
+                "num_workers": 2,
+                "sample_batch_size": 5
+            })
        results = pg.optimizer.foreach_evaluator(lambda ev: ev.batch_steps)
        results2 = pg.optimizer.foreach_evaluator_with_index(
            lambda ev, i: (i, ev.batch_steps))
@@ -112,10 +114,12 @@ class TestPolicyEvaluator(unittest.TestCase):
    def testMetrics(self):
        ev = PolicyEvaluator(
            env_creator=lambda _: MockEnv(episode_length=10),
-            policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
+            policy_graph=MockPolicyGraph,
+            batch_mode="complete_episodes")
        remote_ev = PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: MockEnv(episode_length=10),
-            policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
+            policy_graph=MockPolicyGraph,
+            batch_mode="complete_episodes")
        ev.sample()
        ray.get(remote_ev.sample.remote())
        result = collect_metrics(ev, [remote_ev])
@@ -149,7 +153,8 @@ class TestPolicyEvaluator(unittest.TestCase):
            env_creator=lambda _: MockEnv(episode_length=20),
            policy_graph=MockPolicyGraph,
            batch_mode="truncate_episodes",
-            batch_steps=16, num_envs=8)
+            batch_steps=16,
+            num_envs=8)
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 16)
@@ -175,7 +180,8 @@ class TestPolicyEvaluator(unittest.TestCase):
            env_creator=lambda _: MockEnv(episode_length=8),
            policy_graph=MockPolicyGraph,
            batch_mode="truncate_episodes",
-            batch_steps=16, num_envs=4)
+            batch_steps=16,
+            num_envs=4)
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
        result = collect_metrics(ev, [])
@@ -186,8 +192,7 @@ class TestPolicyEvaluator(unittest.TestCase):

    def testVectorEnvSupport(self):
        ev = PolicyEvaluator(
-            env_creator=lambda _: MockVectorEnv(
-                episode_length=20, num_envs=8),
+            env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
            policy_graph=MockPolicyGraph,
            batch_mode="truncate_episodes",
            batch_steps=10)
@@ -83,8 +83,8 @@ class MultiServing(ServingEnv):
    def __init__(self, env_creator):
        self.env_creator = env_creator
        self.env = env_creator()
-        ServingEnv.__init__(
-            self, self.env.action_space, self.env.observation_space)
+        ServingEnv.__init__(self, self.env.action_space,
+                            self.env.observation_space)

    def run(self):
        envs = [self.env_creator() for _ in range(5)]
@@ -97,8 +97,7 @@ class MultiServing(ServingEnv):
                    eids[i] = uuid.uuid4().hex
                    self.start_episode(episode_id=eids[i])
                    cur_obs[i] = envs[i].reset()
-            actions = [
-                self.get_action(eids[i], cur_obs[i]) for i in active]
+            actions = [self.get_action(eids[i], cur_obs[i]) for i in active]
            for i, action in zip(active, actions):
                obs, reward, done, _ = envs[i].step(action)
                cur_obs[i] = obs
@@ -164,8 +163,7 @@ class TestServingEnv(unittest.TestCase):
        raise Exception("failed to improve reward")

    def testTrainCartpole(self):
-        register_env(
-            "test", lambda _: SimpleServing(gym.make("CartPole-v0")))
+        register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
        pg = PGAgent(env="test", config={"num_workers": 0})
        for i in range(100):
            result = pg.train()
@@ -176,8 +174,8 @@ class TestServingEnv(unittest.TestCase):
        raise Exception("failed to improve reward")

    def testTrainCartpoleMulti(self):
-        register_env(
-            "test2", lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
+        register_env("test2",
+                     lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
        pg = PGAgent(env="test2", config={"num_workers": 0})
        for i in range(100):
            result = pg.train()
@@ -14,27 +14,29 @@ from ray.tune.registry import register_env

 ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
    "simple_tuple": Tuple([
-        Box(0.0, 1.0, (5,), dtype=np.float32),
-        Box(0.0, 1.0, (5,), dtype=np.float32)]),
+        Box(0.0, 1.0, (5, ), dtype=np.float32),
+        Box(0.0, 1.0, (5, ), dtype=np.float32)
+    ]),
    "implicit_tuple": [
-        Box(0.0, 1.0, (5,), dtype=np.float32),
-        Box(0.0, 1.0, (5,), dtype=np.float32)],
+        Box(0.0, 1.0, (5, ), dtype=np.float32),
+        Box(0.0, 1.0, (5, ), dtype=np.float32)
+    ],
 }

 OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
    "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
    "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
-    "atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
+    "atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
    "simple_tuple": Tuple([
-        Box(0.0, 1.0, (5,), dtype=np.float32),
-        Box(0.0, 1.0, (5,), dtype=np.float32)]),
-    "mixed_tuple": Tuple([
-        Discrete(10),
-        Box(0.0, 1.0, (5,), dtype=np.float32)]),
+        Box(0.0, 1.0, (5, ), dtype=np.float32),
+        Box(0.0, 1.0, (5, ), dtype=np.float32)
+    ]),
+    "mixed_tuple": Tuple(
+        [Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]),
 }


@@ -90,30 +92,33 @@ class ModelSupportedSpaces(unittest.TestCase):
        stats = {}
        check_support("DDPG", {"timesteps_per_iteration": 1}, stats)
        check_support("DQN", {"timesteps_per_iteration": 1}, stats)
+        check_support("A3C", {
+            "num_workers": 1,
+            "optimizer": {
+                "grads_per_step": 1
+            }
+        }, stats)
        check_support(
-            "A3C", {"num_workers": 1, "optimizer": {"grads_per_step": 1}},
-            stats)
+            "PPO", {
+                "num_workers": 1,
+                "num_sgd_iter": 1,
+                "timesteps_per_batch": 1,
+                "sgd_batchsize": 1
+            }, stats)
        check_support(
-            "PPO",
-            {"num_workers": 1, "num_sgd_iter": 1, "timesteps_per_batch": 1,
-             "sgd_batchsize": 1},
-            stats)
-        check_support(
-            "ES",
-            {"num_workers": 1, "noise_size": 10000000,
-             "episodes_per_batch": 1, "timesteps_per_batch": 1},
-            stats)
-        check_support(
-            "PG",
-            {"num_workers": 1, "optimizer": {}},
-            stats)
+            "ES", {
+                "num_workers": 1,
+                "noise_size": 10000000,
+                "episodes_per_batch": 1,
+                "timesteps_per_batch": 1
+            }, stats)
+        check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
        num_unexpected_errors = 0
        for (alg, a_name, o_name), stat in sorted(stats.items()):
            if stat not in ["ok", "unsupported"]:
                num_unexpected_errors += 1
-            print(
-                alg, "action_space", a_name, "obs_space", o_name,
-                "result", stat)
+            print(alg, "action_space", a_name, "obs_space", o_name, "result",
+                  stat)
        self.assertEqual(num_unexpected_errors, 0)


@@ -123,7 +128,7 @@ if __name__ == "__main__":
            "discrete": Discrete(5),
        }
        OBSERVATION_SPACES_TO_TEST = {
-            "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+            "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
            "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
        }
    unittest.main(verbosity=2)
@@ -11,7 +11,6 @@ import ray
 from ray.tune.config_parser import make_parser, resources_to_json
 from ray.tune.tune import _make_scheduler, run_experiments

-
 EXAMPLE_USAGE = """
 Training example via RLlib CLI:
    rllib train --run DQN --env CartPole-v0
@@ -35,29 +34,41 @@ def create_parser(parser_creator=None):

    # See also the base parser definition in ray/tune/config_parser.py
    parser.add_argument(
-        "--redis-address", default=None, type=str,
+        "--redis-address",
+        default=None,
+        type=str,
        help="The Redis address of the cluster.")
    parser.add_argument(
-        "--ray-num-cpus", default=None, type=int,
+        "--ray-num-cpus",
+        default=None,
+        type=int,
        help="--num-cpus to pass to Ray."
-             " This only has an affect in local mode.")
+        " This only has an affect in local mode.")
    parser.add_argument(
-        "--ray-num-gpus", default=None, type=int,
+        "--ray-num-gpus",
+        default=None,
+        type=int,
        help="--num-gpus to pass to Ray."
-             " This only has an affect in local mode.")
+        " This only has an affect in local mode.")
    parser.add_argument(
-        "--experiment-name", default="default", type=str,
+        "--experiment-name",
+        default="default",
+        type=str,
        help="Name of the subdirectory under `local_dir` to put results in.")
    parser.add_argument(
        "--env", default=None, type=str, help="The gym environment to use.")
    parser.add_argument(
-        "--queue-trials", action='store_true',
+        "--queue-trials",
+        action='store_true',
        help=(
            "Whether to queue trials when the cluster does not currently have "
            "enough resources to launch one. This should be set to True when "
            "running on an autoscaling cluster to enable automatic scale-up."))
    parser.add_argument(
-        "-f", "--config-file", default=None, type=str,
+        "-f",
+        "--config-file",
+        default=None,
+        type=str,
        help="If specified, use config options from this file. Note that this "
        "overrides any trial-specific options set via flags above.")
    return parser
@@ -93,9 +104,11 @@ def run(args, parser):

    ray.init(
        redis_address=args.redis_address,
-        num_cpus=args.ray_num_cpus, num_gpus=args.ray_num_gpus)
+        num_cpus=args.ray_num_cpus,
+        num_gpus=args.ray_num_gpus)
    run_experiments(
-        experiments, scheduler=_make_scheduler(args),
+        experiments,
+        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials)


@@ -6,10 +6,8 @@ import re
 import os
 import os.path as osp

-
 CONFIG_DIR = osp.join(osp.dirname(osp.abspath(__file__)), "regression_tests")

-
 TEMPLATE = """
 class Test{name}(Regression):
    _file = "{filename}"
@@ -15,7 +15,6 @@ import yaml
 import ray
 from ray import tune

-
 CONFIG_DIR = os.path.dirname(os.path.abspath(__file__))


@@ -8,7 +8,6 @@ import yaml
 import ray
 from ray.tune import run_experiments

-
 if __name__ == '__main__':
    experiments = {}

@@ -29,5 +28,4 @@ if __name__ == '__main__':
            num_failures += 1

    if num_failures:
-        raise Exception(
-            "{} trials did not converge".format(num_failures))
+        raise Exception("{} trials did not converge".format(num_failures))
@@ -11,10 +11,9 @@ try:
    import lz4.frame
    LZ4_ENABLED = True
 except ImportError:
-    print(
-        "WARNING: lz4 not available, disabling sample compression. "
-        "This will significantly impact RLlib performance. "
-        "To install lz4, run `pip install lz4`.")
+    print("WARNING: lz4 not available, disabling sample compression. "
+          "This will significantly impact RLlib performance. "
+          "To install lz4, run `pip install lz4`.")
    LZ4_ENABLED = False


@@ -59,7 +59,6 @@ class NoFilter(Filter):

 # http://www.johndcook.com/blog/standard_deviation/
 class RunningStat(object):
-
    def __init__(self, shape=None):
        self._n = 0
        self._M = np.zeros(shape)
@@ -227,8 +226,8 @@ class MeanStdFilter(Filter):

    def __repr__(self):
        return 'MeanStdFilter({}, {}, {}, {}, {}, {})'.format(
-            self.shape, self.demean, self.destd,
-            self.clip, self.rs, self.buffer)
+            self.shape, self.demean, self.destd, self.clip, self.rs,
+            self.buffer)


 class ConcurrentMeanStdFilter(MeanStdFilter):
@@ -242,6 +241,7 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
            def wrapper(*args, **kwargs):
                with self._lock:
                    return func(*args, **kwargs)
+
            return wrapper

        self.__getattribute__ = lock_wrap(self.__getattribute__)
@@ -260,8 +260,8 @@ class ConcurrentMeanStdFilter(MeanStdFilter):

    def __repr__(self):
        return 'ConcurrentMeanStdFilter({}, {}, {}, {}, {}, {})'.format(
-            self.shape, self.demean, self.destd,
-            self.clip, self.rs, self.buffer)
+            self.shape, self.demean, self.destd, self.clip, self.rs,
+            self.buffer)


 def get_filter(filter_config, shape):
@@ -273,5 +273,4 @@ def get_filter(filter_config, shape):
    elif filter_config == "NoFilter":
        return NoFilter()
    else:
-        raise Exception("Unknown observation_filter: " +
-                        str(filter_config))
+        raise Exception("Unknown observation_filter: " + str(filter_config))
@@ -75,14 +75,14 @@ def _make_handler(serving_env):
                response["action"] = serving_env.get_action(
                    args["episode_id"], args["observation"])
            elif command == PolicyClient.LOG_ACTION:
-                serving_env.log_action(
-                    args["episode_id"], args["observation"], args["action"])
+                serving_env.log_action(args["episode_id"], args["observation"],
+                                       args["action"])
            elif command == PolicyClient.LOG_RETURNS:
-                serving_env.log_returns(
-                    args["episode_id"], args["reward"], args["info"])
+                serving_env.log_returns(args["episode_id"], args["reward"],
+                                        args["info"])
            elif command == PolicyClient.END_EPISODE:
-                serving_env.end_episode(
-                    args["episode_id"], args["observation"])
+                serving_env.end_episode(args["episode_id"],
+                                        args["observation"])
            else:
                raise Exception("Unknown command: {}".format(command))
            return response
@@ -7,6 +7,7 @@ class Reshaper(object):
    This class keeps track of where in the flattened observation space
    we should be slicing and what the new shapes should be
    """
+
    def __init__(self, env_space):
        self.shapes = []
        self.slice_positions = []
@@ -24,8 +25,8 @@ class Reshaper(object):
                if len(self.slice_positions) == 0:
                    self.slice_positions.append(np.product(arr_shape))
                else:
-                    self.slice_positions.append(np.product(arr_shape) +
-                                                self.slice_positions[-1])
+                    self.slice_positions.append(
+                        np.product(arr_shape) + self.slice_positions[-1])
        else:
            self.shapes.append(np.asarray(env_space.shape))
            self.slice_positions.append(np.product(env_space.shape))
@@ -38,11 +39,11 @@ class Reshaper(object):
    def split_tensor(self, tensor, axis=-1):
        # FIXME (ev) This won't work for mixed action distributions like
        # one agent Gaussian one agent discrete
-        slice_rescale = int(tensor.shape.as_list()[axis] /
-                            int(np.sum(self.get_slice_lengths())))
-        return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
-                        axis=axis)
+        slice_rescale = int(tensor.shape.as_list()[axis] / int(
+            np.sum(self.get_slice_lengths())))
+        return tf.split(
+            tensor, slice_rescale * self.get_slice_lengths(), axis=axis)

    def split_number(self, number):
        slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
-        return slice_rescale*self.get_slice_lengths()
+        return slice_rescale * self.get_slice_lengths()
@@ -39,10 +39,10 @@ def linear_interpolation(l, r, alpha):


 class PiecewiseSchedule(object):
-    def __init__(
-            self, endpoints, interpolation=linear_interpolation,
-            outside_value=None):
-
+    def __init__(self,
+                 endpoints,
+                 interpolation=linear_interpolation,
+                 outside_value=None):
        """Piecewise schedule.

        endpoints: [(int, int)]
@@ -64,18 +64,19 @@ def run_timeline(sess, ops, debug_name, feed_dict={}, timeline_dir=None):
        run_metadata = tf.RunMetadata()
        start = time.time()
        fetches = sess.run(
-            ops, options=run_options, run_metadata=run_metadata,
+            ops,
+            options=run_options,
+            run_metadata=run_metadata,
            feed_dict=feed_dict)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        global _count
        outf = os.path.join(
-            timeline_dir,
-            "timeline-{}-{}-{}.json".format(debug_name, os.getpid(), _count))
+            timeline_dir, "timeline-{}-{}-{}.json".format(
+                debug_name, os.getpid(), _count))
        _count += 1
        trace_file = open(outf, "w")
-        print(
-            "Wrote tf timeline ({} s) to {}".format(
-                time.time() - start, os.path.abspath(outf)))
+        print("Wrote tf timeline ({} s) to {}".format(time.time() - start,
+                                                      os.path.abspath(outf)))
        trace_file.write(trace.generate_chrome_trace_format())
    else:
        fetches = sess.run(ops, feed_dict=feed_dict)
@@ -22,8 +22,8 @@ class WindowStat(object):
        if not self.count:
            quantiles = []
        else:
-            quantiles = np.percentile(
-                self.items[:self.count], [0, 10, 50, 90, 100]).tolist()
+            quantiles = np.percentile(self.items[:self.count],
+                                      [0, 10, 50, 90, 100]).tolist()
        return {
            self.name + "_count": int(self.count),
            self.name + "_mean": float(np.mean(self.items[:self.count])),