From d01dc9e22d5e8625ae6ac49e2e689eebf472b5f8 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 19 Jul 2018 15:30:36 -0700
Subject: [PATCH] [rllib] format with yapf (#2427)

* initial yapf

* manual fix yapf bugs
---
 .travis/yapf.sh                               |   1 -
 python/ray/rllib/__init__.py                  |  17 ++-
 python/ray/rllib/agents/a3c/a3c.py            |  10 +-
 .../rllib/agents/a3c/a3c_tf_policy_graph.py   |  44 +++---
 .../agents/a3c/a3c_torch_policy_graph.py      |  21 +--
 python/ray/rllib/agents/agent.py              |  61 +++++----
 python/ray/rllib/agents/bc/bc.py              |  21 +--
 python/ray/rllib/agents/bc/bc_evaluator.py    |  13 +-
 .../ray/rllib/agents/bc/experience_dataset.py |   5 +-
 python/ray/rllib/agents/bc/policy.py          |  37 ++---
 python/ray/rllib/agents/ddpg/apex.py          |  13 +-
 python/ray/rllib/agents/ddpg/ddpg.py          |   6 +-
 .../rllib/agents/ddpg/ddpg_policy_graph.py    |  79 ++++++-----
 python/ray/rllib/agents/dqn/apex.py           |  13 +-
 python/ray/rllib/agents/dqn/dqn.py            |  48 +++----
 .../ray/rllib/agents/dqn/dqn_policy_graph.py  | 112 ++++++++-------
 python/ray/rllib/agents/es/es.py              |  68 +++++----
 python/ray/rllib/agents/es/optimizers.py      |   4 +-
 python/ray/rllib/agents/es/policies.py        |  26 ++--
 python/ray/rllib/agents/es/tabular_logger.py  |  16 ++-
 python/ray/rllib/agents/es/utils.py           |  14 +-
 python/ray/rllib/agents/pg/pg.py              |   7 +-
 python/ray/rllib/agents/pg/pg_policy_graph.py |  12 +-
 python/ray/rllib/agents/ppo/ppo.py            |  31 +++--
 .../ray/rllib/agents/ppo/ppo_policy_graph.py  | 105 +++++++++-----
 python/ray/rllib/agents/ppo/test/test.py      |   8 +-
 python/ray/rllib/agents/ppo/utils.py          |   2 +-
 python/ray/rllib/env/async_vector_env.py      |  15 +-
 python/ray/rllib/env/atari_wrappers.py        |   5 +-
 python/ray/rllib/env/vector_env.py            |   3 +-
 python/ray/rllib/evaluation/metrics.py        |   7 +-
 .../ray/rllib/evaluation/policy_evaluator.py  |  98 +++++++------
 python/ray/rllib/evaluation/sample_batch.py   |   7 +-
 python/ray/rllib/evaluation/sampler.py        |  80 ++++++-----
 .../ray/rllib/evaluation/tf_policy_graph.py   |  59 ++++----
 .../rllib/evaluation/torch_policy_graph.py    |   8 +-
 .../multiagent_mountaincar.py                 |  23 ++--
 .../multiagent_mountaincar_env.py             |   5 +-
 .../legacy_multiagent/multiagent_pendulum.py  |  23 ++--
 .../multiagent_pendulum_env.py                |  30 ++--
 .../ray/rllib/examples/multiagent_cartpole.py |   6 +-
 .../rllib/examples/serving/cartpole_client.py |  10 +-
 .../rllib/examples/serving/cartpole_server.py |  27 ++--
 python/ray/rllib/models/__init__.py           |   8 +-
 python/ray/rllib/models/action_dist.py        |  56 ++++----
 python/ray/rllib/models/catalog.py            |  71 +++++-----
 python/ray/rllib/models/fcnet.py              |   9 +-
 python/ray/rllib/models/lstm.py               |  27 ++--
 python/ray/rllib/models/misc.py               |  41 ++++--
 python/ray/rllib/models/model.py              |  14 +-
 python/ray/rllib/models/multiagentfcnet.py    |   6 +-
 python/ray/rllib/models/preprocessors.py      |  11 +-
 python/ray/rllib/models/visionnet.py          |  25 +++-
 python/ray/rllib/optimizers/__init__.py       |   1 -
 .../optimizers/async_gradients_optimizer.py   |  12 +-
 .../optimizers/async_samples_optimizer.py     |  94 +++++++------
 python/ray/rllib/optimizers/multi_gpu_impl.py |  41 +++---
 .../rllib/optimizers/multi_gpu_optimizer.py   |  42 +++---
 .../ray/rllib/optimizers/policy_optimizer.py  |   7 +-
 python/ray/rllib/optimizers/replay_buffer.py  |  24 ++--
 python/ray/rllib/optimizers/segment_tree.py   |  16 +--
 .../rllib/optimizers/sync_replay_optimizer.py |  74 ++++++----
 .../optimizers/sync_samples_optimizer.py      |  17 ++-
 python/ray/rllib/rollout.py                   |  28 ++--
 python/ray/rllib/scripts.py                   |   1 -
 python/ray/rllib/test/mock_evaluator.py       |   9 +-
 python/ray/rllib/test/test_catalog.py         |  10 +-
 .../ray/rllib/test/test_checkpoint_restore.py |  22 ++-
 python/ray/rllib/test/test_filters.py         |  16 ++-
 python/ray/rllib/test/test_lstm.py            |  21 +--
 python/ray/rllib/test/test_multi_agent_env.py | 129 ++++++++++++------
 python/ray/rllib/test/test_optimizers.py      |   4 +-
 .../ray/rllib/test/test_policy_evaluator.py   |  23 ++--
 python/ray/rllib/test/test_serving_env.py     |  14 +-
 .../ray/rllib/test/test_supported_spaces.py   |  67 ++++-----
 python/ray/rllib/train.py                     |  35 +++--
 .../generate_regression_tests.py              |   2 -
 .../regression_tests/regression_test.py       |   1 -
 .../tuned_examples/run_regression_tests.py    |   4 +-
 python/ray/rllib/utils/compression.py         |   7 +-
 python/ray/rllib/utils/filter.py              |  13 +-
 python/ray/rllib/utils/policy_server.py       |  12 +-
 python/ray/rllib/utils/reshaper.py            |  15 +-
 python/ray/rllib/utils/schedules.py           |   8 +-
 python/ray/rllib/utils/tf_run_builder.py      |  13 +-
 python/ray/rllib/utils/window_stat.py         |   4 +-
 86 files changed, 1276 insertions(+), 978 deletions(-)

diff --git a/.travis/yapf.sh b/.travis/yapf.sh
index 7c12ce4b4..75fed3efe 100755
--- a/.travis/yapf.sh
+++ b/.travis/yapf.sh
@@ -24,7 +24,6 @@ YAPF_FLAGS=(
 )
 
 YAPF_EXCLUDES=(
-    '--exclude' 'python/ray/rllib/*'
     '--exclude' 'python/ray/cloudpickle/*'
     '--exclude' 'python/build/*'
     '--exclude' 'python/ray/pyarrow_files/*'
diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py
index 609acdd0f..cf0f10580 100644
--- a/python/ray/rllib/__init__.py
+++ b/python/ray/rllib/__init__.py
@@ -17,9 +17,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
 
 
 def _register_all():
-    for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
-                "APEX_DDPG", "__fake", "__sigmoid_fake_data",
-                "__parameter_tuning"]:
+    for key in [
+            "PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", "APEX_DDPG",
+            "__fake", "__sigmoid_fake_data", "__parameter_tuning"
+    ]:
         from ray.rllib.agents.agent import get_agent_class
         register_trainable(key, get_agent_class(key))
 
@@ -27,6 +28,12 @@ def _register_all():
 _register_all()
 
 __all__ = [
-    "PolicyGraph", "TFPolicyGraph", "PolicyEvaluator", "SampleBatch",
-    "AsyncVectorEnv", "MultiAgentEnv", "VectorEnv", "ServingEnv",
+    "PolicyGraph",
+    "TFPolicyGraph",
+    "PolicyEvaluator",
+    "SampleBatch",
+    "AsyncVectorEnv",
+    "MultiAgentEnv",
+    "VectorEnv",
+    "ServingEnv",
 ]
diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py
index 7326685aa..0a739474f 100644
--- a/python/ray/rllib/agents/a3c/a3c.py
+++ b/python/ray/rllib/agents/a3c/a3c.py
@@ -92,15 +92,15 @@ class A3CAgent(Agent):
         self.remote_evaluators = self.make_remote_evaluators(
             self.env_creator, policy_cls, self.config["num_workers"],
             {"num_gpus": 1 if self.config["use_gpu_for_workers"] else 0})
-        self.optimizer = AsyncGradientsOptimizer(
-            self.local_evaluator, self.remote_evaluators,
-            self.config["optimizer"])
+        self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
+                                                 self.remote_evaluators,
+                                                 self.config["optimizer"])
 
     def _train(self):
         prev_steps = self.optimizer.num_steps_sampled
         self.optimizer.step()
-        FilterManager.synchronize(
-            self.local_evaluator.filters, self.remote_evaluators)
+        FilterManager.synchronize(self.local_evaluator.filters,
+                                  self.remote_evaluators)
         result = self.optimizer.collect_metrics()
         result = result._replace(
             timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
index faf22f602..00f630d3b 100644
--- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
+++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
@@ -14,19 +14,23 @@ from ray.rllib.models.catalog import ModelCatalog
 
 
 class A3CLoss(object):
-    def __init__(
-            self, action_dist, actions, advantages, v_target, vf,
-            vf_loss_coeff=0.5, entropy_coeff=-0.01):
+    def __init__(self,
+                 action_dist,
+                 actions,
+                 advantages,
+                 v_target,
+                 vf,
+                 vf_loss_coeff=0.5,
+                 entropy_coeff=-0.01):
         log_prob = action_dist.logp(actions)
 
         # The "policy gradients" loss
-        self.pi_loss = - tf.reduce_sum(log_prob * advantages)
+        self.pi_loss = -tf.reduce_sum(log_prob * advantages)
 
         delta = vf - v_target
         self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
         self.entropy = tf.reduce_sum(action_dist.entropy())
-        self.total_loss = (self.pi_loss +
-                           self.vf_loss * vf_loss_coeff +
+        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                            self.entropy * entropy_coeff)
 
 
@@ -41,8 +45,8 @@ class A3CPolicyGraph(TFPolicyGraph):
             tf.float32, [None] + list(observation_space.shape))
         dist_class, logit_dim = ModelCatalog.get_action_dist(
             action_space, self.config["model"])
-        self.model = ModelCatalog.get_model(
-            self.observations, logit_dim, self.config["model"])
+        self.model = ModelCatalog.get_model(self.observations, logit_dim,
+                                            self.config["model"])
         action_dist = dist_class(self.model.outputs)
         self.vf = tf.reshape(
             linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
@@ -62,9 +66,9 @@ class A3CPolicyGraph(TFPolicyGraph):
                     action_space))
         advantages = tf.placeholder(tf.float32, [None], name="advantages")
         v_target = tf.placeholder(tf.float32, [None], name="v_target")
-        self.loss = A3CLoss(
-            action_dist, actions, advantages, v_target, self.vf,
-            self.config["vf_loss_coeff"], self.config["entropy_coeff"])
+        self.loss = A3CLoss(action_dist, actions, advantages, v_target,
+                            self.vf, self.config["vf_loss_coeff"],
+                            self.config["entropy_coeff"])
 
         # Initialize TFPolicyGraph
         loss_in = [
@@ -76,10 +80,16 @@ class A3CPolicyGraph(TFPolicyGraph):
         self.state_in = self.model.state_in
         self.state_out = self.model.state_out
         TFPolicyGraph.__init__(
-            self, observation_space, action_space, self.sess,
-            obs_input=self.observations, action_sampler=action_dist.sample(),
-            loss=self.loss.total_loss, loss_inputs=loss_in,
-            state_inputs=self.state_in, state_outputs=self.state_out,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
+            obs_input=self.observations,
+            action_sampler=action_dist.sample(),
+            loss=self.loss.total_loss,
+            loss_inputs=loss_in,
+            state_inputs=self.state_in,
+            state_outputs=self.state_out,
             seq_lens=self.model.seq_lens,
             max_seq_len=self.config["model"]["max_seq_len"])
 
@@ -132,5 +142,5 @@ class A3CPolicyGraph(TFPolicyGraph):
             for i in range(len(self.state_in)):
                 next_state.append([sample_batch["state_out_{}".format(i)][-1]])
             last_r = self.value(sample_batch["new_obs"][-1], *next_state)
-        return compute_advantages(
-            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
+        return compute_advantages(sample_batch, last_r, self.config["gamma"],
+                                  self.config["lambda"])
diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
index a277de945..dcdada591 100644
--- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
+++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
@@ -46,20 +46,21 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
             action_space, self.config["model"])
         self.model = ModelCatalog.get_torch_model(
             obs_space.shape, self.logit_dim, self.config["model"])
-        loss = A3CLoss(
-            self.model, self.config["vf_loss_coeff"],
-            self.config["entropy_coeff"])
+        loss = A3CLoss(self.model, self.config["vf_loss_coeff"],
+                       self.config["entropy_coeff"])
         TorchPolicyGraph.__init__(
-            self, obs_space, action_space, self.model, loss,
-            loss_inputs=[
-                "obs", "actions", "advantages", "value_targets"])
+            self,
+            obs_space,
+            action_space,
+            self.model,
+            loss,
+            loss_inputs=["obs", "actions", "advantages", "value_targets"])
 
     def extra_action_out(self, model_out):
         return {"vf_preds": var_to_np(model_out[1])}
 
     def optimizer(self):
-        return torch.optim.Adam(
-            self.model.parameters(), lr=self.config["lr"])
+        return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
 
     def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
         completed = sample_batch["dones"][-1]
@@ -67,8 +68,8 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
             last_r = 0.0
         else:
             last_r = self._value(sample_batch["new_obs"][-1])
-        return compute_advantages(
-            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
+        return compute_advantages(sample_batch, last_r, self.config["gamma"],
+                                  self.config["lambda"])
 
     def _value(self, obs):
         with self.lock:
diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py
index f53923030..9cd661f3a 100644
--- a/python/ray/rllib/agents/agent.py
+++ b/python/ray/rllib/agents/agent.py
@@ -47,7 +47,9 @@ COMMON_CONFIG = {
             "allow_growth": True,
         },
         "log_device_placement": False,
-        "device_count": {"CPU": 1},
+        "device_count": {
+            "CPU": 1
+        },
         "allow_soft_placement": True,  # required by PPO multi-gpu
     },
     # Whether to LZ4 compress observations
@@ -86,8 +88,7 @@ def _deep_update(original, new_dict, new_keys_allowed, whitelist):
     for k, value in new_dict.items():
         if k not in original and k != "env":
             if not new_keys_allowed:
-                raise Exception(
-                    "Unknown config parameter `{}` ".format(k))
+                raise Exception("Unknown config parameter `{}` ".format(k))
         if type(original.get(k)) is dict:
             if k in whitelist:
                 _deep_update(original[k], value, True, [])
@@ -112,22 +113,24 @@ class Agent(Trainable):
 
     _allow_unknown_configs = False
     _allow_unknown_subkeys = [
-        "tf_session_args", "env_config", "model", "optimizer", "multiagent"]
+        "tf_session_args", "env_config", "model", "optimizer", "multiagent"
+    ]
 
     def make_local_evaluator(self, env_creator, policy_graph):
         """Convenience method to return configured local evaluator."""
 
-        return self._make_evaluator(
-            PolicyEvaluator, env_creator, policy_graph, 0)
+        return self._make_evaluator(PolicyEvaluator, env_creator, policy_graph,
+                                    0)
 
-    def make_remote_evaluators(
-            self, env_creator, policy_graph, count, remote_args):
+    def make_remote_evaluators(self, env_creator, policy_graph, count,
+                               remote_args):
         """Convenience method to return a number of remote evaluators."""
 
         cls = PolicyEvaluator.as_remote(**remote_args).remote
         return [
-            self._make_evaluator(cls, env_creator, policy_graph, i+1)
-            for i in range(count)]
+            self._make_evaluator(cls, env_creator, policy_graph, i + 1)
+            for i in range(count)
+        ]
 
     def _make_evaluator(self, cls, env_creator, policy_graph, worker_index):
         config = self.config
@@ -140,8 +143,8 @@ class Agent(Trainable):
             env_creator,
             self.config["multiagent"]["policy_graphs"] or policy_graph,
             policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
-            tf_session_creator=(
-                session_creator if config["tf_session_args"] else None),
+            tf_session_creator=(session_creator
+                                if config["tf_session_args"] else None),
             batch_steps=config["sample_batch_size"],
             batch_mode=config["batch_mode"],
             episode_horizon=config["horizon"],
@@ -157,14 +160,12 @@ class Agent(Trainable):
 
     @classmethod
     def resource_help(cls, config):
-        return (
-            "\n\nYou can adjust the resource requests of RLlib agents by "
-            "setting `num_workers` and other configs. See the "
-            "DEFAULT_CONFIG defined by each agent for more info.\n\n"
-            "The config of this agent is: " + json.dumps(config))
+        return ("\n\nYou can adjust the resource requests of RLlib agents by "
+                "setting `num_workers` and other configs. See the "
+                "DEFAULT_CONFIG defined by each agent for more info.\n\n"
+                "The config of this agent is: " + json.dumps(config))
 
-    def __init__(
-            self, config=None, env=None, logger_creator=None):
+    def __init__(self, config=None, env=None, logger_creator=None):
         """Initialize an RLLib agent.
 
         Args:
@@ -235,8 +236,8 @@ class Agent(Trainable):
         obs = self.local_evaluator.filters["default"](
             observation, update=False)
         return self.local_evaluator.for_policy(
-            lambda p: p.compute_single_action(
-                obs, state, is_training=False)[0])
+            lambda p: p.compute_single_action(obs, state, is_training=False)[0]
+        )
 
 
 class _MockAgent(Agent):
@@ -257,8 +258,10 @@ class _MockAgent(Agent):
                 and (self.config["persistent_error"] or not self.restored):
             raise Exception("mock error")
         return TrainingResult(
-            episode_reward_mean=10, episode_len_mean=10,
-            timesteps_this_iter=10, info={})
+            episode_reward_mean=10,
+            episode_len_mean=10,
+            timesteps_this_iter=10,
+            info={})
 
     def _save(self, checkpoint_dir):
         path = os.path.join(checkpoint_dir, "mock_agent.pkl")
@@ -299,9 +302,11 @@ class _SigmoidFakeData(_MockAgent):
         v = np.tanh(float(i) / self.config["width"])
         v *= self.config["height"]
         return TrainingResult(
-            episode_reward_mean=v, episode_len_mean=v,
+            episode_reward_mean=v,
+            episode_len_mean=v,
             timesteps_this_iter=self.config["iter_timesteps"],
-            time_this_iter_s=self.config["iter_time"], info={})
+            time_this_iter_s=self.config["iter_time"],
+            info={})
 
 
 class _ParameterTuningAgent(_MockAgent):
@@ -320,7 +325,8 @@ class _ParameterTuningAgent(_MockAgent):
             episode_reward_mean=self.config["reward_amt"] * self.iteration,
             episode_len_mean=self.config["reward_amt"],
             timesteps_this_iter=self.config["iter_timesteps"],
-            time_this_iter_s=self.config["iter_time"], info={})
+            time_this_iter_s=self.config["iter_time"],
+            info={})
 
 
 def get_agent_class(alg):
@@ -363,5 +369,4 @@ def get_agent_class(alg):
     elif alg == "__parameter_tuning":
         return _ParameterTuningAgent
     else:
-        raise Exception(
-            ("Unknown algorithm {}.").format(alg))
+        raise Exception(("Unknown algorithm {}.").format(alg))
diff --git a/python/ray/rllib/agents/bc/bc.py b/python/ray/rllib/agents/bc/bc.py
index 1484a5dbe..2bb3792b8 100644
--- a/python/ray/rllib/agents/bc/bc.py
+++ b/python/ray/rllib/agents/bc/bc.py
@@ -57,28 +57,31 @@ class BCAgent(Agent):
         else:
             num_gpus_per_worker = 0
         return Resources(
-            cpu=1, gpu=cf["gpu"] and 1 or 0,
+            cpu=1,
+            gpu=cf["gpu"] and 1 or 0,
             extra_cpu=cf["num_workers"],
             extra_gpu=num_gpus_per_worker * cf["num_workers"])
 
     def _init(self):
-        self.local_evaluator = BCEvaluator(
-            self.env_creator, self.config, self.logdir)
+        self.local_evaluator = BCEvaluator(self.env_creator, self.config,
+                                           self.logdir)
         if self.config["use_gpu_for_workers"]:
             remote_cls = GPURemoteBCEvaluator
         else:
             remote_cls = RemoteBCEvaluator
         self.remote_evaluators = [
             remote_cls.remote(self.env_creator, self.config, self.logdir)
-            for _ in range(self.config["num_workers"])]
-        self.optimizer = AsyncGradientsOptimizer(
-            self.local_evaluator, self.remote_evaluators,
-            self.config["optimizer"])
+            for _ in range(self.config["num_workers"])
+        ]
+        self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
+                                                 self.remote_evaluators,
+                                                 self.config["optimizer"])
 
     def _train(self):
         self.optimizer.step()
-        metric_lists = [re.get_metrics.remote() for re in
-                        self.remote_evaluators]
+        metric_lists = [
+            re.get_metrics.remote() for re in self.remote_evaluators
+        ]
         total_samples = 0
         total_loss = 0
         for metrics in metric_lists:
diff --git a/python/ray/rllib/agents/bc/bc_evaluator.py b/python/ray/rllib/agents/bc/bc_evaluator.py
index e896b1f88..4726b4a3c 100644
--- a/python/ray/rllib/agents/bc/bc_evaluator.py
+++ b/python/ray/rllib/agents/bc/bc_evaluator.py
@@ -14,8 +14,8 @@ from ray.rllib.models import ModelCatalog
 
 class BCEvaluator(EvaluatorInterface):
     def __init__(self, env_creator, config, logdir):
-        env = ModelCatalog.get_preprocessor_as_wrapper(env_creator(
-            config["env_config"]), config["model"])
+        env = ModelCatalog.get_preprocessor_as_wrapper(
+            env_creator(config["env_config"]), config["model"])
         self.dataset = ExperienceDataset(config["dataset_path"])
         self.policy = BCPolicy(env.observation_space, env.action_space, config)
         self.config = config
@@ -27,8 +27,10 @@ class BCEvaluator(EvaluatorInterface):
 
     def compute_gradients(self, samples):
         gradient, info = self.policy.compute_gradients(samples)
-        self.metrics_queue.put(
-            {"num_samples": info["num_samples"], "loss": info["loss"]})
+        self.metrics_queue.put({
+            "num_samples": info["num_samples"],
+            "loss": info["loss"]
+        })
         return gradient, {}
 
     def apply_gradients(self, grads):
@@ -42,8 +44,7 @@ class BCEvaluator(EvaluatorInterface):
 
     def save(self):
         weights = self.get_weights()
-        return pickle.dumps({
-            "weights": weights})
+        return pickle.dumps({"weights": weights})
 
     def restore(self, objs):
         objs = pickle.loads(objs)
diff --git a/python/ray/rllib/agents/bc/experience_dataset.py b/python/ray/rllib/agents/bc/experience_dataset.py
index ccf47bc31..d08284184 100644
--- a/python/ray/rllib/agents/bc/experience_dataset.py
+++ b/python/ray/rllib/agents/bc/experience_dataset.py
@@ -21,8 +21,9 @@ class ExperienceDataset(object):
             elements.
           The file must be available on each machine used by a BCEvaluator.
         """
-        self._dataset = list(itertools.chain.from_iterable(
-            pickle.load(open(dataset_path, "rb"))))
+        self._dataset = list(
+            itertools.chain.from_iterable(
+                pickle.load(open(dataset_path, "rb"))))
 
     def sample(self, batch_size):
         indexes = np.random.choice(len(self._dataset), batch_size)
diff --git a/python/ray/rllib/agents/bc/policy.py b/python/ray/rllib/agents/bc/policy.py
index e3077dd3d..a504e3ec6 100644
--- a/python/ray/rllib/agents/bc/policy.py
+++ b/python/ray/rllib/agents/bc/policy.py
@@ -23,8 +23,8 @@ class BCPolicy(object):
         self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
         dist_class, self.logit_dim = ModelCatalog.get_action_dist(
             ac_space, self.config["model"])
-        self._model = ModelCatalog.get_model(
-            self.x, self.logit_dim, self.config["model"])
+        self._model = ModelCatalog.get_model(self.x, self.logit_dim,
+                                             self.config["model"])
         self.logits = self._model.outputs
         self.curr_dist = dist_class(self.logits)
         self.sample = self.curr_dist.sample()
@@ -33,17 +33,16 @@ class BCPolicy(object):
 
     def setup_loss(self, action_space):
         if isinstance(action_space, gym.spaces.Box):
-            self.ac = tf.placeholder(tf.float32,
-                                     [None] + list(action_space.shape),
-                                     name="ac")
+            self.ac = tf.placeholder(
+                tf.float32, [None] + list(action_space.shape), name="ac")
         elif isinstance(action_space, gym.spaces.Discrete):
             self.ac = tf.placeholder(tf.int64, [None], name="ac")
         else:
-            raise NotImplementedError(
-                "action space" + str(type(action_space)) +
-                "currently not supported")
+            raise NotImplementedError("action space" +
+                                      str(type(action_space)) +
+                                      "currently not supported")
         log_prob = self.curr_dist.logp(self.ac)
-        self.pi_loss = - tf.reduce_sum(log_prob)
+        self.pi_loss = -tf.reduce_sum(log_prob)
         self.loss = self.pi_loss
 
     def setup_gradients(self):
@@ -62,11 +61,14 @@ class BCPolicy(object):
             self.summary_op = tf.summary.merge_all()
 
         # TODO(rliaw): Can consider exposing these parameters
-        self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
-            intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
-            gpu_options=tf.GPUOptions(allow_growth=True)))
-        self.variables = ray.experimental.TensorFlowVariables(self.loss,
-                                                              self.sess)
+        self.sess = tf.Session(
+            graph=self.g,
+            config=tf.ConfigProto(
+                intra_op_parallelism_threads=1,
+                inter_op_parallelism_threads=2,
+                gpu_options=tf.GPUOptions(allow_growth=True)))
+        self.variables = ray.experimental.TensorFlowVariables(
+            self.loss, self.sess)
         self.sess.run(tf.global_variables_initializer())
 
     def compute_gradients(self, samples):
@@ -82,15 +84,14 @@ class BCPolicy(object):
                 [self.loss, self.grads, self.summary_op], feed_dict=feed_dict)
             info["summary"] = summ
         else:
-            loss, grad = self.sess.run([self.loss, self.grads],
-                                       feed_dict=feed_dict)
+            loss, grad = self.sess.run(
+                [self.loss, self.grads], feed_dict=feed_dict)
         info["num_samples"] = len(samples)
         info["loss"] = loss
         return grad, info
 
     def apply_gradients(self, grads):
-        feed_dict = {self.grads[i]: grads[i]
-                     for i in range(len(grads))}
+        feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))}
         self.sess.run(self._apply_gradients, feed_dict=feed_dict)
 
     def get_weights(self):
diff --git a/python/ray/rllib/agents/ddpg/apex.py b/python/ray/rllib/agents/ddpg/apex.py
index b53d4178e..b35f1ea35 100644
--- a/python/ray/rllib/agents/ddpg/apex.py
+++ b/python/ray/rllib/agents/ddpg/apex.py
@@ -9,13 +9,12 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
     DDPG_CONFIG,
     {
         "optimizer_class": "AsyncSamplesOptimizer",
-        "optimizer":
-            merge_dicts(
-                DDPG_CONFIG["optimizer"], {
-                    "max_weight_sync_delay": 400,
-                    "num_replay_buffer_shards": 4,
-                    "debug": False
-                }),
+        "optimizer": merge_dicts(
+            DDPG_CONFIG["optimizer"], {
+                "max_weight_sync_delay": 400,
+                "num_replay_buffer_shards": 4,
+                "debug": False
+            }),
         "n_step": 3,
         "num_workers": 32,
         "buffer_size": 2000000,
diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py
index c7e45f1b3..95b6859d2 100644
--- a/python/ray/rllib/agents/ddpg/ddpg.py
+++ b/python/ray/rllib/agents/ddpg/ddpg.py
@@ -118,9 +118,9 @@ class DDPGAgent(DQNAgent):
         if self.config["per_worker_exploration"]:
             assert self.config["num_workers"] > 1, \
                 "This requires multiple workers"
-            return ConstantSchedule(
-                self.config["noise_scale"] * 0.4 **
-                (1 + worker_index / float(self.config["num_workers"] - 1) * 7))
+            exponent = (
+                1 + worker_index / float(self.config["num_workers"] - 1) * 7)
+            return ConstantSchedule(self.config["noise_scale"] * 0.4**exponent)
         else:
             return LinearSchedule(
                 schedule_timesteps=int(self.config["exploration_fraction"] *
diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
index 1dd8941b9..ceae0d0f0 100644
--- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
+++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -14,7 +14,6 @@ from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
 
-
 A_SCOPE = "a_func"
 P_SCOPE = "p_func"
 P_TARGET_SCOPE = "target_p_func"
@@ -26,8 +25,8 @@ class PNetwork(object):
     """Maps an observations (i.e., state) to an action where each entry takes
     value from (0, 1) due to the sigmoid function."""
 
-    def __init__(
-            self, model, dim_actions, hiddens=[64, 64], activation="relu"):
+    def __init__(self, model, dim_actions, hiddens=[64, 64],
+                 activation="relu"):
         action_out = model.last_layer
         activation = tf.nn.__dict__[activation]
         for hidden in hiddens:
@@ -44,9 +43,14 @@ class ActionNetwork(object):
     for training, thus ignoring the batch_size issue when constructing a
     stochastic action."""
 
-    def __init__(
-            self, p_values, low_action, high_action, stochastic, eps,
-            theta=0.15, sigma=0.2):
+    def __init__(self,
+                 p_values,
+                 low_action,
+                 high_action,
+                 stochastic,
+                 eps,
+                 theta=0.15,
+                 sigma=0.2):
 
         # shape is [None, dim_action]
         deterministic_actions = (
@@ -65,15 +69,16 @@ class ActionNetwork(object):
         stochastic_actions = deterministic_actions + eps * (
             high_action - low_action) * exploration_value
 
-        self.actions = tf.cond(
-            stochastic, lambda: stochastic_actions,
-            lambda: deterministic_actions)
+        self.actions = tf.cond(stochastic, lambda: stochastic_actions,
+                               lambda: deterministic_actions)
 
 
 class QNetwork(object):
-    def __init__(
-            self, model, action_inputs,
-            hiddens=[64, 64], activation="relu"):
+    def __init__(self,
+                 model,
+                 action_inputs,
+                 hiddens=[64, 64],
+                 activation="relu"):
         q_out = tf.concat([model.last_layer, action_inputs], axis=1)
         activation = tf.nn.__dict__[activation]
         for hidden in hiddens:
@@ -84,14 +89,21 @@ class QNetwork(object):
 
 
 class ActorCriticLoss(object):
-    def __init__(
-            self, q_t, q_tp1, q_tp0, importance_weights, rewards, done_mask,
-            gamma=0.99, n_step=1, use_huber=False, huber_threshold=1.0):
+    def __init__(self,
+                 q_t,
+                 q_tp1,
+                 q_tp0,
+                 importance_weights,
+                 rewards,
+                 done_mask,
+                 gamma=0.99,
+                 n_step=1,
+                 use_huber=False,
+                 huber_threshold=1.0):
 
         q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
 
-        q_tp1_best = tf.squeeze(
-            input=q_tp1, axis=len(q_tp1.shape) - 1)
+        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
         q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
 
         # compute RHS of bellman equation
@@ -131,27 +143,20 @@ class DDPGPolicyGraph(TFPolicyGraph):
 
         def _build_q_network(obs, actions):
             return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]),
-                actions,
+                ModelCatalog.get_model(obs, 1, config["model"]), actions,
                 config["critic_hiddens"],
                 config["critic_hidden_activation"]).value
 
         def _build_p_network(obs):
             return PNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]),
-                dim_actions,
+                ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
                 config["actor_hiddens"],
                 config["actor_hidden_activation"]).action_scores
 
         def _build_action_network(p_values, stochastic, eps):
-            return ActionNetwork(
-                p_values,
-                low_action,
-                high_action,
-                stochastic,
-                eps,
-                config["exploration_theta"],
-                config["exploration_sigma"]).actions
+            return ActionNetwork(p_values, low_action, high_action, stochastic,
+                                 eps, config["exploration_theta"],
+                                 config["exploration_sigma"]).actions
 
         # Action inputs
         self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -263,9 +268,13 @@ class DDPGPolicyGraph(TFPolicyGraph):
             ("weights", self.importance_weights),
         ]
         TFPolicyGraph.__init__(
-            self, observation_space, action_space, self.sess,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
             obs_input=self.cur_observations,
-            action_sampler=self.output_actions, loss=self.loss.total_loss,
+            action_sampler=self.output_actions,
+            loss=self.loss.total_loss,
             loss_inputs=self.loss_inputs)
         self.sess.run(tf.global_variables_initializer())
 
@@ -294,10 +303,10 @@ class DDPGPolicyGraph(TFPolicyGraph):
                 self.loss.actor_loss, var_list=self.p_func_vars)
             critic_grads_and_vars = self.critic_optimizer.compute_gradients(
                 self.loss.critic_loss, var_list=self.q_func_vars)
-        actor_grads_and_vars = [
-            (g, v) for (g, v) in actor_grads_and_vars if g is not None]
-        critic_grads_and_vars = [
-            (g, v) for (g, v) in critic_grads_and_vars if g is not None]
+        actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
+                                if g is not None]
+        critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
+                                 if g is not None]
         grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
         return grads_and_vars
 
diff --git a/python/ray/rllib/agents/dqn/apex.py b/python/ray/rllib/agents/dqn/apex.py
index 1c8b2f6b3..9321a70ff 100644
--- a/python/ray/rllib/agents/dqn/apex.py
+++ b/python/ray/rllib/agents/dqn/apex.py
@@ -10,13 +10,12 @@ APEX_DEFAULT_CONFIG = merge_dicts(
     DQN_CONFIG,
     {
         "optimizer_class": "AsyncSamplesOptimizer",
-        "optimizer":
-            merge_dicts(
-                DQN_CONFIG["optimizer"], {
-                    "max_weight_sync_delay": 400,
-                    "num_replay_buffer_shards": 4,
-                    "debug": False
-                }),
+        "optimizer": merge_dicts(
+            DQN_CONFIG["optimizer"], {
+                "max_weight_sync_delay": 400,
+                "num_replay_buffer_shards": 4,
+                "debug": False
+            }),
         "n_step": 3,
         "gpu": True,
         "num_workers": 32,
diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py
index adb4e427b..197831c1f 100644
--- a/python/ray/rllib/agents/dqn/dqn.py
+++ b/python/ray/rllib/agents/dqn/dqn.py
@@ -13,11 +13,11 @@ from ray.rllib.evaluation.metrics import collect_metrics
 from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
 from ray.tune.trial import Resources
 
-
 OPTIMIZER_SHARED_CONFIGS = [
     "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
     "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
-    "train_batch_size", "learning_starts", "clip_rewards"]
+    "train_batch_size", "learning_starts", "clip_rewards"
+]
 
 DEFAULT_CONFIG = with_common_config({
     # === Model ===
@@ -110,7 +110,8 @@ class DQNAgent(Agent):
     def default_resource_request(cls, config):
         cf = dict(cls._default_config, **config)
         return Resources(
-            cpu=1, gpu=cf["gpu"] and 1 or 0,
+            cpu=1,
+            gpu=cf["gpu"] and 1 or 0,
             extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
             extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
 
@@ -123,7 +124,8 @@ class DQNAgent(Agent):
         self.exploration0 = self._make_exploration_schedule(0)
         self.explorations = [
             self._make_exploration_schedule(i)
-            for i in range(self.config["num_workers"])]
+            for i in range(self.config["num_workers"])
+        ]
 
         for k in OPTIMIZER_SHARED_CONFIGS:
             if k not in self.config["optimizer"]:
@@ -132,9 +134,10 @@ class DQNAgent(Agent):
         self.local_evaluator = self.make_local_evaluator(
             self.env_creator, self._policy_graph)
         self.remote_evaluators = self.make_remote_evaluators(
-            self.env_creator, self._policy_graph, self.config["num_workers"],
-            {"num_cpus": self.config["num_cpus_per_worker"],
-             "num_gpus": self.config["num_gpus_per_worker"]})
+            self.env_creator, self._policy_graph, self.config["num_workers"], {
+                "num_cpus": self.config["num_cpus_per_worker"],
+                "num_gpus": self.config["num_gpus_per_worker"]
+            })
         self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
             self.local_evaluator, self.remote_evaluators,
             self.config["optimizer"])
@@ -147,14 +150,12 @@ class DQNAgent(Agent):
         if self.config["per_worker_exploration"]:
             assert self.config["num_workers"] > 1, \
                 "This requires multiple workers"
-            return ConstantSchedule(
-                0.4 ** (
-                    1 + worker_index / float(
-                        self.config["num_workers"] - 1) * 7))
+            exponent = (
+                1 + worker_index / float(self.config["num_workers"] - 1) * 7)
+            return ConstantSchedule(0.4**exponent)
         return LinearSchedule(
-            schedule_timesteps=int(
-                self.config["exploration_fraction"] *
-                self.config["schedule_max_timesteps"]),
+            schedule_timesteps=int(self.config["exploration_fraction"] *
+                                   self.config["schedule_max_timesteps"]),
             initial_p=1.0,
             final_p=self.config["exploration_final_eps"])
 
@@ -191,8 +192,8 @@ class DQNAgent(Agent):
                 self.local_evaluator,
                 self.remote_evaluators[-len(self.remote_evaluators) // 3:])
         else:
-            result = collect_metrics(
-                self.local_evaluator, self.remote_evaluators)
+            result = collect_metrics(self.local_evaluator,
+                                     self.remote_evaluators)
 
         return result._replace(
             timesteps_this_iter=self.global_timestep - start_timestep,
@@ -208,14 +209,14 @@ class DQNAgent(Agent):
             ev.__ray_terminate__.remote()
 
     def _save(self, checkpoint_dir):
-        checkpoint_path = os.path.join(
-            checkpoint_dir, "checkpoint-{}".format(self.iteration))
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "checkpoint-{}".format(self.iteration))
         extra_data = [
             self.local_evaluator.save(),
             ray.get([e.save.remote() for e in self.remote_evaluators]),
-            self.optimizer.save(),
-            self.num_target_updates,
-            self.last_target_update_ts]
+            self.optimizer.save(), self.num_target_updates,
+            self.last_target_update_ts
+        ]
         pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
         return checkpoint_path
 
@@ -223,8 +224,9 @@ class DQNAgent(Agent):
         extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
         self.local_evaluator.restore(extra_data[0])
         ray.get([
-            e.restore.remote(d) for (d, e)
-            in zip(extra_data[1], self.remote_evaluators)])
+            e.restore.remote(d)
+            for (d, e) in zip(extra_data[1], self.remote_evaluators)
+        ])
         self.optimizer.restore(extra_data[2])
         self.num_target_updates = extra_data[3]
         self.last_target_update_ts = extra_data[4]
diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
index 7905935ce..f553ad325 100644
--- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py
+++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -13,7 +13,6 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
 from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
 
-
 Q_SCOPE = "q_func"
 Q_TARGET_SCOPE = "target_q_func"
 
@@ -33,7 +32,8 @@ class QNetwork(object):
                 state_out = model.last_layer
                 for hidden in hiddens:
                     state_out = layers.fully_connected(
-                        state_out, num_outputs=hidden,
+                        state_out,
+                        num_outputs=hidden,
                         activation_fn=tf.nn.relu)
                 state_score = layers.fully_connected(
                     state_out, num_outputs=1, activation_fn=None)
@@ -50,26 +50,32 @@ class QValuePolicy(object):
         deterministic_actions = tf.argmax(q_values, axis=1)
         batch_size = tf.shape(observations)[0]
         random_actions = tf.random_uniform(
-            tf.stack([batch_size]), minval=0, maxval=num_actions,
+            tf.stack([batch_size]),
+            minval=0,
+            maxval=num_actions,
             dtype=tf.int64)
         chose_random = tf.random_uniform(
             tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
-        stochastic_actions = tf.where(
-            chose_random, random_actions, deterministic_actions)
-        self.action = tf.cond(
-            stochastic, lambda: stochastic_actions,
-            lambda: deterministic_actions)
+        stochastic_actions = tf.where(chose_random, random_actions,
+                                      deterministic_actions)
+        self.action = tf.cond(stochastic, lambda: stochastic_actions,
+                              lambda: deterministic_actions)
 
 
 class QLoss(object):
-    def __init__(
-            self, q_t_selected, q_tp1_best, importance_weights, rewards,
-            done_mask, gamma=0.99, n_step=1):
+    def __init__(self,
+                 q_t_selected,
+                 q_tp1_best,
+                 importance_weights,
+                 rewards,
+                 done_mask,
+                 gamma=0.99,
+                 n_step=1):
 
         q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
 
         # compute RHS of bellman equation
-        q_t_selected_target = rewards + gamma ** n_step * q_tp1_best_masked
+        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
 
         # compute the error (potentially clipped)
         self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
@@ -91,14 +97,14 @@ class DQNPolicyGraph(TFPolicyGraph):
 
         def _build_q_network(obs):
             return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]),
-                num_actions, config["dueling"], config["hiddens"]).value
+                ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
+                config["dueling"], config["hiddens"]).value
 
         # Action inputs
         self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
         self.eps = tf.placeholder(tf.float32, (), name="eps")
         self.cur_observations = tf.placeholder(
-            tf.float32, shape=(None,) + observation_space.shape)
+            tf.float32, shape=(None, ) + observation_space.shape)
 
         # Action Q network
         with tf.variable_scope(Q_SCOPE) as scope:
@@ -106,20 +112,17 @@ class DQNPolicyGraph(TFPolicyGraph):
             self.q_func_vars = _scope_vars(scope.name)
 
         # Action outputs
-        self.output_actions = QValuePolicy(
-            q_values,
-            self.cur_observations,
-            num_actions,
-            self.stochastic,
-            self.eps).action
+        self.output_actions = QValuePolicy(q_values, self.cur_observations,
+                                           num_actions, self.stochastic,
+                                           self.eps).action
 
         # Replay inputs
         self.obs_t = tf.placeholder(
-            tf.float32, shape=(None,) + observation_space.shape)
+            tf.float32, shape=(None, ) + observation_space.shape)
         self.act_t = tf.placeholder(tf.int32, [None], name="action")
         self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
         self.obs_tp1 = tf.placeholder(
-            tf.float32, shape=(None,) + observation_space.shape)
+            tf.float32, shape=(None, ) + observation_space.shape)
         self.done_mask = tf.placeholder(tf.float32, [None], name="done")
         self.importance_weights = tf.placeholder(
             tf.float32, [None], name="weight")
@@ -134,8 +137,8 @@ class DQNPolicyGraph(TFPolicyGraph):
             self.target_q_func_vars = _scope_vars(scope.name)
 
         # q scores for actions which we know were selected in the given state.
-        q_t_selected = tf.reduce_sum(
-            q_t * tf.one_hot(self.act_t, num_actions), 1)
+        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
+                                     1)
 
         # compute estimate of best possible value starting from state at t + 1
         if config["double_q"]:
@@ -143,20 +146,20 @@ class DQNPolicyGraph(TFPolicyGraph):
                 q_tp1_using_online_net = _build_q_network(self.obs_tp1)
             q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
             q_tp1_best = tf.reduce_sum(
-                q_tp1 * tf.one_hot(
-                    q_tp1_best_using_online_net, num_actions), 1)
+                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
+                1)
         else:
             q_tp1_best = tf.reduce_max(q_tp1, 1)
 
-        self.loss = QLoss(
-            q_t_selected, q_tp1_best, self.importance_weights,
-            self.rew_t, self.done_mask, config["gamma"], config["n_step"])
+        self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
+                          self.rew_t, self.done_mask, config["gamma"],
+                          config["n_step"])
 
         # update_target_fn will be called periodically to copy Q network to
         # target Q network
         update_target_expr = []
         for var, var_target in zip(
-            sorted(self.q_func_vars, key=lambda v: v.name),
+                sorted(self.q_func_vars, key=lambda v: v.name),
                 sorted(self.target_q_func_vars, key=lambda v: v.name)):
             update_target_expr.append(var_target.assign(var))
         self.update_target_expr = tf.group(*update_target_expr)
@@ -172,9 +175,13 @@ class DQNPolicyGraph(TFPolicyGraph):
             ("weights", self.importance_weights),
         ]
         TFPolicyGraph.__init__(
-            self, observation_space, action_space, self.sess,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
             obs_input=self.cur_observations,
-            action_sampler=self.output_actions, loss=self.loss.loss,
+            action_sampler=self.output_actions,
+            loss=self.loss.loss,
             loss_inputs=self.loss_inputs)
         self.sess.run(tf.global_variables_initializer())
 
@@ -184,13 +191,14 @@ class DQNPolicyGraph(TFPolicyGraph):
     def gradients(self, optimizer):
         if self.config["grad_norm_clipping"] is not None:
             grads_and_vars = _minimize_and_clip(
-                optimizer, self.loss.loss, var_list=self.q_func_vars,
+                optimizer,
+                self.loss.loss,
+                var_list=self.q_func_vars,
                 clip_val=self.config["grad_norm_clipping"])
         else:
             grads_and_vars = optimizer.compute_gradients(
                 self.loss.loss, var_list=self.q_func_vars)
-        grads_and_vars = [
-            (g, v) for (g, v) in grads_and_vars if g is not None]
+        grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
         return grads_and_vars
 
     def extra_compute_action_feed_dict(self):
@@ -207,8 +215,8 @@ class DQNPolicyGraph(TFPolicyGraph):
     def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
         return _postprocess_dqn(self, sample_batch)
 
-    def compute_td_error(
-            self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
+    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
+                         importance_weights):
         td_err = self.sess.run(
             self.loss.td_error,
             feed_dict={
@@ -254,7 +262,7 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
             continue  # episode end
         for j in range(1, n_step):
             new_obs[i] = new_obs[i + j]
-            rewards[i] += gamma ** j * rewards[i + j]
+            rewards[i] += gamma**j * rewards[i + j]
             if dones[i + j]:
                 break  # episode end
     # truncate ends of the trajectory
@@ -266,24 +274,29 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
 def _postprocess_dqn(policy_graph, sample_batch):
     obs, actions, rewards, new_obs, dones = [
         list(x) for x in sample_batch.columns(
-            ["obs", "actions", "rewards", "new_obs", "dones"])]
+            ["obs", "actions", "rewards", "new_obs", "dones"])
+    ]
 
     # N-step Q adjustments
     if policy_graph.config["n_step"] > 1:
-        adjust_nstep(
-            policy_graph.config["n_step"], policy_graph.config["gamma"],
-            obs, actions, rewards, new_obs, dones)
+        adjust_nstep(policy_graph.config["n_step"],
+                     policy_graph.config["gamma"], obs, actions, rewards,
+                     new_obs, dones)
 
     batch = SampleBatch({
-        "obs": obs, "actions": actions, "rewards": rewards,
-        "new_obs": new_obs, "dones": dones,
-        "weights": np.ones_like(rewards)})
+        "obs": obs,
+        "actions": actions,
+        "rewards": rewards,
+        "new_obs": new_obs,
+        "dones": dones,
+        "weights": np.ones_like(rewards)
+    })
 
     # Prioritize on the worker side
     if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
         td_errors = policy_graph.compute_td_error(
-            batch["obs"], batch["actions"], batch["rewards"],
-            batch["new_obs"], batch["dones"], batch["weights"])
+            batch["obs"], batch["actions"], batch["rewards"], batch["new_obs"],
+            batch["dones"], batch["weights"])
         new_priorities = (
             np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
         batch.data["weights"] = new_priorities
@@ -295,8 +308,7 @@ def _huber_loss(x, delta=1.0):
     """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
     return tf.where(
         tf.abs(x) < delta,
-        tf.square(x) * 0.5,
-        delta * (tf.abs(x) - 0.5 * delta))
+        tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
 
 
 def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py
index 62249e380..a2a39e612 100644
--- a/python/ray/rllib/agents/es/es.py
+++ b/python/ray/rllib/agents/es/es.py
@@ -20,13 +20,11 @@ from ray.rllib.agents.es import policies
 from ray.rllib.agents.es import tabular_logger as tlogger
 from ray.rllib.agents.es import utils
 
-
 Result = namedtuple("Result", [
     "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
     "eval_returns", "eval_lengths"
 ])
 
-
 DEFAULT_CONFIG = {
     'l2_coeff': 0.005,
     'noise_stdev': 0.02,
@@ -64,7 +62,11 @@ class SharedNoiseTable(object):
 
 @ray.remote
 class Worker(object):
-    def __init__(self, config, policy_params, env_creator, noise,
+    def __init__(self,
+                 config,
+                 policy_params,
+                 env_creator,
+                 noise,
                  min_task_runtime=0.2):
         self.min_task_runtime = min_task_runtime
         self.config = config
@@ -82,7 +84,9 @@ class Worker(object):
 
     def rollout(self, timestep_limit, add_noise=True):
         rollout_rewards, rollout_length = policies.rollout(
-            self.policy, self.env, timestep_limit=timestep_limit,
+            self.policy,
+            self.env,
+            timestep_limit=timestep_limit,
             add_noise=add_noise)
         return rollout_rewards, rollout_length
 
@@ -95,8 +99,8 @@ class Worker(object):
 
         # Perform some rollouts with noise.
         task_tstart = time.time()
-        while (len(noise_indices) == 0 or
-               time.time() - task_tstart < self.min_task_runtime):
+        while (len(noise_indices) == 0
+               or time.time() - task_tstart < self.min_task_runtime):
 
             if np.random.uniform() < self.config["eval_prob"]:
                 # Do an evaluation run with no perturbation.
@@ -122,7 +126,8 @@ class Worker(object):
                 noise_indices.append(noise_index)
                 returns.append([rewards_pos.sum(), rewards_neg.sum()])
                 sign_returns.append(
-                    [np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()])
+                    [np.sign(rewards_pos).sum(),
+                     np.sign(rewards_neg).sum()])
                 lengths.append([lengths_pos, lengths_neg])
 
         return Result(
@@ -146,9 +151,7 @@ class ESAgent(Agent):
         return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])
 
     def _init(self):
-        policy_params = {
-            "action_noise_std": 0.01
-        }
+        policy_params = {"action_noise_std": 0.01}
 
         env = self.env_creator(self.config["env_config"])
         from ray.rllib import models
@@ -168,9 +171,9 @@ class ESAgent(Agent):
         # Create the actors.
         print("Creating actors.")
         self.workers = [
-            Worker.remote(
-                self.config, policy_params, self.env_creator, noise_id)
-            for _ in range(self.config["num_workers"])]
+            Worker.remote(self.config, policy_params, self.env_creator,
+                          noise_id) for _ in range(self.config["num_workers"])
+        ]
 
         self.episodes_so_far = 0
         self.timesteps_so_far = 0
@@ -180,21 +183,20 @@ class ESAgent(Agent):
         num_episodes, num_timesteps = 0, 0
         results = []
         while num_episodes < min_episodes or num_timesteps < min_timesteps:
-            print(
-                "Collected {} episodes {} timesteps so far this iter".format(
-                    num_episodes, num_timesteps))
-            rollout_ids = [worker.do_rollouts.remote(theta_id)
-                           for worker in self.workers]
+            print("Collected {} episodes {} timesteps so far this iter".format(
+                num_episodes, num_timesteps))
+            rollout_ids = [
+                worker.do_rollouts.remote(theta_id) for worker in self.workers
+            ]
             # Get the results of the rollouts.
             for result in ray.get(rollout_ids):
                 results.append(result)
                 # Update the number of episodes and the number of timesteps
                 # keeping in mind that result.noisy_lengths is a list of lists,
                 # where the inner lists have length 2.
-                num_episodes += sum(len(pair) for pair
-                                    in result.noisy_lengths)
-                num_timesteps += sum(sum(pair) for pair
-                                     in result.noisy_lengths)
+                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
+                num_timesteps += sum(
+                    sum(pair) for pair in result.noisy_lengths)
         return results, num_episodes, num_timesteps
 
     def _train(self):
@@ -209,8 +211,7 @@ class ESAgent(Agent):
         # Use the actors to do rollouts, note that we pass in the ID of the
         # policy weights.
         results, num_episodes, num_timesteps = self._collect_results(
-            theta_id,
-            config["episodes_per_batch"],
+            theta_id, config["episodes_per_batch"],
             config["timesteps_per_batch"])
 
         all_noise_indices = []
@@ -255,13 +256,11 @@ class ESAgent(Agent):
              for index in noise_indices),
             batch_size=500)
         g /= noisy_returns.size
-        assert (
-            g.shape == (self.policy.num_params,) and
-            g.dtype == np.float32 and
-            count == len(noise_indices))
+        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
+                and count == len(noise_indices))
         # Compute the new weights theta.
-        theta, update_ratio = self.optimizer.update(
-            -g + config["l2_coeff"] * theta)
+        theta, update_ratio = self.optimizer.update(-g +
+                                                    config["l2_coeff"] * theta)
         # Set the new weights in the local copy of the policy.
         self.policy.set_weights(theta)
 
@@ -313,13 +312,10 @@ class ESAgent(Agent):
             w.__ray_terminate__.remote()
 
     def _save(self, checkpoint_dir):
-        checkpoint_path = os.path.join(
-            checkpoint_dir, "checkpoint-{}".format(self.iteration))
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "checkpoint-{}".format(self.iteration))
         weights = self.policy.get_weights()
-        objects = [
-            weights,
-            self.episodes_so_far,
-            self.timesteps_so_far]
+        objects = [weights, self.episodes_so_far, self.timesteps_so_far]
         pickle.dump(objects, open(checkpoint_path, "wb"))
         return checkpoint_path
 
diff --git a/python/ray/rllib/agents/es/optimizers.py b/python/ray/rllib/agents/es/optimizers.py
index f5ef4e109..3b48f7393 100644
--- a/python/ray/rllib/agents/es/optimizers.py
+++ b/python/ray/rllib/agents/es/optimizers.py
@@ -48,8 +48,8 @@ class Adam(Optimizer):
         self.v = np.zeros(self.dim, dtype=np.float32)
 
     def _compute_step(self, globalg):
-        a = self.stepsize * (np.sqrt(1 - self.beta2 ** self.t) /
-                             (1 - self.beta1 ** self.t))
+        a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
+                             (1 - self.beta1**self.t))
         self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
         self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
         step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
diff --git a/python/ray/rllib/agents/es/policies.py b/python/ray/rllib/agents/es/policies.py
index eb492373f..d62fee43c 100644
--- a/python/ray/rllib/agents/es/policies.py
+++ b/python/ray/rllib/agents/es/policies.py
@@ -21,8 +21,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
     noise drawn from that stream. Otherwise, no action noise will be added.
     """
     env_timestep_limit = env.spec.max_episode_steps
-    timestep_limit = (env_timestep_limit if timestep_limit is None
-                      else min(timestep_limit, env_timestep_limit))
+    timestep_limit = (env_timestep_limit if timestep_limit is None else min(
+        timestep_limit, env_timestep_limit))
     rews = []
     t = 0
     observation = env.reset()
@@ -38,16 +38,16 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
 
 
 class GenericPolicy(object):
-    def __init__(self, sess, action_space, preprocessor,
-                 observation_filter, action_noise_std):
+    def __init__(self, sess, action_space, preprocessor, observation_filter,
+                 action_noise_std):
         self.sess = sess
         self.action_space = action_space
         self.action_noise_std = action_noise_std
         self.preprocessor = preprocessor
-        self.observation_filter = get_filter(
-            observation_filter, self.preprocessor.shape)
-        self.inputs = tf.placeholder(
-            tf.float32, [None] + list(self.preprocessor.shape))
+        self.observation_filter = get_filter(observation_filter,
+                                             self.preprocessor.shape)
+        self.inputs = tf.placeholder(tf.float32,
+                                     [None] + list(self.preprocessor.shape))
 
         # Policy network.
         dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -59,16 +59,16 @@ class GenericPolicy(object):
         self.variables = ray.experimental.TensorFlowVariables(
             model.outputs, self.sess)
 
-        self.num_params = sum(np.prod(variable.shape.as_list())
-                              for _, variable
-                              in self.variables.variables.items())
+        self.num_params = sum(
+            np.prod(variable.shape.as_list())
+            for _, variable in self.variables.variables.items())
         self.sess.run(tf.global_variables_initializer())
 
     def compute(self, observation, add_noise=False, update=True):
         observation = self.preprocessor.transform(observation)
         observation = self.observation_filter(observation[None], update=update)
-        action = self.sess.run(self.sampler,
-                               feed_dict={self.inputs: observation})
+        action = self.sess.run(
+            self.sampler, feed_dict={self.inputs: observation})
         if add_noise and isinstance(self.action_space, gym.spaces.Box):
             action += np.random.randn(*action.shape) * self.action_noise_std
         return action
diff --git a/python/ray/rllib/agents/es/tabular_logger.py b/python/ray/rllib/agents/es/tabular_logger.py
index 80e7b5b37..1463e59e0 100644
--- a/python/ray/rllib/agents/es/tabular_logger.py
+++ b/python/ray/rllib/agents/es/tabular_logger.py
@@ -25,6 +25,7 @@ DISABLED = 50
 
 class TbWriter(object):
     """Based on SummaryWriter, but changed to allow for a different prefix."""
+
     def __init__(self, dir, prefix):
         self.dir = dir
         # Start at 1, because EvWriter automatically generates an object with
@@ -34,9 +35,10 @@ class TbWriter(object):
             compat.as_bytes(os.path.join(dir, prefix)))
 
     def write_values(self, key2val):
-        summary = tf.Summary(value=[tf.Summary.Value(tag=k,
-                                                     simple_value=float(v))
-                                    for (k, v) in key2val.items()])
+        summary = tf.Summary(value=[
+            tf.Summary.Value(tag=k, simple_value=float(v))
+            for (k, v) in key2val.items()
+        ])
         event = event_pb2.Event(wall_time=time.time(), summary=summary)
         event.step = self.step
         self.evwriter.WriteEvent(event)
@@ -46,6 +48,7 @@ class TbWriter(object):
     def close(self):
         self.evwriter.Close()
 
+
 # API
 
 
@@ -126,6 +129,7 @@ def get_expt_dir():
     sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir()\n")
     return get_dir()
 
+
 # Backend
 
 
@@ -167,8 +171,8 @@ class _Logger(object):
         # Write to all text outputs
         self._write_text("-" * (keywidth + valwidth + 7), "\n")
         for (key, val) in key2str.items():
-            self._write_text("| ", key, " " * (keywidth - len(key)),
-                             " | ", val, " " * (valwidth - len(val)), " |\n")
+            self._write_text("| ", key, " " * (keywidth - len(key)), " | ",
+                             val, " " * (valwidth - len(val)), " |\n")
         self._write_text("-" * (keywidth + valwidth + 7), "\n")
         for f in self.text_outputs:
             try:
@@ -202,7 +206,7 @@ class _Logger(object):
     # Misc
 
     def _do_log(self, *args):
-        self._write_text(*args + ('\n',))
+        self._write_text(*args + ('\n', ))
         for f in self.text_outputs:
             try:
                 f.flush()
diff --git a/python/ray/rllib/agents/es/utils.py b/python/ray/rllib/agents/es/utils.py
index 6ea5d31ac..1575e46c3 100644
--- a/python/ray/rllib/agents/es/utils.py
+++ b/python/ray/rllib/agents/es/utils.py
@@ -31,8 +31,9 @@ def compute_centered_ranks(x):
 def make_session(single_threaded):
     if not single_threaded:
         return tf.Session()
-    return tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
-                                            intra_op_parallelism_threads=1))
+    return tf.Session(
+        config=tf.ConfigProto(
+            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
 
 
 def itergroups(items, group_size):
@@ -50,10 +51,11 @@ def itergroups(items, group_size):
 def batched_weighted_sum(weights, vecs, batch_size):
     total = 0
     num_items_summed = 0
-    for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
-                                         itergroups(vecs, batch_size)):
+    for batch_weights, batch_vecs in zip(
+            itergroups(weights, batch_size), itergroups(vecs, batch_size)):
         assert len(batch_weights) == len(batch_vecs) <= batch_size
-        total += np.dot(np.asarray(batch_weights, dtype=np.float32),
-                        np.asarray(batch_vecs, dtype=np.float32))
+        total += np.dot(
+            np.asarray(batch_weights, dtype=np.float32),
+            np.asarray(batch_vecs, dtype=np.float32))
         num_items_summed += len(batch_weights)
     return total, num_items_summed
diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py
index 0bd4c33b4..c66146832 100644
--- a/python/ray/rllib/agents/pg/pg.py
+++ b/python/ray/rllib/agents/pg/pg.py
@@ -7,7 +7,6 @@ from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
 from ray.rllib.optimizers import SyncSamplesOptimizer
 from ray.tune.trial import Resources
 
-
 DEFAULT_CONFIG = with_common_config({
     # No remote workers by default
     "num_workers": 0,
@@ -43,9 +42,9 @@ class PGAgent(Agent):
             self.env_creator, PGPolicyGraph)
         self.remote_evaluators = self.make_remote_evaluators(
             self.env_creator, PGPolicyGraph, self.config["num_workers"], {})
-        self.optimizer = SyncSamplesOptimizer(
-            self.local_evaluator, self.remote_evaluators,
-            self.config["optimizer"])
+        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
+                                              self.remote_evaluators,
+                                              self.config["optimizer"])
 
     def _train(self):
         prev_steps = self.optimizer.num_steps_sampled
diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py
index cbd9b2745..bb831c47d 100644
--- a/python/ray/rllib/agents/pg/pg_policy_graph.py
+++ b/python/ray/rllib/agents/pg/pg_policy_graph.py
@@ -42,9 +42,15 @@ class PGPolicyGraph(TFPolicyGraph):
         ]
 
         TFPolicyGraph.__init__(
-            self, obs_space, action_space, sess, obs_input=obs,
-            action_sampler=action_dist.sample(), loss=loss,
-            loss_inputs=loss_in, state_inputs=self.model.state_in,
+            self,
+            obs_space,
+            action_space,
+            sess,
+            obs_input=obs,
+            action_sampler=action_dist.sample(),
+            loss=loss,
+            loss_inputs=loss_in,
+            state_inputs=self.model.state_in,
             state_outputs=self.model.state_out,
             seq_lens=self.model.seq_lens,
             max_seq_len=config["model"]["max_seq_len"])
diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py
index 2f8b403aa..120619d47 100644
--- a/python/ray/rllib/agents/ppo/ppo.py
+++ b/python/ray/rllib/agents/ppo/ppo.py
@@ -77,28 +77,30 @@ class PPOAgent(Agent):
         self.local_evaluator = self.make_local_evaluator(
             self.env_creator, PPOPolicyGraph)
         self.remote_evaluators = self.make_remote_evaluators(
-            self.env_creator, PPOPolicyGraph, self.config["num_workers"],
-            {"num_cpus": self.config["num_cpus_per_worker"],
-             "num_gpus": self.config["num_gpus_per_worker"]})
+            self.env_creator, PPOPolicyGraph, self.config["num_workers"], {
+                "num_cpus": self.config["num_cpus_per_worker"],
+                "num_gpus": self.config["num_gpus_per_worker"]
+            })
         if self.config["simple_optimizer"]:
             self.optimizer = SyncSamplesOptimizer(
                 self.local_evaluator, self.remote_evaluators,
                 {"num_sgd_iter": self.config["num_sgd_iter"]})
         else:
             self.optimizer = LocalMultiGPUOptimizer(
-                self.local_evaluator, self.remote_evaluators,
-                {"sgd_batch_size": self.config["sgd_batchsize"],
-                 "sgd_stepsize": self.config["sgd_stepsize"],
-                 "num_sgd_iter": self.config["num_sgd_iter"],
-                 "timesteps_per_batch": self.config["timesteps_per_batch"],
-                 "standardize_fields": ["advantages"]})
+                self.local_evaluator, self.remote_evaluators, {
+                    "sgd_batch_size": self.config["sgd_batchsize"],
+                    "sgd_stepsize": self.config["sgd_stepsize"],
+                    "num_sgd_iter": self.config["num_sgd_iter"],
+                    "timesteps_per_batch": self.config["timesteps_per_batch"],
+                    "standardize_fields": ["advantages"]
+                })
 
     def _train(self):
         prev_steps = self.optimizer.num_steps_sampled
         fetches = self.optimizer.step()
         self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
-        FilterManager.synchronize(
-            self.local_evaluator.filters, self.remote_evaluators)
+        FilterManager.synchronize(self.local_evaluator.filters,
+                                  self.remote_evaluators)
         res = self.optimizer.collect_metrics()
         res = res._replace(
             timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
@@ -115,9 +117,7 @@ class PPOAgent(Agent):
                                        "checkpoint-{}".format(self.iteration))
         agent_state = ray.get(
             [a.save.remote() for a in self.remote_evaluators])
-        extra_data = [
-            self.local_evaluator.save(),
-            agent_state]
+        extra_data = [self.local_evaluator.save(), agent_state]
         pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
         return checkpoint_path
 
@@ -126,4 +126,5 @@ class PPOAgent(Agent):
         self.local_evaluator.restore(extra_data[0])
         ray.get([
             a.restore.remote(o)
-                for (a, o) in zip(self.remote_evaluators, extra_data[1])])
+            for (a, o) in zip(self.remote_evaluators, extra_data[1])
+        ])
diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py
index 2bc6d5507..df3444318 100644
--- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py
+++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py
@@ -10,10 +10,20 @@ from ray.rllib.models.catalog import ModelCatalog
 
 
 class PPOLoss(object):
-    def __init__(
-            self, action_space, value_targets, advantages, actions, logits,
-            vf_preds, curr_action_dist, value_fn, cur_kl_coeff,
-            entropy_coeff=0, clip_param=0.1, vf_loss_coeff=1.0, use_gae=True):
+    def __init__(self,
+                 action_space,
+                 value_targets,
+                 advantages,
+                 actions,
+                 logits,
+                 vf_preds,
+                 curr_action_dist,
+                 value_fn,
+                 cur_kl_coeff,
+                 entropy_coeff=0,
+                 clip_param=0.1,
+                 vf_loss_coeff=1.0,
+                 use_gae=True):
         """Constructs the loss for Proximal Policy Objective.
 
         Arguments:
@@ -51,31 +61,33 @@ class PPOLoss(object):
 
         surrogate_loss = tf.minimum(
             advantages * logp_ratio,
-            advantages * tf.clip_by_value(
-                logp_ratio, 1 - clip_param, 1 + clip_param))
+            advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
+                                          1 + clip_param))
         self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)
 
         if use_gae:
             vf_loss1 = tf.square(value_fn - value_targets)
-            vf_clipped = vf_preds + tf.clip_by_value(
-                value_fn - vf_preds, -clip_param, clip_param)
+            vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
+                                                     -clip_param, clip_param)
             vf_loss2 = tf.square(vf_clipped - value_targets)
             vf_loss = tf.maximum(vf_loss1, vf_loss2)
             self.mean_vf_loss = tf.reduce_mean(vf_loss)
-            loss = tf.reduce_mean(
-                -surrogate_loss + cur_kl_coeff*action_kl +
-                vf_loss_coeff*vf_loss - entropy_coeff*curr_entropy)
+            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
+                                  vf_loss_coeff * vf_loss -
+                                  entropy_coeff * curr_entropy)
         else:
             self.mean_vf_loss = tf.constant(0.0)
-            loss = tf.reduce_mean(
-                -surrogate_loss + cur_kl_coeff*action_kl -
-                entropy_coeff*curr_entropy)
+            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
+                                  entropy_coeff * curr_entropy)
         self.loss = loss
 
 
 class PPOPolicyGraph(TFPolicyGraph):
-    def __init__(self, observation_space, action_space,
-                 config, existing_inputs=None):
+    def __init__(self,
+                 observation_space,
+                 action_space,
+                 config,
+                 existing_inputs=None):
         """
         Arguments:
             observation_space: Environment observation space specification.
@@ -98,16 +110,18 @@ class PPOPolicyGraph(TFPolicyGraph):
             existing_seq_lens = existing_inputs[-1]
         else:
             obs_ph = tf.placeholder(
-                tf.float32, name="obs", shape=(None,)+observation_space.shape)
+                tf.float32,
+                name="obs",
+                shape=(None, ) + observation_space.shape)
             adv_ph = tf.placeholder(
-                tf.float32, name="advantages", shape=(None,))
+                tf.float32, name="advantages", shape=(None, ))
             act_ph = ModelCatalog.get_action_placeholder(action_space)
             logits_ph = tf.placeholder(
                 tf.float32, name="logits", shape=(None, logit_dim))
             vf_preds_ph = tf.placeholder(
-                tf.float32, name="vf_preds", shape=(None,))
+                tf.float32, name="vf_preds", shape=(None, ))
             value_targets_ph = tf.placeholder(
-                tf.float32, name="value_targets", shape=(None,))
+                tf.float32, name="value_targets", shape=(None, ))
             existing_state_in = None
             existing_seq_lens = None
 
@@ -120,13 +134,19 @@ class PPOPolicyGraph(TFPolicyGraph):
             ("vf_preds", vf_preds_ph),
         ]
         self.model = ModelCatalog.get_model(
-            obs_ph, logit_dim, self.config["model"],
-            state_in=existing_state_in, seq_lens=existing_seq_lens)
+            obs_ph,
+            logit_dim,
+            self.config["model"],
+            state_in=existing_state_in,
+            seq_lens=existing_seq_lens)
 
         # KL Coefficient
         self.kl_coeff = tf.get_variable(
             initializer=tf.constant_initializer(self.kl_coeff_val),
-            name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)
+            name="kl_coeff",
+            shape=(),
+            trainable=False,
+            dtype=tf.float32)
 
         self.logits = self.model.outputs
         curr_action_dist = dist_cls(self.logits)
@@ -146,20 +166,32 @@ class PPOPolicyGraph(TFPolicyGraph):
             self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
 
         self.loss_obj = PPOLoss(
-            action_space, value_targets_ph, adv_ph, act_ph,
-            logits_ph, vf_preds_ph,
-            curr_action_dist, self.value_function, self.kl_coeff,
+            action_space,
+            value_targets_ph,
+            adv_ph,
+            act_ph,
+            logits_ph,
+            vf_preds_ph,
+            curr_action_dist,
+            self.value_function,
+            self.kl_coeff,
             entropy_coeff=self.config["entropy_coeff"],
             clip_param=self.config["clip_param"],
             vf_loss_coeff=self.config["kl_target"],
             use_gae=self.config["use_gae"])
 
         TFPolicyGraph.__init__(
-            self, observation_space, action_space,
-            self.sess, obs_input=obs_ph,
-            action_sampler=self.sampler, loss=self.loss_obj.loss,
-            loss_inputs=self.loss_in, state_inputs=self.model.state_in,
-            state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
+            self,
+            observation_space,
+            action_space,
+            self.sess,
+            obs_input=obs_ph,
+            action_sampler=self.sampler,
+            loss=self.loss_obj.loss,
+            loss_inputs=self.loss_in,
+            state_inputs=self.model.state_in,
+            state_outputs=self.model.state_out,
+            seq_lens=self.model.seq_lens,
             max_seq_len=config["model"]["max_seq_len"])
 
         self.sess.run(tf.global_variables_initializer())
@@ -167,7 +199,9 @@ class PPOPolicyGraph(TFPolicyGraph):
     def copy(self, existing_inputs):
         """Creates a copy of self using existing input placeholders."""
         return PPOPolicyGraph(
-            None, self.action_space, self.config,
+            None,
+            self.action_space,
+            self.config,
             existing_inputs=existing_inputs)
 
     def extra_compute_action_fetches(self):
@@ -193,8 +227,11 @@ class PPOPolicyGraph(TFPolicyGraph):
     def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
         last_r = 0.0
         batch = compute_advantages(
-            sample_batch, last_r, self.config["gamma"],
-            self.config["lambda"], use_gae=self.config["use_gae"])
+            sample_batch,
+            last_r,
+            self.config["gamma"],
+            self.config["lambda"],
+            use_gae=self.config["use_gae"])
         return batch
 
     def optimizer(self):
diff --git a/python/ray/rllib/agents/ppo/test/test.py b/python/ray/rllib/agents/ppo/test/test.py
index d6454eb56..432b22f9a 100644
--- a/python/ray/rllib/agents/ppo/test/test.py
+++ b/python/ray/rllib/agents/ppo/test/test.py
@@ -13,7 +13,6 @@ from ray.rllib.agents.ppo.utils import flatten, concatenate
 
 # TODO(ekl): move to rllib/models dir
 class DistributionsTest(unittest.TestCase):
-
     def testCategorical(self):
         num_samples = 100000
         logits = tf.placeholder(tf.float32, shape=(None, 10))
@@ -32,10 +31,11 @@ class DistributionsTest(unittest.TestCase):
 
 
 class UtilsTest(unittest.TestCase):
-
     def testFlatten(self):
-        d = {"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
-             "a": np.array([[[5], [-5]], [[6], [-6]]])}
+        d = {
+            "s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
+            "a": np.array([[[5], [-5]], [[6], [-6]]])
+        }
         flat = flatten(d.copy(), start=0, stop=2)
         assert_allclose(d["s"][0][0][:], flat["s"][0][:])
         assert_allclose(d["s"][0][1][:], flat["s"][1][:])
diff --git a/python/ray/rllib/agents/ppo/utils.py b/python/ray/rllib/agents/ppo/utils.py
index 5e8ac5a3a..e97dce5cf 100644
--- a/python/ray/rllib/agents/ppo/utils.py
+++ b/python/ray/rllib/agents/ppo/utils.py
@@ -16,7 +16,7 @@ def flatten(weights, start=0, stop=2):
         stop: The ending index.
     """
     for key, val in weights.items():
-        new_shape = val.shape[0:start] + (-1,) + val.shape[stop:]
+        new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:]
         weights[key] = val.reshape(new_shape)
     return weights
 
diff --git a/python/ray/rllib/env/async_vector_env.py b/python/ray/rllib/env/async_vector_env.py
index 1d6a9b374..ba0d63c12 100644
--- a/python/ray/rllib/env/async_vector_env.py
+++ b/python/ray/rllib/env/async_vector_env.py
@@ -286,8 +286,8 @@ class _MultiAgentEnvState(object):
         self.reset()
 
     def poll(self):
-        obs, rew, dones, info = (
-            self.last_obs, self.last_rewards, self.last_dones, self.last_infos)
+        obs, rew, dones, info = (self.last_obs, self.last_rewards,
+                                 self.last_dones, self.last_infos)
         self.last_obs = {}
         self.last_rewards = {}
         self.last_dones = {"__all__": False}
@@ -303,10 +303,13 @@ class _MultiAgentEnvState(object):
     def reset(self):
         self.last_obs = self.env.reset()
         self.last_rewards = {
-            agent_id: None for agent_id in self.last_obs.keys()}
+            agent_id: None
+            for agent_id in self.last_obs.keys()
+        }
         self.last_dones = {
-            agent_id: False for agent_id in self.last_obs.keys()}
-        self.last_infos = {
-            agent_id: {} for agent_id in self.last_obs.keys()}
+            agent_id: False
+            for agent_id in self.last_obs.keys()
+        }
+        self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()}
         self.last_dones["__all__"] = False
         return self.last_obs
diff --git a/python/ray/rllib/env/atari_wrappers.py b/python/ray/rllib/env/atari_wrappers.py
index d9d7beffd..f9bf5b94a 100644
--- a/python/ray/rllib/env/atari_wrappers.py
+++ b/python/ray/rllib/env/atari_wrappers.py
@@ -28,8 +28,7 @@ class NoopResetEnv(gym.Wrapper):
         if self.override_num_noops is not None:
             noops = self.override_num_noops
         else:
-            noops = self.unwrapped.np_random.randint(
-                1, self.noop_max + 1)
+            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
         assert noops > 0
         obs = None
         for _ in range(noops):
@@ -121,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):
         gym.Wrapper.__init__(self, env)
         # most recent raw observations (for max pooling across time steps)
         self._obs_buffer = np.zeros(
-            (2,)+env.observation_space.shape, dtype=np.uint8)
+            (2, ) + env.observation_space.shape, dtype=np.uint8)
         self._skip = skip
 
     def step(self, action):
diff --git a/python/ray/rllib/env/vector_env.py b/python/ray/rllib/env/vector_env.py
index ef57be859..28791f552 100644
--- a/python/ray/rllib/env/vector_env.py
+++ b/python/ray/rllib/env/vector_env.py
@@ -71,8 +71,7 @@ class _VectorizedGymEnv(VectorEnv):
         self.envs = existing_envs
         self.num_envs = num_envs
         if make_env and num_envs > 1:
-            self.resetter = _AsyncResetter(
-                make_env, int(self.num_envs ** 0.5))
+            self.resetter = _AsyncResetter(make_env, int(self.num_envs**0.5))
         else:
             self.resetter = _SimpleResetter(make_env)
         while len(self.envs) < self.num_envs:
diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py
index ceabce7ee..d4d0b5743 100644
--- a/python/ray/rllib/evaluation/metrics.py
+++ b/python/ray/rllib/evaluation/metrics.py
@@ -15,9 +15,10 @@ def collect_metrics(local_evaluator, remote_evaluators=[]):
     episode_rewards = []
     episode_lengths = []
     policy_rewards = collections.defaultdict(list)
-    metric_lists = ray.get(
-        [a.apply.remote(lambda ev: ev.sampler.get_metrics())
-         for a in remote_evaluators])
+    metric_lists = ray.get([
+        a.apply.remote(lambda ev: ev.sampler.get_metrics())
+        for a in remote_evaluators
+    ])
     metric_lists.append(local_evaluator.sampler.get_metrics())
     for metrics in metric_lists:
         for episode in metrics:
diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py
index c513389c3..58f121ef4 100644
--- a/python/ray/rllib/evaluation/policy_evaluator.py
+++ b/python/ray/rllib/evaluation/policy_evaluator.py
@@ -82,24 +82,23 @@ class PolicyEvaluator(EvaluatorInterface):
     def as_remote(cls, num_cpus=None, num_gpus=None):
         return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)
 
-    def __init__(
-            self,
-            env_creator,
-            policy_graph,
-            policy_mapping_fn=None,
-            tf_session_creator=None,
-            batch_steps=100,
-            batch_mode="truncate_episodes",
-            episode_horizon=None,
-            preprocessor_pref="rllib",
-            sample_async=False,
-            compress_observations=False,
-            num_envs=1,
-            observation_filter="NoFilter",
-            env_config=None,
-            model_config=None,
-            policy_config=None,
-            worker_index=0):
+    def __init__(self,
+                 env_creator,
+                 policy_graph,
+                 policy_mapping_fn=None,
+                 tf_session_creator=None,
+                 batch_steps=100,
+                 batch_mode="truncate_episodes",
+                 episode_horizon=None,
+                 preprocessor_pref="rllib",
+                 sample_async=False,
+                 compress_observations=False,
+                 num_envs=1,
+                 observation_filter="NoFilter",
+                 env_config=None,
+                 model_config=None,
+                 policy_config=None,
+                 worker_index=0):
         """Initialize a policy evaluator.
 
         Arguments:
@@ -157,8 +156,8 @@ class PolicyEvaluator(EvaluatorInterface):
         policy_config = policy_config or {}
         self.policy_config = policy_config
         model_config = model_config or {}
-        policy_mapping_fn = (
-            policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID))
+        policy_mapping_fn = (policy_mapping_fn
+                             or (lambda agent_id: DEFAULT_POLICY_ID))
         self.env_creator = env_creator
         self.policy_graph = policy_graph
         self.batch_steps = batch_steps
@@ -170,17 +169,21 @@ class PolicyEvaluator(EvaluatorInterface):
                 isinstance(self.env, ServingEnv) or \
                 isinstance(self.env, MultiAgentEnv) or \
                 isinstance(self.env, AsyncVectorEnv):
+
             def wrap(env):
                 return env  # we can't auto-wrap these env types
         elif is_atari(self.env) and \
                 "custom_preprocessor" not in model_config and \
                 preprocessor_pref == "deepmind":
+
             def wrap(env):
                 return wrap_deepmind(env, dim=model_config.get("dim", 80))
         else:
+
             def wrap(env):
                 return ModelCatalog.get_preprocessor_as_wrapper(
                     env, model_config)
+
         self.env = wrap(self.env)
 
         def make_env():
@@ -193,20 +196,21 @@ class PolicyEvaluator(EvaluatorInterface):
                 if tf_session_creator:
                     self.tf_sess = tf_session_creator()
                 else:
-                    self.tf_sess = tf.Session(config=tf.ConfigProto(
-                        gpu_options=tf.GPUOptions(allow_growth=True)))
+                    self.tf_sess = tf.Session(
+                        config=tf.ConfigProto(
+                            gpu_options=tf.GPUOptions(allow_growth=True)))
                 with self.tf_sess.as_default():
                     self.policy_map = self._build_policy_map(
                         policy_dict, policy_config)
         else:
-            self.policy_map = self._build_policy_map(
-                policy_dict, policy_config)
+            self.policy_map = self._build_policy_map(policy_dict,
+                                                     policy_config)
 
         self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID)
 
         self.filters = {
-            policy_id: get_filter(
-                observation_filter, policy.observation_space.shape)
+            policy_id: get_filter(observation_filter,
+                                  policy.observation_space.shape)
             for (policy_id, policy) in self.policy_map.items()
         }
 
@@ -226,24 +230,34 @@ class PolicyEvaluator(EvaluatorInterface):
             batch_steps = float("inf")  # never cut episodes
             pack_episodes = False  # sampler will return 1 episode per poll
         else:
-            raise ValueError(
-                "Unsupported batch mode: {}".format(self.batch_mode))
+            raise ValueError("Unsupported batch mode: {}".format(
+                self.batch_mode))
         if sample_async:
             self.sampler = AsyncSampler(
-                self.async_env, self.policy_map, policy_mapping_fn,
-                self.filters, batch_steps, horizon=episode_horizon,
-                pack=pack_episodes, tf_sess=self.tf_sess)
+                self.async_env,
+                self.policy_map,
+                policy_mapping_fn,
+                self.filters,
+                batch_steps,
+                horizon=episode_horizon,
+                pack=pack_episodes,
+                tf_sess=self.tf_sess)
             self.sampler.start()
         else:
             self.sampler = SyncSampler(
-                self.async_env, self.policy_map, policy_mapping_fn,
-                self.filters, batch_steps, horizon=episode_horizon,
-                pack=pack_episodes, tf_sess=self.tf_sess)
+                self.async_env,
+                self.policy_map,
+                policy_mapping_fn,
+                self.filters,
+                batch_steps,
+                horizon=episode_horizon,
+                pack=pack_episodes,
+                tf_sess=self.tf_sess)
 
     def _build_policy_map(self, policy_dict, policy_config):
         policy_map = {}
-        for name, (cls, obs_space, act_space, conf) in sorted(
-                policy_dict.items()):
+        for name, (cls, obs_space, act_space,
+                   conf) in sorted(policy_dict.items()):
             merged_conf = policy_config.copy()
             merged_conf.update(conf)
             with tf.variable_scope(name):
@@ -315,7 +329,8 @@ class PolicyEvaluator(EvaluatorInterface):
     def get_weights(self):
         return {
             pid: policy.get_weights()
-            for pid, policy in self.policy_map.items()}
+            for pid, policy in self.policy_map.items()
+        }
 
     def set_weights(self, weights):
         for pid, w in weights.items():
@@ -351,9 +366,7 @@ class PolicyEvaluator(EvaluatorInterface):
                         builder, grad)
                     for pid, grad in grads.items()
                 }
-                return {
-                    k: builder.get(v) for k, v in outputs.items()
-                }
+                return {k: builder.get(v) for k, v in outputs.items()}
             else:
                 return {
                     pid: self.policy_map[pid].apply_gradients(g)
@@ -428,8 +441,9 @@ def _validate_and_canonicalize(policy_graph, env):
         raise ValueError("policy_graph must be a rllib.PolicyGraph class")
     else:
         return {
-            DEFAULT_POLICY_ID: (
-                policy_graph, env.observation_space, env.action_space, {})}
+            DEFAULT_POLICY_ID: (policy_graph, env.observation_space,
+                                env.action_space, {})
+        }
 
 
 def _has_tensorflow_graph(policy_dict):
diff --git a/python/ray/rllib/evaluation/sample_batch.py b/python/ray/rllib/evaluation/sample_batch.py
index 14584b41f..109db4d3f 100644
--- a/python/ray/rllib/evaluation/sample_batch.py
+++ b/python/ray/rllib/evaluation/sample_batch.py
@@ -45,7 +45,8 @@ class SampleBatchBuilder(object):
         """Returns a sample batch including all previously added values."""
 
         batch = SampleBatch(
-            {k: to_float_array(v) for k, v in self.buffers.items()})
+            {k: to_float_array(v)
+             for k, v in self.buffers.items()})
         self.buffers.clear()
         self.count = 0
         return batch
@@ -69,7 +70,9 @@ class MultiAgentSampleBatchBuilder(object):
 
         self.policy_map = policy_map
         self.policy_builders = {
-            k: SampleBatchBuilder() for k in policy_map.keys()}
+            k: SampleBatchBuilder()
+            for k in policy_map.keys()
+        }
         self.agent_builders = {}
         self.agent_to_policy = {}
         self.count = 0  # increment this manually
diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py
index 4ea09652c..6ae66e6da 100644
--- a/python/ray/rllib/evaluation/sampler.py
+++ b/python/ray/rllib/evaluation/sampler.py
@@ -12,12 +12,11 @@ from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \
 from ray.rllib.env.async_vector_env import AsyncVectorEnv
 from ray.rllib.utils.tf_run_builder import TFRunBuilder
 
-
 RolloutMetrics = namedtuple(
     "RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"])
 
-PolicyEvalData = namedtuple(
-    "PolicyEvalData", ["env_id", "agent_id", "obs", "rnn_state"])
+PolicyEvalData = namedtuple("PolicyEvalData",
+                            ["env_id", "agent_id", "obs", "rnn_state"])
 
 
 class SyncSampler(object):
@@ -29,9 +28,15 @@ class SyncSampler(object):
     This class provides data on invocation, rather than on a separate
     thread."""
 
-    def __init__(
-            self, env, policies, policy_mapping_fn, obs_filters,
-            num_local_steps, horizon=None, pack=False, tf_sess=None):
+    def __init__(self,
+                 env,
+                 policies,
+                 policy_mapping_fn,
+                 obs_filters,
+                 num_local_steps,
+                 horizon=None,
+                 pack=False,
+                 tf_sess=None):
         self.async_vector_env = AsyncVectorEnv.wrap_async(env)
         self.num_local_steps = num_local_steps
         self.horizon = horizon
@@ -68,9 +73,15 @@ class AsyncSampler(threading.Thread):
     Note that batch_size is only a unit of measure here. Batches can
     accumulate and the gradient can be calculated on up to 5 batches."""
 
-    def __init__(
-            self, env, policies, policy_mapping_fn, obs_filters,
-            num_local_steps, horizon=None, pack=False, tf_sess=None):
+    def __init__(self,
+                 env,
+                 policies,
+                 policy_mapping_fn,
+                 obs_filters,
+                 num_local_steps,
+                 horizon=None,
+                 pack=False,
+                 tf_sess=None):
         for _, f in obs_filters.items():
             assert getattr(f, "is_concurrent", False), \
                 "Observation Filter must support concurrent updates."
@@ -142,9 +153,14 @@ class AsyncSampler(threading.Thread):
         return completed
 
 
-def _env_runner(
-        async_vector_env, policies, policy_mapping_fn, num_local_steps,
-        horizon, obs_filters, pack, tf_sess=None):
+def _env_runner(async_vector_env,
+                policies,
+                policy_mapping_fn,
+                num_local_steps,
+                horizon,
+                obs_filters,
+                pack,
+                tf_sess=None):
     """This implements the common experience collection logic.
 
     Args:
@@ -186,9 +202,11 @@ def _env_runner(
         else:
             return MultiAgentSampleBatchBuilder(policies)
 
-    active_episodes = defaultdict(
-        lambda: _MultiAgentEpisode(
-            policies, policy_mapping_fn, get_batch_builder))
+    def new_episode():
+        return _MultiAgentEpisode(policies, policy_mapping_fn,
+                                  get_batch_builder)
+
+    active_episodes = defaultdict(new_episode)
 
     while True:
         # Get observations from all ready agents
@@ -213,9 +231,8 @@ def _env_runner(
             # Check episode termination conditions
             if dones[env_id]["__all__"] or episode.length >= horizon:
                 all_done = True
-                yield RolloutMetrics(
-                    episode.length, episode.total_reward,
-                    dict(episode.agent_rewards))
+                yield RolloutMetrics(episode.length, episode.total_reward,
+                                     dict(episode.agent_rewards))
             else:
                 all_done = False
                 # At least send an empty dict if not done
@@ -228,9 +245,8 @@ def _env_runner(
                 agent_done = bool(all_done or dones[env_id].get(agent_id))
                 if not agent_done:
                     to_eval[policy_id].append(
-                        PolicyEvalData(
-                            env_id, agent_id, filtered_obs,
-                            episode.rnn_state_for(agent_id)))
+                        PolicyEvalData(env_id, agent_id, filtered_obs,
+                                       episode.rnn_state_for(agent_id)))
 
                 last_observation = episode.last_observation_for(agent_id)
                 episode.set_last_observation(agent_id, filtered_obs)
@@ -274,13 +290,12 @@ def _env_runner(
                     episode = active_episodes[env_id]
                     for agent_id, raw_obs in resetted_obs.items():
                         policy_id = episode.policy_for(agent_id)
-                        filtered_obs = _get_or_raise(
-                            obs_filters, policy_id)(raw_obs)
+                        filtered_obs = _get_or_raise(obs_filters,
+                                                     policy_id)(raw_obs)
                         episode.set_last_observation(agent_id, filtered_obs)
                         to_eval[policy_id].append(
-                            PolicyEvalData(
-                                env_id, agent_id, filtered_obs,
-                                episode.rnn_state_for(agent_id)))
+                            PolicyEvalData(env_id, agent_id, filtered_obs,
+                                           episode.rnn_state_for(agent_id)))
 
         # Batch eval policy actions if possible
         if tf_sess:
@@ -295,7 +310,8 @@ def _env_runner(
             policy = _get_or_raise(policies, policy_id)
             if builder:
                 eval_results[policy_id] = policy.build_compute_actions(
-                    builder, [t.obs for t in eval_data], rnn_in,
+                    builder, [t.obs for t in eval_data],
+                    rnn_in,
                     is_training=True)
             else:
                 eval_results[policy_id] = policy.compute_actions(
@@ -319,7 +335,8 @@ def _env_runner(
                 episode = active_episodes[env_id]
                 episode.set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
                 episode.set_last_pi_info(
-                    agent_id, {k: v[i] for k, v in pi_info_cols.items()})
+                    agent_id, {k: v[i]
+                               for k, v in pi_info_cols.items()})
                 if env_id in off_policy_actions and \
                         agent_id in off_policy_actions[env_id]:
                     episode.set_last_action(
@@ -334,8 +351,7 @@ def _env_runner(
 
 def _to_column_format(rnn_state_rows):
     num_cols = len(rnn_state_rows[0])
-    return [
-        [row[i] for row in rnn_state_rows] for i in range(num_cols)]
+    return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]
 
 
 def _get_or_raise(mapping, policy_id):
@@ -363,8 +379,8 @@ class _MultiAgentEpisode(object):
     def add_agent_rewards(self, reward_dict):
         for agent_id, reward in reward_dict.items():
             if reward is not None:
-                self.agent_rewards[
-                    agent_id, self.policy_for(agent_id)] += reward
+                self.agent_rewards[agent_id,
+                                   self.policy_for(agent_id)] += reward
                 self.total_reward += reward
 
     def policy_for(self, agent_id):
diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py
index 58f5e3cac..d7225d7a4 100644
--- a/python/ray/rllib/evaluation/tf_policy_graph.py
+++ b/python/ray/rllib/evaluation/tf_policy_graph.py
@@ -35,10 +35,18 @@ class TFPolicyGraph(PolicyGraph):
         SampleBatch({"action": ..., "advantages": ..., ...})
     """
 
-    def __init__(
-            self, observation_space, action_space, sess, obs_input,
-            action_sampler, loss, loss_inputs, state_inputs=None,
-            state_outputs=None, seq_lens=None, max_seq_len=20):
+    def __init__(self,
+                 observation_space,
+                 action_space,
+                 sess,
+                 obs_input,
+                 action_sampler,
+                 loss,
+                 loss_inputs,
+                 state_inputs=None,
+                 state_outputs=None,
+                 seq_lens=None,
+                 max_seq_len=20):
         """Initialize the policy graph.
 
         Arguments:
@@ -78,9 +86,9 @@ class TFPolicyGraph(PolicyGraph):
         self._seq_lens = seq_lens
         self._max_seq_len = max_seq_len
         self._optimizer = self.optimizer()
-        self._grads_and_vars = [
-            (g, v) for (g, v) in self.gradients(self._optimizer)
-            if g is not None]
+        self._grads_and_vars = [(g, v)
+                                for (g, v) in self.gradients(self._optimizer)
+                                if g is not None]
         self._grads = [g for (g, v) in self._grads_and_vars]
         self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
         self._variables = ray.experimental.TensorFlowVariables(
@@ -92,8 +100,11 @@ class TFPolicyGraph(PolicyGraph):
         if self._state_inputs:
             assert self._seq_lens is not None
 
-    def build_compute_actions(
-            self, builder, obs_batch, state_batches=None, is_training=False):
+    def build_compute_actions(self,
+                              builder,
+                              obs_batch,
+                              state_batches=None,
+                              is_training=False):
         state_batches = state_batches or []
         assert len(self._state_inputs) == len(state_batches), \
             (self._state_inputs, state_batches)
@@ -103,16 +114,15 @@ class TFPolicyGraph(PolicyGraph):
             builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
         builder.add_feed_dict({self._is_training: is_training})
         builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
-        fetches = builder.add_fetches(
-            [self._sampler] + self._state_outputs +
-            [self.extra_compute_action_fetches()])
+        fetches = builder.add_fetches([self._sampler] + self._state_outputs +
+                                      [self.extra_compute_action_fetches()])
         return fetches[0], fetches[1:-1], fetches[-1]
 
-    def compute_actions(
-            self, obs_batch, state_batches=None, is_training=False):
+    def compute_actions(self, obs_batch, state_batches=None,
+                        is_training=False):
         builder = TFRunBuilder(self._sess, "compute_actions")
-        fetches = self.build_compute_actions(
-            builder, obs_batch, state_batches, is_training)
+        fetches = self.build_compute_actions(builder, obs_batch, state_batches,
+                                             is_training)
         return builder.get(fetches)
 
     def _get_loss_inputs_dict(self, batch):
@@ -127,12 +137,11 @@ class TFPolicyGraph(PolicyGraph):
         # RNN case
         feature_keys = [k for k, v in self._loss_inputs]
         state_keys = [
-            "state_in_{}".format(i) for i in range(len(self._state_inputs))]
+            "state_in_{}".format(i) for i in range(len(self._state_inputs))
+        ]
         feature_sequences, initial_states, seq_lens = chop_into_sequences(
-            batch["t"],
-            [batch[k] for k in feature_keys],
-            [batch[k] for k in state_keys],
-            self._max_seq_len)
+            batch["t"], [batch[k] for k in feature_keys],
+            [batch[k] for k in state_keys], self._max_seq_len)
         for k, v in zip(feature_keys, feature_sequences):
             feed_dict[self._loss_input_dict[k]] = v
         for k, v in zip(state_keys, initial_states):
@@ -172,9 +181,11 @@ class TFPolicyGraph(PolicyGraph):
         builder.add_feed_dict(self.extra_apply_grad_feed_dict())
         builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch))
         builder.add_feed_dict({self._is_training: True})
-        fetches = builder.add_fetches(
-            [self._apply_op, self.extra_compute_grad_fetches(),
-             self.extra_apply_grad_fetches()])
+        fetches = builder.add_fetches([
+            self._apply_op,
+            self.extra_compute_grad_fetches(),
+            self.extra_apply_grad_fetches()
+        ])
         return fetches[1], fetches[2]
 
     def compute_apply(self, postprocessed_batch):
diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py
index 778eeff2e..069ca2244 100644
--- a/python/ray/rllib/evaluation/torch_policy_graph.py
+++ b/python/ray/rllib/evaluation/torch_policy_graph.py
@@ -27,8 +27,8 @@ class TorchPolicyGraph(PolicyGraph):
             This is necessary when using the async sampler.
     """
 
-    def __init__(
-            self, observation_space, action_space, model, loss, loss_inputs):
+    def __init__(self, observation_space, action_space, model, loss,
+                 loss_inputs):
         """Build a policy graph from policy and loss torch modules.
 
         Note that module inputs will be CPU tensors. The model and loss modules
@@ -67,8 +67,8 @@ class TorchPolicyGraph(PolicyGraph):
         """Custom PyTorch optimizer to use."""
         return torch.optim.Adam(self._model.parameters())
 
-    def compute_actions(
-            self, obs_batch, state_batches=None, is_training=False):
+    def compute_actions(self, obs_batch, state_batches=None,
+                        is_training=False):
         if state_batches:
             raise NotImplementedError("Torch RNN support")
         with self.lock:
diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py
index 1e97264a5..4e01bbc77 100644
--- a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py
+++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
     global env_version_num
 
     register(
-      id=env_name,
-      entry_point=(
-        "ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
-        "MultiAgentMountainCarEnv"),
-      max_episode_steps=200,
-      kwargs={}
-    )
+        id=env_name,
+        entry_point=(
+            "ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
+            "MultiAgentMountainCarEnv"),
+        max_episode_steps=200,
+        kwargs={})
 
 
 def create_env(env_config):
@@ -48,10 +47,12 @@ if __name__ == '__main__':
     config["horizon"] = horizon
     config["use_gae"] = False
     config["model"].update({"fcnet_hiddens": [256, 256]})
-    options = {"multiagent_obs_shapes": [2, 2],
-               "multiagent_act_shapes": [1, 1],
-               "multiagent_shared_model": False,
-               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
+    options = {
+        "multiagent_obs_shapes": [2, 2],
+        "multiagent_act_shapes": [1, 1],
+        "multiagent_shared_model": False,
+        "multiagent_fcnet_hiddens": [[32, 32]] * 2
+    }
     config["model"].update({"custom_options": options})
     alg = ppo.PPOAgent(env=env_name, config=config)
     for i in range(1):
diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py
index d454937ac..c120f00c9 100644
--- a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py
+++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py
@@ -2,7 +2,6 @@ from math import cos
 from gym.spaces import Box, Tuple, Discrete
 import numpy as np
 from gym.envs.classic_control.mountain_car import MountainCarEnv
-
 """
 Multiagent mountain car that sums and then
 averages its actions to produce the velocity
@@ -22,8 +21,8 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
         self.viewer = None
 
         self.action_space = [Discrete(3) for _ in range(2)]
-        self.observation_space = Tuple([
-            Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
+        self.observation_space = Tuple(
+            [Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
 
         self.seed()
         self.reset()
diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py
index c78b5d601..098ad6954 100644
--- a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py
+++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
     global env_version_num
 
     register(
-      id=env_name,
-      entry_point=(
-        "ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
-        "MultiAgentPendulumEnv"),
-      max_episode_steps=100,
-      kwargs={}
-    )
+        id=env_name,
+        entry_point=(
+            "ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
+            "MultiAgentPendulumEnv"),
+        max_episode_steps=100,
+        kwargs={})
 
 
 def create_env(env_config):
@@ -49,10 +48,12 @@ if __name__ == '__main__':
     config["horizon"] = horizon
     config["use_gae"] = True
     config["model"].update({"fcnet_hiddens": [256, 256]})
-    options = {"multiagent_obs_shapes": [3, 3],
-               "multiagent_act_shapes": [1, 1],
-               "multiagent_shared_model": True,
-               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
+    options = {
+        "multiagent_obs_shapes": [3, 3],
+        "multiagent_act_shapes": [1, 1],
+        "multiagent_shared_model": True,
+        "multiagent_fcnet_hiddens": [[32, 32]] * 2
+    }
     config["model"].update({"custom_options": options})
     alg = ppo.PPOAgent(env=env_name, config=config)
     for i in range(1):
diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py
index 44c86f4e6..026458327 100644
--- a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py
+++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py
@@ -2,7 +2,6 @@ from gym.spaces import Box, Tuple
 from gym.utils import seeding
 from gym.envs.classic_control.pendulum import PendulumEnv
 import numpy as np
-
 """
  Multiagent pendulum that sums its torques to generate an action
 """
@@ -10,8 +9,8 @@ import numpy as np
 
 class MultiAgentPendulumEnv(PendulumEnv):
     metadata = {
-      'render.modes': ['human', 'rgb_array'],
-      'video.frames_per_second': 30
+        'render.modes': ['human', 'rgb_array'],
+        'video.frames_per_second': 30
     }
 
     def __init__(self):
@@ -21,13 +20,14 @@ class MultiAgentPendulumEnv(PendulumEnv):
         self.viewer = None
 
         high = np.array([1., 1., self.max_speed])
-        self.action_space = [Box(low=-self.max_torque / 2,
-                                 high=self.max_torque / 2,
-                                 shape=(1,),
-                                 dtype=np.float32)
-                             for _ in range(2)]
-        self.observation_space = Tuple([
-            Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
+        self.action_space = [
+            Box(low=-self.max_torque / 2,
+                high=self.max_torque / 2,
+                shape=(1, ),
+                dtype=np.float32) for _ in range(2)
+        ]
+        self.observation_space = Tuple(
+            [Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
 
         self.seed()
 
@@ -49,8 +49,8 @@ class MultiAgentPendulumEnv(PendulumEnv):
         costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
             .001 * (summed_u ** 2)
 
-        newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
-                            3. / (m * length ** 2) * summed_u) * dt
+        newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + 3. /
+                            (m * length**2) * summed_u) * dt
         newth = th + newthdot * dt
         newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
 
@@ -65,8 +65,10 @@ class MultiAgentPendulumEnv(PendulumEnv):
 
     def _get_obs(self):
         theta, thetadot = self.state
-        return [np.array([np.cos(theta), np.sin(theta), thetadot])
-                for _ in range(2)]
+        return [
+            np.array([np.cos(theta), np.sin(theta), thetadot])
+            for _ in range(2)
+        ]
 
     def angle_normalize(self, x):
         return (((x + np.pi) % (2 * np.pi)) - np.pi)
diff --git a/python/ray/rllib/examples/multiagent_cartpole.py b/python/ray/rllib/examples/multiagent_cartpole.py
index 75c678c53..767bf84aa 100644
--- a/python/ray/rllib/examples/multiagent_cartpole.py
+++ b/python/ray/rllib/examples/multiagent_cartpole.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """Simple example of setting up a multi-agent policy mapping.
 
 Control the number of agents and policies via --num-agents and --num-policies.
@@ -24,14 +23,12 @@ from ray.rllib.test.test_multi_agent_env import MultiCartpole
 from ray.tune.logger import pretty_print
 from ray.tune.registry import register_env
 
-
 parser = argparse.ArgumentParser()
 
 parser.add_argument("--num-agents", type=int, default=4)
 parser.add_argument("--num-policies", type=int, default=2)
 parser.add_argument("--num-iters", type=int, default=20)
 
-
 if __name__ == "__main__":
     args = parser.parse_args()
     ray.init()
@@ -51,7 +48,8 @@ if __name__ == "__main__":
 
     # Setup PG with an ensemble of `num_policies` different policy graphs
     policy_graphs = {
-        "policy_{}".format(i): gen_policy() for i in range(args.num_policies)
+        "policy_{}".format(i): gen_policy()
+        for i in range(args.num_policies)
     }
     policy_ids = list(policy_graphs.keys())
 
diff --git a/python/ray/rllib/examples/serving/cartpole_client.py b/python/ray/rllib/examples/serving/cartpole_client.py
index fb27e8567..6f6a2e189 100755
--- a/python/ray/rllib/examples/serving/cartpole_client.py
+++ b/python/ray/rllib/examples/serving/cartpole_client.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """Example of querying a policy server. Copy this file for your use case.
 
 To try this out, in two separate shells run:
@@ -14,18 +13,19 @@ import gym
 
 from ray.rllib.utils.policy_client import PolicyClient
 
-
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "--no-train", action="store_true", help="Whether to disable training.")
 parser.add_argument(
-    "--off-policy", action="store_true",
+    "--off-policy",
+    action="store_true",
     help="Whether to take random instead of on-policy actions.")
 parser.add_argument(
-    "--stop-at-reward", type=int, default=9999,
+    "--stop-at-reward",
+    type=int,
+    default=9999,
     help="Stop once the specified reward is reached.")
 
-
 if __name__ == "__main__":
     args = parser.parse_args()
     env = gym.make("CartPole-v0")
diff --git a/python/ray/rllib/examples/serving/cartpole_server.py b/python/ray/rllib/examples/serving/cartpole_server.py
index 7e6d79996..a64ce03e6 100755
--- a/python/ray/rllib/examples/serving/cartpole_server.py
+++ b/python/ray/rllib/examples/serving/cartpole_server.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """Example of running a policy server. Copy this file for your use case.
 
 To try this out, in two separate shells run:
@@ -26,12 +25,12 @@ CHECKPOINT_FILE = "last_checkpoint.out"
 
 class CartpoleServing(ServingEnv):
     def __init__(self):
-        ServingEnv.__init__(
-            self, spaces.Discrete(2), spaces.Box(low=-10, high=10, shape=(4,)))
+        ServingEnv.__init__(self, spaces.Discrete(2),
+                            spaces.Box(low=-10, high=10, shape=(4, )))
 
     def run(self):
-        print("Starting policy server at {}:{}".format(
-            SERVER_ADDRESS, SERVER_PORT))
+        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
+                                                       SERVER_PORT))
         server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
         server.serve_forever()
 
@@ -42,14 +41,16 @@ if __name__ == "__main__":
 
     # We use DQN since it supports off-policy actions, but you can choose and
     # configure any agent.
-    dqn = DQNAgent(env="srv", config={
-        # Use a single process to avoid needing to set up a load balancer
-        "num_workers": 0,
-        # Configure the agent to run short iterations for debugging
-        "exploration_fraction": 0.01,
-        "learning_starts": 100,
-        "timesteps_per_iteration": 200,
-    })
+    dqn = DQNAgent(
+        env="srv",
+        config={
+            # Use a single process to avoid needing to set up a load balancer
+            "num_workers": 0,
+            # Configure the agent to run short iterations for debugging
+            "exploration_fraction": 0.01,
+            "learning_starts": 100,
+            "timesteps_per_iteration": 200,
+        })
 
     # Attempt to restore from checkpoint if possible.
     if os.path.exists(CHECKPOINT_FILE):
diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py
index 91c2381f0..ddfdd16b8 100644
--- a/python/ray/rllib/models/__init__.py
+++ b/python/ray/rllib/models/__init__.py
@@ -6,7 +6,7 @@ from ray.rllib.models.preprocessors import Preprocessor
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.lstm import LSTM
 
-
-__all__ = ["ActionDistribution", "Categorical",
-           "DiagGaussian", "Deterministic", "ModelCatalog", "Model",
-           "Preprocessor", "FullyConnectedNetwork", "LSTM"]
+__all__ = [
+    "ActionDistribution", "Categorical", "DiagGaussian", "Deterministic",
+    "ModelCatalog", "Model", "Preprocessor", "FullyConnectedNetwork", "LSTM"
+]
diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py
index c4de85004..a88f5fa3a 100644
--- a/python/ray/rllib/models/action_dist.py
+++ b/python/ray/rllib/models/action_dist.py
@@ -42,25 +42,25 @@ class Categorical(ActionDistribution):
             logits=self.inputs, labels=x)
 
     def entropy(self):
-        a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
-                                         keepdims=True)
+        a0 = self.inputs - tf.reduce_max(
+            self.inputs, reduction_indices=[1], keepdims=True)
         ea0 = tf.exp(a0)
         z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
         p0 = ea0 / z0
         return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
 
     def kl(self, other):
-        a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
-                                         keepdims=True)
-        a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1],
-                                          keepdims=True)
+        a0 = self.inputs - tf.reduce_max(
+            self.inputs, reduction_indices=[1], keepdims=True)
+        a1 = other.inputs - tf.reduce_max(
+            other.inputs, reduction_indices=[1], keepdims=True)
         ea0 = tf.exp(a0)
         ea1 = tf.exp(a1)
         z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
         z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
         p0 = ea0 / z0
-        return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)),
-                             reduction_indices=[1])
+        return tf.reduce_sum(
+            p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])
 
     def sample(self):
         return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)
@@ -90,22 +90,23 @@ class DiagGaussian(ActionDistribution):
         self.std = tf.exp(log_std)
 
     def logp(self, x):
-        return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
-                                     reduction_indices=[1]) -
+        return (-0.5 * tf.reduce_sum(
+            tf.square((x - self.mean) / self.std), reduction_indices=[1]) -
                 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
                 tf.reduce_sum(self.log_std, reduction_indices=[1]))
 
     def kl(self, other):
         assert isinstance(other, DiagGaussian)
-        return tf.reduce_sum(other.log_std - self.log_std +
-                             (tf.square(self.std) +
-                              tf.square(self.mean - other.mean)) /
-                             (2.0 * tf.square(other.std)) - 0.5,
-                             reduction_indices=[1])
+        return tf.reduce_sum(
+            other.log_std - self.log_std +
+            (tf.square(self.std) + tf.square(self.mean - other.mean)) /
+            (2.0 * tf.square(other.std)) - 0.5,
+            reduction_indices=[1])
 
     def entropy(self):
-        return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
-                             reduction_indices=[1])
+        return tf.reduce_sum(
+            self.log_std + .5 * np.log(2.0 * np.pi * np.e),
+            reduction_indices=[1])
 
     def sample(self):
         out = self.mean + self.std * tf.random_normal(tf.shape(self.mean))
@@ -158,6 +159,7 @@ class MultiActionDistribution(ActionDistribution):
     Args:
         inputs (Tensor list): A list of tensors from which to compute samples.
     """
+
     def __init__(self, inputs, action_space, child_distributions):
         # you actually have to instantiate the child distributions
         self.reshaper = Reshaper(action_space.spaces)
@@ -174,23 +176,25 @@ class MultiActionDistribution(ActionDistribution):
             # Remove extra categorical dimension
             if isinstance(distribution, Categorical):
                 split_list[i] = tf.squeeze(split_list[i], axis=-1)
-        log_list = np.asarray([distribution.logp(split_x) for
-                              distribution, split_x in
-                               zip(self.child_distributions, split_list)])
+        log_list = np.asarray([
+            distribution.logp(split_x) for distribution, split_x in zip(
+                self.child_distributions, split_list)
+        ])
         return np.sum(log_list)
 
     def kl(self, other):
         """The KL-divergence between two action distributions."""
-        kl_list = np.asarray([distribution.kl(other_distribution) for
-                              distribution, other_distribution in
-                              zip(self.child_distributions,
-                                  other.child_distributions)])
+        kl_list = np.asarray([
+            distribution.kl(other_distribution)
+            for distribution, other_distribution in zip(
+                self.child_distributions, other.child_distributions)
+        ])
         return np.sum(kl_list)
 
     def entropy(self):
         """The entropy of the action distribution."""
-        entropy_list = np.array([s.entropy() for s in
-                                 self.child_distributions])
+        entropy_list = np.array(
+            [s.entropy() for s in self.child_distributions])
         return np.sum(entropy_list)
 
     def sample(self):
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index db5717a83..b611d2e31 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -19,7 +19,6 @@ from ray.rllib.models.visionnet import VisionNetwork
 from ray.rllib.models.lstm import LSTM
 from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
 
-
 MODEL_CONFIGS = [
     # === Built-in options ===
     "conv_filters",  # Filter configuration
@@ -30,11 +29,9 @@ MODEL_CONFIGS = [
     "grayscale",  # Converts ATARI frame to 1 Channel Grayscale image
     "zero_mean",  # Changes frame to range from [-1, 1] if true
     "extra_frameskip",  # (int) for number of frames to skip
-
     "free_log_std",  # Documented in ray.rllib.models.Model
     "channel_major",  # Pytorch conv requires images to be channel-major
     "squash_to_range",  # Whether to squash the action output to space range
-
     "use_lstm",  # Whether to wrap the model with a LSTM
     "max_seq_len",  # Max seq len for training the LSTM, defaults to 20
     "lstm_cell_size",  # Size of the LSTM cell
@@ -81,8 +78,8 @@ class ModelCatalog(object):
             if dist_type is None:
                 dist = DiagGaussian
                 if config.get("squash_to_range"):
-                    dist = squash_to_range(
-                        dist, action_space.low, action_space.high)
+                    dist = squash_to_range(dist, action_space.low,
+                                           action_space.high)
                 return dist, action_space.shape[0] * 2
             elif dist_type == 'deterministic':
                 return Deterministic, action_space.shape[0]
@@ -95,12 +92,13 @@ class ModelCatalog(object):
                 dist, action_size = ModelCatalog.get_action_dist(action)
                 child_dist.append(dist)
                 size += action_size
-            return partial(MultiActionDistribution,
-                           child_distributions=child_dist,
-                           action_space=action_space), size
+            return partial(
+                MultiActionDistribution,
+                child_distributions=child_dist,
+                action_space=action_space), size
 
-        raise NotImplementedError(
-            "Unsupported args: {} {}".format(action_space, dist_type))
+        raise NotImplementedError("Unsupported args: {} {}".format(
+            action_space, dist_type))
 
     @staticmethod
     def get_action_placeholder(action_space):
@@ -120,7 +118,7 @@ class ModelCatalog(object):
             return tf.placeholder(
                 tf.float32, shape=(None, action_space.shape[0]), name="action")
         elif isinstance(action_space, gym.spaces.Discrete):
-            return tf.placeholder(tf.int64, shape=(None,), name="action")
+            return tf.placeholder(tf.int64, shape=(None, ), name="action")
         elif isinstance(action_space, gym.spaces.Tuple):
             size = 0
             all_discrete = True
@@ -131,15 +129,19 @@ class ModelCatalog(object):
                     all_discrete = False
                     size += np.product(action_space.spaces[i].shape)
             return tf.placeholder(
-                tf.int64 if all_discrete else tf.float32, shape=(None, size),
+                tf.int64 if all_discrete else tf.float32,
+                shape=(None, size),
                 name="action")
         else:
             raise NotImplementedError("action space {}"
                                       " not supported".format(action_space))
 
     @staticmethod
-    def get_model(
-            inputs, num_outputs, options=None, state_in=None, seq_lens=None):
+    def get_model(inputs,
+                  num_outputs,
+                  options=None,
+                  state_in=None,
+                  seq_lens=None):
         """Returns a suitable model conforming to given input and output specs.
 
         Args:
@@ -154,12 +156,12 @@ class ModelCatalog(object):
         """
 
         options = options or {}
-        model = ModelCatalog._get_model(
-            inputs, num_outputs, options, state_in, seq_lens)
+        model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
+                                        seq_lens)
 
         if options.get("use_lstm"):
-            model = LSTM(
-                model.last_layer, num_outputs, options, state_in, seq_lens)
+            model = LSTM(model.last_layer, num_outputs, options, state_in,
+                         seq_lens)
 
         return model
 
@@ -169,16 +171,20 @@ class ModelCatalog(object):
             model = options["custom_model"]
             print("Using custom model {}".format(model))
             return _global_registry.get(RLLIB_MODEL, model)(
-                inputs, num_outputs, options,
-                state_in=state_in, seq_lens=seq_lens)
+                inputs,
+                num_outputs,
+                options,
+                state_in=state_in,
+                seq_lens=seq_lens)
 
         obs_rank = len(inputs.shape) - 1
 
         # num_outputs > 1 used to avoid hitting this with the value function
-        if isinstance(options.get("custom_options", {}).get(
-          "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
-            return MultiAgentFullyConnectedNetwork(
-                inputs, num_outputs, options)
+        if isinstance(
+                options.get("custom_options", {}).get(
+                    "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
+            return MultiAgentFullyConnectedNetwork(inputs, num_outputs,
+                                                   options)
 
         if obs_rank > 1:
             return VisionNetwork(inputs, num_outputs, options)
@@ -198,10 +204,10 @@ class ModelCatalog(object):
         Returns:
             model (Model): Neural network model.
         """
-        from ray.rllib.models.pytorch.fcnet import (
-            FullyConnectedNetwork as PyTorchFCNet)
-        from ray.rllib.models.pytorch.visionnet import (
-            VisionNetwork as PyTorchVisionNet)
+        from ray.rllib.models.pytorch.fcnet import (FullyConnectedNetwork as
+                                                    PyTorchFCNet)
+        from ray.rllib.models.pytorch.visionnet import (VisionNetwork as
+                                                        PyTorchVisionNet)
 
         if "custom_model" in options:
             model = options["custom_model"]
@@ -232,9 +238,8 @@ class ModelCatalog(object):
         """
         for k in options.keys():
             if k not in MODEL_CONFIGS:
-                raise Exception(
-                    "Unknown config key `{}`, all keys: {}".format(
-                        k, MODEL_CONFIGS))
+                raise Exception("Unknown config key `{}`, all keys: {}".format(
+                    k, MODEL_CONFIGS))
 
         if "custom_preprocessor" in options:
             preprocessor = options["custom_preprocessor"]
@@ -271,8 +276,8 @@ class ModelCatalog(object):
             preprocessor_name (str): Name to register the preprocessor under.
             preprocessor_class (type): Python class of the preprocessor.
         """
-        _global_registry.register(
-            RLLIB_PREPROCESSOR, preprocessor_name, preprocessor_class)
+        _global_registry.register(RLLIB_PREPROCESSOR, preprocessor_name,
+                                  preprocessor_class)
 
     @staticmethod
     def register_custom_model(model_name, model_class):
diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py
index 3f5bcabf6..11aee2c0d 100644
--- a/python/ray/rllib/models/fcnet.py
+++ b/python/ray/rllib/models/fcnet.py
@@ -22,14 +22,17 @@ class FullyConnectedNetwork(Model):
             for size in hiddens:
                 label = "fc{}".format(i)
                 last_layer = slim.fully_connected(
-                    last_layer, size,
+                    last_layer,
+                    size,
                     weights_initializer=normc_initializer(1.0),
                     activation_fn=activation,
                     scope=label)
                 i += 1
             label = "fc_out"
             output = slim.fully_connected(
-                last_layer, num_outputs,
+                last_layer,
+                num_outputs,
                 weights_initializer=normc_initializer(0.01),
-                activation_fn=None, scope=label)
+                activation_fn=None,
+                scope=label)
             return output, last_layer
diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py
index 55a9626cb..1365b5a69 100644
--- a/python/ray/rllib/models/lstm.py
+++ b/python/ray/rllib/models/lstm.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 """LSTM support for RLlib.
 
 The main trick here is that we add the time dimension at the last moment.
@@ -14,7 +13,6 @@ See the add_time_dimension() and chop_into_sequences() functions below for
 more info.
 """
 
-
 import numpy as np
 import tensorflow as tf
 import tensorflow.contrib.rnn as rnn
@@ -46,14 +44,13 @@ def add_time_dimension(padded_inputs, seq_lens):
 
     # Dynamically reshape the padded batch to introduce a time dimension.
     new_batch_size = padded_batch_size // max_seq_len
-    new_shape = (
-        [new_batch_size, max_seq_len] +
-        padded_inputs.get_shape().as_list()[1:])
+    new_shape = ([new_batch_size, max_seq_len] +
+                 padded_inputs.get_shape().as_list()[1:])
     return tf.reshape(padded_inputs, new_shape)
 
 
-def chop_into_sequences(
-        time_column, feature_columns, state_columns, max_seq_len):
+def chop_into_sequences(time_column, feature_columns, state_columns,
+                        max_seq_len):
     """Truncate and pad experiences into fixed-length sequences.
 
     Arguments:
@@ -106,7 +103,7 @@ def chop_into_sequences(
     feature_sequences = []
     for f in feature_columns:
         f = np.array(f)
-        f_pad = np.zeros((len(seq_lens) * max_seq_len,) + np.shape(f)[1:])
+        f_pad = np.zeros((len(seq_lens) * max_seq_len, ) + np.shape(f)[1:])
         seq_base = 0
         i = 0
         for l in seq_lens:
@@ -152,7 +149,8 @@ class LSTM(Model):
             lstm = rnn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
         self.state_init = [
             np.zeros(lstm.state_size.c, np.float32),
-            np.zeros(lstm.state_size.h, np.float32)]
+            np.zeros(lstm.state_size.h, np.float32)
+        ]
 
         # Setup LSTM inputs
         if self.state_in:
@@ -170,12 +168,15 @@ class LSTM(Model):
         else:
             state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in)
         lstm_out, lstm_state = tf.nn.dynamic_rnn(
-            lstm, last_layer, initial_state=state_in,
-            sequence_length=self.seq_lens, time_major=False)
+            lstm,
+            last_layer,
+            initial_state=state_in,
+            sequence_length=self.seq_lens,
+            time_major=False)
         self.state_out = list(lstm_state)
 
         # Compute outputs
         last_layer = tf.reshape(lstm_out, [-1, cell_size])
-        logits = linear(
-            last_layer, num_outputs, "action", normc_initializer(0.01))
+        logits = linear(last_layer, num_outputs, "action",
+                        normc_initializer(0.01))
         return logits, last_layer
diff --git a/python/ray/rllib/models/misc.py b/python/ray/rllib/models/misc.py
index 461296ecd..aad399c3b 100644
--- a/python/ray/rllib/models/misc.py
+++ b/python/ray/rllib/models/misc.py
@@ -11,6 +11,7 @@ def normc_initializer(std=1.0):
         out = np.random.randn(*shape).astype(np.float32)
         out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
         return tf.constant(out)
+
     return _initializer
 
 
@@ -18,12 +19,20 @@ def get_activation_fn(name):
     return getattr(tf.nn, name)
 
 
-def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
-           dtype=tf.float32, collections=None):
+def conv2d(x,
+           num_filters,
+           name,
+           filter_size=(3, 3),
+           stride=(1, 1),
+           pad="SAME",
+           dtype=tf.float32,
+           collections=None):
     with tf.variable_scope(name):
         stride_shape = [1, stride[0], stride[1], 1]
-        filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]),
-                        num_filters]
+        filter_shape = [
+            filter_size[0], filter_size[1],
+            int(x.get_shape()[3]), num_filters
+        ]
 
         # There are "num input feature maps * filter height * filter width"
         # inputs to each hidden unit.
@@ -34,20 +43,24 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
         # Initialize weights with random weights.
         w_bound = np.sqrt(6 / (fan_in + fan_out))
 
-        w = tf.get_variable("W", filter_shape, dtype,
-                            tf.random_uniform_initializer(-w_bound, w_bound),
-                            collections=collections)
-        b = tf.get_variable("b", [1, 1, 1, num_filters],
-                            initializer=tf.constant_initializer(0.0),
-                            collections=collections)
+        w = tf.get_variable(
+            "W",
+            filter_shape,
+            dtype,
+            tf.random_uniform_initializer(-w_bound, w_bound),
+            collections=collections)
+        b = tf.get_variable(
+            "b", [1, 1, 1, num_filters],
+            initializer=tf.constant_initializer(0.0),
+            collections=collections)
         return tf.nn.conv2d(x, w, stride_shape, pad) + b
 
 
 def linear(x, size, name, initializer=None, bias_init=0):
-    w = tf.get_variable(name + "/w", [x.get_shape()[1], size],
-                        initializer=initializer)
-    b = tf.get_variable(name + "/b", [size],
-                        initializer=tf.constant_initializer(bias_init))
+    w = tf.get_variable(
+        name + "/w", [x.get_shape()[1], size], initializer=initializer)
+    b = tf.get_variable(
+        name + "/b", [size], initializer=tf.constant_initializer(bias_init))
     return tf.matmul(x, w) + b
 
 
diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py
index 27206adaf..00d6575e6 100644
--- a/python/ray/rllib/models/model.py
+++ b/python/ray/rllib/models/model.py
@@ -37,8 +37,12 @@ class Model(object):
     a scale parameter (like a standard deviation).
     """
 
-    def __init__(
-            self, inputs, num_outputs, options, state_in=None, seq_lens=None):
+    def __init__(self,
+                 inputs,
+                 num_outputs,
+                 options,
+                 state_in=None,
+                 seq_lens=None):
         self.inputs = inputs
 
         # Default attribute values for the non-RNN case
@@ -57,8 +61,10 @@ class Model(object):
         self.outputs, self.last_layer = self._build_layers(
             inputs, num_outputs, options)
         if options.get("free_log_std", False):
-            log_std = tf.get_variable(name="log_std", shape=[num_outputs],
-                                      initializer=tf.zeros_initializer)
+            log_std = tf.get_variable(
+                name="log_std",
+                shape=[num_outputs],
+                initializer=tf.zeros_initializer)
             self.outputs = tf.concat(
                 [self.outputs, 0.0 * self.outputs + log_std], 1)
 
diff --git a/python/ray/rllib/models/multiagentfcnet.py b/python/ray/rllib/models/multiagentfcnet.py
index 81d9c8d15..d000e95df 100644
--- a/python/ray/rllib/models/multiagentfcnet.py
+++ b/python/ray/rllib/models/multiagentfcnet.py
@@ -23,7 +23,7 @@ class MultiAgentFullyConnectedNetwork(Model):
 
         custom_options = options["custom_options"]
         hiddens = custom_options.get("multiagent_fcnet_hiddens",
-                                     [[256, 256]]*1)
+                                     [[256, 256]] * 1)
 
         # check for a shared model
         shared_model = custom_options.get("multiagent_shared_model", 0)
@@ -35,8 +35,8 @@ class MultiAgentFullyConnectedNetwork(Model):
                 sub_options = options.copy()
                 sub_options.update({"fcnet_hiddens": hiddens[i]})
                 # TODO(ev) make this support arbitrary networks
-                fcnet = FullyConnectedNetwork(
-                    split_inputs[i], int(num_actions[i]), sub_options)
+                fcnet = FullyConnectedNetwork(split_inputs[i],
+                                              int(num_actions[i]), sub_options)
                 output = fcnet.outputs
                 outputs.append(output)
         overall_output = tf.concat(outputs, axis=1)
diff --git a/python/ray/rllib/models/preprocessors.py b/python/ray/rllib/models/preprocessors.py
index 050d8b5a2..46404ae08 100644
--- a/python/ray/rllib/models/preprocessors.py
+++ b/python/ray/rllib/models/preprocessors.py
@@ -6,7 +6,7 @@ import numpy as np
 import gym
 
 ATARI_OBS_SHAPE = (210, 160, 3)
-ATARI_RAM_OBS_SHAPE = (128,)
+ATARI_RAM_OBS_SHAPE = (128, )
 
 
 class Preprocessor(object):
@@ -70,7 +70,7 @@ class AtariPixelPreprocessor(Preprocessor):
 
 class AtariRamPreprocessor(Preprocessor):
     def _init(self):
-        self.shape = (128,)
+        self.shape = (128, )
 
     def transform(self, observation):
         return (observation - 128) / 128
@@ -78,7 +78,7 @@ class AtariRamPreprocessor(Preprocessor):
 
 class OneHotPreprocessor(Preprocessor):
     def _init(self):
-        self.shape = (self._obs_space.n,)
+        self.shape = (self._obs_space.n, )
 
     def transform(self, observation):
         arr = np.zeros(self._obs_space.n)
@@ -111,13 +111,14 @@ class TupleFlatteningPreprocessor(Preprocessor):
             preprocessor = get_preprocessor(space)(space, self._options)
             self.preprocessors.append(preprocessor)
             size += np.product(preprocessor.shape)
-        self.shape = (size,)
+        self.shape = (size, )
 
     def transform(self, observation):
         assert len(observation) == len(self.preprocessors), observation
         return np.concatenate([
             np.reshape(p.transform(o), [np.product(p.shape)])
-            for (o, p) in zip(observation, self.preprocessors)])
+            for (o, p) in zip(observation, self.preprocessors)
+        ])
 
 
 def get_preprocessor(space):
diff --git a/python/ray/rllib/models/visionnet.py b/python/ray/rllib/models/visionnet.py
index 893f7acd2..c3b633dbe 100644
--- a/python/ray/rllib/models/visionnet.py
+++ b/python/ray/rllib/models/visionnet.py
@@ -22,14 +22,27 @@ class VisionNetwork(Model):
         with tf.name_scope("vision_net"):
             for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
                 inputs = slim.conv2d(
-                    inputs, out_size, kernel, stride,
-                    activation_fn=activation, scope="conv{}".format(i))
+                    inputs,
+                    out_size,
+                    kernel,
+                    stride,
+                    activation_fn=activation,
+                    scope="conv{}".format(i))
             out_size, kernel, stride = filters[-1]
             fc1 = slim.conv2d(
-                inputs, out_size, kernel, stride,
-                activation_fn=activation, padding="VALID", scope="fc1")
-            fc2 = slim.conv2d(fc1, num_outputs, [1, 1], activation_fn=None,
-                              normalizer_fn=None, scope="fc2")
+                inputs,
+                out_size,
+                kernel,
+                stride,
+                activation_fn=activation,
+                padding="VALID",
+                scope="fc1")
+            fc2 = slim.conv2d(
+                fc1,
+                num_outputs, [1, 1],
+                activation_fn=None,
+                normalizer_fn=None,
+                scope="fc2")
             return flatten(fc2), flatten(fc1)
 
 
diff --git a/python/ray/rllib/optimizers/__init__.py b/python/ray/rllib/optimizers/__init__.py
index f8b530ff7..eadb38620 100644
--- a/python/ray/rllib/optimizers/__init__.py
+++ b/python/ray/rllib/optimizers/__init__.py
@@ -6,7 +6,6 @@ from ray.rllib.optimizers.sync_samples_optimizer import SyncSamplesOptimizer
 from ray.rllib.optimizers.sync_replay_optimizer import SyncReplayOptimizer
 from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer
 
-
 __all__ = [
     "PolicyOptimizer", "AsyncSamplesOptimizer", "AsyncGradientsOptimizer",
     "SyncSamplesOptimizer", "SyncReplayOptimizer", "LocalMultiGPUOptimizer"
diff --git a/python/ray/rllib/optimizers/async_gradients_optimizer.py b/python/ray/rllib/optimizers/async_gradients_optimizer.py
index 3c379782f..397fabba9 100644
--- a/python/ray/rllib/optimizers/async_gradients_optimizer.py
+++ b/python/ray/rllib/optimizers/async_gradients_optimizer.py
@@ -14,6 +14,7 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
     evaluators, sending updated weights back as needed. This pipelines the
     gradient computations on the remote workers.
     """
+
     def _init(self, grads_per_step=100):
         self.apply_timer = TimerStat()
         self.wait_timer = TimerStat()
@@ -55,8 +56,9 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
                     num_gradients += 1
 
     def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "wait_time_ms": round(1000 * self.wait_timer.mean, 3),
-            "apply_time_ms": round(1000 * self.apply_timer.mean, 3),
-            "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "wait_time_ms": round(1000 * self.wait_timer.mean, 3),
+                "apply_time_ms": round(1000 * self.apply_timer.mean, 3),
+                "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
+            })
diff --git a/python/ray/rllib/optimizers/async_samples_optimizer.py b/python/ray/rllib/optimizers/async_samples_optimizer.py
index dfc52e1d8..e37901c46 100644
--- a/python/ray/rllib/optimizers/async_samples_optimizer.py
+++ b/python/ray/rllib/optimizers/async_samples_optimizer.py
@@ -22,7 +22,6 @@ from ray.rllib.utils.actors import TaskPool, create_colocated
 from ray.rllib.utils.timer import TimerStat
 from ray.rllib.utils.window_stat import WindowStat
 
-
 SAMPLE_QUEUE_DEPTH = 2
 REPLAY_QUEUE_DEPTH = 4
 LEARNER_QUEUE_MAX_SIZE = 16
@@ -35,10 +34,10 @@ class ReplayActor(object):
     Ray actors are single-threaded, so for scalability multiple replay actors
     may be created to increase parallelism."""
 
-    def __init__(
-            self, num_shards, learning_starts, buffer_size, train_batch_size,
-            prioritized_replay_alpha, prioritized_replay_beta,
-            prioritized_replay_eps, clip_rewards):
+    def __init__(self, num_shards, learning_starts, buffer_size,
+                 train_batch_size, prioritized_replay_alpha,
+                 prioritized_replay_beta, prioritized_replay_eps,
+                 clip_rewards):
         self.replay_starts = learning_starts // num_shards
         self.buffer_size = buffer_size // num_shards
         self.train_batch_size = train_batch_size
@@ -46,7 +45,8 @@ class ReplayActor(object):
         self.prioritized_replay_eps = prioritized_replay_eps
 
         self.replay_buffer = PrioritizedReplayBuffer(
-            self.buffer_size, alpha=prioritized_replay_alpha,
+            self.buffer_size,
+            alpha=prioritized_replay_alpha,
             clip_rewards=clip_rewards)
 
         # Metrics
@@ -60,38 +60,39 @@ class ReplayActor(object):
     def add_batch(self, batch):
         with self.add_batch_timer:
             for row in batch.rows():
-                self.replay_buffer.add(
-                    row["obs"], row["actions"], row["rewards"], row["new_obs"],
-                    row["dones"], row["weights"])
+                self.replay_buffer.add(row["obs"], row["actions"],
+                                       row["rewards"], row["new_obs"],
+                                       row["dones"], row["weights"])
 
     def replay(self):
         with self.replay_timer:
             if len(self.replay_buffer) < self.replay_starts:
                 return None
 
-            (obses_t, actions, rewards, obses_tp1,
-                dones, weights, batch_indexes) = self.replay_buffer.sample(
-                    self.train_batch_size,
-                    beta=self.prioritized_replay_beta)
+            (obses_t, actions, rewards, obses_tp1, dones, weights,
+             batch_indexes) = self.replay_buffer.sample(
+                 self.train_batch_size, beta=self.prioritized_replay_beta)
 
             batch = SampleBatch({
-                "obs": obses_t, "actions": actions, "rewards": rewards,
-                "new_obs": obses_tp1, "dones": dones, "weights": weights,
-                "batch_indexes": batch_indexes})
+                "obs": obses_t,
+                "actions": actions,
+                "rewards": rewards,
+                "new_obs": obses_tp1,
+                "dones": dones,
+                "weights": weights,
+                "batch_indexes": batch_indexes
+            })
             return batch
 
     def update_priorities(self, batch_indexes, td_errors):
         with self.update_priorities_timer:
-            new_priorities = (
-                np.abs(td_errors) + self.prioritized_replay_eps)
+            new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
             self.replay_buffer.update_priorities(batch_indexes, new_priorities)
 
     def stats(self):
         stat = {
-            "add_batch_time_ms": round(
-                1000 * self.add_batch_timer.mean, 3),
-            "replay_time_ms": round(
-                1000 * self.replay_timer.mean, 3),
+            "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
+            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
             "update_priorities_time_ms": round(
                 1000 * self.update_priorities_timer.mean, 3),
         }
@@ -145,13 +146,19 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
     "td_error" array in the info return of compute_gradients(). This error
     term will be used for sample prioritization."""
 
-    def _init(
-            self, learning_starts=1000, buffer_size=10000,
-            prioritized_replay=True, prioritized_replay_alpha=0.6,
-            prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
-            train_batch_size=512, sample_batch_size=50,
-            num_replay_buffer_shards=1, max_weight_sync_delay=400,
-            clip_rewards=True, debug=False):
+    def _init(self,
+              learning_starts=1000,
+              buffer_size=10000,
+              prioritized_replay=True,
+              prioritized_replay_alpha=0.6,
+              prioritized_replay_beta=0.4,
+              prioritized_replay_eps=1e-6,
+              train_batch_size=512,
+              sample_batch_size=50,
+              num_replay_buffer_shards=1,
+              max_weight_sync_delay=400,
+              clip_rewards=True,
+              debug=False):
 
         self.debug = debug
         self.replay_starts = learning_starts
@@ -164,18 +171,21 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
         self.learner = LearnerThread(self.local_evaluator)
         self.learner.start()
 
-        self.replay_actors = create_colocated(
-            ReplayActor,
-            [num_replay_buffer_shards, learning_starts, buffer_size,
-             train_batch_size, prioritized_replay_alpha,
-             prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
-            num_replay_buffer_shards)
+        self.replay_actors = create_colocated(ReplayActor, [
+            num_replay_buffer_shards, learning_starts, buffer_size,
+            train_batch_size, prioritized_replay_alpha,
+            prioritized_replay_beta, prioritized_replay_eps, clip_rewards
+        ], num_replay_buffer_shards)
         assert len(self.remote_evaluators) > 0
 
         # Stats
-        self.timers = {k: TimerStat() for k in [
-            "put_weights", "get_samples", "enqueue", "sample_processing",
-            "replay_processing", "update_priorities", "train", "sample"]}
+        self.timers = {
+            k: TimerStat()
+            for k in [
+                "put_weights", "get_samples", "enqueue", "sample_processing",
+                "replay_processing", "update_priorities", "train", "sample"
+            ]
+        }
         self.num_weight_syncs = 0
         self.learning_started = False
 
@@ -221,8 +231,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
                 sample_timesteps += self.sample_batch_size
 
                 # Send the data to the replay buffer
-                random.choice(self.replay_actors).add_batch.remote(
-                    sample_batch)
+                random.choice(
+                    self.replay_actors).add_batch.remote(sample_batch)
 
                 # Update weights if needed
                 self.steps_since_update[ev] += self.sample_batch_size
@@ -268,8 +278,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
         timing["learner_dequeue_time_ms"] = round(
             1000 * self.learner.queue_timer.mean, 3)
         stats = {
-            "sample_throughput": round(
-                self.timers["sample"].mean_throughput, 3),
+            "sample_throughput": round(self.timers["sample"].mean_throughput,
+                                       3),
             "train_throughput": round(self.timers["train"].mean_throughput, 3),
             "num_weight_syncs": self.num_weight_syncs,
         }
diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py
index 844dc11fb..7233e37e9 100644
--- a/python/ray/rllib/optimizers/multi_gpu_impl.py
+++ b/python/ray/rllib/optimizers/multi_gpu_impl.py
@@ -6,7 +6,6 @@ from collections import namedtuple
 
 import tensorflow as tf
 
-
 # Variable scope in which created variables will be placed under
 TOWER_SCOPE_NAME = "tower"
 
@@ -47,8 +46,14 @@ class LocalSyncParallelOptimizer(object):
         grad_norm_clipping: None or int stdev to clip grad norms by
     """
 
-    def __init__(self, optimizer, devices, input_placeholders, rnn_inputs,
-                 per_device_batch_size, build_graph, logdir,
+    def __init__(self,
+                 optimizer,
+                 devices,
+                 input_placeholders,
+                 rnn_inputs,
+                 per_device_batch_size,
+                 build_graph,
+                 logdir,
                  grad_norm_clipping=None):
         # TODO(rliaw): remove logdir
         self.optimizer = optimizer
@@ -78,8 +83,8 @@ class LocalSyncParallelOptimizer(object):
         self._towers = []
         for device, device_placeholders in zip(self.devices, data_splits):
             self._towers.append(
-                self._setup_device(
-                    device, device_placeholders, len(input_placeholders)))
+                self._setup_device(device, device_placeholders,
+                                   len(input_placeholders)))
 
         avg = average_gradients([t.grads for t in self._towers])
         if grad_norm_clipping:
@@ -119,14 +124,10 @@ class LocalSyncParallelOptimizer(object):
             assert len(state_inputs[0]) * seq_len == len(inputs[0])
             # Make sure the shorter state inputs arrays are evenly divisible
             state_inputs = [
-                make_divisible_by(arr, self.batch_size)
-                for arr in state_inputs
+                make_divisible_by(arr, self.batch_size) for arr in state_inputs
             ]
             # Then truncate the data inputs to match
-            inputs = [
-                arr[:len(state_inputs[0]) * seq_len]
-                for arr in inputs
-            ]
+            inputs = [arr[:len(state_inputs[0]) * seq_len] for arr in inputs]
             assert len(state_inputs[0]) * seq_len == len(inputs[0])
             assert len(state_inputs[0]) % self.batch_size == 0
             for ph, arr in zip(self.loss_inputs, inputs + state_inputs):
@@ -138,8 +139,7 @@ class LocalSyncParallelOptimizer(object):
                 feed_dict[ph] = truncated_arr
                 truncated_len = len(truncated_arr)
 
-        sess.run(
-            [t.init_op for t in self._towers], feed_dict=feed_dict)
+        sess.run([t.init_op for t in self._towers], feed_dict=feed_dict)
 
         tuples_per_device = truncated_len / len(self.devices)
         assert tuples_per_device > 0, \
@@ -198,7 +198,9 @@ class LocalSyncParallelOptimizer(object):
                 device_input_slices = []
                 for i, ph in enumerate(device_input_placeholders):
                     current_batch = tf.Variable(
-                        ph, trainable=False, validate_shape=False,
+                        ph,
+                        trainable=False,
+                        validate_shape=False,
                         collections=[])
                     device_input_batches.append(current_batch)
                     if i < num_data_in:
@@ -210,18 +212,17 @@ class LocalSyncParallelOptimizer(object):
                     current_slice = tf.slice(
                         current_batch,
                         ([self._batch_index // scale * granularity] +
-                            [0] * len(ph.shape[1:])),
+                         [0] * len(ph.shape[1:])),
                         ([self.per_device_batch_size // scale * granularity] +
-                            [-1] * len(ph.shape[1:])))
+                         [-1] * len(ph.shape[1:])))
                     current_slice.set_shape(ph.shape)
                     device_input_slices.append(current_slice)
                 graph_obj = self.build_graph(device_input_slices)
                 device_grads = graph_obj.gradients(self.optimizer)
             return Tower(
-                tf.group(*[batch.initializer
-                           for batch in device_input_batches]),
-                device_grads,
-                graph_obj)
+                tf.group(
+                    *[batch.initializer for batch in device_input_batches]),
+                device_grads, graph_obj)
 
 
 # Each tower is a copy of the loss graph pinned to a specific device.
diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py
index 0c39aab7a..7e4ee2895 100644
--- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py
+++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py
@@ -30,8 +30,12 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
     may result in unexpected behavior.
     """
 
-    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
-              timesteps_per_batch=1024, standardize_fields=[]):
+    def _init(self,
+              sgd_batch_size=128,
+              sgd_stepsize=5e-5,
+              num_sgd_iter=10,
+              timesteps_per_batch=1024,
+              standardize_fields=[]):
         self.batch_size = sgd_batch_size
         self.sgd_stepsize = sgd_stepsize
         self.num_sgd_iter = num_sgd_iter
@@ -41,8 +45,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
             self.devices = ["/cpu:0"]
         else:
             self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
-        self.batch_size = int(
-                sgd_batch_size / len(self.devices)) * len(self.devices)
+        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
+            self.devices)
         assert self.batch_size % len(self.devices) == 0
         assert self.batch_size >= len(self.devices), "batch size too small"
         self.per_device_batch_size = int(self.batch_size / len(self.devices))
@@ -70,16 +74,15 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
                 with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
                     if self.policy._state_inputs:
                         rnn_inputs = self.policy._state_inputs + [
-                            self.policy._seq_lens]
+                            self.policy._seq_lens
+                        ]
                     else:
                         rnn_inputs = []
                     self.par_opt = LocalSyncParallelOptimizer(
-                        tf.train.AdamOptimizer(self.sgd_stepsize),
-                        self.devices,
-                        [v for _, v in self.policy.loss_inputs()],
-                        rnn_inputs,
-                        self.per_device_batch_size,
-                        self.policy.copy,
+                        tf.train.AdamOptimizer(
+                            self.sgd_stepsize), self.devices,
+                        [v for _, v in self.policy.loss_inputs()], rnn_inputs,
+                        self.per_device_batch_size, self.policy.copy,
                         os.getcwd())
 
                 self.sess = self.local_evaluator.tf_sess
@@ -117,8 +120,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
             else:
                 state_keys = []
             tuples_per_device = self.par_opt.load_data(
-                self.sess,
-                [tuples[k] for k in data_keys],
+                self.sess, [tuples[k] for k in data_keys],
                 [tuples[k] for k in state_keys])
 
         with self.grad_timer:
@@ -141,12 +143,14 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
         return _averaged(iter_extra_fetches)
 
     def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
-            "load_time_ms": round(1000 * self.load_timer.mean, 3),
-            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
-            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
+                "load_time_ms": round(1000 * self.load_timer.mean, 3),
+                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
+                "update_time_ms": round(1000 * self.update_weights_timer.mean,
+                                        3),
+            })
 
 
 def _averaged(kv):
diff --git a/python/ray/rllib/optimizers/policy_optimizer.py b/python/ray/rllib/optimizers/policy_optimizer.py
index e4f8ce011..04a76f4ea 100644
--- a/python/ray/rllib/optimizers/policy_optimizer.py
+++ b/python/ray/rllib/optimizers/policy_optimizer.py
@@ -103,9 +103,10 @@ class PolicyOptimizer(object):
         """
 
         local_result = [func(self.local_evaluator, 0)]
-        remote_results = ray.get(
-            [ev.apply.remote(func, i + 1)
-             for i, ev in enumerate(self.remote_evaluators)])
+        remote_results = ray.get([
+            ev.apply.remote(func, i + 1)
+            for i, ev in enumerate(self.remote_evaluators)
+        ])
         return local_result + remote_results
 
     def collect_metrics(self):
diff --git a/python/ray/rllib/optimizers/replay_buffer.py b/python/ray/rllib/optimizers/replay_buffer.py
index a1e374414..6730a62b2 100644
--- a/python/ray/rllib/optimizers/replay_buffer.py
+++ b/python/ray/rllib/optimizers/replay_buffer.py
@@ -90,8 +90,10 @@ class ReplayBuffer(object):
           done_mask[i] = 1 if executing act_batch[i] resulted in
           the end of an episode and 0 otherwise.
         """
-        idxes = [random.randint(0, len(self._storage) - 1)
-                 for _ in range(batch_size)]
+        idxes = [
+            random.randint(0,
+                           len(self._storage) - 1) for _ in range(batch_size)
+        ]
         self._num_sampled += batch_size
         return self._encode_sample(idxes)
 
@@ -142,12 +144,12 @@ class PrioritizedReplayBuffer(ReplayBuffer):
             reward = np.sign(reward)
 
         idx = self._next_idx
-        super(PrioritizedReplayBuffer, self).add(
-            obs_t, action, reward, obs_tp1, done, weight)
+        super(PrioritizedReplayBuffer, self).add(obs_t, action, reward,
+                                                 obs_tp1, done, weight)
         if weight is None:
             weight = self._max_priority
-        self._it_sum[idx] = weight ** self._alpha
-        self._it_min[idx] = weight ** self._alpha
+        self._it_sum[idx] = weight**self._alpha
+        self._it_min[idx] = weight**self._alpha
 
     def _sample_proportional(self, batch_size):
         res = []
@@ -202,11 +204,11 @@ class PrioritizedReplayBuffer(ReplayBuffer):
 
         weights = []
         p_min = self._it_min.min() / self._it_sum.sum()
-        max_weight = (p_min * len(self._storage)) ** (-beta)
+        max_weight = (p_min * len(self._storage))**(-beta)
 
         for idx in idxes:
             p_sample = self._it_sum[idx] / self._it_sum.sum()
-            weight = (p_sample * len(self._storage)) ** (-beta)
+            weight = (p_sample * len(self._storage))**(-beta)
             weights.append(weight / max_weight)
         weights = np.array(weights)
         encoded_sample = self._encode_sample(idxes)
@@ -231,10 +233,10 @@ class PrioritizedReplayBuffer(ReplayBuffer):
         for idx, priority in zip(idxes, priorities):
             assert priority > 0
             assert 0 <= idx < len(self._storage)
-            delta = priority ** self._alpha - self._it_sum[idx]
+            delta = priority**self._alpha - self._it_sum[idx]
             self._prio_change_stats.push(delta)
-            self._it_sum[idx] = priority ** self._alpha
-            self._it_min[idx] = priority ** self._alpha
+            self._it_sum[idx] = priority**self._alpha
+            self._it_min[idx] = priority**self._alpha
 
             self._max_priority = max(self._max_priority, priority)
 
diff --git a/python/ray/rllib/optimizers/segment_tree.py b/python/ray/rllib/optimizers/segment_tree.py
index b412a89bd..e09ed4723 100644
--- a/python/ray/rllib/optimizers/segment_tree.py
+++ b/python/ray/rllib/optimizers/segment_tree.py
@@ -54,8 +54,7 @@ class SegmentTree(object):
                 return self._operation(
                     self._reduce_helper(start, mid, 2 * node, node_start, mid),
                     self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1,
-                                        node_end)
-                )
+                                        node_end))
 
     def reduce(self, start=0, end=None):
         """Returns result of applying `self.operation`
@@ -89,9 +88,8 @@ class SegmentTree(object):
         self._value[idx] = val
         idx //= 2
         while idx >= 1:
-            self._value[idx] = self._operation(
-                self._value[2 * idx],
-                self._value[2 * idx + 1])
+            self._value[idx] = self._operation(self._value[2 * idx],
+                                               self._value[2 * idx + 1])
             idx //= 2
 
     def __getitem__(self, idx):
@@ -102,9 +100,7 @@ class SegmentTree(object):
 class SumSegmentTree(SegmentTree):
     def __init__(self, capacity):
         super(SumSegmentTree, self).__init__(
-            capacity=capacity,
-            operation=operator.add,
-            neutral_element=0.0)
+            capacity=capacity, operation=operator.add, neutral_element=0.0)
 
     def sum(self, start=0, end=None):
         """Returns arr[start] + ... + arr[end]"""
@@ -142,9 +138,7 @@ class SumSegmentTree(SegmentTree):
 class MinSegmentTree(SegmentTree):
     def __init__(self, capacity):
         super(MinSegmentTree, self).__init__(
-            capacity=capacity,
-            operation=min,
-            neutral_element=float('inf'))
+            capacity=capacity, operation=min, neutral_element=float('inf'))
 
     def min(self, start=0, end=None):
         """Returns min(arr[start], ...,  arr[end])"""
diff --git a/python/ray/rllib/optimizers/sync_replay_optimizer.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py
index 1058b0d5a..834994cd7 100644
--- a/python/ray/rllib/optimizers/sync_replay_optimizer.py
+++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py
@@ -23,11 +23,16 @@ class SyncReplayOptimizer(PolicyOptimizer):
     "td_error" array in the info return of compute_gradients(). This error
     term will be used for sample prioritization."""
 
-    def _init(
-            self, learning_starts=1000, buffer_size=10000,
-            prioritized_replay=True, prioritized_replay_alpha=0.6,
-            prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
-            train_batch_size=32, sample_batch_size=4, clip_rewards=True):
+    def _init(self,
+              learning_starts=1000,
+              buffer_size=10000,
+              prioritized_replay=True,
+              prioritized_replay_alpha=0.6,
+              prioritized_replay_beta=0.4,
+              prioritized_replay_eps=1e-6,
+              train_batch_size=32,
+              sample_batch_size=4,
+              clip_rewards=True):
 
         self.replay_starts = learning_starts
         self.prioritized_replay_beta = prioritized_replay_beta
@@ -43,11 +48,14 @@ class SyncReplayOptimizer(PolicyOptimizer):
 
         # Set up replay buffer
         if prioritized_replay:
+
             def new_buffer():
                 return PrioritizedReplayBuffer(
-                    buffer_size, alpha=prioritized_replay_alpha,
+                    buffer_size,
+                    alpha=prioritized_replay_alpha,
                     clip_rewards=clip_rewards)
         else:
+
             def new_buffer():
                 return ReplayBuffer(buffer_size, clip_rewards)
 
@@ -72,17 +80,19 @@ class SyncReplayOptimizer(PolicyOptimizer):
 
             # Handle everything as if multiagent
             if isinstance(batch, SampleBatch):
-                batch = MultiAgentBatch(
-                    {DEFAULT_POLICY_ID: batch}, batch.count)
+                batch = MultiAgentBatch({
+                    DEFAULT_POLICY_ID: batch
+                }, batch.count)
 
             for policy_id, s in batch.policy_batches.items():
                 for row in s.rows():
                     if "weights" not in row:
                         row["weights"] = np.ones_like(row["rewards"])
                     self.replay_buffers[policy_id].add(
-                        pack_if_needed(row["obs"]), row["actions"],
-                        row["rewards"], pack_if_needed(row["new_obs"]),
-                        row["dones"], row["weights"])
+                        pack_if_needed(row["obs"]),
+                        row["actions"], row["rewards"],
+                        pack_if_needed(row["new_obs"]), row["dones"],
+                        row["weights"])
 
         if self.num_steps_sampled >= self.replay_starts:
             self._optimize()
@@ -112,27 +122,35 @@ class SyncReplayOptimizer(PolicyOptimizer):
         with self.replay_timer:
             for policy_id, replay_buffer in self.replay_buffers.items():
                 if isinstance(replay_buffer, PrioritizedReplayBuffer):
-                    (obses_t, actions, rewards, obses_tp1,
-                        dones, weights, batch_indexes) = replay_buffer.sample(
-                            self.train_batch_size,
-                            beta=self.prioritized_replay_beta)
+                    (obses_t, actions, rewards, obses_tp1, dones, weights,
+                     batch_indexes) = replay_buffer.sample(
+                         self.train_batch_size,
+                         beta=self.prioritized_replay_beta)
                 else:
                     (obses_t, actions, rewards, obses_tp1,
-                        dones) = replay_buffer.sample(self.train_batch_size)
+                     dones) = replay_buffer.sample(self.train_batch_size)
                     weights = np.ones_like(rewards)
-                    batch_indexes = - np.ones_like(rewards)
+                    batch_indexes = -np.ones_like(rewards)
             samples[policy_id] = SampleBatch({
-                "obs": obses_t, "actions": actions, "rewards": rewards,
-                "new_obs": obses_tp1, "dones": dones, "weights": weights,
-                "batch_indexes": batch_indexes})
+                "obs": obses_t,
+                "actions": actions,
+                "rewards": rewards,
+                "new_obs": obses_tp1,
+                "dones": dones,
+                "weights": weights,
+                "batch_indexes": batch_indexes
+            })
         return MultiAgentBatch(samples, self.train_batch_size)
 
     def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
-            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
-            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
-            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
-            "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
-            "opt_samples": round(self.grad_timer.mean_units_processed, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
+                "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
+                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
+                "update_time_ms": round(1000 * self.update_weights_timer.mean,
+                                        3),
+                "opt_peak_throughput": round(self.grad_timer.mean_throughput,
+                                             3),
+                "opt_samples": round(self.grad_timer.mean_units_processed, 3),
+            })
diff --git a/python/ray/rllib/optimizers/sync_samples_optimizer.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py
index 6b4483fb1..76d2d9c46 100644
--- a/python/ray/rllib/optimizers/sync_samples_optimizer.py
+++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py
@@ -51,10 +51,13 @@ class SyncSamplesOptimizer(PolicyOptimizer):
         return fetches
 
     def stats(self):
-        return dict(PolicyOptimizer.stats(self), **{
-            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
-            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
-            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
-            "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
-            "opt_samples": round(self.grad_timer.mean_units_processed, 3),
-        })
+        return dict(
+            PolicyOptimizer.stats(self), **{
+                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
+                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
+                "update_time_ms": round(1000 * self.update_weights_timer.mean,
+                                        3),
+                "opt_peak_throughput": round(self.grad_timer.mean_throughput,
+                                             3),
+                "opt_samples": round(self.grad_timer.mean_units_processed, 3),
+            })
diff --git a/python/ray/rllib/rollout.py b/python/ray/rllib/rollout.py
index 58639c361..0e33e3d6c 100755
--- a/python/ray/rllib/rollout.py
+++ b/python/ray/rllib/rollout.py
@@ -15,7 +15,6 @@ from ray.rllib.agents.agent import get_agent_class
 from ray.rllib.agents.dqn.common.wrappers import wrap_dqn
 from ray.rllib.models import ModelCatalog
 
-
 EXAMPLE_USAGE = """
 Example Usage via RLlib CLI:
     rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN
@@ -32,30 +31,37 @@ def create_parser(parser_creator=None):
     parser = parser_creator(
         formatter_class=argparse.RawDescriptionHelpFormatter,
         description="Roll out a reinforcement learning agent "
-                    "given a checkpoint.", epilog=EXAMPLE_USAGE)
+        "given a checkpoint.",
+        epilog=EXAMPLE_USAGE)
 
     parser.add_argument(
         "checkpoint", type=str, help="Checkpoint from which to roll out.")
     required_named = parser.add_argument_group("required named arguments")
     required_named.add_argument(
-        "--run", type=str, required=True,
+        "--run",
+        type=str,
+        required=True,
         help="The algorithm or model to train. This may refer to the name "
-             "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
-             "user-defined trainable function or class registered in the "
-             "tune registry.")
+        "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
+        "user-defined trainable function or class registered in the "
+        "tune registry.")
     required_named.add_argument(
         "--env", type=str, help="The gym environment to use.")
     parser.add_argument(
-        "--no-render", default=False, action="store_const", const=True,
+        "--no-render",
+        default=False,
+        action="store_const",
+        const=True,
         help="Surpress rendering of the environment.")
     parser.add_argument(
         "--steps", default=None, help="Number of steps to roll out.")
+    parser.add_argument("--out", default=None, help="Output filename.")
     parser.add_argument(
-        "--out", default=None, help="Output filename.")
-    parser.add_argument(
-        "--config", default="{}", type=json.loads,
+        "--config",
+        default="{}",
+        type=json.loads,
         help="Algorithm-specific configuration (e.g. env, hyperparams). "
-             "Surpresses loading of configuration from checkpoint.")
+        "Surpresses loading of configuration from checkpoint.")
     return parser
 
 
diff --git a/python/ray/rllib/scripts.py b/python/ray/rllib/scripts.py
index ede37efc5..cc48b83cf 100644
--- a/python/ray/rllib/scripts.py
+++ b/python/ray/rllib/scripts.py
@@ -9,7 +9,6 @@ import argparse
 from ray.rllib import train
 from ray.rllib import rollout
 
-
 EXAMPLE_USAGE = """
 Example usage for training:
     rllib train --run DQN --env CartPole-v0
diff --git a/python/ray/rllib/test/mock_evaluator.py b/python/ray/rllib/test/mock_evaluator.py
index 83c0f354e..e11b097e7 100644
--- a/python/ray/rllib/test/mock_evaluator.py
+++ b/python/ray/rllib/test/mock_evaluator.py
@@ -15,16 +15,17 @@ class _MockEvaluator(object):
         self._sample_count = sample_count
         self.obs_filter = MeanStdFilter(())
         self.rew_filter = MeanStdFilter(())
-        self.filters = {"obs_filter": self.obs_filter,
-                        "rew_filter": self.rew_filter}
+        self.filters = {
+            "obs_filter": self.obs_filter,
+            "rew_filter": self.rew_filter
+        }
 
     def sample(self):
         samples_dict = {"observations": [], "rewards": []}
         for i in range(self._sample_count):
             samples_dict["observations"].append(
                 self.obs_filter(np.random.randn()))
-            samples_dict["rewards"].append(
-                self.rew_filter(np.random.randn()))
+            samples_dict["rewards"].append(self.rew_filter(np.random.randn()))
         return SampleBatch(samples_dict)
 
     def compute_gradients(self, samples):
diff --git a/python/ray/rllib/test/test_catalog.py b/python/ray/rllib/test/test_catalog.py
index 3e8a08990..454c9255c 100644
--- a/python/ray/rllib/test/test_catalog.py
+++ b/python/ray/rllib/test/test_catalog.py
@@ -8,8 +8,8 @@ import ray
 
 from ray.rllib.models import ModelCatalog
 from ray.rllib.models.model import Model
-from ray.rllib.models.preprocessors import (
-    NoPreprocessor, OneHotPreprocessor, Preprocessor)
+from ray.rllib.models.preprocessors import (NoPreprocessor, OneHotPreprocessor,
+                                            Preprocessor)
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.visionnet import VisionNetwork
 
@@ -44,9 +44,11 @@ class ModelCatalogTest(unittest.TestCase):
         class TupleEnv(object):
             def __init__(self):
                 self.observation_space = Tuple(
-                    [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
+                    [Discrete(5),
+                     Box(0, 1, shape=(3, ), dtype=np.float32)])
+
         p1 = ModelCatalog.get_preprocessor(TupleEnv())
-        self.assertEqual(p1.shape, (8,))
+        self.assertEqual(p1.shape, (8, ))
         self.assertEqual(
             list(p1.transform((0, [1, 2, 3]))),
             [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
diff --git a/python/ray/rllib/test/test_checkpoint_restore.py b/python/ray/rllib/test/test_checkpoint_restore.py
index f94e08b5a..1776ee8a1 100644
--- a/python/ray/rllib/test/test_checkpoint_restore.py
+++ b/python/ray/rllib/test/test_checkpoint_restore.py
@@ -20,12 +20,24 @@ def get_mean_action(alg, obs):
 ray.init(num_cpus=10)
 
 CONFIGS = {
-    "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
-           "num_workers": 2},
+    "ES": {
+        "episodes_per_batch": 10,
+        "timesteps_per_batch": 100,
+        "num_workers": 2
+    },
     "DQN": {},
-    "DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
-    "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
-    "A3C": {"num_workers": 1},
+    "DDPG": {
+        "noise_scale": 0.0,
+        "timesteps_per_iteration": 100
+    },
+    "PPO": {
+        "num_sgd_iter": 5,
+        "timesteps_per_batch": 1000,
+        "num_workers": 2
+    },
+    "A3C": {
+        "num_workers": 1
+    },
 }
 
 
diff --git a/python/ray/rllib/test/test_filters.py b/python/ray/rllib/test/test_filters.py
index 7cb7da6b5..664b1388e 100644
--- a/python/ray/rllib/test/test_filters.py
+++ b/python/ray/rllib/test/test_filters.py
@@ -13,7 +13,7 @@ from ray.rllib.test.mock_evaluator import _MockEvaluator
 
 class RunningStatTest(unittest.TestCase):
     def testRunningStat(self):
-        for shp in ((), (3,), (3, 4)):
+        for shp in ((), (3, ), (3, 4)):
             li = []
             rs = RunningStat(shp)
             for _ in range(5):
@@ -22,12 +22,12 @@ class RunningStatTest(unittest.TestCase):
                 li.append(val)
                 m = np.mean(li, axis=0)
                 self.assertTrue(np.allclose(rs.mean, m))
-                v = (np.square(m) if (len(li) == 1)
-                     else np.var(li, ddof=1, axis=0))
+                v = (np.square(m)
+                     if (len(li) == 1) else np.var(li, ddof=1, axis=0))
                 self.assertTrue(np.allclose(rs.var, v))
 
     def testCombiningStat(self):
-        for shape in [(), (3,), (3, 4)]:
+        for shape in [(), (3, ), (3, 4)]:
             li = []
             rs1 = RunningStat(shape)
             rs2 = RunningStat(shape)
@@ -48,7 +48,7 @@ class RunningStatTest(unittest.TestCase):
 
 class MSFTest(unittest.TestCase):
     def testBasic(self):
-        for shape in [(), (3,), (3, 4, 4)]:
+        for shape in [(), (3, ), (3, 4, 4)]:
             filt = MeanStdFilter(shape)
             for i in range(5):
                 filt(np.ones(shape))
@@ -93,8 +93,10 @@ class FilterManagerTest(unittest.TestCase):
         remote_e = RemoteEvaluator.remote(sample_count=10)
         remote_e.sample.remote()
 
-        FilterManager.synchronize(
-            {"obs_filter": filt1, "rew_filter": filt1.copy()}, [remote_e])
+        FilterManager.synchronize({
+            "obs_filter": filt1,
+            "rew_filter": filt1.copy()
+        }, [remote_e])
 
         filters = ray.get(remote_e.get_filters.remote())
         obs_f = filters["obs_filter"]
diff --git a/python/ray/rllib/test/test_lstm.py b/python/ray/rllib/test/test_lstm.py
index 0e92901fd..0fd6dffc3 100644
--- a/python/ray/rllib/test/test_lstm.py
+++ b/python/ray/rllib/test/test_lstm.py
@@ -10,22 +10,15 @@ from ray.rllib.models.lstm import chop_into_sequences
 class LSTMUtilsTest(unittest.TestCase):
     def testBasic(self):
         t = [1, 2, 3, 1, 2, 3, 4, 5]
-        f = [
-            [101, 102, 103, 201, 202, 203, 204, 205],
-            [[101], [102], [103], [201], [202], [203], [204], [205]]
-        ]
+        f = [[101, 102, 103, 201, 202, 203, 204, 205],
+             [[101], [102], [103], [201], [202], [203], [204], [205]]]
         s = [[209, 208, 207, 109, 108, 107, 106, 105]]
         f_pad, s_init, seq_lens = chop_into_sequences(t, f, s, 4)
-        self.assertEqual(
-            [f.tolist() for f in f_pad],
-            [
-                [101, 102, 103, 0,
-                 201, 202, 203, 204,
-                 205, 0, 0, 0],
-                [[101], [102], [103], [0],
-                 [201], [202], [203], [204],
-                 [205], [0], [0], [0]],
-            ])
+        self.assertEqual([f.tolist() for f in f_pad], [
+            [101, 102, 103, 0, 201, 202, 203, 204, 205, 0, 0, 0],
+            [[101], [102], [103], [0], [201], [202], [203], [204], [205], [0],
+             [0], [0]],
+        ])
         self.assertEqual([s.tolist() for s in s_init], [[209, 109, 105]])
         self.assertEqual(seq_lens.tolist(), [3, 4, 1])
 
diff --git a/python/ray/rllib/test/test_multi_agent_env.py b/python/ray/rllib/test/test_multi_agent_env.py
index fcaabfdd9..c6ce25ed4 100644
--- a/python/ray/rllib/test/test_multi_agent_env.py
+++ b/python/ray/rllib/test/test_multi_agent_env.py
@@ -129,12 +129,21 @@ class TestMultiAgentEnv(unittest.TestCase):
             obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
             self.assertEqual(obs, {0: 0, 1: 0, 2: 0, 3: 0})
             self.assertEqual(rew, {0: 1, 1: 1, 2: 1, 3: 1})
-            self.assertEqual(
-                done,
-                {0: False, 1: False, 2: False, 3: False, "__all__": False})
+            self.assertEqual(done, {
+                0: False,
+                1: False,
+                2: False,
+                3: False,
+                "__all__": False
+            })
         obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
-        self.assertEqual(
-            done, {0: True, 1: True, 2: True, 3: True, "__all__": True})
+        self.assertEqual(done, {
+            0: True,
+            1: True,
+            2: True,
+            3: True,
+            "__all__": True
+        })
 
     def testRoundRobinMock(self):
         env = RoundRobinMultiAgent(2)
@@ -156,24 +165,51 @@ class TestMultiAgentEnv(unittest.TestCase):
         self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
         self.assertEqual(rew, {0: {0: None, 1: None}, 1: {0: None, 1: None}})
         self.assertEqual(
-            dones,
-            {0: {0: False, 1: False, "__all__": False},
-             1: {0: False, 1: False, "__all__": False}})
+            dones, {
+                0: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                },
+                1: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                }
+            })
         for _ in range(24):
             env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
             obs, rew, dones, _, _ = env.poll()
             self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
             self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
             self.assertEqual(
-                dones,
-                {0: {0: False, 1: False, "__all__": False},
-                 1: {0: False, 1: False, "__all__": False}})
+                dones, {
+                    0: {
+                        0: False,
+                        1: False,
+                        "__all__": False
+                    },
+                    1: {
+                        0: False,
+                        1: False,
+                        "__all__": False
+                    }
+                })
         env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
         obs, rew, dones, _, _ = env.poll()
         self.assertEqual(
-            dones,
-            {0: {0: True, 1: True, "__all__": True},
-             1: {0: True, 1: True, "__all__": True}})
+            dones, {
+                0: {
+                    0: True,
+                    1: True,
+                    "__all__": True
+                },
+                1: {
+                    0: True,
+                    1: True,
+                    "__all__": True
+                }
+            })
 
         # Reset processing
         self.assertRaises(
@@ -186,9 +222,18 @@ class TestMultiAgentEnv(unittest.TestCase):
         self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
         self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
         self.assertEqual(
-            dones,
-            {0: {0: False, 1: False, "__all__": False},
-             1: {0: False, 1: False, "__all__": False}})
+            dones, {
+                0: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                },
+                1: {
+                    0: False,
+                    1: False,
+                    "__all__": False
+                }
+            })
 
     def testVectorizeRoundRobin(self):
         env = _MultiAgentEnvToAsync(lambda: RoundRobinMultiAgent(2), [], 2)
@@ -217,9 +262,8 @@ class TestMultiAgentEnv(unittest.TestCase):
         self.assertEqual(batch.count, 50)
         self.assertEqual(batch.policy_batches["p0"].count, 150)
         self.assertEqual(batch.policy_batches["p1"].count, 100)
-        self.assertEqual(
-            batch.policy_batches["p0"]["t"].tolist(),
-            list(range(25)) * 6)
+        self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
+                         list(range(25)) * 6)
 
     def testMultiAgentSampleRoundRobin(self):
         act_space = gym.spaces.Discrete(2)
@@ -236,21 +280,16 @@ class TestMultiAgentEnv(unittest.TestCase):
         # since we round robin introduce agents into the env, some of the env
         # steps don't count as proper transitions
         self.assertEqual(batch.policy_batches["p0"].count, 42)
-        self.assertEqual(
-            batch.policy_batches["p0"]["obs"].tolist()[:10],
-            [0, 1, 2, 3, 4] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["new_obs"].tolist()[:10],
-            [1, 2, 3, 4, 5] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["rewards"].tolist()[:10],
-            [100, 100, 100, 100, 0] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["dones"].tolist()[:10],
-            [False, False, False, False, True] * 2)
-        self.assertEqual(
-            batch.policy_batches["p0"]["t"].tolist()[:10],
-            [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
+        self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10],
+                         [0, 1, 2, 3, 4] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10],
+                         [1, 2, 3, 4, 5] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
+                         [100, 100, 100, 100, 0] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
+                         [False, False, False, False, True] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
+                         [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
 
     def testTrainMultiCartpoleSinglePolicy(self):
         n = 10
@@ -289,11 +328,17 @@ class TestMultiAgentEnv(unittest.TestCase):
             policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
             batch_steps=50)
         if optimizer_cls == AsyncGradientsOptimizer:
-            remote_evs = [PolicyEvaluator.as_remote().remote(
-                env_creator=lambda _: MultiCartpole(n),
-                policy_graph=policies,
-                policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
-                batch_steps=50)]
+
+            def policy_mapper(agent_id):
+                return ["p1", "p2"][agent_id % 2]
+
+            remote_evs = [
+                PolicyEvaluator.as_remote().remote(
+                    env_creator=lambda _: MultiCartpole(n),
+                    policy_graph=policies,
+                    policy_mapping_fn=policy_mapper,
+                    batch_steps=50)
+            ]
         else:
             remote_evs = []
         optimizer = optimizer_cls(ev, remote_evs, {})
@@ -330,8 +375,8 @@ class TestMultiAgentEnv(unittest.TestCase):
         obs_space = env.observation_space
         policies = {}
         for i in range(20):
-            policies["pg_{}".format(i)] = (
-                PGPolicyGraph, obs_space, act_space, {})
+            policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
+                                           {})
         policy_ids = list(policies.keys())
         ev = PolicyEvaluator(
             env_creator=lambda _: MultiCartpole(n),
diff --git a/python/ray/rllib/test/test_optimizers.py b/python/ray/rllib/test/test_optimizers.py
index c39327255..6a5022d36 100644
--- a/python/ray/rllib/test/test_optimizers.py
+++ b/python/ray/rllib/test/test_optimizers.py
@@ -21,8 +21,8 @@ class AsyncOptimizerTest(unittest.TestCase):
         local = _MockEvaluator()
         remotes = ray.remote(_MockEvaluator)
         remote_evaluators = [remotes.remote() for i in range(5)]
-        test_optimizer = AsyncGradientsOptimizer(
-            local, remote_evaluators, {"grads_per_step": 10})
+        test_optimizer = AsyncGradientsOptimizer(local, remote_evaluators,
+                                                 {"grads_per_step": 10})
         test_optimizer.step()
         self.assertTrue(all(local.get_weights() == 0))
 
diff --git a/python/ray/rllib/test/test_policy_evaluator.py b/python/ray/rllib/test/test_policy_evaluator.py
index 2817174f7..472625fb3 100644
--- a/python/ray/rllib/test/test_policy_evaluator.py
+++ b/python/ray/rllib/test/test_policy_evaluator.py
@@ -66,8 +66,7 @@ class MockEnv2(gym.Env):
 
 class MockVectorEnv(VectorEnv):
     def __init__(self, episode_length, num_envs):
-        self.envs = [
-            MockEnv(episode_length) for _ in range(num_envs)]
+        self.envs = [MockEnv(episode_length) for _ in range(num_envs)]
         self.observation_space = gym.spaces.Discrete(1)
         self.action_space = gym.spaces.Discrete(2)
         self.num_envs = num_envs
@@ -102,7 +101,10 @@ class TestPolicyEvaluator(unittest.TestCase):
     def testQueryEvaluators(self):
         register_env("test", lambda _: gym.make("CartPole-v0"))
         pg = PGAgent(
-            env="test", config={"num_workers": 2, "sample_batch_size": 5})
+            env="test", config={
+                "num_workers": 2,
+                "sample_batch_size": 5
+            })
         results = pg.optimizer.foreach_evaluator(lambda ev: ev.batch_steps)
         results2 = pg.optimizer.foreach_evaluator_with_index(
             lambda ev, i: (i, ev.batch_steps))
@@ -112,10 +114,12 @@ class TestPolicyEvaluator(unittest.TestCase):
     def testMetrics(self):
         ev = PolicyEvaluator(
             env_creator=lambda _: MockEnv(episode_length=10),
-            policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
+            policy_graph=MockPolicyGraph,
+            batch_mode="complete_episodes")
         remote_ev = PolicyEvaluator.as_remote().remote(
             env_creator=lambda _: MockEnv(episode_length=10),
-            policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
+            policy_graph=MockPolicyGraph,
+            batch_mode="complete_episodes")
         ev.sample()
         ray.get(remote_ev.sample.remote())
         result = collect_metrics(ev, [remote_ev])
@@ -149,7 +153,8 @@ class TestPolicyEvaluator(unittest.TestCase):
             env_creator=lambda _: MockEnv(episode_length=20),
             policy_graph=MockPolicyGraph,
             batch_mode="truncate_episodes",
-            batch_steps=16, num_envs=8)
+            batch_steps=16,
+            num_envs=8)
         for _ in range(8):
             batch = ev.sample()
             self.assertEqual(batch.count, 16)
@@ -175,7 +180,8 @@ class TestPolicyEvaluator(unittest.TestCase):
             env_creator=lambda _: MockEnv(episode_length=8),
             policy_graph=MockPolicyGraph,
             batch_mode="truncate_episodes",
-            batch_steps=16, num_envs=4)
+            batch_steps=16,
+            num_envs=4)
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
         result = collect_metrics(ev, [])
@@ -186,8 +192,7 @@ class TestPolicyEvaluator(unittest.TestCase):
 
     def testVectorEnvSupport(self):
         ev = PolicyEvaluator(
-            env_creator=lambda _: MockVectorEnv(
-                episode_length=20, num_envs=8),
+            env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
             policy_graph=MockPolicyGraph,
             batch_mode="truncate_episodes",
             batch_steps=10)
diff --git a/python/ray/rllib/test/test_serving_env.py b/python/ray/rllib/test/test_serving_env.py
index eadfe1164..5d9dd641a 100644
--- a/python/ray/rllib/test/test_serving_env.py
+++ b/python/ray/rllib/test/test_serving_env.py
@@ -83,8 +83,8 @@ class MultiServing(ServingEnv):
     def __init__(self, env_creator):
         self.env_creator = env_creator
         self.env = env_creator()
-        ServingEnv.__init__(
-            self, self.env.action_space, self.env.observation_space)
+        ServingEnv.__init__(self, self.env.action_space,
+                            self.env.observation_space)
 
     def run(self):
         envs = [self.env_creator() for _ in range(5)]
@@ -97,8 +97,7 @@ class MultiServing(ServingEnv):
                     eids[i] = uuid.uuid4().hex
                     self.start_episode(episode_id=eids[i])
                     cur_obs[i] = envs[i].reset()
-            actions = [
-                self.get_action(eids[i], cur_obs[i]) for i in active]
+            actions = [self.get_action(eids[i], cur_obs[i]) for i in active]
             for i, action in zip(active, actions):
                 obs, reward, done, _ = envs[i].step(action)
                 cur_obs[i] = obs
@@ -164,8 +163,7 @@ class TestServingEnv(unittest.TestCase):
         raise Exception("failed to improve reward")
 
     def testTrainCartpole(self):
-        register_env(
-            "test", lambda _: SimpleServing(gym.make("CartPole-v0")))
+        register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
         pg = PGAgent(env="test", config={"num_workers": 0})
         for i in range(100):
             result = pg.train()
@@ -176,8 +174,8 @@ class TestServingEnv(unittest.TestCase):
         raise Exception("failed to improve reward")
 
     def testTrainCartpoleMulti(self):
-        register_env(
-            "test2", lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
+        register_env("test2",
+                     lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
         pg = PGAgent(env="test2", config={"num_workers": 0})
         for i in range(100):
             result = pg.train()
diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py
index 1189168e8..bbdbda4b0 100644
--- a/python/ray/rllib/test/test_supported_spaces.py
+++ b/python/ray/rllib/test/test_supported_spaces.py
@@ -14,27 +14,29 @@ from ray.tune.registry import register_env
 
 ACTION_SPACES_TO_TEST = {
     "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
     "simple_tuple": Tuple([
-        Box(0.0, 1.0, (5,), dtype=np.float32),
-        Box(0.0, 1.0, (5,), dtype=np.float32)]),
+        Box(0.0, 1.0, (5, ), dtype=np.float32),
+        Box(0.0, 1.0, (5, ), dtype=np.float32)
+    ]),
     "implicit_tuple": [
-        Box(0.0, 1.0, (5,), dtype=np.float32),
-        Box(0.0, 1.0, (5,), dtype=np.float32)],
+        Box(0.0, 1.0, (5, ), dtype=np.float32),
+        Box(0.0, 1.0, (5, ), dtype=np.float32)
+    ],
 }
 
 OBSERVATION_SPACES_TO_TEST = {
     "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
     "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
     "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
-    "atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
+    "atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
     "simple_tuple": Tuple([
-        Box(0.0, 1.0, (5,), dtype=np.float32),
-        Box(0.0, 1.0, (5,), dtype=np.float32)]),
-    "mixed_tuple": Tuple([
-        Discrete(10),
-        Box(0.0, 1.0, (5,), dtype=np.float32)]),
+        Box(0.0, 1.0, (5, ), dtype=np.float32),
+        Box(0.0, 1.0, (5, ), dtype=np.float32)
+    ]),
+    "mixed_tuple": Tuple(
+        [Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]),
 }
 
 
@@ -90,30 +92,33 @@ class ModelSupportedSpaces(unittest.TestCase):
         stats = {}
         check_support("DDPG", {"timesteps_per_iteration": 1}, stats)
         check_support("DQN", {"timesteps_per_iteration": 1}, stats)
+        check_support("A3C", {
+            "num_workers": 1,
+            "optimizer": {
+                "grads_per_step": 1
+            }
+        }, stats)
         check_support(
-            "A3C", {"num_workers": 1, "optimizer": {"grads_per_step": 1}},
-            stats)
+            "PPO", {
+                "num_workers": 1,
+                "num_sgd_iter": 1,
+                "timesteps_per_batch": 1,
+                "sgd_batchsize": 1
+            }, stats)
         check_support(
-            "PPO",
-            {"num_workers": 1, "num_sgd_iter": 1, "timesteps_per_batch": 1,
-             "sgd_batchsize": 1},
-            stats)
-        check_support(
-            "ES",
-            {"num_workers": 1, "noise_size": 10000000,
-             "episodes_per_batch": 1, "timesteps_per_batch": 1},
-            stats)
-        check_support(
-            "PG",
-            {"num_workers": 1, "optimizer": {}},
-            stats)
+            "ES", {
+                "num_workers": 1,
+                "noise_size": 10000000,
+                "episodes_per_batch": 1,
+                "timesteps_per_batch": 1
+            }, stats)
+        check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
         num_unexpected_errors = 0
         for (alg, a_name, o_name), stat in sorted(stats.items()):
             if stat not in ["ok", "unsupported"]:
                 num_unexpected_errors += 1
-            print(
-                alg, "action_space", a_name, "obs_space", o_name,
-                "result", stat)
+            print(alg, "action_space", a_name, "obs_space", o_name, "result",
+                  stat)
         self.assertEqual(num_unexpected_errors, 0)
 
 
@@ -123,7 +128,7 @@ if __name__ == "__main__":
             "discrete": Discrete(5),
         }
         OBSERVATION_SPACES_TO_TEST = {
-            "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+            "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
             "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
         }
     unittest.main(verbosity=2)
diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py
index 736eb7ec2..ac18025a9 100755
--- a/python/ray/rllib/train.py
+++ b/python/ray/rllib/train.py
@@ -11,7 +11,6 @@ import ray
 from ray.tune.config_parser import make_parser, resources_to_json
 from ray.tune.tune import _make_scheduler, run_experiments
 
-
 EXAMPLE_USAGE = """
 Training example via RLlib CLI:
     rllib train --run DQN --env CartPole-v0
@@ -35,29 +34,41 @@ def create_parser(parser_creator=None):
 
     # See also the base parser definition in ray/tune/config_parser.py
     parser.add_argument(
-        "--redis-address", default=None, type=str,
+        "--redis-address",
+        default=None,
+        type=str,
         help="The Redis address of the cluster.")
     parser.add_argument(
-        "--ray-num-cpus", default=None, type=int,
+        "--ray-num-cpus",
+        default=None,
+        type=int,
         help="--num-cpus to pass to Ray."
-             " This only has an affect in local mode.")
+        " This only has an effect in local mode.")
     parser.add_argument(
-        "--ray-num-gpus", default=None, type=int,
+        "--ray-num-gpus",
+        default=None,
+        type=int,
         help="--num-gpus to pass to Ray."
-             " This only has an affect in local mode.")
+        " This only has an effect in local mode.")
     parser.add_argument(
-        "--experiment-name", default="default", type=str,
+        "--experiment-name",
+        default="default",
+        type=str,
         help="Name of the subdirectory under `local_dir` to put results in.")
     parser.add_argument(
         "--env", default=None, type=str, help="The gym environment to use.")
     parser.add_argument(
-        "--queue-trials", action='store_true',
+        "--queue-trials",
+        action='store_true',
         help=(
             "Whether to queue trials when the cluster does not currently have "
             "enough resources to launch one. This should be set to True when "
             "running on an autoscaling cluster to enable automatic scale-up."))
     parser.add_argument(
-        "-f", "--config-file", default=None, type=str,
+        "-f",
+        "--config-file",
+        default=None,
+        type=str,
         help="If specified, use config options from this file. Note that this "
         "overrides any trial-specific options set via flags above.")
     return parser
@@ -93,9 +104,11 @@ def run(args, parser):
 
     ray.init(
         redis_address=args.redis_address,
-        num_cpus=args.ray_num_cpus, num_gpus=args.ray_num_gpus)
+        num_cpus=args.ray_num_cpus,
+        num_gpus=args.ray_num_gpus)
     run_experiments(
-        experiments, scheduler=_make_scheduler(args),
+        experiments,
+        scheduler=_make_scheduler(args),
         queue_trials=args.queue_trials)
 
 
diff --git a/python/ray/rllib/tuned_examples/generate_regression_tests.py b/python/ray/rllib/tuned_examples/generate_regression_tests.py
index 12cbe2d61..3196bd4d0 100755
--- a/python/ray/rllib/tuned_examples/generate_regression_tests.py
+++ b/python/ray/rllib/tuned_examples/generate_regression_tests.py
@@ -6,10 +6,8 @@ import re
 import os
 import os.path as osp
 
-
 CONFIG_DIR = osp.join(osp.dirname(osp.abspath(__file__)), "regression_tests")
 
-
 TEMPLATE = """
 class Test{name}(Regression):
     _file = "{filename}"
diff --git a/python/ray/rllib/tuned_examples/regression_tests/regression_test.py b/python/ray/rllib/tuned_examples/regression_tests/regression_test.py
index 58433da57..a4624f372 100644
--- a/python/ray/rllib/tuned_examples/regression_tests/regression_test.py
+++ b/python/ray/rllib/tuned_examples/regression_tests/regression_test.py
@@ -15,7 +15,6 @@ import yaml
 import ray
 from ray import tune
 
-
 CONFIG_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
diff --git a/python/ray/rllib/tuned_examples/run_regression_tests.py b/python/ray/rllib/tuned_examples/run_regression_tests.py
index 3bb7d5224..65ba1a310 100755
--- a/python/ray/rllib/tuned_examples/run_regression_tests.py
+++ b/python/ray/rllib/tuned_examples/run_regression_tests.py
@@ -8,7 +8,6 @@ import yaml
 import ray
 from ray.tune import run_experiments
 
-
 if __name__ == '__main__':
     experiments = {}
 
@@ -29,5 +28,4 @@ if __name__ == '__main__':
             num_failures += 1
 
     if num_failures:
-        raise Exception(
-            "{} trials did not converge".format(num_failures))
+        raise Exception("{} trials did not converge".format(num_failures))
diff --git a/python/ray/rllib/utils/compression.py b/python/ray/rllib/utils/compression.py
index dee8d875d..ddef7a6ab 100644
--- a/python/ray/rllib/utils/compression.py
+++ b/python/ray/rllib/utils/compression.py
@@ -11,10 +11,9 @@ try:
     import lz4.frame
     LZ4_ENABLED = True
 except ImportError:
-    print(
-        "WARNING: lz4 not available, disabling sample compression. "
-        "This will significantly impact RLlib performance. "
-        "To install lz4, run `pip install lz4`.")
+    print("WARNING: lz4 not available, disabling sample compression. "
+          "This will significantly impact RLlib performance. "
+          "To install lz4, run `pip install lz4`.")
     LZ4_ENABLED = False
 
 
diff --git a/python/ray/rllib/utils/filter.py b/python/ray/rllib/utils/filter.py
index 6e60b4e5f..b2a361948 100644
--- a/python/ray/rllib/utils/filter.py
+++ b/python/ray/rllib/utils/filter.py
@@ -59,7 +59,6 @@ class NoFilter(Filter):
 
 # http://www.johndcook.com/blog/standard_deviation/
 class RunningStat(object):
-
     def __init__(self, shape=None):
         self._n = 0
         self._M = np.zeros(shape)
@@ -227,8 +226,8 @@ class MeanStdFilter(Filter):
 
     def __repr__(self):
         return 'MeanStdFilter({}, {}, {}, {}, {}, {})'.format(
-            self.shape, self.demean, self.destd,
-            self.clip, self.rs, self.buffer)
+            self.shape, self.demean, self.destd, self.clip, self.rs,
+            self.buffer)
 
 
 class ConcurrentMeanStdFilter(MeanStdFilter):
@@ -242,6 +241,7 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
             def wrapper(*args, **kwargs):
                 with self._lock:
                     return func(*args, **kwargs)
+
             return wrapper
 
         self.__getattribute__ = lock_wrap(self.__getattribute__)
@@ -260,8 +260,8 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
 
     def __repr__(self):
         return 'ConcurrentMeanStdFilter({}, {}, {}, {}, {}, {})'.format(
-            self.shape, self.demean, self.destd,
-            self.clip, self.rs, self.buffer)
+            self.shape, self.demean, self.destd, self.clip, self.rs,
+            self.buffer)
 
 
 def get_filter(filter_config, shape):
@@ -273,5 +273,4 @@ def get_filter(filter_config, shape):
     elif filter_config == "NoFilter":
         return NoFilter()
     else:
-        raise Exception("Unknown observation_filter: " +
-                        str(filter_config))
+        raise Exception("Unknown observation_filter: " + str(filter_config))
diff --git a/python/ray/rllib/utils/policy_server.py b/python/ray/rllib/utils/policy_server.py
index 554d74974..7a5a05093 100644
--- a/python/ray/rllib/utils/policy_server.py
+++ b/python/ray/rllib/utils/policy_server.py
@@ -75,14 +75,14 @@ def _make_handler(serving_env):
                 response["action"] = serving_env.get_action(
                     args["episode_id"], args["observation"])
             elif command == PolicyClient.LOG_ACTION:
-                serving_env.log_action(
-                    args["episode_id"], args["observation"], args["action"])
+                serving_env.log_action(args["episode_id"], args["observation"],
+                                       args["action"])
             elif command == PolicyClient.LOG_RETURNS:
-                serving_env.log_returns(
-                    args["episode_id"], args["reward"], args["info"])
+                serving_env.log_returns(args["episode_id"], args["reward"],
+                                        args["info"])
             elif command == PolicyClient.END_EPISODE:
-                serving_env.end_episode(
-                    args["episode_id"], args["observation"])
+                serving_env.end_episode(args["episode_id"],
+                                        args["observation"])
             else:
                 raise Exception("Unknown command: {}".format(command))
             return response
diff --git a/python/ray/rllib/utils/reshaper.py b/python/ray/rllib/utils/reshaper.py
index c0687b488..e9c165212 100644
--- a/python/ray/rllib/utils/reshaper.py
+++ b/python/ray/rllib/utils/reshaper.py
@@ -7,6 +7,7 @@ class Reshaper(object):
     This class keeps track of where in the flattened observation space
     we should be slicing and what the new shapes should be
     """
+
     def __init__(self, env_space):
         self.shapes = []
         self.slice_positions = []
@@ -24,8 +25,8 @@ class Reshaper(object):
                 if len(self.slice_positions) == 0:
                     self.slice_positions.append(np.product(arr_shape))
                 else:
-                    self.slice_positions.append(np.product(arr_shape) +
-                                                self.slice_positions[-1])
+                    self.slice_positions.append(
+                        np.product(arr_shape) + self.slice_positions[-1])
         else:
             self.shapes.append(np.asarray(env_space.shape))
             self.slice_positions.append(np.product(env_space.shape))
@@ -38,11 +39,11 @@ class Reshaper(object):
     def split_tensor(self, tensor, axis=-1):
         # FIXME (ev) This won't work for mixed action distributions like
         # one agent Gaussian one agent discrete
-        slice_rescale = int(tensor.shape.as_list()[axis] /
-                            int(np.sum(self.get_slice_lengths())))
-        return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
-                        axis=axis)
+        slice_rescale = int(tensor.shape.as_list()[axis] / int(
+            np.sum(self.get_slice_lengths())))
+        return tf.split(
+            tensor, slice_rescale * self.get_slice_lengths(), axis=axis)
 
     def split_number(self, number):
         slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
-        return slice_rescale*self.get_slice_lengths()
+        return slice_rescale * self.get_slice_lengths()
diff --git a/python/ray/rllib/utils/schedules.py b/python/ray/rllib/utils/schedules.py
index d9ceb2f76..41518e6b9 100644
--- a/python/ray/rllib/utils/schedules.py
+++ b/python/ray/rllib/utils/schedules.py
@@ -39,10 +39,10 @@ def linear_interpolation(l, r, alpha):
 
 
 class PiecewiseSchedule(object):
-    def __init__(
-            self, endpoints, interpolation=linear_interpolation,
-            outside_value=None):
-
+    def __init__(self,
+                 endpoints,
+                 interpolation=linear_interpolation,
+                 outside_value=None):
         """Piecewise schedule.
 
         endpoints: [(int, int)]
diff --git a/python/ray/rllib/utils/tf_run_builder.py b/python/ray/rllib/utils/tf_run_builder.py
index 6512fc85c..030642ae5 100644
--- a/python/ray/rllib/utils/tf_run_builder.py
+++ b/python/ray/rllib/utils/tf_run_builder.py
@@ -64,18 +64,19 @@ def run_timeline(sess, ops, debug_name, feed_dict={}, timeline_dir=None):
         run_metadata = tf.RunMetadata()
         start = time.time()
         fetches = sess.run(
-            ops, options=run_options, run_metadata=run_metadata,
+            ops,
+            options=run_options,
+            run_metadata=run_metadata,
             feed_dict=feed_dict)
         trace = timeline.Timeline(step_stats=run_metadata.step_stats)
         global _count
         outf = os.path.join(
-            timeline_dir,
-            "timeline-{}-{}-{}.json".format(debug_name, os.getpid(), _count))
+            timeline_dir, "timeline-{}-{}-{}.json".format(
+                debug_name, os.getpid(), _count))
         _count += 1
         trace_file = open(outf, "w")
-        print(
-            "Wrote tf timeline ({} s) to {}".format(
-                time.time() - start, os.path.abspath(outf)))
+        print("Wrote tf timeline ({} s) to {}".format(time.time() - start,
+                                                      os.path.abspath(outf)))
         trace_file.write(trace.generate_chrome_trace_format())
     else:
         fetches = sess.run(ops, feed_dict=feed_dict)
diff --git a/python/ray/rllib/utils/window_stat.py b/python/ray/rllib/utils/window_stat.py
index ed1d99c46..21c93069a 100644
--- a/python/ray/rllib/utils/window_stat.py
+++ b/python/ray/rllib/utils/window_stat.py
@@ -22,8 +22,8 @@ class WindowStat(object):
         if not self.count:
             quantiles = []
         else:
-            quantiles = np.percentile(
-                self.items[:self.count], [0, 10, 50, 90, 100]).tolist()
+            quantiles = np.percentile(self.items[:self.count],
+                                      [0, 10, 50, 90, 100]).tolist()
         return {
             self.name + "_count": int(self.count),
             self.name + "_mean": float(np.mean(self.items[:self.count])),