diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst
index 37ea011a0..4f8a4c66a 100644
--- a/doc/source/rllib-env.rst
+++ b/doc/source/rllib-env.rst
@@ -107,6 +107,10 @@ RLlib will auto-vectorize Gym envs for batch evaluation if the ``num_envs_per_wo
 Multi-Agent
 -----------
 
+.. note::
+
+   Learn more about multi-agent reinforcement learning in RLlib by reading the `blog post <https://rise.cs.berkeley.edu/blog/scaling-multi-agent-rl-with-rllib/>`__.
+
 A multi-agent environment is one which has multiple acting entities per step, e.g., in a traffic simulation, there may be multiple "car" and "traffic light" agents in the environment. The model for multi-agent in RLlib as follows: (1) as a user you define the number of policies available up front, and (2) a function that maps agent ids to policy ids. This is summarized by the below figure:
 
 .. image:: multi-agent.svg
diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst
index 4b6630090..dc350d272 100644
--- a/doc/source/rllib-training.rst
+++ b/doc/source/rllib-training.rst
@@ -228,7 +228,7 @@ Ray actors provide high levels of performance, so in more complex cases they can
 Callbacks and Custom Metrics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-You can provide callback functions to be called at points during policy evaluation. These functions have access to an info dict containing state for the current `episode <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/episode.py>`__. Custom state can be stored for the `episode <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/episode.py>`__ in the ``info["episode"].user_data`` dict, and custom scalar metrics reported by saving values to the ``info["episode"].custom_metrics`` dict. These custom metrics will be averaged and reported as part of training results. The following example (full code `here <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/custom_metrics_and_callbacks.py>`__) logs a custom metric from the environment:
+You can provide callback functions to be called at points during policy evaluation. These functions have access to an info dict containing state for the current `episode <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/episode.py>`__. Custom state can be stored for the `episode <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/episode.py>`__ in the ``info["episode"].user_data`` dict, and custom scalar metrics reported by saving values to the ``info["episode"].custom_metrics`` dict. These custom metrics will be aggregated and reported as part of training results. The following example (full code `here <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/custom_metrics_and_callbacks.py>`__) logs a custom metric from the environment:
 
 .. code-block:: python
 
@@ -245,10 +245,10 @@ You can provide callback functions to be called at points during policy evaluati
 
     def on_episode_end(info):
         episode = info["episode"]
-        mean_pole_angle = np.mean(episode.user_data["pole_angles"])
+        pole_angle = np.mean(episode.user_data["pole_angles"])
         print("episode {} ended with length {} and pole angles {}".format(
-            episode.episode_id, episode.length, mean_pole_angle))
-        episode.custom_metrics["mean_pole_angle"] = mean_pole_angle
+            episode.episode_id, episode.length, pole_angle))
+        episode.custom_metrics["pole_angle"] = pole_angle
 
     def on_train_result(info):
         print("agent.train() result: {} -> {} episodes".format(
diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
index c883ef250..af43b7d17 100644
--- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py
+++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -253,6 +253,10 @@ class QLoss(object):
             self.td_error = tf.nn.softmax_cross_entropy_with_logits(
                 labels=m, logits=q_logits_t_selected)
             self.loss = tf.reduce_mean(self.td_error * importance_weights)
+            self.stats = {
+                # TODO: better Q stats for dist dqn
+                "mean_td_error": tf.reduce_mean(self.td_error),
+            }
         else:
             q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
 
@@ -264,6 +268,12 @@ class QLoss(object):
                 q_t_selected - tf.stop_gradient(q_t_selected_target))
             self.loss = tf.reduce_mean(
                 importance_weights * _huber_loss(self.td_error))
+            self.stats = {
+                "mean_q": tf.reduce_mean(q_t_selected),
+                "min_q": tf.reduce_min(q_t_selected),
+                "max_q": tf.reduce_max(q_t_selected),
+                "mean_td_error": tf.reduce_mean(self.td_error),
+            }
 
 
 class DQNPolicyGraph(TFPolicyGraph):
@@ -430,6 +440,7 @@ class DQNPolicyGraph(TFPolicyGraph):
     def extra_compute_grad_fetches(self):
         return {
             "td_error": self.loss.td_error,
+            "stats": self.loss.stats,
         }
 
     def postprocess_trajectory(self,
diff --git a/python/ray/rllib/env/async_vector_env.py b/python/ray/rllib/env/async_vector_env.py
index aff373802..edbf7a233 100644
--- a/python/ray/rllib/env/async_vector_env.py
+++ b/python/ray/rllib/env/async_vector_env.py
@@ -291,6 +291,13 @@ class _MultiAgentEnvToAsync(AsyncVectorEnv):
             assert isinstance(rewards, dict), "Not a multi-agent reward"
             assert isinstance(dones, dict), "Not a multi-agent return"
             assert isinstance(infos, dict), "Not a multi-agent info"
+            if set(obs.keys()) != set(rewards.keys()):
+                raise ValueError(
+                    "Key set for obs and rewards must be the same: "
+                    "{} vs {}".format(obs.keys(), rewards.keys()))
+            if set(obs.keys()) != set(infos.keys()):
+                raise ValueError("Key set for obs and infos must be the same: "
+                                 "{} vs {}".format(obs.keys(), infos.keys()))
             if dones["__all__"]:
                 self.dones.add(env_id)
             self.env_states[env_id].observe(obs, rewards, dones, infos)
diff --git a/python/ray/rllib/env/multi_agent_env.py b/python/ray/rllib/env/multi_agent_env.py
index 42f7cee8c..2e569230a 100644
--- a/python/ray/rllib/env/multi_agent_env.py
+++ b/python/ray/rllib/env/multi_agent_env.py
@@ -56,7 +56,7 @@ class MultiAgentEnv(object):
             rewards (dict): Reward values for each ready agent. If the
                 episode is just started, the value will be None.
             dones (dict): Done values for each ready agent. The special key
-                "__all__" is used to indicate env termination.
+                "__all__" (required) is used to indicate env termination.
             infos (dict): Info values for each ready agent.
         """
         raise NotImplementedError
diff --git a/python/ray/rllib/evaluation/episode.py b/python/ray/rllib/evaluation/episode.py
index 24fa431f9..119777451 100644
--- a/python/ray/rllib/evaluation/episode.py
+++ b/python/ray/rllib/evaluation/episode.py
@@ -60,6 +60,7 @@ class MultiAgentEpisode(object):
         self._agent_to_policy = {}
         self._agent_to_rnn_state = {}
         self._agent_to_last_obs = {}
+        self._agent_to_last_info = {}
         self._agent_to_last_action = {}
         self._agent_to_last_pi_info = {}
         self._agent_to_prev_action = {}
@@ -81,6 +82,11 @@ class MultiAgentEpisode(object):
 
         return self._agent_to_last_obs.get(agent_id)
 
+    def last_info_for(self, agent_id=_DUMMY_AGENT_ID):
+        """Returns the last info for the specified agent."""
+
+        return self._agent_to_last_info.get(agent_id)
+
     def last_action_for(self, agent_id=_DUMMY_AGENT_ID):
         """Returns the last action for the specified agent, or zeros."""
 
@@ -137,6 +143,9 @@ class MultiAgentEpisode(object):
     def _set_last_observation(self, agent_id, obs):
         self._agent_to_last_obs[agent_id] = obs
 
+    def _set_last_info(self, agent_id, info):
+        self._agent_to_last_info[agent_id] = info
+
     def _set_last_action(self, agent_id, action):
         self._agent_to_last_action[agent_id] = action
 
diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py
index fadf2a5a2..92c357d11 100644
--- a/python/ray/rllib/evaluation/metrics.py
+++ b/python/ray/rllib/evaluation/metrics.py
@@ -80,8 +80,16 @@ def summarize_episodes(episodes, new_episodes, num_dropped):
     for policy_id, rewards in policy_rewards.copy().items():
         policy_rewards[policy_id] = np.mean(rewards)
 
-    for k, v_list in custom_metrics.items():
-        custom_metrics[k] = np.mean(v_list)
+    for k, v_list in custom_metrics.copy().items():
+        custom_metrics[k + "_mean"] = np.mean(v_list)
+        filt = [v for v in v_list if not np.isnan(v)]
+        if filt:
+            custom_metrics[k + "_min"] = np.min(filt)
+            custom_metrics[k + "_max"] = np.max(filt)
+        else:
+            custom_metrics[k + "_min"] = float("nan")
+            custom_metrics[k + "_max"] = float("nan")
+        del custom_metrics[k]
 
     return dict(
         episode_reward_max=max_reward,
diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py
index 33d5ee219..360139839 100644
--- a/python/ray/rllib/evaluation/policy_evaluator.py
+++ b/python/ray/rllib/evaluation/policy_evaluator.py
@@ -8,7 +8,6 @@ import pickle
 import tensorflow as tf
 
 import ray
-from ray.rllib.models import ModelCatalog
 from ray.rllib.env.async_vector_env import AsyncVectorEnv
 from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari
 from ray.rllib.env.env_context import EnvContext
@@ -19,6 +18,8 @@ from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \
 from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler
 from ray.rllib.evaluation.policy_graph import PolicyGraph
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
+from ray.rllib.models import ModelCatalog
+from ray.rllib.models.preprocessors import NoPreprocessor
 from ray.rllib.utils import merge_dicts
 from ray.rllib.utils.compression import pack
 from ray.rllib.utils.filter import get_filter
@@ -191,23 +192,21 @@ class PolicyEvaluator(EvaluatorInterface):
         self.sample_batch_size = batch_steps * num_envs
         self.batch_mode = batch_mode
         self.compress_observations = compress_observations
+        self.preprocessing_enabled = True
 
         self.env = env_creator(env_context)
         if isinstance(self.env, MultiAgentEnv) or \
                 isinstance(self.env, AsyncVectorEnv):
 
-            if model_config.get("custom_preprocessor"):
-                raise ValueError(
-                    "Custom preprocessors are not supported for env types "
-                    "MultiAgentEnv and AsyncVectorEnv. Please preprocess "
-                    "observations in your env directly.")
-
             def wrap(env):
                 return env  # we can't auto-wrap these env types
         elif is_atari(self.env) and \
                 not model_config.get("custom_preprocessor") and \
                 preprocessor_pref == "deepmind":
 
+            # Deepmind wrappers already handle all preprocessing
+            self.preprocessing_enabled = False
+
             if clip_rewards is None:
                 clip_rewards = True
 
@@ -222,8 +221,6 @@ class PolicyEvaluator(EvaluatorInterface):
         else:
 
             def wrap(env):
-                env = ModelCatalog.get_preprocessor_as_wrapper(
-                    env, model_config)
                 if monitor_path:
                     env = _monitor(env, monitor_path)
                 return env
@@ -246,11 +243,11 @@ class PolicyEvaluator(EvaluatorInterface):
                         config=tf.ConfigProto(
                             gpu_options=tf.GPUOptions(allow_growth=True)))
                 with self.tf_sess.as_default():
-                    self.policy_map = self._build_policy_map(
-                        policy_dict, policy_config)
+                    self.policy_map, self.preprocessors = \
+                        self._build_policy_map(policy_dict, policy_config)
         else:
-            self.policy_map = self._build_policy_map(policy_dict,
-                                                     policy_config)
+            self.policy_map, self.preprocessors = self._build_policy_map(
+                policy_dict, policy_config)
 
         self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
         if self.multiagent:
@@ -286,6 +283,7 @@ class PolicyEvaluator(EvaluatorInterface):
                 self.async_env,
                 self.policy_map,
                 policy_mapping_fn,
+                self.preprocessors,
                 self.filters,
                 clip_rewards,
                 unroll_length,
@@ -300,6 +298,7 @@ class PolicyEvaluator(EvaluatorInterface):
                 self.async_env,
                 self.policy_map,
                 policy_mapping_fn,
+                self.preprocessors,
                 self.filters,
                 clip_rewards,
                 unroll_length,
@@ -314,24 +313,26 @@ class PolicyEvaluator(EvaluatorInterface):
 
     def _build_policy_map(self, policy_dict, policy_config):
         policy_map = {}
+        preprocessors = {}
         for name, (cls, obs_space, act_space,
                    conf) in sorted(policy_dict.items()):
             merged_conf = merge_dicts(policy_config, conf)
+            if self.preprocessing_enabled:
+                preprocessor = ModelCatalog.get_preprocessor_for_space(
+                    obs_space, merged_conf.get("model"))
+                preprocessors[name] = preprocessor
+                obs_space = preprocessor.observation_space
+            else:
+                preprocessors[name] = NoPreprocessor(obs_space)
+            if isinstance(obs_space, gym.spaces.Dict) or \
+                    isinstance(obs_space, gym.spaces.Tuple):
+                raise ValueError(
+                    "Found raw Tuple|Dict space as input to policy graph. "
+                    "Please preprocess these observations with a "
+                    "Tuple|DictFlatteningPreprocessor.")
             with tf.variable_scope(name):
-                if isinstance(obs_space, gym.spaces.Dict):
-                    raise ValueError(
-                        "Found raw Dict space as input to policy graph. "
-                        "Please preprocess your environment observations "
-                        "with DictFlatteningPreprocessor and set the "
-                        "obs space to `preprocessor.observation_space`.")
-                elif isinstance(obs_space, gym.spaces.Tuple):
-                    raise ValueError(
-                        "Found raw Tuple space as input to policy graph. "
-                        "Please preprocess your environment observations "
-                        "with TupleFlatteningPreprocessor and set the "
-                        "obs space to `preprocessor.observation_space`.")
                 policy_map[name] = cls(obs_space, act_space, merged_conf)
-        return policy_map
+        return policy_map, preprocessors
 
     def sample(self):
         """Evaluate the current policies and return a batch of experiences.
@@ -554,6 +555,11 @@ def _validate_and_canonicalize(policy_graph, env):
     elif not issubclass(policy_graph, PolicyGraph):
         raise ValueError("policy_graph must be a rllib.PolicyGraph class")
     else:
+        if (isinstance(env, MultiAgentEnv)
+                and not hasattr(env, "observation_space")):
+            raise ValueError(
+                "MultiAgentEnv must have observation_space defined if run "
+                "in a single-agent configuration.")
         return {
             DEFAULT_POLICY_ID: (policy_graph, env.observation_space,
                                 env.action_space, {})
diff --git a/python/ray/rllib/evaluation/sample_batch.py b/python/ray/rllib/evaluation/sample_batch.py
index caec1bf43..83f2f5ca1 100644
--- a/python/ray/rllib/evaluation/sample_batch.py
+++ b/python/ray/rllib/evaluation/sample_batch.py
@@ -79,6 +79,11 @@ class MultiAgentSampleBatchBuilder(object):
         self.agent_to_policy = {}
         self.count = 0  # increment this manually
 
+    def total(self):
+        """Returns summed number of steps across all agent buffers."""
+
+        return sum(p.count for p in self.policy_builders.values())
+
     def has_pending_data(self):
         """Returns whether there is pending unprocessed data."""
 
diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py
index 0bda18bc0..cc9e32295 100644
--- a/python/ray/rllib/evaluation/sampler.py
+++ b/python/ray/rllib/evaluation/sampler.py
@@ -19,6 +19,7 @@ from ray.rllib.models.action_dist import TupleActions
 from ray.rllib.utils.tf_run_builder import TFRunBuilder
 
 logger = logging.getLogger(__name__)
+_large_batch_warned = False
 
 RolloutMetrics = namedtuple(
     "RolloutMetrics",
@@ -42,6 +43,7 @@ class SyncSampler(object):
                  env,
                  policies,
                  policy_mapping_fn,
+                 preprocessors,
                  obs_filters,
                  clip_rewards,
                  unroll_length,
@@ -55,13 +57,14 @@ class SyncSampler(object):
         self.horizon = horizon
         self.policies = policies
         self.policy_mapping_fn = policy_mapping_fn
-        self._obs_filters = obs_filters
+        self.preprocessors = preprocessors
+        self.obs_filters = obs_filters
         self.extra_batches = queue.Queue()
         self.rollout_provider = _env_runner(
             self.async_vector_env, self.extra_batches.put, self.policies,
             self.policy_mapping_fn, self.unroll_length, self.horizon,
-            self._obs_filters, clip_rewards, clip_actions, pack, callbacks,
-            tf_sess)
+            self.preprocessors, self.obs_filters, clip_rewards, clip_actions,
+            pack, callbacks, tf_sess)
         self.metrics_queue = queue.Queue()
 
     def get_data(self):
@@ -101,6 +104,7 @@ class AsyncSampler(threading.Thread):
                  env,
                  policies,
                  policy_mapping_fn,
+                 preprocessors,
                  obs_filters,
                  clip_rewards,
                  unroll_length,
@@ -121,7 +125,8 @@ class AsyncSampler(threading.Thread):
         self.horizon = horizon
         self.policies = policies
         self.policy_mapping_fn = policy_mapping_fn
-        self._obs_filters = obs_filters
+        self.preprocessors = preprocessors
+        self.obs_filters = obs_filters
         self.clip_rewards = clip_rewards
         self.daemon = True
         self.pack = pack
@@ -140,8 +145,8 @@ class AsyncSampler(threading.Thread):
         rollout_provider = _env_runner(
             self.async_vector_env, self.extra_batches.put, self.policies,
             self.policy_mapping_fn, self.unroll_length, self.horizon,
-            self._obs_filters, self.clip_rewards, self.clip_actions, self.pack,
-            self.callbacks, self.tf_sess)
+            self.preprocessors, self.obs_filters, self.clip_rewards,
+            self.clip_actions, self.pack, self.callbacks, self.tf_sess)
         while True:
             # The timeout variable exists because apparently, if one worker
             # dies, the other workers won't die with it, unless the timeout is
@@ -200,6 +205,7 @@ def _env_runner(async_vector_env,
                 policy_mapping_fn,
                 unroll_length,
                 horizon,
+                preprocessors,
                 obs_filters,
                 clip_rewards,
                 clip_actions,
@@ -218,6 +224,8 @@ def _env_runner(async_vector_env,
         unroll_length (int): Number of episode steps before `SampleBatch` is
             yielded. Set to infinity to yield complete episodes.
         horizon (int): Horizon of the episode.
+        preprocessors (dict): Map of policy id to preprocessor for the
+            observations prior to filtering.
         obs_filters (dict): Map of policy id to filter used to process
             observations for the policy.
         clip_rewards (bool): Whether to clip rewards before postprocessing.
@@ -273,7 +281,7 @@ def _env_runner(async_vector_env,
         active_envs, to_eval, outputs = _process_observations(
             async_vector_env, policies, batch_builder_pool, active_episodes,
             unfiltered_obs, rewards, dones, infos, off_policy_actions, horizon,
-            obs_filters, unroll_length, pack, callbacks)
+            preprocessors, obs_filters, unroll_length, pack, callbacks)
         for o in outputs:
             yield o
 
@@ -293,8 +301,8 @@ def _env_runner(async_vector_env,
 
 def _process_observations(async_vector_env, policies, batch_builder_pool,
                           active_episodes, unfiltered_obs, rewards, dones,
-                          infos, off_policy_actions, horizon, obs_filters,
-                          unroll_length, pack, callbacks):
+                          infos, off_policy_actions, horizon, preprocessors,
+                          obs_filters, unroll_length, pack, callbacks):
     """Record new data from the environment and prepare for policy evaluation.
 
     Returns:
@@ -316,6 +324,21 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
             episode.batch_builder.count += 1
             episode._add_agent_rewards(rewards[env_id])
 
+        global _large_batch_warned
+        if (not _large_batch_warned and
+                episode.batch_builder.total() > max(1000, unroll_length * 10)):
+            _large_batch_warned = True
+            logger.warn(
+                "More than {} observations for {} env steps ".format(
+                    episode.batch_builder.total(),
+                    episode.batch_builder.count) + "are buffered in "
+                "the sampler. If this is not intentional, check that the "
+                "the `horizon` config is set correctly, or consider setting "
+                "`batch_mode` to 'truncate_episodes'. Note that in "
+                "multi-agent environments, `sample_batch_size` sets the "
+                "batch size based on environment steps, not the steps of "
+                "individual agents.")
+
         # Check episode termination conditions
         if dones[env_id]["__all__"] or episode.length >= horizon:
             all_done = True
@@ -336,7 +359,9 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
         # For each agent in the environment
         for agent_id, raw_obs in agent_obs.items():
             policy_id = episode.policy_for(agent_id)
-            filtered_obs = _get_or_raise(obs_filters, policy_id)(raw_obs)
+            prep_obs = _get_or_raise(preprocessors,
+                                     policy_id).transform(raw_obs)
+            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
             agent_done = bool(all_done or dones[env_id].get(agent_id))
             if not agent_done:
                 to_eval[policy_id].append(
@@ -347,6 +372,7 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
 
             last_observation = episode.last_observation_for(agent_id)
             episode._set_last_observation(agent_id, filtered_obs)
+            episode._set_last_info(agent_id, infos[env_id][agent_id])
 
             # Record transition info if applicable
             if last_observation is not None and \
@@ -406,8 +432,10 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
                 for agent_id, raw_obs in resetted_obs.items():
                     policy_id = episode.policy_for(agent_id)
                     policy = _get_or_raise(policies, policy_id)
+                    prep_obs = _get_or_raise(preprocessors,
+                                             policy_id).transform(raw_obs)
                     filtered_obs = _get_or_raise(obs_filters,
-                                                 policy_id)(raw_obs)
+                                                 policy_id)(prep_obs)
                     episode._set_last_observation(agent_id, filtered_obs)
                     to_eval[policy_id].append(
                         PolicyEvalData(
diff --git a/python/ray/rllib/examples/custom_metrics_and_callbacks.py b/python/ray/rllib/examples/custom_metrics_and_callbacks.py
index c92ae8783..af1d25f16 100644
--- a/python/ray/rllib/examples/custom_metrics_and_callbacks.py
+++ b/python/ray/rllib/examples/custom_metrics_and_callbacks.py
@@ -25,10 +25,10 @@ def on_episode_step(info):
 
 def on_episode_end(info):
     episode = info["episode"]
-    mean_pole_angle = np.mean(episode.user_data["pole_angles"])
+    pole_angle = np.mean(episode.user_data["pole_angles"])
     print("episode {} ended with length {} and pole angles {}".format(
-        episode.episode_id, episode.length, mean_pole_angle))
-    episode.custom_metrics["mean_pole_angle"] = mean_pole_angle
+        episode.episode_id, episode.length, pole_angle))
+    episode.custom_metrics["pole_angle"] = pole_angle
 
 
 def on_sample_end(info):
@@ -70,6 +70,8 @@ if __name__ == "__main__":
     # verify custom metrics for integration tests
     custom_metrics = trials[0].last_result["custom_metrics"]
     print(custom_metrics)
-    assert "mean_pole_angle" in custom_metrics
-    assert type(custom_metrics["mean_pole_angle"]) is float
+    assert "pole_angle_mean" in custom_metrics
+    assert "pole_angle_min" in custom_metrics
+    assert "pole_angle_max" in custom_metrics
+    assert type(custom_metrics["pole_angle_mean"]) is float
     assert "callback_ok" in trials[0].last_result
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index f9e8af282..822af4a37 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -271,15 +271,26 @@ class ModelCatalog(object):
 
     @staticmethod
     def get_preprocessor(env, options=None):
-        """Returns a suitable processor for the given environment.
+        """Returns a suitable preprocessor for the given env.
+
+        This is a wrapper for get_preprocessor_for_space().
+        """
+
+        return ModelCatalog.get_preprocessor_for_space(env.observation_space,
+                                                       options)
+
+    @staticmethod
+    def get_preprocessor_for_space(observation_space, options=None):
+        """Returns a suitable preprocessor for the given observation space.
 
         Args:
-            env (gym.Env|VectorEnv|ExternalEnv): The environment to wrap.
+            observation_space (Space): The input observation space.
             options (dict): Options to pass to the preprocessor.
 
         Returns:
-            preprocessor (Preprocessor): Preprocessor for the env observations.
+            preprocessor (Preprocessor): Preprocessor for the observations.
         """
+
         options = options or MODEL_DEFAULTS
         for k in options.keys():
             if k not in MODEL_DEFAULTS:
@@ -290,13 +301,13 @@ class ModelCatalog(object):
             preprocessor = options["custom_preprocessor"]
             logger.info("Using custom preprocessor {}".format(preprocessor))
             prep = _global_registry.get(RLLIB_PREPROCESSOR, preprocessor)(
-                env.observation_space, options)
+                observation_space, options)
         else:
-            cls = get_preprocessor(env.observation_space)
-            prep = cls(env.observation_space, options)
+            cls = get_preprocessor(observation_space)
+            prep = cls(observation_space, options)
 
         logger.debug("Created preprocessor {}: {} -> {}".format(
-            prep, env.observation_space, prep.shape))
+            prep, observation_space, prep.shape))
         return prep
 
     @staticmethod
diff --git a/python/ray/rllib/models/preprocessors.py b/python/ray/rllib/models/preprocessors.py
index a4af708b7..66835e864 100644
--- a/python/ray/rllib/models/preprocessors.py
+++ b/python/ray/rllib/models/preprocessors.py
@@ -109,6 +109,9 @@ class OneHotPreprocessor(Preprocessor):
 
     def transform(self, observation):
         arr = np.zeros(self._obs_space.n)
+        if not self._obs_space.contains(observation):
+            raise ValueError("Observation outside expected value range",
+                             self._obs_space, observation)
         arr[observation] = 1
         return arr
 
diff --git a/python/ray/rllib/optimizers/async_replay_optimizer.py b/python/ray/rllib/optimizers/async_replay_optimizer.py
index 3cd5a16ad..1f2167408 100644
--- a/python/ray/rllib/optimizers/async_replay_optimizer.py
+++ b/python/ray/rllib/optimizers/async_replay_optimizer.py
@@ -135,6 +135,7 @@ class LearnerThread(threading.Thread):
         self.daemon = True
         self.weights_updated = False
         self.stopped = False
+        self.stats = {}
 
     def run(self):
         while not self.stopped:
@@ -151,6 +152,8 @@ class LearnerThread(threading.Thread):
                     prio_dict[pid] = (
                         replay.policy_batches[pid]["batch_indexes"],
                         info["td_error"])
+                    if "stats" in info:
+                        self.stats[pid] = info["stats"]
             # send `replay` back also so that it gets released by the original
             # thread: https://github.com/ray-project/ray/issues/2610
             self.outqueue.put((ra, replay, prio_dict, replay.count))
@@ -331,4 +334,6 @@ class AsyncReplayOptimizer(PolicyOptimizer):
         }
         if self.debug:
             stats.update(debug_stats)
+        if self.learner.stats:
+            stats["learner"] = self.learner.stats
         return dict(PolicyOptimizer.stats(self), **stats)
diff --git a/python/ray/rllib/optimizers/sync_replay_optimizer.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py
index 73df00601..b15ab3390 100644
--- a/python/ray/rllib/optimizers/sync_replay_optimizer.py
+++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py
@@ -53,6 +53,7 @@ class SyncReplayOptimizer(PolicyOptimizer):
         self.replay_timer = TimerStat()
         self.grad_timer = TimerStat()
         self.throughput = RunningStat()
+        self.learner_stats = {}
 
         # Set up replay buffer
         if prioritized_replay:
@@ -111,6 +112,8 @@ class SyncReplayOptimizer(PolicyOptimizer):
         with self.grad_timer:
             info_dict = self.local_evaluator.compute_apply(samples)
             for policy_id, info in info_dict.items():
+                if "stats" in info:
+                    self.learner_stats[policy_id] = info["stats"]
                 replay_buffer = self.replay_buffers[policy_id]
                 if isinstance(replay_buffer, PrioritizedReplayBuffer):
                     td_error = info["td_error"]
@@ -160,4 +163,5 @@ class SyncReplayOptimizer(PolicyOptimizer):
                 "opt_peak_throughput": round(self.grad_timer.mean_throughput,
                                              3),
                 "opt_samples": round(self.grad_timer.mean_units_processed, 3),
+                "learner": self.learner_stats,
             })
diff --git a/python/ray/rllib/test/test_env_with_subprocess.py b/python/ray/rllib/test/test_env_with_subprocess.py
index 70ccb46cc..fc940cdea 100644
--- a/python/ray/rllib/test/test_env_with_subprocess.py
+++ b/python/ray/rllib/test/test_env_with_subprocess.py
@@ -38,10 +38,10 @@ class EnvWithSubprocess(gym.Env):
         atexit.register(lambda: self.subproc.kill())
 
     def reset(self):
-        return [0]
+        return 0
 
     def step(self, action):
-        return [0], 0, True, {}
+        return 0, 0, True, {}
 
 
 def leaked_processes():
diff --git a/python/ray/rllib/test/test_multi_agent_env.py b/python/ray/rllib/test/test_multi_agent_env.py
index 5712390c0..1fdfa5d74 100644
--- a/python/ray/rllib/test/test_multi_agent_env.py
+++ b/python/ray/rllib/test/test_multi_agent_env.py
@@ -22,6 +22,12 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.tune.registry import register_env
 
 
+def one_hot(i, n):
+    out = [0.0] * n
+    out[i] = 1.0
+    return out
+
+
 class BasicMultiAgent(MultiAgentEnv):
     """Env of N independent agents, each of which exits after 25 steps."""
 
@@ -64,7 +70,7 @@ class RoundRobinMultiAgent(MultiAgentEnv):
         self.last_info = {}
         self.i = 0
         self.num = num
-        self.observation_space = gym.spaces.Discrete(2)
+        self.observation_space = gym.spaces.Discrete(10)
         self.action_space = gym.spaces.Discrete(2)
 
     def reset(self):
@@ -290,7 +296,7 @@ class TestMultiAgentEnv(unittest.TestCase):
 
     def testMultiAgentSampleRoundRobin(self):
         act_space = gym.spaces.Discrete(2)
-        obs_space = gym.spaces.Discrete(2)
+        obs_space = gym.spaces.Discrete(10)
         ev = PolicyEvaluator(
             env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
             policy_graph={
@@ -303,10 +309,20 @@ class TestMultiAgentEnv(unittest.TestCase):
         # since we round robin introduce agents into the env, some of the env
         # steps don't count as proper transitions
         self.assertEqual(batch.policy_batches["p0"].count, 42)
-        self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10],
-                         [0, 1, 2, 3, 4] * 2)
-        self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10],
-                         [1, 2, 3, 4, 5] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10], [
+            one_hot(0, 10),
+            one_hot(1, 10),
+            one_hot(2, 10),
+            one_hot(3, 10),
+            one_hot(4, 10),
+        ] * 2)
+        self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10], [
+            one_hot(1, 10),
+            one_hot(2, 10),
+            one_hot(3, 10),
+            one_hot(4, 10),
+            one_hot(5, 10),
+        ] * 2)
         self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
                          [100, 100, 100, 100, 0] * 2)
         self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
diff --git a/python/ray/rllib/test/test_nested_spaces.py b/python/ray/rllib/test/test_nested_spaces.py
index 490e6af15..95744b7e2 100644
--- a/python/ray/rllib/test/test_nested_spaces.py
+++ b/python/ray/rllib/test/test_nested_spaces.py
@@ -13,6 +13,8 @@ import unittest
 
 import ray
 from ray.rllib.agents.pg import PGAgent
+from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
+from ray.rllib.env import MultiAgentEnv
 from ray.rllib.env.async_vector_env import AsyncVectorEnv
 from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.models import ModelCatalog
@@ -88,6 +90,34 @@ class NestedTupleEnv(gym.Env):
         return TUPLE_SAMPLES[self.steps], 1, self.steps >= 5, {}
 
 
+class NestedMultiAgentEnv(MultiAgentEnv):
+    def __init__(self):
+        self.steps = 0
+
+    def reset(self):
+        return {
+            "dict_agent": DICT_SAMPLES[0],
+            "tuple_agent": TUPLE_SAMPLES[0],
+        }
+
+    def step(self, actions):
+        self.steps += 1
+        obs = {
+            "dict_agent": DICT_SAMPLES[self.steps],
+            "tuple_agent": TUPLE_SAMPLES[self.steps],
+        }
+        rew = {
+            "dict_agent": 0,
+            "tuple_agent": 0,
+        }
+        dones = {"__all__": self.steps >= 5}
+        infos = {
+            "dict_agent": {},
+            "tuple_agent": {},
+        }
+        return obs, rew, dones, infos
+
+
 class InvalidModel(Model):
     def _build_layers_v2(self, input_dict, num_outputs, options):
         return "not", "valid"
@@ -107,7 +137,8 @@ class DictSpyModel(Model):
             # redis to communicate back to our suite
             ray.experimental.internal_kv._internal_kv_put(
                 "d_spy_in_{}".format(DictSpyModel.capture_index),
-                pickle.dumps((pos, front_cam, task)))
+                pickle.dumps((pos, front_cam, task)),
+                overwrite=True)
             DictSpyModel.capture_index += 1
             return 0
 
@@ -135,7 +166,8 @@ class TupleSpyModel(Model):
             # redis to communicate back to our suite
             ray.experimental.internal_kv._internal_kv_put(
                 "t_spy_in_{}".format(TupleSpyModel.capture_index),
-                pickle.dumps((pos, cam, task)))
+                pickle.dumps((pos, cam, task)),
+                overwrite=True)
             TupleSpyModel.capture_index += 1
             return 0
 
@@ -242,10 +274,8 @@ class NestedSpacesTest(unittest.TestCase):
         self.doTestNestedDict(lambda _: SimpleServing(NestedDictEnv()))
 
     def testNestedDictAsync(self):
-        self.assertRaisesRegexp(
-            ValueError, "Found raw Dict space.*",
-            lambda: self.doTestNestedDict(
-                lambda _: AsyncVectorEnv.wrap_async(NestedDictEnv())))
+        self.doTestNestedDict(
+            lambda _: AsyncVectorEnv.wrap_async(NestedDictEnv()))
 
     def testNestedTupleGym(self):
         self.doTestNestedTuple(lambda _: NestedTupleEnv())
@@ -258,10 +288,57 @@ class NestedSpacesTest(unittest.TestCase):
         self.doTestNestedTuple(lambda _: SimpleServing(NestedTupleEnv()))
 
     def testNestedTupleAsync(self):
-        self.assertRaisesRegexp(
-            ValueError, "Found raw Tuple space.*",
-            lambda: self.doTestNestedTuple(
-                lambda _: AsyncVectorEnv.wrap_async(NestedTupleEnv())))
+        self.doTestNestedTuple(
+            lambda _: AsyncVectorEnv.wrap_async(NestedTupleEnv()))
+
+    def testMultiAgentComplexSpaces(self):
+        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
+        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
+        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
+        act_space = spaces.Discrete(2)
+        pg = PGAgent(
+            env="nested_ma",
+            config={
+                "num_workers": 0,
+                "sample_batch_size": 5,
+                "multiagent": {
+                    "policy_graphs": {
+                        "tuple_policy": (
+                            PGPolicyGraph, TUPLE_SPACE, act_space,
+                            {"model": {"custom_model": "tuple_spy"}}),
+                        "dict_policy": (
+                            PGPolicyGraph, DICT_SPACE, act_space,
+                            {"model": {"custom_model": "dict_spy"}}),
+                    },
+                    "policy_mapping_fn": lambda a: {
+                        "tuple_agent": "tuple_policy",
+                        "dict_agent": "dict_policy"}[a],
+                },
+            })
+        pg.train()
+
+        for i in range(4):
+            seen = pickle.loads(
+                ray.experimental.internal_kv._internal_kv_get(
+                    "d_spy_in_{}".format(i)))
+            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
+            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
+            task_i = one_hot(
+                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
+            self.assertEqual(seen[0][0].tolist(), pos_i)
+            self.assertEqual(seen[1][0].tolist(), cam_i)
+            self.assertEqual(seen[2][0].tolist(), task_i)
+
+        for i in range(4):
+            seen = pickle.loads(
+                ray.experimental.internal_kv._internal_kv_get(
+                    "t_spy_in_{}".format(i)))
+            pos_i = TUPLE_SAMPLES[i][0].tolist()
+            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
+            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
+            self.assertEqual(seen[0][0].tolist(), pos_i)
+            self.assertEqual(seen[1][0].tolist(), cam_i)
+            self.assertEqual(seen[2][0].tolist(), task_i)
 
 
 if __name__ == "__main__":
diff --git a/python/ray/rllib/utils/filter.py b/python/ray/rllib/utils/filter.py
index fbdb39ae1..9a1f37dbd 100644
--- a/python/ray/rllib/utils/filter.py
+++ b/python/ray/rllib/utils/filter.py
@@ -2,9 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import logging
 import numpy as np
 import threading
 
+logger = logging.getLogger(__name__)
+
 
 class Filter(object):
     """Processes input, possibly statefully."""
@@ -39,7 +42,10 @@ class NoFilter(Filter):
         pass
 
     def __call__(self, x, update=True):
-        return np.asarray(x)
+        try:
+            return np.asarray(x)
+        except Exception:
+            raise ValueError("Failed to convert to array", x)
 
     def apply_changes(self, other, *args, **kwargs):
         pass