diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst
index c95def692..a4a659b8e 100644
--- a/doc/source/rllib-env.rst
+++ b/doc/source/rllib-env.rst
@@ -136,6 +136,48 @@ Here is a simple `example training script <https://github.com/ray-project/ray/bl
 
 To scale to hundreds of agents, MultiAgentEnv batches policy evaluations across multiple agents internally. It can also be auto-vectorized by setting ``num_envs_per_worker > 1``.
 
+Variable-Sharing Between Policies
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+RLlib will create each policy's model in a separate ``tf.variable_scope``. However, variables can still be shared between policies by explicitly entering a globally shared variable scope with ``tf.VariableScope(reuse=tf.AUTO_REUSE)``:
+
+.. code-block:: python
+
+        with tf.variable_scope(
+                tf.VariableScope(tf.AUTO_REUSE, "name_of_global_shared_scope"),
+                reuse=tf.AUTO_REUSE,
+                auxiliary_name_scope=False):
+            <create the shared layers here>
+
+There is a full example of this in the `example training script <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/multiagent_cartpole.py>`__.
+
+Implementing a Centralized Critic
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Implementing a shared critic between multiple policies requires the definition of custom policy graphs. It can be done as follows:
+
+1. Querying the critic: this can be done in the ``postprocess_trajectory`` method of a custom policy graph, which has full access to the policies and observations of concurrent agents via the ``other_agent_batches`` and ``episode`` arguments. This assumes you use variable sharing to access the critic network from multiple policies. The critic predictions can then be added to the postprocessed trajectory. Here's an example:
+
+.. code-block:: python
+
+    def postprocess_trajectory(self, sample_batch, other_agent_batches, episode):
+        agents = ["agent_1", "agent_2", "agent_3"]  # simple example of 3 agents
+        global_obs_batch = np.stack(
+            [other_agent_batches[agent_id][1]["obs"] for agent_id in agents],
+            axis=1)
+        # add the global obs and global critic value
+        sample_batch["global_obs"] = global_obs_batch
+        sample_batch["global_vf"] = self.sess.run(
+            self.global_critic_network, feed_dict={"obs": global_obs_batch})
+        # metrics like "global reward" can be retrieved from the info return of the environment
+        sample_batch["global_rewards"] = [
+            info["global_reward"] for info in sample_batch["infos"]]
+        return sample_batch
+
+2. Updating the critic: the centralized critic loss can be added to the loss of some arbitrary policy graph. The policy graph that is chosen must add the inputs for the critic loss to its postprocessed trajectory batches.
+
+For an example of defining loss inputs, see the `PGPolicyGraph example <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/pg/pg_policy_graph.py>`__.
+
 Agent-Driven
 ------------
 
diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst
index a2a9233ef..79df3ab5c 100644
--- a/doc/source/rllib-models.rst
+++ b/doc/source/rllib-models.rst
@@ -30,7 +30,7 @@ The following is a list of the built-in model hyperparameters:
 Custom Models
 -------------
 
-Custom models should subclass the common RLlib `model class <https://github.com/ray-project/ray/blob/master/python/ray/rllib/models/model.py>`__ and override the ``_build_layers_v2`` method. This method takes in a dict of tensor inputs (the observation ``obs``, ``prev_action``, and ``prev_reward``), and returns a feature layer and float vector of the specified output size. The model can then be registered and used in place of a built-in model:
+Custom models should subclass the common RLlib `model class <https://github.com/ray-project/ray/blob/master/python/ray/rllib/models/model.py>`__ and override the ``_build_layers_v2`` method. This method takes in a dict of tensor inputs (the observation ``obs``, ``prev_actions``, and ``prev_rewards``), and returns a float vector of the specified output size together with the last feature layer (in that order). You can also override the ``value_function`` method to implement a custom value branch. The model can then be registered and used in place of a built-in model:
 
 .. code-block:: python
 
@@ -74,6 +74,18 @@ Custom models should subclass the common RLlib `model class <https://github.com/
             ...
             return layerN, layerN_minus_1
 
+        def value_function(self):
+            """Builds the value function output.
+
+            This method can be overridden to customize the implementation of the
+            value function (e.g., not sharing hidden layers).
+
+            Returns:
+                Tensor of size [BATCH_SIZE] for the value function.
+            """
+            return tf.reshape(
+                linear(self.last_layer, 1, "value", normc_initializer(1.0)), [-1])
+
     ModelCatalog.register_custom_model("my_model", MyModelClass)
 
     ray.init()
diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
index c51cfb49e..e04d418c6 100644
--- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
+++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
@@ -13,7 +13,6 @@ from ray.rllib.utils.explained_variance import explained_variance
 from ray.rllib.evaluation.postprocessing import compute_advantages
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \
     LearningRateSchedule
-from ray.rllib.models.misc import linear, normc_initializer
 from ray.rllib.models.catalog import ModelCatalog
 
 
@@ -57,9 +56,7 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
             "prev_rewards": prev_rewards
         }, observation_space, logit_dim, self.config["model"])
         action_dist = dist_class(self.model.outputs)
-        self.vf = tf.reshape(
-            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
-            [-1])
+        self.vf = self.model.value_function()
         self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           tf.get_variable_scope().name)
 
@@ -144,7 +141,10 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
     def get_initial_state(self):
         return self.model.state_init
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         completed = sample_batch["dones"][-1]
         if completed:
             last_r = 0.0
diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
index dcdada591..3eecc3bb1 100644
--- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
+++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
@@ -62,7 +62,10 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
     def optimizer(self):
         return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         completed = sample_batch["dones"][-1]
         if completed:
             last_r = 0.0
diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
index ad2cee363..e647a4153 100644
--- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
+++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -332,7 +332,10 @@ class DDPGPolicyGraph(TFPolicyGraph):
             "td_error": self.loss.td_error,
         }
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         return _postprocess_dqn(self, sample_batch)
 
     def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
index 601bd8fc1..f279d4ecc 100644
--- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py
+++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -414,7 +414,10 @@ class DQNPolicyGraph(TFPolicyGraph):
             "td_error": self.loss.td_error,
         }
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         return _postprocess_dqn(self, sample_batch)
 
     def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
index 9621bd1a8..b75d79e57 100644
--- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py
+++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
@@ -14,7 +14,6 @@ from ray.rllib.agents.impala import vtrace
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \
     LearningRateSchedule
 from ray.rllib.models.catalog import ModelCatalog
-from ray.rllib.models.misc import linear, normc_initializer
 from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.utils.explained_variance import explained_variance
 
@@ -140,9 +139,7 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
             state_in=existing_state_in,
             seq_lens=existing_seq_lens)
         action_dist = dist_class(self.model.outputs)
-        values = tf.reshape(
-            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
-            [-1])
+        values = self.model.value_function()
         self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           tf.get_variable_scope().name)
 
@@ -251,7 +248,10 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
     def extra_compute_grad_fetches(self):
         return self.stats_fetches
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         del sample_batch.data["new_obs"]  # not used, so save some bandwidth
         return sample_batch
 
diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py
index 4bfd23316..0395f026e 100644
--- a/python/ray/rllib/agents/pg/pg_policy_graph.py
+++ b/python/ray/rllib/agents/pg/pg_policy_graph.py
@@ -11,21 +11,27 @@ from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
 
 
 class PGLoss(object):
+    """Simple policy gradient loss."""
+
     def __init__(self, action_dist, actions, advantages):
         self.loss = -tf.reduce_mean(action_dist.logp(actions) * advantages)
 
 
 class PGPolicyGraph(TFPolicyGraph):
+    """Simple policy gradient example of defining a policy graph."""
+
     def __init__(self, obs_space, action_space, config):
         config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
         self.config = config
 
-        # Setup policy
+        # Setup placeholders
         obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
         dist_class, self.logit_dim = ModelCatalog.get_action_dist(
             action_space, self.config["model"])
         prev_actions = ModelCatalog.get_action_placeholder(action_space)
         prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
+
+        # Create the model network and action outputs
         self.model = ModelCatalog.get_model({
             "obs": obs,
             "prev_actions": prev_actions,
@@ -38,17 +44,19 @@ class PGPolicyGraph(TFPolicyGraph):
         advantages = tf.placeholder(tf.float32, [None], name="adv")
         loss = PGLoss(action_dist, actions, advantages).loss
 
-        # Initialize TFPolicyGraph
-        sess = tf.get_default_session()
-        # Mapping from sample batch keys to placeholders
+        # Mapping from sample batch keys to placeholders. These keys will be
+        # read from postprocessed sample batches and fed into the specified
+        # placeholders during loss computation.
         loss_in = [
             ("obs", obs),
             ("actions", actions),
             ("prev_actions", prev_actions),
             ("prev_rewards", prev_rewards),
-            ("advantages", advantages),
+            ("advantages", advantages),  # added during postprocessing
         ]
 
+        # Initialize TFPolicyGraph
+        sess = tf.get_default_session()
         TFPolicyGraph.__init__(
             self,
             obs_space,
@@ -66,7 +74,11 @@ class PGPolicyGraph(TFPolicyGraph):
             max_seq_len=config["model"]["max_seq_len"])
         sess.run(tf.global_variables_initializer())
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
+        # This adds the "advantages" column to the sample batch
         return compute_advantages(
             sample_batch, 0.0, self.config["gamma"], use_gae=False)
 
diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py
index 262a014a4..1a78625b2 100644
--- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py
+++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py
@@ -9,7 +9,6 @@ from ray.rllib.evaluation.postprocessing import compute_advantages
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \
     LearningRateSchedule
 from ray.rllib.models.catalog import ModelCatalog
-from ray.rllib.models.misc import linear, normc_initializer
 from ray.rllib.utils.explained_variance import explained_variance
 
 
@@ -180,9 +179,7 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
         self.sampler = curr_action_dist.sample()
         if self.config["use_gae"]:
             if self.config["vf_share_layers"]:
-                self.value_function = tf.reshape(
-                    linear(self.model.last_layer, 1, "value",
-                           normc_initializer(1.0)), [-1])
+                self.value_function = self.model.value_function()
             else:
                 vf_config = self.config["model"].copy()
                 # Do not split the last layer of the value function into
@@ -286,7 +283,10 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
         vf = self.sess.run(self.value_function, feed_dict)
         return vf[0]
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         completed = sample_batch["dones"][-1]
         if completed:
             last_r = 0.0
diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py
index d032c8da3..4d72315e8 100644
--- a/python/ray/rllib/evaluation/policy_evaluator.py
+++ b/python/ray/rllib/evaluation/policy_evaluator.py
@@ -17,10 +17,11 @@ from ray.rllib.evaluation.interface import EvaluatorInterface
 from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \
     DEFAULT_POLICY_ID
 from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler
-from ray.rllib.utils.compression import pack
-from ray.rllib.utils.filter import get_filter
 from ray.rllib.evaluation.policy_graph import PolicyGraph
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
+from ray.rllib.utils import merge_dicts
+from ray.rllib.utils.compression import pack
+from ray.rllib.utils.filter import get_filter
 from ray.rllib.utils.tf_run_builder import TFRunBuilder
 
 
@@ -299,8 +300,7 @@ class PolicyEvaluator(EvaluatorInterface):
         policy_map = {}
         for name, (cls, obs_space, act_space,
                    conf) in sorted(policy_dict.items()):
-            merged_conf = policy_config.copy()
-            merged_conf.update(conf)
+            merged_conf = merge_dicts(policy_config, conf)
             with tf.variable_scope(name):
                 if isinstance(obs_space, gym.spaces.Dict):
                     raise ValueError(
diff --git a/python/ray/rllib/evaluation/policy_graph.py b/python/ray/rllib/evaluation/policy_graph.py
index b2d154f48..9de59d269 100644
--- a/python/ray/rllib/evaluation/policy_graph.py
+++ b/python/ray/rllib/evaluation/policy_graph.py
@@ -83,7 +83,7 @@ class PolicyGraph(object):
             is_training (bool): whether we are training the policy
             episode (MultiAgentEpisode): this provides access to all of the
                 internal episode state, which may be useful for model-based or
-                multiagent algorithms.
+                multi-agent algorithms.
 
         Returns:
             actions (obj): single action
@@ -96,7 +96,10 @@ class PolicyGraph(object):
         return action, [s[0] for s in state_out], \
             {k: v[0] for k, v in info.items()}
 
-    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               sample_batch,
+                               other_agent_batches=None,
+                               episode=None):
         """Implements algorithm-specific trajectory postprocessing.
 
         This will be called on each trajectory fragment computed during policy
@@ -108,6 +111,9 @@ class PolicyGraph(object):
             other_agent_batches (dict): In a multi-agent env, this contains a
                 mapping of agent ids to (policy_graph, agent_batch) tuples
                 containing the policy graph and experiences of the other agent.
+            episode (MultiAgentEpisode): this provides access to all of the
+                internal episode state, which may be useful for model-based or
+                multi-agent algorithms.
 
         Returns:
             SampleBatch: postprocessed sample batch.
diff --git a/python/ray/rllib/evaluation/sample_batch.py b/python/ray/rllib/evaluation/sample_batch.py
index f8f88a4aa..caec1bf43 100644
--- a/python/ray/rllib/evaluation/sample_batch.py
+++ b/python/ray/rllib/evaluation/sample_batch.py
@@ -99,11 +99,14 @@ class MultiAgentSampleBatchBuilder(object):
         builder = self.agent_builders[agent_id]
         builder.add_values(**values)
 
-    def postprocess_batch_so_far(self):
+    def postprocess_batch_so_far(self, episode):
         """Apply policy postprocessors to any unprocessed rows.
 
         This pushes the postprocessed per-agent batches onto the per-policy
         builders, clearing per-agent state.
+
+        Arguments:
+            episode: current MultiAgentEpisode object or None
         """
 
         # Materialize the batches so far
@@ -128,7 +131,7 @@ class MultiAgentSampleBatchBuilder(object):
                     "Batches sent to postprocessing must only contain steps "
                     "from a single trajectory.", pre_batch)
             post_batches[agent_id] = policy.postprocess_trajectory(
-                pre_batch, other_batches)
+                pre_batch, other_batches, episode)
 
         # Append into policy batches and reset
         for agent_id, post_batch in sorted(post_batches.items()):
@@ -137,14 +140,17 @@ class MultiAgentSampleBatchBuilder(object):
         self.agent_builders.clear()
         self.agent_to_policy.clear()
 
-    def build_and_reset(self):
+    def build_and_reset(self, episode):
         """Returns the accumulated sample batches for each policy.
 
         Any unprocessed rows will be first postprocessed with a policy
         postprocessor. The internal state of this builder will be reset.
+
+        Arguments:
+            episode: current MultiAgentEpisode object or None
         """
 
-        self.postprocess_batch_so_far()
+        self.postprocess_batch_so_far(episode)
         policy_batches = {}
         for policy_id, builder in self.policy_builders.items():
             if builder.count > 0:
diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py
index 85d5386b1..503f52a12 100644
--- a/python/ray/rllib/evaluation/sampler.py
+++ b/python/ray/rllib/evaluation/sampler.py
@@ -317,10 +317,10 @@ def _env_runner(async_vector_env,
             if episode.batch_builder.has_pending_data():
                 if (all_done and not pack) or \
                         episode.batch_builder.count >= unroll_length:
-                    yield episode.batch_builder.build_and_reset()
+                    yield episode.batch_builder.build_and_reset(episode)
                 elif all_done:
                     # Make sure postprocessor stays within one episode
-                    episode.batch_builder.postprocess_batch_so_far()
+                    episode.batch_builder.postprocess_batch_so_far(episode)
 
             if all_done:
                 # Handle episode termination
diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py
index a7b34c2ce..bea53384b 100644
--- a/python/ray/rllib/evaluation/tf_policy_graph.py
+++ b/python/ray/rllib/evaluation/tf_policy_graph.py
@@ -64,7 +64,9 @@ class TFPolicyGraph(PolicyGraph):
             loss_inputs (list): a (name, placeholder) tuple for each loss
                 input argument. Each placeholder name must correspond to a
                 SampleBatch column key returned by postprocess_trajectory(),
-                and has shape [BATCH_SIZE, data...].
+                and has shape [BATCH_SIZE, data...]. These keys will be read
+                from postprocessed sample batches and fed into the specified
+                placeholders during loss computation.
             state_inputs (list): list of RNN state input Tensors.
             state_outputs (list): list of RNN state output Tensors.
             prev_action_input (Tensor): placeholder for previous actions
diff --git a/python/ray/rllib/examples/multiagent_cartpole.py b/python/ray/rllib/examples/multiagent_cartpole.py
index 8faeb184b..fff706639 100644
--- a/python/ray/rllib/examples/multiagent_cartpole.py
+++ b/python/ray/rllib/examples/multiagent_cartpole.py
@@ -16,9 +16,13 @@ import argparse
 import gym
 import random
 
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+
 import ray
 from ray import tune
-from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
+from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
+from ray.rllib.models import Model, ModelCatalog
 from ray.rllib.test.test_multi_agent_env import MultiCartpole
 from ray.tune import run_experiments
 from ray.tune.registry import register_env
@@ -29,26 +33,65 @@ parser.add_argument("--num-agents", type=int, default=4)
 parser.add_argument("--num-policies", type=int, default=2)
 parser.add_argument("--num-iters", type=int, default=20)
 
+
+class CustomModel1(Model):
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        # Example of (optional) weight sharing between two different policies.
+        # Here, we share the variables defined in the 'shared' variable scope
+        # by entering it explicitly with tf.AUTO_REUSE. This creates the
+        # variables for the 'fc1' layer in a global scope called 'shared'
+        # outside of the policy's normal variable scope.
+        with tf.variable_scope(
+                tf.VariableScope(tf.AUTO_REUSE, "shared"),
+                reuse=tf.AUTO_REUSE,
+                auxiliary_name_scope=False):
+            last_layer = slim.fully_connected(
+                input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
+        output = slim.fully_connected(
+            last_layer, num_outputs, activation_fn=None, scope="fc_out")
+        return output, last_layer
+
+
+class CustomModel2(Model):
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        # Weights shared with CustomModel1
+        with tf.variable_scope(
+                tf.VariableScope(tf.AUTO_REUSE, "shared"),
+                reuse=tf.AUTO_REUSE,
+                auxiliary_name_scope=False):
+            last_layer = slim.fully_connected(
+                input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
+        output = slim.fully_connected(
+            last_layer, num_outputs, activation_fn=None, scope="fc_out")
+        return output, last_layer
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     ray.init()
 
     # Simple environment with `num_agents` independent cartpole entities
     register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
+    ModelCatalog.register_custom_model("model1", CustomModel1)
+    ModelCatalog.register_custom_model("model2", CustomModel2)
     single_env = gym.make("CartPole-v0")
     obs_space = single_env.observation_space
     act_space = single_env.action_space
 
-    def gen_policy():
+    # Each policy can have a different configuration (including custom model)
+    def gen_policy(i):
         config = {
+            "model": {
+                "custom_model": ["model1", "model2"][i % 2],
+            },
             "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
             "n_step": random.choice([1, 2, 3, 4, 5]),
         }
-        return (PGPolicyGraph, obs_space, act_space, config)
+        return (PPOPolicyGraph, obs_space, act_space, config)
 
-    # Setup PG with an ensemble of `num_policies` different policy graphs
+    # Setup PPO with an ensemble of `num_policies` different policy graphs
     policy_graphs = {
-        "policy_{}".format(i): gen_policy()
+        "policy_{}".format(i): gen_policy(i)
         for i in range(args.num_policies)
     }
     policy_ids = list(policy_graphs.keys())
diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py
index fb8ea6687..091b17c8f 100644
--- a/python/ray/rllib/models/model.py
+++ b/python/ray/rllib/models/model.py
@@ -7,6 +7,7 @@ from collections import OrderedDict
 import gym
 import tensorflow as tf
 
+from ray.rllib.models.misc import linear, normc_initializer
 from ray.rllib.models.preprocessors import get_preprocessor
 
 
@@ -131,6 +132,18 @@ class Model(object):
         """
         raise NotImplementedError
 
+    def value_function(self):
+        """Builds the value function output.
+
+        This method can be overridden to customize the implementation of the
+        value function (e.g., not sharing hidden layers).
+
+        Returns:
+            Tensor of size [BATCH_SIZE] for the value function.
+        """
+        return tf.reshape(
+            linear(self.last_layer, 1, "value", normc_initializer(1.0)), [-1])
+
 
 def _restore_original_dimensions(input_dict, obs_space):
     if hasattr(obs_space, "original_space"):
diff --git a/python/ray/rllib/test/test_multi_agent_env.py b/python/ray/rllib/test/test_multi_agent_env.py
index 31f3103f5..8a4d26768 100644
--- a/python/ray/rllib/test/test_multi_agent_env.py
+++ b/python/ray/rllib/test/test_multi_agent_env.py
@@ -359,7 +359,7 @@ class TestMultiAgentEnv(unittest.TestCase):
                         dones=t == 4,
                         infos={},
                         new_obs=obs_batch[0])
-                batch = builder.build_and_reset()
+                batch = builder.build_and_reset(episode=None)
                 episodes[0].add_extra_batch(batch)
 
                 # Just return zeros for actions
diff --git a/python/ray/rllib/test/test_policy_evaluator.py b/python/ray/rllib/test/test_policy_evaluator.py
index b90613603..0e0d48c21 100644
--- a/python/ray/rllib/test/test_policy_evaluator.py
+++ b/python/ray/rllib/test/test_policy_evaluator.py
@@ -28,7 +28,11 @@ class MockPolicyGraph(PolicyGraph):
                         episodes=None):
         return [0] * len(obs_batch), [], {}
 
-    def postprocess_trajectory(self, batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               batch,
+                               other_agent_batches=None,
+                               episode=None):
+        assert episode is not None
         return compute_advantages(batch, 100.0, 0.9, use_gae=False)
 
 
@@ -42,7 +46,11 @@ class BadPolicyGraph(PolicyGraph):
                         episodes=None):
         raise Exception("intentional error")
 
-    def postprocess_trajectory(self, batch, other_agent_batches=None):
+    def postprocess_trajectory(self,
+                               batch,
+                               other_agent_batches=None,
+                               episode=None):
+        assert episode is not None
         return compute_advantages(batch, 100.0, 0.9, use_gae=False)