[rllib] Native support for Dict and Tuple spaces; fix Tuple action spaces; add prev a, r to LSTM (#3051)

2026-06-27 20:06:31 +08:00 · 2018-10-20 15:21:22 -07:00
parent 9a2b5333ef
commit 59901a88a0
39 changed files with 921 additions and 185 deletions
@@ -49,8 +49,13 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
            tf.float32, [None] + list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
-        self.model = ModelCatalog.get_model(self.observations, logit_dim,
-                                            self.config["model"])
+        prev_actions = ModelCatalog.get_action_placeholder(action_space)
+        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
+        self.model = ModelCatalog.get_model({
+            "obs": self.observations,
+            "prev_actions": prev_actions,
+            "prev_rewards": prev_rewards
+        }, observation_space, logit_dim, self.config["model"])
        action_dist = dist_class(self.model.outputs)
        self.vf = tf.reshape(
            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
@@ -78,6 +83,8 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
        loss_in = [
            ("obs", self.observations),
            ("actions", actions),
+            ("prev_actions", prev_actions),
+            ("prev_rewards", prev_rewards),
            ("advantages", advantages),
            ("value_targets", self.v_target),
        ]
@@ -94,6 +101,8 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
+            prev_action_input=prev_actions,
+            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"])

@@ -51,7 +51,8 @@ COMMON_CONFIG = {
    "env_config": {},
    # Environment name can also be passed via config
    "env": None,
-    # Arguments to pass to model
+    # Arguments to pass to model. See models/catalog.py for a full list of the
+    # available model options.
    "model": MODEL_DEFAULTS,
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    "optimizer": {},
@@ -196,6 +197,8 @@ class Agent(Trainable):

        # Agents allow env ids to be passed directly to the constructor.
        self._env_id = env or config.get("env")
+        if not self._env_id:
+            raise ValueError("Must specify env (str) when creating agent")

        # Create a default logger creator if no logger_creator is specified
        if logger_creator is None:
@@ -77,8 +77,8 @@ class Worker(object):

        self.sess = utils.make_session(single_threaded=True)
        self.policy = policies.GenericPolicy(
-            self.sess, self.env.action_space, self.preprocessor,
-            config["observation_filter"], config["model"])
+            self.sess, self.env.action_space, self.env.observation_space,
+            self.preprocessor, config["observation_filter"], config["model"])

    def rollout(self, timestep_limit, add_noise=False):
        rollout_rewards, rollout_length = policies.rollout(
@@ -153,7 +153,7 @@ class ARSAgent(Agent):

        self.sess = utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(
-            self.sess, env.action_space, preprocessor,
+            self.sess, env.action_space, env.observation_space, preprocessor,
            self.config["observation_filter"], self.config["model"])
        self.optimizer = optimizers.SGD(self.policy,
                                        self.config["sgd_stepsize"])
@@ -10,6 +10,7 @@ import numpy as np
 import tensorflow as tf

 import ray
+from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
 from ray.rllib.utils.filter import get_filter
 from ray.rllib.models import ModelCatalog

@@ -56,6 +57,7 @@ class GenericPolicy(object):
    def __init__(self,
                 sess,
                 action_space,
+                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
@@ -73,7 +75,9 @@ class GenericPolicy(object):
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

-        model = ModelCatalog.get_model(self.inputs, dist_dim, model_config)
+        model = ModelCatalog.get_model({
+            "obs": self.inputs
+        }, obs_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

@@ -90,6 +94,7 @@ class GenericPolicy(object):
        observation = self.observation_filter(observation[None], update=update)
        action = self.sess.run(
            self.sampler, feed_dict={self.inputs: observation})
+        action = _unbatch_tuple_actions(action)
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action
@@ -149,7 +149,8 @@ class DDPGPolicyGraph(TFPolicyGraph):

        # Actor: P (policy) network
        with tf.variable_scope(P_SCOPE) as scope:
-            p_values = self._build_p_network(self.cur_observations)
+            p_values = self._build_p_network(self.cur_observations,
+                                             observation_space)
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
@@ -178,11 +179,11 @@ class DDPGPolicyGraph(TFPolicyGraph):

        # p network evaluation
        with tf.variable_scope(P_SCOPE, reuse=True) as scope:
-            self.p_t = self._build_p_network(self.obs_t)
+            self.p_t = self._build_p_network(self.obs_t, observation_space)

        # target p network evaluation
        with tf.variable_scope(P_TARGET_SCOPE) as scope:
-            p_tp1 = self._build_p_network(self.obs_tp1)
+            p_tp1 = self._build_p_network(self.obs_tp1, observation_space)
            target_p_func_vars = _scope_vars(scope.name)

        # Action outputs
@@ -197,14 +198,16 @@ class DDPGPolicyGraph(TFPolicyGraph):

        # q network evaluation
        with tf.variable_scope(Q_SCOPE) as scope:
-            q_t = self._build_q_network(self.obs_t, self.act_t)
+            q_t = self._build_q_network(self.obs_t, observation_space,
+                                        self.act_t)
            self.q_func_vars = _scope_vars(scope.name)
        with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_tp0 = self._build_q_network(self.obs_t, output_actions)
+            q_tp0 = self._build_q_network(self.obs_t, observation_space,
+                                          output_actions)

        # target q network evalution
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
-            q_tp1 = self._build_q_network(self.obs_tp1,
+            q_tp1 = self._build_q_network(self.obs_tp1, observation_space,
                                          output_actions_estimated)
            target_q_func_vars = _scope_vars(scope.name)

@@ -267,16 +270,20 @@ class DDPGPolicyGraph(TFPolicyGraph):
        # Hard initial update
        self.update_target(tau=1.0)

-    def _build_q_network(self, obs, actions):
+    def _build_q_network(self, obs, obs_space, actions):
        return QNetwork(
-            ModelCatalog.get_model(obs, 1, self.config["model"]), actions,
+            ModelCatalog.get_model({
+                "obs": obs
+            }, obs_space, 1, self.config["model"]), actions,
            self.config["critic_hiddens"],
            self.config["critic_hidden_activation"]).value

-    def _build_p_network(self, obs):
+    def _build_p_network(self, obs, obs_space):
        return PNetwork(
-            ModelCatalog.get_model(obs, 1, self.config["model"]),
-            self.dim_actions, self.config["actor_hiddens"],
+            ModelCatalog.get_model({
+                "obs": obs
+            }, obs_space, 1, self.config["model"]), self.dim_actions,
+            self.config["actor_hiddens"],
            self.config["actor_hidden_activation"]).action_scores

    def _build_action_network(self, p_values, stochastic, eps):
@@ -275,7 +275,7 @@ class DQNPolicyGraph(TFPolicyGraph):
        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
            q_values, q_logits, q_dist = self._build_q_network(
-                self.cur_observations)
+                self.cur_observations, observation_space)
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
@@ -294,12 +294,13 @@ class DQNPolicyGraph(TFPolicyGraph):

        # q network evaluation
        with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_t, q_logits_t, q_dist_t = self._build_q_network(self.obs_t)
+            q_t, q_logits_t, q_dist_t = self._build_q_network(
+                self.obs_t, observation_space)

        # target q network evalution
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1, q_logits_tp1, q_dist_tp1 = self._build_q_network(
-                self.obs_tp1)
+                self.obs_tp1, observation_space)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
@@ -313,7 +314,7 @@ class DQNPolicyGraph(TFPolicyGraph):
            with tf.variable_scope(Q_SCOPE, reuse=True):
                q_tp1_using_online_net, q_logits_tp1_using_online_net, \
                    q_dist_tp1_using_online_net = self._build_q_network(
-                        self.obs_tp1)
+                        self.obs_tp1, observation_space)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best_one_hot_selection = tf.one_hot(
                q_tp1_best_using_online_net, self.num_actions)
@@ -362,10 +363,12 @@ class DQNPolicyGraph(TFPolicyGraph):
            loss_inputs=self.loss_inputs)
        self.sess.run(tf.global_variables_initializer())

-    def _build_q_network(self, obs):
+    def _build_q_network(self, obs, space):
        qnet = QNetwork(
-            ModelCatalog.get_model(obs, 1, self.config["model"]),
-            self.num_actions, self.config["dueling"], self.config["hiddens"],
+            ModelCatalog.get_model({
+                "obs": obs
+            }, space, 1, self.config["model"]), self.num_actions,
+            self.config["dueling"], self.config["hiddens"],
            self.config["noisy"], self.config["num_atoms"],
            self.config["v_min"], self.config["v_max"], self.config["sigma0"])
        return qnet.value, qnet.logits, qnet.dist
@@ -81,8 +81,9 @@ class Worker(object):

        self.sess = utils.make_session(single_threaded=True)
        self.policy = policies.GenericPolicy(
-            self.sess, self.env.action_space, self.preprocessor,
-            config["observation_filter"], config["model"], **policy_params)
+            self.sess, self.env.action_space, self.env.observation_space,
+            self.preprocessor, config["observation_filter"], config["model"],
+            **policy_params)

    def rollout(self, timestep_limit, add_noise=True):
        rollout_rewards, rollout_length = policies.rollout(
@@ -161,7 +162,7 @@ class ESAgent(Agent):

        self.sess = utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(
-            self.sess, env.action_space, preprocessor,
+            self.sess, env.action_space, env.observation_space, preprocessor,
            self.config["observation_filter"], self.config["model"],
            **policy_params)
        self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
@@ -10,6 +10,7 @@ import numpy as np
 import tensorflow as tf

 import ray
+from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
 from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.filter import get_filter

@@ -38,8 +39,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):


 class GenericPolicy(object):
-    def __init__(self, sess, action_space, preprocessor, observation_filter,
-                 model_options, action_noise_std):
+    def __init__(self, sess, action_space, obs_space, preprocessor,
+                 observation_filter, model_options, action_noise_std):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
@@ -52,7 +53,9 @@ class GenericPolicy(object):
        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, model_options, dist_type="deterministic")
-        model = ModelCatalog.get_model(self.inputs, dist_dim, model_options)
+        model = ModelCatalog.get_model({
+            "obs": self.inputs
+        }, obs_space, dist_dim, model_options)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

@@ -69,6 +72,7 @@ class GenericPolicy(object):
        observation = self.observation_filter(observation[None], update=update)
        action = self.sess.run(
            self.sampler, feed_dict={self.inputs: observation})
+        action = _unbatch_tuple_actions(action)
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action
@@ -102,9 +102,9 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):

        # Create input placeholders
        if existing_inputs:
-            actions, dones, behaviour_logits, rewards, observations = \
-                existing_inputs[:5]
-            existing_state_in = existing_inputs[5:-1]
+            actions, dones, behaviour_logits, rewards, observations, \
+                prev_actions, prev_rewards = existing_inputs[:7]
+            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            if isinstance(action_space, gym.spaces.Discrete):
@@ -126,8 +126,15 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
        # Setup the policy
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
+        prev_actions = ModelCatalog.get_action_placeholder(action_space)
+        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
        self.model = ModelCatalog.get_model(
-            observations,
+            {
+                "obs": observations,
+                "prev_actions": prev_actions,
+                "prev_rewards": prev_rewards,
+            },
+            observation_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
@@ -187,6 +194,8 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
            ("behaviour_logits", behaviour_logits),
            ("rewards", rewards),
            ("obs", observations),
+            ("prev_actions", prev_actions),
+            ("prev_rewards", prev_rewards),
        ]
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
@@ -201,6 +210,8 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
+            prev_action_input=prev_actions,
+            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"])

@@ -24,8 +24,13 @@ class PGPolicyGraph(TFPolicyGraph):
        obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
-        self.model = ModelCatalog.get_model(obs, self.logit_dim,
-                                            self.config["model"])
+        prev_actions = ModelCatalog.get_action_placeholder(action_space)
+        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
+        self.model = ModelCatalog.get_model({
+            "obs": obs,
+            "prev_actions": prev_actions,
+            "prev_rewards": prev_rewards
+        }, obs_space, self.logit_dim, self.config["model"])
        action_dist = dist_class(self.model.outputs)  # logit for each action

        # Setup policy loss
@@ -35,9 +40,12 @@ class PGPolicyGraph(TFPolicyGraph):

        # Initialize TFPolicyGraph
        sess = tf.get_default_session()
+        # Mapping from sample batch keys to placeholders
        loss_in = [
            ("obs", obs),
            ("actions", actions),
+            ("prev_actions", prev_actions),
+            ("prev_rewards", prev_rewards),
            ("advantages", advantages),
        ]

@@ -52,6 +60,8 @@ class PGPolicyGraph(TFPolicyGraph):
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
+            prev_action_input=prev_actions,
+            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])
        sess.run(tf.global_variables_initializer())
@@ -88,7 +88,7 @@ class PPOAgent(Agent):
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
-                    "train_batch_size": self.config["train_batch_size"]
+                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
@@ -120,8 +120,9 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):

        if existing_inputs:
            obs_ph, value_targets_ph, adv_ph, act_ph, \
-                logits_ph, vf_preds_ph = existing_inputs[:6]
-            existing_state_in = existing_inputs[6:-1]
+                logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
+                existing_inputs[:8]
+            existing_state_in = existing_inputs[8:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(
@@ -137,6 +138,9 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
                tf.float32, name="vf_preds", shape=(None, ))
            value_targets_ph = tf.placeholder(
                tf.float32, name="value_targets", shape=(None, ))
+            prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
+            prev_rewards_ph = tf.placeholder(
+                tf.float32, [None], name="prev_reward")
            existing_state_in = None
            existing_seq_lens = None
        self.observations = obs_ph
@@ -148,9 +152,16 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
            ("actions", act_ph),
            ("logits", logits_ph),
            ("vf_preds", vf_preds_ph),
+            ("prev_actions", prev_actions_ph),
+            ("prev_rewards", prev_rewards_ph),
        ]
        self.model = ModelCatalog.get_model(
-            obs_ph,
+            {
+                "obs": obs_ph,
+                "prev_actions": prev_actions_ph,
+                "prev_rewards": prev_rewards_ph
+            },
+            observation_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
@@ -180,8 +191,11 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
                vf_config["free_log_std"] = False
                vf_config["use_lstm"] = False
                with tf.variable_scope("value_function"):
-                    self.value_function = ModelCatalog.get_model(
-                        obs_ph, 1, vf_config).outputs
+                    self.value_function = ModelCatalog.get_model({
+                        "obs": obs_ph,
+                        "prev_actions": prev_actions_ph,
+                        "prev_rewards": prev_rewards_ph
+                    }, observation_space, 1, vf_config).outputs
                    self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
@@ -223,6 +237,8 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
            loss_inputs=self.loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
+            prev_action_input=prev_actions_ph,
+            prev_reward_input=prev_rewards_ph,
            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])

@@ -22,6 +22,12 @@ class AsyncVectorEnv(object):
        rllib.MultiAgentEnv => rllib.AsyncVectorEnv
        rllib.ServingEnv => rllib.AsyncVectorEnv

+    Attributes:
+        action_space (gym.Space): Action space. This must be defined for
+            single-agent envs. Multi-agent envs can set this to None.
+        observation_space (gym.Space): Observation space. This must be defined
+            for single-agent envs. Multi-agent envs can set this to None.
+
    Examples:
        >>> env = MyAsyncVectorEnv()
        >>> obs, rewards, dones, infos, off_policy_actions = env.poll()
@@ -142,8 +148,14 @@ def _with_dummy_agent_id(env_id_to_values, dummy_id=_DUMMY_AGENT_ID):
 class _ServingEnvToAsync(AsyncVectorEnv):
    """Internal adapter of ServingEnv to AsyncVectorEnv."""

-    def __init__(self, serving_env):
+    def __init__(self, serving_env, preprocessor=None):
        self.serving_env = serving_env
+        self.prep = preprocessor
+        self.action_space = serving_env.action_space
+        if preprocessor:
+            self.observation_space = preprocessor.observation_space
+        else:
+            self.observation_space = serving_env.observation_space
        serving_env.start()

    def poll(self):
@@ -168,7 +180,10 @@ class _ServingEnvToAsync(AsyncVectorEnv):
            if episode.cur_done:
                del self.serving_env._episodes[eid]
            if data:
-                all_obs[eid] = data["obs"]
+                if self.prep:
+                    all_obs[eid] = self.prep.transform(data["obs"])
+                else:
+                    all_obs[eid] = data["obs"]
                all_rewards[eid] = data["reward"]
                all_dones[eid] = data["done"]
                all_infos[eid] = data["info"]
@@ -196,6 +211,8 @@ class _VectorEnvToAsync(AsyncVectorEnv):

    def __init__(self, vector_env):
        self.vector_env = vector_env
+        self.action_space = vector_env.action_space
+        self.observation_space = vector_env.observation_space
        self.num_envs = vector_env.num_envs
        self.new_obs = self.vector_env.vector_reset()
        self.cur_rewards = [None for _ in range(self.num_envs)]
@@ -25,6 +25,10 @@ class ServingEnv(threading.Thread):

    This env is thread-safe, but individual episodes must be executed serially.

+    Attributes:
+        action_space (gym.Space): Action space.
+        observation_space (gym.Space): Observation space.
+
    Examples:
        >>> register_env("my_env", lambda config: YourServingEnv(config))
        >>> agent = DQNAgent(env="my_env")
@@ -57,10 +61,12 @@ class ServingEnv(threading.Thread):
        """Override this to implement the run loop.

        Your loop should continuously:
-            1. Call self.start_episode()
-            2. Call self.get_action() or self.log_action()
-            3. Call self.log_returns()
-            4. Call self.end_episode()
+            1. Call self.start_episode(episode_id)
+            2. Call self.get_action(episode_id, obs)
+                    -or-
+                    self.log_action(episode_id, obs, action)
+            3. Call self.log_returns(episode_id, reward)
+            4. Call self.end_episode(episode_id, obs)
            5. Wait if nothing to do.

        Multiple episodes may be started at the same time.
@@ -69,6 +69,8 @@ class _VectorizedGymEnv(VectorEnv):
        self.num_envs = num_envs
        while len(self.envs) < self.num_envs:
            self.envs.append(self.make_env(len(self.envs)))
+        self.action_space = self.envs[0].action_space
+        self.observation_space = self.envs[0].observation_space

    def vector_reset(self):
        return [e.reset() for e in self.envs]
@@ -54,6 +54,8 @@ class MultiAgentEpisode(object):
        self._agent_to_last_obs = {}
        self._agent_to_last_action = {}
        self._agent_to_last_pi_info = {}
+        self._agent_to_prev_action = {}
+        self._agent_reward_history = defaultdict(list)

    def policy_for(self, agent_id):
        """Returns the policy graph for the specified agent.
@@ -72,19 +74,33 @@ class MultiAgentEpisode(object):
        return self._agent_to_last_obs.get(agent_id)

    def last_action_for(self, agent_id):
-        """Returns the last action for the specified agent."""
+        """Returns the last action for the specified agent, or zeros."""

-        action = self._agent_to_last_action[agent_id]
-        # Concatenate tuple actions
-        if isinstance(action, list):
-            expanded = []
-            for a in action:
-                if len(a.shape) == 1:
-                    expanded.append(np.expand_dims(a, 1))
-                else:
-                    expanded.append(a)
-            action = np.concatenate(expanded, axis=1).flatten()
-        return action
+        if agent_id in self._agent_to_last_action:
+            return _flatten_action(self._agent_to_last_action[agent_id])
+        else:
+            policy = self._policies[self.policy_for(agent_id)]
+            flat = _flatten_action(policy.action_space.sample())
+            return np.zeros_like(flat)
+
+    def prev_action_for(self, agent_id):
+        """Returns the previous action for the specified agent."""
+
+        if agent_id in self._agent_to_prev_action:
+            return _flatten_action(self._agent_to_prev_action[agent_id])
+        else:
+            # We're at t=0, so return all zeros.
+            return np.zeros_like(self.last_action_for(agent_id))
+
+    def prev_reward_for(self, agent_id):
+        """Returns the previous reward for the specified agent."""
+
+        history = self._agent_reward_history[agent_id]
+        if len(history) >= 2:
+            return history[-2]
+        else:
+            # We're at t=0, so there is no previous reward, just return zero.
+            return 0.0

    def rnn_state_for(self, agent_id):
        """Returns the last RNN state for the specified agent."""
@@ -105,6 +121,7 @@ class MultiAgentEpisode(object):
                self.agent_rewards[agent_id,
                                   self.policy_for(agent_id)] += reward
                self.total_reward += reward
+                self._agent_reward_history[agent_id].append(reward)

    def _set_rnn_state(self, agent_id, rnn_state):
        self._agent_to_rnn_state[agent_id] = rnn_state
@@ -117,3 +134,16 @@ class MultiAgentEpisode(object):

    def _set_last_pi_info(self, agent_id, pi_info):
        self._agent_to_last_pi_info[agent_id] = pi_info
+
+
+def _flatten_action(action):
+    # Concatenate tuple actions
+    if isinstance(action, list) or isinstance(action, tuple):
+        expanded = []
+        for a in action:
+            if not hasattr(a, "shape") or len(a.shape) == 0:
+                expanded.append(np.expand_dims(a, 1))
+            else:
+                expanded.append(a)
+        action = np.concatenate(expanded, axis=0).flatten()
+    return action
@@ -11,8 +11,6 @@ from ray.rllib.models import ModelCatalog
 from ray.rllib.env.async_vector_env import AsyncVectorEnv
 from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari
 from ray.rllib.env.env_context import EnvContext
-from ray.rllib.env.serving_env import ServingEnv
-from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.evaluation.interface import EvaluatorInterface
 from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \
@@ -179,11 +177,15 @@ class PolicyEvaluator(EvaluatorInterface):
        self.compress_observations = compress_observations

        self.env = env_creator(env_context)
-        if isinstance(self.env, VectorEnv) or \
-                isinstance(self.env, ServingEnv) or \
-                isinstance(self.env, MultiAgentEnv) or \
+        if isinstance(self.env, MultiAgentEnv) or \
                isinstance(self.env, AsyncVectorEnv):

+            if model_config.get("custom_preprocessor"):
+                raise ValueError(
+                    "Custom preprocessors are not supported for env types "
+                    "MultiAgentEnv and AsyncVectorEnv. Please preprocess "
+                    "observations in your env directly.")
+
            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
@@ -294,6 +296,18 @@ class PolicyEvaluator(EvaluatorInterface):
            merged_conf = policy_config.copy()
            merged_conf.update(conf)
            with tf.variable_scope(name):
+                if isinstance(obs_space, gym.spaces.Dict):
+                    raise ValueError(
+                        "Found raw Dict space as input to policy graph. "
+                        "Please preprocess your environment observations "
+                        "with DictFlatteningPreprocessor and set the "
+                        "obs space to `preprocessor.observation_space`.")
+                elif isinstance(obs_space, gym.spaces.Tuple):
+                    raise ValueError(
+                        "Found raw Tuple space as input to policy graph. "
+                        "Please preprocess your environment observations "
+                        "with TupleFlatteningPreprocessor and set the "
+                        "obs space to `preprocessor.observation_space`.")
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        return policy_map

@@ -40,6 +40,8 @@ class PolicyGraph(object):
    def compute_actions(self,
                        obs_batch,
                        state_batches,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
                        is_training=False,
                        episodes=None):
        """Compute actions for the current policy.
@@ -47,6 +49,8 @@ class PolicyGraph(object):
        Arguments:
            obs_batch (np.ndarray): batch of observations
            state_batches (list): list of RNN state input batches, if any
+            prev_action_batch (np.ndarray): batch of previous action values
+            prev_reward_batch (np.ndarray): batch of previous rewards
            is_training (bool): whether we are training the policy
            episodes (list): MultiAgentEpisode for each obs in obs_batch.
                This provides access to all of the internal episode state,
@@ -65,6 +69,8 @@ class PolicyGraph(object):
    def compute_single_action(self,
                              obs,
                              state,
+                              prev_action_batch=None,
+                              prev_reward_batch=None,
                              is_training=False,
                              episode=None):
        """Unbatched version of compute_actions.
@@ -72,6 +78,8 @@ class PolicyGraph(object):
        Arguments:
            obs (obj): single observation
            state_batches (list): list of RNN state inputs, if any
+            prev_action_batch (np.ndarray): batch of previous action values
+            prev_reward_batch (np.ndarray): batch of previous rewards
            is_training (bool): whether we are training the policy
            episode (MultiAgentEpisode): this provides access to all of the
                internal episode state, which may be useful for model-based or
@@ -3,22 +3,25 @@ from __future__ import division
 from __future__ import print_function

 from collections import defaultdict, namedtuple
+import numpy as np
 import six.moves.queue as queue
 import threading

-from ray.rllib.evaluation.episode import MultiAgentEpisode
+from ray.rllib.evaluation.episode import MultiAgentEpisode, _flatten_action
 from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \
    MultiAgentBatch
 from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
 from ray.rllib.env.async_vector_env import AsyncVectorEnv
 from ray.rllib.env.atari_wrappers import get_wrapper_by_cls, MonitorEnv
+from ray.rllib.models.action_dist import TupleActions
 from ray.rllib.utils.tf_run_builder import TFRunBuilder

 RolloutMetrics = namedtuple(
    "RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"])

-PolicyEvalData = namedtuple("PolicyEvalData",
-                            ["env_id", "agent_id", "obs", "rnn_state"])
+PolicyEvalData = namedtuple(
+    "PolicyEvalData",
+    ["env_id", "agent_id", "obs", "rnn_state", "prev_action", "prev_reward"])


 class SyncSampler(object):
@@ -281,7 +284,9 @@ def _env_runner(async_vector_env,
                if not agent_done:
                    to_eval[policy_id].append(
                        PolicyEvalData(env_id, agent_id, filtered_obs,
-                                       episode.rnn_state_for(agent_id)))
+                                       episode.rnn_state_for(agent_id),
+                                       episode.last_action_for(agent_id),
+                                       rewards[env_id][agent_id] or 0.0))

                last_observation = episode.last_observation_for(agent_id)
                episode._set_last_observation(agent_id, filtered_obs)
@@ -297,6 +302,8 @@ def _env_runner(async_vector_env,
                        obs=last_observation,
                        actions=episode.last_action_for(agent_id),
                        rewards=rewards[env_id][agent_id],
+                        prev_actions=episode.prev_action_for(agent_id),
+                        prev_rewards=episode.prev_reward_for(agent_id),
                        dones=agent_done,
                        infos=infos[env_id][agent_id],
                        new_obs=filtered_obs,
@@ -326,12 +333,17 @@ def _env_runner(async_vector_env,
                    episode = active_episodes[env_id]
                    for agent_id, raw_obs in resetted_obs.items():
                        policy_id = episode.policy_for(agent_id)
+                        policy = _get_or_raise(policies, policy_id)
                        filtered_obs = _get_or_raise(obs_filters,
                                                     policy_id)(raw_obs)
                        episode._set_last_observation(agent_id, filtered_obs)
                        to_eval[policy_id].append(
-                            PolicyEvalData(env_id, agent_id, filtered_obs,
-                                           episode.rnn_state_for(agent_id)))
+                            PolicyEvalData(
+                                env_id, agent_id, filtered_obs,
+                                episode.rnn_state_for(agent_id),
+                                np.zeros_like(
+                                    _flatten_action(
+                                        policy.action_space.sample())), 0.0))

        # Batch eval policy actions if possible
        if tf_sess:
@@ -350,11 +362,15 @@ def _env_runner(async_vector_env,
                pending_fetches[policy_id] = policy.build_compute_actions(
                    builder, [t.obs for t in eval_data],
                    rnn_in,
+                    prev_action_batch=[t.prev_action for t in eval_data],
+                    prev_reward_batch=[t.prev_reward for t in eval_data],
                    is_training=True)
            else:
                eval_results[policy_id] = policy.compute_actions(
                    [t.obs for t in eval_data],
                    rnn_in,
+                    prev_action_batch=[t.prev_action for t in eval_data],
+                    prev_reward_batch=[t.prev_reward for t in eval_data],
                    is_training=True,
                    episodes=[active_episodes[t.env_id] for t in eval_data])
        if builder:
@@ -374,6 +390,7 @@ def _env_runner(async_vector_env,
            for f_i, column in enumerate(rnn_out_cols):
                pi_info_cols["state_out_{}".format(f_i)] = column
            # Save output rows
+            actions = _unbatch_tuple_actions(actions)
            for i, action in enumerate(actions):
                env_id = eval_data[i].env_id
                agent_id = eval_data[i].agent_id
@@ -413,6 +430,19 @@ def _fetch_atari_metrics(async_vector_env):
    return atari_out


+def _unbatch_tuple_actions(action_batch):
+    # convert list of batches -> batch of lists
+    if isinstance(action_batch, TupleActions):
+        out = []
+        for j in range(len(action_batch.batches[0])):
+            out.append([
+                action_batch.batches[i][j]
+                for i in range(len(action_batch.batches))
+            ])
+        return out
+    return action_batch
+
+
 def _to_column_format(rnn_state_rows):
    num_cols = len(rnn_state_rows[0])
    return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]
@@ -46,6 +46,8 @@ class TFPolicyGraph(PolicyGraph):
                 loss_inputs,
                 state_inputs=None,
                 state_outputs=None,
+                 prev_action_input=None,
+                 prev_reward_input=None,
                 seq_lens=None,
                 max_seq_len=20):
        """Initialize the policy graph.
@@ -65,6 +67,8 @@ class TFPolicyGraph(PolicyGraph):
                and has shape [BATCH_SIZE, data...].
            state_inputs (list): list of RNN state input Tensors.
            state_outputs (list): list of RNN state output Tensors.
+            prev_action_input (Tensor): placeholder for previous actions
+            prev_reward_input (Tensor): placeholder for previous rewards
            seq_lens (Tensor): placeholder for RNN sequence lengths, of shape
                [NUM_SEQUENCES]. Note that NUM_SEQUENCES << BATCH_SIZE. See
                models/lstm.py for more information.
@@ -75,6 +79,8 @@ class TFPolicyGraph(PolicyGraph):
        self.action_space = action_space
        self._sess = sess
        self._obs_input = obs_input
+        self._prev_action_input = prev_action_input
+        self._prev_reward_input = prev_reward_input
        self._sampler = action_sampler
        self._loss = loss
        self._loss_inputs = loss_inputs
@@ -112,6 +118,8 @@ class TFPolicyGraph(PolicyGraph):
                              builder,
                              obs_batch,
                              state_batches=None,
+                              prev_action_batch=None,
+                              prev_reward_batch=None,
                              is_training=False,
                              episodes=None):
        state_batches = state_batches or []
@@ -121,6 +129,10 @@ class TFPolicyGraph(PolicyGraph):
        builder.add_feed_dict({self._obs_input: obs_batch})
        if state_batches:
            builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
+        if self._prev_action_input is not None and prev_action_batch:
+            builder.add_feed_dict({self._prev_action_input: prev_action_batch})
+        if self._prev_reward_input is not None and prev_reward_batch:
+            builder.add_feed_dict({self._prev_reward_input: prev_reward_batch})
        builder.add_feed_dict({self._is_training: is_training})
        builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
        fetches = builder.add_fetches([self._sampler] + self._state_outputs +
@@ -130,11 +142,14 @@ class TFPolicyGraph(PolicyGraph):
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
                        is_training=False,
                        episodes=None):
        builder = TFRunBuilder(self._sess, "compute_actions")
        fetches = self.build_compute_actions(builder, obs_batch, state_batches,
-                                             is_training)
+                                             prev_action_batch,
+                                             prev_reward_batch, is_training)
        return builder.get(fetches)

    def _get_loss_inputs_dict(self, batch):
@@ -70,6 +70,8 @@ class TorchPolicyGraph(PolicyGraph):
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
                        is_training=False,
                        episodes=None):
        if state_batches:
@@ -20,6 +20,7 @@ class CarlaModel(Model):
    further fully connected layers.
    """

+    # TODO(ekl): use build_layers_v2 for native dict space support
    def _build_layers(self, inputs, num_outputs, options):
        # Parse options
        image_shape = options["custom_options"]["image_shape"]
@@ -14,6 +14,7 @@ import numpy as np

 parser = argparse.ArgumentParser()
 parser.add_argument("--stop", type=int, default=200)
+parser.add_argument("--use-prev-action-reward", action="store_true")
 parser.add_argument("--run", type=str, default="PPO")


@@ -183,10 +184,13 @@ if __name__ == "__main__":
            "stop": {
                "episode_reward_mean": args.stop
            },
-            "config": dict(configs[args.run], **{
-                "model": {
-                    "use_lstm": True,
-                },
-            }),
+            "config": dict(
+                configs[args.run], **{
+                    "model": {
+                        "use_lstm": True,
+                        "lstm_use_prev_action_reward": args.
+                        use_prev_action_reward,
+                    },
+                }),
        }
    })
@@ -2,9 +2,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from collections import namedtuple
+import distutils.version
+
 import tensorflow as tf
 import numpy as np
-import distutils.version

 use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
                 distutils.version.LooseVersion("1.5.0"))
@@ -225,4 +227,8 @@ class MultiActionDistribution(ActionDistribution):

    def sample(self):
        """Draw a sample from the action distribution."""
-        return [[s.sample() for s in self.child_distributions]]
+
+        return TupleActions([s.sample() for s in self.child_distributions])
+
+
+TupleActions = namedtuple("TupleActions", ["batches"])
@@ -10,6 +10,9 @@ from functools import partial
 from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \
    _global_registry

+from ray.rllib.env.async_vector_env import _ServingEnvToAsync
+from ray.rllib.env.serving_env import ServingEnv
+from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.models.action_dist import (
    Categorical, Deterministic, DiagGaussian, MultiActionDistribution,
    squash_to_range)
@@ -41,6 +44,8 @@ MODEL_DEFAULTS = {
    "max_seq_len": 20,
    # Size of the LSTM cell
    "lstm_cell_size": 256,
+    # Whether to feed a_{t-1}, r_{t-1} to LSTM
+    "lstm_use_prev_action_reward": False,

    # == Atari ==
    # Whether to enable framestack for Atari envs
@@ -133,10 +138,6 @@ class ModelCatalog(object):
            action_placeholder (Tensor): A placeholder for the actions
        """

-        # TODO(ekl) are list spaces valid?
-        if isinstance(action_space, list):
-            action_space = gym.spaces.Tuple(action_space)
-
        if isinstance(action_space, gym.spaces.Box):
            return tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]), name="action")
@@ -160,11 +161,18 @@ class ModelCatalog(object):
                                      " not supported".format(action_space))

    @staticmethod
-    def get_model(inputs, num_outputs, options, state_in=None, seq_lens=None):
+    def get_model(input_dict,
+                  obs_space,
+                  num_outputs,
+                  options,
+                  state_in=None,
+                  seq_lens=None):
        """Returns a suitable model conforming to given input and output specs.

        Args:
-            inputs (Tensor): The input tensor to the model.
+            input_dict (dict): Dict of input tensors to the model, including
+                the observation under the "obs" key.
+            obs_space (Space): Observation space of the target gym env.
            num_outputs (int): The size of the output vector of the model.
            options (dict): Optional args to pass to the model constructor.
            state_in (list): Optional RNN state in tensors.
@@ -174,34 +182,40 @@ class ModelCatalog(object):
            model (Model): Neural network model.
        """

+        assert isinstance(input_dict, dict)
        options = options or MODEL_DEFAULTS
-        model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
-                                        seq_lens)
+        model = ModelCatalog._get_model(input_dict, obs_space, num_outputs,
+                                        options, state_in, seq_lens)

        if options.get("use_lstm"):
-            model = LSTM(model.last_layer, num_outputs, options, state_in,
+            copy = dict(input_dict)
+            copy["obs"] = model.last_layer
+            model = LSTM(copy, obs_space, num_outputs, options, state_in,
                         seq_lens)

        return model

    @staticmethod
-    def _get_model(inputs, num_outputs, options, state_in, seq_lens):
+    def _get_model(input_dict, obs_space, num_outputs, options, state_in,
+                   seq_lens):
        if options.get("custom_model"):
            model = options["custom_model"]
            print("Using custom model {}".format(model))
            return _global_registry.get(RLLIB_MODEL, model)(
-                inputs,
+                input_dict,
+                obs_space,
                num_outputs,
                options,
                state_in=state_in,
                seq_lens=seq_lens)

-        obs_rank = len(inputs.shape) - 1
+        obs_rank = len(input_dict["obs"].shape) - 1

        if obs_rank > 1:
-            return VisionNetwork(inputs, num_outputs, options)
+            return VisionNetwork(input_dict, obs_space, num_outputs, options)

-        return FullyConnectedNetwork(inputs, num_outputs, options)
+        return FullyConnectedNetwork(input_dict, obs_space, num_outputs,
+                                     options)

    @staticmethod
    def get_torch_model(input_shape, num_outputs, options=None):
@@ -243,7 +257,7 @@ class ModelCatalog(object):
        """Returns a suitable processor for the given environment.

        Args:
-            env (gym.Env): The gym environment to preprocess.
+            env (gym.Env|VectorEnv|ServingEnv): The environment to wrap.
            options (dict): Options to pass to the preprocessor.

        Returns:
@@ -269,16 +283,23 @@ class ModelCatalog(object):
        """Returns a preprocessor as a gym observation wrapper.

        Args:
-            env (gym.Env): The gym environment to wrap.
+            env (gym.Env|VectorEnv|ServingEnv): The environment to wrap.
            options (dict): Options to pass to the preprocessor.

        Returns:
-            wrapper (gym.ObservationWrapper): Preprocessor in wrapper form.
+            env (RLlib env): Wrapped environment
        """

        options = options or MODEL_DEFAULTS
        preprocessor = ModelCatalog.get_preprocessor(env, options)
-        return _RLlibPreprocessorWrapper(env, preprocessor)
+        if isinstance(env, gym.Env):
+            return _RLlibPreprocessorWrapper(env, preprocessor)
+        elif isinstance(env, VectorEnv):
+            return _RLlibVectorPreprocessorWrapper(env, preprocessor)
+        elif isinstance(env, ServingEnv):
+            return _ServingEnvToAsync(env, preprocessor)
+        else:
+            raise ValueError("Don't know how to wrap {}".format(env))

    @staticmethod
    def register_custom_preprocessor(preprocessor_name, preprocessor_class):
@@ -314,10 +335,32 @@ class _RLlibPreprocessorWrapper(gym.ObservationWrapper):
    def __init__(self, env, preprocessor):
        super(_RLlibPreprocessorWrapper, self).__init__(env)
        self.preprocessor = preprocessor
-
-        from gym.spaces.box import Box
-        self.observation_space = Box(
-            -1.0, 1.0, preprocessor.shape, dtype=np.float32)
+        self.observation_space = preprocessor.observation_space

    def observation(self, observation):
        return self.preprocessor.transform(observation)
+
+
+class _RLlibVectorPreprocessorWrapper(VectorEnv):
+    """Preprocessing wrapper for vector envs."""
+
+    def __init__(self, env, preprocessor):
+        self.env = env
+        self.prep = preprocessor
+        self.action_space = env.action_space
+        self.observation_space = preprocessor.observation_space
+        self.num_envs = env.num_envs
+
+    def vector_reset(self):
+        return [self.prep.transform(obs) for obs in self.env.vector_reset()]
+
+    def reset_at(self, index):
+        return self.prep.transform(self.env.reset_at(index))
+
+    def vector_step(self, actions):
+        obs, rewards, dones, infos = self.env.vector_step(actions)
+        obs = [self.prep.transform(o) for o in obs]
+        return obs, rewards, dones, infos
+
+    def get_unwrapped(self):
+        return self.env.get_unwrapped()
@@ -13,6 +13,12 @@ class FullyConnectedNetwork(Model):
    """Generic fully connected network."""

    def _build_layers(self, inputs, num_outputs, options):
+        """Process the flattened inputs.
+
+        Note that dict inputs will be flattened into a vector. To define a
+        model that processes the components separately, use _build_layers_v2().
+        """
+
        hiddens = options.get("fcnet_hiddens")
        activation = get_activation_fn(options.get("fcnet_activation"))

@@ -9,6 +9,10 @@ the LSTM cell, we reshape the input to add the expected time dimension. During
 postprocessing, we dynamically pad the experience batches so that this
 reshaping is possible.

+Note that this padding strategy only works out if we assume zero inputs don't
+meaningfully affect the loss function. This happens to be true for all the
+current algorithms: https://github.com/ray-project/ray/issues/2992
+
 See the add_time_dimension() and chop_into_sequences() functions below for
 more info.
 """
@@ -134,9 +138,24 @@ class LSTM(Model):
        self.seq_lens. See add_time_dimension() for more information.
    """

-    def _build_layers(self, inputs, num_outputs, options):
+    def _build_layers_v2(self, input_dict, num_outputs, options):
        cell_size = options.get("lstm_cell_size")
-        last_layer = add_time_dimension(inputs, self.seq_lens)
+        if options.get("lstm_use_prev_action_reward"):
+            action_dim = int(
+                np.product(
+                    input_dict["prev_actions"].get_shape().as_list()[1:]))
+            features = tf.concat(
+                [
+                    input_dict["obs"],
+                    tf.reshape(
+                        tf.cast(input_dict["prev_actions"], tf.float32),
+                        [-1, action_dim]),
+                    tf.reshape(input_dict["prev_rewards"], [-1, 1]),
+                ],
+                axis=1)
+        else:
+            features = input_dict["obs"]
+        last_layer = add_time_dimension(features, self.seq_lens)

        # Setup the LSTM cell
        lstm = rnn.BasicLSTMCell(cell_size, state_is_tuple=True)
@@ -2,8 +2,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from collections import OrderedDict
+
+import gym
 import tensorflow as tf

+from ray.rllib.models.preprocessors import get_preprocessor
+

 class Model(object):
    """Defines an abstract network model for use with RLlib.
@@ -16,12 +21,12 @@ class Model(object):
    needs to further post-processing (e.g. Actor and Critic networks in A3C).

    Attributes:
-        inputs (Tensor): The input placeholder for this model, of shape
-            [BATCH_SIZE, ...].
+        input_dict (dict): Dictionary of input tensors, including "obs",
+            "prev_action", "prev_reward".
        outputs (Tensor): The output vector of this model, of shape
            [BATCH_SIZE, num_outputs].
-        last_layer (Tensor): The network layer right before the model output,
-            of shape [BATCH_SIZE, N].
+        last_layer (Tensor): The feature layer right before the model output,
+            of shape [BATCH_SIZE, f].
        state_init (list): List of initial recurrent state tensors (if any).
        state_in (list): List of input recurrent state tensors (if any).
        state_out (list): List of output recurrent state tensors (if any).
@@ -38,12 +43,13 @@ class Model(object):
    """

    def __init__(self,
-                 inputs,
+                 input_dict,
+                 obs_space,
                 num_outputs,
                 options,
                 state_in=None,
                 seq_lens=None):
-        self.inputs = inputs
+        assert isinstance(input_dict, dict), input_dict

        # Default attribute values for the non-RNN case
        self.state_init = []
@@ -58,8 +64,26 @@ class Model(object):
        if options.get("free_log_std"):
            assert num_outputs % 2 == 0
            num_outputs = num_outputs // 2
-        self.outputs, self.last_layer = self._build_layers(
-            inputs, num_outputs, options)
+        try:
+            self.outputs, self.last_layer = self._build_layers_v2(
+                _restore_original_dimensions(input_dict, obs_space),
+                num_outputs, options)
+        except NotImplementedError:
+            self.outputs, self.last_layer = self._build_layers(
+                input_dict["obs"], num_outputs, options)
+
+        # Validate the output shape
+        try:
+            out = tf.convert_to_tensor(self.outputs)
+            shape = out.shape.as_list()
+        except Exception:
+            raise ValueError("Output is not a tensor: {}".format(self.outputs))
+        else:
+            if len(shape) != 2 or shape[1] != num_outputs:
+                raise ValueError(
+                    "Expected output shape of [None, {}], got {}".format(
+                        num_outputs, shape))
+
        if options.get("free_log_std", False):
            log_std = tf.get_variable(
                name="log_std",
@@ -68,6 +92,80 @@ class Model(object):
            self.outputs = tf.concat(
                [self.outputs, 0.0 * self.outputs + log_std], 1)

-    def _build_layers(self):
-        """Builds and returns the output and last layer of the network."""
+    def _build_layers(self, inputs, num_outputs, options):
+        """Builds and returns the output and last layer of the network.
+
+        Deprecated: use _build_layers_v2 instead, which has better support
+        for dict and tuple spaces.
+        """
        raise NotImplementedError
+
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        """Define the layers of a custom model.
+
+        Arguments:
+            input_dict (dict): Dictionary of input tensors, including "obs",
+                "prev_action", "prev_reward".
+            num_outputs (int): Output tensor must be of size
+                [BATCH_SIZE, num_outputs].
+            options (dict): Model options.
+
+        Returns:
+            (outputs, feature_layer): Tensors of size [BATCH_SIZE, num_outputs]
+                and [BATCH_SIZE, desired_feature_size].
+
+        When using dict or tuple observation spaces, you can access
+        the nested sub-observation batches here as well:
+
+        Examples:
+            >>> print(input_dict)
+            {'prev_actions': <tf.Tensor shape=(?,) dtype=int64>,
+             'prev_rewards': <tf.Tensor shape=(?,) dtype=float32>,
+             'obs': OrderedDict([
+                ('sensors', OrderedDict([
+                    ('front_cam', [
+                        <tf.Tensor shape=(?, 10, 10, 3) dtype=float32>,
+                        <tf.Tensor shape=(?, 10, 10, 3) dtype=float32>]),
+                    ('position', <tf.Tensor shape=(?, 3) dtype=float32>),
+                    ('velocity', <tf.Tensor shape=(?, 3) dtype=float32>)]))])}
+        """
+        raise NotImplementedError
+
+
+def _restore_original_dimensions(input_dict, obs_space):
+    if hasattr(obs_space, "original_space"):
+        return dict(
+            input_dict,
+            obs=_unpack_obs(input_dict["obs"], obs_space.original_space))
+    return input_dict
+
+
+def _unpack_obs(obs, space):
+    if (isinstance(space, gym.spaces.Dict)
+            or isinstance(space, gym.spaces.Tuple)):
+        prep = get_preprocessor(space)(space)
+        if len(obs.shape) != 2 or obs.shape[1] != prep.shape[0]:
+            raise ValueError(
+                "Expected flattened obs shape of [None, {}], got {}".format(
+                    prep.shape[0], obs.shape))
+        assert len(prep.preprocessors) == len(space.spaces), \
+            (len(prep.preprocessors) == len(space.spaces))
+        offset = 0
+        if isinstance(space, gym.spaces.Tuple):
+            u = []
+            for p, v in zip(prep.preprocessors, space.spaces):
+                obs_slice = obs[:, offset:offset + p.size]
+                offset += p.size
+                u.append(
+                    _unpack_obs(
+                        tf.reshape(obs_slice, [-1] + list(p.shape)), v))
+        else:
+            u = OrderedDict()
+            for p, (k, v) in zip(prep.preprocessors, space.spaces.items()):
+                obs_slice = obs[:, offset:offset + p.size]
+                offset += p.size
+                u[k] = _unpack_obs(
+                    tf.reshape(obs_slice, [-1] + list(p.shape)), v)
+        return u
+    else:
+        return obs
@@ -16,19 +16,34 @@ class Preprocessor(object):
        shape (obj): Shape of the preprocessed output.
    """

-    def __init__(self, obs_space, options):
+    def __init__(self, obs_space, options=None):
        legacy_patch_shapes(obs_space)
        self._obs_space = obs_space
-        self._options = options
-        self._init()
+        self._options = options or {}
+        self.shape = self._init_shape(obs_space, options)

-    def _init(self):
-        pass
+    def _init_shape(self, obs_space, options):
+        """Returns the shape after preprocessing."""
+        raise NotImplementedError

    def transform(self, observation):
        """Returns the preprocessed observation."""
        raise NotImplementedError

+    @property
+    def size(self):
+        return int(np.product(self.shape))
+
+    @property
+    def observation_space(self):
+        obs_space = gym.spaces.Box(-1.0, 1.0, self.shape, dtype=np.float32)
+        # Stash the unwrapped space so that we can unwrap dict and tuple spaces
+        # automatically in model.py
+        if (isinstance(self, TupleFlatteningPreprocessor)
+                or isinstance(self, DictFlatteningPreprocessor)):
+            obs_space.original_space = self._obs_space
+        return obs_space
+

 class GenericPixelPreprocessor(Preprocessor):
    """Generic image preprocessor.
@@ -37,19 +52,20 @@ class GenericPixelPreprocessor(Preprocessor):
    instead for deepmind-style Atari preprocessing.
    """

-    def _init(self):
-        self._grayscale = self._options.get("grayscale")
-        self._zero_mean = self._options.get("zero_mean")
-        self._dim = self._options.get("dim")
-        self._channel_major = self._options.get("channel_major")
+    def _init_shape(self, obs_space, options):
+        self._grayscale = options.get("grayscale")
+        self._zero_mean = options.get("zero_mean")
+        self._dim = options.get("dim")
+        self._channel_major = options.get("channel_major")
        if self._grayscale:
-            self.shape = (self._dim, self._dim, 1)
+            shape = (self._dim, self._dim, 1)
        else:
-            self.shape = (self._dim, self._dim, 3)
+            shape = (self._dim, self._dim, 3)

        # channel_major requires (# in-channels, row dim, col dim)
        if self._channel_major:
-            self.shape = self.shape[-1:] + self.shape[:-1]
+            shape = shape[-1:] + shape[:-1]
+        return shape

    def transform(self, observation):
        """Downsamples images from (210, 160, 3) by the configured factor."""
@@ -75,16 +91,16 @@ class GenericPixelPreprocessor(Preprocessor):


 class AtariRamPreprocessor(Preprocessor):
-    def _init(self):
-        self.shape = (128, )
+    def _init_shape(self, obs_space, options):
+        return (128, )

    def transform(self, observation):
        return (observation - 128) / 128


 class OneHotPreprocessor(Preprocessor):
-    def _init(self):
-        self.shape = (self._obs_space.n, )
+    def _init_shape(self, obs_space, options):
+        return (self._obs_space.n, )

    def transform(self, observation):
        arr = np.zeros(self._obs_space.n)
@@ -93,8 +109,8 @@ class OneHotPreprocessor(Preprocessor):


 class NoPreprocessor(Preprocessor):
-    def _init(self):
-        self.shape = self._obs_space.shape
+    def _init_shape(self, obs_space, options):
+        return self._obs_space.shape

    def transform(self, observation):
        return observation
@@ -103,11 +119,10 @@ class NoPreprocessor(Preprocessor):
 class TupleFlatteningPreprocessor(Preprocessor):
    """Preprocesses each tuple element, then flattens it all into a vector.

-    If desired, the vector output can be unpacked via tf.reshape() within a
-    custom model to handle each component separately.
+    RLlib models will unpack the flattened output before _build_layers_v2().
    """

-    def _init(self):
+    def _init_shape(self, obs_space, options):
        assert isinstance(self._obs_space, gym.spaces.Tuple)
        size = 0
        self.preprocessors = []
@@ -116,17 +131,43 @@ class TupleFlatteningPreprocessor(Preprocessor):
            print("Creating sub-preprocessor for", space)
            preprocessor = get_preprocessor(space)(space, self._options)
            self.preprocessors.append(preprocessor)
-            size += np.product(preprocessor.shape)
-        self.shape = (size, )
+            size += preprocessor.size
+        return (size, )

    def transform(self, observation):
        assert len(observation) == len(self.preprocessors), observation
        return np.concatenate([
-            np.reshape(p.transform(o), [np.product(p.shape)])
+            np.reshape(p.transform(o), [p.size])
            for (o, p) in zip(observation, self.preprocessors)
        ])


+class DictFlatteningPreprocessor(Preprocessor):
+    """Preprocesses each dict value, then flattens it all into a vector.
+
+    RLlib models will unpack the flattened output before _build_layers_v2().
+    """
+
+    def _init_shape(self, obs_space, options):
+        assert isinstance(self._obs_space, gym.spaces.Dict)
+        size = 0
+        self.preprocessors = []
+        for space in self._obs_space.spaces.values():
+            print("Creating sub-preprocessor for", space)
+            preprocessor = get_preprocessor(space)(space, self._options)
+            self.preprocessors.append(preprocessor)
+            size += preprocessor.size
+        return (size, )
+
+    def transform(self, observation):
+        assert len(observation) == len(self.preprocessors), \
+            (len(observation), len(self.preprocessors))
+        return np.concatenate([
+            np.reshape(p.transform(o), [p.size])
+            for (o, p) in zip(observation.values(), self.preprocessors)
+        ])
+
+
 def get_preprocessor(space):
    """Returns an appropriate preprocessor class for the given space."""

@@ -141,6 +182,8 @@ def get_preprocessor(space):
        preprocessor = AtariRamPreprocessor
    elif isinstance(space, gym.spaces.Tuple):
        preprocessor = TupleFlatteningPreprocessor
+    elif isinstance(space, gym.spaces.Dict):
+        preprocessor = DictFlatteningPreprocessor
    else:
        preprocessor = NoPreprocessor

@@ -12,7 +12,8 @@ from ray.rllib.models.misc import get_activation_fn, flatten
 class VisionNetwork(Model):
    """Generic vision network."""

-    def _build_layers(self, inputs, num_outputs, options):
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        inputs = input_dict["obs"]
        filters = options.get("conv_filters")
        if not filters:
            filters = get_filter_config(options)
@@ -15,16 +15,18 @@ from ray.rllib.models.visionnet import VisionNetwork


 class CustomPreprocessor(Preprocessor):
-    pass
+    def _init_shape(self, obs_space, options):
+        return None


 class CustomPreprocessor2(Preprocessor):
-    pass
+    def _init_shape(self, obs_space, options):
+        return None


 class CustomModel(Model):
    def _build_layers(self, *args):
-        return None, None
+        return tf.constant([[0] * 5]), None


 class ModelCatalogTest(unittest.TestCase):
@@ -69,20 +71,24 @@ class ModelCatalogTest(unittest.TestCase):
        ray.init()

        with tf.variable_scope("test1"):
-            p1 = ModelCatalog.get_model(
-                np.zeros((10, 3), dtype=np.float32), 5, {})
+            p1 = ModelCatalog.get_model({
+                "obs": np.zeros((10, 3), dtype=np.float32)
+            }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {})
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
-            p2 = ModelCatalog.get_model(
-                np.zeros((10, 84, 84, 3), dtype=np.float32), 5, {})
+            p2 = ModelCatalog.get_model({
+                "obs": np.zeros((10, 84, 84, 3), dtype=np.float32)
+            }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {})
            self.assertEqual(type(p2), VisionNetwork)

    def testCustomModel(self):
        ray.init()
        ModelCatalog.register_custom_model("foo", CustomModel)
-        p1 = ModelCatalog.get_model(
-            tf.constant([1, 2, 3]), 5, {"custom_model": "foo"})
+        p1 = ModelCatalog.get_model({
+            "obs": tf.constant([1, 2, 3])
+        }, Box(0, 1, shape=(3, ), dtype=np.float32), 5,
+                                    {"custom_model": "foo"})
        self.assertEqual(str(type(p1)), str(CustomModel))


@@ -314,6 +314,8 @@ class TestMultiAgentEnv(unittest.TestCase):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
+                                prev_action_batch=None,
+                                prev_reward_batch=None,
                                is_training=False,
                                episodes=None):
                return [0] * len(obs_batch), [[h] * len(obs_batch)], {}
@@ -337,6 +339,8 @@ class TestMultiAgentEnv(unittest.TestCase):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
+                                prev_action_batch=None,
+                                prev_reward_batch=None,
                                is_training=False,
                                episodes=None):
                # Pretend we did a model-based rollout and want to return
@@ -0,0 +1,249 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+
+from gym import spaces
+from gym.envs.registration import EnvSpec
+import gym
+import tensorflow.contrib.slim as slim
+import tensorflow as tf
+import unittest
+
+import ray
+from ray.rllib.agents.pg import PGAgent
+from ray.rllib.env.async_vector_env import AsyncVectorEnv
+from ray.rllib.env.vector_env import VectorEnv
+from ray.rllib.models import ModelCatalog
+from ray.rllib.models.model import Model
+from ray.rllib.test.test_serving_env import SimpleServing
+from ray.tune.registry import register_env
+
+DICT_SPACE = spaces.Dict({
+    "sensors": spaces.Dict({
+        "position": spaces.Box(low=-100, high=100, shape=(3, )),
+        "velocity": spaces.Box(low=-1, high=1, shape=(3, )),
+        "front_cam": spaces.Tuple(
+            (spaces.Box(low=0, high=1, shape=(10, 10, 3)),
+             spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
+        "rear_cam": spaces.Box(low=0, high=1, shape=(10, 10, 3)),
+    }),
+    "inner_state": spaces.Dict({
+        "charge": spaces.Discrete(100),
+        "job_status": spaces.Dict({
+            "task": spaces.Discrete(5),
+            "progress": spaces.Box(low=0, high=100, shape=()),
+        })
+    })
+})
+
+DICT_SAMPLES = [DICT_SPACE.sample() for _ in range(10)]
+
+TUPLE_SPACE = spaces.Tuple([
+    spaces.Box(low=-100, high=100, shape=(3, )),
+    spaces.Tuple((spaces.Box(low=0, high=1, shape=(10, 10, 3)),
+                  spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
+    spaces.Discrete(5),
+])
+
+TUPLE_SAMPLES = [TUPLE_SPACE.sample() for _ in range(10)]
+
+
+def one_hot(i, n):
+    out = [0.0] * n
+    out[i] = 1.0
+    return out
+
+
+class NestedDictEnv(gym.Env):
+    def __init__(self):
+        self.action_space = spaces.Discrete(2)
+        self.observation_space = DICT_SPACE
+        self._spec = EnvSpec("NestedDictEnv-v0")
+        self.steps = 0
+
+    def reset(self):
+        self.steps = 0
+        return DICT_SAMPLES[0]
+
+    def step(self, action):
+        self.steps += 1
+        return DICT_SAMPLES[self.steps], 1, self.steps >= 5, {}
+
+
+class NestedTupleEnv(gym.Env):
+    def __init__(self):
+        self.action_space = spaces.Discrete(2)
+        self.observation_space = TUPLE_SPACE
+        self._spec = EnvSpec("NestedTupleEnv-v0")
+        self.steps = 0
+
+    def reset(self):
+        self.steps = 0
+        return TUPLE_SAMPLES[0]
+
+    def step(self, action):
+        self.steps += 1
+        return TUPLE_SAMPLES[self.steps], 1, self.steps >= 5, {}
+
+
+class InvalidModel(Model):
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        return "not", "valid"
+
+
+class DictSpyModel(Model):
+    capture_index = 0
+
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        def spy(pos, front_cam, task):
+            # TF runs this function in an isolated context, so we have to use
+            # redis to communicate back to our suite
+            ray.experimental.internal_kv._internal_kv_put(
+                "d_spy_in_{}".format(DictSpyModel.capture_index),
+                pickle.dumps((pos, front_cam, task)))
+            DictSpyModel.capture_index += 1
+            return 0
+
+        spy_fn = tf.py_func(
+            spy, [
+                input_dict["obs"]["sensors"]["position"],
+                input_dict["obs"]["sensors"]["front_cam"][0],
+                input_dict["obs"]["inner_state"]["job_status"]["task"]
+            ],
+            tf.int64,
+            stateful=True)
+
+        with tf.control_dependencies([spy_fn]):
+            output = slim.fully_connected(
+                input_dict["obs"]["sensors"]["position"], num_outputs)
+        return output, output
+
+
+class TupleSpyModel(Model):
+    capture_index = 0
+
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        def spy(pos, cam, task):
+            # TF runs this function in an isolated context, so we have to use
+            # redis to communicate back to our suite
+            ray.experimental.internal_kv._internal_kv_put(
+                "t_spy_in_{}".format(TupleSpyModel.capture_index),
+                pickle.dumps((pos, cam, task)))
+            TupleSpyModel.capture_index += 1
+            return 0
+
+        spy_fn = tf.py_func(
+            spy, [
+                input_dict["obs"][0],
+                input_dict["obs"][1][0],
+                input_dict["obs"][2],
+            ],
+            tf.int64,
+            stateful=True)
+
+        with tf.control_dependencies([spy_fn]):
+            output = slim.fully_connected(input_dict["obs"][0], num_outputs)
+        return output, output
+
+
+class NestedSpacesTest(unittest.TestCase):
+    def testInvalidModel(self):
+        ModelCatalog.register_custom_model("invalid", InvalidModel)
+        self.assertRaises(ValueError, lambda: PGAgent(
+            env="CartPole-v0", config={
+                "model": {
+                    "custom_model": "invalid",
+                },
+            }))
+
+    def doTestNestedDict(self, make_env):
+        ModelCatalog.register_custom_model("composite", DictSpyModel)
+        register_env("nested", make_env)
+        pg = PGAgent(
+            env="nested",
+            config={
+                "num_workers": 0,
+                "sample_batch_size": 5,
+                "model": {
+                    "custom_model": "composite",
+                },
+            })
+        pg.train()
+
+        # Check that the model sees the correct reconstructed observations
+        for i in range(4):
+            seen = pickle.loads(
+                ray.experimental.internal_kv._internal_kv_get(
+                    "d_spy_in_{}".format(i)))
+            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
+            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
+            task_i = one_hot(
+                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
+            self.assertEqual(seen[0][0].tolist(), pos_i)
+            self.assertEqual(seen[1][0].tolist(), cam_i)
+            self.assertEqual(seen[2][0].tolist(), task_i)
+
+    def doTestNestedTuple(self, make_env):
+        ModelCatalog.register_custom_model("composite2", TupleSpyModel)
+        register_env("nested2", make_env)
+        pg = PGAgent(
+            env="nested2",
+            config={
+                "num_workers": 0,
+                "sample_batch_size": 5,
+                "model": {
+                    "custom_model": "composite2",
+                },
+            })
+        pg.train()
+
+        # Check that the model sees the correct reconstructed observations
+        for i in range(4):
+            seen = pickle.loads(
+                ray.experimental.internal_kv._internal_kv_get(
+                    "t_spy_in_{}".format(i)))
+            pos_i = TUPLE_SAMPLES[i][0].tolist()
+            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
+            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
+            self.assertEqual(seen[0][0].tolist(), pos_i)
+            self.assertEqual(seen[1][0].tolist(), cam_i)
+            self.assertEqual(seen[2][0].tolist(), task_i)
+
+    def testNestedDictGym(self):
+        self.doTestNestedDict(lambda _: NestedDictEnv())
+
+    def testNestedDictVector(self):
+        self.doTestNestedDict(
+            lambda _: VectorEnv.wrap(lambda i: NestedDictEnv()))
+
+    def testNestedDictServing(self):
+        self.doTestNestedDict(lambda _: SimpleServing(NestedDictEnv()))
+
+    def testNestedDictAsync(self):
+        self.assertRaisesRegexp(
+            ValueError, "Found raw Dict space.*",
+            lambda: self.doTestNestedDict(
+                lambda _: AsyncVectorEnv.wrap_async(NestedDictEnv())))
+
+    def testNestedTupleGym(self):
+        self.doTestNestedTuple(lambda _: NestedTupleEnv())
+
+    def testNestedTupleVector(self):
+        self.doTestNestedTuple(
+            lambda _: VectorEnv.wrap(lambda i: NestedTupleEnv()))
+
+    def testNestedTupleServing(self):
+        self.doTestNestedTuple(lambda _: SimpleServing(NestedTupleEnv()))
+
+    def testNestedTupleAsync(self):
+        self.assertRaisesRegexp(
+            ValueError, "Found raw Tuple space.*",
+            lambda: self.doTestNestedTuple(
+                lambda _: AsyncVectorEnv.wrap_async(NestedTupleEnv())))
+
+
+if __name__ == "__main__":
+    ray.init(num_cpus=5)
+    unittest.main(verbosity=2)
@@ -3,6 +3,7 @@ from __future__ import division
 from __future__ import print_function

 import gym
+import numpy as np
 import time
 import unittest

@@ -21,6 +22,8 @@ class MockPolicyGraph(PolicyGraph):
    def compute_actions(self,
                        obs_batch,
                        state_batches,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
                        is_training=False,
                        episodes=None):
        return [0] * len(obs_batch), [], {}
@@ -33,6 +36,8 @@ class BadPolicyGraph(PolicyGraph):
    def compute_actions(self,
                        obs_batch,
                        state_batches,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
                        is_training=False,
                        episodes=None):
        raise Exception("intentional error")
@@ -107,8 +112,23 @@ class TestPolicyEvaluator(unittest.TestCase):
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph)
        batch = ev.sample()
-        for key in ["obs", "actions", "rewards", "dones", "advantages"]:
+        for key in [
+                "obs", "actions", "rewards", "dones", "advantages",
+                "prev_rewards", "prev_actions"
+        ]:
            self.assertIn(key, batch)
+
+        def to_prev(vec):
+            out = np.zeros_like(vec)
+            for i, v in enumerate(vec):
+                if i + 1 < len(out) and not batch["dones"][i]:
+                    out[i + 1] = v
+            return out.tolist()
+
+        self.assertEqual(batch["prev_rewards"].tolist(),
+                         to_prev(batch["rewards"]))
+        self.assertEqual(batch["prev_actions"].tolist(),
+                         to_prev(batch["actions"]))
        self.assertGreater(batch["advantages"][0], 1)

    def testGlobalVarsUpdate(self):
@@ -2,7 +2,7 @@ import unittest
 import traceback

 import gym
-from gym.spaces import Box, Discrete, Tuple
+from gym.spaces import Box, Discrete, Tuple, Dict
 from gym.envs.registration import EnvSpec
 import numpy as np
 import sys
@@ -14,33 +14,28 @@ from ray.tune.registry import register_env

 ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
-    "simple_tuple": Tuple([
-        Box(0.0, 1.0, (5, ), dtype=np.float32),
-        Box(0.0, 1.0, (5, ), dtype=np.float32)
-    ]),
-    "mixed_tuple": Tuple(
+    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
+    "tuple": Tuple(
        [Discrete(2),
         Discrete(3),
-         Box(0.0, 1.0, (5, ), dtype=np.float32)]),
+         Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
 }

 OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
-    "image": Box(0.0, 1.0, (84, 84, 1), dtype=np.float32),
-    "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
-    "atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
-    "simple_tuple": Tuple([
-        Box(0.0, 1.0, (5, ), dtype=np.float32),
-        Box(0.0, 1.0, (5, ), dtype=np.float32)
-    ]),
-    "mixed_tuple": Tuple(
-        [Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]),
+    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
+    "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32),
+    "atari": Box(-1.0, 1.0, (210, 160, 3), dtype=np.float32),
+    "tuple": Tuple([Discrete(10),
+                    Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
+    "dict": Dict({
+        "task": Discrete(10),
+        "position": Box(-1.0, 1.0, (5, ), dtype=np.float32),
+    }),
 }


-def make_stub_env(action_space, obs_space):
+def make_stub_env(action_space, obs_space, check_action_bounds):
    class StubEnv(gym.Env):
        def __init__(self):
            self.action_space = action_space
@@ -52,16 +47,23 @@ def make_stub_env(action_space, obs_space):
            return sample

        def step(self, action):
+            if check_action_bounds and not self.action_space.contains(action):
+                raise ValueError("Illegal action for {}: {}".format(
+                    self.action_space, action))
+            if (isinstance(self.action_space, Tuple)
+                    and len(action) != len(self.action_space.spaces)):
+                raise ValueError("Illegal action for {}: {}".format(
+                    self.action_space, action))
            return self.observation_space.sample(), 1, True, {}

    return StubEnv


-def check_support(alg, config, stats):
+def check_support(alg, config, stats, check_bounds=False):
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing", alg, action_space, obs_space, "===")
-            stub_env = make_stub_env(action_space, obs_space)
+            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
@@ -105,8 +107,13 @@ class ModelSupportedSpaces(unittest.TestCase):
                "num_sgd_iter": 1,
                "train_batch_size": 10,
                "sample_batch_size": 10,
-                "sgd_minibatch_size": 1
-            }, stats)
+                "sgd_minibatch_size": 1,
+                "model": {
+                    "squash_to_range": True
+                },
+            },
+            stats,
+            check_bounds=True)
        check_support(
            "ES", {
                "num_workers": 1,
@@ -26,7 +26,8 @@ class TFRunBuilder(object):
    def add_feed_dict(self, feed_dict):
        assert not self._executed
        for k in feed_dict:
-            assert k not in self.feed_dict
+            if k in self.feed_dict:
+                raise ValueError("Key added twice: {}".format(k))
        self.feed_dict.update(feed_dict)

    def add_fetches(self, fetches):