From a9e454f6fdaa4dca2cb27e748805953f8721c6f4 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Tue, 16 Oct 2018 15:55:11 -0700
Subject: [PATCH] [rllib] Include config dicts in the sphinx docs (#3064)

---
 doc/source/rllib-algorithms.rst               | 63 +++++++++++++
 doc/source/rllib-models.rst                   |  9 ++
 doc/source/rllib-training.rst                 | 55 +++++++-----
 doc/source/rllib.rst                          |  1 +
 python/ray/rllib/agents/a3c/a3c.py            | 26 +-----
 python/ray/rllib/agents/agent.py              | 16 ++--
 python/ray/rllib/agents/ars/ars.py            |  3 +-
 python/ray/rllib/agents/ddpg/apex.py          |  2 +-
 python/ray/rllib/agents/ddpg/ddpg.py          |  3 +
 python/ray/rllib/agents/dqn/apex.py           |  5 +-
 .../ray/rllib/agents/dqn/common/wrappers.py   |  2 +-
 python/ray/rllib/agents/dqn/dqn.py            |  3 +
 python/ray/rllib/agents/es/es.py              |  8 +-
 python/ray/rllib/agents/impala/impala.py      | 10 +--
 python/ray/rllib/agents/pg/pg.py              | 10 +--
 python/ray/rllib/agents/ppo/ppo.py            | 10 +--
 .../ray/rllib/evaluation/policy_evaluator.py  |  7 +-
 python/ray/rllib/models/__init__.py           | 14 ++-
 python/ray/rllib/models/catalog.py            | 88 ++++++++++++-------
 python/ray/rllib/models/fcnet.py              |  4 +-
 python/ray/rllib/models/lstm.py               |  2 +-
 python/ray/rllib/models/model.py              |  2 +-
 python/ray/rllib/models/preprocessors.py      | 18 ++--
 python/ray/rllib/models/pytorch/visionnet.py  |  4 +-
 python/ray/rllib/models/visionnet.py          |  4 +-
 .../ray/rllib/test/test_supported_spaces.py   |  4 -
 26 files changed, 236 insertions(+), 137 deletions(-)

diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst
index be39b61bb..9a7c53539 100644
--- a/doc/source/rllib-algorithms.rst
+++ b/doc/source/rllib-algorithms.rst
@@ -38,6 +38,13 @@ SpaceInvaders  646                               ~300
 
     Ape-X using 32 workers in RLlib vs vanilla DQN (orange) and A3C (blue) on PongNoFrameskip-v4.
 
+**Ape-X-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/dqn/apex.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Importance Weighted Actor-Learner Architecture (IMPALA)
 -------------------------------------------------------
 
@@ -73,6 +80,13 @@ SpaceInvaders  843                              ~300
 
    IMPALA solves Atari several times faster than A2C / A3C, with similar sample efficiency. Here IMPALA scales from 16 to 128 workers to solve PongNoFrameskip-v4 in ~8 minutes.
 
+**IMPALA-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/impala/impala.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Gradient-based
 ~~~~~~~~~~~~~~
 
@@ -97,6 +111,13 @@ Qbert          3620                      ~1000
 SpaceInvaders  692                       ~600
 =============  ========================  ==============================
 
+**A3C-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/a3c/a3c.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Deep Deterministic Policy Gradients (DDPG)
 ------------------------------------------
 `[paper] <https://arxiv.org/abs/1509.02971>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ddpg/ddpg.py>`__
@@ -104,6 +125,13 @@ DDPG is implemented similarly to DQN (below). The algorithm can be scaled by inc
 
 Tuned examples: `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml>`__
 
+**DDPG-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/ddpg/ddpg.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Deep Q Networks (DQN, Rainbow)
 ------------------------------
 `[paper] <https://arxiv.org/abs/1312.5602>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/dqn/dqn.py>`__
@@ -125,12 +153,26 @@ Qbert          3921                      7968                           15780
 SpaceInvaders  650                       1001                           1025                            ~500                           
 =============  ========================  =============================  ==============================  ===============================
 
+**DQN-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/dqn/dqn.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Policy Gradients
 ----------------
 `[paper] <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/pg/pg.py>`__ We include a vanilla policy gradients implementation as an example algorithm. This is usually outperformed by PPO.
 
 Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml>`__
 
+**PG-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/pg/pg.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Proximal Policy Optimization (PPO)
 ----------------------------------
 `[paper] <https://arxiv.org/abs/1707.06347>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ppo/ppo.py>`__
@@ -158,6 +200,13 @@ SpaceInvaders  671             944             ~800
 
    RLlib's multi-GPU PPO scales to multiple GPUs and hundreds of CPUs on solving the Humanoid-v1 task. Here we compare against a reference MPI-based implementation.
 
+**PPO-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/ppo/ppo.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Derivative-free
 ~~~~~~~~~~~~~~~
 
@@ -168,6 +217,13 @@ ARS is a random search method for training linear policies for continuous contro
 
 Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/regression_tests/cartpole-ars.yaml>`__, `Swimmer-v2 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/swimmer-ars.yaml>`__
 
+**ARS-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/ars/ars.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Evolution Strategies
 --------------------
 `[paper] <https://arxiv.org/abs/1703.03864>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/es/es.py>`__
@@ -181,3 +237,10 @@ Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/pyt
    :width: 500px
 
    RLlib's ES implementation scales further and is faster than a reference Redis implementation on solving the Humanoid-v1 task.
+
+**ES-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
+
+.. literalinclude:: ../../python/ray/rllib/agents/es/es.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst
index 5b3f88cf0..6efc4abb8 100644
--- a/doc/source/rllib-models.rst
+++ b/doc/source/rllib-models.rst
@@ -17,6 +17,15 @@ In addition, if you set ``"model": {"use_lstm": true}``, then the model output w
 
 For preprocessors, RLlib tries to pick one of its built-in preprocessor based on the environment's observation space. Discrete observations are one-hot encoded, Atari observations downscaled, and Tuple observations flattened (there isn't native tuple support yet, but you can reshape the flattened observation in a custom model). Note that for Atari, RLlib defaults to using the `DeepMind preprocessors <https://github.com/ray-project/ray/blob/master/python/ray/rllib/env/atari_wrappers.py>`__, which are also used by the OpenAI baselines library.
 
+Built-in Model Parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following is a list of the built-in model hyperparameters:
+
+.. literalinclude:: ../../python/ray/rllib/models/catalog.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
 
 Custom Models
 -------------
diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst
index 6d3a142db..583a0ccae 100644
--- a/doc/source/rllib-training.rst
+++ b/doc/source/rllib-training.rst
@@ -37,26 +37,6 @@ with ``--env`` (any OpenAI gym environment including ones registered by the user
 can be used) and for choosing the algorithm with ``--run``
 (available options are ``PPO``, ``PG``, ``A2C``, ``A3C``, ``IMPALA``, ``ES``, ``DDPG``, ``DQN``, ``APEX``, and ``APEX_DDPG``).
 
-Specifying Parameters
-~~~~~~~~~~~~~~~~~~~~~
-
-Each algorithm has specific hyperparameters that can be set with ``--config``, in addition to a number of `common hyperparameters <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/agent.py>`__. See the
-`algorithms documentation <rllib-algorithms.html>`__ for more information.
-
-In an example below, we train A2C by specifying 8 workers through the config flag. We also set ``"monitor": true`` to save episode videos to the result dir:
-
-.. code-block:: bash
-
-    python ray/python/ray/rllib/train.py --env=PongDeterministic-v4 \
-        --run=A2C --config '{"num_workers": 8, "monitor": true}'
-
-.. image:: rllib-config.svg
-
-Specifying Resources
-~~~~~~~~~~~~~~~~~~~~
-
-You can control the degree of parallelism used by setting the ``num_workers`` hyperparameter for most agents. Many agents also provide a ``num_gpus`` or ``gpu`` option. In addition, you can allocate a fraction of a GPU by setting ``gpu_fraction: f``. For example, with DQN you can pack five agents onto one GPU by setting ``gpu_fraction: 0.2``. Note that fractional GPU support requires enabling the experimental Xray backend by setting the environment variable ``RAY_USE_XRAY=1``.
-
 Evaluating Trained Agents
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -77,6 +57,39 @@ The ``rollout.py`` helper script reconstructs a DQN agent from the checkpoint
 located at ``~/ray_results/default/DQN_CartPole-v0_0upjmdgr0/checkpoint-1``
 and renders its behavior in the environment specified by ``--env``.
 
+Configuration
+-------------
+
+Specifying Parameters
+~~~~~~~~~~~~~~~~~~~~~
+
+Each algorithm has specific hyperparameters that can be set with ``--config``, in addition to a number of `common hyperparameters <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/agent.py>`__. See the
+`algorithms documentation <rllib-algorithms.html>`__ for more information.
+
+In the example below, we train A2C by specifying 8 workers through the config flag. We also set ``"monitor": true`` to save episode videos to the result dir:
+
+.. code-block:: bash
+
+    python ray/python/ray/rllib/train.py --env=PongDeterministic-v4 \
+        --run=A2C --config '{"num_workers": 8, "monitor": true}'
+
+.. image:: rllib-config.svg
+
+Specifying Resources
+~~~~~~~~~~~~~~~~~~~~
+
+You can control the degree of parallelism used by setting the ``num_workers`` hyperparameter for most agents. Many agents also provide a ``num_gpus`` or ``gpu`` option. In addition, you can allocate a fraction of a GPU by setting ``gpu_fraction: f``. For example, with DQN you can pack five agents onto one GPU by setting ``gpu_fraction: 0.2``. Note that fractional GPU support requires enabling the experimental Xray backend by setting the environment variable ``RAY_USE_XRAY=1``.
+
+Common Parameters
+~~~~~~~~~~~~~~~~~
+
+The following is a list of the common agent hyperparameters:
+
+.. literalinclude:: ../../python/ray/rllib/agents/agent.py
+   :language: python
+   :start-after: __sphinx_doc_begin__
+   :end-before: __sphinx_doc_end__
+
 Tuned Examples
 ~~~~~~~~~~~~~~
 
@@ -154,7 +167,7 @@ Tune will schedule the trials to run in parallel on your Ray cluster:
     == Status ==
     Using FIFO scheduling algorithm.
     Resources requested: 4/4 CPUs, 0/0 GPUs
-    Result logdir: /home/eric/ray_results/my_experiment
+    Result logdir: ~/ray_results/my_experiment
     PENDING trials:
      - PPO_CartPole-v0_2_sgd_stepsize=0.0001:	PENDING
     RUNNING trials:
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index ba011d08c..8979e702a 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -27,6 +27,7 @@ You might also want to clone the Ray repo for convenient access to RLlib helper
 Training APIs
 -------------
 * `Command-line <rllib-training.html>`__
+* `Configuration <rllib-training.html#configuration>`__
 * `Python API <rllib-training.html#python-api>`__
 * `REST API <rllib-training.html#rest-api>`__
 
diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py
index afda95062..55f179ade 100644
--- a/python/ray/rllib/agents/a3c/a3c.py
+++ b/python/ray/rllib/agents/a3c/a3c.py
@@ -10,6 +10,7 @@ from ray.rllib.optimizers import AsyncGradientsOptimizer
 from ray.rllib.utils import merge_dicts
 from ray.tune.trial import Resources
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     # Size of rollout batch
     "sample_batch_size": 10,
@@ -34,31 +35,10 @@ DEFAULT_CONFIG = with_common_config({
     # Workers sample async. Note that this increases the effective
     # sample_batch_size by up to 5x due to async buffering of batches.
     "sample_async": True,
-    # Model and preprocessor options
-    "model": {
-        # Use LSTM model. Requires TF.
-        "use_lstm": False,
-        # Max seq length for LSTM training.
-        "max_seq_len": 20,
-        # (Image statespace) - Converts image to Channels = 1
-        "grayscale": True,
-        # (Image statespace) - Each pixel
-        "zero_mean": False,
-        # (Image statespace) - Converts image to (dim, dim, C)
-        "dim": 84,
-        # (Image statespace) - Converts image shape to (C, dim, dim)
-        "channel_major": False,
-    },
-    # Configure TF for single-process operation
-    "tf_session_args": {
-        "intra_op_parallelism_threads": 1,
-        "inter_op_parallelism_threads": 1,
-        "gpu_options": {
-            "allow_growth": True,
-        },
-    },
 })
 
+# __sphinx_doc_end__
+
 
 class A3CAgent(Agent):
     """A3C implementations in TensorFlow and PyTorch."""
diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py
index 28b423417..01defdbc3 100644
--- a/python/ray/rllib/agents/agent.py
+++ b/python/ray/rllib/agents/agent.py
@@ -10,6 +10,7 @@ from datetime import datetime
 import tensorflow as tf
 
 import ray
+from ray.rllib.models import MODEL_DEFAULTS
 from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator
 from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
 from ray.rllib.utils import FilterManager, deep_update, merge_dicts
@@ -18,10 +19,11 @@ from ray.tune.trainable import Trainable
 from ray.tune.logger import UnifiedLogger
 from ray.tune.result import DEFAULT_RESULTS_DIR
 
+# __sphinx_doc_begin__
 COMMON_CONFIG = {
     # Discount factor of the MDP
     "gamma": 0.99,
-    # Number of steps after which the rollout gets cut
+    # Number of steps after which the episode is forced to terminate
     "horizon": None,
     # Number of environments to evaluate vectorwise per worker.
     "num_envs_per_worker": 1,
@@ -36,7 +38,7 @@ COMMON_CONFIG = {
     "batch_mode": "truncate_episodes",
     # Whether to use a background thread for sampling (slightly off-policy)
     "sample_async": False,
-    # Which observation filter to apply to the observation
+    # Element-wise observation filter, either "NoFilter" or "MeanStdFilter"
     "observation_filter": "NoFilter",
     # Whether to synchronize the statistics of remote filters.
     "synchronize_filters": True,
@@ -50,14 +52,12 @@ COMMON_CONFIG = {
     # Environment name can also be passed via config
     "env": None,
     # Arguments to pass to model
-    "model": {
-        "use_lstm": False,
-        "max_seq_len": 20,
-    },
-    # Arguments to pass to the rllib optimizer
+    "model": MODEL_DEFAULTS,
+    # Arguments to pass to the policy optimizer. These vary by optimizer.
     "optimizer": {},
     # Configure TF for single-process operation by default
     "tf_session_args": {
+        # note: parallelism_threads is set to auto for the local evaluator
         "intra_op_parallelism_threads": 1,
         "inter_op_parallelism_threads": 1,
         "gpu_options": {
@@ -88,6 +88,8 @@ COMMON_CONFIG = {
     },
 }
 
+# __sphinx_doc_end__
+
 
 def with_common_config(extra_config):
     """Returns the given config dict merged with common agent confs."""
diff --git a/python/ray/rllib/agents/ars/ars.py b/python/ray/rllib/agents/ars/ars.py
index 5984e2e01..27c6fe879 100644
--- a/python/ray/rllib/agents/ars/ars.py
+++ b/python/ray/rllib/agents/ars/ars.py
@@ -24,6 +24,7 @@ Result = namedtuple("Result", [
     "eval_returns", "eval_lengths"
 ])
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     "noise_stdev": 0.02,  # std deviation of parameter noise
     "num_rollouts": 32,  # number of perturbs to try
@@ -34,9 +35,9 @@ DEFAULT_CONFIG = with_common_config({
     "noise_size": 250000000,
     "eval_prob": 0.03,  # probability of evaluating the parameter rewards
     "report_length": 10,  # how many of the last rewards we average over
-    "env_config": {},
     "offset": 0,
 })
+# __sphinx_doc_end__
 
 
 @ray.remote
diff --git a/python/ray/rllib/agents/ddpg/apex.py b/python/ray/rllib/agents/ddpg/apex.py
index c2276d0a9..c9053ca8a 100644
--- a/python/ray/rllib/agents/ddpg/apex.py
+++ b/python/ray/rllib/agents/ddpg/apex.py
@@ -7,7 +7,7 @@ from ray.rllib.utils import merge_dicts
 from ray.tune.trial import Resources
 
 APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
-    DDPG_CONFIG,
+    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
     {
         "optimizer_class": "AsyncReplayOptimizer",
         "optimizer": merge_dicts(
diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py
index b475e297a..c35fdaa71 100644
--- a/python/ray/rllib/agents/ddpg/ddpg.py
+++ b/python/ray/rllib/agents/ddpg/ddpg.py
@@ -13,6 +13,7 @@ OPTIMIZER_SHARED_CONFIGS = [
     "train_batch_size", "learning_starts"
 ]
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     # === Model ===
     # Hidden layer sizes of the policy network
@@ -108,6 +109,8 @@ DEFAULT_CONFIG = with_common_config({
     "min_iter_time_s": 1,
 })
 
+# __sphinx_doc_end__
+
 
 class DDPGAgent(DQNAgent):
     """DDPG implementation in TensorFlow."""
diff --git a/python/ray/rllib/agents/dqn/apex.py b/python/ray/rllib/agents/dqn/apex.py
index e6058b41f..052d0fd3e 100644
--- a/python/ray/rllib/agents/dqn/apex.py
+++ b/python/ray/rllib/agents/dqn/apex.py
@@ -6,8 +6,9 @@ from ray.rllib.agents.dqn.dqn import DQNAgent, DEFAULT_CONFIG as DQN_CONFIG
 from ray.rllib.utils import merge_dicts
 from ray.tune.trial import Resources
 
+# __sphinx_doc_begin__
 APEX_DEFAULT_CONFIG = merge_dicts(
-    DQN_CONFIG,
+    DQN_CONFIG,  # see also the options in dqn.py, which are also supported
     {
         "optimizer_class": "AsyncReplayOptimizer",
         "optimizer": merge_dicts(
@@ -31,6 +32,8 @@ APEX_DEFAULT_CONFIG = merge_dicts(
     },
 )
 
+# __sphinx_doc_end__
+
 
 class ApexAgent(DQNAgent):
     """DQN variant that uses the Ape-X distributed policy optimizer.
diff --git a/python/ray/rllib/agents/dqn/common/wrappers.py b/python/ray/rllib/agents/dqn/common/wrappers.py
index eb6a6c0d5..97f839abf 100644
--- a/python/ray/rllib/agents/dqn/common/wrappers.py
+++ b/python/ray/rllib/agents/dqn/common/wrappers.py
@@ -13,7 +13,7 @@ def wrap_dqn(env, options):
 
     # Override atari default to use the deepmind wrappers.
     # TODO(ekl) this logic should be pushed to the catalog.
-    if is_atari and "custom_preprocessor" not in options:
+    if is_atari and not options.get("custom_preprocessor"):
         return wrap_deepmind(env, dim=options.get("dim", 84))
 
     return ModelCatalog.get_preprocessor_as_wrapper(env, options)
diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py
index 25320fd6a..f86b286ce 100644
--- a/python/ray/rllib/agents/dqn/dqn.py
+++ b/python/ray/rllib/agents/dqn/dqn.py
@@ -20,6 +20,7 @@ OPTIMIZER_SHARED_CONFIGS = [
     "learning_starts"
 ]
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     # === Model ===
     # Number of atoms for representing the distribution of return. When
@@ -116,6 +117,8 @@ DEFAULT_CONFIG = with_common_config({
     "min_iter_time_s": 1,
 })
 
+# __sphinx_doc_end__
+
 
 class DQNAgent(Agent):
     """DQN implementation in TensorFlow."""
diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py
index 392f98f1d..d5526c0da 100644
--- a/python/ray/rllib/agents/es/es.py
+++ b/python/ray/rllib/agents/es/es.py
@@ -24,6 +24,7 @@ Result = namedtuple("Result", [
     "eval_returns", "eval_lengths"
 ])
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     "l2_coeff": 0.005,
     "noise_stdev": 0.02,
@@ -36,10 +37,8 @@ DEFAULT_CONFIG = with_common_config({
     "observation_filter": "MeanStdFilter",
     "noise_size": 250000000,
     "report_length": 10,
-    "env": None,
-    "env_config": {},
-    "model": {},
 })
+# __sphinx_doc_end__
 
 
 @ray.remote
@@ -77,7 +76,8 @@ class Worker(object):
 
         self.env = env_creator(config["env_config"])
         from ray.rllib import models
-        self.preprocessor = models.ModelCatalog.get_preprocessor(self.env)
+        self.preprocessor = models.ModelCatalog.get_preprocessor(
+            self.env, config["model"])
 
         self.sess = utils.make_session(single_threaded=True)
         self.policy = policies.GenericPolicy(
diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py
index 1ad2b673f..a303643f5 100644
--- a/python/ray/rllib/agents/impala/impala.py
+++ b/python/ray/rllib/agents/impala/impala.py
@@ -23,6 +23,7 @@ OPTIMIZER_SHARED_CONFIGS = [
     "max_sample_requests_in_flight_per_worker",
 ]
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     # V-trace params (see vtrace.py).
     "vtrace": True,
@@ -63,15 +64,10 @@ DEFAULT_CONFIG = with_common_config({
     # balancing the three losses
     "vf_loss_coeff": 0.5,
     "entropy_coeff": -0.01,
-
-    # Model and preprocessor options.
-    "model": {
-        "use_lstm": False,
-        "max_seq_len": 20,
-        "dim": 84,
-    },
 })
 
+# __sphinx_doc_end__
+
 
 class ImpalaAgent(Agent):
     """IMPALA implementation using DeepMind's V-trace."""
diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py
index e1766e774..edc24ca1b 100644
--- a/python/ray/rllib/agents/pg/pg.py
+++ b/python/ray/rllib/agents/pg/pg.py
@@ -8,20 +8,16 @@ from ray.rllib.optimizers import SyncSamplesOptimizer
 from ray.rllib.utils import merge_dicts
 from ray.tune.trial import Resources
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     # No remote workers by default
     "num_workers": 0,
     # Learning rate
     "lr": 0.0004,
-    # Override model config
-    "model": {
-        # Use LSTM model.
-        "use_lstm": False,
-        # Max seq length for LSTM training.
-        "max_seq_len": 20,
-    },
 })
 
+# __sphinx_doc_end__
+
 
 class PGAgent(Agent):
     """Simple policy gradient agent.
diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py
index d2a991929..ea09dfe59 100644
--- a/python/ray/rllib/agents/ppo/ppo.py
+++ b/python/ray/rllib/agents/ppo/ppo.py
@@ -8,6 +8,7 @@ from ray.rllib.utils import merge_dicts
 from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer
 from ray.tune.trial import Resources
 
+# __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
     # If true, use the Generalized Advantage Estimator (GAE)
     # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
@@ -53,15 +54,10 @@ DEFAULT_CONFIG = with_common_config({
     "observation_filter": "MeanStdFilter",
     # Use the sync samples optimizer instead of the multi-gpu one
     "simple_optimizer": False,
-    # Override model config
-    "model": {
-        # Whether to use LSTM model
-        "use_lstm": False,
-        # Max seq length for LSTM training.
-        "max_seq_len": 20,
-    },
 })
 
+# __sphinx_doc_end__
+
 
 class PPOAgent(Agent):
     """Multi-GPU optimized implementation of PPO in TensorFlow."""
diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py
index ac276ee3a..548b65806 100644
--- a/python/ray/rllib/evaluation/policy_evaluator.py
+++ b/python/ray/rllib/evaluation/policy_evaluator.py
@@ -187,7 +187,7 @@ class PolicyEvaluator(EvaluatorInterface):
             def wrap(env):
                 return env  # we can't auto-wrap these env types
         elif is_atari(self.env) and \
-                "custom_preprocessor" not in model_config and \
+                not model_config.get("custom_preprocessor") and \
                 preprocessor_pref == "deepmind":
 
             if clip_rewards is None:
@@ -196,9 +196,8 @@ class PolicyEvaluator(EvaluatorInterface):
             def wrap(env):
                 env = wrap_deepmind(
                     env,
-                    dim=model_config.get("dim", 84),
-                    framestack=not model_config.get("use_lstm")
-                    and not model_config.get("no_framestack"))
+                    dim=model_config.get("dim"),
+                    framestack=model_config.get("framestack"))
                 if monitor_path:
                     env = _monitor(env, monitor_path)
                 return env
diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py
index ddfdd16b8..52e47e807 100644
--- a/python/ray/rllib/models/__init__.py
+++ b/python/ray/rllib/models/__init__.py
@@ -1,4 +1,4 @@
-from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.models.catalog import ModelCatalog, MODEL_DEFAULTS
 from ray.rllib.models.action_dist import (ActionDistribution, Categorical,
                                           DiagGaussian, Deterministic)
 from ray.rllib.models.model import Model
@@ -7,6 +7,14 @@ from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.lstm import LSTM
 
 __all__ = [
-    "ActionDistribution", "Categorical", "DiagGaussian", "Deterministic",
-    "ModelCatalog", "Model", "Preprocessor", "FullyConnectedNetwork", "LSTM"
+    "ActionDistribution",
+    "Categorical",
+    "DiagGaussian",
+    "Deterministic",
+    "ModelCatalog",
+    "Model",
+    "Preprocessor",
+    "FullyConnectedNetwork",
+    "LSTM",
+    "MODEL_DEFAULTS",
 ]
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index 370429c43..d2038f55f 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -18,29 +18,52 @@ from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.visionnet import VisionNetwork
 from ray.rllib.models.lstm import LSTM
 
-MODEL_CONFIGS = [
+# __sphinx_doc_begin__
+MODEL_DEFAULTS = {
     # === Built-in options ===
     # Filter config. List of [out_channels, kernel, stride] for each filter
-    "conv_filters",
-    "conv_activation",  # Nonlinearity for built-in convnet
-    "fcnet_activation",  # Nonlinearity for fully connected net (tanh, relu)
-    "fcnet_hiddens",  # Number of hidden layers for fully connected net
-    "dim",  # Dimension for ATARI
-    "grayscale",  # Converts ATARI frame to 1 Channel Grayscale image
-    "zero_mean",  # Changes frame to range from [-1, 1] if true
-    "extra_frameskip",  # (int) for number of frames to skip
-    "free_log_std",  # Documented in ray.rllib.models.Model
-    "channel_major",  # Pytorch conv requires images to be channel-major
-    "squash_to_range",  # Whether to squash the action output to space range
-    "use_lstm",  # Whether to wrap the model with a LSTM
-    "max_seq_len",  # Max seq len for training the LSTM, defaults to 20
-    "lstm_cell_size",  # Size of the LSTM cell
+    "conv_filters": None,
+    # Nonlinearity for built-in convnet
+    "conv_activation": "relu",
+    # Nonlinearity for fully connected net (tanh, relu)
+    "fcnet_activation": "tanh",
+    # Size of each hidden layer for fully connected net (one entry per layer)
+    "fcnet_hiddens": [256, 256],
+    # For control envs, documented in ray.rllib.models.Model
+    "free_log_std": False,
+    # Whether to squash the action output to space range
+    "squash_to_range": False,
+
+    # == LSTM ==
+    # Whether to wrap the model with a LSTM
+    "use_lstm": False,
+    # Max seq len for training the LSTM, defaults to 20
+    "max_seq_len": 20,
+    # Size of the LSTM cell
+    "lstm_cell_size": 256,
+
+    # == Atari ==
+    # Whether to enable framestack for Atari envs
+    "framestack": True,
+    # Final resized frame dimension
+    "dim": 84,
+    # Pytorch conv requires images to be channel-major
+    "channel_major": False,
+    # (deprecated) Converts ATARI frame to 1 Channel Grayscale image
+    "grayscale": False,
+    # (deprecated) Changes frame to range from [-1, 1] if true
+    "zero_mean": True,
 
     # === Options for custom models ===
-    "custom_preprocessor",  # Name of a custom preprocessor to use
-    "custom_model",  # Name of a custom model to use
-    "custom_options",  # Extra options to pass to the custom classes
-]
+    # Name of a custom preprocessor to use
+    "custom_preprocessor": None,
+    # Name of a custom model to use
+    "custom_model": None,
+    # Extra options to pass to the custom classes
+    "custom_options": {},
+}
+
+# __sphinx_doc_end__
 
 
 class ModelCatalog(object):
@@ -71,10 +94,7 @@ class ModelCatalog(object):
             dist_dim (int): The size of the input vector to the distribution.
         """
 
-        # TODO(ekl) are list spaces valid?
-        if isinstance(action_space, list):
-            action_space = gym.spaces.Tuple(action_space)
-        config = config or {}
+        config = config or MODEL_DEFAULTS
         if isinstance(action_space, gym.spaces.Box):
             if dist_type is None:
                 dist = DiagGaussian
@@ -82,7 +102,7 @@ class ModelCatalog(object):
                     dist = squash_to_range(dist, action_space.low,
                                            action_space.high)
                 return dist, action_space.shape[0] * 2
-            elif dist_type == 'deterministic':
+            elif dist_type == "deterministic":
                 return Deterministic, action_space.shape[0]
         elif isinstance(action_space, gym.spaces.Discrete):
             return Categorical, action_space.n
@@ -154,6 +174,7 @@ class ModelCatalog(object):
             model (Model): Neural network model.
         """
 
+        options = options or MODEL_DEFAULTS
         model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
                                         seq_lens)
 
@@ -165,7 +186,7 @@ class ModelCatalog(object):
 
     @staticmethod
     def _get_model(inputs, num_outputs, options, state_in, seq_lens):
-        if "custom_model" in options:
+        if options.get("custom_model"):
             model = options["custom_model"]
             print("Using custom model {}".format(model))
             return _global_registry.get(RLLIB_MODEL, model)(
@@ -183,7 +204,7 @@ class ModelCatalog(object):
         return FullyConnectedNetwork(inputs, num_outputs, options)
 
     @staticmethod
-    def get_torch_model(input_shape, num_outputs, options={}):
+    def get_torch_model(input_shape, num_outputs, options=None):
         """Returns a PyTorch suitable model. This is currently only supported
         in A3C.
 
@@ -200,7 +221,8 @@ class ModelCatalog(object):
         from ray.rllib.models.pytorch.visionnet import (VisionNetwork as
                                                         PyTorchVisionNet)
 
-        if "custom_model" in options:
+        options = options or MODEL_DEFAULTS
+        if options.get("custom_model"):
             model = options["custom_model"]
             print("Using custom torch model {}".format(model))
             return _global_registry.get(RLLIB_MODEL, model)(
@@ -217,7 +239,7 @@ class ModelCatalog(object):
         return PyTorchFCNet(input_shape[0], num_outputs, options)
 
     @staticmethod
-    def get_preprocessor(env, options={}):
+    def get_preprocessor(env, options=None):
         """Returns a suitable processor for the given environment.
 
         Args:
@@ -227,12 +249,13 @@ class ModelCatalog(object):
         Returns:
             preprocessor (Preprocessor): Preprocessor for the env observations.
         """
+        options = options or MODEL_DEFAULTS
         for k in options.keys():
-            if k not in MODEL_CONFIGS:
+            if k not in MODEL_DEFAULTS:
                 raise Exception("Unknown config key `{}`, all keys: {}".format(
-                    k, MODEL_CONFIGS))
+                    k, list(MODEL_DEFAULTS)))
 
-        if "custom_preprocessor" in options:
+        if options.get("custom_preprocessor"):
             preprocessor = options["custom_preprocessor"]
             print("Using custom preprocessor {}".format(preprocessor))
             return _global_registry.get(RLLIB_PREPROCESSOR, preprocessor)(
@@ -242,7 +265,7 @@ class ModelCatalog(object):
         return preprocessor(env.observation_space, options)
 
     @staticmethod
-    def get_preprocessor_as_wrapper(env, options={}):
+    def get_preprocessor_as_wrapper(env, options=None):
         """Returns a preprocessor as a gym observation wrapper.
 
         Args:
@@ -253,6 +276,7 @@ class ModelCatalog(object):
             wrapper (gym.ObservationWrapper): Preprocessor in wrapper form.
         """
 
+        options = options or MODEL_DEFAULTS
         preprocessor = ModelCatalog.get_preprocessor(env, options)
         return _RLlibPreprocessorWrapper(env, preprocessor)
 
diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py
index 11aee2c0d..e703fb0a0 100644
--- a/python/ray/rllib/models/fcnet.py
+++ b/python/ray/rllib/models/fcnet.py
@@ -13,8 +13,8 @@ class FullyConnectedNetwork(Model):
     """Generic fully connected network."""
 
     def _build_layers(self, inputs, num_outputs, options):
-        hiddens = options.get("fcnet_hiddens", [256, 256])
-        activation = get_activation_fn(options.get("fcnet_activation", "tanh"))
+        hiddens = options.get("fcnet_hiddens")
+        activation = get_activation_fn(options.get("fcnet_activation"))
 
         with tf.name_scope("fc_net"):
             i = 1
diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py
index 581569f0e..b8dea3ede 100644
--- a/python/ray/rllib/models/lstm.py
+++ b/python/ray/rllib/models/lstm.py
@@ -135,7 +135,7 @@ class LSTM(Model):
     """
 
     def _build_layers(self, inputs, num_outputs, options):
-        cell_size = options.get("lstm_cell_size", 256)
+        cell_size = options.get("lstm_cell_size")
         last_layer = add_time_dimension(inputs, self.seq_lens)
 
         # Setup the LSTM cell
diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py
index 00d6575e6..168f29c74 100644
--- a/python/ray/rllib/models/model.py
+++ b/python/ray/rllib/models/model.py
@@ -55,7 +55,7 @@ class Model(object):
             self.seq_lens = tf.placeholder(
                 dtype=tf.int32, shape=[None], name="seq_lens")
 
-        if options.get("free_log_std", False):
+        if options.get("free_log_std"):
             assert num_outputs % 2 == 0
             num_outputs = num_outputs // 2
         self.outputs, self.last_layer = self._build_layers(
diff --git a/python/ray/rllib/models/preprocessors.py b/python/ray/rllib/models/preprocessors.py
index c400dd980..cd72d1922 100644
--- a/python/ray/rllib/models/preprocessors.py
+++ b/python/ray/rllib/models/preprocessors.py
@@ -30,12 +30,18 @@ class Preprocessor(object):
         raise NotImplementedError
 
 
-class AtariPixelPreprocessor(Preprocessor):
+class GenericPixelPreprocessor(Preprocessor):
+    """Generic image preprocessor.
+
+    Note: for Atari games, use config {"preprocessor_pref": "deepmind"}
+    instead for deepmind-style Atari preprocessing.
+    """
+
     def _init(self):
-        self._grayscale = self._options.get("grayscale", False)
-        self._zero_mean = self._options.get("zero_mean", True)
-        self._dim = self._options.get("dim", 84)
-        self._channel_major = self._options.get("channel_major", False)
+        self._grayscale = self._options.get("grayscale")
+        self._zero_mean = self._options.get("zero_mean")
+        self._dim = self._options.get("dim")
+        self._channel_major = self._options.get("channel_major")
         if self._grayscale:
             self.shape = (self._dim, self._dim, 1)
         else:
@@ -130,7 +136,7 @@ def get_preprocessor(space):
     if isinstance(space, gym.spaces.Discrete):
         preprocessor = OneHotPreprocessor
     elif obs_shape == ATARI_OBS_SHAPE:
-        preprocessor = AtariPixelPreprocessor
+        preprocessor = GenericPixelPreprocessor
     elif obs_shape == ATARI_RAM_OBS_SHAPE:
         preprocessor = AtariRamPreprocessor
     elif isinstance(space, gym.spaces.Tuple):
diff --git a/python/ray/rllib/models/pytorch/visionnet.py b/python/ray/rllib/models/pytorch/visionnet.py
index 94ac8291d..e54c51897 100644
--- a/python/ray/rllib/models/pytorch/visionnet.py
+++ b/python/ray/rllib/models/pytorch/visionnet.py
@@ -18,11 +18,11 @@ class VisionNetwork(Model):
             inputs (tuple): (channels, rows/height, cols/width)
             num_outputs (int): logits size
         """
-        filters = options.get("conv_filters", [
+        filters = options.get("conv_filters") or [
             [16, [8, 8], 4],
             [32, [4, 4], 2],
             [512, [11, 11], 1],
-        ])
+        ]
         layers = []
         in_channels, in_size = inputs[0], inputs[1:]
 
diff --git a/python/ray/rllib/models/visionnet.py b/python/ray/rllib/models/visionnet.py
index 805d2e9e5..902addb6a 100644
--- a/python/ray/rllib/models/visionnet.py
+++ b/python/ray/rllib/models/visionnet.py
@@ -17,7 +17,7 @@ class VisionNetwork(Model):
         if not filters:
             filters = get_filter_config(options)
 
-        activation = get_activation_fn(options.get("conv_activation", "relu"))
+        activation = get_activation_fn(options.get("conv_activation"))
 
         with tf.name_scope("vision_net"):
             for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
@@ -57,7 +57,7 @@ def get_filter_config(options):
         [32, [4, 4], 2],
         [256, [11, 11], 1],
     ]
-    dim = options.get("dim", 84)
+    dim = options.get("dim")
     if dim == 84:
         return filters_84x84
     elif dim == 42:
diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py
index 16bdd485f..20ef872ae 100644
--- a/python/ray/rllib/test/test_supported_spaces.py
+++ b/python/ray/rllib/test/test_supported_spaces.py
@@ -19,10 +19,6 @@ ACTION_SPACES_TO_TEST = {
         Box(0.0, 1.0, (5, ), dtype=np.float32),
         Box(0.0, 1.0, (5, ), dtype=np.float32)
     ]),
-    "implicit_tuple": [
-        Box(0.0, 1.0, (5, ), dtype=np.float32),
-        Box(0.0, 1.0, (5, ), dtype=np.float32)
-    ],
     "mixed_tuple": Tuple(
         [Discrete(2),
          Discrete(3),