From 5cebee68d681bebfd59255b811338d39e2cc2e7d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Fri, 27 Mar 2020 22:05:43 -0700
Subject: [PATCH] [rllib] Add scaling guide to documentation, improve bandit
 docs (#7780)

* update

* reword

* update

* ms

* multi node sgd

* reorder

* improve bandit docs

* contrib

* update

* ref

* improve refs

* fix build

* add pillow dep

* add pil

* update pil

* pillow

* remove false
---
 doc/requirements-doc.txt                |   1 +
 doc/source/conf.py                      |   1 +
 doc/source/rllib-algorithms.rst         |  51 +++++++++--
 doc/source/rllib-env.rst                |   6 +-
 doc/source/rllib-toc.rst                | 113 +++++++++++++-----------
 doc/source/rllib-training.rst           |  16 ++++
 doc/source/rllib.rst                    |  14 ++-
 rllib/contrib/bandits/agents/lin_ts.py  |   1 -
 rllib/contrib/bandits/agents/lin_ucb.py |   1 -
 9 files changed, 134 insertions(+), 70 deletions(-)
diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt
index ba9167335..3066e2d8b 100644
--- a/doc/requirements-doc.txt
+++ b/doc/requirements-doc.txt
@@ -10,6 +10,7 @@ numpy
 opencv-python-headless
 pandas
 pickle5
+pillow
 pygments
 pyyaml
 recommonmark
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 1a1e1d9ef..6743157ff 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -71,6 +71,7 @@ sphinx_gallery_conf = {
     "ignore_pattern": "../examples/doc_code/",
     "plot_gallery": "False",
     # "filename_pattern": "tutorial.py",
+    # "backreferences_dir": "False",
     # "show_memory': False,
     # 'min_reported_time': False
 }
diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst
index a7df0b0fd..991a81d2f 100644
--- a/doc/source/rllib-algorithms.rst
+++ b/doc/source/rllib-algorithms.rst
@@ -33,6 +33,8 @@ MARWIL          **Yes** `+parametric`_  **Yes**             **Yes**      `+RNN`_
 High-throughput architectures
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+.. _apex:
+
 Distributed Prioritized Experience Replay (Ape-X)
 -------------------------------------------------
 |tensorflow|
@@ -79,6 +81,8 @@ SpaceInvaders  646                               ~300
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _impala:
+
 Importance Weighted Actor-Learner Architecture (IMPALA)
 -------------------------------------------------------
 |tensorflow|
@@ -126,6 +130,8 @@ SpaceInvaders  843                              ~300
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _appo:
+
 Asynchronous Proximal Policy Optimization (APPO)
 ------------------------------------------------
 |tensorflow|
@@ -135,7 +141,7 @@ We include an asynchronous variant of Proximal Policy Optimization (PPO) based o
 
 .. tip::
 
-    APPO is not always more efficient; it is often better to use `standard PPO <rllib-algorithms.html#proximal-policy-optimization-ppo>`__ or `IMPALA <rllib-algorithms.html#importance-weighted-actor-learner-architecture-impala>`__.
+    APPO is not always more efficient; it is often better to use :ref:`standard PPO <ppo>` or :ref:`IMPALA <impala>`.
 
 .. figure:: impala-arch.svg
 
@@ -150,6 +156,8 @@ Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/mas
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _ddppo:
+
 Decentralized Distributed Proximal Policy Optimization (DD-PPO)
 ---------------------------------------------------------------
 |pytorch|
@@ -177,6 +185,8 @@ Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/rll
 Gradient-based
 ~~~~~~~~~~~~~~
 
+.. _a3c:
+
 Advantage Actor-Critic (A2C, A3C)
 ---------------------------------
 |pytorch| |tensorflow|
@@ -212,6 +222,8 @@ SpaceInvaders  692                       ~600
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _ddpg:
+
 Deep Deterministic Policy Gradients (DDPG, TD3)
 -----------------------------------------------
 |tensorflow|
@@ -231,6 +243,8 @@ Tuned examples: `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rll
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _dqn:
+
 Deep Q Networks (DQN, Rainbow, Parametric DQN)
 ----------------------------------------------
 |tensorflow|
@@ -274,6 +288,8 @@ SpaceInvaders  650                       1001                           1025
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _pg:
+
 Policy Gradients
 ----------------
 |pytorch| |tensorflow|
@@ -292,6 +308,8 @@ Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/rll
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _ppo:
+
 Proximal Policy Optimization (PPO)
 ----------------------------------
 |pytorch| |tensorflow|
@@ -341,6 +359,8 @@ HalfCheetah    9664                       ~7700
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _sac:
+
 Soft Actor Critic (SAC)
 ------------------------
 |tensorflow|
@@ -372,6 +392,8 @@ HalfCheetah    13000       ~15000
 Derivative-free
 ~~~~~~~~~~~~~~~
 
+.. _ars:
+
 Augmented Random Search (ARS)
 -----------------------------
 |tensorflow|
@@ -387,6 +409,8 @@ Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/rll
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _es:
+
 Evolution Strategies
 --------------------
 |tensorflow|
@@ -409,6 +433,8 @@ Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rll
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _qmix:
+
 QMIX Monotonic Value Factorisation (QMIX, VDN, IQN)
 ---------------------------------------------------
 |pytorch|
@@ -423,6 +449,8 @@ Tuned examples: `Two-step game <https://github.com/ray-project/ray/blob/master/r
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _maddpg:
+
 Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG)
 ---------------------------------------------------------------
 |tensorflow|
@@ -437,6 +465,8 @@ Tuned examples: `Multi-Agent Particle Environment <https://github.com/wsjeon/mad
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _marwil:
+
 Advantage Re-Weighted Imitation Learning (MARWIL)
 -------------------------------------------------
 |tensorflow|
@@ -451,6 +481,8 @@ Tuned examples: `CartPole-v0 <https://github.com/ray-project/ray/blob/master/rll
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _alphazero:
+
 Single-Player Alpha Zero (contrib/AlphaZero)
 --------------------------------------------
 |pytorch|
@@ -485,8 +517,10 @@ model) and an exploration strategy (e-greedy, UCB, Thompson Sampling etc.)
 RLlib supports the following online contextual bandit algorithms,
 named after the exploration strategies that they employ:
 
-LinUCB (Upper Confidence Bound)
--------------------------------
+.. _linucb:
+
+Linear Upper Confidence Bound (contrib/LinUCB)
+----------------------------------------------
 |pytorch|
 `[paper] <http://rob.schapire.net/papers/www10.pdf>`__ `[implementation]
 <https://github.com/ray-project/ray/blob/master/rllib/contrib/bandits/agents/lin_ucb.py>`__
@@ -496,6 +530,8 @@ It constructs a confidence region around the weights of the linear
 regression model and uses this confidence ellipsoid to estimate the
 uncertainty of action values.
 
+Tuned examples: `SimpleContextualBandit <https://github.com/ray-project/ray/blob/master/rllib/contrib/bandits/examples/simple_context_bandit.py>`__, `ParametricItemRecoEnv <https://github.com/ray-project/ray/blob/master/rllib/contrib/bandits/examples/tune_LinUCB_train_recommendation.py>`__.
+
 **LinUCB-specific configs** (see also `common configs <rllib-training
 .html#common-parameters>`__):
 
@@ -504,9 +540,10 @@ uncertainty of action values.
    :start-after: __sphinx_doc_begin__
    :end-before: __sphinx_doc_end__
 
+.. _lints:
 
-LinTS (Linear Thompson Sampling)
---------------------------------
+Linear Thompson Sampling (contrib/LinTS)
+----------------------------------------
 |pytorch|
 `[paper] <http://proceedings.mlr.press/v28/agrawal13.pdf>`__ `[implementation]
 <https://github.com/ray-project/ray/blob/master/rllib/contrib/bandits/agents/lin_ts.py>`__
@@ -517,6 +554,8 @@ prior on the weights and a Gaussian likelihood function. For deciding which
 action to take, the agent samples weights for each arm, using
 the posterior distributions, and plays the arm that produces the highest reward.
 
+Tuned examples: `SimpleContextualBandit <https://github.com/ray-project/ray/blob/master/rllib/contrib/bandits/examples/simple_context_bandit.py>`__, `WheelBandit <https://github.com/ray-project/ray/blob/master/rllib/contrib/bandits/examples/tune_LinTS_train_wheel_env.py>`__.
+
 **LinTS-specific configs** (see also `common configs <rllib-training
 .html#common-parameters>`__):
 
@@ -530,4 +569,4 @@ the posterior distributions, and plays the arm that produces the highest reward.
     :width: 24
 
 .. |pytorch| image:: pytorch.png
-    :width: 24
\ No newline at end of file
+    :width: 24
diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst
index 69e38cc5e..057daf270 100644
--- a/doc/source/rllib-env.rst
+++ b/doc/source/rllib-env.rst
@@ -80,6 +80,10 @@ RLlib uses Gym as its environment interface for single-agent training. For more
 Performance
 ~~~~~~~~~~~
 
+.. tip::
+
+    Also check out the `scaling guide <rllib-training.html#scaling-guide>`__ for RLlib training.
+
 There are two ways to scale experience collection with Gym environments:
 
     1. **Vectorization within a single process:** Though many envs can achieve high frame rates per core, their throughput is limited in practice by policy evaluation between steps. For example, even small TensorFlow models incur a couple milliseconds of latency to evaluate. This can be worked around by creating multiple envs per process and batching policy evaluations across these envs.
@@ -256,7 +260,7 @@ Alternatively, the env itself can be modified to share observations between agen
 Grouping Agents
 ~~~~~~~~~~~~~~~
 
-It is common to have groups of agents in multi-agent RL. RLlib treats agent groups like a single agent with a Tuple action and observation space. The group agent can then be assigned to a single policy for centralized execution, or to specialized multi-agent policies such as `Q-Mix <rllib-algorithms.html#qmix-monotonic-value-factorisation-qmix-vdn-iqn>`__ that implement centralized training but decentralized execution. You can use the ``MultiAgentEnv.with_agent_groups()`` method to define these groups:
+It is common to have groups of agents in multi-agent RL. RLlib treats agent groups like a single agent with a Tuple action and observation space. The group agent can then be assigned to a single policy for centralized execution, or to specialized multi-agent policies such as :ref:`Q-Mix <qmix>` that implement centralized training but decentralized execution. You can use the ``MultiAgentEnv.with_agent_groups()`` method to define these groups:
 
 .. literalinclude:: ../../rllib/env/multi_agent_env.py
    :language: python
diff --git a/doc/source/rllib-toc.rst b/doc/source/rllib-toc.rst
index 3f9949543..99f9bc9fc 100644
--- a/doc/source/rllib-toc.rst
+++ b/doc/source/rllib-toc.rst
@@ -15,6 +15,8 @@ Training APIs
 
    -  `Common Parameters <rllib-training.html#common-parameters>`__
 
+   -  `Scaling Guide <rllib-training.html#scaling-guide>`__
+
    -  `Tuned Examples <rllib-training.html#tuned-examples>`__
 
 *  `Basic Python API <rllib-training.html#basic-python-api>`__
@@ -57,79 +59,84 @@ Training APIs
 
 Environments
 ------------
-* `RLlib Environments Overview <rllib-env.html>`__
-* `Feature Compatibility Matrix <rllib-env.html#feature-compatibility-matrix>`__
-* `OpenAI Gym <rllib-env.html#openai-gym>`__
-* `Vectorized <rllib-env.html#vectorized>`__
-* `Multi-Agent and Hierarchical <rllib-env.html#multi-agent-and-hierarchical>`__
-* `External Agents and Applications <rllib-env.html#external-agents-and-applications>`__
+*  `RLlib Environments Overview <rllib-env.html>`__
+*  `Feature Compatibility Matrix <rllib-env.html#feature-compatibility-matrix>`__
+*  `OpenAI Gym <rllib-env.html#openai-gym>`__
+*  `Vectorized <rllib-env.html#vectorized>`__
+*  `Multi-Agent and Hierarchical <rllib-env.html#multi-agent-and-hierarchical>`__
+*  `External Agents and Applications <rllib-env.html#external-agents-and-applications>`__
 
    -  `External Application Clients <rllib-env.html#external-application-clients>`__
 
-* `Advanced Integrations <rllib-env.html#advanced-integrations>`__
+*  `Advanced Integrations <rllib-env.html#advanced-integrations>`__
 
 Models, Preprocessors, and Action Distributions
 -----------------------------------------------
-* `RLlib Models, Preprocessors, and Action Distributions Overview <rllib-models.html>`__
-* `TensorFlow Models <rllib-models.html#tensorflow-models>`__
-* `PyTorch Models <rllib-models.html#pytorch-models>`__
-* `Custom Preprocessors <rllib-models.html#custom-preprocessors>`__
-* `Custom Action Distributions <rllib-models.html#custom-action-distributions>`__
-* `Supervised Model Losses <rllib-models.html#supervised-model-losses>`__
-* `Self-Supervised Model Losses <rllib-models.html#self-supervised-model-losses>`__
-* `Variable-length / Parametric Action Spaces <rllib-models.html#variable-length-parametric-action-spaces>`__
-* `Autoregressive Action Distributions <rllib-models.html#autoregressive-action-distributions>`__
+*  `RLlib Models, Preprocessors, and Action Distributions Overview <rllib-models.html>`__
+*  `TensorFlow Models <rllib-models.html#tensorflow-models>`__
+*  `PyTorch Models <rllib-models.html#pytorch-models>`__
+*  `Custom Preprocessors <rllib-models.html#custom-preprocessors>`__
+*  `Custom Action Distributions <rllib-models.html#custom-action-distributions>`__
+*  `Supervised Model Losses <rllib-models.html#supervised-model-losses>`__
+*  `Self-Supervised Model Losses <rllib-models.html#self-supervised-model-losses>`__
+*  `Variable-length / Parametric Action Spaces <rllib-models.html#variable-length-parametric-action-spaces>`__
+*  `Autoregressive Action Distributions <rllib-models.html#autoregressive-action-distributions>`__
 
 Algorithms
 ----------
 
 *  High-throughput architectures
 
-   -  |tensorflow| `Distributed Prioritized Experience Replay (Ape-X) <rllib-algorithms.html#distributed-prioritized-experience-replay-ape-x>`__
+   -  |tensorflow| :ref:`Distributed Prioritized Experience Replay (Ape-X) <apex>`
 
-   -  |tensorflow| `Importance Weighted Actor-Learner Architecture (IMPALA) <rllib-algorithms.html#importance-weighted-actor-learner-architecture-impala>`__
+   -  |tensorflow| :ref:`Importance Weighted Actor-Learner Architecture (IMPALA) <impala>`
 
-   -  |tensorflow| `Asynchronous Proximal Policy Optimization (APPO) <rllib-algorithms.html#asynchronous-proximal-policy-optimization-appo>`__
+   -  |tensorflow| :ref:`Asynchronous Proximal Policy Optimization (APPO) <appo>`
 
-   -  |pytorch| `Decentralized Distributed Proximal Policy Optimization (DD-PPO) <rllib-algorithms.html#decentralized-distributed-proximal-policy-optimization-dd-ppo>`__
+   -  |pytorch| :ref:`Decentralized Distributed Proximal Policy Optimization (DD-PPO) <ddppo>`
 
-   -  |pytorch| `Single-Player AlphaZero (contrib/AlphaZero) <rllib-algorithms.html#single-player-alpha-zero-contrib-alphazero>`__
+   -  |pytorch| :ref:`Single-Player AlphaZero (contrib/AlphaZero) <alphazero>`
 
 *  Gradient-based
 
-   -  |pytorch| |tensorflow| `Advantage Actor-Critic (A2C, A3C) <rllib-algorithms.html#advantage-actor-critic-a2c-a3c>`__
+   -  |pytorch| |tensorflow| :ref:`Advantage Actor-Critic (A2C, A3C) <a3c>`
 
-   -  |tensorflow| `Deep Deterministic Policy Gradients (DDPG, TD3) <rllib-algorithms.html#deep-deterministic-policy-gradients-ddpg-td3>`__
+   -  |tensorflow| :ref:`Deep Deterministic Policy Gradients (DDPG, TD3) <ddpg>`
 
-   -  |tensorflow| `Deep Q Networks (DQN, Rainbow, Parametric DQN) <rllib-algorithms.html#deep-q-networks-dqn-rainbow-parametric-dqn>`__
+   -  |tensorflow| :ref:`Deep Q Networks (DQN, Rainbow, Parametric DQN) <dqn>`
 
-   -  |pytorch| |tensorflow| `Policy Gradients <rllib-algorithms.html#policy-gradients>`__
+   -  |pytorch| |tensorflow| :ref:`Policy Gradients <pg>`
 
-   -  |pytorch| |tensorflow| `Proximal Policy Optimization (PPO) <rllib-algorithms.html#proximal-policy-optimization-ppo>`__
+   -  |pytorch| |tensorflow| :ref:`Proximal Policy Optimization (PPO) <ppo>`
 
-   -  |tensorflow| `Soft Actor Critic (SAC) <rllib-algorithms.html#soft-actor-critic-sac>`__
+   -  |tensorflow| :ref:`Soft Actor Critic (SAC) <sac>`
 
 *  Derivative-free
 
-   -  |tensorflow| `Augmented Random Search (ARS) <rllib-algorithms.html#augmented-random-search-ars>`__
+   -  |tensorflow| :ref:`Augmented Random Search (ARS) <ars>`
 
-   -  |tensorflow| `Evolution Strategies <rllib-algorithms.html#evolution-strategies>`__
+   -  |tensorflow| :ref:`Evolution Strategies <es>`
 
 *  Multi-agent specific
 
-   -  |pytorch| `QMIX Monotonic Value Factorisation (QMIX, VDN, IQN) <rllib-algorithms.html#qmix-monotonic-value-factorisation-qmix-vdn-iqn>`__
-   -  |tensorflow| `Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG) <rllib-algorithms.html#multi-agent-deep-deterministic-policy-gradient-contrib-maddpg>`__
+   -  |pytorch| :ref:`QMIX Monotonic Value Factorisation (QMIX, VDN, IQN) <qmix>`
+   -  |tensorflow| :ref:`Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG) <maddpg>`
 
 *  Offline
 
-   -  |tensorflow| `Advantage Re-Weighted Imitation Learning (MARWIL) <rllib-algorithms.html#advantage-re-weighted-imitation-learning-marwil>`__
+   -  |tensorflow| :ref:`Advantage Re-Weighted Imitation Learning (MARWIL) <marwil>`
+
+*  Contextual bandits
+
+   -  |pytorch| :ref:`Linear Upper Confidence Bound (contrib/LinUCB) <linucb>`
+   -  |pytorch| :ref:`Linear Thompson Sampling (contrib/LinTS) <lints>`
 
 Offline Datasets
 ----------------
-* `Working with Offline Datasets <rllib-offline.html>`__
-* `Input Pipeline for Supervised Losses <rllib-offline.html#input-pipeline-for-supervised-losses>`__
-* `Input API <rllib-offline.html#input-api>`__
-* `Output API <rllib-offline.html#output-api>`__
+*  `Working with Offline Datasets <rllib-offline.html>`__
+*  `Input Pipeline for Supervised Losses <rllib-offline.html#input-pipeline-for-supervised-losses>`__
+*  `Input API <rllib-offline.html#input-api>`__
+*  `Output API <rllib-offline.html#output-api>`__
 
 Concepts and Custom Algorithms
 ------------------------------
@@ -152,30 +159,30 @@ Concepts and Custom Algorithms
 Examples
 --------
 
-* `Tuned Examples <rllib-examples.html#tuned-examples>`__
-* `Training Workflows <rllib-examples.html#training-workflows>`__
-* `Custom Envs and Models <rllib-examples.html#custom-envs-and-models>`__
-* `Serving and Offline <rllib-examples.html#serving-and-offline>`__
-* `Multi-Agent and Hierarchical <rllib-examples.html#multi-agent-and-hierarchical>`__
-* `Community Examples <rllib-examples.html#community-examples>`__
+*  `Tuned Examples <rllib-examples.html#tuned-examples>`__
+*  `Training Workflows <rllib-examples.html#training-workflows>`__
+*  `Custom Envs and Models <rllib-examples.html#custom-envs-and-models>`__
+*  `Serving and Offline <rllib-examples.html#serving-and-offline>`__
+*  `Multi-Agent and Hierarchical <rllib-examples.html#multi-agent-and-hierarchical>`__
+*  `Community Examples <rllib-examples.html#community-examples>`__
 
 Development
 -----------
 
-* `Development Install <rllib-dev.html#development-install>`__
-* `API Stability <rllib-dev.html#api-stability>`__
-* `Features <rllib-dev.html#feature-development>`__
-* `Benchmarks <rllib-dev.html#benchmarks>`__
-* `Contributing Algorithms <rllib-dev.html#contributing-algorithms>`__
+*  `Development Install <rllib-dev.html#development-install>`__
+*  `API Stability <rllib-dev.html#api-stability>`__
+*  `Features <rllib-dev.html#feature-development>`__
+*  `Benchmarks <rllib-dev.html#benchmarks>`__
+*  `Contributing Algorithms <rllib-dev.html#contributing-algorithms>`__
 
 Package Reference
 -----------------
-* `ray.rllib.agents <rllib-package-ref.html#module-ray.rllib.agents>`__
-* `ray.rllib.env <rllib-package-ref.html#module-ray.rllib.env>`__
-* `ray.rllib.evaluation <rllib-package-ref.html#module-ray.rllib.evaluation>`__
-* `ray.rllib.models <rllib-package-ref.html#module-ray.rllib.models>`__
-* `ray.rllib.optimizers <rllib-package-ref.html#module-ray.rllib.optimizers>`__
-* `ray.rllib.utils <rllib-package-ref.html#module-ray.rllib.utils>`__
+*  `ray.rllib.agents <rllib-package-ref.html#module-ray.rllib.agents>`__
+*  `ray.rllib.env <rllib-package-ref.html#module-ray.rllib.env>`__
+*  `ray.rllib.evaluation <rllib-package-ref.html#module-ray.rllib.evaluation>`__
+*  `ray.rllib.models <rllib-package-ref.html#module-ray.rllib.models>`__
+*  `ray.rllib.optimizers <rllib-package-ref.html#module-ray.rllib.optimizers>`__
+*  `ray.rllib.utils <rllib-package-ref.html#module-ray.rllib.utils>`__
 
 Troubleshooting
 ---------------
diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst
index 0b95ef61f..aa7459ecf 100644
--- a/doc/source/rllib-training.rst
+++ b/doc/source/rllib-training.rst
@@ -86,6 +86,22 @@ You can control the degree of parallelism used by setting the ``num_workers`` hy
 .. Original image: https://docs.google.com/drawings/d/14QINFvx3grVyJyjAnjggOCEVN-Iq6pYVJ3jA2S6j8z0/edit?usp=sharing
 .. image:: rllib-config.svg
 
+Scaling Guide
+~~~~~~~~~~~~~
+
+Here are some rules of thumb for scaling training with RLlib.
+
+1. If the environment is slow and cannot be replicated (e.g., because it requires interaction with physical systems), then you should use a sample-efficient off-policy algorithm such as :ref:`DQN <dqn>` or :ref:`SAC <sac>`. These algorithms default to ``num_workers: 0`` for single-process operation. Also consider batch RL training with the `offline data <rllib-offline.html>`__ API.
+
+
+2. If the environment is fast and the model is small (most models for RL are), use time-efficient algorithms such as :ref:`PPO <ppo>`, :ref:`IMPALA <impala>`, or :ref:`Ape-X <apex>`. These can be scaled by increasing ``num_workers`` to add rollout workers. It may also make sense to enable `vectorization <rllib-env.html#vectorized>`__ for inference. If the learner becomes a bottleneck, multiple GPUs can be used for learning by setting ``num_gpus > 1``.
+
+
+3. If the model is compute-intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_worker: 1``. If you only have a single GPU, consider ``num_workers: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker <rllib-env.html#vectorized>`__.
+
+   
+4. Finally, if both model and environment are compute-intensive, then enable `remote worker envs <rllib-env.html#vectorized>`__ with `async batching <rllib-env.html#vectorized>`__ by setting ``remote_worker_envs: True`` and optionally ``remote_env_batch_wait_ms``. This batches inference on GPUs in the rollout workers while letting envs run asynchronously in separate actors, similar to the `SEED <https://ai.googleblog.com/2020/03/massively-scaling-reinforcement.html>`__ architecture. The number of workers and number of envs per worker should be tuned to maximize GPU utilization. If your env requires GPUs to function, or if multi-node SGD is needed, then also consider :ref:`DD-PPO <ddppo>`.
+
 Common Parameters
 ~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 7232626c4..c0a45f598 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -96,7 +96,12 @@ RLlib `Trainer classes <rllib-concepts.html#trainers>`__ coordinate the distribu
 
     Synchronous Sampling (e.g., A2C, PG, PPO)
 
-RLlib uses `Ray actors <actors.html>`__ to scale training from a single core to many thousands of cores in a cluster. You can `configure the parallelism <rllib-training.html#specifying-resources>`__ used for training by changing the ``num_workers`` parameter.
+RLlib uses `Ray actors <actors.html>`__ to scale training from a single core to many thousands of cores in a cluster. You can `configure the parallelism <rllib-training.html#specifying-resources>`__ used for training by changing the ``num_workers`` parameter. Check out our `scaling guide <rllib-training.html#scaling-guide>`__ for more details.
+
+Application Support
+~~~~~~~~~~~~~~~~~~~
+
+Beyond environments defined in Python, RLlib supports batch training on `offline datasets <rllib-offline.html>`__, and also provides a variety of integration strategies for `external applications <rllib-env.html#external-agents-and-applications>`__.
 
 Customization
 ~~~~~~~~~~~~~
@@ -105,13 +110,6 @@ RLlib provides ways to customize almost all aspects of training, including the `
 
 .. image:: rllib-components.svg
 
-Application Support
-~~~~~~~~~~~~~~~~~~~
-
-Beyond environments defined in Python, RLlib supports batch training on `offline datasets <rllib-offline.html>`__, and also provides a variety of integration strategies for `external applications <rllib-env.html#external-agents-and-applications>`__:
-
-.. image:: rllib-external.svg
-
 To learn more, proceed to the `table of contents <rllib-toc.html>`__.
 
 .. |tensorflow| image:: tensorflow.png
diff --git a/rllib/contrib/bandits/agents/lin_ts.py b/rllib/contrib/bandits/agents/lin_ts.py
index e237f9209..e49db4ac8 100644
--- a/rllib/contrib/bandits/agents/lin_ts.py
+++ b/rllib/contrib/bandits/agents/lin_ts.py
@@ -26,7 +26,6 @@ TS_CONFIG = with_common_config({
         "type": "ray.rllib.contrib.bandits.exploration.ThompsonSampling"
     }
 })
-
 # __sphinx_doc_end__
 # yapf: enable
 
diff --git a/rllib/contrib/bandits/agents/lin_ucb.py b/rllib/contrib/bandits/agents/lin_ucb.py
index 36029c3fa..2aab7a767 100644
--- a/rllib/contrib/bandits/agents/lin_ucb.py
+++ b/rllib/contrib/bandits/agents/lin_ucb.py
@@ -26,7 +26,6 @@ UCB_CONFIG = with_common_config({
         "type": "ray.rllib.contrib.bandits.exploration.UCB"
     }
 })
-
 # __sphinx_doc_end__
 # yapf: enable