From 5cebee68d681bebfd59255b811338d39e2cc2e7d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 27 Mar 2020 22:05:43 -0700 Subject: [PATCH] [rllib] Add scaling guide to documentation, improve bandit docs (#7780) * update * reword * update * ms * multi node sgd * reorder * improve bandit docs * contrib * update * ref * improve refs * fix build * add pillow dep * add pil * update pil * pillow * remove false --- doc/requirements-doc.txt | 1 + doc/source/conf.py | 1 + doc/source/rllib-algorithms.rst | 51 +++++++++-- doc/source/rllib-env.rst | 6 +- doc/source/rllib-toc.rst | 113 +++++++++++++----------- doc/source/rllib-training.rst | 16 ++++ doc/source/rllib.rst | 14 ++- rllib/contrib/bandits/agents/lin_ts.py | 1 - rllib/contrib/bandits/agents/lin_ucb.py | 1 - 9 files changed, 134 insertions(+), 70 deletions(-) diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index ba9167335..3066e2d8b 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -10,6 +10,7 @@ numpy opencv-python-headless pandas pickle5 +pillow pygments pyyaml recommonmark diff --git a/doc/source/conf.py b/doc/source/conf.py index 1a1e1d9ef..6743157ff 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -71,6 +71,7 @@ sphinx_gallery_conf = { "ignore_pattern": "../examples/doc_code/", "plot_gallery": "False", # "filename_pattern": "tutorial.py", + # "backreferences_dir": "False", # "show_memory': False, # 'min_reported_time': False } diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index a7df0b0fd..991a81d2f 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -33,6 +33,8 @@ MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ High-throughput architectures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
_apex: + Distributed Prioritized Experience Replay (Ape-X) ------------------------------------------------- |tensorflow| @@ -79,6 +81,8 @@ SpaceInvaders 646 ~300 :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ +.. _impala: + Importance Weighted Actor-Learner Architecture (IMPALA) ------------------------------------------------------- |tensorflow| @@ -126,6 +130,8 @@ SpaceInvaders 843 ~300 :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ +.. _appo: + Asynchronous Proximal Policy Optimization (APPO) ------------------------------------------------ |tensorflow| @@ -135,7 +141,7 @@ We include an asynchronous variant of Proximal Policy Optimization (PPO) based o .. tip:: - APPO is not always more efficient; it is often better to use `standard PPO `__ or `IMPALA `__. + APPO is not always more efficient; it is often better to use :ref:`standard PPO ` or :ref:`IMPALA `. .. figure:: impala-arch.svg @@ -150,6 +156,8 @@ Tuned examples: `PongNoFrameskip-v4 `__ `[implementation] `__ @@ -496,6 +530,8 @@ It constructs a confidence region around the weights of the linear regression model and uses this confidence ellipsoid to estimate the uncertainty of action values. +Tuned examples: `SimpleContextualBandit `__, `ParametricItemRecoEnv `__. + **LinUCB-specific configs** (see also `common configs `__): @@ -504,9 +540,10 @@ uncertainty of action values. :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ +.. _lints: -LinTS (Linear Thompson Sampling) --------------------------------- +Linear Thompson Sampling (contrib/LinTS) +---------------------------------------- |pytorch| `[paper] `__ `[implementation] `__ @@ -517,6 +554,8 @@ prior on the weights and a Gaussian likelihood function. For deciding which action to take, the agent samples weights for each arm, using the posterior distributions, and plays the arm that produces the highest reward. +Tuned examples: `SimpleContextualBandit `__, `WheelBandit `__. 
+ **LinTS-specific configs** (see also `common configs `__): @@ -530,4 +569,4 @@ the posterior distributions, and plays the arm that produces the highest reward. :width: 24 .. |pytorch| image:: pytorch.png - :width: 24 \ No newline at end of file + :width: 24 diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 69e38cc5e..057daf270 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -80,6 +80,10 @@ RLlib uses Gym as its environment interface for single-agent training. For more Performance ~~~~~~~~~~~ +.. tip:: + + Also check out the `scaling guide <rllib-training.html#scaling-guide>`__ for RLlib training. + There are two ways to scale experience collection with Gym environments: 1. **Vectorization within a single process:** Though many envs can achieve high frame rates per core, their throughput is limited in practice by policy evaluation between steps. For example, even small TensorFlow models incur a couple milliseconds of latency to evaluate. This can be worked around by creating multiple envs per process and batching policy evaluations across these envs. @@ -256,7 +260,7 @@ Alternatively, the env itself can be modified to share observations between agen Grouping Agents ~~~~~~~~~~~~~~~ -It is common to have groups of agents in multi-agent RL. RLlib treats agent groups like a single agent with a Tuple action and observation space. The group agent can then be assigned to a single policy for centralized execution, or to specialized multi-agent policies such as `Q-Mix `__ that implement centralized training but decentralized execution. You can use the ``MultiAgentEnv.with_agent_groups()`` method to define these groups: +It is common to have groups of agents in multi-agent RL. RLlib treats agent groups like a single agent with a Tuple action and observation space. 
The group agent can then be assigned to a single policy for centralized execution, or to specialized multi-agent policies such as :ref:`Q-Mix ` that implement centralized training but decentralized execution. You can use the ``MultiAgentEnv.with_agent_groups()`` method to define these groups: .. literalinclude:: ../../rllib/env/multi_agent_env.py :language: python diff --git a/doc/source/rllib-toc.rst b/doc/source/rllib-toc.rst index 3f9949543..99f9bc9fc 100644 --- a/doc/source/rllib-toc.rst +++ b/doc/source/rllib-toc.rst @@ -15,6 +15,8 @@ Training APIs - `Common Parameters `__ + - `Scaling Guide `__ + - `Tuned Examples `__ * `Basic Python API `__ @@ -57,79 +59,84 @@ Training APIs Environments ------------ -* `RLlib Environments Overview `__ -* `Feature Compatibility Matrix `__ -* `OpenAI Gym `__ -* `Vectorized `__ -* `Multi-Agent and Hierarchical `__ -* `External Agents and Applications `__ +* `RLlib Environments Overview `__ +* `Feature Compatibility Matrix `__ +* `OpenAI Gym `__ +* `Vectorized `__ +* `Multi-Agent and Hierarchical `__ +* `External Agents and Applications `__ - `External Application Clients `__ -* `Advanced Integrations `__ +* `Advanced Integrations `__ Models, Preprocessors, and Action Distributions ----------------------------------------------- -* `RLlib Models, Preprocessors, and Action Distributions Overview `__ -* `TensorFlow Models `__ -* `PyTorch Models `__ -* `Custom Preprocessors `__ -* `Custom Action Distributions `__ -* `Supervised Model Losses `__ -* `Self-Supervised Model Losses `__ -* `Variable-length / Parametric Action Spaces `__ -* `Autoregressive Action Distributions `__ +* `RLlib Models, Preprocessors, and Action Distributions Overview `__ +* `TensorFlow Models `__ +* `PyTorch Models `__ +* `Custom Preprocessors `__ +* `Custom Action Distributions `__ +* `Supervised Model Losses `__ +* `Self-Supervised Model Losses `__ +* `Variable-length / Parametric Action Spaces `__ +* `Autoregressive Action Distributions `__ Algorithms 
---------- * High-throughput architectures - - |tensorflow| `Distributed Prioritized Experience Replay (Ape-X) `__ + - |tensorflow| :ref:`Distributed Prioritized Experience Replay (Ape-X) ` - - |tensorflow| `Importance Weighted Actor-Learner Architecture (IMPALA) `__ + - |tensorflow| :ref:`Importance Weighted Actor-Learner Architecture (IMPALA) ` - - |tensorflow| `Asynchronous Proximal Policy Optimization (APPO) `__ + - |tensorflow| :ref:`Asynchronous Proximal Policy Optimization (APPO) ` - - |pytorch| `Decentralized Distributed Proximal Policy Optimization (DD-PPO) `__ + - |pytorch| :ref:`Decentralized Distributed Proximal Policy Optimization (DD-PPO) ` - - |pytorch| `Single-Player AlphaZero (contrib/AlphaZero) `__ + - |pytorch| :ref:`Single-Player AlphaZero (contrib/AlphaZero) ` * Gradient-based - - |pytorch| |tensorflow| `Advantage Actor-Critic (A2C, A3C) `__ + - |pytorch| |tensorflow| :ref:`Advantage Actor-Critic (A2C, A3C) ` - - |tensorflow| `Deep Deterministic Policy Gradients (DDPG, TD3) `__ + - |tensorflow| :ref:`Deep Deterministic Policy Gradients (DDPG, TD3) ` - - |tensorflow| `Deep Q Networks (DQN, Rainbow, Parametric DQN) `__ + - |tensorflow| :ref:`Deep Q Networks (DQN, Rainbow, Parametric DQN) ` - - |pytorch| |tensorflow| `Policy Gradients `__ + - |pytorch| |tensorflow| :ref:`Policy Gradients ` - - |pytorch| |tensorflow| `Proximal Policy Optimization (PPO) `__ + - |pytorch| |tensorflow| :ref:`Proximal Policy Optimization (PPO) ` - - |tensorflow| `Soft Actor Critic (SAC) `__ + - |tensorflow| :ref:`Soft Actor Critic (SAC) ` * Derivative-free - - |tensorflow| `Augmented Random Search (ARS) `__ + - |tensorflow| :ref:`Augmented Random Search (ARS) ` - - |tensorflow| `Evolution Strategies `__ + - |tensorflow| :ref:`Evolution Strategies ` * Multi-agent specific - - |pytorch| `QMIX Monotonic Value Factorisation (QMIX, VDN, IQN) `__ - - |tensorflow| `Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG) `__ + - |pytorch| :ref:`QMIX Monotonic Value 
Factorisation (QMIX, VDN, IQN) ` + - |tensorflow| :ref:`Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG) ` * Offline - - |tensorflow| `Advantage Re-Weighted Imitation Learning (MARWIL) `__ + - |tensorflow| :ref:`Advantage Re-Weighted Imitation Learning (MARWIL) ` + +* Contextual bandits + + - |pytorch| :ref:`Linear Upper Confidence Bound (contrib/LinUCB) ` + - |pytorch| :ref:`Linear Thompson Sampling (contrib/LinTS) ` Offline Datasets ---------------- -* `Working with Offline Datasets `__ -* `Input Pipeline for Supervised Losses `__ -* `Input API `__ -* `Output API `__ +* `Working with Offline Datasets `__ +* `Input Pipeline for Supervised Losses `__ +* `Input API `__ +* `Output API `__ Concepts and Custom Algorithms ------------------------------ @@ -152,30 +159,30 @@ Concepts and Custom Algorithms Examples -------- -* `Tuned Examples `__ -* `Training Workflows `__ -* `Custom Envs and Models `__ -* `Serving and Offline `__ -* `Multi-Agent and Hierarchical `__ -* `Community Examples `__ +* `Tuned Examples `__ +* `Training Workflows `__ +* `Custom Envs and Models `__ +* `Serving and Offline `__ +* `Multi-Agent and Hierarchical `__ +* `Community Examples `__ Development ----------- -* `Development Install `__ -* `API Stability `__ -* `Features `__ -* `Benchmarks `__ -* `Contributing Algorithms `__ +* `Development Install `__ +* `API Stability `__ +* `Features `__ +* `Benchmarks `__ +* `Contributing Algorithms `__ Package Reference ----------------- -* `ray.rllib.agents `__ -* `ray.rllib.env `__ -* `ray.rllib.evaluation `__ -* `ray.rllib.models `__ -* `ray.rllib.optimizers `__ -* `ray.rllib.utils `__ +* `ray.rllib.agents `__ +* `ray.rllib.env `__ +* `ray.rllib.evaluation `__ +* `ray.rllib.models `__ +* `ray.rllib.optimizers `__ +* `ray.rllib.utils `__ Troubleshooting --------------- diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 0b95ef61f..aa7459ecf 100644 --- a/doc/source/rllib-training.rst +++ 
b/doc/source/rllib-training.rst @@ -86,6 +86,22 @@ You can control the degree of parallelism used by setting the ``num_workers`` hy .. Original image: https://docs.google.com/drawings/d/14QINFvx3grVyJyjAnjggOCEVN-Iq6pYVJ3jA2S6j8z0/edit?usp=sharing .. image:: rllib-config.svg +Scaling Guide +~~~~~~~~~~~~~ + +Here are some rules of thumb for scaling training with RLlib. + +1. If the environment is slow and cannot be replicated (e.g., since it requires interaction with physical systems), then you should use a sample-efficient off-policy algorithm such as :ref:`DQN <dqn>` or :ref:`SAC <sac>`. These algorithms default to ``num_workers: 0`` for single-process operation. Consider also batch RL training with the `offline data `__ API. + + +2. If the environment is fast and the model is small (most models for RL are), use time-efficient algorithms such as :ref:`PPO <ppo>`, :ref:`IMPALA <impala>`, or :ref:`APEX <apex>`. These can be scaled by increasing ``num_workers`` to add rollout workers. It may also make sense to enable `vectorization `__ for inference. If the learner becomes a bottleneck, multiple GPUs can be used for learning by setting ``num_gpus > 1``. + + +3. If the model is compute intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_worker: 1``. If you only have a single GPU, consider ``num_workers: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker `__. + + +4. Finally, if both model and environment are compute intensive, then enable `remote worker envs `__ with `async batching `__ by setting ``remote_worker_envs: True`` and optionally ``remote_env_batch_wait_ms``. This batches inference on GPUs in the rollout workers while letting envs run asynchronously in separate actors, similar to the `SEED `__ architecture. The number of workers and number of envs per worker should be tuned to maximize GPU utilization. 
If your env requires GPUs to function, or if multi-node SGD is needed, then also consider :ref:`DD-PPO <ddppo>`. + Common Parameters ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 7232626c4..c0a45f598 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -96,7 +96,12 @@ RLlib `Trainer classes `__ coordinate the distribu Synchronous Sampling (e.g., A2C, PG, PPO) -RLlib uses `Ray actors `__ to scale training from a single core to many thousands of cores in a cluster. You can `configure the parallelism `__ used for training by changing the ``num_workers`` parameter. +RLlib uses `Ray actors `__ to scale training from a single core to many thousands of cores in a cluster. You can `configure the parallelism `__ used for training by changing the ``num_workers`` parameter. Check out our `scaling guide <rllib-training.html#scaling-guide>`__ for more details here. + +Application Support +~~~~~~~~~~~~~~~~~~~ + +Beyond environments defined in Python, RLlib supports batch training on `offline datasets `__, and also provides a variety of integration strategies for `external applications `__. Customization ~~~~~~~~~~~~~ @@ -105,13 +110,6 @@ RLlib provides ways to customize almost all aspects of training, including the ` .. image:: rllib-components.svg -Application Support -~~~~~~~~~~~~~~~~~~~ - -Beyond environments defined in Python, RLlib supports batch training on `offline datasets `__, and also provides a variety of integration strategies for `external applications `__: - -.. image:: rllib-external.svg - To learn more, proceed to the `table of contents `__. .. 
|tensorflow| image:: tensorflow.png diff --git a/rllib/contrib/bandits/agents/lin_ts.py b/rllib/contrib/bandits/agents/lin_ts.py index e237f9209..e49db4ac8 100644 --- a/rllib/contrib/bandits/agents/lin_ts.py +++ b/rllib/contrib/bandits/agents/lin_ts.py @@ -26,7 +26,6 @@ TS_CONFIG = with_common_config({ "type": "ray.rllib.contrib.bandits.exploration.ThompsonSampling" } }) - # __sphinx_doc_end__ # yapf: enable diff --git a/rllib/contrib/bandits/agents/lin_ucb.py b/rllib/contrib/bandits/agents/lin_ucb.py index 36029c3fa..2aab7a767 100644 --- a/rllib/contrib/bandits/agents/lin_ucb.py +++ b/rllib/contrib/bandits/agents/lin_ucb.py @@ -26,7 +26,6 @@ UCB_CONFIG = with_common_config({ "type": "ray.rllib.contrib.bandits.exploration.UCB" } }) - # __sphinx_doc_end__ # yapf: enable