From 35f739866692c8f2a14102f5be0c21b69f8ac214 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Wed, 6 Dec 2017 18:17:51 -0800
Subject: [PATCH] [rllib] Update RLlib docs and README (#1288)

Updates the rllib docs and README.
---
 README.rst                      |   8 ++
 doc/source/index.rst            |  10 ++-
 doc/source/rllib.rst            | 153 +++++++++++++++++---------------
 doc/source/tune.rst             |   3 +-
 python/ray/rllib/README.rst     |  66 +++++++++-----
 python/ray/rllib/agent.py       |   3 +-
 python/ray/tune/trial_runner.py |   2 +-
 7 files changed, 151 insertions(+), 94 deletions(-)

diff --git a/README.rst b/README.rst
index 53cf726a3..83e8221f5 100644
--- a/README.rst
+++ b/README.rst
@@ -11,6 +11,14 @@ Ray
 
 Ray is a flexible, high-performance distributed execution framework.
 
+Ray comes with libraries that accelerate deep learning and reinforcement learning development:
+
+- `Ray.tune`_: Efficient Distributed Hyperparameter Search
+- `Ray RLlib`_: A Composable and Scalable Reinforcement Learning Library
+
+.. _`Ray.tune`: http://ray.readthedocs.io/en/latest/tune.html
+.. _`Ray RLlib`: http://ray.readthedocs.io/en/latest/rllib.html
+
 
 Installation
 ------------
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 45819a957..1bc329cc9 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -3,6 +3,14 @@ Ray
 
 *Ray is a flexible, high-performance distributed execution framework.*
 
+Ray comes with libraries that accelerate deep learning and reinforcement learning development:
+
+- `Ray.tune`_: Efficient Distributed Hyperparameter Search
+- `Ray RLlib`_: A Composable and Scalable Reinforcement Learning Library
+
+.. _`Ray.tune`: tune.html
+.. _`Ray RLlib`: rllib.html
+
 Example Program
 ---------------
 
@@ -42,8 +50,8 @@ Example Program
    api.rst
    actors.rst
    using-ray-with-gpus.rst
-   rllib.rst
    tune.rst
+   rllib.rst
    webui.rst
 
 .. toctree::
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 0db6e0c6c..a9692bba4 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -1,15 +1,28 @@
-RLLib: A Scalable Reinforcement Learning Library
-================================================
+Ray RLlib: A Composable and Scalable Reinforcement Learning Library
+===================================================================
 
-This document describes Ray's reinforcement learning library.
-It currently supports the following algorithms:
+Ray RLlib is a reinforcement learning library that aims to provide both performance and composability:
+
+- Performance
+    - High performance algorithm implementations
+    - Pluggable distributed RL execution strategies
+
+- Composability
+    - Integration with the `Ray.tune <http://ray.readthedocs.io/en/latest/tune.html>`__ hyperparameter tuning tool
+    - Support for multiple frameworks (TensorFlow, PyTorch)
+    - Scalable primitives for developing new algorithms
+    - Shared models between algorithms
+
+You can find the code for RLlib `here on GitHub <https://github.com/ray-project/ray/tree/master/python/ray/rllib>`__.
+
+RLlib currently provides the following algorithms:
 
 -  `Proximal Policy Optimization <https://arxiv.org/abs/1707.06347>`__ which
    is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.
 
 -  Evolution Strategies which is decribed in `this
    paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
-   borrows code from
+   is adapted from
    `here <https://github.com/openai/evolution-strategies-starter>`__.
 
 -  `The Asynchronous Advantage Actor-Critic <https://arxiv.org/abs/1602.01783>`__
@@ -28,7 +41,7 @@ including custom ones written and registered by the user.
 Getting Started
 ---------------
 
-You can train an example DQN agent with the following command
+You can train a simple DQN agent with the following command
 
 ::
 
@@ -79,12 +92,11 @@ Some good hyperparameters and settings are available in
 (some of them are tuned to run on GPUs). If you find better settings or tune
 an algorithm on a different domain, consider submitting a Pull Request!
 
-The User API
-------------
+Python User API
+---------------
 
 You will be using this part of the API if you run the existing algorithms
-on a new problem. Note that the API is not considered to be stable yet.
-Here is an example how to use it:
+on a new problem. Here is an example of how to use it:
 
 ::
 
@@ -108,7 +120,7 @@ Custom Environments
 ~~~~~~~~~~~~~~~~~~~
 
 To train against a custom environment, i.e. one not in the gym catalog, you
-can pass a function that returns an env instead of an env id. For example:
+can register a function that creates the env to refer to it by name. For example:
 
 ::
 
@@ -117,17 +129,11 @@ can pass a function that returns an env instead of an env id. For example:
     from ray.rllib import ppo
 
     env_creator = lambda: create_my_env()
-    env_creator_key = "custom_env"
-    register_env(env_creator_key, env_creator)
+    env_creator_name = "custom_env"
+    register_env(env_creator_name, env_creator)
 
     ray.init()
-    alg = ppo.PPOAgent(env=env_creator_key, registry=get_registry())
-
-The Developer API
------------------
-
-This part of the API will be useful if you need to change existing RL algorithms
-or implement new ones. Note that the API is not considered to be stable yet.
+    alg = ppo.PPOAgent(env=env_creator_name, registry=get_registry())
 
 Agents
 ~~~~~~
@@ -140,6 +146,63 @@ a common base class:
 .. autoclass:: ray.rllib.agent.Agent
     :members:
 
+Using RLlib on a cluster
+------------------------
+
+First create a cluster as described in `managing a cluster with parallel ssh`_.
+You can then run RLlib on this cluster by passing the address of the main redis
+shard into ``train.py`` with ``--redis-address``.
+
+Using RLlib with Ray.tune
+-------------------------
+
+All Agents implemented in RLlib support the
+`tune Trainable <http://ray.readthedocs.io/en/latest/tune.html#ray.tune.trainable.Trainable>`__ interface.
+
+Here is an example of using Ray.tune with RLlib:
+
+::
+
+    python ray/python/ray/rllib/train.py -f tuned_examples/cartpole-grid-search-example.yaml
+
+Here is an example using the Python API.
+
+::
+
+    from ray.tune.tune import run_experiments
+    from ray.tune.variant_generator import grid_search
+
+
+    experiment = {
+        'cartpole-ppo': {
+            'run': 'PPO',
+            'env': 'CartPole-v0',
+            'resources': {
+                'cpu': 2,
+                'driver_cpu_limit': 1},
+            'stop': {
+                'episode_reward_mean': 200,
+                'time_total_s': 180
+            },
+            'config': {
+                'num_sgd_iter': grid_search([1, 4]),
+                'num_workers': 2,
+                'sgd_batchsize': grid_search([128, 256, 512])
+            }
+        }
+    }
+
+    run_experiments(experiment)
+
+.. _`managing a cluster with parallel ssh`: http://ray.readthedocs.io/en/latest/using-ray-on-a-large-cluster.html
+
+
+The Developer API
+-----------------
+
+This part of the API will be useful if you need to change existing RL algorithms
+or implement new ones. Note that the API is not considered to be stable yet.
+
 Models
 ~~~~~~
 
@@ -186,53 +249,3 @@ various gym environments. Here is an example usage:
 
 .. autoclass:: ray.rllib.models.ModelCatalog
     :members:
-
-Using RLLib on a cluster
-------------------------
-
-First create a cluster as described in `managing a cluster with parallel ssh`_.
-You can then run RLLib on this cluster by passing the address of the main redis
-shard into ``train.py`` with ``--redis-address``.
-
-Using RLLib with Ray.tune
--------------------------
-
-All Agents implemented in RLLib support the
-`Trainable <http://ray.readthedocs.io/en/latest/tune.html#ray.tune.trainable.Trainable>`__ interface.
-
-Here is an example of using Ray.tune with RLLib:
-
-::
-
-    python ray/python/ray/rllib/train.py -f tuned_examples/cartpole-grid-search-example.yaml
-
-Here is an example using the Python API.
-
-::
-
-    from ray.tune.tune import run_experiments
-    from ray.tune.variant_generator import grid_search
-
-
-    experiment = {
-        'cartpole-ppo': {
-            'run': 'PPO',
-            'env': 'CartPole-v0',
-            'resources': {
-                'cpu': 2,
-                'driver_cpu_limit': 1},
-            'stop': {
-                'episode_reward_mean': 200,
-                'time_total_s': 180
-            },
-            'config': {
-                'num_sgd_iter': grid_search([1, 4]),
-                'num_workers': 2,
-                'sgd_batchsize': grid_search([128, 256, 512])
-            }
-        }
-    }
-
-    run_experiments(experiment)
-
-.. _`managing a cluster with parallel ssh`: http://ray.readthedocs.io/en/latest/using-ray-on-a-large-cluster.html
diff --git a/doc/source/tune.rst b/doc/source/tune.rst
index 593416d0c..1d4253316 100644
--- a/doc/source/tune.rst
+++ b/doc/source/tune.rst
@@ -11,6 +11,7 @@ This document describes Ray.tune, a hyperparameter tuning tool for long-running
 
 -  Resource-aware scheduling, including support for concurrent runs of algorithms that may themselves be parallel and distributed.
 
+You can find the code for Ray.tune `here on GitHub <https://github.com/ray-project/ray/tree/master/python/ray/tune>`__.
 
 Getting Started
 --------------- 
@@ -49,7 +50,7 @@ This script runs a small grid search over the ``my_func`` function using ray.tun
     == Status ==
     Using FIFO scheduling algorithm.
     Resources used: 4/8 CPUs, 0/0 GPUs
-    Tensorboard logdir: /tmp/ray/my_experiment
+    Result logdir: /tmp/ray/my_experiment
      - my_func_0_alpha=0.2,beta=1:	RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
      - my_func_1_alpha=0.4,beta=1:	RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
      - my_func_2_alpha=0.6,beta=1:	TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc
diff --git a/python/ray/rllib/README.rst b/python/ray/rllib/README.rst
index 7eb108287..240438d04 100644
--- a/python/ray/rllib/README.rst
+++ b/python/ray/rllib/README.rst
@@ -1,31 +1,57 @@
-RLLib: A Scalable Reinforcement Learning Library
+RLlib: A Scalable Reinforcement Learning Library
 ================================================
 
-Getting Started
----------------
+This README provides a brief technical overview of RLlib. See also the `user documentation <http://ray.readthedocs.io/en/latest/rllib.html>`__.
 
-You can run training with
+RLlib currently provides the following algorithms:
 
-::
+-  `Proximal Policy Optimization <https://arxiv.org/abs/1707.06347>`__ which
+   is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.
 
-    python train.py --env CartPole-v0 --run PPO
-
-The available algorithms are:
-
--  ``PPO`` is a proximal variant of
-   `TRPO <https://arxiv.org/abs/1502.05477>`__.
-
--  ``ES`` is decribed in `this
+-  Evolution Strategies which is described in `this
    paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
    borrows code from
    `here <https://github.com/openai/evolution-strategies-starter>`__.
 
--  ``DQN`` is an implementation of `Deep Q
-   Networks <https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf>`__ based on
-   `OpenAI baselines <https://github.com/openai/baselines>`__.
+-  `The Asynchronous Advantage Actor-Critic <https://arxiv.org/abs/1602.01783>`__
+   based on `the OpenAI starter agent <https://github.com/openai/universe-starter-agent>`__.
 
--  ``A3C`` is an implementation of
-   `A3C <https://arxiv.org/abs/1602.01783>`__ based on `the OpenAI
-   starter agent <https://github.com/openai/universe-starter-agent>`__.
+- `Deep Q Network (DQN) <https://arxiv.org/abs/1312.5602>`__.
 
-User documentation can be `found here <http://ray.readthedocs.io/en/latest/rllib.html>`__.
+Proximal Policy Optimization scales to hundreds of cores and several GPUs, Evolution Strategies scales to clusters with thousands of cores, and the Asynchronous Advantage Actor-Critic scales to dozens of cores on a single node.
+
+These algorithms can be run on any OpenAI Gym MDP, including custom ones written and registered by the user.
+
+For more detailed usage information, see the `user documentation <http://ray.readthedocs.io/en/latest/rllib.html>`__.
+
+Training API
+------------
+
+All RLlib algorithms implement a common training API (agent.py), which enables multiple algorithms to be easily evaluated:
+
+::
+
+    # Train a model on a single environment
+    python train.py --env CartPole-v0 --run PPO
+
+    # Integration with ray.tune for hyperparam evaluation
+    python train.py -f tuned_examples/cartpole-grid-search-example.yaml
+
+Evaluator and Optimizer abstractions
+------------------------------------
+
+RLlib's gradient-based algorithms are composed using two abstractions: Evaluators (evaluator.py) and Optimizers (optimizers/optimizer.py). Optimizers encapsulate a particular distributed optimization strategy for RL. Evaluators encapsulate the model graph, and once implemented, any Optimizer may be "plugged in" to any algorithm that implements the Evaluator interface.
+
+This pluggability enables optimization strategies to be re-used and improved across different algorithms and deep learning frameworks (RLlib's optimizers work with both TensorFlow and PyTorch, though currently only A3C has a PyTorch graph implementation).
+
+These are the currently available optimizers:
+
+-  ``AsyncOptimizer`` is an asynchronous RL optimizer, i.e. like A3C. It asynchronously pulls and applies gradients from evaluators, sending updated weights back as needed.
+-  ``LocalSyncOptimizer`` is a simple synchronous RL optimizer. It pulls samples from remote evaluators, concatenates them, and then updates a local model. The updated model weights are then broadcast to all remote evaluators.
+-  ``LocalMultiGPUOptimizer`` (currently available for PPO) This optimizer performs SGD over a number of local GPUs, and pins experience data in GPU memory to amortize the copy overhead for multiple SGD passes.
+-  ``AllReduceOptimizer`` (planned) This optimizer would use the Allreduce primitive to scalably synchronize weights among a number of remote GPU workers.
+
+Common utilities
+----------------
+
+RLlib defines common action distributions, preprocessors, and neural network models, found in ``models/catalog.py``, which are shared by all algorithms. More information on these classes can be found in the `developer API docs <http://ray.readthedocs.io/en/latest/rllib.html#the-developer-api>`__.
diff --git a/python/ray/rllib/agent.py b/python/ray/rllib/agent.py
index 8ca499b2d..1770a1a29 100644
--- a/python/ray/rllib/agent.py
+++ b/python/ray/rllib/agent.py
@@ -35,7 +35,8 @@ class Agent(Trainable):
         env_creator (func): Function that creates a new training env.
         config (obj): Algorithm-specific configuration data.
         logdir (str): Directory in which training outputs should be placed.
-        registry (obj): Object registry.
+        registry (obj): Tune object registry, for registering user-defined
+            classes and objects by name.
     """
 
     _allow_unknown_configs = False
diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py
index e0d6eddba..63aeba821 100644
--- a/python/ray/tune/trial_runner.py
+++ b/python/ray/tune/trial_runner.py
@@ -118,7 +118,7 @@ class TrialRunner(object):
                     self._committed_resources.gpu,
                     self._avail_resources.gpu))
         for local_dir in sorted(set([t.local_dir for t in self._trials])):
-            messages.append("Tensorboard logdir: {}".format(local_dir))
+            messages.append("Result logdir: {}".format(local_dir))
             for t in self._trials:
                 if t.local_dir == local_dir:
                     messages.append(