From 1eb8c833141b919aa4e60918dfecbace03a4fe68 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Tue, 12 Sep 2017 23:38:21 -0700
Subject: [PATCH] [rllib] Initial RLLib documentation (#969)

* initial documentation for RLLib

* more RL documentation

* fix linting

* fix comments

* update

* fix
---
 doc/source/conf.py                     |   8 +-
 doc/source/index.rst                   |   1 +
 doc/source/rllib.rst                   | 159 +++++++++++++++++++++++++
 python/ray/rllib/models/__init__.py    |  11 +-
 python/ray/rllib/models/action_dist.py |   4 +
 python/ray/rllib/models/lstm.py        |   6 +-
 6 files changed, 184 insertions(+), 5 deletions(-)
 create mode 100644 doc/source/rllib.rst

diff --git a/doc/source/conf.py b/doc/source/conf.py
index b84b8229c..c27242a57 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -18,8 +18,14 @@ import shlex
 
 # These lines added to enable Sphinx to work without installing Ray.
 import mock
-MOCK_MODULES = ["pyarrow",
+MOCK_MODULES = ["gym",
+                "tensorflow",
+                "tensorflow.contrib",
+                "tensorflow.contrib.slim",
+                "tensorflow.contrib.rnn",
+                "pyarrow",
                 "pyarrow.plasma",
+                "smart_open",
                 "ray.local_scheduler",
                 "ray.plasma",
                 "ray.core.generated.TaskInfo",
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 810dc4559..50579df40 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -20,6 +20,7 @@ Ray
    api.rst
    actors.rst
    using-ray-with-gpus.rst
+   rllib.rst
 
 .. toctree::
    :maxdepth: 1
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
new file mode 100644
index 000000000..0f6d521a5
--- /dev/null
+++ b/doc/source/rllib.rst
@@ -0,0 +1,159 @@
+RLLib: Ray's scalable reinforcement learning library
+====================================================
+
+This document describes Ray's reinforcement learning library.
+It currently supports the following algorithms:
+
+-  `Proximal Policy Optimization <https://arxiv.org/abs/1707.06347>`__ which
+   is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.
+
+-  Evolution Strategies which is described in `this
+   paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
+   borrows code from
+   `here <https://github.com/openai/evolution-strategies-starter>`__.
+
+-  `The Asynchronous Advantage Actor-Critic <https://arxiv.org/abs/1602.01783>`__
+   based on `the OpenAI starter agent <https://github.com/openai/universe-starter-agent>`__.
+
+Proximal Policy Optimization scales to hundreds of cores and several GPUs,
+Evolution Strategies to clusters with thousands of cores and
+the Asynchronous Advantage Actor-Critic scales to dozens of cores
+on a single node.
+
+These algorithms can be run on any OpenAI gym MDP, including custom ones written
+and registered by the user.
+
+Getting Started
+---------------
+
+You can run training with
+
+::
+
+    python ray/python/ray/rllib/train.py --env CartPole-v0 --alg PPO --config '{"timesteps_per_batch": 10000}'
+
+By default, the results will be logged to a subdirectory of ``/tmp/ray``.
+This subdirectory will contain a file ``config.json`` which contains the
+hyperparameters, a file ``result.json`` which contains a training summary
+for each episode and a TensorBoard file that can be used to visualize
+the training process with TensorBoard by running
+
+::
+
+     tensorboard --logdir=/tmp/ray
+
+
+The ``train.py`` script has a number of options you can show by running
+
+::
+
+    python ray/python/ray/rllib/train.py --help
+
+The most important options are for choosing the environment
+with ``--env`` (any OpenAI gym environment including ones registered by the user
+can be used) and for choosing the algorithm with ``--alg``
+(available options are ``PPO``, ``A3C``, ``ES`` and ``DQN``). Each algorithm
+has specific hyperparameters that can be set with ``--config``, see the
+``DEFAULT_CONFIG`` variable in
+`PPO <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ppo/ppo.py>`__,
+`A3C <https://github.com/ray-project/ray/blob/master/python/ray/rllib/a3c/a3c.py>`__,
+`ES <https://github.com/ray-project/ray/blob/master/python/ray/rllib/es/es.py>`__ and
+`DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/dqn.py>`__.
+
+
+Examples
+--------
+
+Some good hyperparameters and settings are available in
+`the repository <https://github.com/ray-project/ray/blob/master/python/ray/rllib/test/tuned_examples.sh>`__
+(some of them are tuned to run on GPUs). If you find better settings or tune
+an algorithm on a different domain, consider submitting a Pull Request!
+
+The User API
+------------
+
+You will be using this part of the API if you run the existing algorithms
+on a new problem. Note that the API is not considered to be stable yet.
+Here is an example of how to use it:
+
+::
+
+    import ray
+    import ray.rllib.ppo as ppo
+
+    ray.init()
+
+    config = ppo.DEFAULT_CONFIG.copy()
+    alg = ppo.PPOAgent("CartPole-v1", config)
+
+    # Can optionally call alg.restore(path) to load a checkpoint.
+
+    for i in range(10):
+       # Perform one iteration of the algorithm.
+       result = alg.train()
+       print("result: {}".format(result))
+       print("checkpoint saved at path: {}".format(alg.save()))
+
+The Developer API
+-----------------
+
+This part of the API will be useful if you need to change existing RL algorithms
+or implement new ones. Note that the API is not considered to be stable yet.
+
+Agents
+~~~~~~
+
+Agents implement a particular algorithm and can be used to run
+some number of iterations of the algorithm, save and load the state
+of training and evaluate the current policy. All agents inherit from
+a common base class:
+
+.. autoclass:: ray.rllib.common.Agent
+    :members:
+
+Models
+~~~~~~
+
+Models are subclasses of the Model class:
+
+.. autoclass:: ray.rllib.models.Model
+
+Currently we support fully connected policies, convolutional policies and
+LSTMs:
+
+.. autofunction:: ray.rllib.models.FullyConnectedNetwork
+.. autofunction:: ray.rllib.models.ConvolutionalNetwork
+.. autofunction:: ray.rllib.models.LSTM
+
+Action Distributions
+~~~~~~~~~~~~~~~~~~~~
+
+Actions can be sampled from different distributions, which share a common base
+class:
+
+.. autoclass:: ray.rllib.models.ActionDistribution
+    :members:
+
+Currently we support the following action distributions:
+
+.. autofunction:: ray.rllib.models.Categorical
+.. autofunction:: ray.rllib.models.DiagGaussian
+.. autofunction:: ray.rllib.models.Deterministic
+
+The Model Catalog
+~~~~~~~~~~~~~~~~~
+
+To make picking the right action distribution and models easier, there is
+a mechanism to pick good default values for various gym environments.
+
+.. autoclass:: ray.rllib.models.ModelCatalog
+    :members:
+
+Using RLLib on a cluster
+------------------------
+
+First create a cluster as described in `managing a cluster with parallel ssh`_.
+You can then run RLLib on this cluster by passing the address of the main redis
+shard into ``train.py`` with ``--redis-address``.
+
+.. _`managing a cluster with parallel ssh`: http://ray.readthedocs.io/en/latest/using-ray-on-a-large-cluster.html
diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py
index 7ed71e5e6..61e554823 100644
--- a/python/ray/rllib/models/__init__.py
+++ b/python/ray/rllib/models/__init__.py
@@ -1,3 +1,12 @@
 from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.models.action_dist import (ActionDistribution, Categorical,
+                                          DiagGaussian, Deterministic)
+from ray.rllib.models.model import Model
+from ray.rllib.models.fcnet import FullyConnectedNetwork
+from ray.rllib.models.convnet import ConvolutionalNetwork
+from ray.rllib.models.lstm import LSTM
 
-__all__ = ["ModelCatalog"]
+
+__all__ = ["ActionDistribution", "Categorical", "DiagGaussian",
+           "Deterministic", "ModelCatalog", "Model",
+           "FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM"]
diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py
index 26455add8..844761728 100644
--- a/python/ray/rllib/models/action_dist.py
+++ b/python/ray/rllib/models/action_dist.py
@@ -17,15 +17,19 @@ class ActionDistribution(object):
         self.inputs = inputs
 
     def logp(self, x):
+        """The log-likelihood of the action distribution."""
         raise NotImplementedError
 
     def kl(self, other):
+        """The KL-divergence between two action distributions."""
         raise NotImplementedError
 
     def entropy(self):
+        """The entropy of the action distribution."""
         raise NotImplementedError
 
     def sample(self):
+        """Draw a sample from the action distribution."""
         raise NotImplementedError
 
 
diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py
index 75a5a9045..bd0fdf018 100644
--- a/python/ray/rllib/models/lstm.py
+++ b/python/ray/rllib/models/lstm.py
@@ -11,13 +11,13 @@ from ray.rllib.models.misc import (conv2d, linear, flatten,
                                    normc_initializer)
 from ray.rllib.models.model import Model
 
-use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >=
-                 distutils.version.LooseVersion("1.0.0"))
-
 
 class LSTM(Model):
     # TODO(rliaw): Add LSTM code for other algorithms
     def _init(self, inputs, num_outputs, options):
+        use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >=
+                         distutils.version.LooseVersion("1.0.0"))
+
         self.x = x = inputs
         for i in range(4):
             x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))