From f34d705178b41efb8bfa72f2dd2ede413adb4f04 Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Fri, 24 Nov 2017 10:36:57 -0800
Subject: [PATCH] [rllib] Update Docs for RLLib (#1248)

* init_changes

* last_changes

* addressing comments

* fix comments

* update

* nit
---
 doc/source/rllib.rst               | 106 +++++++++++++++++++++++------
 python/ray/rllib/README.rst        |  51 +-------------
 python/ray/rllib/models/catalog.py |  16 ++---
 python/ray/rllib/models/lstm.py    |   3 +
 python/ray/rllib/models/model.py   |   2 +-
 5 files changed, 97 insertions(+), 81 deletions(-)

diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 205de4406..0db6e0c6c 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -1,5 +1,5 @@
-RLLib: Ray's scalable reinforcement learning library
-====================================================
+RLLib: A Scalable Reinforcement Learning Library
+================================================
 
 This document describes Ray's reinforcement learning library.
 It currently supports the following algorithms:
@@ -15,25 +15,27 @@ It currently supports the following algorithms:
 -  `The Asynchronous Advantage Actor-Critic <https://arxiv.org/abs/1602.01783>`__
    based on `the OpenAI starter agent <https://github.com/openai/universe-starter-agent>`__.
 
+- `Deep Q Network (DQN) <https://arxiv.org/abs/1312.5602>`__.
+
 Proximal Policy Optimization scales to hundreds of cores and several GPUs,
 Evolution Strategies to clusters with thousands of cores and
 the Asynchronous Advantage Actor-Critic scales to dozens of cores
 on a single node.
 
-These algorithms can be run on any OpenAI gym MDP, including custom ones written
-and registered by the user.
+These algorithms can be run on any `OpenAI Gym MDP <https://github.com/openai/gym>`__,
+including custom ones written and registered by the user.
 
 Getting Started
 ---------------
 
-You can run training with
+You can train an example DQN agent with the following command:
 
 ::
 
-    python ray/python/ray/rllib/train.py --env CartPole-v0 --run PPO --config '{"timesteps_per_batch": 10000}'
+    python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0
 
 By default, the results will be logged to a subdirectory of ``/tmp/ray``.
-This subdirectory will contain a file ``config.json`` which contains the
+This subdirectory will contain a file ``params.json`` which contains the
 hyperparameters, a file ``result.json`` which contains a training summary
 for each episode and a TensorBoard file that can be used to visualize
 training process with TensorBoard by running
@@ -51,18 +53,26 @@ The ``train.py`` script has a number of options you can show by running
 
 The most important options are for choosing the environment
 with ``--env`` (any OpenAI gym environment including ones registered by the user
-can be used) and for choosing the algorithm with ``-run``
-(available options are ``PPO``, ``A3C``, ``ES`` and ``DQN``). Each algorithm
-has specific hyperparameters that can be set with ``--config``, see the
+can be used) and for choosing the algorithm with ``--run``
+(available options are ``PPO``, ``A3C``, ``ES`` and ``DQN``).
+
+Specifying Parameters
+~~~~~~~~~~~~~~~~~~~~~
+
+Each algorithm has specific hyperparameters that can be set with ``--config`` - see the
 ``DEFAULT_CONFIG`` variable in
 `PPO <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ppo/ppo.py>`__,
 `A3C <https://github.com/ray-project/ray/blob/master/python/ray/rllib/a3c/a3c.py>`__,
 `ES <https://github.com/ray-project/ray/blob/master/python/ray/rllib/es/es.py>`__ and
 `DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/dqn.py>`__.
 
+In the example below, we train A3C by specifying 8 workers through the config flag.
+::
 
-Examples
---------
+    python ray/python/ray/rllib/train.py --env=PongDeterministic-v4 --run=A3C --config '{"num_workers": 8}'
+
+Tuned Examples
+--------------
 
 Some good hyperparameters and settings are available in
 `the repository <https://github.com/ray-project/ray/blob/master/python/ray/rllib/test/tuned_examples.sh>`__
@@ -84,7 +94,7 @@ Here is an example how to use it:
     ray.init()
 
     config = ppo.DEFAULT_CONFIG.copy()
-    alg = ppo.PPOAgent("CartPole-v1", config)
+    alg = ppo.PPOAgent(config=config, env="CartPole-v1")
 
     # Can optionally call alg.restore(path) to load a checkpoint.
 
@@ -102,8 +112,16 @@ can pass a function that returns an env instead of an env id. For example:
 
 ::
 
+    import ray
+    from ray.tune.registry import get_registry, register_env
+    from ray.rllib import ppo
+
     env_creator = lambda: create_my_env()
-    alg = ppo.PPOAgent(env_creator, config)
+    env_creator_key = "custom_env"
+    register_env(env_creator_key, env_creator)
+
+    ray.init()
+    alg = ppo.PPOAgent(env=env_creator_key, registry=get_registry())
 
 The Developer API
 -----------------
@@ -129,17 +147,19 @@ Models are subclasses of the Model class:
 
 .. autoclass:: ray.rllib.models.Model
 
-Currently we support fully connected policies, convolutional policies and
-LSTMs:
+Currently we support fully connected and convolutional TensorFlow policies on all algorithms:
 
 .. autofunction:: ray.rllib.models.FullyConnectedNetwork
 .. autofunction:: ray.rllib.models.ConvolutionalNetwork
+
+A3C also supports a TensorFlow LSTM policy.
+
 .. autofunction:: ray.rllib.models.LSTM
 
 Action Distributions
 ~~~~~~~~~~~~~~~~~~~~
 
-Actions can be sampled from different distributions, they have a common base
+Actions can be sampled from different distributions which have a common base
 class:
 
 .. autoclass:: ray.rllib.models.ActionDistribution
@@ -154,8 +174,15 @@ Currently we support the following action distributions:
 The Model Catalog
 ~~~~~~~~~~~~~~~~~
 
-To make picking the right action distribution and models easier, there is
-a mechanism to pick good default values for various gym environments.
+The Model Catalog is a mechanism for picking good default values for
+various gym environments. Here is an example usage:
+::
+
+    dist_class, dist_dim = ModelCatalog.get_action_dist(env.action_space)
+    model = ModelCatalog.get_model(inputs, dist_dim)
+    dist = dist_class(model.outputs)
+    action_op = dist.sample()
+
 
 .. autoclass:: ray.rllib.models.ModelCatalog
     :members:
@@ -167,4 +194,45 @@ First create a cluster as described in `managing a cluster with parallel ssh`_.
 You can then run RLLib on this cluster by passing the address of the main redis
 shard into ``train.py`` with ``--redis-address``.
 
+Using RLLib with Ray.tune
+-------------------------
+
+All Agents implemented in RLLib support the
+`Trainable <http://ray.readthedocs.io/en/latest/tune.html#ray.tune.trainable.Trainable>`__ interface.
+
+Here is an example of using Ray.tune with RLLib:
+
+::
+
+    python ray/python/ray/rllib/train.py -f tuned_examples/cartpole-grid-search-example.yaml
+
+Here is an example using the Python API:
+
+::
+
+    from ray.tune.tune import run_experiments
+    from ray.tune.variant_generator import grid_search
+
+
+    experiment = {
+        'cartpole-ppo': {
+            'run': 'PPO',
+            'env': 'CartPole-v0',
+            'resources': {
+                'cpu': 2,
+                'driver_cpu_limit': 1},
+            'stop': {
+                'episode_reward_mean': 200,
+                'time_total_s': 180
+            },
+            'config': {
+                'num_sgd_iter': grid_search([1, 4]),
+                'num_workers': 2,
+                'sgd_batchsize': grid_search([128, 256, 512])
+            }
+        }
+    }
+
+    run_experiments(experiment)
+
 .. _`managing a cluster with parallel ssh`: http://ray.readthedocs.io/en/latest/using-ray-on-a-large-cluster.html
diff --git a/python/ray/rllib/README.rst b/python/ray/rllib/README.rst
index 074185fbb..56e27c718 100644
--- a/python/ray/rllib/README.rst
+++ b/python/ray/rllib/README.rst
@@ -28,53 +28,4 @@ The available algorithms are:
    `A3C <https://arxiv.org/abs/1602.01783>`__ based on `the OpenAI
    starter agent <https://github.com/openai/universe-starter-agent>`__.
 
-Storing logs
-------------
-
-You can store the algorithm configuration (including hyperparameters) and
-training results on a filesystem with the ``--upload-dir`` flag. Two protocols
-are supported at the moment:
-
-- ``--upload-dir file:///tmp/ray/`` will store the logs on the local filesystem
-  in a subdirectory of /tmp/ray which is named after the algorithm name, the
-  environment and the current date. This is the default.
-
-- ``--upload-dir s3://bucketname/`` will store the logs in S3. Not that if you
-  store the logs in S3, TensorFlow files will not currently be stored because
-  TensorFlow doesn't support directly uploading files to S3 at the moment.
-
-Querying logs with Athena
--------------------------
-
-If you stored the logs in S3 or uploaded them there from the local file system,
-they can be queried with Athena. First create tables containing the
-experimental results with
-
-.. code:: sql
-
-    CREATE EXTERNAL TABLE IF NOT EXISTS experiments (
-      experiment_id STRING,
-      env_name STRING,
-      alg STRING,
-      -- result.json
-      training_iteration INT,
-      episode_reward_mean FLOAT,
-      episode_len_mean FLOAT
-    ) ROW FORMAT serde 'org.apache.hive.hcatalog.data.JsonSerDe'
-    LOCATION 's3://bucketname/'
-
-and then you can for example visualize the results with
-
-.. code:: sql
-
-    SELECT c.experiment_id, c.env_name, c.alg, a.episode_reward_mean, a.episode_len_mean
-    FROM experiments a
-    LEFT OUTER JOIN experiments b
-        ON a.experiment_id = b.experiment_id AND a.training_iteration < b.training_iteration
-    INNER JOIN experiments c
-        ON a.experiment_id = c.experiment_id
-    WHERE b.experiment_id IS NULL AND a.training_iteration IS NOT NULL AND c.alg is NOT NULL;
-
-This query selects last iteration from each experiment (see `this
-stackoverflow
-post <https://stackoverflow.com/questions/7745609/sql-select-only-rows-with-max-value-on-a-column>`__).
+Documentation can be `found here <http://ray.readthedocs.io/en/latest/rllib.html>`__.
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index b71f3623e..5d847a230 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -27,14 +27,7 @@ MODEL_CONFIGS = [
 
 
 class ModelCatalog(object):
-    """Registry of default models and action distributions for envs.
-
-    Example:
-        dist_class, dist_dim = ModelCatalog.get_action_dist(env.action_space)
-        model = ModelCatalog.get_model(inputs, dist_dim)
-        dist = dist_class(model.outputs)
-        action_op = dist.sample()
-    """
+    """Registry of default models and action distributions for envs."""
 
     ATARI_OBS_SHAPE = (210, 160, 3)
     ATARI_RAM_OBS_SHAPE = (128,)
@@ -47,7 +40,7 @@ class ModelCatalog(object):
 
         Args:
             action_space (Space): Action space of the target gym env.
-            dist_type (Optional[str]): Identifier of the action distribution.
+            dist_type (str): Optional identifier of the action distribution.
 
         Returns:
             dist_class (ActionDistribution): Python class of the distribution.
@@ -87,10 +80,11 @@ class ModelCatalog(object):
 
     @staticmethod
     def get_torch_model(input_shape, num_outputs, options=dict()):
-        """Returns a PyTorch suitable model.
+        """Returns a suitable PyTorch model. This is currently only supported
+        in A3C.
 
         Args:
-            input_shape (tup): The input shape to the model.
+            input_shape (tuple): The input shape to the model.
             num_outputs (int): The size of the output vector of the model.
             options (dict): Optional args to pass to the model constructor.
 
diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py
index bd0fdf018..1d950506b 100644
--- a/python/ray/rllib/models/lstm.py
+++ b/python/ray/rllib/models/lstm.py
@@ -13,6 +13,9 @@ from ray.rllib.models.model import Model
 
 
 class LSTM(Model):
+    """Vision LSTM network based on the implementation here:
+    https://github.com/openai/universe-starter-agent"""
+
     # TODO(rliaw): Add LSTM code for other algorithms
     def _init(self, inputs, num_outputs, options):
         use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >=
diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py
index a8cfedc33..b1c5145d8 100644
--- a/python/ray/rllib/models/model.py
+++ b/python/ray/rllib/models/model.py
@@ -15,7 +15,7 @@ class Model(object):
     The last layer of the network can also be retrieved if the algorithm
     needs to further post-processing (e.g. Actor and Critic networks in A3C).
 
-    If options["free_log_std"] is True, the last half of the
+    If ``options["free_log_std"]`` is True, the last half of the
     output layer will be free variables that are not dependent on
     inputs. This is often used if the output of the network is used
     to parametrize a probability distribution. In this case, the