From 1eb8c833141b919aa4e60918dfecbace03a4fe68 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 12 Sep 2017 23:38:21 -0700 Subject: [PATCH] [rllib] Initial RLLib documentation (#969) * initial documentation for RLLib * more RL documentation * fix linting * fix comments * update * fix --- doc/source/conf.py | 8 +- doc/source/index.rst | 1 + doc/source/rllib.rst | 159 +++++++++++++++++++++++++ python/ray/rllib/models/__init__.py | 11 +- python/ray/rllib/models/action_dist.py | 4 + python/ray/rllib/models/lstm.py | 6 +- 6 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 doc/source/rllib.rst diff --git a/doc/source/conf.py b/doc/source/conf.py index b84b8229c..c27242a57 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -18,8 +18,14 @@ import shlex # These lines added to enable Sphinx to work without installing Ray. import mock -MOCK_MODULES = ["pyarrow", +MOCK_MODULES = ["gym", + "tensorflow", + "tensorflow.contrib", + "tensorflow.contrib.slim", + "tensorflow.contrib.rnn", + "pyarrow", "pyarrow.plasma", + "smart_open", "ray.local_scheduler", "ray.plasma", "ray.core.generated.TaskInfo", diff --git a/doc/source/index.rst b/doc/source/index.rst index 810dc4559..50579df40 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -20,6 +20,7 @@ Ray api.rst actors.rst using-ray-with-gpus.rst + rllib.rst .. toctree:: :maxdepth: 1 diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst new file mode 100644 index 000000000..0f6d521a5 --- /dev/null +++ b/doc/source/rllib.rst @@ -0,0 +1,159 @@ +RLLib: Ray's scalable reinforcement learning library +==================================================== + +This document describes Ray's reinforcement learning library. +It currently supports the following algorithms: + +- `Proximal Policy Optimization `__ which + is a proximal variant of `TRPO `__. + +- Evolution Strategies which is described in `this + paper `__. Our implementation + borrows code from + `here `__. 
+ +- `The Asynchronous Advantage Actor-Critic `__ + based on `the OpenAI starter agent `__. + +Proximal Policy Optimization scales to hundreds of cores and several GPUs, +Evolution Strategies to clusters with thousands of cores and +the Asynchronous Advantage Actor-Critic scales to dozens of cores +on a single node. + +These algorithms can be run on any OpenAI gym MDP, including custom ones written +and registered by the user. + +Getting Started +--------------- + +You can run training with + +:: + + python ray/python/ray/rllib/train.py --env CartPole-v0 --alg PPO --config '{"timesteps_per_batch": 10000}' + +By default, the results will be logged to a subdirectory of ``/tmp/ray``. +This subdirectory will contain a file ``config.json`` which contains the +hyperparameters, a file ``result.json`` which contains a training summary +for each episode and a TensorBoard file that can be used to visualize +training process with TensorBoard by running + +:: + + tensorboard --logdir=/tmp/ray + + +The ``train.py`` script has a number of options you can show by running + +:: + + python ray/python/ray/rllib/train.py --help + +The most important options are for choosing the environment +with ``--env`` (any OpenAI gym environment including ones registered by the user +can be used) and for choosing the algorithm with ``--alg`` +(available options are ``PPO``, ``A3C``, ``ES`` and ``DQN``). Each algorithm +has specific hyperparameters that can be set with ``--config``, see the +``DEFAULT_CONFIG`` variable in +`PPO `__, +`A3C `__, +`ES `__ and +`DQN `__. + + +Examples +-------- + +Some good hyperparameters and settings are available in +`the repository `__ +(some of them are tuned to run on GPUs). If you find better settings or tune +an algorithm on a different domain, consider submitting a Pull Request! + +The User API +------------ + +You will be using this part of the API if you run the existing algorithms +on a new problem. Note that the API is not considered to be stable yet. 
+Here is an example of how to use it: + +:: + + import ray + import ray.rllib.ppo as ppo + + ray.init() + + config = ppo.DEFAULT_CONFIG.copy() + alg = ppo.PPOAgent("CartPole-v1", config) + + # Can optionally call alg.restore(path) to load a checkpoint. + + for i in range(10): + # Perform one iteration of the algorithm. + result = alg.train() + print("result: {}".format(result)) + print("checkpoint saved at path: {}".format(alg.save())) + +The Developer API +----------------- + +This part of the API will be useful if you need to change existing RL algorithms +or implement new ones. Note that the API is not considered to be stable yet. + +Agents +~~~~~~ + +Agents implement a particular algorithm and can be used to run +some number of iterations of the algorithm, save and load the state +of training and evaluate the current policy. All agents inherit from +a common base class: + +.. autoclass:: ray.rllib.common.Agent + :members: + +Models +~~~~~~ + +Models are subclasses of the Model class: + +.. autoclass:: ray.rllib.models.Model + +Currently we support fully connected policies, convolutional policies and +LSTMs: + +.. autofunction:: ray.rllib.models.FullyConnectedNetwork +.. autofunction:: ray.rllib.models.ConvolutionalNetwork +.. autofunction:: ray.rllib.models.LSTM + +Action Distributions +~~~~~~~~~~~~~~~~~~~~ + +Actions can be sampled from different distributions; they have a common base +class: + +.. autoclass:: ray.rllib.models.ActionDistribution + :members: + +Currently we support the following action distributions: + +.. autofunction:: ray.rllib.models.Categorical +.. autofunction:: ray.rllib.models.DiagGaussian +.. autofunction:: ray.rllib.models.Deterministic + +The Model Catalog +~~~~~~~~~~~~~~~~~ + +To make picking the right action distribution and models easier, there is +a mechanism to pick good default values for various gym environments. + +.. 
autoclass:: ray.rllib.models.ModelCatalog + :members: + +Using RLLib on a cluster +------------------------ + +First create a cluster as described in `managing a cluster with parallel ssh`_. +You can then run RLLib on this cluster by passing the address of the main redis +shard into ``train.py`` with ``--redis-address``. + +.. _`managing a cluster with parallel ssh`: http://ray.readthedocs.io/en/latest/using-ray-on-a-large-cluster.html diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py index 7ed71e5e6..61e554823 100644 --- a/python/ray/rllib/models/__init__.py +++ b/python/ray/rllib/models/__init__.py @@ -1,3 +1,12 @@ from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.action_dist import (ActionDistribution, Categorical, + DiagGaussian, Deterministic) +from ray.rllib.models.model import Model +from ray.rllib.models.fcnet import FullyConnectedNetwork +from ray.rllib.models.convnet import ConvolutionalNetwork +from ray.rllib.models.lstm import LSTM -__all__ = ["ModelCatalog"] + +__all__ = ["ActionDistribution", "ActionDistribution", "Categorical", + "DiagGaussian", "Deterministic", "ModelCatalog", "Model", + "FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM"] diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index 26455add8..844761728 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -17,15 +17,19 @@ class ActionDistribution(object): self.inputs = inputs def logp(self, x): + """The log-likelihood of the action distribution.""" raise NotImplementedError def kl(self, other): + """The KL-divergence between two action distributions.""" raise NotImplementedError def entropy(self): + """The entropy of the action distribution.""" raise NotImplementedError def sample(self): + """Draw a sample from the action distribution.""" raise NotImplementedError diff --git a/python/ray/rllib/models/lstm.py 
b/python/ray/rllib/models/lstm.py index 75a5a9045..bd0fdf018 100644 --- a/python/ray/rllib/models/lstm.py +++ b/python/ray/rllib/models/lstm.py @@ -11,13 +11,13 @@ from ray.rllib.models.misc import (conv2d, linear, flatten, normc_initializer) from ray.rllib.models.model import Model -use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >= - distutils.version.LooseVersion("1.0.0")) - class LSTM(Model): # TODO(rliaw): Add LSTM code for other algorithms def _init(self, inputs, num_outputs, options): + use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >= + distutils.version.LooseVersion("1.0.0")) + self.x = x = inputs for i in range(4): x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))