From b06c604a51ed94b48eead9db964bbc1891061b9f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Sat, 29 Sep 2018 23:13:36 -0700
Subject: [PATCH] [rllib] Add some more tuned atari results to documentation
 (#2991)

* dqn results ++

* add scale

* hour

* fix

* small dqn table

* update

* steps

* upd

* apex

* up

* add apex results

* tip
---
 doc/source/rllib-algorithms.rst               | 109 +++++++++++++++---
 .../ray/rllib/tuned_examples/atari-apex.yaml  |  34 ++++++
 .../rllib/tuned_examples/atari-dist-dqn.yaml  |  31 +++++
 .../ray/rllib/tuned_examples/atari-dqn.yaml   |  33 ++++++
 .../rllib/tuned_examples/atari-duel-ddqn.yaml |  31 +++++
 .../ray/rllib/tuned_examples/atari-ppo.yaml   |   7 +-
 6 files changed, 227 insertions(+), 18 deletions(-)
 create mode 100644 python/ray/rllib/tuned_examples/atari-apex.yaml
 create mode 100644 python/ray/rllib/tuned_examples/atari-dist-dqn.yaml
 create mode 100644 python/ray/rllib/tuned_examples/atari-dqn.yaml
 create mode 100644 python/ray/rllib/tuned_examples/atari-duel-ddqn.yaml

diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst
index 2cbcef178..d764fc7ad 100644
--- a/doc/source/rllib-algorithms.rst
+++ b/doc/source/rllib-algorithms.rst
@@ -10,7 +10,29 @@ Distributed Prioritized Experience Replay (Ape-X)
 `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/dqn/apex.py>`__
 Ape-X variations of DQN and DDPG (`APEX_DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/dqn/apex.py>`__, `APEX_DDPG <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ddpg/apex.py>`__ in RLlib) use a single GPU learner and many CPU workers for experience collection. Experience collection can scale to hundreds of CPU workers due to the distributed prioritization of experience prior to storage in replay buffers.
 
-Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-apex.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml>`__
+Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-apex.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-apex.yaml>`__.
+
+**Atari results @10M steps**: `more details <https://github.com/ray-project/rl-experiments>`__
+
+=============  ================================  ========================================
+ Atari env     RLlib Ape-X 8-workers             Mnih et al Async DQN 16-workers
+=============  ================================  ========================================
+BeamRider      6134                              ~6000
+Breakout       123                               ~50
+Qbert          15302                             ~1200
+SpaceInvaders  686                               ~600
+=============  ================================  ========================================
+
+**Scalability:**
+
+=============  ================================  ========================================
+ Atari env     RLlib Ape-X 8-workers @1 hour     Mnih et al Async DQN 16-workers @1 hour
+=============  ================================  ========================================
+BeamRider      4873                              ~1000
+Breakout       77                                ~10
+Qbert          4083                              ~500
+SpaceInvaders  646                               ~300
+=============  ================================  ========================================
 
 .. figure:: apex.png
 
@@ -23,10 +45,31 @@ Importance Weighted Actor-Learner Architecture (IMPALA)
 `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/impala/impala.py>`__
 In IMPALA, a central learner runs SGD in a tight loop while asynchronously pulling sample batches from many actor processes. RLlib's IMPALA implementation uses DeepMind's reference `V-trace code <https://github.com/deepmind/scalable_agent/blob/master/vtrace.py>`__. Note that we do not provide a deep residual network out of the box, but one can be plugged in as a `custom model <rllib-models.html#custom-models>`__.
 
-Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-impala.yaml>`__, `vectorized configuration <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-impala-vectorized.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-impala.yaml>`__, `Atari results <https://github.com/ray-project/rl-experiments>`__.
+Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-impala.yaml>`__, `vectorized configuration <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-impala-vectorized.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-impala.yaml>`__
+
+**Atari results @10M steps**: `more details <https://github.com/ray-project/rl-experiments>`__
+
+=============  ==================================  ====================================
+ Atari env     RLlib IMPALA 32-workers             Mnih et al A3C 16-workers
+=============  ==================================  ====================================
+BeamRider      2071                                ~3000
+Breakout       385                                 ~150
+Qbert          4068                                ~1000
+SpaceInvaders  719                                 ~600
+=============  ==================================  ====================================
+
+**Scalability:**
+
+=============  ===============================  =================================
+ Atari env     RLlib IMPALA 32-workers @1 hour  Mnih et al A3C 16-workers @1 hour
+=============  ===============================  =================================
+BeamRider      3181                             ~1000
+Breakout       538                              ~10
+Qbert          10850                            ~500
+SpaceInvaders  843                              ~300
+=============  ===============================  =================================
 
 .. figure:: impala.png
-   :align: center
 
    IMPALA solves Atari several times faster than A2C / A3C, with similar sample efficiency. Here IMPALA scales from 16 to 128 workers to solve PongNoFrameskip-v4 in ~8 minutes.
 
@@ -38,10 +81,21 @@ Advantage Actor-Critic (A2C, A3C)
 `[paper] <https://arxiv.org/abs/1602.01783>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/a3c/a3c.py>`__
 RLlib implements A2C and A3C using SyncSamplesOptimizer and AsyncGradientsOptimizer respectively for policy optimization. These algorithms scale to up to 16-32 worker processes depending on the environment. Both a TensorFlow (LSTM), and PyTorch version are available.
 
-.. note::
-    In most cases, `IMPALA <#importance-weighted-actor-learner-architecture-impala>`__ will outperform A2C / A3C. In `benchmarks <https://github.com/ray-project/rl-experiments>`__, IMPALA is almost 10x faster than A2C in wallclock time, with similar sample efficiency.
+Tuned examples: `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-a3c.yaml>`__, `PyTorch version <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-a2c.yaml>`__
 
-Tuned examples: `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-a3c.yaml>`__, `PyTorch version <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-a2c.yaml>`__, `Atari results <https://github.com/ray-project/rl-experiments>`__.
+.. tip::
+    Consider using `IMPALA <#importance-weighted-actor-learner-architecture-impala>`__ for faster training with similar timestep efficiency.
+
+**Atari results @10M steps**: `more details <https://github.com/ray-project/rl-experiments>`__
+
+=============  ========================  ==============================
+ Atari env     RLlib A2C 5-workers       Mnih et al A3C 16-workers
+=============  ========================  ==============================
+BeamRider      1401                      ~3000
+Breakout       374                       ~150
+Qbert          3620                      ~1000
+SpaceInvaders  692                       ~600
+=============  ========================  ==============================
 
 Deep Deterministic Policy Gradients (DDPG)
 ------------------------------------------
@@ -53,9 +107,23 @@ Tuned examples: `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/pyt
 Deep Q Networks (DQN, Rainbow)
 ------------------------------
 `[paper] <https://arxiv.org/abs/1312.5602>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/dqn/dqn.py>`__
-RLlib DQN is implemented using the SyncReplayOptimizer. The algorithm can be scaled by increasing the number of workers, using the AsyncGradientsOptimizer for async DQN, or using Ape-X. Memory usage is reduced by compressing samples in the replay buffer with LZ4. All of the DQN improvements evaluated in `Rainbow <https://arxiv.org/abs/1710.02298>`__ are available, though not all are enabled by default. For more details, see these `DQN ablation experiments <https://github.com/ray-project/ray/pull/2701#issuecomment-415651381>`__.
+RLlib DQN is implemented using the SyncReplayOptimizer. The algorithm can be scaled by increasing the number of workers, using the AsyncGradientsOptimizer for async DQN, or using Ape-X. Memory usage is reduced by compressing samples in the replay buffer with LZ4. All of the DQN improvements evaluated in `Rainbow <https://arxiv.org/abs/1710.02298>`__ are available, though not all are enabled by default.
 
-Tuned examples: `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-dqn.yaml>`__, `Rainbow configuration <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-rainbow.yaml>`__
+Tuned examples: `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-dqn.yaml>`__, `Rainbow configuration <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-rainbow.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-dqn.yaml>`__, `with Dueling and Double-Q <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-duel-ddqn.yaml>`__, `with Distributional DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-dist-dqn.yaml>`__.
+
+.. tip::
+    Consider using `Ape-X <#distributed-prioritized-experience-replay-ape-x>`__ for faster training with similar timestep efficiency.
+
+**Atari results @10M steps**: `more details <https://github.com/ray-project/rl-experiments>`__
+
+=============  ========================  =============================  ==============================  ===============================
+ Atari env     RLlib DQN                 RLlib Dueling DDQN             RLlib Dist. DQN                 Hessel et al. DQN
+=============  ========================  =============================  ==============================  ===============================
+BeamRider      2869                      1910                           4447                            ~2000
+Breakout       287                       312                            410                             ~150
+Qbert          3921                      7968                           15780                           ~4000
+SpaceInvaders  650                       1001                           1025                            ~500
+=============  ========================  =============================  ==============================  ===============================
 
 Policy Gradients
 ----------------
@@ -68,13 +136,27 @@ Proximal Policy Optimization (PPO)
 `[paper] <https://arxiv.org/abs/1707.06347>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ppo/ppo.py>`__
 PPO's clipped objective supports multiple SGD passes over the same batch of experiences. RLlib's multi-GPU optimizer pins that data in GPU memory to avoid unnecessary transfers from host memory, substantially improving performance over a naive implementation. RLlib's PPO scales out using multiple workers for experience collection, and also with multiple GPUs for SGD.
 
-Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml>`__, `Hopper-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/hopper-ppo.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-ppo.yaml>`__, `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-ppo.yaml>`__, `Walker2d-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/walker2d-ppo.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-ppo.yaml>`__, `Atari results <https://github.com/ray-project/rl-experiments>`__.
+Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml>`__, `Hopper-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/hopper-ppo.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pendulum-ppo.yaml>`__, `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-ppo.yaml>`__, `Walker2d-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/walker2d-ppo.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/atari-ppo.yaml>`__
+
+
+**Atari results**: `more details <https://github.com/ray-project/rl-experiments>`__
+
+=============  ==============  ==============  ==================
+ Atari env     RLlib PPO @10M  RLlib PPO @25M  Baselines PPO @10M
+=============  ==============  ==============  ==================
+BeamRider      2807            4480            ~1800
+Breakout       104             201             ~250
+Qbert          11085           14247           ~14000
+SpaceInvaders  671             944             ~800
+=============  ==============  ==============  ==================
+
+
+**Scalability:**
 
 .. figure:: ppo.png
    :width: 500px
-   :align: center
 
-   RLlib's multi-GPU PPO scales to multiple GPUs and hundreds of CPUs. Here we compare against a reference MPI-based implementation.
+   RLlib's multi-GPU PPO scales to multiple GPUs and hundreds of CPUs when solving the Humanoid-v1 task. Here we compare against a reference MPI-based implementation.
 
 Derivative-free
 ~~~~~~~~~~~~~~~
@@ -93,8 +175,9 @@ Code here is adapted from https://github.com/openai/evolution-strategies-starter
 
 Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/humanoid-es.yaml>`__
 
+**Scalability:**
+
 .. figure:: es.png
    :width: 500px
-   :align: center
 
-   RLlib's ES implementation scales further and is faster than a reference Redis implementation.
+   RLlib's ES implementation scales further and is faster than a reference Redis implementation when solving the Humanoid-v1 task.
diff --git a/python/ray/rllib/tuned_examples/atari-apex.yaml b/python/ray/rllib/tuned_examples/atari-apex.yaml
new file mode 100644
index 000000000..6e538d038
--- /dev/null
+++ b/python/ray/rllib/tuned_examples/atari-apex.yaml
@@ -0,0 +1,34 @@
+# Runs on a single g3.16xl AWS machine
+apex:
+    env:
+        grid_search:
+            - BreakoutNoFrameskip-v4
+            - BeamRiderNoFrameskip-v4
+            - QbertNoFrameskip-v4
+            - SpaceInvadersNoFrameskip-v4
+    run: APEX
+    config:
+        double_q: false
+        dueling: false
+        num_atoms: 1
+        noisy: false
+        n_step: 3
+        lr: .0001
+        adam_epsilon: .00015
+        hiddens: [512]
+        buffer_size: 1000000
+        schedule_max_timesteps: 2000000
+        exploration_final_eps: 0.01
+        exploration_fraction: .1
+        prioritized_replay_alpha: 0.5
+        beta_annealing_fraction: 1.0
+        final_prioritized_replay_beta: 1.0
+        gpu: true  # Ape-X is documented as using a single GPU learner
+
+        # APEX
+        num_workers: 8  # same worker count as the "Ape-X 8-workers" docs tables
+        num_envs_per_worker: 8
+        sample_batch_size: 158  # NOTE(review): unusual value; confirm it is intended
+        train_batch_size: 512
+        target_network_update_freq: 50000
+        timesteps_per_iteration: 25000
diff --git a/python/ray/rllib/tuned_examples/atari-dist-dqn.yaml b/python/ray/rllib/tuned_examples/atari-dist-dqn.yaml
new file mode 100644
index 000000000..d71932986
--- /dev/null
+++ b/python/ray/rllib/tuned_examples/atari-dist-dqn.yaml
@@ -0,0 +1,31 @@
+atari-dist-dqn:
+    env:
+        grid_search:
+            - BreakoutNoFrameskip-v4
+            - BeamRiderNoFrameskip-v4
+            - QbertNoFrameskip-v4
+            - SpaceInvadersNoFrameskip-v4
+    run: DQN
+    config:
+        double_q: false
+        dueling: false
+        num_atoms: 51  # distributional DQN; atari-dqn.yaml uses 1 here
+        noisy: false
+        prioritized_replay: false
+        n_step: 1
+        target_network_update_freq: 8000
+        lr: .0000625
+        adam_epsilon: .00015
+        hiddens: [512]
+        learning_starts: 20000
+        buffer_size: 1000000
+        sample_batch_size: 4
+        train_batch_size: 32
+        schedule_max_timesteps: 2000000
+        exploration_final_eps: 0.01
+        exploration_fraction: .1
+        prioritized_replay_alpha: 0.5  # presumably unused while prioritized_replay is false; confirm
+        beta_annealing_fraction: 1.0
+        final_prioritized_replay_beta: 1.0
+        gpu: true
+        timesteps_per_iteration: 10000
diff --git a/python/ray/rllib/tuned_examples/atari-dqn.yaml b/python/ray/rllib/tuned_examples/atari-dqn.yaml
new file mode 100644
index 000000000..492901787
--- /dev/null
+++ b/python/ray/rllib/tuned_examples/atari-dqn.yaml
@@ -0,0 +1,33 @@
+# Runs on a single g3.16xl node
+# See https://github.com/ray-project/rl-experiments for results
+atari-basic-dqn:
+    env:
+        grid_search:
+            - BreakoutNoFrameskip-v4
+            - BeamRiderNoFrameskip-v4
+            - QbertNoFrameskip-v4
+            - SpaceInvadersNoFrameskip-v4
+    run: DQN
+    config:
+        double_q: false  # baseline DQN: the optional extensions below are all switched off
+        dueling: false
+        num_atoms: 1  # atari-dist-dqn.yaml sets 51 here; every other value is identical
+        noisy: false
+        prioritized_replay: false
+        n_step: 1
+        target_network_update_freq: 8000
+        lr: .0000625
+        adam_epsilon: .00015
+        hiddens: [512]
+        learning_starts: 20000
+        buffer_size: 1000000
+        sample_batch_size: 4
+        train_batch_size: 32
+        schedule_max_timesteps: 2000000
+        exploration_final_eps: 0.01
+        exploration_fraction: .1
+        prioritized_replay_alpha: 0.5  # presumably unused while prioritized_replay is false; confirm
+        beta_annealing_fraction: 1.0
+        final_prioritized_replay_beta: 1.0
+        gpu: true
+        timesteps_per_iteration: 10000
diff --git a/python/ray/rllib/tuned_examples/atari-duel-ddqn.yaml b/python/ray/rllib/tuned_examples/atari-duel-ddqn.yaml
new file mode 100644
index 000000000..61ed3120d
--- /dev/null
+++ b/python/ray/rllib/tuned_examples/atari-duel-ddqn.yaml
@@ -0,0 +1,31 @@
+dueling-ddqn:
+    env:
+        grid_search:
+            - BreakoutNoFrameskip-v4
+            - BeamRiderNoFrameskip-v4
+            - QbertNoFrameskip-v4
+            - SpaceInvadersNoFrameskip-v4
+    run: DQN
+    config:
+        double_q: true  # with dueling below, the only values that differ from atari-dqn.yaml
+        dueling: true
+        num_atoms: 1
+        noisy: false
+        prioritized_replay: false
+        n_step: 1
+        target_network_update_freq: 8000
+        lr: .0000625
+        adam_epsilon: .00015
+        hiddens: [512]
+        learning_starts: 20000
+        buffer_size: 1000000
+        sample_batch_size: 4
+        train_batch_size: 32
+        schedule_max_timesteps: 2000000
+        exploration_final_eps: 0.01
+        exploration_fraction: .1
+        prioritized_replay_alpha: 0.5  # presumably unused while prioritized_replay is false; confirm
+        beta_annealing_fraction: 1.0
+        final_prioritized_replay_beta: 1.0
+        gpu: true
+        timesteps_per_iteration: 10000
diff --git a/python/ray/rllib/tuned_examples/atari-ppo.yaml b/python/ray/rllib/tuned_examples/atari-ppo.yaml
index 0ee6929aa..24593d6bb 100644
--- a/python/ray/rllib/tuned_examples/atari-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/atari-ppo.yaml
@@ -11,8 +11,9 @@ atari-ppo:
     config:
         lambda: 0.95
         kl_coeff: 0.5
-        clip_param: 0.1
         clip_rewards: True
+        clip_param: 0.1
+        vf_clip_param: 10.0
         entropy_coeff: 0.01
         train_batch_size: 5000
         sample_batch_size: 500
@@ -24,7 +25,3 @@ atari-ppo:
         observation_filter: NoFilter
         vf_share_layers: true
         num_gpus: 1
-        lr_schedule: [
-            [0, 0.0007],
-            [20000000, 0.000000000001],
-        ]