From f1c55497ce2727c68f15b04ea997ce31eb7789ec Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Fri, 28 Sep 2018 15:22:33 -0700
Subject: [PATCH] [rllib] Fix edge case in n-step calculation and non-apex
 replay prioritization (#2929)

* fix

* lint
---
 .../ray/rllib/agents/dqn/dqn_policy_graph.py  | 23 ++++++++-----------
 .../rllib/optimizers/sync_replay_optimizer.py | 10 ++++----
 python/ray/rllib/test/test_evaluators.py      | 15 ++++++------
 3 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
index b9fd6468f..a2e8c8022 100644
--- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py
+++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -453,21 +453,18 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
 
     The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.
 
-    If the episode finishes, the reward will be truncated. After this rewrite,
-    all the arrays will be shortened by (n_step - 1).
+    At the end of the trajectory, n is truncated to fit in the traj length.
     """
-    for i in range(len(rewards) - n_step + 1):
-        if dones[i]:
-            continue  # episode end
+
+    assert not any(dones[:-1]), "Unexpected done in middle of trajectory"
+
+    traj_length = len(rewards)
+    for i in range(traj_length):
         for j in range(1, n_step):
-            new_obs[i] = new_obs[i + j]
-            rewards[i] += gamma**j * rewards[i + j]
-            if dones[i + j]:
-                break  # episode end
-    # truncate ends of the trajectory
-    new_len = len(obs) - n_step + 1
-    for arr in [obs, actions, rewards, new_obs, dones]:
-        del arr[new_len:]
+            if i + j < traj_length:
+                new_obs[i] = new_obs[i + j]
+                dones[i] = dones[i + j]
+                rewards[i] += gamma**j * rewards[i + j]
 
 
 def _postprocess_dqn(policy_graph, sample_batch):
diff --git a/python/ray/rllib/optimizers/sync_replay_optimizer.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py
index 2993f883f..73df00601 100644
--- a/python/ray/rllib/optimizers/sync_replay_optimizer.py
+++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py
@@ -92,13 +92,13 @@ class SyncReplayOptimizer(PolicyOptimizer):
 
             for policy_id, s in batch.policy_batches.items():
                 for row in s.rows():
-                    if "weights" not in row:
-                        row["weights"] = np.ones_like(row["rewards"])
                     self.replay_buffers[policy_id].add(
                         pack_if_needed(row["obs"]),
-                        row["actions"], row["rewards"],
-                        pack_if_needed(row["new_obs"]), row["dones"],
-                        row["weights"])
+                        row["actions"],
+                        row["rewards"],
+                        pack_if_needed(row["new_obs"]),
+                        row["dones"],
+                        weight=None)
 
         if self.num_steps_sampled >= self.replay_starts:
             self._optimize()
diff --git a/python/ray/rllib/test/test_evaluators.py b/python/ray/rllib/test/test_evaluators.py
index 6d493099e..9ae0994f3 100644
--- a/python/ray/rllib/test/test_evaluators.py
+++ b/python/ray/rllib/test/test_evaluators.py
@@ -11,15 +11,16 @@ class DQNTest(unittest.TestCase):
     def testNStep(self):
         obs = [1, 2, 3, 4, 5, 6, 7]
         actions = ["a", "b", "a", "a", "a", "b", "a"]
-        rewards = [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100000.0]
+        rewards = [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0]
         new_obs = [2, 3, 4, 5, 6, 7, 8]
-        dones = [1, 0, 0, 0, 0, 1, 0]
+        dones = [0, 0, 0, 0, 0, 0, 1]
         adjust_nstep(3, 0.9, obs, actions, rewards, new_obs, dones)
-        self.assertEqual(obs, [1, 2, 3, 4, 5])
-        self.assertEqual(actions, ["a", "b", "a", "a", "a"])
-        self.assertEqual(rewards, [10.0, 171.0, 271.0, 271.0, 190.0])
-        self.assertEqual(new_obs, [2, 5, 6, 7, 7])
-        self.assertEqual(dones, [1, 0, 0, 0, 0])
+        self.assertEqual(obs, [1, 2, 3, 4, 5, 6, 7])
+        self.assertEqual(actions, ["a", "b", "a", "a", "a", "b", "a"])
+        self.assertEqual(new_obs, [4, 5, 6, 7, 8, 8, 8])
+        self.assertEqual(dones, [0, 0, 0, 0, 1, 1, 1])
+        self.assertEqual(rewards,
+                         [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0])
 
 
 if __name__ == '__main__':