From 007208d2bb98aa11e63f0273fcee0471982f06b7 Mon Sep 17 00:00:00 2001
From: Jones Wong <joneswong@users.noreply.github.com>
Date: Thu, 9 Aug 2018 19:51:32 -0700
Subject: [PATCH] Support older version TF and Support RMSProp in Impala
 (#2590)

Support TF versions < 1.5.
Support the RMSProp optimizer in Impala.

Before TF 1.5, tf.reduce_sum() and tf.reduce_max() took an argument named keep_dims, which was renamed to keepdims in later versions.

The original Impala paper uses the RMSProp algorithm to optimize the model, so we should support it as well so that users can reproduce the paper's experiments. Without any tuning (i.e. using the same hyper-parameters as with AdamOptimizer), it reaches "episode_reward_mean": 19.083333333333332 in Pong after consuming 3,610,350 samples.
---
 doc/source/conf.py                            |  3 ++
 python/ray/rllib/agents/impala/impala.py      |  9 ++++-
 .../agents/impala/vtrace_policy_graph.py      |  7 +++-
 python/ray/rllib/ddpg2/common/__init__.py     |  0
 python/ray/rllib/models/action_dist.py        | 40 ++++++++++++++-----
 5 files changed, 48 insertions(+), 11 deletions(-)
 create mode 100644 python/ray/rllib/ddpg2/common/__init__.py

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 1b113b71a..848b26c14 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -56,6 +56,9 @@ MOCK_MODULES = ["gym",
                 "ray.core.generated.TablePubsub",]
 for mod_name in MOCK_MODULES:
   sys.modules[mod_name] = mock.Mock()
+# ray.rllib.models.action_dist and ray.rllib.models.lstm read tf.VERSION,
+# so the mocked tensorflow module needs a parseable version string.
+sys.modules["tensorflow"].VERSION = "9.9.9"
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py
index f5ebe6ec7..99322d1d3 100644
--- a/python/ray/rllib/agents/impala/impala.py
+++ b/python/ray/rllib/agents/impala/impala.py
@@ -37,7 +37,14 @@ DEFAULT_CONFIG = with_common_config({
 
     # Learning params.
     "grad_clip": 40.0,
-    "lr": 0.0001,
+    # either "adam" or "rmsprop"
+    "opt_type": "adam",
+    "lr": 0.0005,
+    # parameters passed to the RMSProp optimizer
+    "decay": 0.99,
+    "momentum": 0.0,
+    "epsilon": 0.1,
+    # balancing the three losses
     "vf_loss_coeff": 0.5,
     "entropy_coeff": -0.01,
 
diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
index 0b9c46c9a..bd55d5329 100644
--- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py
+++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
@@ -184,7 +184,12 @@ class VTracePolicyGraph(TFPolicyGraph):
         self.sess.run(tf.global_variables_initializer())
 
     def optimizer(self):
-        return tf.train.AdamOptimizer(self.config["lr"])
+        if self.config["opt_type"] == "adam":
+            return tf.train.AdamOptimizer(self.config["lr"])
+        else:
+            return tf.train.RMSPropOptimizer(
+                self.config["lr"], self.config["decay"],
+                self.config["momentum"], self.config["epsilon"])
 
     def gradients(self, optimizer):
         grads = tf.gradients(self.loss.total_loss, self.var_list)
diff --git a/python/ray/rllib/ddpg2/common/__init__.py b/python/ray/rllib/ddpg2/common/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py
index a88f5fa3a..b104230bf 100644
--- a/python/ray/rllib/models/action_dist.py
+++ b/python/ray/rllib/models/action_dist.py
@@ -4,8 +4,13 @@ from __future__ import print_function
 
 import tensorflow as tf
 import numpy as np
+import distutils.version
+
 from ray.rllib.utils.reshaper import Reshaper
 
+use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
+                 distutils.version.LooseVersion("1.5.0"))
+
 
 class ActionDistribution(object):
     """The policy action distribution of an agent.
@@ -42,22 +47,39 @@ class Categorical(ActionDistribution):
             logits=self.inputs, labels=x)
 
     def entropy(self):
-        a0 = self.inputs - tf.reduce_max(
-            self.inputs, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keepdims=True)
+        else:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keep_dims=True)
         ea0 = tf.exp(a0)
-        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+        else:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
         p0 = ea0 / z0
         return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
 
     def kl(self, other):
-        a0 = self.inputs - tf.reduce_max(
-            self.inputs, reduction_indices=[1], keepdims=True)
-        a1 = other.inputs - tf.reduce_max(
-            other.inputs, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keepdims=True)
+            a1 = other.inputs - tf.reduce_max(
+                other.inputs, reduction_indices=[1], keepdims=True)
+        else:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keep_dims=True)
+            a1 = other.inputs - tf.reduce_max(
+                other.inputs, reduction_indices=[1], keep_dims=True)
         ea0 = tf.exp(a0)
         ea1 = tf.exp(a1)
-        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
-        z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+            z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
+        else:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
+            z1 = tf.reduce_sum(ea1, reduction_indices=[1], keep_dims=True)
         p0 = ea0 / z0
         return tf.reduce_sum(
             p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])