Support older TF versions and add RMSProp support in Impala (#2590)

This change supports TF versions < 1.5 and adds support for the RMSProp optimizer in Impala. Before TF 1.5, tf.reduce_sum() and tf.reduce_max() took an argument named keep_dims, which was renamed to keepdims in later versions. The original Impala paper uses the RMSProp algorithm to optimize the model, so we should support it as well so that users can reproduce its experiments. Without any tuning, i.e., using the same hyper-parameters as with AdamOptimizer, it reaches "episode_reward_mean": 19.083333333333332 in Pong after consuming 3,610,350 samples.
2026-06-28 00:29:38 +08:00 · 2018-08-09 19:51:32 -07:00
parent 170e08cf02
commit 007208d2bb
5 changed files with 48 additions and 11 deletions
@@ -37,7 +37,14 @@ DEFAULT_CONFIG = with_common_config({

    # Learning params.
    "grad_clip": 40.0,
-    "lr": 0.0001,
+    # either "adam" or "rmsprop"
+    "opt_type": "adam",
+    "lr": 0.0005,
+    # only used when "opt_type" is "rmsprop"
+    "decay": 0.99,
+    "momentum": 0.0,
+    "epsilon": 0.1,
+    # balancing the three losses
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,

@@ -184,7 +184,12 @@ class VTracePolicyGraph(TFPolicyGraph):
        self.sess.run(tf.global_variables_initializer())

    def optimizer(self):
-        return tf.train.AdamOptimizer(self.config["lr"])
+        if self.config["opt_type"] == "adam":
+            return tf.train.AdamOptimizer(self.config["lr"])
+        else:
+            return tf.train.RMSPropOptimizer(
+                self.config["lr"], self.config["decay"],
+                self.config["momentum"], self.config["epsilon"])

    def gradients(self, optimizer):
        grads = tf.gradients(self.loss.total_loss, self.var_list)
@@ -4,8 +4,13 @@ from __future__ import print_function

 import tensorflow as tf
 import numpy as np
+import distutils.version
+
 from ray.rllib.utils.reshaper import Reshaper

+use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
+                 distutils.version.LooseVersion("1.5.0"))
+

 class ActionDistribution(object):
    """The policy action distribution of an agent.
@@ -42,22 +47,39 @@ class Categorical(ActionDistribution):
            logits=self.inputs, labels=x)

    def entropy(self):
-        a0 = self.inputs - tf.reduce_max(
-            self.inputs, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keepdims=True)
+        else:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keep_dims=True)
        ea0 = tf.exp(a0)
-        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+        else:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])

    def kl(self, other):
-        a0 = self.inputs - tf.reduce_max(
-            self.inputs, reduction_indices=[1], keepdims=True)
-        a1 = other.inputs - tf.reduce_max(
-            other.inputs, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keepdims=True)
+            a1 = other.inputs - tf.reduce_max(
+                other.inputs, reduction_indices=[1], keepdims=True)
+        else:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keep_dims=True)
+            a1 = other.inputs - tf.reduce_max(
+                other.inputs, reduction_indices=[1], keep_dims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
-        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
-        z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+            z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
+        else:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
+            z1 = tf.reduce_sum(ea1, reduction_indices=[1], keep_dims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(
            p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])