Support older TF versions and RMSProp in Impala (#2590)

- Support TF versions < 1.5.
- Support the RMSProp optimizer in Impala.

Before TF 1.5, tf.reduce_sum() and tf.reduce_max() took an argument named keep_dims, which was renamed to keepdims in later versions.

The original Impala paper uses the RMSProp algorithm to optimize the model, so we should support it as well so that users can reproduce the paper's experiments. Without any tuning (i.e., using the same hyper-parameters as with AdamOptimizer), it reaches "episode_reward_mean": 19.083333333333332 in Pong after consuming 3,610,350 samples.
This commit is contained in:
Jones Wong
2018-08-09 19:51:32 -07:00
committed by Eric Liang
parent 170e08cf02
commit 007208d2bb
5 changed files with 48 additions and 11 deletions
+8 -1
View File
@@ -37,7 +37,14 @@ DEFAULT_CONFIG = with_common_config({
# Learning params.
"grad_clip": 40.0,
"lr": 0.0001,
# either "adam" or "rmsprop"
"opt_type": "adam",
"lr": 0.0005,
# The following three settings are only used when opt_type is "rmsprop"
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
# balancing the three losses
"vf_loss_coeff": 0.5,
"entropy_coeff": -0.01,
@@ -184,7 +184,12 @@ class VTracePolicyGraph(TFPolicyGraph):
self.sess.run(tf.global_variables_initializer())
def optimizer(self):
return tf.train.AdamOptimizer(self.config["lr"])
if self.config["opt_type"] == "adam":
return tf.train.AdamOptimizer(self.config["lr"])
else:
return tf.train.RMSPropOptimizer(
self.config["lr"], self.config["decay"],
self.config["momentum"], self.config["epsilon"])
def gradients(self, optimizer):
grads = tf.gradients(self.loss.total_loss, self.var_list)
+31 -9
View File
@@ -4,8 +4,13 @@ from __future__ import print_function
import tensorflow as tf
import numpy as np
import distutils.version
from ray.rllib.utils.reshaper import Reshaper
use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
distutils.version.LooseVersion("1.5.0"))
class ActionDistribution(object):
"""The policy action distribution of an agent.
@@ -42,22 +47,39 @@ class Categorical(ActionDistribution):
logits=self.inputs, labels=x)
def entropy(self):
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keepdims=True)
if use_tf150_api:
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keepdims=True)
else:
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keep_dims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
if use_tf150_api:
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
else:
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
def kl(self, other):
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keepdims=True)
a1 = other.inputs - tf.reduce_max(
other.inputs, reduction_indices=[1], keepdims=True)
if use_tf150_api:
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keepdims=True)
a1 = other.inputs - tf.reduce_max(
other.inputs, reduction_indices=[1], keepdims=True)
else:
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keep_dims=True)
a1 = other.inputs - tf.reduce_max(
other.inputs, reduction_indices=[1], keep_dims=True)
ea0 = tf.exp(a0)
ea1 = tf.exp(a1)
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
if use_tf150_api:
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
else:
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keep_dims=True)
p0 = ea0 / z0
return tf.reduce_sum(
p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])