mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 00:29:38 +08:00
Support older TF versions and RMSProp in IMPALA (#2590)
This change supports TF versions < 1.5 and adds support for the RMSProp optimizer in IMPALA. Before TF 1.5, tf.reduce_sum() and tf.reduce_max() took an argument named keep_dims, which was renamed to keepdims in later versions. The original IMPALA paper uses the RMSProp algorithm to optimize the model, so we should support it as well so that users can reproduce its experiments. Without any tuning, i.e. using the same hyper-parameters as with AdamOptimizer, it reaches "episode_reward_mean": 19.083333333333332 in Pong after consuming 3,610,350 samples.
This commit is contained in:
@@ -37,7 +37,14 @@ DEFAULT_CONFIG = with_common_config({
|
||||
|
||||
# Learning params.
|
||||
"grad_clip": 40.0,
|
||||
"lr": 0.0001,
|
||||
# either "adam" or "rmsprop"
|
||||
"opt_type": "adam",
|
||||
"lr": 0.0005,
|
||||
# RMSProp-only settings (not read when opt_type is "adam")
|
||||
"decay": 0.99,
|
||||
"momentum": 0.0,
|
||||
"epsilon": 0.1,
|
||||
# balancing the three losses
|
||||
"vf_loss_coeff": 0.5,
|
||||
"entropy_coeff": -0.01,
|
||||
|
||||
|
||||
@@ -184,7 +184,12 @@ class VTracePolicyGraph(TFPolicyGraph):
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def optimizer(self):
|
||||
return tf.train.AdamOptimizer(self.config["lr"])
|
||||
if self.config["opt_type"] == "adam":
|
||||
return tf.train.AdamOptimizer(self.config["lr"])
|
||||
else:
|
||||
return tf.train.RMSPropOptimizer(
|
||||
self.config["lr"], self.config["decay"],
|
||||
self.config["momentum"], self.config["epsilon"])
|
||||
|
||||
def gradients(self, optimizer):
|
||||
grads = tf.gradients(self.loss.total_loss, self.var_list)
|
||||
|
||||
@@ -4,8 +4,13 @@ from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import distutils.version
|
||||
|
||||
from ray.rllib.utils.reshaper import Reshaper
|
||||
|
||||
use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
|
||||
distutils.version.LooseVersion("1.5.0"))
|
||||
|
||||
|
||||
class ActionDistribution(object):
|
||||
"""The policy action distribution of an agent.
|
||||
@@ -42,22 +47,39 @@ class Categorical(ActionDistribution):
|
||||
logits=self.inputs, labels=x)
|
||||
|
||||
def entropy(self):
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keepdims=True)
|
||||
if use_tf150_api:
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keepdims=True)
|
||||
else:
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
|
||||
if use_tf150_api:
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
|
||||
else:
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
|
||||
|
||||
def kl(self, other):
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keepdims=True)
|
||||
a1 = other.inputs - tf.reduce_max(
|
||||
other.inputs, reduction_indices=[1], keepdims=True)
|
||||
if use_tf150_api:
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keepdims=True)
|
||||
a1 = other.inputs - tf.reduce_max(
|
||||
other.inputs, reduction_indices=[1], keepdims=True)
|
||||
else:
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keep_dims=True)
|
||||
a1 = other.inputs - tf.reduce_max(
|
||||
other.inputs, reduction_indices=[1], keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
ea1 = tf.exp(a1)
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
|
||||
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
|
||||
if use_tf150_api:
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
|
||||
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
|
||||
else:
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
|
||||
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keep_dims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(
|
||||
p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])
|
||||
|
||||
Reference in New Issue
Block a user