Patch summary (reviewer notes; this text precedes the first "diff --git"
header and is not part of any hunk):

  1. doc/source/conf.py: after the mocked modules are installed, set
     sys.modules["tensorflow"].VERSION = "9.9.9"; per the in-line comment,
     ray.rllib.models.action_dist and ray.rllib.models.lstm use tf.VERSION.
  2. agents/impala/impala.py: DEFAULT_CONFIG gains "opt_type" (default
     "adam"), "decay", "momentum" and "epsilon"; the default "lr" changes
     from 0.0001 to 0.0005.
  3. agents/impala/vtrace_policy_graph.py: optimizer() returns an
     AdamOptimizer when config["opt_type"] == "adam" and an RMSPropOptimizer
     otherwise.
     NOTE(review): every other value, including a misspelt one, silently
     selects RMSProp; consider rejecting unknown values.
  4. ddpg2/common/__init__.py: new file, no content hunks.
  5. models/action_dist.py: Categorical.entropy() and Categorical.kl() pass
     keepdims= when tf.VERSION >= "1.5.0" and keep_dims= otherwise.

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 1b113b71a..848b26c14 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -56,6 +56,9 @@ MOCK_MODULES = ["gym",
                 "ray.core.generated.TablePubsub",]
 for mod_name in MOCK_MODULES:
     sys.modules[mod_name] = mock.Mock()
+# ray.rllib.models.action_dist.py and
+# ray.rllib.models.lstm.py will use tf.VERSION
+sys.modules["tensorflow"].VERSION = "9.9.9"
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py
index f5ebe6ec7..99322d1d3 100644
--- a/python/ray/rllib/agents/impala/impala.py
+++ b/python/ray/rllib/agents/impala/impala.py
@@ -37,7 +37,14 @@ DEFAULT_CONFIG = with_common_config({
 
     # Learning params.
     "grad_clip": 40.0,
-    "lr": 0.0001,
+    # either "adam" or "rmsprop"
+    "opt_type": "adam",
+    "lr": 0.0005,
+    # rmsprop considered
+    "decay": 0.99,
+    "momentum": 0.0,
+    "epsilon": 0.1,
+    # balancing the three losses
     "vf_loss_coeff": 0.5,
     "entropy_coeff": -0.01,
 
diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
index 0b9c46c9a..bd55d5329 100644
--- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py
+++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
@@ -184,7 +184,12 @@ class VTracePolicyGraph(TFPolicyGraph):
         self.sess.run(tf.global_variables_initializer())
 
     def optimizer(self):
-        return tf.train.AdamOptimizer(self.config["lr"])
+        if self.config["opt_type"] == "adam":
+            return tf.train.AdamOptimizer(self.config["lr"])
+        else:
+            return tf.train.RMSPropOptimizer(
+                self.config["lr"], self.config["decay"],
+                self.config["momentum"], self.config["epsilon"])
 
     def gradients(self, optimizer):
         grads = tf.gradients(self.loss.total_loss, self.var_list)
diff --git a/python/ray/rllib/ddpg2/common/__init__.py b/python/ray/rllib/ddpg2/common/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py
index a88f5fa3a..b104230bf 100644
--- a/python/ray/rllib/models/action_dist.py
+++ b/python/ray/rllib/models/action_dist.py
@@ -4,8 +4,13 @@ from __future__ import print_function
 
 import tensorflow as tf
 import numpy as np
+import distutils.version
+
 from ray.rllib.utils.reshaper import Reshaper
 
+use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
+                 distutils.version.LooseVersion("1.5.0"))
+
 
 class ActionDistribution(object):
     """The policy action distribution of an agent.
@@ -42,22 +47,39 @@ class Categorical(ActionDistribution):
             logits=self.inputs, labels=x)
 
     def entropy(self):
-        a0 = self.inputs - tf.reduce_max(
-            self.inputs, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keepdims=True)
+        else:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keep_dims=True)
         ea0 = tf.exp(a0)
-        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+        else:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
         p0 = ea0 / z0
         return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
 
     def kl(self, other):
-        a0 = self.inputs - tf.reduce_max(
-            self.inputs, reduction_indices=[1], keepdims=True)
-        a1 = other.inputs - tf.reduce_max(
-            other.inputs, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keepdims=True)
+            a1 = other.inputs - tf.reduce_max(
+                other.inputs, reduction_indices=[1], keepdims=True)
+        else:
+            a0 = self.inputs - tf.reduce_max(
+                self.inputs, reduction_indices=[1], keep_dims=True)
+            a1 = other.inputs - tf.reduce_max(
+                other.inputs, reduction_indices=[1], keep_dims=True)
         ea0 = tf.exp(a0)
         ea1 = tf.exp(a1)
-        z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
-        z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
+        if use_tf150_api:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
+            z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
+        else:
+            z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
+            z1 = tf.reduce_sum(ea1, reduction_indices=[1], keep_dims=True)
         p0 = ea0 / z0
         return tf.reduce_sum(
             p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])