Rename "logstd" to "log_std" and move the free_log_std option into Model.

Purpose: the "free_log_std" model option (state-independent log standard
deviation for Gaussian policies) is hoisted from FullyConnectedNetwork into
the Model base class so every model type supports it, and the "logstd"
spelling is renamed to "log_std" throughout.

Review notes on this revision of the patch:
  * Line structure restored so the hunks can be parsed by git apply / patch.
  * catalog.py: the mutable default "options=dict()" is not introduced;
    get_model keeps its existing None handling and ConvolutionalNetwork
    gains the same normalisation (Model.__init__ now calls options.get).
  * model.py: the evenness check raises ValueError instead of using assert
    (asserts are removed under python -O); the flag is read only once.
  * The "index" header is omitted for catalog.py and model.py because their
    post-image content differs from the originally recorded blob.

diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py
index d1dc8687f..26455add8 100644
--- a/python/ray/rllib/models/action_dist.py
+++ b/python/ray/rllib/models/action_dist.py
@@ -70,27 +70,27 @@ class DiagGaussian(ActionDistribution):
 
     def __init__(self, inputs):
         ActionDistribution.__init__(self, inputs)
-        mean, logstd = tf.split(inputs, 2, axis=1)
+        mean, log_std = tf.split(inputs, 2, axis=1)
         self.mean = mean
-        self.logstd = logstd
-        self.std = tf.exp(logstd)
+        self.log_std = log_std
+        self.std = tf.exp(log_std)
 
     def logp(self, x):
         return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
                                      reduction_indices=[1]) -
                 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
-                tf.reduce_sum(self.logstd, reduction_indices=[1]))
+                tf.reduce_sum(self.log_std, reduction_indices=[1]))
 
     def kl(self, other):
         assert isinstance(other, DiagGaussian)
-        return tf.reduce_sum(other.logstd - self.logstd +
+        return tf.reduce_sum(other.log_std - self.log_std +
                              (tf.square(self.std) +
                               tf.square(self.mean - other.mean)) /
                              (2.0 * tf.square(other.std)) - 0.5,
                              reduction_indices=[1])
 
     def entropy(self):
-        return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e),
+        return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
                              reduction_indices=[1])
 
     def sample(self):
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -71,7 +71,9 @@ class ModelCatalog(object):
         return FullyConnectedNetwork(inputs, num_outputs, options)
 
     @staticmethod
     def ConvolutionalNetwork(inputs, num_outputs, options=None):
+        if options is None:
+            options = {}
         return ConvolutionalNetwork(inputs, num_outputs, options)
 
     @staticmethod
diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py
index 3faac9d38..1990158cb 100644
--- a/python/ray/rllib/models/fcnet.py
+++ b/python/ray/rllib/models/fcnet.py
@@ -19,17 +19,7 @@ def normc_initializer(std=1.0):
 
 
 class FullyConnectedNetwork(Model):
-    """Generic fully connected network.
-
-    Options to construct the network are passed to the _init function.
-    If options["free_logstd"] is True, the last half of the
-    output layer will be free variables that are not dependent on
-    inputs. This is often used if the output of the network is used
-    to parametrize a probability distribution. In this case, the
-    first half of the parameters can be interpreted as a location
-    parameter (like a mean) and the second half can be interpreted as
-    a scale parameter (like a standard deviation).
-    """
+    """Generic fully connected network."""
 
     def _init(self, inputs, num_outputs, options):
         hiddens = options.get("fcnet_hiddens", [256, 256])
@@ -40,9 +30,6 @@ class FullyConnectedNetwork(Model):
             activation = tf.nn.relu
         print("Constructing fcnet {} {}".format(hiddens, activation))
 
-        if options.get("free_logstd", False):
-            num_outputs = num_outputs // 2
-
         with tf.name_scope("fc_net"):
             i = 1
             last_layer = inputs
@@ -57,8 +44,4 @@ class FullyConnectedNetwork(Model):
                 last_layer, num_outputs,
                 weights_initializer=normc_initializer(0.01),
                 activation_fn=None, scope="fc_out")
-            if options.get("free_logstd", False):
-                logstd = tf.get_variable(name="logstd", shape=[num_outputs],
-                                         initializer=tf.zeros_initializer)
-                output = tf.concat([output, 0.0 * output + logstd], 1)
             return output, last_layer
diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py
--- a/python/ray/rllib/models/model.py
+++ b/python/ray/rllib/models/model.py
@@ -2,6 +2,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import tensorflow as tf
+
 
 class Model(object):
     """Defines an abstract network model for use with RLlib.
@@ -13,6 +15,14 @@ class Model(object):
     The last layer of the network can also be retrieved if the algorithm
     needs to further post-processing (e.g. Actor and Critic networks in A3C).
 
+    If options["free_log_std"] is True, the last half of the
+    output layer will be free variables that are not dependent on
+    inputs. This is often used if the output of the network is used
+    to parametrize a probability distribution. In this case, the
+    first half of the parameters can be interpreted as a location
+    parameter (like a mean) and the second half can be interpreted as
+    a scale parameter (like a standard deviation).
+
     Attributes:
         inputs (Tensor): The input placeholder for this model.
         outputs (Tensor): The output vector of this model.
@@ -21,8 +31,23 @@ class Model(object):
 
     def __init__(self, inputs, num_outputs, options):
         self.inputs = inputs
+        free_log_std = options.get("free_log_std", False)
+        if free_log_std:
+            # Only half of the requested outputs come from the network; the
+            # other half is the free log_std vector, so the width must be even.
+            if num_outputs % 2 != 0:
+                raise ValueError(
+                    "free_log_std requires an even num_outputs, got {}".format(
+                        num_outputs))
+            num_outputs = num_outputs // 2
         self.outputs, self.last_layer = self._init(
             inputs, num_outputs, options)
+        if free_log_std:
+            # "0.0 * outputs + log_std" tiles log_std to the shape of outputs.
+            log_std = tf.get_variable(name="log_std", shape=[num_outputs],
+                                      initializer=tf.zeros_initializer)
+            self.outputs = tf.concat(
+                [self.outputs, 0.0 * self.outputs + log_std], 1)
 
     def _init(self):
         """Builds and returns the output and last layer of the network."""
diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/policy_gradient/policy_gradient.py
index dbee42167..46d721744 100644
--- a/python/ray/rllib/policy_gradient/policy_gradient.py
+++ b/python/ray/rllib/policy_gradient/policy_gradient.py
@@ -52,7 +52,7 @@ DEFAULT_CONFIG = {
     "clip_param": 0.3,
     # Target value for KL divergence
     "kl_target": 0.01,
-    "model": {"free_logstd": False},
+    "model": {"free_log_std": False},
     # Number of timesteps collected in each outer loop
     "timesteps_per_batch": 40000,
     # Each tasks performs rollouts until at least this
diff --git a/python/ray/rllib/test.sh b/python/ray/rllib/test.sh
index 162c53137..07c5dcbfc 100755
--- a/python/ray/rllib/test.sh
+++ b/python/ray/rllib/test.sh
@@ -6,9 +6,9 @@ python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20
 
 python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
 
-python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}, "use_gae": false}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PolicyGradient --upload-dir s3://bucketname/
 
-python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/
+python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/
 
 python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
 python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/