diff --git a/python/ray/rllib/a3c/LSTM.py b/python/ray/rllib/a3c/LSTM.py deleted file mode 100644 index 96a5398d8..000000000 --- a/python/ray/rllib/a3c/LSTM.py +++ /dev/null @@ -1,122 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf -import tensorflow.contrib.rnn as rnn -import distutils.version - -from ray.rllib.a3c.policy import ( - categorical_sample, conv2d, linear, flatten, - normalized_columns_initializer, Policy) - -use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >= - distutils.version.LooseVersion("1.0.0")) - - -class LSTMPolicy(Policy): - def setup_graph(self, ob_space, ac_space): - """Setup model used for Policy. - - In this A3C implementation, both the Critic and the Actor share the - model. - """ - num_actions = ac_space.n - self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space)) - - for i in range(4): - x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2])) - # Introduce a "fake" batch dimension of 1 after flatten so that we can - # do LSTM over the time dim. - x = tf.expand_dims(flatten(x), [0]) - - size = 256 - if use_tf100_api: - lstm = rnn.BasicLSTMCell(size, state_is_tuple=True) - else: - lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True) - self.state_size = lstm.state_size - step_size = tf.shape(self.x)[:1] - - c_init = np.zeros((1, lstm.state_size.c), np.float32) - h_init = np.zeros((1, lstm.state_size.h), np.float32) - self.state_init = [c_init, h_init] - c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c]) - h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h]) - self.state_in = [c_in, h_in] - - if use_tf100_api: - state_in = rnn.LSTMStateTuple(c_in, h_in) - else: - state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in) - lstm_outputs, lstm_state = tf.nn.dynamic_rnn( - lstm, x, initial_state=state_in, sequence_length=step_size, - time_major=False) - lstm_c, lstm_h = lstm_state - x = tf.reshape(lstm_outputs, [-1, size]) - self.logits = linear(x, num_actions, "action", - normalized_columns_initializer(0.01)) - self.vf = tf.reshape(linear(x, 1, "value", - normalized_columns_initializer(1.0)), [-1]) - self.state_out = [lstm_c[:1, :], lstm_h[:1, :]] - self.sample = categorical_sample(self.logits, num_actions)[0, :] - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - self.global_step = tf.get_variable( - "global_step", [], tf.int32, - initializer=tf.constant_initializer(0, dtype=tf.int32), - trainable=False) - - def get_gradients(self, batch): - """Computing the gradient is actually model-dependent. - - The LSTM needs its hidden states in order to compute the gradient - accurately. - """ - feed_dict = { - self.x: batch.si, - self.ac: batch.a, - self.adv: batch.adv, - self.r: batch.r, - self.state_in[0]: batch.features[0], - self.state_in[1]: batch.features[1] - } - info = {} - self.local_steps += 1 - if self.summarize: - grad, summ = self.sess.run([self.grads, self.summary_op], - feed_dict=feed_dict) - info['summary'] = summ - else: - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info - - def compute_actions(self, ob, c, h): - return self.sess.run([self.sample, self.vf] + self.state_out, - {self.x: [ob], - self.state_in[0]: c, - self.state_in[1]: h}) - - def value(self, ob, c, h): - # process_rollout is very non-intuitive due to value being a float - return self.sess.run(self.vf, {self.x: [ob], - self.state_in[0]: c, - self.state_in[1]: h})[0] - - def get_initial_features(self): - return self.state_init - - -class RawLSTMPolicy(LSTMPolicy): - def get_weights(self): - if not hasattr(self, "_weights"): - self._weights = self.variables.get_weights() - return self._weights - - def set_weights(self, weights): - self._weights = weights - - def model_update(self, grads): - for var, grad in zip(self.var_list, grads): - self._weights[var.name[:-2]] -= 1e-4 * grad diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index c83245ca2..ed791ba86 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -12,7 +12,7 @@ import ray from ray.rllib.a3c.runner import RunnerThread, process_rollout from ray.rllib.a3c.envs import create_env from ray.rllib.common import Algorithm, TrainingResult -from ray.rllib.a3c.shared_model import SharedModel +from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM DEFAULT_CONFIG = { @@ -89,7 +89,7 @@ class Runner(object): class A3C(Algorithm): def __init__(self, env_name, config, - policy_cls=SharedModel, upload_dir=None): + policy_cls=SharedModelLSTM, upload_dir=None): config.update({"alg": "A3C"}) Algorithm.__init__(self, env_name, config, upload_dir=upload_dir) self.env = create_env(env_name) diff --git a/python/ray/rllib/a3c/envs.py b/python/ray/rllib/a3c/envs.py index cd01901f4..d8f82d8ef 100644 --- a/python/ray/rllib/a3c/envs.py +++ b/python/ray/rllib/a3c/envs.py @@ -17,7 +17,7 @@ def create_env(env_id): env = gym.make(env_id) if hasattr(env.env, "ale"): env = AtariProcessing(env) - env = Diagnostic(env) + env = Diagnostic(env) return env diff --git a/python/ray/rllib/a3c/policy.py b/python/ray/rllib/a3c/policy.py index 055948973..fc9ae3392 100644 --- a/python/ray/rllib/a3c/policy.py +++ b/python/ray/rllib/a3c/policy.py @@ -2,69 +2,74 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np import tensorflow as tf import ray +import gym class Policy(object): """The policy base class.""" - def __init__(self, ob_space, ac_space, name="local", summarize=True): + def __init__(self, ob_space, action_space, name="local", summarize=True): self.local_steps = 0 self.summarize = summarize worker_device = "/job:localhost/replica:0/task:0/cpu:0" self.g = tf.Graph() with self.g.as_default(), tf.device(worker_device): with tf.variable_scope(name): - self.setup_graph(ob_space, ac_space) + self.setup_graph(ob_space, action_space) assert all([hasattr(self, attr) for attr in ["vf", "logits", "x", "var_list"]]) print("Setting up loss") - self.setup_loss(ac_space) + self.setup_loss(action_space) + self.setup_gradients() self.initialize() def setup_graph(self): raise NotImplementedError - def setup_loss(self, ac_space): - num_actions = ac_space.n - self.ac = tf.placeholder(tf.float32, [None, num_actions], name="ac") + def setup_loss(self, action_space): + if isinstance(action_space, gym.spaces.Box): + ac_size = action_space.shape[0] + self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac") + elif isinstance(action_space, gym.spaces.Discrete): + self.ac = tf.placeholder(tf.int64, [None], name="ac") + else: + raise NotImplemented( + "action space" + str(type(action_space)) + + "currently not supported") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") - log_prob_tf = tf.nn.log_softmax(self.logits) - prob_tf = tf.nn.softmax(self.logits) + log_prob = self.curr_dist.logp(self.ac) # The "policy gradients" loss: its derivative is precisely the policy # gradient. Notice that self.ac is a placeholder that is provided # externally. adv will contain the advantages, as calculated in # process_rollout. - pi_loss = - tf.reduce_sum(tf.reduce_sum(log_prob_tf * self.ac, - [1]) * self.adv) + self.pi_loss = - tf.reduce_sum(log_prob * self.adv) - # loss of value function - vf_loss = 0.5 * tf.reduce_sum(tf.square(self.vf - self.r)) - entropy = - tf.reduce_sum(prob_tf * log_prob_tf) - - bs = tf.to_float(tf.shape(self.x)[0]) - self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 + delta = self.vf - self.r + self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) + self.entropy = tf.reduce_sum(self.curr_dist.entropy()) + self.loss = self.pi_loss + 0.5 * self.vf_loss - self.entropy * 0.01 + def setup_gradients(self): grads = tf.gradients(self.loss, self.var_list) self.grads, _ = tf.clip_by_global_norm(grads, 40.0) - grads_and_vars = list(zip(self.grads, self.var_list)) opt = tf.train.AdamOptimizer(1e-4) self._apply_gradients = opt.apply_gradients(grads_and_vars) + def initialize(self): if self.summarize: - tf.summary.scalar("model/policy_loss", pi_loss / bs) - tf.summary.scalar("model/value_loss", vf_loss / bs) - tf.summary.scalar("model/entropy", entropy / bs) + bs = tf.to_float(tf.shape(self.x)[0]) + tf.summary.scalar("model/policy_loss", self.pi_loss / bs) + tf.summary.scalar("model/value_loss", self.vf_loss / bs) + tf.summary.scalar("model/entropy", self.entropy / bs) tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads)) tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list)) self.summary_op = tf.summary.merge_all() - def initialize(self): self.sess = tf.Session(graph=self.g, config=tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)) self.variables = ray.experimental.TensorFlowVariables(self.loss, @@ -94,55 +99,3 @@ class Policy(object): def value(self, ob): raise NotImplementedError - - -def normalized_columns_initializer(std=1.0): - def _initializer(shape, dtype=None, partition_info=None): - out = np.random.randn(*shape).astype(np.float32) - out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) - return tf.constant(out) - return _initializer - - -def flatten(x): - return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])]) - - -def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", - dtype=tf.float32, collections=None): - with tf.variable_scope(name): - stride_shape = [1, stride[0], stride[1], 1] - filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), - num_filters] - - # There are "num input feature maps * filter height * filter width" - # inputs to each hidden unit. - fan_in = np.prod(filter_shape[:3]) - # Each unit in the lower layer receives a gradient from: "num output - # feature maps * filter height * filter width" / pooling size. - fan_out = np.prod(filter_shape[:2]) * num_filters - # Initialize weights with random weights. - w_bound = np.sqrt(6 / (fan_in + fan_out)) - - w = tf.get_variable("W", filter_shape, dtype, - tf.random_uniform_initializer(-w_bound, w_bound), - collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], - initializer=tf.constant_initializer(0.0), - collections=collections) - return tf.nn.conv2d(x, w, stride_shape, pad) + b - - -def linear(x, size, name, initializer=None, bias_init=0): - w = tf.get_variable(name + "/w", [x.get_shape()[1], size], - initializer=initializer) - b = tf.get_variable(name + "/b", [size], - initializer=tf.constant_initializer(bias_init)) - return tf.matmul(x, w) + b - - -def categorical_sample(logits, d): - value = tf.squeeze(tf.multinomial(logits - tf.reduce_max(logits, [1], - keep_dims=True), - 1), [1]) - return tf.one_hot(value, d) diff --git a/python/ray/rllib/a3c/runner.py b/python/ray/rllib/a3c/runner.py index 7a9338f7d..f6a2834fa 100644 --- a/python/ray/rllib/a3c/runner.py +++ b/python/ray/rllib/a3c/runner.py @@ -139,7 +139,7 @@ def env_runner(env, policy, num_local_steps, summary_writer, render): fetched = policy.compute_actions(last_state, *last_features) action, value_, features = fetched[0], fetched[1], fetched[2:] # Argmax to convert from one-hot. - state, reward, terminal, info = env.step(action.argmax()) + state, reward, terminal, info = env.step(action) if render: env.render() diff --git a/python/ray/rllib/a3c/shared_model.py b/python/ray/rllib/a3c/shared_model.py index cd0793d52..2c0e934fc 100644 --- a/python/ray/rllib/a3c/shared_model.py +++ b/python/ray/rllib/a3c/shared_model.py @@ -3,11 +3,10 @@ from __future__ import division from __future__ import print_function import tensorflow as tf -from ray.rllib.a3c.policy import ( - categorical_sample, linear, - normalized_columns_initializer, Policy) - +from ray.rllib.models.misc import linear, normc_initializer +from ray.rllib.a3c.policy import Policy from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.convnet import ConvolutionalNetwork class SharedModel(Policy): @@ -15,15 +14,17 @@ class SharedModel(Policy): super(SharedModel, self).__init__(ob_space, ac_space, **kwargs) def setup_graph(self, ob_space, ac_space): - num_actions = ac_space.n self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) - dist_class, dist_dim = ModelCatalog.get_action_dist(ac_space) - self._model = ModelCatalog.ConvolutionalNetwork(self.x, dist_dim) + dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) + self._model = ConvolutionalNetwork(self.x, self.logit_dim, {}) self.logits = self._model.outputs + self.curr_dist = dist_class(self.logits) + # with tf.variable_scope("vf"): + # vf_model = ModelCatalog.get_model(self.x, 1) self.vf = tf.reshape(linear(self._model.last_layer, 1, "value", - normalized_columns_initializer(1.0)), [-1]) + normc_initializer(1.0)), [-1]) - self.sample = categorical_sample(self.logits, num_actions)[0, :] + self.sample = self.curr_dist.sample() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) self.global_step = tf.get_variable( @@ -39,7 +40,7 @@ class SharedModel(Policy): self.adv: batch.adv, self.r: batch.r, } - + self.grads = [g for g in self.grads if g is not None] self.local_steps += 1 if self.summarize: grad, summ = self.sess.run([self.grads, self.summary_op], @@ -50,8 +51,9 @@ class SharedModel(Policy): return grad, info def compute_actions(self, ob, *args): - return self.sess.run([self.sample, self.vf], - {self.x: [ob]}) + action, vf = self.sess.run([self.sample, self.vf], + {self.x: [ob]}) + return action[0], vf def value(self, ob, *args): return self.sess.run(self.vf, {self.x: [ob]})[0] diff --git a/python/ray/rllib/a3c/shared_model_lstm.py b/python/ray/rllib/a3c/shared_model_lstm.py new file mode 100644 index 000000000..32369ba2f --- /dev/null +++ b/python/ray/rllib/a3c/shared_model_lstm.py @@ -0,0 +1,81 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from ray.rllib.models.misc import linear, normc_initializer +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.a3c.policy import Policy +from ray.rllib.models.lstm import LSTM + + +class SharedModelLSTM(Policy): + + def __init__(self, ob_space, ac_space, **kwargs): + super(SharedModelLSTM, self).__init__(ob_space, ac_space, **kwargs) + + def setup_graph(self, ob_space, ac_space): + self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) + self._model = LSTM(self.x, self.logit_dim, {}) + + self.state_init = self._model.state_init + self.state_in = self._model.state_in + self.state_out = self._model.state_out + + self.logits = self._model.outputs + self.curr_dist = dist_class(self.logits) + # with tf.variable_scope("vf"): + # vf_model = ModelCatalog.get_model(self.x, 1) + self.vf = tf.reshape(linear(self._model.last_layer, 1, "value", + normc_initializer(1.0)), [-1]) + + self.sample = self.curr_dist.sample() + self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + tf.get_variable_scope().name) + self.global_step = tf.get_variable( + "global_step", [], tf.int32, + initializer=tf.constant_initializer(0, dtype=tf.int32), + trainable=False) + + def get_gradients(self, batch): + """Computing the gradient is actually model-dependent. + + The LSTM needs its hidden states in order to compute the gradient + accurately. + """ + feed_dict = { + self.x: batch.si, + self.ac: batch.a, + self.adv: batch.adv, + self.r: batch.r, + self.state_in[0]: batch.features[0], + self.state_in[1]: batch.features[1] + } + info = {} + self.local_steps += 1 + if self.summarize and self.local_steps % 10 == 0: + grad, summ = self.sess.run([self.grads, self.summary_op], + feed_dict=feed_dict) + info['summary'] = summ + else: + grad = self.sess.run(self.grads, feed_dict=feed_dict) + return grad, info + + def compute_actions(self, ob, c, h): + output = self.sess.run([self.sample, self.vf] + self.state_out, + {self.x: [ob], + self.state_in[0]: c, + self.state_in[1]: h}) + output = list(output) + output[0] = output[0][0] + return output + + def value(self, ob, c, h): + # process_rollout is very non-intuitive due to value being a float + return self.sess.run(self.vf, {self.x: [ob], + self.state_in[0]: c, + self.state_in[1]: h})[0] + + def get_initial_features(self): + return self.state_init diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index 5b942a55f..8a3987818 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -10,7 +10,6 @@ from ray.rllib.models.preprocessors import ( NoPreprocessor, AtariRamPreprocessor, AtariPixelPreprocessor) from ray.rllib.models.fcnet import FullyConnectedNetwork from ray.rllib.models.visionnet import VisionNetwork -from ray.rllib.models.convnet import ConvolutionalNetwork class ModelCatalog(object): @@ -67,10 +66,6 @@ class ModelCatalog(object): return FullyConnectedNetwork(inputs, num_outputs, options) - @staticmethod - def ConvolutionalNetwork(inputs, num_outputs, options=dict()): - return ConvolutionalNetwork(inputs, num_outputs, options) - @staticmethod def get_preprocessor(env_name, obs_shape): """Returns a suitable processor for the given environment. diff --git a/python/ray/rllib/models/convnet.py b/python/ray/rllib/models/convnet.py index ece6f17b3..4074e0ad3 100644 --- a/python/ray/rllib/models/convnet.py +++ b/python/ray/rllib/models/convnet.py @@ -3,48 +3,14 @@ from __future__ import division from __future__ import print_function import tensorflow as tf -import numpy as np from ray.rllib.models.model import Model -from ray.rllib.models.misc import normc_initializer - - -def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", - dtype=tf.float32, collections=None): - with tf.variable_scope(name): - stride_shape = [1, stride[0], stride[1], 1] - filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), - num_filters] - - # There are "num input feature maps * filter height * filter width" - # inputs to each hidden unit. - fan_in = np.prod(filter_shape[:3]) - # Each unit in the lower layer receives a gradient from: "num output - # feature maps * filter height * filter width" / pooling size. - fan_out = np.prod(filter_shape[:2]) * num_filters - # Initialize weights with random weights. - w_bound = np.sqrt(6 / (fan_in + fan_out)) - - w = tf.get_variable("W", filter_shape, dtype, - tf.random_uniform_initializer(-w_bound, w_bound), - collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], - initializer=tf.constant_initializer(0.0), - collections=collections) - return tf.nn.conv2d(x, w, stride_shape, pad) + b - - -def linear(x, size, name, initializer=None, bias_init=0): - w = tf.get_variable(name + "/w", [x.get_shape()[1], size], - initializer=initializer) - b = tf.get_variable(name + "/b", [size], - initializer=tf.constant_initializer(bias_init)) - return tf.matmul(x, w) + b +from ray.rllib.models.misc import normc_initializer, conv2d, linear class ConvolutionalNetwork(Model): """Generic convolutional network.""" - + # TODO(rliaw): converge on one generic ConvNet model def _init(self, inputs, num_outputs, options): x = inputs with tf.name_scope("convnet"): diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py index 1990158cb..43a1ab031 100644 --- a/python/ray/rllib/models/fcnet.py +++ b/python/ray/rllib/models/fcnet.py @@ -5,17 +5,8 @@ from __future__ import print_function import tensorflow as tf import tensorflow.contrib.slim as slim -import numpy as np - from ray.rllib.models.model import Model - - -def normc_initializer(std=1.0): - def _initializer(shape, dtype=None, partition_info=None): - out = np.random.randn(*shape).astype(np.float32) - out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) - return tf.constant(out) - return _initializer +from ray.rllib.models.misc import normc_initializer class FullyConnectedNetwork(Model): diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py new file mode 100644 index 000000000..75a5a9045 --- /dev/null +++ b/python/ray/rllib/models/lstm.py @@ -0,0 +1,54 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +import tensorflow.contrib.rnn as rnn +import distutils.version + +from ray.rllib.models.misc import (conv2d, linear, flatten, + normc_initializer) +from ray.rllib.models.model import Model + +use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >= + distutils.version.LooseVersion("1.0.0")) + + +class LSTM(Model): + # TODO(rliaw): Add LSTM code for other algorithms + def _init(self, inputs, num_outputs, options): + self.x = x = inputs + for i in range(4): + x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2])) + # Introduce a "fake" batch dimension of 1 after flatten so that we can + # do LSTM over the time dim. + x = tf.expand_dims(flatten(x), [0]) + + size = 256 + if use_tf100_api: + lstm = rnn.BasicLSTMCell(size, state_is_tuple=True) + else: + lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True) + step_size = tf.shape(self.x)[:1] + + c_init = np.zeros((1, lstm.state_size.c), np.float32) + h_init = np.zeros((1, lstm.state_size.h), np.float32) + self.state_init = [c_init, h_init] + c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c]) + h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h]) + self.state_in = [c_in, h_in] + + if use_tf100_api: + state_in = rnn.LSTMStateTuple(c_in, h_in) + else: + state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in) + lstm_out, lstm_state = tf.nn.dynamic_rnn(lstm, x, + initial_state=state_in, + sequence_length=step_size, + time_major=False) + lstm_c, lstm_h = lstm_state + x = tf.reshape(lstm_out, [-1, size]) + logits = linear(x, num_outputs, "action", normc_initializer(0.01)) + self.state_out = [lstm_c[:1, :], lstm_h[:1, :]] + return logits, x diff --git a/python/ray/rllib/models/misc.py b/python/ray/rllib/models/misc.py index 0044a021e..a531bc07b 100644 --- a/python/ray/rllib/models/misc.py +++ b/python/ray/rllib/models/misc.py @@ -3,7 +3,6 @@ from __future__ import division from __future__ import print_function import tensorflow as tf - import numpy as np @@ -13,3 +12,40 @@ def normc_initializer(std=1.0): out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) return tf.constant(out) return _initializer + + +def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", + dtype=tf.float32, collections=None): + with tf.variable_scope(name): + stride_shape = [1, stride[0], stride[1], 1] + filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), + num_filters] + + # There are "num input feature maps * filter height * filter width" + # inputs to each hidden unit. + fan_in = np.prod(filter_shape[:3]) + # Each unit in the lower layer receives a gradient from: "num output + # feature maps * filter height * filter width" / pooling size. + fan_out = np.prod(filter_shape[:2]) * num_filters + # Initialize weights with random weights. + w_bound = np.sqrt(6 / (fan_in + fan_out)) + + w = tf.get_variable("W", filter_shape, dtype, + tf.random_uniform_initializer(-w_bound, w_bound), + collections=collections) + b = tf.get_variable("b", [1, 1, 1, num_filters], + initializer=tf.constant_initializer(0.0), + collections=collections) + return tf.nn.conv2d(x, w, stride_shape, pad) + b + + +def linear(x, size, name, initializer=None, bias_init=0): + w = tf.get_variable(name + "/w", [x.get_shape()[1], size], + initializer=initializer) + b = tf.get_variable(name + "/b", [size], + initializer=tf.constant_initializer(bias_init)) + return tf.matmul(x, w) + b + + +def flatten(x): + return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])]) diff --git a/python/ray/rllib/test/test.sh b/python/ray/rllib/test/test.sh index 07c5dcbfc..7d677c8ce 100755 --- a/python/ray/rllib/test/test.sh +++ b/python/ray/rllib/test/test.sh @@ -11,5 +11,7 @@ python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20 python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/ python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/ -python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/ + +python train.py --env PongDeterministic-v4 --alg A3C --config '{"num_workers": 16, "num_batches_per_iteration": 1000, "batch_size": 20}' --upload-dir s3://bucketname/ + python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/