mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 04:23:03 +08:00
[rllib] Additional support for Shared Models in A3C (#866)
* Code for Supporting Shared Models Running (with vnet modification) - needs to be tested for performance Summaries Small refactoring + generalized to more domains Small fix for jenkins Linting linting Addressing changes Addressing changes Update envs.py Addressing changes convnet Merge - new model final touches final linting Changing iterations back removed extra change changes for fast experimentation changes to enable a2c TEMP FOR DEBUGGING ContinuousActions - Still doesn't work InvertedPendulum trains with 8 workers - k=200 huber loss Maxes for InvertedPendulum-v1 - 16w,200steps temp: working with a2c Back to shared model more fixes small nit LSTM to shared models need to fix last_features tuning pong Best record for hitting 0 - with k=16,n=20 nit a2cremoval remove A2c reference and nits nit removed a2c vestiges removing a2c removing example.py Linting nit * Linting + Removing vestigal code * Final Touches * nits * rerun travis
This commit is contained in:
committed by
Philipp Moritz
parent
b251f0b6b9
commit
bc082e9a9e
@@ -1,122 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.rnn as rnn
|
||||
import distutils.version
|
||||
|
||||
from ray.rllib.a3c.policy import (
|
||||
categorical_sample, conv2d, linear, flatten,
|
||||
normalized_columns_initializer, Policy)
|
||||
|
||||
use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >=
|
||||
distutils.version.LooseVersion("1.0.0"))
|
||||
|
||||
|
||||
class LSTMPolicy(Policy):
|
||||
def setup_graph(self, ob_space, ac_space):
|
||||
"""Setup model used for Policy.
|
||||
|
||||
In this A3C implementation, both the Critic and the Actor share the
|
||||
model.
|
||||
"""
|
||||
num_actions = ac_space.n
|
||||
self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
|
||||
|
||||
for i in range(4):
|
||||
x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))
|
||||
# Introduce a "fake" batch dimension of 1 after flatten so that we can
|
||||
# do LSTM over the time dim.
|
||||
x = tf.expand_dims(flatten(x), [0])
|
||||
|
||||
size = 256
|
||||
if use_tf100_api:
|
||||
lstm = rnn.BasicLSTMCell(size, state_is_tuple=True)
|
||||
else:
|
||||
lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
|
||||
self.state_size = lstm.state_size
|
||||
step_size = tf.shape(self.x)[:1]
|
||||
|
||||
c_init = np.zeros((1, lstm.state_size.c), np.float32)
|
||||
h_init = np.zeros((1, lstm.state_size.h), np.float32)
|
||||
self.state_init = [c_init, h_init]
|
||||
c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
|
||||
h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
|
||||
self.state_in = [c_in, h_in]
|
||||
|
||||
if use_tf100_api:
|
||||
state_in = rnn.LSTMStateTuple(c_in, h_in)
|
||||
else:
|
||||
state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in)
|
||||
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
|
||||
lstm, x, initial_state=state_in, sequence_length=step_size,
|
||||
time_major=False)
|
||||
lstm_c, lstm_h = lstm_state
|
||||
x = tf.reshape(lstm_outputs, [-1, size])
|
||||
self.logits = linear(x, num_actions, "action",
|
||||
normalized_columns_initializer(0.01))
|
||||
self.vf = tf.reshape(linear(x, 1, "value",
|
||||
normalized_columns_initializer(1.0)), [-1])
|
||||
self.state_out = [lstm_c[:1, :], lstm_h[:1, :]]
|
||||
self.sample = categorical_sample(self.logits, num_actions)[0, :]
|
||||
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf.get_variable_scope().name)
|
||||
self.global_step = tf.get_variable(
|
||||
"global_step", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
|
||||
def get_gradients(self, batch):
|
||||
"""Computing the gradient is actually model-dependent.
|
||||
|
||||
The LSTM needs its hidden states in order to compute the gradient
|
||||
accurately.
|
||||
"""
|
||||
feed_dict = {
|
||||
self.x: batch.si,
|
||||
self.ac: batch.a,
|
||||
self.adv: batch.adv,
|
||||
self.r: batch.r,
|
||||
self.state_in[0]: batch.features[0],
|
||||
self.state_in[1]: batch.features[1]
|
||||
}
|
||||
info = {}
|
||||
self.local_steps += 1
|
||||
if self.summarize:
|
||||
grad, summ = self.sess.run([self.grads, self.summary_op],
|
||||
feed_dict=feed_dict)
|
||||
info['summary'] = summ
|
||||
else:
|
||||
grad = self.sess.run(self.grads, feed_dict=feed_dict)
|
||||
return grad, info
|
||||
|
||||
def compute_actions(self, ob, c, h):
|
||||
return self.sess.run([self.sample, self.vf] + self.state_out,
|
||||
{self.x: [ob],
|
||||
self.state_in[0]: c,
|
||||
self.state_in[1]: h})
|
||||
|
||||
def value(self, ob, c, h):
|
||||
# process_rollout is very non-intuitive due to value being a float
|
||||
return self.sess.run(self.vf, {self.x: [ob],
|
||||
self.state_in[0]: c,
|
||||
self.state_in[1]: h})[0]
|
||||
|
||||
def get_initial_features(self):
|
||||
return self.state_init
|
||||
|
||||
|
||||
class RawLSTMPolicy(LSTMPolicy):
|
||||
def get_weights(self):
|
||||
if not hasattr(self, "_weights"):
|
||||
self._weights = self.variables.get_weights()
|
||||
return self._weights
|
||||
|
||||
def set_weights(self, weights):
|
||||
self._weights = weights
|
||||
|
||||
def model_update(self, grads):
|
||||
for var, grad in zip(self.var_list, grads):
|
||||
self._weights[var.name[:-2]] -= 1e-4 * grad
|
||||
@@ -12,7 +12,7 @@ import ray
|
||||
from ray.rllib.a3c.runner import RunnerThread, process_rollout
|
||||
from ray.rllib.a3c.envs import create_env
|
||||
from ray.rllib.common import Algorithm, TrainingResult
|
||||
from ray.rllib.a3c.shared_model import SharedModel
|
||||
from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
|
||||
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
@@ -89,7 +89,7 @@ class Runner(object):
|
||||
|
||||
class A3C(Algorithm):
|
||||
def __init__(self, env_name, config,
|
||||
policy_cls=SharedModel, upload_dir=None):
|
||||
policy_cls=SharedModelLSTM, upload_dir=None):
|
||||
config.update({"alg": "A3C"})
|
||||
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
|
||||
self.env = create_env(env_name)
|
||||
|
||||
@@ -17,7 +17,7 @@ def create_env(env_id):
|
||||
env = gym.make(env_id)
|
||||
if hasattr(env.env, "ale"):
|
||||
env = AtariProcessing(env)
|
||||
env = Diagnostic(env)
|
||||
env = Diagnostic(env)
|
||||
return env
|
||||
|
||||
|
||||
|
||||
@@ -2,69 +2,74 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import ray
|
||||
import gym
|
||||
|
||||
|
||||
class Policy(object):
|
||||
"""The policy base class."""
|
||||
def __init__(self, ob_space, ac_space, name="local", summarize=True):
|
||||
def __init__(self, ob_space, action_space, name="local", summarize=True):
|
||||
self.local_steps = 0
|
||||
self.summarize = summarize
|
||||
worker_device = "/job:localhost/replica:0/task:0/cpu:0"
|
||||
self.g = tf.Graph()
|
||||
with self.g.as_default(), tf.device(worker_device):
|
||||
with tf.variable_scope(name):
|
||||
self.setup_graph(ob_space, ac_space)
|
||||
self.setup_graph(ob_space, action_space)
|
||||
assert all([hasattr(self, attr)
|
||||
for attr in ["vf", "logits", "x", "var_list"]])
|
||||
print("Setting up loss")
|
||||
self.setup_loss(ac_space)
|
||||
self.setup_loss(action_space)
|
||||
self.setup_gradients()
|
||||
self.initialize()
|
||||
|
||||
def setup_graph(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def setup_loss(self, ac_space):
|
||||
num_actions = ac_space.n
|
||||
self.ac = tf.placeholder(tf.float32, [None, num_actions], name="ac")
|
||||
def setup_loss(self, action_space):
    """Build the loss tensors for the given action space.

    Creates the `ac`, `adv` and `r` placeholders and stores `pi_loss`,
    `vf_loss`, `entropy` and the combined `loss` on `self`. Reads
    `self.curr_dist` and `self.vf`, which must already be set.

    Args:
        action_space: a `gym.spaces.Box` or `gym.spaces.Discrete` instance.

    Raises:
        NotImplementedError: if `action_space` is any other kind of space.
    """
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        self.ac = tf.placeholder(tf.int64, [None], name="ac")
    else:
        # `NotImplemented` is the binary-operator sentinel, not an exception
        # class; calling/raising it fails with a TypeError instead of
        # reporting the unsupported space.
        raise NotImplementedError(
            "action space " + str(type(action_space)) +
            " currently not supported")
    self.adv = tf.placeholder(tf.float32, [None], name="adv")
    self.r = tf.placeholder(tf.float32, [None], name="r")

    log_prob = self.curr_dist.logp(self.ac)

    # The "policy gradients" loss: its derivative is precisely the policy
    # gradient. Notice that self.ac is a placeholder that is provided
    # externally. adv will contain the advantages, as calculated in
    # process_rollout.
    self.pi_loss = - tf.reduce_sum(log_prob * self.adv)

    # loss of value function
    delta = self.vf - self.r
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
    self.entropy = tf.reduce_sum(self.curr_dist.entropy())
    # Value loss is weighted by 0.5 and the entropy term by 0.01.
    self.loss = self.pi_loss + 0.5 * self.vf_loss - self.entropy * 0.01
|
||||
|
||||
def setup_gradients(self):
|
||||
grads = tf.gradients(self.loss, self.var_list)
|
||||
self.grads, _ = tf.clip_by_global_norm(grads, 40.0)
|
||||
|
||||
grads_and_vars = list(zip(self.grads, self.var_list))
|
||||
opt = tf.train.AdamOptimizer(1e-4)
|
||||
self._apply_gradients = opt.apply_gradients(grads_and_vars)
|
||||
|
||||
def initialize(self):
|
||||
if self.summarize:
|
||||
tf.summary.scalar("model/policy_loss", pi_loss / bs)
|
||||
tf.summary.scalar("model/value_loss", vf_loss / bs)
|
||||
tf.summary.scalar("model/entropy", entropy / bs)
|
||||
bs = tf.to_float(tf.shape(self.x)[0])
|
||||
tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
|
||||
tf.summary.scalar("model/value_loss", self.vf_loss / bs)
|
||||
tf.summary.scalar("model/entropy", self.entropy / bs)
|
||||
tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
|
||||
tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
|
||||
self.summary_op = tf.summary.merge_all()
|
||||
|
||||
def initialize(self):
|
||||
self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.loss,
|
||||
@@ -94,55 +99,3 @@ class Policy(object):
|
||||
|
||||
def value(self, ob):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def normalized_columns_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
|
||||
|
||||
|
||||
def flatten(x):
|
||||
return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])])
|
||||
|
||||
|
||||
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
|
||||
dtype=tf.float32, collections=None):
|
||||
with tf.variable_scope(name):
|
||||
stride_shape = [1, stride[0], stride[1], 1]
|
||||
filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]),
|
||||
num_filters]
|
||||
|
||||
# There are "num input feature maps * filter height * filter width"
|
||||
# inputs to each hidden unit.
|
||||
fan_in = np.prod(filter_shape[:3])
|
||||
# Each unit in the lower layer receives a gradient from: "num output
|
||||
# feature maps * filter height * filter width" / pooling size.
|
||||
fan_out = np.prod(filter_shape[:2]) * num_filters
|
||||
# Initialize weights with random weights.
|
||||
w_bound = np.sqrt(6 / (fan_in + fan_out))
|
||||
|
||||
w = tf.get_variable("W", filter_shape, dtype,
|
||||
tf.random_uniform_initializer(-w_bound, w_bound),
|
||||
collections=collections)
|
||||
b = tf.get_variable("b", [1, 1, 1, num_filters],
|
||||
initializer=tf.constant_initializer(0.0),
|
||||
collections=collections)
|
||||
return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
|
||||
|
||||
def linear(x, size, name, initializer=None, bias_init=0):
|
||||
w = tf.get_variable(name + "/w", [x.get_shape()[1], size],
|
||||
initializer=initializer)
|
||||
b = tf.get_variable(name + "/b", [size],
|
||||
initializer=tf.constant_initializer(bias_init))
|
||||
return tf.matmul(x, w) + b
|
||||
|
||||
|
||||
def categorical_sample(logits, d):
|
||||
value = tf.squeeze(tf.multinomial(logits - tf.reduce_max(logits, [1],
|
||||
keep_dims=True),
|
||||
1), [1])
|
||||
return tf.one_hot(value, d)
|
||||
|
||||
@@ -139,7 +139,7 @@ def env_runner(env, policy, num_local_steps, summary_writer, render):
|
||||
fetched = policy.compute_actions(last_state, *last_features)
|
||||
action, value_, features = fetched[0], fetched[1], fetched[2:]
|
||||
# Argmax to convert from one-hot.
|
||||
state, reward, terminal, info = env.step(action.argmax())
|
||||
state, reward, terminal, info = env.step(action)
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
|
||||
@@ -3,11 +3,10 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
from ray.rllib.a3c.policy import (
|
||||
categorical_sample, linear,
|
||||
normalized_columns_initializer, Policy)
|
||||
|
||||
from ray.rllib.models.misc import linear, normc_initializer
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.models.convnet import ConvolutionalNetwork
|
||||
|
||||
|
||||
class SharedModel(Policy):
|
||||
@@ -15,15 +14,17 @@ class SharedModel(Policy):
|
||||
super(SharedModel, self).__init__(ob_space, ac_space, **kwargs)
|
||||
|
||||
def setup_graph(self, ob_space, ac_space):
|
||||
num_actions = ac_space.n
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(ac_space)
|
||||
self._model = ModelCatalog.ConvolutionalNetwork(self.x, dist_dim)
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
|
||||
self._model = ConvolutionalNetwork(self.x, self.logit_dim, {})
|
||||
self.logits = self._model.outputs
|
||||
self.curr_dist = dist_class(self.logits)
|
||||
# with tf.variable_scope("vf"):
|
||||
# vf_model = ModelCatalog.get_model(self.x, 1)
|
||||
self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
|
||||
normalized_columns_initializer(1.0)), [-1])
|
||||
normc_initializer(1.0)), [-1])
|
||||
|
||||
self.sample = categorical_sample(self.logits, num_actions)[0, :]
|
||||
self.sample = self.curr_dist.sample()
|
||||
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf.get_variable_scope().name)
|
||||
self.global_step = tf.get_variable(
|
||||
@@ -39,7 +40,7 @@ class SharedModel(Policy):
|
||||
self.adv: batch.adv,
|
||||
self.r: batch.r,
|
||||
}
|
||||
|
||||
self.grads = [g for g in self.grads if g is not None]
|
||||
self.local_steps += 1
|
||||
if self.summarize:
|
||||
grad, summ = self.sess.run([self.grads, self.summary_op],
|
||||
@@ -50,8 +51,9 @@ class SharedModel(Policy):
|
||||
return grad, info
|
||||
|
||||
def compute_actions(self, ob, *args):
|
||||
return self.sess.run([self.sample, self.vf],
|
||||
{self.x: [ob]})
|
||||
action, vf = self.sess.run([self.sample, self.vf],
|
||||
{self.x: [ob]})
|
||||
return action[0], vf
|
||||
|
||||
def value(self, ob, *args):
|
||||
return self.sess.run(self.vf, {self.x: [ob]})[0]
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
from ray.rllib.models.misc import linear, normc_initializer
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
from ray.rllib.models.lstm import LSTM
|
||||
|
||||
|
||||
class SharedModelLSTM(Policy):
    """Policy whose action logits and value estimate share one LSTM model."""

    def __init__(self, ob_space, ac_space, **kwargs):
        super(SharedModelLSTM, self).__init__(ob_space, ac_space, **kwargs)

    def setup_graph(self, ob_space, ac_space):
        """Create the observation placeholder, the LSTM model and its heads.

        Sets `x`, `logits`, `curr_dist`, `vf`, `sample`, `var_list`,
        `global_step` and the recurrent `state_init` / `state_in` /
        `state_out` attributes on `self`.
        """
        obs_shape = [None] + list(ob_space)
        self.x = tf.placeholder(tf.float32, obs_shape)
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = LSTM(self.x, self.logit_dim, {})

        # Re-export the model's recurrent-state handles on the policy.
        model = self._model
        self.state_init = model.state_init
        self.state_in = model.state_in
        self.state_out = model.state_out

        self.logits = model.outputs
        self.curr_dist = dist_class(self.logits)

        # The value head is a single linear unit on the model's last layer.
        value_out = linear(model.last_layer, 1, "value",
                           normc_initializer(1.0))
        self.vf = tf.reshape(value_out, [-1])

        self.sample = self.curr_dist.sample()
        scope_name = tf.get_variable_scope().name
        self.var_list = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
        self.global_step = tf.get_variable(
            "global_step", [], tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

    def get_gradients(self, batch):
        """Run the gradient op on a batch, feeding the LSTM input state.

        Returns a `(grad, info)` pair. `info` carries a 'summary' entry only
        when summaries are enabled and the local step count is a multiple
        of 10.
        """
        c_placeholder = self.state_in[0]
        h_placeholder = self.state_in[1]
        feeds = {
            self.x: batch.si,
            self.ac: batch.a,
            self.adv: batch.adv,
            self.r: batch.r,
            c_placeholder: batch.features[0],
            h_placeholder: batch.features[1],
        }
        info = {}
        self.local_steps += 1
        want_summary = self.summarize and self.local_steps % 10 == 0
        if not want_summary:
            return self.sess.run(self.grads, feed_dict=feeds), info
        grad, summ = self.sess.run([self.grads, self.summary_op],
                                   feed_dict=feeds)
        info['summary'] = summ
        return grad, info

    def compute_actions(self, ob, c, h):
        """Sample an action for a single observation.

        Returns a list: the first sampled action, the value output, then the
        output LSTM state tensors' values.
        """
        fetches = [self.sample, self.vf] + self.state_out
        feeds = {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h}
        fetched = list(self.sess.run(fetches, feeds))
        # The observation was wrapped in a one-element batch; unwrap the
        # sampled action again.
        return [fetched[0][0]] + fetched[1:]

    def value(self, ob, c, h):
        """Return the value estimate for one observation and LSTM state."""
        # process_rollout is very non-intuitive due to value being a float
        feeds = {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h}
        values = self.sess.run(self.vf, feeds)
        return values[0]

    def get_initial_features(self):
        """Return the model's initial LSTM state."""
        return self.state_init
|
||||
@@ -10,7 +10,6 @@ from ray.rllib.models.preprocessors import (
|
||||
NoPreprocessor, AtariRamPreprocessor, AtariPixelPreprocessor)
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.visionnet import VisionNetwork
|
||||
from ray.rllib.models.convnet import ConvolutionalNetwork
|
||||
|
||||
|
||||
class ModelCatalog(object):
|
||||
@@ -67,10 +66,6 @@ class ModelCatalog(object):
|
||||
|
||||
return FullyConnectedNetwork(inputs, num_outputs, options)
|
||||
|
||||
@staticmethod
|
||||
def ConvolutionalNetwork(inputs, num_outputs, options=dict()):
|
||||
return ConvolutionalNetwork(inputs, num_outputs, options)
|
||||
|
||||
@staticmethod
|
||||
def get_preprocessor(env_name, obs_shape):
|
||||
"""Returns a suitable processor for the given environment.
|
||||
|
||||
@@ -3,48 +3,14 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.models.model import Model
|
||||
from ray.rllib.models.misc import normc_initializer
|
||||
|
||||
|
||||
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
|
||||
dtype=tf.float32, collections=None):
|
||||
with tf.variable_scope(name):
|
||||
stride_shape = [1, stride[0], stride[1], 1]
|
||||
filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]),
|
||||
num_filters]
|
||||
|
||||
# There are "num input feature maps * filter height * filter width"
|
||||
# inputs to each hidden unit.
|
||||
fan_in = np.prod(filter_shape[:3])
|
||||
# Each unit in the lower layer receives a gradient from: "num output
|
||||
# feature maps * filter height * filter width" / pooling size.
|
||||
fan_out = np.prod(filter_shape[:2]) * num_filters
|
||||
# Initialize weights with random weights.
|
||||
w_bound = np.sqrt(6 / (fan_in + fan_out))
|
||||
|
||||
w = tf.get_variable("W", filter_shape, dtype,
|
||||
tf.random_uniform_initializer(-w_bound, w_bound),
|
||||
collections=collections)
|
||||
b = tf.get_variable("b", [1, 1, 1, num_filters],
|
||||
initializer=tf.constant_initializer(0.0),
|
||||
collections=collections)
|
||||
return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
|
||||
|
||||
def linear(x, size, name, initializer=None, bias_init=0):
|
||||
w = tf.get_variable(name + "/w", [x.get_shape()[1], size],
|
||||
initializer=initializer)
|
||||
b = tf.get_variable(name + "/b", [size],
|
||||
initializer=tf.constant_initializer(bias_init))
|
||||
return tf.matmul(x, w) + b
|
||||
from ray.rllib.models.misc import normc_initializer, conv2d, linear
|
||||
|
||||
|
||||
class ConvolutionalNetwork(Model):
|
||||
"""Generic convolutional network."""
|
||||
|
||||
# TODO(rliaw): converge on one generic ConvNet model
|
||||
def _init(self, inputs, num_outputs, options):
|
||||
x = inputs
|
||||
with tf.name_scope("convnet"):
|
||||
|
||||
@@ -5,17 +5,8 @@ from __future__ import print_function
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.slim as slim
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.models.model import Model
|
||||
|
||||
|
||||
def normc_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
|
||||
from ray.rllib.models.misc import normc_initializer
|
||||
|
||||
|
||||
class FullyConnectedNetwork(Model):
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.rnn as rnn
|
||||
import distutils.version
|
||||
|
||||
from ray.rllib.models.misc import (conv2d, linear, flatten,
|
||||
normc_initializer)
|
||||
from ray.rllib.models.model import Model
|
||||
|
||||
use_tf100_api = (distutils.version.LooseVersion(tf.VERSION) >=
|
||||
distutils.version.LooseVersion("1.0.0"))
|
||||
|
||||
|
||||
class LSTM(Model):
    """Four stride-2 conv layers followed by a single 256-unit LSTM.

    After `_init` runs, the instance exposes `state_init` (zero-filled numpy
    arrays), `state_in` (placeholders for the incoming cell and hidden state)
    and `state_out` (slices of the final state returned by `dynamic_rnn`).
    """

    # TODO(rliaw): Add LSTM code for other algorithms
    def _init(self, inputs, num_outputs, options):
        """Build the network; returns `(logits, last_layer)`."""
        self.x = inputs
        features = inputs
        for layer_idx in range(1, 5):
            conv = conv2d(features, 32, "l{}".format(layer_idx),
                          [3, 3], [2, 2])
            features = tf.nn.elu(conv)
        # Introduce a "fake" batch dimension of 1 after flatten so that we can
        # do LSTM over the time dim.
        features = tf.expand_dims(flatten(features), [0])

        size = 256
        # Older TensorFlow releases keep the cell classes under `rnn_cell`.
        rnn_api = rnn if use_tf100_api else rnn.rnn_cell
        lstm = rnn_api.BasicLSTMCell(size, state_is_tuple=True)
        step_size = tf.shape(self.x)[:1]

        c_size = lstm.state_size.c
        h_size = lstm.state_size.h
        self.state_init = [np.zeros((1, c_size), np.float32),
                           np.zeros((1, h_size), np.float32)]
        c_in = tf.placeholder(tf.float32, [1, c_size])
        h_in = tf.placeholder(tf.float32, [1, h_size])
        self.state_in = [c_in, h_in]

        lstm_out, lstm_state = tf.nn.dynamic_rnn(
            lstm, features,
            initial_state=rnn_api.LSTMStateTuple(c_in, h_in),
            sequence_length=step_size,
            time_major=False)
        lstm_c, lstm_h = lstm_state
        last_layer = tf.reshape(lstm_out, [-1, size])
        logits = linear(last_layer, num_outputs, "action",
                        normc_initializer(0.01))
        self.state_out = [lstm_c[:1, :], lstm_h[:1, :]]
        return logits, last_layer
|
||||
@@ -3,7 +3,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@@ -13,3 +12,40 @@ def normc_initializer(std=1.0):
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
|
||||
|
||||
|
||||
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
           dtype=tf.float32, collections=None):
    """Apply a 2-D convolution plus bias inside variable scope `name`.

    The kernel "W" is drawn uniformly from [-w_bound, w_bound], where
    w_bound = sqrt(6 / (fan_in + fan_out)); the bias "b" starts at zero.
    The input channel count is read from dimension 3 of `x`.
    """
    with tf.variable_scope(name):
        in_channels = int(x.get_shape()[3])
        filter_shape = [filter_size[0], filter_size[1], in_channels,
                        num_filters]
        strides = [1, stride[0], stride[1], 1]

        # fan_in: filter height * filter width * number of input feature
        # maps, i.e. the inputs feeding each hidden unit.
        fan_in = np.prod(filter_shape[:3])
        # fan_out: filter height * filter width * number of output feature
        # maps (no pooling is applied here).
        fan_out = np.prod(filter_shape[:2]) * num_filters
        w_bound = np.sqrt(6 / (fan_in + fan_out))

        kernel_init = tf.random_uniform_initializer(-w_bound, w_bound)
        w = tf.get_variable("W", filter_shape, dtype, kernel_init,
                            collections=collections)
        b = tf.get_variable("b", [1, 1, 1, num_filters],
                            initializer=tf.constant_initializer(0.0),
                            collections=collections)
        return tf.nn.conv2d(x, w, strides, pad) + b
|
||||
|
||||
|
||||
def linear(x, size, name, initializer=None, bias_init=0):
    """Return `x @ w + b` using variables `<name>/w` and `<name>/b`.

    `w` has shape [x's dimension 1, size] and uses `initializer`; `b` has
    shape [size] and is filled with the constant `bias_init`.
    """
    in_dim = x.get_shape()[1]
    w = tf.get_variable(name + "/w", [in_dim, size], initializer=initializer)
    bias_initializer = tf.constant_initializer(bias_init)
    b = tf.get_variable(name + "/b", [size], initializer=bias_initializer)
    return tf.matmul(x, w) + b
|
||||
|
||||
|
||||
def flatten(x):
    """Reshape `x` to 2-D, keeping dimension 0 and merging all the others."""
    trailing_dims = x.get_shape().as_list()[1:]
    return tf.reshape(x, [-1, np.prod(trailing_dims)])
|
||||
|
||||
@@ -11,5 +11,7 @@ python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20
|
||||
python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PolicyGradient --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
|
||||
python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env PongDeterministic-v4 --alg A3C --config '{"num_workers": 16, "num_batches_per_iteration": 1000, "batch_size": 20}' --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/
|
||||
|
||||
Reference in New Issue
Block a user