mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:38:18 +08:00
[rllib] Pull out shared models for evolution strategies and policy gradient. (#719)
* wip * works with cartpole * lint * fix pg * comment * action dist rename * preprocessor * fix test * typo * fix the action[0] nonsense * revert * satisfy the lint * wip * works with cartpole * lint * fix pg * comment * action dist rename * preprocessor * fix test * typo * fix the action[0] nonsense * revert * satisfy the lint * Minor indentation changes. * fix merge * add humanoid * fix linting * more 4 space * fix * fix linT * oops * es parity
This commit is contained in:
committed by
Philipp Moritz
parent
8fc7dc3ed4
commit
420013774c
+12
-11
@@ -8,6 +8,7 @@ import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
import smart_open
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cStringIO as StringIO
|
||||
elif sys.version_info[0] == 3:
|
||||
@@ -60,9 +61,9 @@ class Algorithm(object):
|
||||
you should create a new algorithm instance for each training session.
|
||||
|
||||
Attributes:
|
||||
env_name (str): Name of the OpenAI gym environment to train against.
|
||||
config (obj): Algorithm-specific configuration data.
|
||||
logdir (str): Directory in which training outputs should be placed.
|
||||
env_name (str): Name of the OpenAI gym environment to train against.
|
||||
config (obj): Algorithm-specific configuration data.
|
||||
logdir (str): Directory in which training outputs should be placed.
|
||||
|
||||
TODO(ekl): support checkpoint / restore of training state.
|
||||
"""
|
||||
@@ -71,11 +72,11 @@ class Algorithm(object):
|
||||
"""Initialize an RLLib algorithm.
|
||||
|
||||
Args:
|
||||
env_name (str): The name of the OpenAI gym environment to use.
|
||||
config (obj): Algorithm-specific configuration data.
|
||||
upload_dir (str): Root directory into which the output directory
|
||||
should be placed. Can be local like file:///tmp/ray/ or on S3
|
||||
like s3://bucketname/.
|
||||
env_name (str): The name of the OpenAI gym environment to use.
|
||||
config (obj): Algorithm-specific configuration data.
|
||||
upload_dir (str): Root directory into which the output directory
|
||||
should be placed. Can be local like file:///tmp/ray/ or on S3
|
||||
like s3://bucketname/.
|
||||
"""
|
||||
upload_dir = "file:///tmp/ray" if upload_dir is None else upload_dir
|
||||
self.experiment_id = uuid.uuid4()
|
||||
@@ -88,8 +89,8 @@ class Algorithm(object):
|
||||
self.__class__.__name__,
|
||||
datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
|
||||
if upload_dir.startswith("file"):
|
||||
self.logdir = "file://" + tempfile.mkdtemp(prefix=prefix,
|
||||
dir="/tmp/ray")
|
||||
self.logdir = "file://" + tempfile.mkdtemp(
|
||||
prefix=prefix, dir="/tmp/ray")
|
||||
else:
|
||||
self.logdir = os.path.join(upload_dir, prefix)
|
||||
log_path = os.path.join(self.logdir, "config.json")
|
||||
@@ -103,7 +104,7 @@ class Algorithm(object):
|
||||
"""Runs one logical iteration of training.
|
||||
|
||||
Returns:
|
||||
A TrainingResult that describes training progress.
|
||||
A TrainingResult that describes training progress.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -73,15 +73,15 @@ class Worker(object):
|
||||
|
||||
self.env = gym.make(env_name)
|
||||
self.sess = utils.make_session(single_threaded=True)
|
||||
self.policy = policies.MujocoPolicy(self.env.observation_space,
|
||||
self.env.action_space,
|
||||
**policy_params)
|
||||
self.policy = policies.GenericPolicy(
|
||||
self.env.observation_space, self.env.action_space, **policy_params)
|
||||
tf_util.initialize()
|
||||
|
||||
self.rs = np.random.RandomState()
|
||||
|
||||
assert self.policy.needs_ob_stat == (self.config["calc_obstat_prob"] !=
|
||||
0)
|
||||
assert (
|
||||
self.policy.needs_ob_stat ==
|
||||
(self.config["calc_obstat_prob"] != 0))
|
||||
|
||||
def rollout_and_update_ob_stat(self, timestep_limit, task_ob_stat):
|
||||
if (self.policy.needs_ob_stat and
|
||||
@@ -109,15 +109,15 @@ class Worker(object):
|
||||
|
||||
noise_inds, returns, sign_returns, lengths = [], [], [], []
|
||||
# We set eps=0 because we're incrementing only.
|
||||
task_ob_stat = utils.RunningStat(self.env.observation_space.shape,
|
||||
eps=0)
|
||||
task_ob_stat = utils.RunningStat(
|
||||
self.env.observation_space.shape, eps=0)
|
||||
|
||||
# Perform some rollouts with noise.
|
||||
task_tstart = time.time()
|
||||
while (len(noise_inds) == 0 or
|
||||
time.time() - task_tstart < self.min_task_runtime):
|
||||
noise_idx = self.noise.sample_index(self.rs,
|
||||
self.policy.num_params)
|
||||
noise_idx = self.noise.sample_index(
|
||||
self.rs, self.policy.num_params)
|
||||
perturbation = self.config["noise_stdev"] * self.noise.get(
|
||||
noise_idx, self.policy.num_params)
|
||||
|
||||
@@ -133,8 +133,8 @@ class Worker(object):
|
||||
|
||||
noise_inds.append(noise_idx)
|
||||
returns.append([rews_pos.sum(), rews_neg.sum()])
|
||||
sign_returns.append([np.sign(rews_pos).sum(),
|
||||
np.sign(rews_neg).sum()])
|
||||
sign_returns.append(
|
||||
[np.sign(rews_pos).sum(), np.sign(rews_neg).sum()])
|
||||
lengths.append([len_pos, len_neg])
|
||||
|
||||
return Result(
|
||||
@@ -157,13 +157,17 @@ class EvolutionStrategies(Algorithm):
|
||||
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
|
||||
|
||||
policy_params = {
|
||||
"ac_bins": "continuous:",
|
||||
"ac_noise_std": 0.01,
|
||||
"nonlin_type": "tanh",
|
||||
"hidden_dims": [256, 256],
|
||||
"connection_type": "ff"
|
||||
"ac_noise_std": 0.01
|
||||
}
|
||||
|
||||
env = gym.make(env_name)
|
||||
utils.make_session(single_threaded=False)
|
||||
self.policy = policies.GenericPolicy(
|
||||
env.observation_space, env.action_space, **policy_params)
|
||||
tf_util.initialize()
|
||||
self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
|
||||
self.ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)
|
||||
|
||||
# Create the shared noise table.
|
||||
print("Creating shared noise table.")
|
||||
noise_id = create_shared_noise.remote()
|
||||
@@ -171,17 +175,9 @@ class EvolutionStrategies(Algorithm):
|
||||
|
||||
# Create the actors.
|
||||
print("Creating actors.")
|
||||
self.workers = [Worker.remote(config, policy_params, env_name,
|
||||
noise_id)
|
||||
for _ in range(config["num_workers"])]
|
||||
|
||||
env = gym.make(env_name)
|
||||
utils.make_session(single_threaded=False)
|
||||
self.policy = policies.MujocoPolicy(
|
||||
env.observation_space, env.action_space, **policy_params)
|
||||
tf_util.initialize()
|
||||
self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
|
||||
self.ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)
|
||||
self.workers = [
|
||||
Worker.remote(config, policy_params, env_name, noise_id)
|
||||
for _ in range(config["num_workers"])]
|
||||
|
||||
self.episodes_so_far = 0
|
||||
self.timesteps_so_far = 0
|
||||
@@ -241,13 +237,13 @@ class EvolutionStrategies(Algorithm):
|
||||
curr_task_results.append(result)
|
||||
# Update ob stats.
|
||||
if self.policy.needs_ob_stat and result.ob_count > 0:
|
||||
self.ob_stat.increment(result.ob_sum, result.ob_sumsq,
|
||||
result.ob_count)
|
||||
self.ob_stat.increment(
|
||||
result.ob_sum, result.ob_sumsq, result.ob_count)
|
||||
ob_count_this_batch += result.ob_count
|
||||
|
||||
# Assemble the results.
|
||||
noise_inds_n = np.concatenate([r.noise_inds_n for
|
||||
r in curr_task_results])
|
||||
noise_inds_n = np.concatenate(
|
||||
[r.noise_inds_n for r in curr_task_results])
|
||||
returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
|
||||
lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
|
||||
assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
|
||||
@@ -265,9 +261,10 @@ class EvolutionStrategies(Algorithm):
|
||||
for idx in noise_inds_n),
|
||||
batch_size=500)
|
||||
g /= returns_n2.size
|
||||
assert (g.shape == (self.policy.num_params,) and
|
||||
g.dtype == np.float32 and
|
||||
count == len(noise_inds_n))
|
||||
assert (
|
||||
g.shape == (self.policy.num_params,) and
|
||||
g.dtype == np.float32 and
|
||||
count == len(noise_inds_n))
|
||||
update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)
|
||||
|
||||
# Update ob stat (we're never running the policy in the master, but we
|
||||
|
||||
@@ -8,11 +8,13 @@ from __future__ import print_function
|
||||
import logging
|
||||
import pickle
|
||||
|
||||
import gym.spaces
|
||||
import h5py
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from ray.rllib.evolution_strategies import tf_util as U
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -137,24 +139,10 @@ def bins(x, dim, num_bins, name):
|
||||
return tf.argmax(scores_nab, 2)
|
||||
|
||||
|
||||
class MujocoPolicy(Policy):
|
||||
def _initialize(self, ob_space, ac_space, ac_bins, ac_noise_std,
|
||||
nonlin_type, hidden_dims, connection_type):
|
||||
class GenericPolicy(Policy):
|
||||
def _initialize(self, ob_space, ac_space, ac_noise_std):
|
||||
self.ac_space = ac_space
|
||||
self.ac_bins = ac_bins
|
||||
self.ac_noise_std = ac_noise_std
|
||||
self.hidden_dims = hidden_dims
|
||||
self.connection_type = connection_type
|
||||
|
||||
assert len(ob_space.shape) == len(self.ac_space.shape) == 1
|
||||
assert (np.all(np.isfinite(self.ac_space.low)) and
|
||||
np.all(np.isfinite(self.ac_space.high))), ("Action bounds "
|
||||
"required")
|
||||
|
||||
self.nonlin = {'tanh': tf.tanh,
|
||||
'relu': tf.nn.relu,
|
||||
'lrelu': U.lrelu,
|
||||
'elu': tf.nn.elu}[nonlin_type]
|
||||
|
||||
with tf.variable_scope(type(self).__name__) as scope:
|
||||
# Observation normalization.
|
||||
@@ -171,72 +159,24 @@ class MujocoPolicy(Policy):
|
||||
tf.assign(ob_std, in_std),
|
||||
])
|
||||
|
||||
inputs = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
|
||||
|
||||
# TODO(ekl): we should do clipping in a standard RLlib preprocessor
|
||||
clipped_inputs = tf.clip_by_value(
|
||||
(inputs - ob_mean) / ob_std, -5.0, 5.0)
|
||||
|
||||
# Policy network.
|
||||
o = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
|
||||
a = self._make_net(tf.clip_by_value((o - ob_mean) / ob_std,
|
||||
-5.0, 5.0))
|
||||
self._act = U.function([o], a)
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(
|
||||
self.ac_space, dist_type='deterministic')
|
||||
model = ModelCatalog.get_model(clipped_inputs, dist_dim)
|
||||
dist = dist_class(model.outputs)
|
||||
self._act = U.function([inputs], dist.sample())
|
||||
return scope
|
||||
|
||||
def _make_net(self, o):
|
||||
# Process observation.
|
||||
if self.connection_type == 'ff':
|
||||
x = o
|
||||
for ilayer, hd in enumerate(self.hidden_dims):
|
||||
x = self.nonlin(U.dense(x, hd, 'l{}'.format(ilayer),
|
||||
U.normc_initializer(1.0)))
|
||||
else:
|
||||
raise NotImplementedError(self.connection_type)
|
||||
|
||||
# Map to action.
|
||||
adim = self.ac_space.shape[0]
|
||||
ahigh = self.ac_space.high
|
||||
alow = self.ac_space.low
|
||||
assert isinstance(self.ac_bins, str)
|
||||
ac_bin_mode, ac_bin_arg = self.ac_bins.split(':')
|
||||
|
||||
if ac_bin_mode == 'uniform':
|
||||
# Uniformly spaced bins, from ac_space.low to ac_space.high.
|
||||
num_ac_bins = int(ac_bin_arg)
|
||||
aidx_na = bins(x, adim, num_ac_bins, 'out')
|
||||
ac_range_1a = (ahigh - alow)[None, :]
|
||||
a = (1. / (num_ac_bins - 1.) * tf.to_float(aidx_na) * ac_range_1a +
|
||||
alow[None, :])
|
||||
|
||||
elif ac_bin_mode == 'custom':
|
||||
# Custom bins specified as a list of values from -1 to 1.
|
||||
# The bins are rescaled to ac_space.low to ac_space.high.
|
||||
acvals_k = np.array(list(map(float, ac_bin_arg.split(','))),
|
||||
dtype=np.float32)
|
||||
logger.info('Custom action values: ' + ' '.join('{:.3f}'.format(x)
|
||||
for x in acvals_k))
|
||||
assert (acvals_k.ndim == 1 and acvals_k[0] == -1 and
|
||||
acvals_k[-1] == 1)
|
||||
acvals_ak = ((ahigh - alow)[:, None] /
|
||||
(acvals_k[-1] - acvals_k[0]) *
|
||||
(acvals_k - acvals_k[0])[None, :] + alow[:, None])
|
||||
|
||||
aidx_na = bins(x, adim, len(acvals_k),
|
||||
'out') # Values in [0, k-1].
|
||||
a = tf.gather_nd(
|
||||
acvals_ak,
|
||||
tf.concat([
|
||||
tf.tile(np.arange(adim)[None, :, None],
|
||||
[tf.shape(aidx_na)[0], 1, 1]),
|
||||
2,
|
||||
tf.expand_dims(aidx_na, -1)
|
||||
]) # (n, a, 2)
|
||||
) # (n, a)
|
||||
elif ac_bin_mode == 'continuous':
|
||||
a = U.dense(x, adim, 'out', U.normc_initializer(0.01))
|
||||
else:
|
||||
raise NotImplementedError(ac_bin_mode)
|
||||
|
||||
return a
|
||||
|
||||
def act(self, ob, random_stream=None):
|
||||
a = self._act(ob)
|
||||
if random_stream is not None and self.ac_noise_std != 0:
|
||||
if not isinstance(self.ac_space, gym.spaces.Discrete) and \
|
||||
random_stream is not None and self.ac_noise_std != 0:
|
||||
a += random_stream.randn(*a.shape) * self.ac_noise_std
|
||||
return a
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
Shared neural network models for RLlib.
|
||||
@@ -0,0 +1,3 @@
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
__all__ = ["ModelCatalog"]
|
||||
+50
-12
@@ -6,16 +6,38 @@ import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Categorical(object):
|
||||
def __init__(self, logits):
|
||||
self.logits = logits
|
||||
class ActionDistribution(object):
|
||||
"""The policy action distribution of an agent.
|
||||
|
||||
Args:
|
||||
inputs (Tensor): The input vector to compute samples from.
|
||||
"""
|
||||
|
||||
def __init__(self, inputs):
|
||||
self.inputs = inputs
|
||||
|
||||
def logp(self, x):
|
||||
raise NotImplementedError
|
||||
|
||||
def kl(self, other):
|
||||
raise NotImplementedError
|
||||
|
||||
def entropy(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def sample(self):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class Categorical(ActionDistribution):
|
||||
"""Categorical distribution for discrete action spaces."""
|
||||
|
||||
def logp(self, x):
|
||||
return -tf.nn.sparse_softmax_cross_entropy_with_logits(
|
||||
logits=self.logits, labels=x)
|
||||
logits=self.inputs, labels=x)
|
||||
|
||||
def entropy(self):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, reduction_indices=[1],
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
|
||||
keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True)
|
||||
@@ -23,9 +45,9 @@ class Categorical(object):
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
|
||||
|
||||
def kl(self, other):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, reduction_indices=[1],
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
|
||||
keep_dims=True)
|
||||
a1 = other.logits - tf.reduce_max(other.logits, reduction_indices=[1],
|
||||
a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1],
|
||||
keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
ea1 = tf.exp(a1)
|
||||
@@ -36,13 +58,19 @@ class Categorical(object):
|
||||
reduction_indices=[1])
|
||||
|
||||
def sample(self):
|
||||
return tf.multinomial(self.logits, 1)
|
||||
return tf.multinomial(self.inputs, 1)[0]
|
||||
|
||||
|
||||
class DiagGaussian(object):
|
||||
def __init__(self, flat):
|
||||
self.flat = flat
|
||||
mean, logstd = tf.split(flat, 2, axis=1)
|
||||
class DiagGaussian(ActionDistribution):
|
||||
"""Action distribution where each vector element is a gaussian.
|
||||
|
||||
The first half of the input vector defines the gaussian means, and the
|
||||
second half the gaussian standard deviations.
|
||||
"""
|
||||
|
||||
def __init__(self, inputs):
|
||||
ActionDistribution.__init__(self, inputs)
|
||||
mean, logstd = tf.split(inputs, 2, axis=1)
|
||||
self.mean = mean
|
||||
self.logstd = logstd
|
||||
self.std = tf.exp(logstd)
|
||||
@@ -67,3 +95,13 @@ class DiagGaussian(object):
|
||||
|
||||
def sample(self):
|
||||
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
|
||||
|
||||
|
||||
class Deterministic(ActionDistribution):
|
||||
"""Action distribution that returns the input values directly.
|
||||
|
||||
This is similar to DiagGaussian with standard deviation zero.
|
||||
"""
|
||||
|
||||
def sample(self):
|
||||
return self.inputs
|
||||
@@ -0,0 +1,76 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
|
||||
from ray.rllib.models.action_dist import (
|
||||
Categorical, Deterministic, DiagGaussian)
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.visionnet import VisionNetwork
|
||||
|
||||
|
||||
class ModelCatalog(object):
|
||||
"""Registry of default models and action distributions for envs.
|
||||
|
||||
Example:
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(env.action_space)
|
||||
model = ModelCatalog.get_model(inputs, dist_dim)
|
||||
dist = dist_class(model.outputs)
|
||||
action_op = dist.sample()
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_action_dist(action_space, dist_type=None):
|
||||
"""Returns action distribution class and size for the given action space.
|
||||
|
||||
Args:
|
||||
action_space (Space): Action space of the target gym env.
|
||||
|
||||
Returns:
|
||||
dist_class (ActionDistribution): Python class of the distribution.
|
||||
dist_dim (int): The size of the input vector to the distribution.
|
||||
"""
|
||||
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
if dist_type is None:
|
||||
return DiagGaussian, action_space.shape[0] * 2
|
||||
elif dist_type == 'deterministic':
|
||||
return Deterministic, action_space.shape[0]
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
return Categorical, action_space.n
|
||||
|
||||
raise NotImplementedError(
|
||||
"Unsupported args: {} {}".format(action_space, dist_type))
|
||||
|
||||
@staticmethod
|
||||
def get_model(inputs, num_outputs):
|
||||
"""Returns a suitable model conforming to given input and output specs.
|
||||
|
||||
Args:
|
||||
inputs (Tensor): The input tensor to the model.
|
||||
num_outputs (int): The size of the output vector of the model.
|
||||
|
||||
Returns:
|
||||
model (Model): Neural network model.
|
||||
"""
|
||||
|
||||
obs_rank = len(inputs.get_shape()) - 1
|
||||
|
||||
if obs_rank > 1:
|
||||
return VisionNetwork(inputs, num_outputs)
|
||||
|
||||
return FullyConnectedNetwork(inputs, num_outputs)
|
||||
|
||||
@staticmethod
|
||||
def get_preprocessor(env_name):
|
||||
"""Returns a suitable processor for the given environment.
|
||||
|
||||
Args:
|
||||
env_name (str): The name of the environment.
|
||||
|
||||
Returns:
|
||||
preprocessor (Preprocessor): Preprocessor for the env observations.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
@@ -0,0 +1,37 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.slim as slim
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.models.model import Model
|
||||
|
||||
|
||||
def normc_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
|
||||
|
||||
|
||||
class FullyConnectedNetwork(Model):
|
||||
"""Generic fully connected network."""
|
||||
|
||||
def _init(self, inputs, num_outputs):
|
||||
with tf.name_scope("fc_net"):
|
||||
fc1 = slim.fully_connected(
|
||||
inputs, 256, weights_initializer=normc_initializer(1.0),
|
||||
activation_fn=tf.nn.tanh,
|
||||
scope="fc1")
|
||||
fc2 = slim.fully_connected(
|
||||
fc1, 256, weights_initializer=normc_initializer(1.0),
|
||||
activation_fn=tf.nn.tanh,
|
||||
scope="fc2")
|
||||
fc3 = slim.fully_connected(
|
||||
fc2, num_outputs, weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope="fc3")
|
||||
return fc3
|
||||
@@ -0,0 +1,28 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
class Model(object):
|
||||
"""Defines an abstract network model for use with RLlib.
|
||||
|
||||
Models convert input tensors to a number of output features. These features
|
||||
can then be interpreted by ActionDistribution classes to determine
|
||||
e.g. agent action values.
|
||||
"""
|
||||
|
||||
def __init__(self, inputs, num_outputs):
|
||||
self.inputs = inputs
|
||||
self.outputs = self._init(inputs, num_outputs)
|
||||
|
||||
def _init(self):
|
||||
"""Initializes the model given self.inputs and self.num_outputs."""
|
||||
raise NotImplementedError
|
||||
|
||||
def inputs(self):
|
||||
"""Returns the input placeholder for this model."""
|
||||
return self.inputs
|
||||
|
||||
def outputs(self):
|
||||
"""Returns the output tensor of this model."""
|
||||
return self.outputs
|
||||
@@ -0,0 +1,14 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
# TODO(ekl) implement common preprocessors
|
||||
class Preprocessor(object):
|
||||
def output_shape(self):
|
||||
"""Returns the new output shape, or None if unchanged."""
|
||||
raise NotImplementedError
|
||||
|
||||
def preprocess(self, observation):
|
||||
"""Returns the preprocessed observation."""
|
||||
raise NotImplementedError
|
||||
@@ -0,0 +1,22 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.slim as slim
|
||||
|
||||
from ray.rllib.models.model import Model
|
||||
|
||||
|
||||
class VisionNetwork(Model):
|
||||
"""Generic vision network."""
|
||||
|
||||
def _init(self, inputs, num_outputs):
|
||||
with tf.name_scope("vision_net"):
|
||||
conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope="conv1")
|
||||
conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope="conv2")
|
||||
fc1 = slim.conv2d(
|
||||
conv2, 512, [10, 10], padding="VALID", scope="fc1")
|
||||
fc2 = slim.conv2d(fc1, num_outputs, [1, 1], activation_fn=None,
|
||||
normalizer_fn=None, scope="fc2")
|
||||
return tf.squeeze(fc2, [1, 2])
|
||||
@@ -11,7 +11,7 @@ from tensorflow.python import debug as tf_debug
|
||||
import ray
|
||||
|
||||
from ray.rllib.parallel import LocalSyncParallelOptimizer
|
||||
from ray.rllib.policy_gradient.distributions import Categorical, DiagGaussian
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.policy_gradient.env import BatchedEnv
|
||||
from ray.rllib.policy_gradient.loss import ProximalPolicyLoss
|
||||
from ray.rllib.policy_gradient.filter import MeanStdFilter
|
||||
@@ -33,8 +33,8 @@ class Agent(object):
|
||||
network weights. When run as a remote agent, only this graph is used.
|
||||
"""
|
||||
|
||||
def __init__(self, name, batchsize, preprocessor, config, logdir,
|
||||
is_remote):
|
||||
def __init__(
|
||||
self, name, batchsize, preprocessor, config, logdir, is_remote):
|
||||
if is_remote:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||
devices = ["/cpu:0"]
|
||||
@@ -54,37 +54,30 @@ class Agent(object):
|
||||
self.sess = tf.Session(config=config_proto)
|
||||
if config["use_tf_debugger"] and not is_remote:
|
||||
self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
|
||||
self.sess.add_tensor_filter("has_inf_or_nan",
|
||||
tf_debug.has_inf_or_nan)
|
||||
self.sess.add_tensor_filter(
|
||||
"has_inf_or_nan", tf_debug.has_inf_or_nan)
|
||||
|
||||
# Defines the training inputs.
|
||||
self.kl_coeff = tf.placeholder(name="newkl", shape=(),
|
||||
dtype=tf.float32)
|
||||
self.observations = tf.placeholder(tf.float32,
|
||||
shape=(None,) + preprocessor.shape)
|
||||
self.kl_coeff = tf.placeholder(
|
||||
name="newkl", shape=(), dtype=tf.float32)
|
||||
self.observations = tf.placeholder(
|
||||
tf.float32, shape=(None,) + preprocessor.shape)
|
||||
self.advantages = tf.placeholder(tf.float32, shape=(None,))
|
||||
|
||||
action_space = self.env.action_space
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
# The first half of the dimensions are the means, the second half
|
||||
# are the standard deviations.
|
||||
self.action_dim = action_space.shape[0]
|
||||
self.action_shape = (self.action_dim,)
|
||||
self.logit_dim = 2 * self.action_dim
|
||||
self.actions = tf.placeholder(tf.float32,
|
||||
shape=(None, self.action_dim))
|
||||
self.distribution_class = DiagGaussian
|
||||
self.actions = tf.placeholder(
|
||||
tf.float32, shape=(None, action_space.shape[0]))
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
self.action_dim = action_space.n
|
||||
self.action_shape = ()
|
||||
self.logit_dim = self.action_dim
|
||||
self.actions = tf.placeholder(tf.int64, shape=(None,))
|
||||
self.distribution_class = Categorical
|
||||
else:
|
||||
raise NotImplemented("action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
self.prev_logits = tf.placeholder(tf.float32,
|
||||
shape=(None, self.logit_dim))
|
||||
raise NotImplemented(
|
||||
"action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
|
||||
action_space)
|
||||
self.prev_logits = tf.placeholder(
|
||||
tf.float32, shape=(None, self.logit_dim))
|
||||
|
||||
assert config["sgd_batchsize"] % len(devices) == 0, \
|
||||
"Batch size must be evenly divisible by devices"
|
||||
@@ -99,7 +92,8 @@ class Agent(object):
|
||||
return ProximalPolicyLoss(
|
||||
self.env.observation_space, self.env.action_space,
|
||||
obs, advs, acts, plog, self.logit_dim,
|
||||
self.kl_coeff, self.distribution_class, self.config, self.sess)
|
||||
self.kl_coeff, self.distribution_class, self.config,
|
||||
self.sess)
|
||||
|
||||
self.par_opt = LocalSyncParallelOptimizer(
|
||||
tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
|
||||
@@ -118,14 +112,13 @@ class Agent(object):
|
||||
self.mean_kl = tf.reduce_mean(
|
||||
tf.stack(values=[policy.mean_kl for policy in policies]), 0)
|
||||
self.mean_entropy = tf.reduce_mean(
|
||||
tf.stack(values=[policy.mean_entropy for policy in policies]),
|
||||
0)
|
||||
tf.stack(
|
||||
values=[policy.mean_entropy for policy in policies]), 0)
|
||||
|
||||
# References to the model weights
|
||||
self.common_policy = self.par_opt.get_common_loss()
|
||||
self.variables = ray.experimental.TensorFlowVariables(
|
||||
self.common_policy.loss,
|
||||
self.sess)
|
||||
self.common_policy.loss, self.sess)
|
||||
self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
|
||||
self.reward_filter = MeanStdFilter((), clip=5.0)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
@@ -139,8 +132,8 @@ class Agent(object):
|
||||
trajectories["logprobs"]],
|
||||
full_trace=full_trace)
|
||||
|
||||
def run_sgd_minibatch(self, batch_index, kl_coeff, full_trace,
|
||||
file_writer):
|
||||
def run_sgd_minibatch(
|
||||
self, batch_index, kl_coeff, full_trace, file_writer):
|
||||
return self.par_opt.optimize(
|
||||
self.sess,
|
||||
batch_index,
|
||||
|
||||
@@ -55,8 +55,7 @@ class BatchedEnv(object):
|
||||
observations.append(np.zeros(self.shape))
|
||||
rewards.append(0.0)
|
||||
continue
|
||||
observation, reward, done, info = self.envs[i].step(
|
||||
action if len(action) > 1 else action[0])
|
||||
observation, reward, done, info = self.envs[i].step(action)
|
||||
if render:
|
||||
self.envs[0].render()
|
||||
observations.append(self.preprocessor(observation))
|
||||
|
||||
@@ -4,8 +4,8 @@ from __future__ import print_function
|
||||
|
||||
import gym.spaces
|
||||
import tensorflow as tf
|
||||
from ray.rllib.policy_gradient.models.visionnet import vision_net
|
||||
from ray.rllib.policy_gradient.models.fcnet import fc_net
|
||||
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
|
||||
class ProximalPolicyLoss(object):
|
||||
@@ -21,11 +21,8 @@ class ProximalPolicyLoss(object):
|
||||
# Saved so that we can compute actions given different observations
|
||||
self.observations = observations
|
||||
|
||||
if len(observation_space.shape) > 1:
|
||||
self.curr_logits = vision_net(observations, num_classes=logit_dim)
|
||||
else:
|
||||
assert len(observation_space.shape) == 1
|
||||
self.curr_logits = fc_net(observations, num_classes=logit_dim)
|
||||
self.curr_logits = ModelCatalog.get_model(
|
||||
observations, logit_dim).outputs
|
||||
self.curr_dist = distribution_class(self.curr_logits)
|
||||
self.sampler = self.curr_dist.sample()
|
||||
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.slim as slim
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def normc_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
|
||||
|
||||
|
||||
def fc_net(inputs, num_classes=10, logstd=False):
|
||||
with tf.name_scope("fc_net"):
|
||||
fc1 = slim.fully_connected(inputs, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope="fc1")
|
||||
fc2 = slim.fully_connected(fc1, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope="fc2")
|
||||
fc3 = slim.fully_connected(fc2, 128,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
scope="fc3")
|
||||
fc4 = slim.fully_connected(fc3, num_classes,
|
||||
weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope="fc4")
|
||||
if logstd:
|
||||
logstd = tf.get_variable(name="logstd", shape=[num_classes],
|
||||
initializer=tf.zeros_initializer)
|
||||
return tf.concat(1, [fc4, logstd])
|
||||
else:
|
||||
return fc4
|
||||
@@ -1,16 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.slim as slim
|
||||
|
||||
|
||||
def vision_net(inputs, num_classes=10):
|
||||
with tf.name_scope("vision_net"):
|
||||
conv1 = slim.conv2d(inputs, 16, [8, 8], 4, scope="conv1")
|
||||
conv2 = slim.conv2d(conv1, 32, [4, 4], 2, scope="conv2")
|
||||
fc1 = slim.conv2d(conv2, 512, [10, 10], padding="VALID", scope="fc1")
|
||||
fc2 = slim.conv2d(fc1, num_classes, [1, 1], activation_fn=None,
|
||||
normalizer_fn=None, scope="fc2")
|
||||
return tf.squeeze(fc2, [1, 2])
|
||||
@@ -48,8 +48,7 @@ class PolicyGradient(Algorithm):
|
||||
|
||||
Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
|
||||
|
||||
# TODO(ekl): The preprocessor should be associated with the env
|
||||
# elsewhere.
|
||||
# TODO(ekl): preprocessor should be associated with the env elsewhere
|
||||
if self.env_name == "Pong-v0":
|
||||
preprocessor = AtariPixelPreprocessor()
|
||||
elif self.env_name == "Pong-ram-v3":
|
||||
@@ -58,6 +57,8 @@ class PolicyGradient(Algorithm):
|
||||
preprocessor = NoPreprocessor()
|
||||
elif self.env_name == "Walker2d-v1":
|
||||
preprocessor = NoPreprocessor()
|
||||
elif self.env_name == "Humanoid-v1":
|
||||
preprocessor = NoPreprocessor()
|
||||
else:
|
||||
preprocessor = AtariPixelPreprocessor()
|
||||
|
||||
@@ -93,8 +94,8 @@ class PolicyGradient(Algorithm):
|
||||
if config["model_checkpoint_file"]:
|
||||
checkpoint_path = saver.save(
|
||||
model.sess,
|
||||
os.path.join(self.logdir,
|
||||
config["model_checkpoint_file"] % j))
|
||||
os.path.join(
|
||||
self.logdir, config["model_checkpoint_file"] % j))
|
||||
print("Checkpoint saved in file: %s" % checkpoint_path)
|
||||
checkpointing_end = time.time()
|
||||
weights = ray.put(model.get_weights())
|
||||
@@ -135,8 +136,8 @@ class PolicyGradient(Algorithm):
|
||||
for i in range(config["num_sgd_iter"]):
|
||||
sgd_start = time.time()
|
||||
batch_index = 0
|
||||
num_batches = (int(tuples_per_device) //
|
||||
int(model.per_device_batch_size))
|
||||
num_batches = (
|
||||
int(tuples_per_device) // int(model.per_device_batch_size))
|
||||
loss, kl, entropy = [], [], []
|
||||
permutation = np.random.permutation(num_batches)
|
||||
while batch_index < num_batches:
|
||||
@@ -155,8 +156,8 @@ class PolicyGradient(Algorithm):
|
||||
kl = np.mean(kl)
|
||||
entropy = np.mean(entropy)
|
||||
sgd_end = time.time()
|
||||
print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl,
|
||||
entropy))
|
||||
print(
|
||||
"{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))
|
||||
|
||||
values = []
|
||||
if i == config["num_sgd_iter"] - 1:
|
||||
|
||||
@@ -7,11 +7,12 @@ import numpy as np
|
||||
import tensorflow as tf
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from ray.rllib.policy_gradient.distributions import Categorical
|
||||
from ray.rllib.models.action_dist import Categorical
|
||||
from ray.rllib.policy_gradient.utils import flatten, concatenate
|
||||
|
||||
|
||||
class DistibutionsTest(unittest.TestCase):
|
||||
# TODO(ekl): move to rllib/models dir
|
||||
class DistributionsTest(unittest.TestCase):
|
||||
|
||||
def testCategorical(self):
|
||||
num_samples = 100000
|
||||
|
||||
Reference in New Issue
Block a user