pytorch version

This commit is contained in:
Kolesnikov Sergey
2017-11-15 22:18:46 +03:00
parent 34993abdf7
commit 7401266fe7
49 changed files with 5435 additions and 1 deletions
+4
View File
@@ -99,3 +99,7 @@ ENV/
# mypy # mypy
.mypy_cache/ .mypy_cache/
.DS_Store
.idea
log*
+75 -1
View File
@@ -1,2 +1,76 @@
# Run-Skeleton-Run # Run-Skeleton-Run
[Reason8.ai](https://reason8.ai) PyTorch solution that took 3rd place in the [NIPS RL 2017 challenge](https://www.crowdai.org/challenges/nips-2017-learning-to-run/leaderboards?challenge_round_id=12).
Additional thanks to [Michail Pavlov](https://github.com/fgvbrt) for collaboration.
## Agent policies
### no-flip-state-action
![Alt Text](http://www.sheawong.com/wp-content/uploads/2013/08/keephatin.gif)
![alt text](https://github.com/Scitator/Run-Skeleton-Run/blob/master/gifs/noflip.gif)
### flip-state-action
![alt text](https://github.com/Scitator/Run-Skeleton-Run/blob/master/gifs/flip.gif)
## How to set up the environment?
1. `sh setup_conda.sh`
2. `source activate opensim-rl`
Would you like to test the baselines? (Requires MPI support.)
3. `sudo apt-get install openmpi-bin openmpi-doc libopenmpi-dev`
3+. `sh setup_env_mpi.sh`
Or would you like to test the DDPG agents?
3. `sh setup_env.sh`
4. Congrats! Now you are ready to check our agents.
## Run DDPG agent
```
CUDA_VISIBLE_DEVICES="" PYTHONPATH=. python ddpg/train.py \
--logdir ./logs_ddpg \
--num-threads 4 \
--ddpg-wrapper \
--skip-frames 5 \
--fail-reward -0.2 \
--reward-scale 10 \
--flip-state-action \
--actor-layers 64-64 --actor-layer-norm --actor-parameters-noise \
--actor-lr 0.001 --actor-lr-end 0.00001 \
--critic-layers 64-32 --critic-layer-norm \
--critic-lr 0.002 --critic-lr-end 0.00001 \
--initial-epsilon 0.5 --final-epsilon 0.001 \
--tau 0.0001
```
## Evaluate DDPG agent
```
CUDA_VISIBLE_DEVICES="" PYTHONPATH=./ python ddpg/submit.py \
--restore-actor-from ./logs_ddpg/actor_state_dict.pkl \
--restore-critic-from ./logs_ddpg/critic_state_dict.pkl \
--restore-args-from ./logs_ddpg/args.json \
--num-episodes 10
```
## Run TRPO/PPO agent
```
CUDA_VISIBLE_DEVICES="" PYTHONPATH=. python ddpg/train.py \
--agent ppo \
--logdir ./logs_baseline \
--baseline-wrapper \
--skip-frames 5 \
--fail-reward -0.2 \
--reward-scale 10
```
View File
+4
View File
@@ -0,0 +1,4 @@
from baselines.baselines_common.console_util import *
from baselines.baselines_common.dataset import Dataset
from baselines.baselines_common.math_util import *
from baselines.baselines_common.misc_util import *
+38
View File
@@ -0,0 +1,38 @@
import numpy as np
def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
    """
    Solve A x = b with the conjugate gradient method (Demmel p 312).

    Parameters
    ----------
    f_Ax: callable
        maps a vector p to the matrix-vector product A p
    b: np.ndarray
        right-hand side vector
    cg_iters: int
        maximum number of iterations
    callback: callable or None
        if given, called with the current iterate at the start of every
        iteration and once more after the loop
    verbose: bool
        print the iteration index, r.dot(r) and the norm of the iterate
    residual_tol: float
        stop as soon as r.dot(r) drops below this value

    Returns
    -------
    x: np.ndarray
        the approximate solution (all zeros when b is exactly zero)
    """
    p = b.copy()
    r = b.copy()
    x = np.zeros_like(b)
    rdotr = r.dot(r)

    fmtstr = "%10i %10.3g %10.3g"
    titlestr = "%10s %10s %10s"
    if verbose:
        print(titlestr % ("iter", "residual norm", "soln norm"))

    # Tracked separately from the loop variable so the final report also works
    # when the loop body never runs (cg_iters == 0).
    iters_done = 0
    for i in range(cg_iters):
        if rdotr == 0:
            # The residual is exactly zero, so x already solves the system;
            # another step would compute 0 / 0 and fill x with NaNs.
            break
        if callback is not None:
            callback(x)
        if verbose:
            print(fmtstr % (i, rdotr, np.linalg.norm(x)))
        z = f_Ax(p)
        v = rdotr / p.dot(z)
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        mu = newrdotr / rdotr
        p = r + mu * p

        rdotr = newrdotr
        iters_done = i + 1
        if rdotr < residual_tol:
            break

    if callback is not None:
        callback(x)
    if verbose:
        print(fmtstr % (iters_done, rdotr, np.linalg.norm(x)))
    return x
@@ -0,0 +1,62 @@
from __future__ import print_function
from contextlib import contextmanager
import numpy as np
import time
# ================================================================
# Misc
# ================================================================
def fmt_row(width, row, header=False):
    """
    Format the items of `row` as one line of cells separated by " | ".

    Every cell is produced by fmt_item(item, width). When `header` is true a
    second line of dashes, as long as the first line, is appended.
    """
    cells = [fmt_item(cell, width) for cell in row]
    line = " | ".join(cells)
    if not header:
        return line
    return line + "\n" + "-" * len(line)
def fmt_item(x, l):
    """
    Render `x` as a string left-padded with spaces to at least `l` characters.

    0-dimensional numpy arrays are unwrapped to Python scalars first. Floats,
    including NumPy floating scalars such as np.float32, are formatted with
    "%g"; everything else with str(). Strings longer than `l` are returned
    unpadded and untruncated.
    """
    if isinstance(x, np.ndarray):
        assert x.ndim == 0
        x = x.item()
    # np.float32 and friends are not subclasses of float; without np.floating
    # they would bypass "%g" and print with full precision.
    if isinstance(x, (float, np.floating)):
        rep = "%g" % x
    else:
        rep = str(x)
    return " " * (l - len(rep)) + rep
# Maps a color name to the numeric code that colorize() puts into the
# terminal escape sequence.
color2num = {
    'gray': 30,
    'red': 31,
    'green': 32,
    'yellow': 33,
    'blue': 34,
    'magenta': 35,
    'cyan': 36,
    'white': 37,
    'crimson': 38,
}
def colorize(string, color, bold=False, highlight=False):
    """
    Wrap `string` in a terminal escape sequence selecting `color`.

    `color` must be a key of color2num. `highlight` adds 10 to the color code
    and `bold` appends the attribute '1'. The sequence is reset at the end of
    the returned string.
    """
    code = color2num[color]
    if highlight:
        code += 10
    attrs = [str(code)]
    if bold:
        attrs.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(attrs), string)
# Current nesting level of timed() blocks; used to indent their messages.
MESSAGE_DEPTH = 0


@contextmanager
def timed(msg):
    """
    Context manager that prints `msg` on entry and the elapsed time on exit.

    Nested uses are indented by one tab per level through the module-level
    MESSAGE_DEPTH counter. If the managed block raises, the counter is still
    restored and the exception propagates; no "done" line is printed then.
    """
    global MESSAGE_DEPTH  # pylint: disable=W0603
    print(colorize('\t' * MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
    tstart = time.time()
    MESSAGE_DEPTH += 1
    try:
        yield
    finally:
        # Without this an exception inside the block would leave every later
        # message permanently shifted to the right.
        MESSAGE_DEPTH -= 1
    print(colorize('\t' * MESSAGE_DEPTH + "done in %.3f seconds" % (time.time() - tstart),
                   color='magenta'))
+63
View File
@@ -0,0 +1,63 @@
import numpy as np
class Dataset(object):
    """
    Batching view over a dict of arrays that share their first dimension.

    The dict passed in is kept by reference and its values are replaced by
    permuted copies whenever the dataset is shuffled.
    """

    def __init__(self, data_map, deterministic=False, shuffle=True):
        self.data_map = data_map
        self.deterministic = deterministic
        self.enable_shuffle = shuffle
        first_array = next(iter(data_map.values()))
        self.n = first_array.shape[0]
        self._next_id = 0
        self.shuffle()

    def shuffle(self):
        """Permute all arrays with one random permutation; no-op when deterministic."""
        if self.deterministic:
            return
        order = np.arange(self.n)
        np.random.shuffle(order)
        for name, values in self.data_map.items():
            self.data_map[name] = values[order]
        self._next_id = 0

    def next_batch(self, batch_size):
        """Return the next (possibly shorter) slice of every array as a new dict."""
        exhausted = self._next_id >= self.n
        if exhausted and self.enable_shuffle:
            self.shuffle()

        start = self._next_id
        count = min(batch_size, self.n - start)
        self._next_id = start + count

        stop = start + count
        return {name: values[start:stop] for name, values in self.data_map.items()}

    def iterate_once(self, batch_size):
        """Yield full batches until fewer than `batch_size` elements remain."""
        if self.enable_shuffle:
            self.shuffle()

        while self._next_id <= self.n - batch_size:
            yield self.next_batch(batch_size)
        self._next_id = 0

    def subset(self, num_elements, deterministic=True):
        """Return a new Dataset over the first `num_elements` rows of every array."""
        head = {name: values[:num_elements] for name, values in self.data_map.items()}
        return Dataset(head, deterministic)
def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True,
                include_final_partial_batch=True):
    """
    Iterate over aligned mini-batches of several arrays.

    Parameters
    ----------
    arrays: sequence of array-likes
        all must have the same length along the first dimension
    num_batches: int or None
        split the data into this many nearly equal batches
    batch_size: int or None
        split the data into batches of this size; exactly one of
        `num_batches` and `batch_size` must be given
    shuffle: bool
        shuffle the row indices (with np.random) before splitting
    include_final_partial_batch: bool
        only meaningful together with `batch_size`: when false, a trailing
        batch shorter than `batch_size` is dropped

    Yields
    ------
    batch: tuple of np.ndarray
        one array per input, indexed with the same rows
    """
    assert (num_batches is None) != (
        batch_size is None), 'Provide num_batches or batch_size, but not both'
    arrays = tuple(map(np.asarray, arrays))
    n = arrays[0].shape[0]
    assert all(a.shape[0] == n for a in arrays[1:])
    inds = np.arange(n)
    if shuffle:
        np.random.shuffle(inds)
    sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
    for batch_inds in np.array_split(inds, sections):
        # With num_batches there is no target size to compare against
        # (batch_size is None), so every batch is kept; previously that
        # combination silently yielded nothing.
        keep = (include_final_partial_batch
                or batch_size is None
                or len(batch_inds) == batch_size)
        if keep:
            yield tuple(a[batch_inds] for a in arrays)
+377
View File
@@ -0,0 +1,377 @@
import tensorflow as tf
import numpy as np
import baselines.baselines_common.tf_util as U
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
class Pd(object):
    """
    A particular probability distribution
    """

    def flatparam(self):
        """Return the flat parameters of the distribution (subclass hook)."""
        raise NotImplementedError

    def mode(self):
        """Return the mode of the distribution (subclass hook)."""
        raise NotImplementedError

    def neglogp(self, x):
        """Return the negative log-probability of x (subclass hook)."""
        # Usually it's easier to define the negative logprob
        raise NotImplementedError

    def kl(self, other):
        """Return the KL divergence from this distribution to `other` (subclass hook)."""
        raise NotImplementedError

    def entropy(self):
        """Return the entropy of the distribution (subclass hook)."""
        raise NotImplementedError

    def sample(self):
        """Return a sample from the distribution (subclass hook)."""
        raise NotImplementedError

    def logp(self, x):
        """Return the log-probability of x, i.e. the negated neglogp(x)."""
        negative_log_prob = self.neglogp(x)
        return - negative_log_prob
class PdType(object):
    """
    Parametrized family of probability distributions
    """

    def pdclass(self):
        """Return the distribution class of this family (subclass hook)."""
        raise NotImplementedError

    def pdfromflat(self, flat):
        """Instantiate the distribution class returned by pdclass() on `flat`."""
        distribution_cls = self.pdclass()
        return distribution_cls(flat)

    def param_shape(self):
        """Return the shape of the flat parameters as a list (subclass hook)."""
        raise NotImplementedError

    def sample_shape(self):
        """Return the shape of one sample as a list (subclass hook)."""
        raise NotImplementedError

    def sample_dtype(self):
        """Return the dtype of samples (subclass hook)."""
        raise NotImplementedError

    def param_placeholder(self, prepend_shape, name=None):
        """Create a float32 placeholder of shape prepend_shape + param_shape()."""
        full_shape = prepend_shape + self.param_shape()
        return tf.placeholder(dtype=tf.float32, shape=full_shape, name=name)

    def sample_placeholder(self, prepend_shape, name=None):
        """Create a placeholder of dtype sample_dtype() and shape prepend_shape + sample_shape()."""
        dtype = self.sample_dtype()
        full_shape = prepend_shape + self.sample_shape()
        return tf.placeholder(dtype=dtype, shape=full_shape, name=name)
class CategoricalPdType(PdType):
    """Family of categorical distributions over `ncat` categories."""

    def __init__(self, ncat):
        self.ncat = ncat

    def pdclass(self):
        return CategoricalPd

    def param_shape(self):
        # one logit per category
        shape = [self.ncat]
        return shape

    def sample_shape(self):
        # a sample is a single category index
        return list()

    def sample_dtype(self):
        return tf.int32
class MultiCategoricalPdType(PdType):
    """
    Family of products of categorical distributions.

    Component k has high[k] - low[k] + 1 categories; `low` and `high` must
    support elementwise arithmetic (e.g. numpy arrays).
    """

    def __init__(self, low, high):
        self.low = low
        self.high = high
        self.ncats = high - low + 1

    def pdclass(self):
        return MultiCategoricalPd

    def pdfromflat(self, flat):
        # MultiCategoricalPd also needs the bounds, so the generic
        # pdclass()(flat) construction of the base class is overridden
        return MultiCategoricalPd(self.low, self.high, flat)

    def param_shape(self):
        total_logits = sum(self.ncats)
        return [total_logits]

    def sample_shape(self):
        num_components = len(self.ncats)
        return [num_components]

    def sample_dtype(self):
        return tf.int32
class DiagGaussianPdType(PdType):
    """Family of diagonal Gaussians over vectors of length `size`."""

    def __init__(self, size):
        self.size = size

    def pdclass(self):
        return DiagGaussianPd

    def param_shape(self):
        # flat parameters are twice the sample size (DiagGaussianPd splits
        # them in two halves)
        doubled = 2 * self.size
        return [doubled]

    def sample_shape(self):
        shape = [self.size]
        return shape

    def sample_dtype(self):
        return tf.float32
class BernoulliPdType(PdType):
    """Family of `size` independent Bernoulli variables."""

    def __init__(self, size):
        self.size = size

    def pdclass(self):
        return BernoulliPd

    def param_shape(self):
        # one logit per variable
        shape = [self.size]
        return shape

    def sample_shape(self):
        shape = [self.size]
        return shape

    def sample_dtype(self):
        return tf.int32
# WRONG SECOND DERIVATIVES
# class CategoricalPd(Pd):
# def __init__(self, logits):
# self.logits = logits
# self.ps = tf.nn.softmax(logits)
# @classmethod
# def fromflat(cls, flat):
# return cls(flat)
# def flatparam(self):
# return self.logits
# def mode(self):
# return U.argmax(self.logits, axis=-1)
# def logp(self, x):
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
# def kl(self, other):
# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
# - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def entropy(self):
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def sample(self):
# u = tf.random_uniform(tf.shape(self.logits))
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
class CategoricalPd(Pd):
    """Categorical distribution parametrized by unnormalized logits (last axis)."""

    def __init__(self, logits):
        self.logits = logits

    def flatparam(self):
        return self.logits

    def mode(self):
        # index of the largest logit along the last axis
        return U.argmax(self.logits, axis=-1)

    def neglogp(self, x):
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        #       the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)

    def kl(self, other):
        # Both sets of logits are shifted by their maximum before
        # exponentiating; a0 - log(z0) and a1 - log(z1) are the two
        # log-probabilities.
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        z1 = U.sum(ea1, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)

    def entropy(self):
        # -sum p0 * log p0 with log p0 = a0 - log(z0)
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)

    def sample(self):
        # argmax over logits perturbed with -log(-log(u)), u uniform
        u = tf.random_uniform(tf.shape(self.logits))
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)

    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
class MultiCategoricalPd(Pd):
    """
    Product of independent categorical distributions.

    `flat` is split along its last axis into one CategoricalPd per component;
    `low` is added back to modes and samples and subtracted from inputs of
    neglogp.
    """

    def __init__(self, low, high, flat):
        self.flat = flat
        self.low = tf.constant(low, dtype=tf.int32)
        # high - low + 1 is passed to tf.split as the split specification
        # along the last axis of `flat`
        self.categoricals = list(
            map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))

    def flatparam(self):
        return self.flat

    def mode(self):
        return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1),
                                  tf.int32)

    def neglogp(self, x):
        # sum of the per-component negative log-probabilities of the
        # low-shifted values, unstacked along the last axis of x
        return tf.add_n([p.neglogp(px) for p, px in zip(
            self.categoricals, tf.unstack(x - self.low,
                                          axis=len(x.get_shape()) - 1))])

    def kl(self, other):
        return tf.add_n([
            p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
        ])

    def entropy(self):
        return tf.add_n([p.entropy() for p in self.categoricals])

    def sample(self):
        return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1),
                                  tf.int32)

    @classmethod
    def fromflat(cls, flat):
        # the bounds cannot be recovered from the flat parameters alone
        raise NotImplementedError
class DiagGaussianPd(Pd):
    """
    Gaussian with diagonal covariance.

    `flat` is split in two equal halves along its last axis: the mean and the
    log standard deviation.
    """

    def __init__(self, flat):
        self.flat = flat
        mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat)
        self.mean = mean
        self.logstd = logstd
        self.std = tf.exp(logstd)

    def flatparam(self):
        return self.flat

    def mode(self):
        return self.mean

    def neglogp(self, x):
        # squared standardized distance + normalization constant per
        # dimension + sum of log standard deviations
        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
               + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
               + U.sum(self.logstd, axis=-1)

    def kl(self, other):
        assert isinstance(other, DiagGaussianPd)
        return U.sum(other.logstd - self.logstd + (
            tf.square(self.std) + tf.square(self.mean - other.mean)) / (
                         2.0 * tf.square(other.std)) - 0.5, axis=-1)

    def entropy(self):
        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)

    def sample(self):
        # mean plus std-scaled standard normal noise
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))

    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
class BernoulliPd(Pd):
    """Independent Bernoulli variables parametrized by logits; `ps` holds sigmoid(logits)."""

    def __init__(self, logits):
        self.logits = logits
        self.ps = tf.sigmoid(logits)

    def flatparam(self):
        return self.logits

    def mode(self):
        return tf.round(self.ps)

    def neglogp(self, x):
        # x is cast to float and used as the label of the sigmoid cross
        # entropy, summed over the last axis
        return U.sum(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)),
            axis=-1)

    def kl(self, other):
        # cross entropy of `other` under self.ps minus the entropy of self
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps),
                     axis=-1) - U.sum(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)

    def entropy(self):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps),
                     axis=-1)

    def sample(self):
        # NOTE(review): the result is a float tensor, while
        # BernoulliPdType.sample_dtype() reports tf.int32 - confirm which one
        # callers rely on.
        u = tf.random_uniform(tf.shape(self.ps))
        return tf.to_float(math_ops.less(u, self.ps))

    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
def make_pdtype(ac_space):
    """
    Pick the distribution family matching a gym action space.

    Box (1-d only) -> DiagGaussianPdType, Discrete -> CategoricalPdType,
    MultiDiscrete -> MultiCategoricalPdType, MultiBinary -> BernoulliPdType.
    Any other space raises NotImplementedError.
    """
    from gym import spaces

    if isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    if isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    if isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.low, ac_space.high)
    if isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    raise NotImplementedError
def shape_el(v, i):
    """
    Return the size of dimension `i` of tensor `v`.

    The statically known size is returned as a plain int when available;
    otherwise the dynamic size tf.shape(v)[i] is returned.
    """
    # as_list() yields ints or None. Indexing the TensorShape directly can
    # give a Dimension object, which is never None even when the size is
    # unknown, so the dynamic fallback would never be reached.
    maybe = v.get_shape().as_list()[i]
    if maybe is not None:
        return maybe
    else:
        return tf.shape(v)[i]
@U.in_session
def test_probtypes():
    """Run validate_probtype on a diagonal Gaussian, a categorical and a Bernoulli family."""
    np.random.seed(0)

    pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
    pdparam_categorical = np.array([-.2, .3, .5])
    pdparam_bernoulli = np.array([-.2, .3, .5])

    # (family constructor, constructor argument, flat parameters)
    cases = [
        # the Gaussian parameter vector holds mean and logstd, hence half its size
        (DiagGaussianPdType, pdparam_diag_gauss.size // 2, pdparam_diag_gauss),  # pylint: disable=E1101
        (CategoricalPdType, pdparam_categorical.size, pdparam_categorical),  # pylint: disable=E1101
        (BernoulliPdType, pdparam_bernoulli.size, pdparam_bernoulli),  # pylint: disable=E1101
    ]
    for family, size, pdparam in cases:
        validate_probtype(family(size), pdparam)
def validate_probtype(probtype, pdparam):
    """
    Monte-Carlo consistency check of a distribution family.

    With N samples drawn from the distribution given by `pdparam`, asserts
    that (1) the mean negative log-likelihood matches entropy() and (2) kl()
    against a perturbed parameter vector matches -entropy - E_p[log q], both
    within three standard errors.
    """
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)  # N identical parameter rows
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    # q: the original parameters plus Gaussian noise of scale 0.1
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    # log-likelihood under q of the samples that were drawn from p
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
+92
View File
@@ -0,0 +1,92 @@
import numpy as np
import scipy.signal
def discount(x, gamma):
    """
    Discounted cumulative sums along the first axis of x.

    inputs
    ------
    x: ndarray
    gamma: float

    outputs
    -------
    y: ndarray with same shape as x, satisfying

        y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
                where k = len(x) - t - 1
    """
    assert x.ndim >= 1
    # The recursion y[t] = x[t] + gamma * y[t+1] runs backwards in time, so
    # the filter is applied to the time-reversed input and the result is
    # reversed back.
    backwards = x[::-1]
    filtered = scipy.signal.lfilter([1], [1, -gamma], backwards, axis=0)
    return filtered[::-1]
def explained_variance(ypred, y):
    """
    Fraction of the variance of y that is explained by ypred.

    Returns 1 - Var[y-ypred] / Var[y], or nan when Var[y] is zero.
    Both arguments must be 1-dimensional.

    interpretation:
        ev=0  =>  might as well have predicted zero
        ev=1  =>  perfect prediction
        ev<0  =>  worse than just predicting zero
    """
    assert y.ndim == 1 and ypred.ndim == 1
    target_variance = np.var(y)
    if target_variance == 0:
        return np.nan
    return 1 - np.var(y - ypred) / target_variance
def explained_variance_2d(ypred, y):
    """
    Column-wise explained variance for 2-dimensional arrays.

    For every column j returns 1 - Var[y[:, j] - ypred[:, j]] / Var[y[:, j]];
    columns whose target variance is below 1e-10 get 0.
    """
    assert y.ndim == 2 and ypred.ndim == 2
    vary = np.var(y, axis=0)
    # The residual variance has to be taken per column as well; without
    # axis=0 a single variance over all columns was compared with each
    # column's target variance.
    residual_var = np.var(y - ypred, axis=0)
    constant = vary < 1e-10
    # Avoid dividing by (near) zero; those entries are overwritten below.
    safe_vary = vary.copy()
    safe_vary[constant] = 1.0
    out = 1 - residual_var / safe_vary
    out[constant] = 0
    return out
def ncc(ypred, y):
    """Return the correlation coefficient between ypred and y, as computed by np.corrcoef."""
    correlation_matrix = np.corrcoef(ypred, y)
    # the off-diagonal entry is the cross-correlation of the two inputs
    return correlation_matrix[1, 0]
def flatten_arrays(arrs):
    """Concatenate the flattened contents of all arrays in `arrs` into one 1-d array."""
    flattened = [arr.ravel() for arr in arrs]
    return np.concatenate(flattened)
def unflatten_vector(vec, shapes):
    """
    Split a flat vector into consecutive arrays of the given shapes.

    Parameters
    ----------
    vec: np.ndarray
        1-d vector holding the concatenated entries
    shapes: iterable of shapes
        shape of each array to carve out, in order; () denotes a scalar

    Returns
    -------
    arrs: [np.ndarray]
        one reshaped slice of `vec` per entry of `shapes`
    """
    i = 0
    arrs = []
    for shape in shapes:
        # np.prod(()) is the float 1.0, which cannot be used as a slice
        # bound; force an integer size so scalar shapes work too.
        size = int(np.prod(shape))
        arr = vec[i:i + size].reshape(shape)
        arrs.append(arr)
        i += size
    return arrs
def discount_with_boundaries(X, New, gamma):
    """
    Discounted sums along the first axis that restart at episode boundaries.

    X: array of floats, indexed by time along the first axis
    New: array (or sequence) indexed by time, nonzero where a new episode
         starts; the discounted sum is not propagated across such a step
    gamma: discount factor
    """
    Y = np.zeros_like(X)
    last = X.shape[0] - 1
    Y[last] = X[last]
    # walk backwards; 1 - New[t + 1] zeroes the carried return when step
    # t + 1 starts a new episode
    for t in reversed(range(last)):
        carry = gamma * Y[t + 1] * (1 - New[t + 1])
        Y[t] = X[t] + carry
    return Y
def test_discount_with_boundaries():
    """Check discount_with_boundaries on four steps whose last step starts a new episode."""
    gamma = 0.9
    rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
    starts = [1.0, 0.0, 0.0, 1.0]
    # the first three steps form one episode, the fourth stands alone
    expected = [
        1 + gamma * 2 + gamma ** 2 * 3,
        2 + gamma * 3,
        3,
        4
    ]
    actual = discount_with_boundaries(rewards, starts, gamma)
    assert np.allclose(actual, expected)
+328
View File
@@ -0,0 +1,328 @@
import gym
import numpy as np
import os
import pickle
import random
import tempfile
import time
import zipfile
def zipsame(*seqs):
    """zip() that first asserts that all given sequences have the same length."""
    expected_len = len(seqs[0])
    lengths_match = all(len(other) == expected_len for other in seqs[1:])
    assert lengths_match
    return zip(*seqs)
def unpack(seq, sizes):
    """
    Lazily split 'seq' into pieces whose lengths are given by 'sizes'.

    An integer size yields a list with that many elements; None yields one
    bare element, not a list. The sizes must consume 'seq' exactly.

    Example:
    unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6])
    """
    seq = list(seq)
    remaining = iter(seq)
    assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes)
    for size in sizes:
        if size is None:
            yield next(remaining)
            continue
        yield [next(remaining) for _ in range(size)]
class EzPickle(object):
    """Mixin for objects that are pickled and unpickled via their constructor
    arguments.

    Example usage:

        class Dog(Animal, EzPickle):
            def __init__(self, furcolor, tailkind="bushy"):
                Animal.__init__(self)
                EzPickle.__init__(self, furcolor, tailkind)
                ...

    When this object is unpickled, a new Dog will be constructed by passing the provided
    furcolor and tailkind into the constructor. However, philosophers are still not sure
    whether it is still the same dog.

    This is generally needed only for environments which wrap C/C++ code, such as MuJoCo
    and Atari.
    """

    def __init__(self, *args, **kwargs):
        # remember how the object was built
        self._ezpickle_args = args
        self._ezpickle_kwargs = kwargs

    def __getstate__(self):
        # only the constructor arguments are pickled, not the instance dict
        state = {}
        state["_ezpickle_args"] = self._ezpickle_args
        state["_ezpickle_kwargs"] = self._ezpickle_kwargs
        return state

    def __setstate__(self, d):
        # rebuild a fresh object through the constructor and adopt its attributes
        args = d["_ezpickle_args"]
        kwargs = d["_ezpickle_kwargs"]
        rebuilt = type(self)(*args, **kwargs)
        self.__dict__.update(rebuilt.__dict__)
def set_global_seeds(i):
try:
import tensorflow as tf
except ImportError:
pass
else:
tf.set_random_seed(i)
np.random.seed(i)
random.seed(i)
def pretty_eta(seconds_left):
    """Return a number of seconds in human readable format.

    At most the two largest non-zero units are reported.

    Examples:
    2 days
    2 hours and 37 minutes
    less than a minute

    Parameters
    ----------
    seconds_left: int
        Number of seconds to be converted to the ETA
    Returns
    -------
    eta: str
        String representing the pretty ETA.
    """
    minutes_left, _ = divmod(seconds_left, 60)
    hours_left, minutes_left = divmod(minutes_left, 60)
    days_left, hours_left = divmod(hours_left, 24)

    def describe(cnt, name):
        # plural "s" only for counts above one
        suffix = 's' if cnt > 1 else ''
        return "{} {}{}".format(str(cnt), name, suffix)

    if days_left > 0:
        parts = [describe(days_left, 'day')]
        if hours_left > 0:
            parts.append(describe(hours_left, 'hour'))
        return ' and '.join(parts)
    if hours_left > 0:
        parts = [describe(hours_left, 'hour')]
        if minutes_left > 0:
            parts.append(describe(minutes_left, 'minute'))
        return ' and '.join(parts)
    if minutes_left > 0:
        return describe(minutes_left, 'minute')
    return 'less than a minute'
class RunningAvg(object):
    def __init__(self, gamma, init_value=None):
        """Keep a running estimate of a quantity. This is a bit like mean
        but more sensitive to recent changes.

        Parameters
        ----------
        gamma: float
            Must be between 0 and 1, where 0 is the most sensitive to recent
            changes.
        init_value: float or None
            Initial value of the estimate. If None, it will be set on the first update.
        """
        self._gamma = gamma
        self._value = init_value

    def update(self, new_val):
        """Blend a new observation into the estimate.

        Parameters
        ----------
        new_val: float
            new observed value of estimated quantity.
        """
        if self._value is not None:
            # exponential moving average with weight gamma on the old value
            self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val
            return
        self._value = new_val

    def __float__(self):
        """Get the current estimate"""
        return self._value
class SimpleMonitor(gym.Wrapper):
    def __init__(self, env):
        """Adds two quantities to info returned by every step:

            steps: int
                Total number of steps taken so far
            rewards: [float]
                All the cumulative rewards for the episodes completed so far.
        """
        super().__init__(env)
        # current episode state
        self._current_reward = None
        self._num_steps = None
        # temporary monitor state that we do not save
        self._time_offset = None
        self._total_steps = None
        # monitor state
        self._episode_rewards = []
        self._episode_lengths = []
        self._episode_end_times = []

    def _reset(self):
        """Reset the wrapped env and record the episode that just ended, if any."""
        obs = self.env.reset()
        # recompute temporary state if needed
        if self._time_offset is None:
            self._time_offset = time.time()
            # continue the clock from the last recorded episode end
            if len(self._episode_end_times) > 0:
                self._time_offset -= self._episode_end_times[-1]
        if self._total_steps is None:
            self._total_steps = sum(self._episode_lengths)
        # update monitor state
        # (None means no episode has been started yet, so nothing to record)
        if self._current_reward is not None:
            self._episode_rewards.append(self._current_reward)
            self._episode_lengths.append(self._num_steps)
            self._episode_end_times.append(time.time() - self._time_offset)
        # reset episode state
        self._current_reward = 0
        self._num_steps = 0

        return obs

    def _step(self, action):
        """Step the wrapped env, update the counters and add 'steps'/'rewards' to info."""
        obs, rew, done, info = self.env.step(action)
        self._current_reward += rew
        self._num_steps += 1
        self._total_steps += 1
        info['steps'] = self._total_steps
        # the same list object that the monitor keeps appending to
        info['rewards'] = self._episode_rewards
        return (obs, rew, done, info)

    def get_state(self):
        """Return the monitor state (per-episode rewards, lengths, end times) as a dict."""
        return {
            'env_id': self.env.unwrapped.spec.id,
            'episode_data': {
                'episode_rewards': self._episode_rewards,
                'episode_lengths': self._episode_lengths,
                'episode_end_times': self._episode_end_times,
                'initial_reset_time': 0,
            }
        }

    def set_state(self, state):
        """Restore the state produced by get_state(); the env id must match."""
        assert state['env_id'] == self.env.unwrapped.spec.id
        ed = state['episode_data']
        self._episode_rewards = ed['episode_rewards']
        self._episode_lengths = ed['episode_lengths']
        self._episode_end_times = ed['episode_end_times']
def boolean_flag(parser, name, default=False, help=None):
    """Register a pair of on/off switches for one boolean option.

    Parameters
    ----------
    parser: argparse.Parser
        parser to add the flag to
    name: str
        --<name> will enable the flag, while --no-<name> will disable it
    default: bool or None
        default value of the flag
    help: str
        help string for the flag
    """
    # both switches write to the same attribute; dashes become underscores
    dest = name.replace('-', '_')
    enable_switch = "--" + name
    disable_switch = "--no-" + name
    parser.add_argument(enable_switch, action="store_true", default=default, dest=dest, help=help)
    parser.add_argument(disable_switch, action="store_false", dest=dest)
def get_wrapper_by_name(env, classname):
    """Given a gym environment possibly wrapped multiple times, returns the first
    (outermost) wrapper whose class_name() equals classname, or raises ValueError
    if no such wrapper was applied

    Parameters
    ----------
    env: gym.Env or gym.Wrapper
        gym environment
    classname: str
        name of the wrapper

    Returns
    -------
    wrapper: gym.Wrapper
        wrapper named classname
    """
    candidate = env
    while classname != candidate.class_name():
        if not isinstance(candidate, gym.Wrapper):
            # reached the innermost environment without a match
            raise ValueError("Couldn't find wrapper named %s" % classname)
        candidate = candidate.env
    return candidate
def relatively_safe_pickle_dump(obj, path, compression=False):
    """This is just like regular pickle dump, except for the fact that failure cases are
    different:

        - It's never possible that we end up with a pickle in corrupted state.
        - If there was a different file at the path, that file will remain unchanged in the
          event of failure (provided that filesystem rename is atomic).
        - It is sometimes possible that we end up with a useless temp file which needs to be
          deleted manually (it will be removed automatically on the next function call)

    The intended use case is periodic checkpoints of experiment state, such that we never
    corrupt previous checkpoints if the current one fails.

    Parameters
    ----------
    obj: object
        object to pickle
    path: str
        path to the output file
    compression: bool
        if true pickle will be compressed
    """
    temp_storage = path + ".relatively_safe"
    if compression:
        # Using gzip here would be simpler, but the size is limited to 2GB
        with tempfile.NamedTemporaryFile() as uncompressed_file:
            pickle.dump(obj, uncompressed_file)
            # The zip archive reads the temp file by name, so the buffered
            # pickle bytes must be on disk first; otherwise the archive can
            # end up with a truncated or empty "data" member.
            uncompressed_file.file.flush()
            with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
                myzip.write(uncompressed_file.name, "data")
    else:
        with open(temp_storage, "wb") as f:
            pickle.dump(obj, f)
    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename fails on Windows when `path` already exists.
    os.replace(temp_storage, path)
def pickle_load(path, compression=False):
    """Unpickle a possibly compressed pickle.

    NOTE: unpickling can execute arbitrary code; only load files from a
    trusted source.

    Parameters
    ----------
    path: str
        path to the pickle file
    compression: bool
        if true assumes that pickle was compressed when created and attempts decompression.

    Returns
    -------
    obj: object
        the unpickled object
    """
    if not compression:
        with open(path, "rb") as f:
            return pickle.load(f)

    # compressed pickles are zip archives with a single member named "data"
    with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as archive:
        with archive.open("data") as member:
            return pickle.load(member)
+85
View File
@@ -0,0 +1,85 @@
from mpi4py import MPI
import baselines.baselines_common.tf_util as U
import tensorflow as tf
import numpy as np
class MpiAdam(object):
    """
    Adam optimizer whose gradients are summed over all processes of an MPI
    communicator before each step.

    The moment estimates are kept in numpy on every process; parameters are
    read and written through U.GetFlat / U.SetFromFlat on `var_list`.
    """

    def __init__(self, var_list, *,
                 beta1=0.9, beta2=0.999, epsilon=1e-08,
                 scale_grad_by_procs=True,
                 comm=None):
        self.var_list = var_list
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.scale_grad_by_procs = scale_grad_by_procs
        size = sum(U.numel(v) for v in var_list)
        # first and second moment estimates, one entry per parameter
        self.m = np.zeros(size, 'float32')
        self.v = np.zeros(size, 'float32')
        self.t = 0  # number of update() calls so far
        self.setfromflat = U.SetFromFlat(var_list)
        self.getflat = U.GetFlat(var_list)
        self.comm = MPI.COMM_WORLD if comm is None else comm

    def update(self, localg, stepsize):
        """Apply one Adam step using the local gradient `localg` combined across processes."""
        # every 100 updates verify that all processes hold the same parameters
        if self.t % 100 == 0:
            self.check_synced()
        localg = localg.astype('float32')
        globalg = np.zeros_like(localg)
        self.comm.Allreduce(localg, globalg, op=MPI.SUM)
        if self.scale_grad_by_procs:
            # turn the sum over processes into a mean
            globalg /= self.comm.Get_size()

        self.t += 1
        # step size with the bias corrections of both moments folded in
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
        self.setfromflat(self.getflat() + step)

    def sync(self):
        """Broadcast the parameters of rank 0 and load them on this process."""
        theta = self.getflat()
        self.comm.Bcast(theta, root=0)
        self.setfromflat(theta)

    def check_synced(self):
        """Assert on non-root ranks that the local parameters equal those of rank 0."""
        if self.comm.Get_rank() == 0:  # this is root
            theta = self.getflat()
            self.comm.Bcast(theta, root=0)
        else:
            thetalocal = self.getflat()
            thetaroot = np.empty_like(thetalocal)
            self.comm.Bcast(thetaroot, root=0)
            assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)
@U.in_session
def test_MpiAdam():
    """
    Print the losses of ten steps of tf.train.AdamOptimizer and of ten steps
    of MpiAdam on the same problem from the same initialization.

    Nothing is asserted; the printed values are meant to be compared by eye.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    # re-initialize the variables before the second run
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # NOTE(review): update_op is passed as an update here too, so the
    # TensorFlow optimizer appears to step alongside MpiAdam - confirm that
    # this is intended.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
+24
View File
@@ -0,0 +1,24 @@
import os, subprocess, sys
def mpi_fork(n, bind_to_core=False):
    """Re-launch the current script under mpirun with `n` workers.

    Returns "parent" for the original parent (after mpirun has finished) and
    "child" for MPI children. When n <= 1, or when the IN_MPI environment
    variable is already set, nothing is launched and "child" is returned.
    """
    if n <= 1 or os.getenv("IN_MPI") is not None:
        return "child"

    # children run single-threaded math libraries and are marked with IN_MPI
    # so that they do not fork again
    child_env = dict(
        os.environ,
        MKL_NUM_THREADS="1",
        OMP_NUM_THREADS="1",
        IN_MPI="1"
    )
    command = ["mpirun", "-np", str(n)]
    if bind_to_core:
        command.extend(["-bind-to", "core"])
    command.append(sys.executable)
    command.extend(sys.argv)
    subprocess.check_call(command, env=child_env)
    return "parent"
+52
View File
@@ -0,0 +1,52 @@
from mpi4py import MPI
import numpy as np
from baselines.baselines_common import zipsame
def mpi_moments(x, axis=0):
    """
    Mean and standard deviation of `x` along `axis`, pooled over all
    processes of MPI.COMM_WORLD.

    Returns (mean, std, count), where count is the total number of entries
    along `axis` over all processes. When count is non-zero, mean and std are
    flat vectors with one entry per remaining element; when count is zero
    they are NaN-filled arrays of the remaining shape.
    """
    x = np.asarray(x, dtype='float64')
    newshape = list(x.shape)
    newshape.pop(axis)
    n = np.prod(newshape, dtype=int)

    # one message carrying the sums, the sums of squares and the local count
    local_stats = np.concatenate([x.sum(axis=axis).ravel(),
                                  np.square(x).sum(axis=axis).ravel(),
                                  np.array([x.shape[axis]], dtype='float64')])
    global_stats = np.zeros(n * 2 + 1, 'float64')
    MPI.COMM_WORLD.Allreduce(local_stats, global_stats, op=MPI.SUM)

    total = global_stats[:n]
    total_sq = global_stats[n:2 * n]
    count = global_stats[2 * n]
    if count == 0:
        mean = np.empty(newshape)
        mean.fill(np.nan)
        std = np.empty(newshape)
        std.fill(np.nan)
    else:
        mean = total / count
        # clamp at zero so rounding cannot produce a negative variance
        variance = np.maximum(total_sq / count - np.square(mean), 0)
        std = np.sqrt(variance)
    return mean, std, count
def test_runningmeanstd():
    """
    Compare mpi_moments with numpy on data split over the MPI processes.

    Each process passes the part of every triple selected by its rank, so the
    run needs as many processes as there are parts.
    """
    comm = MPI.COMM_WORLD
    np.random.seed(0)
    cases = [
        ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0),
        ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0),
        ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1),
    ]
    for parts, axis in cases:
        everything = np.concatenate(parts, axis=axis)
        expected = [everything.mean(axis=axis), everything.std(axis=axis), everything.shape[axis]]
        actual = mpi_moments(parts[comm.Get_rank()], axis=axis)

        for (a1, a2) in zipsame(expected, actual):
            print(a1, a2)
            assert np.allclose(a1, a2)
            print("ok!")
if __name__ == "__main__":
    # Launch under MPI with three processes, one per part of each test triple:
    # mpirun -np 3 python <script>
    test_runningmeanstd()
@@ -0,0 +1,112 @@
from mpi4py import MPI
import tensorflow as tf
import baselines.baselines_common.tf_util as U
import numpy as np
class RunningMeanStd(object):
    """
    Running mean and standard deviation kept in TensorFlow variables and
    updated with statistics summed over all processes of MPI.COMM_WORLD.

    `epsilon` is the initial value of the sum-of-squares and count variables.
    """

    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-2, shape=()):
        # running sum of the observations
        self._sum = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(0.0),
            name="runningsum", trainable=False)
        # running sum of the squared observations
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq", trainable=False)
        # running number of observations
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count", trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        # the variance estimate is floored at 1e-2 before the square root
        self.std = tf.sqrt(
            tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        # adds the given increments to the three variables
        self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
                                        updates=[tf.assign_add(self._sum, newsum),
                                                 tf.assign_add(self._sumsq, newsumsq),
                                                 tf.assign_add(self._count, newcount)])

    def update(self, x):
        """Add the observations in `x` (first axis indexes observations) from all processes."""
        x = x.astype('float64')
        n = int(np.prod(self.shape))
        totalvec = np.zeros(n * 2 + 1, 'float64')
        # layout: n sums, n sums of squares, then the number of rows
        addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(),
                                 np.array([len(x)], dtype='float64')])
        MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
        self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2 * n].reshape(self.shape),
                           totalvec[2 * n])
@U.in_session
def test_runningmeanstd():
    """Feed three chunks to RunningMeanStd and compare mean/std with numpy on their concatenation."""
    for (x1, x2, x3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]:
        # epsilon=0.0 so the initial values do not bias the statistics
        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
        U.initialize()

        x = np.concatenate([x1, x2, x3], axis=0)
        ms1 = [x.mean(axis=0), x.std(axis=0)]
        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        ms2 = U.eval([rms.mean, rms.std])

        assert np.allclose(ms1, ms2)
@U.in_session
def test_dist():
    """Two-process MPI check of RunningMeanStd.

    Rank 0 feeds the `p` batches and rank 1 the `q` batches; the resulting
    mean/std must match numpy statistics over all six batches together.
    """
    np.random.seed(0)
    p1, p2, p3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1))
    q1, q2, q3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1))

    comm = MPI.COMM_WORLD
    assert comm.Get_size() == 2
    rank = comm.Get_rank()
    if rank == 0:
        local_batches = (p1, p2, p3)
    elif rank == 1:
        local_batches = (q1, q2, q3)
    else:
        assert False

    rms = RunningMeanStd(epsilon=0.0, shape=(1,))
    U.initialize()
    for batch in local_batches:
        rms.update(batch)

    bigvec = np.concatenate([p1, p2, p3, q1, q2, q3])

    def checkallclose(x, y):
        # Print both sides so a failing run shows the mismatching values.
        print(x, y)
        return np.allclose(x, y)

    expected_mean = bigvec.mean(axis=0)
    assert checkallclose(expected_mean, U.eval(rms.mean))
    expected_std = bigvec.std(axis=0)
    assert checkallclose(expected_std, U.eval(rms.std))
if __name__ == "__main__":
    # Manual entry point; intended launch command: mpirun -np 2 python <filename>
    test_dist()
+35
View File
@@ -0,0 +1,35 @@
from mpi4py import MPI
import baselines.baselines_common.tf_util as U
import tensorflow as tf
class MpiSaver(object):
# Checkpoint helper around tf.train.Saver in which only MPI rank 0 writes files.
# NOTE(review): indentation was lost in this view, so the exact nesting of
# `self.sync()` in restore() and `self.t += 1` in sync() cannot be confirmed here.
def __init__(self, var_list=None, *,
comm=None,
log_prefix="/tmp"):
# var_list is forwarded to tf.train.Saver; comm and log_prefix are keyword-only.
self.var_list = var_list
# Step counter passed as `global_step` when saving.
self.t = 0
self.saver = tf.train.Saver(
var_list=var_list,
max_to_keep=100,
keep_checkpoint_every_n_hours=0.25,
pad_step_number=True,
save_relative_paths=True)
# Directory prefix under which sync() writes "model.ckpt".
self.log_prefix = log_prefix
# Falls back to MPI.COMM_WORLD when no communicator is given.
self.comm = MPI.COMM_WORLD if comm is None else comm
def restore(self, restore_from=None):
# Restores variables from the checkpoint path `restore_from` into the session
# returned by U.get_session(), when a path is given.
if restore_from is not None:
self.saver.restore(U.get_session(), restore_from)
# The step is parsed from the text after the last "-" in the path, so the path
# is presumably of the form ".../model.ckpt-<step>" — TODO confirm with callers.
self.t += int(restore_from.split("-")[-1])
self.sync()
def sync(self):
# Writes a checkpoint named "<log_prefix>/model.ckpt" tagged with the current step.
if self.comm.Get_rank() == 0: # this is root
self.saver.save(
U.get_session(),
"{}/model.ckpt".format(self.log_prefix),
global_step=self.t)
self.t += 1
+99
View File
@@ -0,0 +1,99 @@
"""This file is used for specifying various schedules that evolve over
time throughout the execution of the algorithm, such as:
- learning rate for the optimizer
- exploration epsilon for the epsilon greedy exploration strategy
- beta parameter in prioritized replay
Each schedule has a function `value(t)` which returns the current value
of the parameter given the timestep t of the optimization procedure.
"""
class Schedule(object):
    """Interface for a time-dependent parameter; subclasses implement `value`."""

    def value(self, t):
        """Return the schedule's value at timestep `t`.

        The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()
class ConstantSchedule(object):
    """Schedule whose value never changes."""

    def __init__(self, value):
        """Store the constant.

        Parameters
        ----------
        value: float
            value returned by `value(t)` for every `t`
        """
        self._v = value

    def value(self, t):
        """Return the stored constant; `t` is ignored."""
        constant = self._v
        return constant
def linear_interpolation(l, r, alpha):
    """Return the point a fraction `alpha` of the way from `l` to `r`."""
    return l + alpha * (r - l)


class PiecewiseSchedule(object):
    """Schedule defined by interpolating between `(time, value)` endpoints."""

    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
        """Piecewise schedule.

        endpoints: [(int, int)]
            pairs `(time, value)` meaning that the schedule outputs `value`
            when `t == time`. Times must be in non-decreasing order. When
            `time_a <= t < time_b` for two consecutive endpoints, the output
            is `interpolation(value_a, value_b, alpha)` where alpha is the
            fraction of the interval `[time_a, time_b)` already covered.
            Any iterable is accepted; it is copied into a list.
        interpolation: lambda float, float, float: float
            takes the values to the left and right of `t` and the fraction
            alpha. See `linear_interpolation` for an example.
        outside_value: float
            returned when `t` lies outside every interval. If None,
            AssertionError is raised in that case.

        Raises
        ------
        AssertionError
            if the endpoint times are not sorted.
        """
        # Copy once: accepts one-shot iterables and protects the sortedness
        # check from later mutation of the caller's list.
        endpoints = list(endpoints)
        idxes = [e[0] for e in endpoints]
        # Raised explicitly so the check is not stripped under `python -O`.
        if idxes != sorted(idxes):
            raise AssertionError("endpoint times must be sorted in increasing order")
        self._interpolation = interpolation
        self._outside_value = outside_value
        self._endpoints = endpoints

    def value(self, t):
        """See Schedule.value"""
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return self._interpolation(l, r, alpha)

        # t does not belong to any of the pieces: fall back to outside_value,
        # or fail as documented when none was configured.
        if self._outside_value is None:
            raise AssertionError("t is outside all intervals and no outside_value was given")
        return self._outside_value
class LinearSchedule(object):
    """Linear annealing from `initial_p` to `final_p`."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Linear interpolation between initial_p and final_p over
        schedule_timesteps. After this many timesteps pass final_p is
        returned.

        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps for which to linearly anneal initial_p
            to final_p
        initial_p: float
            initial output value
        final_p: float
            final output value
        """
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        """See Schedule.value"""
        # A zero-length schedule is already finished; this also avoids the
        # division by zero below.
        if self.schedule_timesteps == 0:
            return self.final_p
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
+146
View File
@@ -0,0 +1,146 @@
import operator
class SegmentTree(object):
    """Array-backed segment tree supporting point updates and range reduce."""

    def __init__(self, capacity, operation, neutral_element):
        """Build a Segment Tree data structure.

        https://en.wikipedia.org/wiki/Segment_tree

        Can be used as a regular array, but with two
        important differences:

            a) setting an item's value is slightly slower.
               It is O(lg capacity) instead of O(1).
            b) the user has access to an efficient `reduce`
               operation which reduces `operation` over
               a contiguous subsequence of items in the
               array.

        Parameters
        ----------
        capacity: int
            Total size of the array - must be a power of two.
        operation: lambda obj, obj -> obj
            an operation for combining elements (e.g. sum, max)
        neutral_element: obj
            neutral element for the operation above, e.g. float('-inf')
            for max and 0 for sum. Also returned by `reduce` for an
            empty range.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        self._neutral_element = neutral_element
        # Node 1 is the root; leaves live at indices [capacity, 2 * capacity).
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation

    def _reduce_helper(self, start, end, node, node_start, node_end):
        """Reduce the inclusive range [start, end] inside the subtree `node`,
        which covers the inclusive range [node_start, node_end]."""
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            return self._reduce_helper(start, end, 2 * node, node_start, mid)
        if mid + 1 <= start:
            return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
        # The range straddles both children: combine the two halves.
        return self._operation(
            self._reduce_helper(start, mid, 2 * node, node_start, mid),
            self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
        )

    def reduce(self, start=0, end=None):
        """Returns the result of applying `self.operation`
        to a contiguous subsequence of the array.

            self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1])))

        Parameters
        ----------
        start: int
            beginning of the subsequence (inclusive)
        end: int
            end of the subsequence (exclusive). None means the full
            capacity; a negative value counts back from the capacity.

        Returns
        -------
        reduced: obj
            result of reducing self.operation over the specified range of
            array elements, or the neutral element when the range is empty.
        """
        if end is None:
            end = self._capacity
        if end < 0:
            end += self._capacity
        # An empty range used to recurse without bound; the neutral element
        # is the natural reduction of nothing.
        if start >= end:
            return self._neutral_element
        end -= 1
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # Same bounds contract as __getitem__; without it a negative index
        # would silently overwrite an internal node.
        assert 0 <= idx < self._capacity
        # index of the leaf
        idx += self._capacity
        self._value[idx] = val
        # Recompute every ancestor up to the root.
        idx //= 2
        while idx >= 1:
            self._value[idx] = self._operation(
                self._value[2 * idx],
                self._value[2 * idx + 1]
            )
            idx //= 2

    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]
class SumSegmentTree(SegmentTree):
    """Segment tree whose reduce operation is addition (neutral element 0.0)."""

    def __init__(self, capacity):
        super(SumSegmentTree, self).__init__(
            capacity=capacity,
            operation=operator.add,
            neutral_element=0.0
        )

    def sum(self, start=0, end=None):
        """Return the sum over the range, as computed by `reduce(start, end)`."""
        return super(SumSegmentTree, self).reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum

        If the array values are probabilities, this allows sampling
        indexes according to the discrete distribution efficiently.

        Parameters
        ----------
        prefixsum: float
            upper bound on the sum of the array prefix

        Returns
        -------
        idx: int
            highest index satisfying the prefixsum constraint
        """
        assert 0 <= prefixsum <= self.sum() + 1e-5
        node = 1
        while node < self._capacity:  # while non-leaf
            left = 2 * node
            if self._value[left] > prefixsum:
                # The target falls inside the left subtree.
                node = left
            else:
                # Skip the left subtree and continue in the right one.
                prefixsum -= self._value[left]
                node = left + 1
        return node - self._capacity
class MinSegmentTree(SegmentTree):
    """Segment tree whose reduce operation is `min` (neutral element +inf)."""

    def __init__(self, capacity):
        super(MinSegmentTree, self).__init__(
            capacity=capacity,
            operation=min,
            neutral_element=float('inf')
        )

    def min(self, start=0, end=None):
        """Return the minimum over the range, as computed by `reduce(start, end)`."""
        smallest = super(MinSegmentTree, self).reduce(start, end)
        return smallest
+753
View File
@@ -0,0 +1,753 @@
import numpy as np
import tensorflow as tf # pylint: ignore-module
import builtins
import functools
import copy
import os
import collections
# ================================================================
# Make consistent with numpy
# ================================================================
clip = tf.clip_by_value
def sum(x, axis=None, keepdims=False):
    """numpy-style wrapper over tf.reduce_sum; a given axis is passed as a one-element list."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
def mean(x, axis=None, keepdims=False):
    """numpy-style wrapper over tf.reduce_mean; a given axis is passed as a one-element list."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
def var(x, axis=None, keepdims=False):
    """Mean of squared deviations of `x` from its mean, both taken with the same axis/keepdims."""
    center = mean(x, axis=axis, keepdims=keepdims)
    squared_dev = tf.square(x - center)
    return mean(squared_dev, axis=axis, keepdims=keepdims)
def std(x, axis=None, keepdims=False):
    """Square root of `var(x, axis, keepdims)`."""
    variance = var(x, axis=axis, keepdims=keepdims)
    return tf.sqrt(variance)
def max(x, axis=None, keepdims=False):
    """numpy-style wrapper over tf.reduce_max; a given axis is passed as a one-element list."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
def min(x, axis=None, keepdims=False):
    """numpy-style wrapper over tf.reduce_min; a given axis is passed as a one-element list."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
def concatenate(arrs, axis=0):
    """Concatenate the tensors in `arrs` along `axis` using tf.concat."""
    joined = tf.concat(values=arrs, axis=axis)
    return joined
def argmax(x, axis=None):
    """Thin wrapper forwarding to tf.argmax(x, axis=axis)."""
    indices = tf.argmax(x, axis=axis)
    return indices
def switch(condition, then_expression, else_expression):
    """Select one of two operations based on a scalar value (int or bool).

    Both `then_expression` and `else_expression` should be symbolic
    tensors of the *same shape*; the static shape of `then_expression`
    is re-applied to the result of `tf.cond`.

    # Arguments
        condition: scalar tensor.
        then_expression: TensorFlow operation.
        else_expression: TensorFlow operation.
    """
    static_shape = copy.copy(then_expression.get_shape())
    predicate = tf.cast(condition, 'bool')
    selected = tf.cond(predicate,
                       lambda: then_expression,
                       lambda: else_expression)
    selected.set_shape(static_shape)
    return selected
# ================================================================
# Extras
# ================================================================
def l2loss(params):
    """Sum of squared entries over all tensors in `params`; constant 0.0 when `params` is empty."""
    if len(params) != 0:
        # `sum` here is this module's tf.reduce_sum wrapper, not the builtin.
        return tf.add_n([sum(tf.square(p)) for p in params])
    return tf.constant(0.0)
def lrelu(x, leak=0.2):
    """Leaky ReLU written as a blend of `x` and `abs(x)`.

    The result equals `x` where x >= 0 and `leak * x` where x < 0.
    """
    linear_coef = 0.5 * (1 + leak)
    abs_coef = 0.5 * (1 - leak)
    return linear_coef * x + abs_coef * abs(x)
def categorical_sample_logits(X):
    """Sample along axis 1 of logits `X` by taking the argmax of `X - log(-log(U))`, U uniform."""
    # https://github.com/tensorflow/tensorflow/issues/456
    uniform = tf.random_uniform(tf.shape(X))
    perturbed = X - tf.log(-tf.log(uniform))
    return argmax(perturbed, axis=1)
# ================================================================
# Inputs
# ================================================================
def is_placeholder(x):
    """True when `x` is exactly a tf.Tensor whose producing op has no inputs."""
    if type(x) is not tf.Tensor:
        return False
    return len(x.op.inputs) == 0
class TfInput(object):
    """Base class for generalized Tensorflow placeholders."""

    def __init__(self, name="(unnamed)"):
        """Generalized Tensorflow placeholder. The main differences are:
            - possibly uses multiple placeholders internally and returns multiple values
            - can apply light postprocessing to the value fed to the placeholder.
        """
        self.name = name

    def get(self):
        """Return the tf variable(s) representing the possibly postprocessed value
        of placeholder(s).

        Raises NotImplementedError; subclasses must override.
        """
        # `NotImplemented` is a comparison sentinel and is not callable;
        # the exception class is NotImplementedError.
        raise NotImplementedError()

    def make_feed_dict(self, data):
        """Given data, input it to the placeholder(s).

        Raises NotImplementedError; subclasses must override.
        """
        raise NotImplementedError()
class PlacholderTfInput(TfInput):
    """TfInput backed by a single regular tensorflow placeholder."""

    def __init__(self, placeholder):
        """Wrap `placeholder`, reusing its name as the input name."""
        super().__init__(placeholder.name)
        self._placeholder = placeholder

    def get(self):
        """Return the wrapped placeholder unchanged."""
        return self._placeholder

    def make_feed_dict(self, data):
        """Return a feed dict mapping the wrapped placeholder to `data`."""
        feed = {self._placeholder: data}
        return feed
class BatchInput(PlacholderTfInput):
    """Placeholder input with a leading batch dimension of unknown size."""

    def __init__(self, shape, dtype=tf.float32, name=None):
        """Create a placeholder for a batch of tensors of a given shape and dtype.

        Parameters
        ----------
        shape: [int]
            shape of a single element of the batch
        dtype: tf.dtype
            number representation used for tensor contents
        name: str
            name of the underlying placeholder
        """
        batched_shape = [None] + list(shape)
        super().__init__(tf.placeholder(dtype, batched_shape, name=name))
class Uint8Input(PlacholderTfInput):
    """uint8 placeholder input whose output is rescaled to float32."""

    def __init__(self, shape, name=None):
        """Take input in uint8 format, which is cast to float32 and divided
        by 255 before being passed to the model.

        On GPU this ensures lower data transfer times.

        Parameters
        ----------
        shape: [int]
            shape of the tensor.
        name: str
            name of the underlying placeholder
        """
        raw = tf.placeholder(tf.uint8, [None] + list(shape), name=name)
        super().__init__(raw)
        self._shape = shape
        self._output = tf.cast(super().get(), tf.float32) / 255.0

    def get(self):
        """Return the float32, 1/255-scaled view of the placeholder."""
        return self._output
def ensure_tf_input(thing):
    """Take either a tf.placeholder or a TfInput and return an equivalent TfInput.

    Raises ValueError for anything else.
    """
    if isinstance(thing, TfInput):
        return thing
    if is_placeholder(thing):
        return PlacholderTfInput(thing)
    raise ValueError("Must be a placeholder or TfInput")
# ================================================================
# Mathematical utils
# ================================================================
def huber_loss(x, delta=1.0):
    """Element-wise Huber loss: quadratic where |x| < delta, linear elsewhere.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    """
    in_quadratic_zone = tf.abs(x) < delta
    quadratic = tf.square(x) * 0.5
    linear = delta * (tf.abs(x) - 0.5 * delta)
    return tf.where(in_quadratic_zone, quadratic, linear)
# ================================================================
# Optimizer utils
# ================================================================
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` with `optimizer` w.r.t. the variables in
    `var_list`, clipping each variable's gradient to norm `clip_val`.

    Pairs whose gradient is None are passed through unchanged. Returns
    the op produced by `optimizer.apply_gradients`.
    """
    grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list)
    clipped = [
        (tf.clip_by_norm(grad, clip_val), var) if grad is not None else (grad, var)
        for grad, var in grads_and_vars
    ]
    return optimizer.apply_gradients(clipped)
# ================================================================
# Global session
# ================================================================
def get_session():
    """Return Tensorflow's current default session (tf.get_default_session())."""
    session = tf.get_default_session()
    return session
def make_session(num_cpu):
    """Return a session whose inter-op and intra-op thread pools are both sized `num_cpu`."""
    config = tf.ConfigProto(
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu)
    session = tf.Session(config=config)
    return session
def single_threaded_session():
    """Return a session configured via `make_session` with a single thread per pool."""
    return make_session(num_cpu=1)
# Variables that `initialize` has already run initializers for.
ALREADY_INITIALIZED = set()


def initialize():
    """Initialize every global variable not yet initialized through this function."""
    pending = set(tf.global_variables()).difference(ALREADY_INITIALIZED)
    init_op = tf.variables_initializer(pending)
    get_session().run(init_op)
    ALREADY_INITIALIZED.update(pending)
def eval(expr, feed_dict=None):
    """Run `expr` in the default session, with an empty feed dict when none is given."""
    feeds = {} if feed_dict is None else feed_dict
    return get_session().run(expr, feed_dict=feeds)
# Cache: variable -> (assign op, placeholder feeding that op).
VALUE_SETTERS = collections.OrderedDict()


def set_value(v, val):
    """Assign `val` to variable `v`, reusing one cached assign op per variable."""
    global VALUE_SETTERS
    if v not in VALUE_SETTERS:
        endpoint = tf.placeholder(v.dtype)
        VALUE_SETTERS[v] = (v.assign(endpoint), endpoint)
    set_op, set_endpoint = VALUE_SETTERS[v]
    get_session().run(set_op, feed_dict={set_endpoint: val})
# ================================================================
# Saving variables
# ================================================================
def load_state(fname):
    """Restore variables from checkpoint `fname` into the default session using a fresh tf.train.Saver."""
    session = get_session()
    tf.train.Saver().restore(session, fname)
def save_state(fname):
    """Save the default session's variables to checkpoint `fname`.

    The parent directory of `fname` is created when it does not exist.
    """
    dirname = os.path.dirname(fname)
    # A bare filename has an empty dirname, and os.makedirs('') raises.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    saver = tf.train.Saver()
    saver.save(get_session(), fname)
# ================================================================
# Model components
# ================================================================
def normc_initializer(std=1.0):
    """Return an initializer drawing Gaussian weights rescaled so that the
    L2 norm taken along axis 0 equals `std`."""
    def _initializer(shape, dtype=None, partition_info=None):  # pylint: disable=W0613
        weights = np.random.randn(*shape).astype(np.float32)
        axis0_norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
        weights *= std / axis0_norms
        return tf.constant(weights)

    return _initializer
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None,
           summary_tag=None):
    """2-D convolution layer with uniformly initialized weights and a zero bias.

    Variables "W" and "b" are created under variable scope `name`. When
    `summary_tag` is given, an image summary of the filters is added.
    Returns `tf.nn.conv2d(x, W, ...) + b`.
    """
    with tf.variable_scope(name):
        stride_shape = [1, stride[0], stride[1], 1]
        # Axis 3 of `x` is read as the number of input channels.
        filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters]

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = intprod(filter_shape[:3])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = intprod(filter_shape[:2]) * num_filters
        # initialize weights with random weights
        w_bound = np.sqrt(6. / (fan_in + fan_out))

        w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound),
                            collections=collections)
        b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(),
                            collections=collections)

        if summary_tag is not None:
            # tf.summary.image takes `max_outputs`; `max_images` was the
            # keyword of the older tf.image_summary and is rejected here.
            tf.summary.image(summary_tag,
                             tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]),
                                          [2, 0, 1, 3]),
                             max_outputs=10)

        return tf.nn.conv2d(x, w, stride_shape, pad) + b
def dense(x, size, name, weight_init=None, bias=True):
    """Fully connected layer `x @ w (+ b)`.

    Creates variable `name + "/w"` and, when `bias` is true, a
    zero-initialized `name + "/b"`.
    """
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
    projected = tf.matmul(x, w)
    if not bias:
        return projected
    b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
    return projected + b
def wndense(x, size, name, init_scale=1.0):
    """Fully connected layer using weight normalization (Salimans & Kingma, 2016).

    Creates variables `name + "/V"`, `name + "/g"` and `name + "/b"`.
    """
    in_dim = int(x.get_shape()[1])
    v = tf.get_variable(name + "/V", [in_dim, size],
                        initializer=tf.random_normal_initializer(0, 0.05))
    g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))

    projected = tf.matmul(x, v)
    # `sum` is this module's tf.reduce_sum wrapper, not the builtin.
    scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
    return tf.reshape(scaler, [1, size]) * projected + tf.reshape(b, [1, size])
def densenobias(x, size, name, weight_init=None):
    """Shortcut for `dense(...)` with the bias term disabled."""
    return dense(x, size, name, bias=False, weight_init=weight_init)
def dropout(x, pkeep, phase=None, mask=None):
    """Apply a dropout mask to `x`.

    Without `mask`, one is drawn as floor(pkeep + uniform noise). Without
    `phase`, `mask * x` is returned; otherwise `switch` selects between
    `mask * x` and `pkeep * x` based on `phase`.
    """
    if mask is None:
        mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x)))
    if phase is None:
        return mask * x
    return switch(phase, mask * x, pkeep * x)
# ================================================================
# Theano-like Function
# ================================================================
def function(inputs, outputs, updates=None, givens=None):
    """Theano-style function factory.

    Takes tensorflow placeholders and expressions computed from them and
    produces f(inputs) -> outputs. Calling f feeds the given values to
    the placeholders and returns the values of the expressions in
    `outputs`.

    Input values can be passed in the same order as `inputs` or as kwargs
    keyed by placeholder name (passed to the constructor or accessible
    via placeholder.op.name).

    Example:
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(x=3) == 9
            assert lin(2, 2) == 10
            assert lin(x=2, y=3) == 12

    Parameters
    ----------
    inputs: [tf.placeholder or TfInput]
        list of input arguments
    outputs: [tf.Variable] or tf.Variable
        list of outputs or a single output to be returned from the
        function. The returned value has the same structure: a list for
        a list, a mapping of the same type for a dict, a single value
        otherwise.
    """
    if isinstance(outputs, list):
        return _Function(inputs, outputs, updates, givens=givens)

    if isinstance(outputs, (dict, collections.OrderedDict)):
        keyed = _Function(inputs, outputs.values(), updates, givens=givens)

        def call_keyed(*args, **kwargs):
            # Rebuild a mapping of the caller's type from keys and results.
            return type(outputs)(zip(outputs.keys(), keyed(*args, **kwargs)))

        return call_keyed

    single = _Function(inputs, [outputs], updates, givens=givens)

    def call_single(*args, **kwargs):
        return single(*args, **kwargs)[0]

    return call_single
class _Function(object):
    """Callable that feeds inputs, runs outputs plus updates, and returns the output values."""

    def __init__(self, inputs, outputs, updates, givens, check_nan=False):
        """Validate `inputs` and group `updates` into a single op run on every call."""
        for inpt in inputs:
            if issubclass(type(inpt), TfInput):
                continue
            assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of src.baselines_common.TfInput"
        self.inputs = inputs
        self.update_group = tf.group(*(updates or []))
        # The update op is fetched last so its result can be sliced off.
        self.outputs_update = list(outputs) + [self.update_group]
        self.givens = givens if givens is not None else {}
        self.check_nan = check_nan

    def _feed_input(self, feed_dict, inpt, value):
        """Add `value` for `inpt` to `feed_dict` (TfInput or raw placeholder)."""
        if issubclass(type(inpt), TfInput):
            feed_dict.update(inpt.make_feed_dict(value))
        elif is_placeholder(inpt):
            feed_dict[inpt] = value

    def __call__(self, *args, **kwargs):
        """Run the function with positional and/or keyword input values.

        Inputs not supplied must be present in `givens`. Raises
        RuntimeError when `check_nan` is set and a result contains NaN.
        """
        assert len(args) <= len(self.inputs), "Too many arguments provided"
        feed_dict = {}
        # Positional values map onto the leading inputs.
        for inpt, value in zip(self.inputs, args):
            self._feed_input(feed_dict, inpt, value)
        # Remaining inputs are looked up by name among the kwargs.
        seen_names = set()
        for inpt in self.inputs[len(args):]:
            inpt_name = inpt.name.split(':')[0].split('/')[-1]
            assert inpt_name not in seen_names, \
                "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
            if inpt_name not in kwargs:
                assert inpt in self.givens, "Missing argument " + inpt_name
                continue
            seen_names.add(inpt_name)
            self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
        assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
        # Givens only fill in what was not fed explicitly.
        for inpt in self.givens:
            feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
        fetched = get_session().run(self.outputs_update, feed_dict=feed_dict)
        results = fetched[:-1]
        if self.check_nan and any(np.isnan(r).any() for r in results):
            raise RuntimeError("Nan detected")
        return results
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
    """Build a _MemFriendlyFunction; a non-list `outputs` yields a callable returning a single value."""
    if isinstance(outputs, list):
        return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)

    batched = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)

    def call_single(*inputs):
        return batched(*inputs)[0]

    return call_single
class _MemFriendlyFunction(object):
    """Evaluates outputs over data inputs in slices of `batch_size` rows.

    Per-slice results are added together and finally divided by the total
    number of rows.
    """

    def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
        self.nondata_inputs = nondata_inputs
        self.data_inputs = data_inputs
        self.outputs = list(outputs)
        self.batch_size = batch_size

    def __call__(self, *inputvals):
        """Arguments are the non-data values followed by the data values, in declaration order."""
        n_nondata = len(self.nondata_inputs)
        assert len(inputvals) == n_nondata + len(self.data_inputs)
        nondata_vals = inputvals[0:n_nondata]
        data_vals = inputvals[n_nondata:]
        feed_dict = dict(zip(self.nondata_inputs, nondata_vals))
        n = data_vals[0].shape[0]
        for v in data_vals[1:]:
            assert v.shape[0] == n
        for i_start in range(0, n, self.batch_size):
            # `builtins.min` because this module shadows `min` with a TF wrapper.
            i_stop = builtins.min(i_start + self.batch_size, n)
            for var, v in zip(self.data_inputs, data_vals):
                feed_dict[var] = v[i_start:i_stop]
            results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
            if i_start == 0:
                sum_results = results
            else:
                for i, partial in enumerate(results):
                    sum_results[i] = sum_results[i] + partial
        for i, total in enumerate(sum_results):
            sum_results[i] = total / n
        return sum_results
# ================================================================
# Modules
# ================================================================
class Module(object):
    """Callable that builds graph pieces under one variable scope.

    The first call creates variables; later calls with new arguments
    reuse them; calls with previously seen arguments return the cached
    output.
    """

    def __init__(self, name):
        self.name = name
        self.first_time = True
        self.scope = None
        self.cache = {}

    def __call__(self, *args):
        if args in self.cache:
            print("(%s) retrieving value from cache" % (self.name,))
            return self.cache[args]
        with tf.variable_scope(self.name, reuse=not self.first_time):
            current_scope = tf.get_variable_scope().name
            if not self.first_time:
                assert self.scope == current_scope, "Tried calling function with a different scope"
                print("(%s) running function on new inputs" % (self.name,))
            else:
                self.scope = current_scope
                print("(%s) running function for the first time" % (self.name,))
            self.first_time = False
            out = self._call(*args)
        self.cache[args] = out
        return out

    def _call(self, *args):
        """Build and return the module's output; subclasses must override."""
        raise NotImplementedError

    @property
    def trainable_variables(self):
        """Trainable variables collected under this module's scope."""
        assert self.scope is not None, "need to call module once before getting variables"
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    @property
    def variables(self):
        """Global variables collected under this module's scope."""
        assert self.scope is not None, "need to call module once before getting variables"
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
def module(name):
    """Decorator factory turning a function into a `Module` named `name`.

    Usage: `@module("scope_name")` above a function `f(*args)`. The
    decorated name is bound to a Module instance whose `_call` forwards
    to `f`.
    """
    # No `functools.wraps` here: applied bare to `wrapper` it replaced the
    # decorator with `update_wrapper`, so `@module(name)` returned the
    # original function and never built a Module.
    def wrapper(f):
        class WrapperModule(Module):
            def _call(self, *args):
                return f(*args)

        return WrapperModule(name)

    return wrapper
# ================================================================
# Graph traversal
# ================================================================
VARIABLES = {}
def get_parents(node):
    """Return the inputs of the op that produces `node`."""
    producer = node.op
    return producer.inputs
def topsorted(outputs):
"""
Topological sort via non-recursive depth-first search
"""
# `outputs` must be a list or tuple of graph nodes; parents are obtained
# through get_parents(). Returns the visited nodes with every node appended
# only after all of its parents. Raises ValueError("not a dag") when a node
# is reached again while still being expanded.
# NOTE(review): indentation was lost in this view; whether the `while` loop
# sits inside or after the `for` loop cannot be confirmed from here.
assert isinstance(outputs, (list, tuple))
marks = {}
out = []
stack = [] # pylint: disable=W0621
# i: node
# jidx = number of children visited so far from that node
# marks: state of each node, which is one of
# 0: haven't visited
# 1: have visited, but not done visiting children
# 2: done visiting children
for x in outputs:
stack.append((x, 0))
while stack:
(i, jidx) = stack.pop()
# jidx == 0 means this is the first time the entry is popped for node i.
if jidx == 0:
m = marks.get(i, 0)
if m == 0:
marks[i] = 1
elif m == 1:
raise ValueError("not a dag")
else:
# Node already fully processed earlier; nothing to do.
continue
ps = get_parents(i)
if jidx == len(ps):
# All parents handled: the node can be emitted.
marks[i] = 2
out.append(i)
else:
# Re-push the node to resume with its next parent, then descend.
stack.append((i, jidx + 1))
j = ps[jidx]
stack.append((j, 0))
return out
# ================================================================
# Flat vectors
# ================================================================
def var_shape(x):
    """Return the static shape of `x` as a list of ints; asserts every dimension is known."""
    dims = x.get_shape().as_list()
    fully_known = all(isinstance(d, int) for d in dims)
    assert fully_known, \
        "shape function assumes that shape is fully known"
    return dims
def numel(x):
    """Number of elements of `x`, computed as the product of its static shape."""
    shape = var_shape(x)
    return intprod(shape)
def intprod(x):
    """Product of the entries of `x` (via np.prod), converted to a Python int."""
    product = np.prod(x)
    return int(product)
def flatgrad(loss, var_list, clip_norm=None):
    """Gradient of `loss` w.r.t. `var_list`, flattened into one 1-D tensor.

    Variables with no gradient contribute zeros. When `clip_norm` is
    given, each existing gradient is clipped to that norm.
    """
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        # tf.gradients yields None for variables `loss` does not depend on;
        # those are left for the zero-filling below instead of being clipped.
        grads = [
            grad if grad is None else tf.clip_by_norm(grad, clip_norm=clip_norm)
            for grad in grads
        ]
    return tf.concat(axis=0, values=[
        tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)])
        for (v, grad) in zip(var_list, grads)
    ])
class SetFromFlat(object):
    """Assigns a list of variables from a single flat vector."""

    def __init__(self, var_list, dtype=tf.float32):
        """Build a flat placeholder and one assign op per variable in `var_list`."""
        shapes = [var_shape(v) for v in var_list]
        # np.sum, because this module shadows the builtin `sum`.
        total_size = np.sum([intprod(shape) for shape in shapes])

        self.theta = theta = tf.placeholder(dtype, [total_size])
        assigns = []
        offset = 0
        for shape, v in zip(shapes, var_list):
            size = intprod(shape)
            chunk = tf.reshape(theta[offset:offset + size], shape)
            assigns.append(tf.assign(v, chunk))
            offset += size
        self.op = tf.group(*assigns)

    def __call__(self, theta):
        """Run all assignments, feeding `theta` as the flat vector."""
        get_session().run(self.op, feed_dict={self.theta: theta})
class GetFlat(object):
    """Reads a list of variables as one concatenated flat vector."""

    def __init__(self, var_list):
        flattened = [tf.reshape(v, [numel(v)]) for v in var_list]
        self.op = tf.concat(axis=0, values=flattened)

    def __call__(self):
        """Evaluate the concatenation in the default session."""
        return get_session().run(self.op)
# ================================================================
# Misc
# ================================================================
def fancy_slice_2d(X, inds0, inds1):
    """
    like numpy X[inds0, inds1]
    XXX this implementation is bad

    Implemented by flattening `X` and gathering at `inds0 * ncols + inds1`.
    """
    inds0 = tf.cast(inds0, tf.int64)
    inds1 = tf.cast(inds1, tf.int64)
    ncols = tf.cast(tf.shape(X), tf.int64)[1]
    flat = tf.reshape(X, [-1])
    flat_indices = inds0 * ncols + inds1
    return tf.gather(flat, flat_indices)
# ================================================================
# Scopes
# ================================================================
def scope_vars(scope, trainable_only=False):
    """
    Get the variables inside a scope.
    The scope can be given as a string or as an object with a `name`.

    Parameters
    ----------
    scope: str or VariableScope
        scope in which the variables reside.
    trainable_only: bool
        whether or not to return only the variables that were marked as trainable.

    Returns
    -------
    vars: [tf.Variable]
        list of variables in `scope`.
    """
    if trainable_only:
        collection = tf.GraphKeys.TRAINABLE_VARIABLES
    else:
        collection = tf.GraphKeys.GLOBAL_VARIABLES
    scope_str = scope if isinstance(scope, str) else scope.name
    return tf.get_collection(collection, scope=scope_str)
def scope_name():
    """Return the name of the current variable scope as a string, e.g. deepq/q_func"""
    current = tf.get_variable_scope()
    return current.name
def absolute_scope_name(relative_scope_name):
    """Prefix `relative_scope_name` with the current scope name and a "/"."""
    parent = scope_name()
    return parent + "/" + relative_scope_name
def lengths_to_mask(lengths_b, max_length):
    """
    Turn a vector of lengths into a boolean mask.

    Args:
        lengths_b: an integer vector of lengths
        max_length: maximum length to fill the mask

    Returns:
        a boolean array of shape (batch_size, max_length)
        row[i] consists of True repeated lengths_b[i] times, followed by False
    """
    lengths_b = tf.convert_to_tensor(lengths_b)
    assert lengths_b.get_shape().ndims == 1
    positions = tf.expand_dims(tf.range(max_length), 0)
    limits = tf.expand_dims(lengths_b, 1)
    # Broadcasting compares every position against every row's length.
    return positions < limits
def in_session(f):
    """Decorator running `f` inside a fresh `tf.Session()` context.

    The wrapped function's return value is passed through to the caller.
    """
    @functools.wraps(f)
    def newfunc(*args, **kwargs):
        with tf.Session():
            # Return the result; the wrapper previously discarded it.
            return f(*args, **kwargs)

    return newfunc
_PLACEHOLDER_CACHE = {}  # name -> (placeholder, dtype, shape)


def get_placeholder(name, dtype, shape):
    """Return the cached placeholder called `name`, creating it on first use.

    A cached entry must have been created with the same dtype and shape.
    """
    if name not in _PLACEHOLDER_CACHE:
        created = tf.placeholder(dtype=dtype, shape=shape, name=name)
        _PLACEHOLDER_CACHE[name] = (created, dtype, shape)
        return created
    out, dtype1, shape1 = _PLACEHOLDER_CACHE[name]
    assert dtype1 == dtype and shape1 == shape
    return out
def get_placeholder_cached(name):
    """Return the placeholder previously registered under `name`; KeyError if absent."""
    placeholder, _dtype, _shape = _PLACEHOLDER_CACHE[name]
    return placeholder
def flattenallbut0(x):
    """Reshape `x` to 2-D, keeping axis 0 and collapsing all remaining axes."""
    trailing_dims = x.get_shape().as_list()[1:]
    return tf.reshape(x, [-1, intprod(trailing_dims)])
def reset():
    """Rebind the placeholder cache and VARIABLES to empty dicts, then reset the default graph."""
    global _PLACEHOLDER_CACHE, VARIABLES
    _PLACEHOLDER_CACHE = {}
    VARIABLES = {}
    tf.reset_default_graph()
@@ -0,0 +1,19 @@
class VecEnv(object):
    """
    Base class for vectorized environments.
    """

    def step(self, vac):
        """
        Apply a sequence of actions to a sequence of environments:
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        Must be overridden; the base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset all environments.
        Must be overridden; the base version raises NotImplementedError.
        """
        raise NotImplementedError

    def close(self):
        """Release resources; the base version does nothing."""
        return None
@@ -0,0 +1,79 @@
import numpy as np
from multiprocessing import Process, Pipe
from src.common.vec_env import VecEnv
def worker(remote, env_fn_wrapper):
    """Serve environment commands received over `remote` until 'close'.

    The environment is built by calling `env_fn_wrapper.x()`. Supported
    commands are 'step', 'reset', 'get_spaces' and 'close'; anything else
    raises NotImplementedError.
    """
    env = env_fn_wrapper.x()
    while True:
        command, payload = remote.recv()
        if command == 'close':
            remote.close()
            return
        if command == 'step':
            ob, reward, done, info = env.step(payload)
            if done:
                # Finished episodes restart immediately; the reset
                # observation replaces the terminal one.
                ob = env.reset()
            reply = (ob, reward, done, info)
        elif command == 'reset':
            reply = env.reset()
        elif command == 'get_spaces':
            reply = (env.action_space, env.observation_space)
        else:
            raise NotImplementedError
        remote.send(reply)
class CloudpickleWrapper(object):
    """
    Serializes its payload with cloudpickle (otherwise multiprocessing
    would use plain pickle) and restores it with pickle.
    """

    def __init__(self, x):
        self.x = x

    def __getstate__(self):
        # Imported lazily so cloudpickle is only needed when pickling.
        import cloudpickle
        payload = cloudpickle.dumps(self.x)
        return payload

    def __setstate__(self, ob):
        import pickle
        restored = pickle.loads(ob)
        self.x = restored
class SubprocVecEnv(VecEnv):
    """Vectorized environment running each environment in its own subprocess."""

    def __init__(self, env_fns):
        """
        env_fns: list of callables, each creating one gym environment to
        run in a subprocess
        """
        nenvs = len(env_fns)
        self.closed = False
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
                   for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
        for p in self.ps:
            p.start()
        # The workers own their pipe ends now. Closing the parent's copies
        # lets recv() raise EOFError if a worker dies, instead of blocking.
        for work_remote in self.work_remotes:
            work_remote.close()

        self.remotes[0].send(('get_spaces', None))
        self.action_space, self.observation_space = self.remotes[0].recv()

    def step(self, actions):
        """Send one action to each worker and gather the stacked results.

        Returns (observations, rewards, dones, infos); the first three are
        stacked with np.stack and `infos` is a tuple.
        """
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        """Reset every environment and return the stacked observations."""
        for remote in self.remotes:
            remote.send(('reset', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def close(self):
        """Ask every worker to exit and wait for them; later calls are no-ops."""
        if self.closed:
            return
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()
        for remote in self.remotes:
            remote.close()
        self.closed = True

    @property
    def num_envs(self):
        """Number of environments (one per pipe)."""
        return len(self.remotes)
+95
View File
@@ -0,0 +1,95 @@
import tensorflow as tf
import baselines.baselines_common.tf_util as U
from baselines.baselines_common.mpi_running_mean_std import RunningMeanStd
from baselines.baselines_common.distributions import make_pdtype, DiagGaussianPdType, BernoulliPdType
def mlp_block(x, name, num_hid_layers, hid_size, activation_fn=tf.nn.tanh):
    """
    Stack `num_hid_layers` dense layers of width `hid_size` under scope `name`.

    Layers are named "fc1", "fc2", ... and each is followed by `activation_fn`.
    Returns the output of the last layer (or `x` itself when there are none).
    """
    with tf.variable_scope(name_or_scope=name):
        for layer_number in range(1, num_hid_layers + 1):
            pre_activation = U.dense(
                x, hid_size,
                name="fc%i" % layer_number,
                weight_init=U.normc_initializer(1.0))
            x = activation_fn(pre_activation)
    return x
def feature_net(x, name, num_hid_layers, hid_size, activation_fn=tf.nn.tanh):
    """
    Build the feature extractor for one head under scope `name`.

    Currently a single `mlp_block` placed in the nested scope "mlp".
    """
    with tf.variable_scope(name_or_scope=name):
        features = mlp_block(
            x,
            name="mlp",
            num_hid_layers=num_hid_layers,
            hid_size=hid_size,
            activation_fn=activation_fn)
    return features
class Actor(object):
    """Policy and value networks built together inside one variable scope."""

    def __init__(self, name, *args, **kwargs):
        # Everything created by _init lives under `name`, so the variables can
        # later be looked up by scope in get_variables().
        with tf.variable_scope(name):
            self._init(*args, **kwargs)
            self.scope = tf.get_variable_scope().name

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, noise_type=None):
        """
        Build the graph: observation filter, value head, policy head, sampler.

        noise_type == "gaussian" forces a diagonal Gaussian over
        `ac_space.shape[0]` dimensions; otherwise the distribution type is
        derived from `ac_space` by `make_pdtype`.
        """
        if noise_type == "gaussian":
            pdtype = DiagGaussianPdType(ac_space.shape[0])
        else:
            pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        ob = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Normalize with the running statistics and clip to [-5, 5].
        normalized_ob = (ob - self.ob_rms.mean) / self.ob_rms.std
        normalized_ob = tf.clip_by_value(normalized_ob, -5.0, 5.0)

        # critic net (value network)
        value_features = feature_net(
            normalized_ob, name="vf",
            num_hid_layers=num_hid_layers, hid_size=hid_size,
            activation_fn=tf.nn.tanh)
        self.vpred = U.dense(
            value_features, 1,
            name="vf_final", weight_init=U.normc_initializer(1.0))[:, 0]

        # actor net (policy network)
        policy_features = feature_net(
            normalized_ob, name="pol",
            num_hid_layers=num_hid_layers, hid_size=hid_size,
            activation_fn=tf.nn.tanh)

        if gaussian_fixed_var and isinstance(pdtype, DiagGaussianPdType):
            # Half of the flat parameter vector is the mean; the log-std is a
            # separate variable that does not depend on the observation.
            half_params = pdtype.param_shape()[0] // 2
            mean = U.dense(
                policy_features, half_params,
                name="pol_final", weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, half_params],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd against mean.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(
                policy_features, pdtype.param_shape()[0],
                name="pol_final", weight_init=U.normc_initializer(0.01))

        # pd - probability distribution
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # `stochastic` selects between sampling and taking the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

    def act(self, stochastic, ob):
        """Return `(action, value_prediction)` for a single observation."""
        # ob[None] adds a leading batch axis; [0] strips it from the outputs.
        actions, values = self._act(stochastic, ob[None])
        return actions[0], values[0]

    def get_variables(self):
        """All global variables created under this actor's scope."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        """Trainable variables created under this actor's scope."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_initial_state(self):
        """This actor keeps no recurrent state, so the initial state is empty."""
        return []
+171
View File
@@ -0,0 +1,171 @@
import tensorflow as tf
import numpy as np
import time
from mpi4py import MPI
from collections import deque
from contextlib import contextmanager
from common.logger import Logger
from baselines.baselines_common import Dataset, explained_variance, fmt_row, zipsame
import baselines.baselines_common.tf_util as U
from baselines.baselines_common.mpi_adam import MpiAdam
from baselines.baselines_common.mpi_saver import MpiSaver
from baselines.baselines_common.mpi_moments import mpi_moments
from baselines.trajectories import traj_segment_generator, add_vtarg_and_adv
def learn(env, policy_func, args, *,
          timesteps_per_batch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          adam_epsilon=1e-5,
          schedule='constant'):  # annealing for stepsize parameters (epsilon and adam),
    """
    Train a policy with PPO (clipped surrogate) until the time budget runs out.

    Parameters
    ----------
    env: environment exposing `observation_space` and `action_space`
    policy_func: callable `(name, ob_space, ac_space) -> policy`; called twice,
        for the "pi" and "oldpi" networks
    args: namespace; reads `logdir`, `restore_actor_from`, `thread` and
        `max_train_days`, is forwarded to `traj_segment_generator`, and has
        `logdir` rewritten to the per-thread sub-directory
    schedule: accepted for compatibility; the learning-rate multiplier is
        currently fixed at 1.0, so this value is not consulted
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # Only variables whose top-level scope starts with "pi" are saved/restored.
    policy_var_list = [v for v in var_list if v.name.split("/")[0].startswith("pi")]
    saver = MpiSaver(policy_var_list, log_prefix=args.logdir)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(),
                                                            pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    saver.restore(restore_from=args.restore_actor_from)
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, args, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    cur_lrmult = 1.0

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    logger = Logger(args.logdir)

    # 86400 seconds per day: train until the wall-clock budget is spent.
    while time.time() - tstart < 86400 * args.max_train_days:
        seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        # NOTE: from here on ob/ac/atarg name the rollout arrays, not the
        # placeholders above; the graph functions were already built.
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
        # A falsy batch size means "use the whole segment as one batch".
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        saver.sync()

        # Re-evaluate the losses after the update for logging.
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # Logging
        logger.scalar_summary("episodes", len(lens), iters_so_far)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.scalar_summary(lossname, lossval, episodes_so_far)

        logger.scalar_summary("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), episodes_so_far)

        # np.max raises on an empty sequence, which happens until the first
        # episode has finished; skip the episode statistics in that case.
        if rewbuffer:
            logger.scalar_summary("step", np.mean(lenbuffer), episodes_so_far)
            logger.scalar_summary("reward", np.mean(rewbuffer), episodes_so_far)
            logger.scalar_summary("best reward", np.max(rewbuffer), episodes_so_far)

        elapsed_time = time.time() - tstart

        logger.scalar_summary(
            "episode per minute",
            episodes_so_far / elapsed_time * 60,
            episodes_so_far)
        logger.scalar_summary(
            "step per second",
            timesteps_so_far / elapsed_time,
            episodes_so_far)
def flatten_lists(listoflists):
    """Concatenate the inner sequences of `listoflists` into one flat list."""
    flat = []
    for inner in listoflists:
        flat.extend(inner)
    return flat
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python
# noinspection PyUnresolvedReferences
import os
import json
import argparse
from mpi4py import MPI
from common.misc_util import boolean_flag, str2params, create_if_need
from common.misc_util import set_global_seeds
from common.env_wrappers import create_env
from baselines.nets import Actor
from baselines import trpo, ppo
def parse_args():
    """Build the command-line parser for the baselines runner and parse sys.argv."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    add('--agent',
        type=str,
        default="trpo",
        choices=["trpo", "ppo"],
        help='Which agent to use. (default: %(default)s)')

    # Environment and bookkeeping.
    add('--seed', type=int, default=42)
    add('--difficulty', type=int, default=2)
    add('--max-obstacles', type=int, default=3)
    add('--logdir', type=str, default="./logs")

    # Observation/reward wrapper settings.
    boolean_flag(parser, "baseline-wrapper", default=False)
    add('--skip-frames', type=int, default=1)
    add('--reward-scale', type=float, default=1.)
    add('--fail-reward', type=float, default=0.0)

    # Network size and discount.
    add('--hid-size', type=int, default=64)
    add('--num-hid-layers', type=int, default=2)
    add('--gamma', type=float, default=0.96)

    # Restoring a previous run.
    add('--restore-args-from', type=str, default=None)
    add('--restore-actor-from', type=str, default=None)

    add('--max-train-days',
        default=int(1e1),
        type=int)

    return parser.parse_args()
def restore_params(args):
    """
    Overlay hyper-parameters stored in a JSON file onto `args`.

    The file at `args.restore_args_from` is loaded and every entry is copied
    onto `args`, except the run-specific keys listed below, which always keep
    the value given for the current run. Keys missing from the file are
    ignored. Returns the same (mutated) `args` object.
    """
    with open(args.restore_args_from, "r") as fin:
        params = json.load(fin)

    # Never override these from the saved file; pop() with a default tolerates
    # files that do not contain one of them.
    run_specific_keys = (
        "seed",
        "difficulty",
        "max_obstacles",
        "skip_frames",
        "restore_args_from",
        "restore_actor_from",
    )
    for key in run_specific_keys:
        params.pop(key, None)

    for key, value in params.items():
        setattr(args, key, value)
    return args
def train(args):
    """
    Run TRPO or PPO training for this MPI rank.

    Opens a single-threaded TF session, optionally restores saved arguments,
    seeds the process per rank, builds the environment and hands over to the
    selected `learn` function. The environment is closed on every exit path.
    """
    import baselines.baselines_common.tf_util as U

    sess = U.single_threaded_session()
    sess.__enter__()

    if args.restore_args_from is not None:
        args = restore_params(args)

    rank = MPI.COMM_WORLD.Get_rank()
    # Each rank gets its own seed derived from the base seed.
    workerseed = args.seed + 241 * rank
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):
        return Actor(
            name=name,
            ob_space=ob_space, ac_space=ac_space,
            hid_size=args.hid_size, num_hid_layers=args.num_hid_layers,
            noise_type=args.noise_type)

    env = create_env(args)
    env.seed(workerseed)

    # Only rank 0 writes the arguments file.
    if rank == 0:
        create_if_need(args.logdir)
        with open("{}/args.json".format(args.logdir), "w") as fout:
            json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)

    try:
        args.thread = rank
        if args.agent == "trpo":
            trpo.learn(
                env, policy_fn, args,
                timesteps_per_batch=1024,
                gamma=args.gamma,
                lam=0.98,
                max_kl=0.01,
                cg_iters=10,
                cg_damping=0.1,
                vf_iters=5,
                vf_stepsize=1e-3)
        elif args.agent == "ppo":
            # optimal settings:
            # timesteps_per_batch = optim_epochs * optim_batchsize
            ppo.learn(
                env, policy_fn, args,
                timesteps_per_batch=256,
                gamma=args.gamma,
                lam=0.95,
                clip_param=0.2,
                entcoeff=0.0,
                optim_epochs=4,
                optim_stepsize=3e-4,
                optim_batchsize=64,
                schedule='constant')
        else:
            raise NotImplementedError
    except KeyboardInterrupt:
        print("closing envs...")
    finally:
        # Release the environment whether training finished, was interrupted
        # or failed with an error.
        env.close()
# Script entry point. `noise_type` is not a command-line option: it is fixed
# to "gaussian" here before training starts.
if __name__ == '__main__':
    args = parse_args()
    args.noise_type = "gaussian"
    train(args)
+76
View File
@@ -0,0 +1,76 @@
import numpy as np
def traj_segment_generator(pi, env, args, horizon, stochastic):
    """
    Endless generator of fixed-length rollout segments.

    Steps `env` with actions from `pi.act(stochastic, ob)` and yields a dict
    every `horizon` steps. The arrays in the dict are the generator's own
    buffers and are overwritten while the next segment is collected.
    Environments are reset with `difficulty=args.difficulty`.
    """
    # Initialize state variables
    t = 0
    ac = env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode
    ob = env.reset(difficulty=args.difficulty)

    episode_return = 0  # return in current episode
    episode_length = 0  # len of current episode
    ep_rets = []  # returns of completed episodes in this segment
    ep_lens = []  # lengths of ...

    # Initialize history arrays
    obs = np.array([ob] * horizon)
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac] * horizon)
    prevacs = acs.copy()

    while True:
        prevac = ac
        ac, vpred = pi.act(stochastic, ob)
        slot = t % horizon
        # Slight weirdness here because we need value function at time T
        # before returning segment [0, T-1] so we get the correct
        # terminal value
        if t > 0 and slot == 0:
            yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
                   "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
                   "ep_rets": ep_rets, "ep_lens": ep_lens}
            # @TODO: TRPO & PPO implementation diff
            # _, vpred = pi.act(stochastic, ob)  # @TODO: uncomment??? IMPORTANT!!
            # Be careful!!! if you change the downstream algorithm to aggregate
            # several of these batches, then be sure to do a deepcopy
            ep_rets = []
            ep_lens = []

        # Record the state *before* stepping the environment.
        obs[slot] = ob
        vpreds[slot] = vpred
        news[slot] = new
        acs[slot] = ac
        prevacs[slot] = prevac

        ob, rew, new, _ = env.step(ac)
        rews[slot] = rew

        episode_return += rew
        episode_length += 1
        if new:
            # Episode finished: store its statistics and start a fresh one.
            ep_rets.append(episode_return)
            ep_lens.append(episode_length)
            episode_return = 0
            episode_length = 0
            ob = env.reset(difficulty=args.difficulty)
        t += 1
def add_vtarg_and_adv(seg, gamma, lam):
    """
    Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)

    Writes two new entries into `seg` in place: "adv" (float32 advantages) and
    "tdlamret" (advantages plus the stored value predictions).
    """
    # last element is only used for last vtarg, but we already zeroed it if last new = 1
    new = np.append(seg["new"], 0)
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    rew = seg["rew"]
    T = len(rew)
    advantages = np.empty(T, 'float32')
    seg["adv"] = advantages

    # Walk backwards so each step can reuse the advantage of its successor.
    running_adv = 0
    for t in range(T - 1, -1, -1):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        running_adv = delta + gamma * lam * nonterminal * running_adv
        advantages[t] = running_adv
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
+243
View File
@@ -0,0 +1,243 @@
import tensorflow as tf
import numpy as np
import time
from mpi4py import MPI
from collections import deque
from contextlib import contextmanager
from common.logger import Logger
from baselines.baselines_common import explained_variance, zipsame, dataset
import baselines.baselines_common.tf_util as U
from baselines.baselines_common import colorize
from baselines.baselines_common.mpi_adam import MpiAdam
from baselines.baselines_common.mpi_saver import MpiSaver
from baselines.baselines_common.cg import cg
from baselines.trajectories import traj_segment_generator, add_vtarg_and_adv
def learn(env, policy_func, args, *,
          timesteps_per_batch,  # what to train on
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3):
    """
    Train a policy with MPI-parallel TRPO until the time budget runs out.

    Parameters
    ----------
    env: environment exposing `observation_space` and `action_space`
    policy_func: callable `(name, ob_space, ac_space) -> policy`; called twice,
        for the "pi" and "oldpi" networks
    args: namespace; reads `logdir`, `restore_actor_from`, `thread` and
        `max_train_days`, is forwarded to `traj_segment_generator`, and has
        `logdir` rewritten to the per-thread sub-directory
    max_kl: KL bound used to scale the step and (times 1.5) to accept it
    cg_iters, cg_damping: conjugate-gradient iterations and damping
    vf_stepsize, vf_iters: value-function Adam step size and epochs per batch
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # Split the trainable variables by the second scope component:
    # "pol*" belongs to the policy, "vf*" to the value function.
    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)
    policy_var_list = [v for v in all_var_list if v.name.split("/")[0].startswith("pi")]
    saver = MpiSaver(policy_var_list, log_prefix=args.logdir)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")

    # Cut the flat tangent vector into pieces shaped like each variable.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in
                    zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in
                 zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        # Average an array over all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    saver.restore(restore_from=args.restore_actor_from)

    # Start every worker from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, args, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    logger = Logger(args.logdir)

    # 86400 seconds per day: train until the wall-clock budget is spent.
    while time.time() - tstart < 86400 * args.max_train_days:
        # Logged as zeros when the gradient is zero and no step is taken.
        meanlosses = [0] * len(loss_names)

        with timed("sampling"):
            seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        # NOTE: from here on ob/ac/atarg name the rollout arrays, not the
        # placeholders above; the graph functions were already built.
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        # Feed the standardized advantages computed above; previously the raw
        # seg["adv"] was passed and the standardized values were never used.
        segargs = seg["ob"], seg["ac"], atarg
        # The Fisher-vector product only uses every fifth sample.
        fvpargs = [arr[::5] for arr in segargs]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values

        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*segargs)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)

        if not np.allclose(g, 0):
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            fullstep = stepdir / lm
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()

            # Backtracking line search: halve the step until the losses are
            # finite, the KL constraint holds and the surrogate does not get
            # worse. Without the `break` the `else` branch below would run on
            # every iteration and always undo the update.
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*segargs)))
                improve = surr - surrbefore
                step_ok = (np.isfinite(meanlosses).all()
                           and kl <= max_kl * 1.5
                           and improve >= 0)
                if step_ok:
                    break
                stepsize *= .5
            else:
                # No acceptable step found: restore the previous parameters.
                set_from_flat(thbefore)

            # Periodically check that all workers still hold the same parameters.
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        saver.sync()

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # Logging
        logger.scalar_summary("episodes", len(lens), iters_so_far)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.scalar_summary(lossname, lossval, episodes_so_far)

        logger.scalar_summary("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), episodes_so_far)

        # np.max raises on an empty sequence, which happens until the first
        # episode has finished; skip the episode statistics in that case.
        if rewbuffer:
            logger.scalar_summary("step", np.mean(lenbuffer), episodes_so_far)
            logger.scalar_summary("reward", np.mean(rewbuffer), episodes_so_far)
            logger.scalar_summary("best reward", np.max(rewbuffer), episodes_so_far)

        elapsed_time = time.time() - tstart

        logger.scalar_summary(
            "episode per minute",
            episodes_so_far / elapsed_time * 60,
            episodes_so_far)
        logger.scalar_summary(
            "step per second",
            timesteps_so_far / elapsed_time,
            episodes_so_far)
def flatten_lists(listoflists):
    """Concatenate the inner sequences of `listoflists` into one flat list."""
    flat = []
    for inner in listoflists:
        flat.extend(inner)
    return flat
View File
+215
View File
@@ -0,0 +1,215 @@
import random
import numpy as np
from common.segment_tree import SumSegmentTree, MinSegmentTree
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of (obs, action, reward, next_obs, done) tuples."""

    def __init__(self, size):
        """Create a replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0  # slot the next transition is written to

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest one once full."""
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Gather the transitions at `idxes` into five batch arrays."""
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            # np.asarray avoids a copy when it can and copies when it must;
            # np.array(..., copy=False) raises under NumPy >= 2 whenever a
            # copy is required (e.g. for lists or scalars).
            obses_t.append(np.asarray(obs_t))
            actions.append(np.asarray(action))
            rewards.append(reward)
            obses_tp1.append(np.asarray(obs_tp1))
            dones.append(done)
        return np.array(obses_t), \
               np.array(actions), \
               np.array(rewards), \
               np.array(obses_tp1), \
               np.array(dones)

    def sample(self, batch_size):
        """Sample a batch of experiences.

        Indices are drawn uniformly with replacement.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to their priority."""

    def __init__(self, size, alpha=0.5):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used; must be strictly positive
            (larger values prioritize more strongly)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super().__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # Segment trees get the smallest power-of-two capacity >= size.
        tree_capacity = 1
        while tree_capacity < size:
            tree_capacity *= 2

        self._it_sum = SumSegmentTree(tree_capacity)
        self._it_min = MinSegmentTree(tree_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.add; the new transition gets the current max priority."""
        # Capture the slot before the parent advances its write position.
        slot = self._next_idx
        super().add(*args, **kwargs)
        initial_priority = self._max_priority ** self._alpha
        self._it_sum[slot] = initial_priority
        self._it_min[slot] = initial_priority

    def _sample_proportional(self, batch_size):
        """Draw `batch_size` indices (with possible repeats) by prefix-sum lookup."""
        # TODO(szymon): should we ensure no repeats?
        return [
            self._it_sum.find_prefixsum_idx(
                random.random() * self._it_sum.sum(0, len(self._storage) - 1))
            for _ in range(batch_size)
        ]

    def sample(self, batch_size, beta=0.5):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights; must be strictly
            positive (1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) denoting importance weight of each
            sampled transition, normalized by the largest possible weight
        idxes: list of int
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        # The least likely transition has the largest weight; dividing by it
        # normalizes the returned weights.
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        def normalized_weight(idx):
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            return (p_sample * len(self._storage)) ** (-beta) / max_weight

        weights = np.array([normalized_weight(idx) for idx in idxes])
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            scaled = priority ** self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            # Track the largest raw priority seen, used for new transitions.
            self._max_priority = max(self._max_priority, priority)
# Registry mapping a buffer kind name to its replay-buffer class.
buffers = {
    "simple": ReplayBuffer,
    "prioritized": PrioritizedReplayBuffer
}
def buffer_generator(buffer, batch_size=32):
    """
    Coroutine that stores each transition sent to it and answers with a batch.

    Prime it with `next(gen)` (which yields None); every following
    `gen.send((observation, action, reward, next_observation, done))` adds the
    transition to `buffer` and yields `buffer.sample(batch_size=batch_size)`.
    """
    batch = None
    while True:
        transition = yield batch
        observation, action, reward, next_observation, done = transition
        buffer.add(observation, action, reward, next_observation, done)
        batch = buffer.sample(batch_size=batch_size)
def create_buffer(args):
    """
    Build the replay buffer selected by `args`.

    Reads `args.prioritized_replay`, `args.buffer_size` and, for the
    prioritized variant, `args.prioritized_replay_alpha`.
    """
    if not args.prioritized_replay:
        return ReplayBuffer(args.buffer_size)
    return PrioritizedReplayBuffer(args.buffer_size, alpha=args.prioritized_replay_alpha)
+95
View File
@@ -0,0 +1,95 @@
import numpy as np
import gym
from gym.spaces import Box
from osim.env import RunEnv
from common.state_transform import StateVelCentr
class DdpgWrapper(gym.Wrapper):
    """
    Environment wrapper adding state transformation, frame skipping, reward
    shaping/scaling and action rescaling.
    """

    def __init__(self, env, args):
        """Reads `skip_frames`, `reward_scale` and `fail_reward` from `args`."""
        super(DdpgWrapper, self).__init__(env)

        self.state_transform = StateVelCentr(
            obstacles_mode='standard',
            exclude_centr=True,
            vel_states=[])
        self.observation_space = Box(-1000, 1000, self.state_transform.state_size)

        self.skip_frames = args.skip_frames
        self.reward_scale = args.reward_scale
        self.fail_reward = args.fail_reward

        # [-1, 1] <-> [0, 1]
        offset = .5
        scale = .5
        self.normalize_action = lambda x: (x - offset) / scale
        self.denormalise_action = lambda x: x * scale + offset

    def reset(self, **kwargs):
        return self._reset(**kwargs)

    def _reset(self, **kwargs):
        """Reset the wrapped env and the state transform; return the first observation."""
        raw_observation = self.env.reset(**kwargs)
        self.env_step = 0
        self.state_transform.reset()
        transformed, _ = self.state_transform.process(raw_observation)
        return self.observation(transformed)

    def _step(self, action):
        """
        Repeat `action` for up to `skip_frames` environment steps.

        Rewards (plus the extra reward returned by the state transform) are
        summed over the skipped frames and scaled by `reward_scale`. Returns
        `(observation, total_reward, done, None)`.
        """
        action = self.denormalise_action(action)
        total_reward = 0.
        for _ in range(self.skip_frames):
            observation, reward, done, _ = self.env.step(action)
            observation, obst_rew = self.state_transform.process(observation)
            total_reward += reward + obst_rew
            self.env_step += 1
            if not done:
                continue
            # Episode ended: add the failure term when it ended before step 1000.
            if self.env_step < 1000:  # hardcoded
                total_reward += self.fail_reward
            break

        observation = self.observation(observation)
        total_reward *= self.reward_scale
        return observation, total_reward, done, None

    def observation(self, observation):
        return self._observation(observation)

    def _observation(self, observation):
        """Convert the observation to a float32 numpy array."""
        return np.array(observation, dtype=np.float32)
def create_env(args):
    """
    Create the running environment, wrapped in DdpgWrapper when `args` defines
    a `baseline_wrapper` or `ddpg_wrapper` attribute.
    """
    # NOTE(review): this tests for the attribute's presence, not its value.
    # A parser that registers the option via boolean_flag always defines the
    # attribute, so passing `--no-...` would still enable the wrapper --
    # confirm this is intended.
    wants_wrapper = hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper")
    env = RunEnv(visualize=False, max_obstacles=args.max_obstacles)
    if not wants_wrapper:
        return env
    return DdpgWrapper(env, args)
def create_observation_handler(args):
    """
    Return a function converting raw observations for the agent.

    When `args` defines `baseline_wrapper` or `ddpg_wrapper`, the handler also
    runs the observation through a StateVelCentr transform; otherwise it only
    converts to a float32 array. `previous_action` is accepted and ignored.
    """
    uses_wrapper = hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper")

    if not uses_wrapper:
        def observation_handler(observation, previous_action=None):
            return np.array(observation, dtype=np.float32)

        return observation_handler

    state_transform = StateVelCentr(
        obstacles_mode='standard',
        exclude_centr=True,
        vel_states=[])

    def observation_handler(observation, previous_action=None):
        as_array = np.array(observation, dtype=np.float32)
        transformed, _ = state_transform.process(as_array)
        return transformed

    return observation_handler
def create_action_handler(args):
    """
    Return a function mapping actions from [-1, 1] to [0, 1] (x * 0.5 + 0.5).

    `args` is accepted for interface symmetry and is not read.
    """
    action_mean = .5
    action_std = .5

    def action_handler(x):
        return x * action_std + action_mean

    return action_handler
+60
View File
@@ -0,0 +1,60 @@
import numpy as np
import torch
import torch.nn as nn
def create_linear_decay_fn(initial_value, final_value, max_step):
    """
    Return `f(step)` interpolating linearly from `initial_value` at step 0 to
    `final_value` at `max_step`.
    """
    # NOTE(review): the result is not clamped, so steps beyond `max_step`
    # keep extrapolating past `final_value` -- confirm callers stop in time.
    def decay_fn(step):
        remaining = 1. - step / max_step
        return initial_value * remaining + final_value * (1. - remaining)

    return decay_fn
def create_cycle_decay_fn(initial_value, final_value, cycle_len, num_cycles):
    """
    Return `f(step)` following a cosine curve that restarts every `cycle_len`
    steps, with an amplitude shrinking linearly over `cycle_len * num_cycles`
    steps, on top of `final_value`.
    """
    max_step = cycle_len * num_cycles

    def decay_fn(step):
        # Linear envelope over the whole schedule.
        relative = 1. - step / max_step
        # Cosine position inside the current cycle: 1 at its start, toward 0 at its end.
        phase = np.pi * np.mod(step, cycle_len) / cycle_len
        relative_cosine = 0.5 * (np.cos(phase) + 1.0)
        return relative_cosine * (initial_value - final_value) * relative + final_value

    return decay_fn
def create_decay_fn(decay_type, **kwargs):
    """
    Build a decay function by name.

    "linear" forwards `kwargs` to create_linear_decay_fn and "cycle" to
    create_cycle_decay_fn; any other name raises NotImplementedError.
    """
    if decay_type == "linear":
        factory = create_linear_decay_fn
    elif decay_type == "cycle":
        factory = create_cycle_decay_fn
    else:
        raise NotImplementedError()
    return factory(**kwargs)
class QuadricLinearLoss(nn.Module):
    """Weighted loss that is quadratic near zero and linear beyond ``clip_delta``.

    For an absolute error ``e`` the per-element loss is ``0.5 * e ** 2`` while
    ``e <= clip_delta``, and ``0.5 * clip_delta ** 2 + clip_delta *
    (e - clip_delta)`` above it.
    """

    def __init__(self, clip_delta):
        super(QuadricLinearLoss, self).__init__()
        self.clip_delta = clip_delta

    def forward(self, y_pred, y_true, weights):
        """Return the mean of the per-element loss multiplied by ``weights``."""
        abs_error = torch.abs(y_true - y_pred)
        # Portion of the error inside the quadratic zone, and the excess.
        capped = torch.clamp(abs_error, max=self.clip_delta)
        excess = abs_error - capped
        per_element = 0.5 * capped ** 2 + self.clip_delta * excess
        return torch.mean(per_element * weights)
# Registry mapping a loss-type name to the loss class (not an instance).
losses = {
    "mse": nn.MSELoss,
    "quadric-linear": QuadricLinearLoss
}
class _WeightedMSELoss(nn.MSELoss):
    """``nn.MSELoss`` that also accepts optional element weights.

    Training code calls the criterion as ``criterion(pred, target,
    weights=...)``, which the stock ``nn.MSELoss.forward`` rejects. Without
    ``weights`` this behaves exactly like ``nn.MSELoss``.
    """

    def forward(self, y_pred, y_true, weights=None):
        if weights is None:
            return super(_WeightedMSELoss, self).forward(y_pred, y_true)
        # Same reduction as QuadricLinearLoss: mean of the weighted elements.
        return torch.mean((y_true - y_pred) ** 2 * weights)


def create_loss(args):
    """Instantiate the criterion selected by ``args.loss_type``.

    Supported types are ``"mse"`` and ``"quadric-linear"`` (the latter reads
    ``args.clip_delta``). Both returned criteria can be called as
    ``criterion(y_pred, y_true, weights=...)``.

    Raises:
        NotImplementedError: for any other ``args.loss_type``.
    """
    if args.loss_type == "mse":
        return _WeightedMSELoss()
    elif args.loss_type == "quadric-linear":
        return QuadricLinearLoss(clip_delta=args.clip_delta)
    else:
        raise NotImplementedError()
+88
View File
@@ -0,0 +1,88 @@
import os
import sys
import random
import numpy as np
def create_if_need(path):
    """Create directory ``path`` (and missing parents) if it does not exist.

    The directory is created directly instead of testing for existence first,
    so concurrent workers creating the same directory cannot race between the
    check and the creation. An already existing path is left untouched.
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        # Somebody (possibly another process) created it already.
        pass
def boolean_flag(parser, name, default=False, help=None):
    """Register a paired ``--<name>`` / ``--no-<name>`` switch on ``parser``.

    Parameters
    ----------
    parser: argparse.ArgumentParser
        parser that receives the two options
    name: str
        option name; ``--<name>`` stores True and ``--no-<name>`` stores False
        in the same destination (dashes in ``name`` become underscores)
    default: bool or None
        value used when neither option is given
    help: str
        help text attached to the ``--<name>`` option
    """
    dest = name.replace('-', '_')
    enable_opt = "--" + name
    disable_opt = "--no-" + name
    # The enabling option is registered first so its default wins for `dest`.
    parser.add_argument(
        enable_opt, action="store_true", default=default, dest=dest, help=help)
    parser.add_argument(disable_opt, action="store_false", dest=dest)
def str2params(string, delimeter="-"):
    """Parse a delimiter-separated string of integers, e.g. ``"64-32"``.

    Returns the list of integers, or ``None`` when ``string`` cannot be parsed
    (not a string, a non-integer field, or an unusable delimiter). Only those
    parsing failures are swallowed; interrupts and other errors propagate.
    """
    try:
        result = list(map(int, string.split(delimeter)))
    except (AttributeError, TypeError, ValueError):
        result = None
    return result
def set_global_seeds(i):
    """Seed every available random number generator with ``i``.

    PyTorch and TensorFlow are seeded only when they can be imported; NumPy
    and the standard ``random`` module are always seeded.
    """
    try:
        import torch
    except ImportError:
        pass
    else:
        torch.manual_seed(i)
    try:
        import tensorflow as tf
    except ImportError:
        pass
    else:
        # TensorFlow 1.x exposes tf.set_random_seed; 2.x moved it to
        # tf.random.set_seed and removed the old name.
        if hasattr(tf, "set_random_seed"):
            tf.set_random_seed(i)
        else:
            tf.random.set_seed(i)
    np.random.seed(i)
    random.seed(i)
def query_yes_no(question, default="no"):
    """Ask a yes/no question on stdout and read the reply with input().

    "question" is the text shown to the user.
    "default" is the answer assumed when the user just hits <Enter>.
    It must be "yes", "no" (the default) or None (meaning an explicit
    answer is required).
    Returns True for "yes" and False for "no"; raises ValueError for any
    other "default".
    """
    answers = {
        "yes": True, "y": True, "ye": True,
        "no": False, "n": False
    }
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    # Keep asking until a recognised answer (or an accepted empty one) arrives.
    while True:
        sys.stdout.write(question + prompt)
        reply = input().lower()
        if reply == '' and default is not None:
            return answers[default]
        if reply in answers:
            return answers[reply]
        sys.stdout.write("Please respond with 'yes' or 'no' "
                         "(or 'y' or 'n').\n")
+15
View File
@@ -0,0 +1,15 @@
import torch
import torch.nn as nn
class LayerNorm(nn.Module):
    """Normalise the last dimension and apply a learnable scale and shift.

    ``gamma`` (initialised to ones) and ``beta`` (initialised to zeros) are
    parameters of size ``features``; ``eps`` is added to the standard
    deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + eps) + beta`` over the last axis."""
        centred = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centred / spread + self.beta
+92
View File
@@ -0,0 +1,92 @@
import math
import torch
from torch.nn.parameter import Parameter
import torch.nn.functional as F
from torch.nn.modules.module import Module
from torch.autograd import Variable
class NoisyLinear(Module):
    r"""Applies a noisy linear transformation to the incoming data:
    :math:`y = (mu_w + sigma_w \cdot epsilon_w)x + mu_b + sigma_b \cdot epsilon_b`
    More details can be found in the paper `Noisy Networks for Exploration` _ .

    Fresh noise is sampled on every forward pass.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer will not learn an additive bias. Default: True
        factorised: whether or not to use factorised noise. Default: True
        std_init: initialization constant for standard deviation component of weights. If None
            (or any other falsy value), defaults to 0.017 for independent and 0.4 for
            factorised. Default: None
    Shape:
        - Input: :math:`(N, in\_features)`
        - Output: :math:`(N, out\_features)`
    Attributes:
        weight_mu: learnable mean of the weights, shape (out_features x in_features)
        weight_sigma: learnable noise scale of the weights, same shape as weight_mu
        bias_mu: learnable mean of the bias, shape (out_features), or None without bias
        bias_sigma: learnable noise scale of the bias, or None without bias
    Examples::
        >>> m = NoisyLinear(20, 30)
        >>> input = autograd.Variable(torch.randn(128, 20))
        >>> output = m(input)
        >>> print(output.size())
    """

    def __init__(self, in_features, out_features, bias=True, factorised=True, std_init=None):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.factorised = factorised
        self.weight_mu = Parameter(torch.Tensor(out_features, in_features))
        self.weight_sigma = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias_mu = Parameter(torch.Tensor(out_features))
            self.bias_sigma = Parameter(torch.Tensor(out_features))
        else:
            # `bias` is kept for backward compatibility; bias_mu/bias_sigma
            # must exist (as None) because forward() reads them.
            self.register_parameter('bias', None)
            self.register_parameter('bias_mu', None)
            self.register_parameter('bias_sigma', None)
        if not std_init:
            if self.factorised:
                self.std_init = 0.4
            else:
                self.std_init = 0.017
        else:
            self.std_init = std_init
        self.reset_parameters(bias)

    def reset_parameters(self, bias):
        """Initialise the mu/sigma parameters; ``bias`` tells whether bias terms exist."""
        if self.factorised:
            mu_range = 1. / math.sqrt(self.weight_mu.size(1))
            self.weight_mu.data.uniform_(-mu_range, mu_range)
            self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
            if bias:
                self.bias_mu.data.uniform_(-mu_range, mu_range)
                self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
        else:
            mu_range = math.sqrt(3. / self.weight_mu.size(1))
            self.weight_mu.data.uniform_(-mu_range, mu_range)
            self.weight_sigma.data.fill_(self.std_init)
            if bias:
                self.bias_mu.data.uniform_(-mu_range, mu_range)
                self.bias_sigma.data.fill_(self.std_init)

    def scale_noise(self, size):
        """Return ``size`` samples of ``sign(x) * sqrt(|x|)`` with ``x`` standard normal."""
        x = torch.Tensor(size).normal_()
        x = x.sign().mul(x.abs().sqrt())
        return x

    def forward(self, input):
        has_bias = self.bias_mu is not None
        bias_epsilon = None
        if self.factorised:
            epsilon_in = self.scale_noise(self.in_features)
            epsilon_out = self.scale_noise(self.out_features)
            # Outer product gives one noise value per weight.
            weight_epsilon = Variable(epsilon_out.ger(epsilon_in))
            if has_bias:
                bias_epsilon = Variable(self.scale_noise(self.out_features))
        else:
            weight_epsilon = Variable(torch.Tensor(self.out_features, self.in_features).normal_())
            if has_bias:
                bias_epsilon = Variable(torch.Tensor(self.out_features).normal_())
        weight = self.weight_mu + self.weight_sigma.mul(weight_epsilon)
        if has_bias:
            noisy_bias = self.bias_mu + self.bias_sigma.mul(bias_epsilon)
        else:
            noisy_bias = None
        return F.linear(input, weight, noisy_bias)

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + str(self.in_features) + ' -> ' \
            + str(self.out_features) + ')'
View File
+42
View File
@@ -0,0 +1,42 @@
from collections import OrderedDict
from itertools import tee
import torch
import torch.nn as nn
from common.modules.LayerNorm import LayerNorm
def pairwise(iterable):
    """Yield consecutive overlapping pairs: s -> (s0,s1), (s1,s2), (s2, s3), ..."""
    leading, trailing = tee(iterable)
    # Advance the second iterator by one so the two are offset.
    next(trailing, None)
    return zip(leading, trailing)
class LinearNet(nn.Module):
    """Stack of linear layers, each followed by an activation.

    ``layers`` lists the layer sizes; every consecutive pair ``(n_in, n_out)``
    produces ``linear_<i>``, optionally ``layer_norm_<i>`` (when
    ``layer_norm`` is true) and ``act_<i>`` inside a ``Sequential``. The
    activation is also applied after the last linear layer.
    """

    def __init__(self, layers, activation=torch.nn.ELU,
                 layer_norm=False, linear_layer=nn.Linear):
        super(LinearNet, self).__init__()
        self.input_shape = layers[0]
        self.output_shape = layers[-1]

        modules = OrderedDict()
        for idx, (n_in, n_out) in enumerate(pairwise(layers)):
            modules["linear_{}".format(idx)] = linear_layer(n_in, n_out)
            if layer_norm:
                modules["layer_norm_{}".format(idx)] = LayerNorm(n_out)
            modules["act_{}".format(idx)] = activation()
        self.net = torch.nn.Sequential(modules)

    def forward(self, x):
        return self.net.forward(x)
+62
View File
@@ -0,0 +1,62 @@
import numpy as np
class RandomProcess(object):
    """Base class for noise processes; ``reset_states`` is a no-op hook."""

    def reset_states(self):
        return None
class AnnealedGaussianProcess(RandomProcess):
    """Random process whose sigma is annealed linearly with the step count.

    With ``sigma_min`` given, sigma falls from ``sigma`` towards ``sigma_min``
    over ``n_steps_annealing`` steps and is never reported below
    ``sigma_min``. With ``sigma_min=None`` sigma stays constant.
    """

    def __init__(self, mu, sigma, sigma_min, n_steps_annealing=int(1e5)):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0
        # current_sigma follows the line m * n_steps + c, floored at sigma_min.
        self.c = sigma
        if sigma_min is None:
            self.m = 0.
            self.sigma_min = sigma
        else:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.sigma_min = sigma_min

    @property
    def current_sigma(self):
        annealed = self.m * float(self.n_steps) + self.c
        return max(self.sigma_min, annealed)
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """Ornstein-Uhlenbeck noise with an annealed sigma.

    Each ``sample()`` advances the state by
    ``theta * (mu - x) * dt + current_sigma * sqrt(dt) * N(0, 1)`` and
    increments the step counter used for annealing.
    """

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2,
                 x0=None, size=1, sigma_min=None, n_steps_annealing=int(1e5)):
        super(OrnsteinUhlenbeckProcess, self).__init__(
            mu=mu, sigma=sigma, sigma_min=sigma_min,
            n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        """Draw the next noise value and store it as the new state."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        """Restart from ``x0`` when given, otherwise from zeros of ``size``."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros(self.size)
def create_random_process(args):
    """Build the exploration noise process selected by ``args.rp_type``.

    Only ``"ornstein-uhlenbeck"`` is supported; it is configured from
    ``args.n_action``, ``args.rp_theta``, ``args.rp_mu``, ``args.rp_sigma``
    and ``args.rp_sigma_min``. Any other type raises ``NotImplementedError``.
    """
    if args.rp_type != "ornstein-uhlenbeck":
        raise NotImplementedError()
    return OrnsteinUhlenbeckProcess(
        size=args.n_action,
        theta=args.rp_theta,
        mu=args.rp_mu,
        sigma=args.rp_sigma,
        sigma_min=args.rp_sigma_min)
+146
View File
@@ -0,0 +1,146 @@
import operator
class SegmentTree(object):
    """Array-backed segment tree supporting point updates and range reductions."""

    def __init__(self, capacity, operation, neutral_element):
        """Build a Segment Tree data structure.

        https://en.wikipedia.org/wiki/Segment_tree

        Usable like a regular array, with two differences:
            a) setting an item is O(lg capacity) instead of O(1);
            b) `reduce` efficiently applies `operation` over a contiguous
               range of items.

        Parameters
        ----------
        capacity: int
            Total size of the array - must be a power of two.
        operation: lambda obj, obj -> obj
            operation for combining elements (eg. sum, max)
        neutral_element: obj
            neutral element for the operation above. eg. float('-inf')
            for max and 0 for sum.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        # Node 1 is the root; leaves occupy [capacity, 2 * capacity).
        self._value = [neutral_element] * (2 * capacity)
        self._operation = operation

    def _reduce_helper(self, start, end, node, node_start, node_end):
        """Reduce the inclusive range [start, end] inside `node`'s span."""
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        left_child = 2 * node
        right_child = left_child + 1
        if end <= mid:
            return self._reduce_helper(start, end, left_child, node_start, mid)
        if start >= mid + 1:
            return self._reduce_helper(start, end, right_child, mid + 1, node_end)
        # The range straddles both children: combine the two halves.
        left_part = self._reduce_helper(start, mid, left_child, node_start, mid)
        right_part = self._reduce_helper(mid + 1, end, right_child, mid + 1, node_end)
        return self._operation(left_part, right_part)

    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
        to a contiguous subsequence of the array.

        Parameters
        ----------
        start: int
            beginning of the subsequence
        end: int
            end of the subsequence (exclusive); None means the capacity and
            negative values count from the end

        Returns
        -------
        reduced: obj
            result of reducing self.operation over the specified range of array elements.
        """
        if end is None:
            end = self._capacity
        if end < 0:
            end += self._capacity
        last = end - 1
        return self._reduce_helper(start, last, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # Write the leaf, then recompute every ancestor up to the root.
        node = idx + self._capacity
        self._value[node] = val
        node //= 2
        while node >= 1:
            left, right = self._value[2 * node], self._value[2 * node + 1]
            self._value[node] = self._operation(left, right)
            node //= 2

    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]
class SumSegmentTree(SegmentTree):
    """Segment tree whose reduction is addition (neutral element 0.0)."""

    def __init__(self, capacity):
        super(SumSegmentTree, self).__init__(
            capacity=capacity,
            operation=operator.add,
            neutral_element=0.0
        )

    def sum(self, start=0, end=None):
        """Returns the sum of the items selected by `reduce(start, end)`."""
        return super(SumSegmentTree, self).reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
        sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum

        If array values are probabilities, this allows sampling indexes
        according to the discrete probability efficiently.

        Parameters
        ----------
        prefixsum: float
            upper bound on the sum of the array prefix

        Returns
        -------
        idx: int
            highest index satisfying the prefixsum constraint
        """
        assert 0 <= prefixsum <= self.sum() + 1e-5
        node = 1
        # Descend from the root until a leaf is reached.
        while node < self._capacity:
            left = 2 * node
            left_total = self._value[left]
            if left_total > prefixsum:
                node = left
            else:
                prefixsum -= left_total
                node = left + 1
        return node - self._capacity
class MinSegmentTree(SegmentTree):
    """Segment tree whose reduction is ``min`` (neutral element +inf)."""

    def __init__(self, capacity):
        super(MinSegmentTree, self).__init__(
            capacity=capacity,
            operation=min,
            neutral_element=float('inf')
        )

    def min(self, start=0, end=None):
        """Returns the minimum of the items selected by `reduce(start, end)`."""
        return super(MinSegmentTree, self).reduce(start, end)
+336
View File
@@ -0,0 +1,336 @@
from __future__ import division
import numpy as np
from collections import OrderedDict
def get_state_names(all=False, obst=False):
    """Return the ordered list of state component names.

    ``all=True`` additionally includes the ``pelvis2`` body coordinates;
    ``obst=True`` appends the three obstacle entries at the end.
    """
    axes = ('rot', 'x', 'y')
    joints = ['hip_right', 'knee_right', 'ankle_right',
              'hip_left', 'knee_left', 'ankle_left']

    names = ['pelvis_' + axis for axis in axes]
    names.extend('pelvis_vel_' + axis for axis in axes)
    names.extend(joints)
    names.extend(joint + '_vel' for joint in joints)
    names.extend(['mass_x', 'mass_y'])
    names.extend(['mass_x_vel', 'mass_y_vel'])

    if all:
        bodies = ['head', 'pelvis2', 'torso', 'toes_left',
                  'toes_right', 'talus_left', 'talus_right']
    else:
        bodies = ['head', 'torso', 'toes_left', 'toes_right',
                  'talus_left', 'talus_right']
    for body in bodies:
        for coord in ('x', 'y'):
            names.append(body + '_' + coord)

    names.extend(['muscle_left', 'muscle_right'])
    if obst:
        names.extend(['obst_dist', 'obst_y', 'obst_r'])
    return names
def get_names_to_center(centr):
    """Return the x-coordinate names to shift when centring on ``centr``.

    ``centr`` is ``'pelvis'`` or ``'mass'``; the returned list contains the
    body x names plus the x name of the *other* reference (``mass_x`` when
    centring on the pelvis and vice versa). Other values raise ValueError.
    """
    counterpart = {'pelvis': 'mass', 'mass': 'pelvis'}
    if centr != 'pelvis' and centr != 'mass':
        raise ValueError('centr should be in [mass or pelvis], not {}'.format(centr))
    bodies = ['head', counterpart[centr], 'torso', 'toes_left',
              'toes_right', 'talus_left', 'talus_right']
    return [body + '_x' for body in bodies]
def get_bodies_names():
    """Return the ``<body>_x`` / ``<body>_y`` names for the six tracked bodies."""
    bodies = ('head', 'torso', 'toes_left', 'toes_right', 'talus_left', 'talus_right')
    names = []
    for body in bodies:
        names.append(body + '_x')
        names.append(body + '_y')
    return names
def get_names_obstacles():
    """Return the body names for which obstacle distances are computed."""
    feet = []
    for part in ('toes', 'talus'):
        for side in ('left', 'right'):
            feet.append(part + '_' + side)
    return feet
def calculate_velocity(cur, prev):
    """Return ``100 * (cur - prev)``, or zeros shaped like ``cur`` when ``prev`` is None."""
    if prev is not None:
        return 100.*(cur - prev)
    return np.zeros_like(cur)
def _get_pattern_idxs(lst, pattern):
idxs = [i for i, x in enumerate(lst) if pattern in x]
return idxs
class State(object):
    """Convert raw environment observations into agent state vectors.

    ``process`` selects the state components, tracks the obstacles seen so
    far, builds obstacle features according to ``obstacles_mode`` and can
    fill in body coordinates that did not change since the previous step
    (``predict_bodies``).

    obstacles_mode:
        'exclude'     - only an "obstacle visible" flag is produced
        'standard'    - clipped raw obstacle triple plus the flag
        'grid'        - obstacle heights on a grid around ``mass_x`` plus the flag
        'bodies_dist' - per-body distances to up to three obstacles plus the flag
    """

    def __init__(self, obstacles_mode='bodies_dist', obst_grid_dist=1,
                 grid_points=100, predict_bodies=True, add_step=True, osb_first=False):
        assert obstacles_mode in ['exclude', 'grid', 'bodies_dist', 'standard']
        # Indices of the raw observation that are kept (pelvis2_* is dropped).
        self.state_idxs = [i for i, n in enumerate(get_state_names(True, True)) if n not in ['pelvis2_x', 'pelvis2_y']]
        self.state_names = get_state_names()
        self.step = 0
        self.add_step = add_step
        self.osb_first = osb_first
        self.obstacles_mode = obstacles_mode
        self.obstacles = OrderedDict()
        # Previous raw / predicted states; populated by process().
        self.prev_orig = None
        self.prev_pred = None
        self.obst_names = []
        if obstacles_mode == 'standard':
            self.obst_names = ['obst_dist', 'obst_y', 'obst_r']
        elif obstacles_mode == 'grid':
            self.obst_names = ['obst_grid_{}'.format(i) for i in range(grid_points)]
            self.obst_grid_dist = obst_grid_dist
            self.obst_grid_points = grid_points
            self.obst_grid_size = obst_grid_dist * 2 / grid_points
        elif obstacles_mode == 'bodies_dist':
            self._obst_names = get_names_obstacles()
            for i in range(3):
                for n in self._obst_names:
                    self.obst_names.append('{}_{}_obst_x_start'.format(n, i))
                    self.obst_names.append('{}_{}_obst_x_end'.format(n, i))
                    self.obst_names.append('{}_{}_obst_y'.format(n, i))
        # Every mode ends its obstacle features with the visibility flag.
        self.obst_names.append('is_obstacle')
        if self.add_step:
            self.state_names.append('step')
        self.predict_bodies = predict_bodies
        self.bodies_idxs_x = [self.state_names.index(n) for n in get_bodies_names() if n.endswith('_x')]
        self.bodies_idxs_y = [self.state_names.index(n) for n in get_bodies_names() if n.endswith('_y')]
        self.bodies_idxs = self.bodies_idxs_x + self.bodies_idxs_y
        self.mass_x_idx = self.state_names.index('mass_x')
        self.mass_y_idx = self.state_names.index('mass_y')
        self.state_names_out = self.state_names
        self._set_left_right()

    def _set_left_right(self):
        """Cache the indices of the '_left' and '_right' state components."""
        self.left_idxs = _get_pattern_idxs(self.state_names, '_left')
        self.right_idxs = _get_pattern_idxs(self.state_names, '_right')

    def reset(self):
        """Forget the step counter, previous states and known obstacles."""
        self.step = 0
        self.prev_orig = None
        self.prev_pred = None
        self.obstacles = OrderedDict()

    def _predict_bodies(self, state):
        """Return a copy of ``state`` with unchanged body coordinates extrapolated.

        After the first step, any body coordinate equal to its previous raw
        value is replaced by the previous predicted value shifted by the
        change of the centre of mass along the same axis.
        """
        state = np.copy(state)
        if self.step > 0:
            def update_bodies(cur, prev_orig, prev_pred, d):
                flt = cur == prev_orig
                cur[flt] = prev_pred[flt] + d

            # does not matter orig or pred
            dx = state[self.mass_x_idx] - self.prev_orig[self.mass_x_idx]
            dy = state[self.mass_y_idx] - self.prev_orig[self.mass_y_idx]
            cur_bodies_x = state[self.bodies_idxs_x]
            cur_bodies_y = state[self.bodies_idxs_y]
            # need for filter
            prev_orig_bodies_x = self.prev_orig[self.bodies_idxs_x]
            prev_orig_bodies_y = self.prev_orig[self.bodies_idxs_y]
            # need for updating
            prev_pred_bodies_x = self.prev_pred[self.bodies_idxs_x]
            prev_pred_bodies_y = self.prev_pred[self.bodies_idxs_y]
            update_bodies(cur_bodies_x, prev_orig_bodies_x, prev_pred_bodies_x, dx)
            update_bodies(cur_bodies_y, prev_orig_bodies_y, prev_pred_bodies_y, dy)
            state[self.bodies_idxs_x] = cur_bodies_x
            state[self.bodies_idxs_y] = cur_bodies_y
        return state

    def _add_obstacle(self, state):
        """Remember the obstacle described by the last three entries of ``state``.

        An obstacle distance of 100 means "no obstacle". Otherwise the
        distance is made absolute by adding ``state[1]`` (pelvis x) and the
        obstacle is stored under its rounded absolute x position.
        """
        import warnings

        pelvis_x = state[1]
        obstacle_x = state[-3]
        if obstacle_x != 100:
            obstacle_x += pelvis_x
            key = round(obstacle_x, 5)
            if key not in self.obstacles:
                self.obstacles[key] = [obstacle_x, state[-2], state[-1]]
                if len(self.obstacles) > 3:
                    # The bodies_dist features only cover three obstacles.
                    warnings.warn('more than 3 obstacles')

    def _get_obstacle_state_reward(self, state):
        """Return ``(obstacle_features, obstacle_reward)`` for ``state``.

        The feature layout depends on ``self.obstacles_mode``; the last
        feature is always the "obstacle visible" flag. The reward is non-zero
        (-0.5) only in 'bodies_dist' mode, when a tracked body lies within
        the margin around a known obstacle.
        """
        is_obst = float(state[-3] != 100)
        if self.obstacles_mode == 'exclude':
            return [is_obst], 0.
        elif self.obstacles_mode == 'standard':
            if not is_obst:
                return [-1., 0., 0., is_obst], 0.
            obst_features = np.clip(state[-3:], -10., 10.)
            return np.append(obst_features, is_obst), 0.
        elif self.obstacles_mode == 'grid':
            mass_x = state[self.state_names.index('mass_x')]
            obst_grid = np.zeros(self.obst_grid_points)
            for obst_x, obst_y, obst_r in self.obstacles.values():
                obst_h = obst_y + obst_r
                obst_left = int(np.ceil((obst_x - mass_x - obst_r) / self.obst_grid_size) + self.obst_grid_points // 2)
                obst_right = int(np.ceil((obst_x - mass_x + obst_r) / self.obst_grid_size) + self.obst_grid_points // 2)
                obst_left = max(obst_left, 0)
                obst_right = max(obst_right, -1)
                obst_grid[obst_left:obst_right + 1] = obst_h
            obst_features = np.append(obst_grid, is_obst)
            return obst_features, 0
        else:
            obst_state = []
            obst_reward = 0
            # Materialise once: dict views are not indexable on Python 3.
            known_obstacles = list(self.obstacles.values())
            for i in range(3):
                if i >= len(known_obstacles):
                    # Placeholder features for a missing obstacle.
                    for n in self._obst_names:
                        body_y = state[self.state_names.index(n + '_y')]
                        obst_state.extend([10, 10, body_y])
                else:
                    obst_x, obst_y, obst_r = known_obstacles[i]
                    obst_h = obst_y + obst_r
                    obst_x_start = obst_x - obst_r
                    obst_x_end = obst_x + obst_r
                    for n in self._obst_names:
                        body_x = state[self.state_names.index(n + '_x')]
                        body_y = state[self.state_names.index(n + '_y')]
                        obst_state.append(obst_x_start - body_x)
                        obst_state.append(obst_x_end - body_x)
                        obst_state.append(body_y - obst_h)
                        if obst_reward >= 0 and body_x >= (obst_x_start - obst_r/2) \
                                and (body_x <= obst_x_end+obst_r/2) and (obst_h + obst_r/2) >= body_y:
                            obst_reward = -0.5
            obst_state.append(is_obst)
            return np.asarray(obst_state), obst_reward

    def process(self, state):
        """Transform one raw observation.

        Returns ``((state, obstacle_features), obstacle_reward)`` and advances
        the internal step counter.
        """
        state = np.asarray(state)
        state = state[self.state_idxs]
        if self.osb_first and self.step == 0:
            # Hide the obstacle on the very first step.
            state[-3:] = [100, 0, 0]
        self._add_obstacle(state)
        obst_state, obst_reward = self._get_obstacle_state_reward(state)
        state_orig = state[:-3]
        if self.add_step:
            state_orig = np.append(state_orig, 1. * self.step / 1000)
        if self.predict_bodies:
            state = self._predict_bodies(state_orig)
        else:
            state = state_orig
        self.step += 1
        self.prev_orig = state_orig
        self.prev_pred = np.copy(state)
        return (state, obst_state), obst_reward

    def flip_state(self, state, copy=True):
        """Swap left/right components of a single 1-D state vector."""
        assert np.ndim(state) == 1
        state = np.asarray(state)
        state = self.flip_states(state.reshape(1, -1), copy)
        return state.ravel()

    def flip_states(self, states, copy=True):
        """Swap left/right components of a 2-D batch of states.

        With ``copy=False`` the swap is written into ``states`` itself.
        """
        assert np.ndim(states) == 2
        states = np.asarray(states)
        if copy:
            states = states.copy()
        left = states[:, self.left_idxs]
        right = states[:, self.right_idxs]
        states[:, self.left_idxs] = right
        states[:, self.right_idxs] = left
        return states

    @property
    def state_size(self):
        """Number of output state names plus obstacle feature names."""
        return len(self.state_names_out) + len(self.obst_names)
class StateVel(State):
    """``State`` that appends finite-difference velocities of selected components.

    For every name in ``vel_states`` a ``<name>_vel`` entry is added; its
    value is computed by ``calculate_velocity`` from the current and the
    previous value of that component.
    """

    def __init__(self, vel_states=get_bodies_names(), obstacles_mode='bodies_dist',
                 add_step=True, predict_bodies=True, osb_first=False):
        super(StateVel, self).__init__(obstacles_mode=obstacles_mode,
                                       predict_bodies=predict_bodies,
                                       add_step=add_step,
                                       osb_first=osb_first)
        self.vel_idxs = [self.state_names.index(name) for name in vel_states]
        self.prev_vals = None
        # Extended in place, so existing references see the new names too.
        self.state_names.extend(name + '_vel' for name in vel_states)
        self.state_names_out = self.state_names
        # left right idxs
        self._set_left_right()

    def reset(self):
        super(StateVel, self).reset()
        self.prev_vals = None

    def process(self, state):
        """Return ``(state + velocities + obstacle features, obstacle_reward)``."""
        (base_state, obst_state), obst_reward = super(StateVel, self).process(state)
        current = base_state[self.vel_idxs]
        velocities = calculate_velocity(current, self.prev_vals)
        self.prev_vals = current
        full_state = np.concatenate((base_state, velocities, obst_state))
        return full_state, obst_reward
class StateVelCentr(State):
    """``State`` with velocities and coordinates centred on a reference component.

    The value of ``centr_state`` is subtracted from every component listed in
    ``states_to_center``; velocities of ``vel_states`` are computed either
    before or after that centring (``vel_before_centr``). With
    ``exclude_centr`` the reference component is removed from the output.
    """

    def __init__(self, centr_state='pelvis_x', vel_states=get_bodies_names(),
                 states_to_center=get_names_to_center('pelvis'),
                 vel_before_centr=True, obstacles_mode='bodies_dist',
                 exclude_centr=False, predict_bodies=True,
                 add_step=True, osb_first=False):
        super(StateVelCentr, self).__init__(obstacles_mode=obstacles_mode,
                                            predict_bodies=predict_bodies,
                                            add_step=add_step,
                                            osb_first=osb_first)
        # center
        self.centr_idx = self.state_names.index(centr_state)
        self.states_to_center = [self.state_names.index(name) for name in states_to_center]
        # velocities
        self.prev_vals = None
        self.vel_idxs = [self.state_names.index(name) for name in vel_states]
        self.vel_before_centr = vel_before_centr
        # Extended in place, so existing references see the new names too.
        self.state_names.extend(name + '_vel' for name in vel_states)
        self.exclude_centr = exclude_centr
        if self.exclude_centr:
            head = self.state_names[:max(0, self.centr_idx)]
            tail = self.state_names[self.centr_idx + 1:]
            self.state_names_out = head + tail
        else:
            self.state_names_out = self.state_names
        # left right idxs
        self._set_left_right()

    def _set_left_right(self):
        """Cache left/right indices relative to the *output* state names."""
        output_names = self.state_names_out
        self.left_idxs = _get_pattern_idxs(output_names, '_left')
        self.right_idxs = _get_pattern_idxs(output_names, '_right')

    def reset(self):
        super(StateVelCentr, self).reset()
        self.prev_vals = None

    def process(self, state):
        """Return ``(centred state + velocities + obstacle features, obstacle_reward)``."""
        (state, obst_state), obst_reward = super(StateVelCentr, self).process(state)
        if not self.vel_before_centr:
            state[self.states_to_center] -= state[self.centr_idx]
        cur_vals = state[self.vel_idxs]
        vel = calculate_velocity(cur_vals, self.prev_vals)
        self.prev_vals = cur_vals
        if self.vel_before_centr:
            state[self.states_to_center] -= state[self.centr_idx]
        if self.exclude_centr:
            before = state[:max(0, self.centr_idx)]
            after = state[self.centr_idx+1:]
            state = np.concatenate([before, after])
        state = np.concatenate((state, vel, obst_state))
        return state, obst_reward
+37
View File
@@ -0,0 +1,37 @@
import torch
from torch.autograd import Variable
# True when a CUDA device is available at import time.
USE_CUDA = torch.cuda.is_available()
# Default float tensor type: the CUDA variant when CUDA is available.
FLOAT = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
def to_numpy(var):
    """Return the data of ``var`` as a NumPy array, moving it to the CPU when CUDA is in use."""
    if USE_CUDA:
        return var.cpu().data.numpy()
    return var.data.numpy()
def to_tensor(ndarray, volatile=False, requires_grad=False, dtype=FLOAT):
    """Wrap a NumPy array in a ``Variable`` and cast it to ``dtype``.

    ``volatile`` and ``requires_grad`` are forwarded to the ``Variable``
    constructor; ``dtype`` defaults to the module-level ``FLOAT`` type.
    """
    tensor = torch.from_numpy(ndarray)
    variable = Variable(tensor, volatile=volatile, requires_grad=requires_grad)
    return variable.type(dtype)
def soft_update(target, source, tau):
    """Move ``target``'s parameters towards ``source``'s by a factor ``tau``.

    Each target parameter becomes ``target * (1 - tau) + source * tau``;
    parameters are paired in ``parameters()`` order.
    """
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        blended = target_param.data * (1.0 - tau) + source_param.data * tau
        target_param.data.copy_(blended)
def hard_update(target, source):
    """Copy every parameter of ``source`` into ``target`` (paired in ``parameters()`` order)."""
    pairs = zip(target.parameters(), source.parameters())
    for target_param, source_param in pairs:
        target_param.data.copy_(source_param.data)
# Registry mapping an activation name to its torch.nn module class
# (classes, not instances).
activations = {
    "relu": torch.nn.ReLU,
    "elu": torch.nn.ELU,
    "leakyrelu": torch.nn.LeakyReLU,
    "selu": torch.nn.SELU,
    "sigmoid": torch.nn.Sigmoid,
    "tanh": torch.nn.Tanh
}
View File
+70
View File
@@ -0,0 +1,70 @@
import os
import torch
import copy
from multiprocessing import Value
from common.misc_util import str2params, create_if_need
from common.env_wrappers import create_env
from common.torch_util import activations, hard_update
from ddpg.model import create_model, create_act_update_fns, train_multi_thread
from ddpg.train import parse_args
def debug(args, model_fn, act_update_fns, multi_thread):
    """Run a single training worker in the current process.

    Args:
        args: parsed command-line namespace; it is completed in place with
            the environment dimensions, parsed layer sizes, activation
            classes and ``thread = 0``.
        model_fn: callable ``args -> (actor, critic)``.
        act_update_fns: callable building ``(act_fn, update_fn, save_fn)``
            from the four networks and ``args``.
        multi_thread: worker function called as ``multi_thread(actor, critic,
            target_actor, target_critic, args, act_update_fns, best_reward)``.
    """
    create_if_need(args.logdir)
    env = create_env(args)
    if args.flip_state_action and hasattr(env, "state_transform"):
        args.flip_states = env.state_transform.flip_states
    args.n_action = env.action_space.shape[0]
    args.n_observation = env.observation_space.shape[0]
    args.actor_layers = str2params(args.actor_layers)
    args.critic_layers = str2params(args.critic_layers)
    args.actor_activation = activations[args.actor_activation]
    args.critic_activation = activations[args.critic_activation]

    actor, critic = model_fn(args)
    if args.restore_actor_from is not None:
        actor.load_state_dict(torch.load(args.restore_actor_from))
    if args.restore_critic_from is not None:
        critic.load_state_dict(torch.load(args.restore_critic_from))
    actor.train()
    critic.train()
    actor.share_memory()
    critic.share_memory()

    target_actor = copy.deepcopy(actor)
    target_critic = copy.deepcopy(critic)
    hard_update(target_actor, actor)
    hard_update(target_critic, critic)
    target_actor.train()
    # Put the *target* critic in training mode (previously `critic` was
    # switched a second time and the target was skipped).
    target_critic.train()
    target_actor.share_memory()
    target_critic.share_memory()

    _, _, save_fn = act_update_fns(actor, critic, target_actor, target_critic, args)
    args.thread = 0
    best_reward = Value("f", 0.0)
    multi_thread(actor, critic, target_actor, target_critic, args, act_update_fns, best_reward)
    save_fn()
if __name__ == '__main__':
    # Restrict OpenMP and torch to one thread each before training starts.
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.set_num_threads(1)
    args = parse_args()
    debug(
        args,
        model_fn=create_model,
        act_update_fns=create_act_update_fns,
        multi_thread=train_multi_thread)
+477
View File
@@ -0,0 +1,477 @@
import random
import numpy as np
import torch
import queue as py_queue
import time
import torch.nn as nn
from pprint import pprint
from ddpg.nets import Actor, Critic
from common.torch_util import to_numpy, to_tensor, soft_update
from common.misc_util import create_if_need, set_global_seeds
from common.logger import Logger
from common.buffers import create_buffer
from common.loss import create_loss, create_decay_fn
from common.env_wrappers import create_env
from common.random_process import create_random_process
def create_model(args):
    """Build the actor and critic networks described by ``args``.

    Both networks take ``args.n_observation`` and ``args.n_action`` plus their
    own layer sizes, activation, layer-norm and parameter-noise settings; the
    actor additionally ends with a ``Tanh`` activation. Both are
    pretty-printed before ``(actor, critic)`` is returned.
    """
    actor_options = dict(
        activation=args.actor_activation,
        layer_norm=args.actor_layer_norm,
        parameters_noise=args.actor_parameters_noise,
        parameters_noise_factorised=args.actor_parameters_noise_factorised,
        last_activation=nn.Tanh)
    actor = Actor(
        args.n_observation, args.n_action, args.actor_layers, **actor_options)

    critic_options = dict(
        activation=args.critic_activation,
        layer_norm=args.critic_layer_norm,
        parameters_noise=args.critic_parameters_noise,
        parameters_noise_factorised=args.critic_parameters_noise_factorised)
    critic = Critic(
        args.n_observation, args.n_action, args.critic_layers, **critic_options)

    for network in (actor, critic):
        pprint(network)
    return actor, critic
def create_act_update_fns(actor, critic, target_actor, target_critic, args):
    """Build the act / update / save closures for a DDPG agent.

    Args:
        actor, critic: networks being trained.
        target_actor, target_critic: slowly updated target networks.
        args: namespace providing ``actor_lr``, ``critic_lr``, ``gamma``,
            ``tau``, ``grad_clip``, ``n_action``, ``logdir``, the loss
            settings read by ``create_loss`` and, optionally, ``flip_states``.

    Returns:
        ``(act_fn, update_fn, save_fn)``.
    """
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    criterion = create_loss(args)
    low_action_boundary = -1.
    high_action_boundary = 1.

    def act_fn(observation, noise=0):
        """Return the actor's action for one observation, plus ``noise``, clipped to [-1, 1]."""
        nonlocal actor
        action = to_numpy(actor(to_tensor(np.array([observation], dtype=np.float32)))).squeeze(0)
        action += noise
        action = np.clip(action, low_action_boundary, high_action_boundary)
        return action

    def update_fn(
            observations, actions, rewards, next_observations, dones, weights,
            actor_lr=1e-4, critic_lr=1e-3):
        """Run one critic update, one actor update and a soft target update.

        Returns ``(metrics, info)`` where ``metrics`` holds the value and
        policy losses and ``info["td_error"]`` the TD errors recomputed after
        the update.
        """
        nonlocal actor, critic, target_actor, target_critic, actor_optim, critic_optim

        if hasattr(args, "flip_states"):
            # Augment the batch with its left/right mirrored copy: states are
            # flipped by args.flip_states, actions by swapping their halves.
            # NOTE(review): `weights` is not duplicated here - confirm the
            # caller passes weights that match the doubled batch.
            observations_flip = args.flip_states(observations)
            next_observations_flip = args.flip_states(next_observations)
            actions_flip = np.zeros_like(actions)
            actions_flip[:, :args.n_action // 2] = actions[:, args.n_action // 2:]
            actions_flip[:, args.n_action // 2:] = actions[:, :args.n_action // 2]
            observations = np.concatenate((observations, observations_flip))
            actions = np.concatenate((actions, actions_flip))
            rewards = np.tile(rewards.ravel(), 2)
            next_observations = np.concatenate((next_observations, next_observations_flip))
            dones = np.tile(dones.ravel(), 2)

        # Builtin bool instead of the np.bool alias, which newer NumPy
        # releases no longer provide.
        dones = dones[:, None].astype(bool)
        rewards = rewards[:, None].astype(np.float32)
        # After inversion `dones` is 1.0 for non-terminal transitions.
        dones = to_tensor(np.invert(dones).astype(np.float32))
        rewards = to_tensor(rewards)
        weights = to_tensor(weights, requires_grad=False)

        next_v_values = target_critic(
            to_tensor(next_observations, volatile=True),
            target_actor(to_tensor(next_observations, volatile=True)),
        )
        next_v_values.volatile = False
        reward_predicted = dones * args.gamma * next_v_values
        td_target = rewards + reward_predicted

        # Critic update
        critic.zero_grad()
        v_values = critic(to_tensor(observations), to_tensor(actions))
        value_loss = criterion(v_values, td_target, weights=weights)
        value_loss.backward()
        torch.nn.utils.clip_grad_norm(critic.parameters(), args.grad_clip)
        for param_group in critic_optim.param_groups:
            param_group["lr"] = critic_lr
        critic_optim.step()

        # Actor update
        actor.zero_grad()
        policy_loss = -critic(
            to_tensor(observations),
            actor(to_tensor(observations))
        )
        policy_loss = torch.mean(policy_loss * weights)
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm(actor.parameters(), args.grad_clip)
        for param_group in actor_optim.param_groups:
            param_group["lr"] = actor_lr
        actor_optim.step()

        # Target update
        soft_update(target_actor, actor, args.tau)
        soft_update(target_critic, critic, args.tau)

        metrics = {
            "value_loss": value_loss,
            "policy_loss": policy_loss
        }
        td_v_values = critic(
            to_tensor(observations, volatile=True, requires_grad=False),
            to_tensor(actions, volatile=True, requires_grad=False))
        td_error = td_target - td_v_values
        info = {
            "td_error": to_numpy(td_error)
        }
        return metrics, info

    def save_fn(episode=None):
        """Save all four state dicts under ``args.logdir`` (or its ``episode_<n>`` subfolder)."""
        nonlocal actor, critic
        if episode is None:
            save_path = args.logdir
        else:
            save_path = "{}/episode_{}".format(args.logdir, episode)
            create_if_need(save_path)
        torch.save(actor.state_dict(), "{}/actor_state_dict.pkl".format(save_path))
        torch.save(critic.state_dict(), "{}/critic_state_dict.pkl".format(save_path))
        torch.save(target_actor.state_dict(), "{}/target_actor_state_dict.pkl".format(save_path))
        torch.save(target_critic.state_dict(), "{}/target_critic_state_dict.pkl".format(save_path))

    return act_fn, update_fn, save_fn
def train_multi_thread(actor, critic, target_actor, target_critic, args, prepare_fn, best_reward):
    """Run one worker that plays episodes and trains on its own replay buffer.

    The worker seeds itself from ``args.seed`` and ``args.thread``, redirects
    ``args.logdir`` to ``<logdir>/thread_<thread>`` (the attribute is
    overwritten in place) and loops until ``args.max_episodes`` episodes were
    played or the ``args.max_train_days`` wall-clock budget is exceeded.

    Args:
        actor, critic, target_actor, target_critic: networks, forwarded
            unchanged to ``prepare_fn``.
        args: run configuration namespace.
        prepare_fn: callable returning ``(act_fn, update_fn, save_fn)``.
        best_reward: value shared between workers holding the best episode
            reward so far. It must provide ``.value`` and ``.get_lock()``
            (a ``multiprocessing.Value`` created with the default recursive
            lock does).

    Raises:
        KeyboardInterrupt: always, after the final ``save_fn`` call.
    """
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)

    buffer = create_buffer(args)
    if args.prioritized_replay:
        beta_decay_fn = create_decay_fn(
            "linear",
            initial_value=args.prioritized_replay_beta0,
            final_value=1.0,
            max_step=args.max_episodes)

    env = create_env(args)
    random_process = create_random_process(args)

    actor_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.actor_lr,
        final_value=args.actor_lr_end,
        max_step=args.max_episodes)
    critic_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.critic_lr,
        final_value=args.critic_lr_end,
        max_step=args.max_episodes)

    # Each worker draws its own cycle length so exploration schedules differ.
    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 0
    step = 0
    start_time = time.time()
    while episode < args.max_episodes:
        # The environment is rebuilt every 100 episodes (including episode 0).
        if episode % 100 == 0:
            env = create_env(args)
        seed = random.randrange(2 ** 32 - 2)

        actor_lr = actor_learning_rate_decay_fn(episode)
        critic_lr = critic_learning_rate_decay_fn(episode)
        # Clamp the schedule output into [final_epsilon, initial_epsilon].
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))

        episode_metrics = {
            "value_loss": 0.0,
            "policy_loss": 0.0,
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        done = False

        while not done:
            action = act_fn(observation, noise=epsilon*random_process.sample())
            next_observation, reward, done, _ = env.step(action)

            buffer.add(observation, action, reward, next_observation, done)
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            # Updates start only once the buffer holds args.train_steps items.
            if len(buffer) >= args.train_steps:
                if args.prioritized_replay:
                    (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
                     weights, batch_idxes) = \
                        buffer.sample(batch_size=args.batch_size, beta=beta_decay_fn(episode))
                else:
                    (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones) = \
                        buffer.sample(batch_size=args.batch_size)
                    weights, batch_idxes = np.ones_like(tr_rewards), None

                step_metrics, step_info = update_fn(
                    tr_observations, tr_actions, tr_rewards,
                    tr_next_observations, tr_dones,
                    weights, actor_lr, critic_lr)

                if args.prioritized_replay:
                    new_priorities = np.abs(step_info["td_error"]) + 1e-6
                    buffer.update_priorities(batch_idxes, new_priorities)

                for key, value in step_metrics.items():
                    value = to_numpy(value)[0]
                    episode_metrics[key] += value

            observation = next_observation

        episode += 1

        episode_reward = episode_metrics["reward"]
        new_best = None
        if episode_reward > 15.0 * args.reward_scale:
            # Compare-and-set under the shared lock: without it two workers
            # could interleave the read and the write and a lower reward
            # could overwrite a higher one.
            with best_reward.get_lock():
                if episode_reward > best_reward.value:
                    best_reward.value = episode_reward
                    new_best = best_reward.value
        if new_best is not None:
            # Logging and checkpointing happen outside the lock.
            logger.scalar_summary("best reward", new_best, episode)
            save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for key, value in episode_metrics.items():
            # Loss entries are sums over the episode; log the per-step mean.
            value = value if "loss" not in key else value / episode_metrics["step"]
            logger.scalar_summary(key, value, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)
        logger.scalar_summary("actor lr", actor_lr, episode)
        logger.scalar_summary("critic lr", critic_lr, episode)

        if episode % args.save_step == 0:
            save_fn(episode)

        # Wall-clock budget exceeded: push the counter past the loop bound.
        if elapsed_time > 86400 * args.max_train_days:
            episode = args.max_episodes + 1

    save_fn(episode)

    raise KeyboardInterrupt
def train_single_thread(
        actor, critic, target_actor, target_critic, args, prepare_fn,
        global_episode, global_update_step, episodes_queue):
    """Run one trainer worker that learns from episodes produced by players.

    The worker drains ``episodes_queue`` into its own replay buffer and, once
    the buffer holds at least ``args.train_steps`` transitions, performs one
    ``update_fn`` call per loop iteration; otherwise it sleeps for a second.
    ``args.logdir`` is overwritten in place with ``<logdir>/thread_<thread>``.

    Args:
        actor, critic, target_actor, target_critic: networks, forwarded
            unchanged to ``prepare_fn``.
        args: run configuration namespace.
        prepare_fn: callable returning ``(act_fn, update_fn, save_fn)``;
            the ``act_fn`` is not used here.
        global_episode: shared counter of played episodes (read only here).
        global_update_step: shared counter of update steps. It must provide
            ``.value`` and ``.get_lock()`` (a ``multiprocessing.Value``
            created with the default recursive lock does).
        episodes_queue: queue of episodes; each item is an iterable of
            ``(observation, action, reward, next_observation, done)`` tuples.

    Raises:
        KeyboardInterrupt: always, after the final ``save_fn`` call.
    """
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    _, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)

    buffer = create_buffer(args)
    if args.prioritized_replay:
        beta_decay_fn = create_decay_fn(
            "linear",
            initial_value=args.prioritized_replay_beta0,
            final_value=1.0,
            max_step=args.max_update_steps)

    actor_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.actor_lr,
        final_value=args.actor_lr_end,
        max_step=args.max_update_steps)
    critic_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.critic_lr,
        final_value=args.critic_lr_end,
        max_step=args.max_update_steps)

    update_step = 0
    # Starts at 1 so the "updates per example" ratio never divides by zero.
    received_examples = 1

    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
            and global_update_step.value < args.max_update_steps * args.num_train_threads:
        # Clamp both learning rates between their end and start values.
        actor_lr = actor_learning_rate_decay_fn(update_step)
        critic_lr = critic_learning_rate_decay_fn(update_step)
        actor_lr = min(args.actor_lr, max(args.actor_lr_end, actor_lr))
        critic_lr = min(args.critic_lr, max(args.critic_lr_end, critic_lr))

        # Move every episode currently waiting in the queue into the buffer.
        while True:
            try:
                replay = episodes_queue.get_nowait()
                for (observation, action, reward, next_observation, done) in replay:
                    buffer.add(observation, action, reward, next_observation, done)
                received_examples += len(replay)
            except py_queue.Empty:
                break

        if len(buffer) >= args.train_steps:
            if args.prioritized_replay:
                beta = beta_decay_fn(update_step)
                beta = min(1.0, max(args.prioritized_replay_beta0, beta))
                (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
                 weights, batch_idxes) = \
                    buffer.sample(
                        batch_size=args.batch_size,
                        beta=beta)
            else:
                (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones) = \
                    buffer.sample(batch_size=args.batch_size)
                weights, batch_idxes = np.ones_like(tr_rewards), None

            step_metrics, step_info = update_fn(
                tr_observations, tr_actions, tr_rewards,
                tr_next_observations, tr_dones,
                weights, actor_lr, critic_lr)

            update_step += 1
            # ``+=`` on a shared Value is a separate read and write; without
            # the lock, concurrent trainer processes can lose increments.
            with global_update_step.get_lock():
                global_update_step.value += 1

            if args.prioritized_replay:
                new_priorities = np.abs(step_info["td_error"]) + 1e-6
                buffer.update_priorities(batch_idxes, new_priorities)

            for key, value in step_metrics.items():
                value = to_numpy(value)[0]
                logger.scalar_summary(key, value, update_step)

            logger.scalar_summary("actor lr", actor_lr, update_step)
            logger.scalar_summary("critic lr", critic_lr, update_step)

            if update_step % args.save_step == 0:
                save_fn(update_step)
        else:
            # Not enough data yet: wait for the players to fill the queue.
            time.sleep(1)

        logger.scalar_summary("buffer size", len(buffer), global_episode.value)
        logger.scalar_summary(
            "updates per example",
            update_step * args.batch_size / received_examples,
            global_episode.value)

    save_fn(update_step)

    raise KeyboardInterrupt
def play_single_thread(
        actor, critic, target_actor, target_critic, args, prepare_fn,
        global_episode, global_update_step, episodes_queue,
        best_reward):
    """Run one player worker that only collects episodes for the trainers.

    Each finished episode is pushed to ``episodes_queue`` as a list of
    ``(observation, action, reward, next_observation, done)`` tuples.
    ``args.logdir`` is overwritten in place with ``<logdir>/thread_<thread>``.

    Args:
        actor, critic, target_actor, target_critic: networks, forwarded
            unchanged to ``prepare_fn``.
        args: run configuration namespace.
        prepare_fn: callable returning ``(act_fn, update_fn, save_fn)``;
            the ``update_fn`` is not used here.
        global_episode: shared counter of played episodes, incremented here.
        global_update_step: shared counter of update steps (read only here).
        episodes_queue: queue receiving the finished episodes.
        best_reward: shared best episode reward.

    ``global_episode`` and ``best_reward`` must provide ``.value`` and
    ``.get_lock()`` (a ``multiprocessing.Value`` created with the default
    recursive lock does).

    Raises:
        KeyboardInterrupt: always, once the loop has ended.
    """
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, _, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)

    env = create_env(args)
    random_process = create_random_process(args)

    # Each player draws its own cycle length so exploration schedules differ.
    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 1
    step = 0
    start_time = time.time()
    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
            and global_update_step.value < args.max_update_steps * args.num_train_threads:
        # The environment is rebuilt every 100 episodes.
        if episode % 100 == 0:
            env = create_env(args)
        seed = random.randrange(2 ** 32 - 2)

        # Clamp the schedule output into [final_epsilon, initial_epsilon].
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))

        episode_metrics = {
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        done = False

        replay = []
        while not done:
            action = act_fn(observation, noise=epsilon * random_process.sample())
            next_observation, reward, done, _ = env.step(action)

            replay.append((observation, action, reward, next_observation, done))
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            observation = next_observation

        episodes_queue.put(replay)

        episode += 1
        # ``+=`` on a shared Value is a separate read and write; without the
        # lock, concurrent player processes can lose increments.
        with global_episode.get_lock():
            global_episode.value += 1

        episode_reward = episode_metrics["reward"]
        new_best = None
        # Compare-and-set under the shared lock so that a concurrent player
        # cannot overwrite a higher reward with a lower one.
        with best_reward.get_lock():
            if episode_reward > best_reward.value:
                best_reward.value = episode_reward
                new_best = best_reward.value
        if new_best is not None:
            # Logging and checkpointing happen outside the lock.
            logger.scalar_summary("best reward", new_best, episode)
            if episode_reward > 15.0 * args.reward_scale:
                save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for key, value in episode_metrics.items():
            logger.scalar_summary(key, value, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)

        # Wall-clock budget exceeded: push the shared episode counter past
        # the loop bound, which also ends the loops reading it.
        if elapsed_time > 86400 * args.max_train_days:
            global_episode.value = args.max_episodes * (args.num_threads - args.num_train_threads) + 1

    raise KeyboardInterrupt
+90
View File
@@ -0,0 +1,90 @@
import numpy as np
import torch
import torch.nn as nn
from common.nets import LinearNet
from common.modules.NoisyLinear import NoisyLinear
def fanin_init(size, fanin=None):
    """Return a new tensor of shape ``size`` filled from U(-v, v), v = 1/sqrt(fanin).

    Args:
        size: target shape, either a ``torch.Size`` or any sequence of ints.
        fanin: value used for the bound; when falsy (``None`` or 0) the
            first entry of ``size`` is used instead.

    Returns:
        A freshly allocated ``torch.Tensor`` with the requested shape.
    """
    # NOTE(review): for an ``nn.Linear`` weight, shaped
    # (out_features, in_features), ``size[0]`` is the number of outputs.
    # Left as is to keep the existing initialisation; confirm the intent.
    fanin = fanin or size[0]
    v = 1. / np.sqrt(fanin)
    # Unpack the shape: ``torch.Tensor(seq)`` treats a plain tuple or list as
    # data instead of a shape, which silently produced a wrongly shaped
    # result for anything but ``torch.Size``.
    return torch.Tensor(*size).uniform_(-v, v)
class Actor(nn.Module):
    """Policy network: a feature ``LinearNet`` followed by a one-layer policy head.

    Args:
        n_observation: size of the observation vector.
        n_action: size of the action vector.
        layers: list of hidden layer sizes of the feature network.
        activation: activation class passed to the feature network.
        layer_norm: passed to the feature network as ``layer_norm``.
        parameters_noise: when true the feature network is built from
            ``NoisyLinear`` layers instead of ``nn.Linear``.
        parameters_noise_factorised: forwarded to ``NoisyLinear`` as
            ``factorised``.
        last_activation: activation class passed to the policy head.
        init_w: half-width of the uniform initialisation of the policy head.
    """

    def __init__(self, n_observation, n_action,
                 layers, activation=torch.nn.ELU,
                 layer_norm=False,
                 parameters_noise=False, parameters_noise_factorised=False,
                 last_activation=torch.nn.Tanh, init_w=3e-3):
        super(Actor, self).__init__()

        if parameters_noise:
            def linear_layer(x_in, x_out):
                return NoisyLinear(x_in, x_out, factorised=parameters_noise_factorised)
        else:
            linear_layer = nn.Linear

        self.feature_net = LinearNet(
            layers=[n_observation] + layers,
            activation=activation,
            layer_norm=layer_norm,
            linear_layer=linear_layer)
        self.policy_net = LinearNet(
            layers=[self.feature_net.output_shape, n_action],
            activation=last_activation,
            layer_norm=False
        )
        self.init_weights(init_w)

    def init_weights(self, init_w):
        """Initialise feature layers with ``fanin_init`` and the head with U(-init_w, init_w)."""
        for layer in self.feature_net.net:
            if isinstance(layer, nn.Linear):
                layer.weight.data = fanin_init(layer.weight.data.size())

        # The head must be taken from ``policy_net``. Iterating
        # ``feature_net`` a second time overwrote the fan-in initialisation
        # above and left the policy head at its default initialisation.
        for layer in self.policy_net.net:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-init_w, init_w)

    def forward(self, observation):
        """Map a batch of observations to actions."""
        x = observation
        x = self.feature_net.forward(x)
        x = self.policy_net.forward(x)
        return x
class Critic(nn.Module):
    """Q-network: a feature ``LinearNet`` over ``[observation, action]`` plus a scalar head.

    Args:
        n_observation: size of the observation vector.
        n_action: size of the action vector.
        layers: list of hidden layer sizes of the feature network.
        activation: activation class passed to the feature network.
        layer_norm: passed to the feature network as ``layer_norm``.
        parameters_noise: when true the feature network is built from
            ``NoisyLinear`` layers instead of ``nn.Linear``.
        parameters_noise_factorised: forwarded to ``NoisyLinear`` as
            ``factorised``.
        init_w: half-width of the uniform initialisation of the value head.
    """

    def __init__(self, n_observation, n_action,
                 layers, activation=torch.nn.ELU,
                 layer_norm=False,
                 parameters_noise=False, parameters_noise_factorised=False,
                 init_w=3e-3):
        super(Critic, self).__init__()

        def noisy_layer(x_in, x_out):
            return NoisyLinear(x_in, x_out, factorised=parameters_noise_factorised)

        linear_layer = noisy_layer if parameters_noise else nn.Linear

        input_size = n_observation + n_action
        self.feature_net = LinearNet(
            layers=[input_size] + layers,
            activation=activation,
            layer_norm=layer_norm,
            linear_layer=linear_layer)
        self.value_net = nn.Linear(self.feature_net.output_shape, 1)
        self.init_weights(init_w)

    def init_weights(self, init_w):
        """Initialise feature layers with ``fanin_init`` and the head with U(-init_w, init_w)."""
        linear_modules = [
            module for module in self.feature_net.net
            if isinstance(module, nn.Linear)
        ]
        for module in linear_modules:
            module.weight.data = fanin_init(module.weight.data.size())
        self.value_net.weight.data.uniform_(-init_w, init_w)

    def forward(self, observation, action):
        """Return the value estimate for each (observation, action) row."""
        joint_input = torch.cat((observation, action), dim=1)
        features = self.feature_net.forward(joint_input)
        return self.value_net.forward(features)
+186
View File
@@ -0,0 +1,186 @@
import os
import json
import argparse
import numpy as np
import pandas as pd
import torch
from pprint import pprint
from osim.env import RunEnv
from osim.http.client import Client
from common.misc_util import boolean_flag, query_yes_no
from common.env_wrappers import create_observation_handler, create_action_handler, create_env
from ddpg.train import str2params, activations
from ddpg.model import create_model, create_act_update_fns
# Base URL passed to the osim HTTP ``Client`` in ``submit``.
REMOTE_BASE = 'http://grader.crowdai.org:1729'
# Length of the all-zeros action vector given to the observation handler
# at the start of an episode.
ACTION_SHAPE = 18
# Environment seeds used by ``test``; indexed by ``episode % len(SEEDS)``.
SEEDS = [
    3834825972, 3049289152, 3538742899, 2904257823, 4011088434,
    2684066875, 781202090, 1691535473, 898088606, 1301477286
]
def parse_args():
    """Build the evaluation/submission command-line parser and parse the arguments."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Paths of the stored run arguments and network weights.
    for option in ('--restore-args-from', '--restore-actor-from', '--restore-critic-from'):
        parser.add_argument(option, type=str, default=None)

    for option, default in (('--max-obstacles', 3), ('--num-episodes', 1)):
        parser.add_argument(option, type=int, default=default)

    parser.add_argument('--token', type=str, default=None)

    for flag_name in ("visualize", "submit"):
        boolean_flag(parser, flag_name, default=False)

    return parser.parse_args()
def restore_args(args):
    """Copy stored run arguments from the JSON file onto ``args``.

    The file at ``args.restore_args_from`` is read and every key it holds is
    set as an attribute on ``args``, except the ones that belong to the
    current invocation (obstacle count and the three restore paths).

    Returns:
        The same ``args`` object, updated in place.
    """
    with open(args.restore_args_from, "r") as fin:
        stored = json.load(fin)

    keep_from_command_line = {
        "max_obstacles",
        "restore_args_from",
        "restore_actor_from",
        "restore_critic_from",
    }
    for name, stored_value in stored.items():
        if name in keep_from_command_line:
            continue
        setattr(args, name, stored_value)
    return args
def submit(actor, critic, args, act_update_fn):
    """Play episodes against the remote grader and optionally submit the result.

    Episodes are played through the HTTP ``Client`` until the client returns
    a falsy observation when the next environment is requested. Per-episode
    reward and step counts are summarised with ``DataFrame.describe`` and the
    run is submitted only after an interactive confirmation.
    """
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    client = Client(REMOTE_BASE)

    finished_episodes = []
    current = {
        "reward": 0.0,
        "step": 0,
    }

    observation_handler = create_observation_handler(args)
    action_handler = create_action_handler(args)
    observation = client.env_create(args.token)
    action = np.zeros(ACTION_SHAPE, dtype=np.float32)
    observation = observation_handler(observation, action)

    while True:
        print(current["reward"])
        action = act_fn(observation)
        remote_action = action_handler(action).tolist()
        observation, reward, done, _ = client.env_step(remote_action)

        current["reward"] += reward
        current["step"] += 1

        if not done:
            observation = observation_handler(observation, action)
            continue

        # Episode finished: store its metrics and start from fresh handlers.
        finished_episodes.append(current)
        current = {
            "reward": 0.0,
            "step": 0,
        }
        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)
        # NOTE(review): a new environment is requested with ``env_create``
        # after every finished episode; confirm against the osim HTTP client
        # whether ``env_reset`` is the intended call here.
        observation = client.env_create(args.token)
        if not observation:
            break
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(observation, action)

    df = pd.DataFrame(finished_episodes)
    pprint(df.describe())

    if query_yes_no("Submit?"):
        client.submit()
def test(actor, critic, args, act_update_fn):
    """Play ``args.num_episodes`` local episodes and print summary statistics.

    Every episode runs in a ``RunEnv`` reset with difficulty 2 and a seed
    taken from ``SEEDS`` (cycled by episode index). Reward and step count
    per episode are summarised with ``DataFrame.describe``.
    """
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)

    results = []
    for episode in range(args.num_episodes):
        stats = {
            "reward": 0.0,
            "step": 0,
        }

        # Fresh handlers for every episode.
        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)

        episode_seed = SEEDS[episode % len(SEEDS)]
        observation = env.reset(difficulty=2, seed=episode_seed)
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(observation, action)

        while True:
            print(stats["reward"])
            action = act_fn(observation)
            observation, reward, done, _ = env.step(action_handler(action))

            stats["reward"] += reward
            stats["step"] += 1

            if done:
                break
            observation = observation_handler(observation, action)

        results.append(stats)

    df = pd.DataFrame(results)
    pprint(df.describe())
def submit_or_test(args, model_fn, act_update_fn, submit_fn, test_fn):
    """Restore a trained agent and hand it to ``submit_fn`` or ``test_fn``.

    Stored arguments are merged into ``args``, the network sizes are read
    from a freshly created environment, the layer and activation settings
    are converted from their string form, and both state dicts are loaded.
    ``args.submit`` selects which of the two callbacks is run.
    """
    args = restore_args(args)
    env = create_env(args)

    args.n_action = env.action_space.shape[0]
    args.n_observation = env.observation_space.shape[0]

    # "<role>_layers" strings are parsed by str2params ...
    for role in ("actor", "critic"):
        attr = "{}_layers".format(role)
        setattr(args, attr, str2params(getattr(args, attr)))
    # ... and "<role>_activation" names are looked up in ``activations``.
    for role in ("actor", "critic"):
        attr = "{}_activation".format(role)
        setattr(args, attr, activations[getattr(args, attr)])

    actor, critic = model_fn(args)
    actor.load_state_dict(torch.load(args.restore_actor_from))
    critic.load_state_dict(torch.load(args.restore_critic_from))

    runner = submit_fn if args.submit else test_fn
    runner(actor, critic, args, act_update_fn)
if __name__ == "__main__":
    # Limit both OpenMP and torch to a single thread for this process.
    os.environ["OMP_NUM_THREADS"] = "1"
    torch.set_num_threads(1)

    cli_args = parse_args()
    submit_or_test(
        cli_args,
        model_fn=create_model,
        act_update_fn=create_act_update_fns,
        submit_fn=submit,
        test_fn=test)
+237
View File
@@ -0,0 +1,237 @@
import argparse
import os
import json
import copy
import torch
import torch.multiprocessing as mp
from multiprocessing import Value
from common.misc_util import boolean_flag, str2params, create_if_need
from common.env_wrappers import create_env
from common.torch_util import activations, hard_update
from ddpg.model import create_model, create_act_update_fns, train_multi_thread, \
train_single_thread, play_single_thread
def parse_args():
    """Build the training command-line parser and parse the arguments."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    def add_options(specs):
        # Register plain ``--flag`` options given as (flag, type, default).
        for flag, kind, default in specs:
            parser.add_argument(flag, type=kind, default=default)

    add_options((
        ('--seed', int, 42),
        ('--difficulty', int, 2),
        ('--max-obstacles', int, 3),
        ('--logdir', str, "./logs"),
        ('--num-threads', int, 1),
        ('--num-train-threads', int, 1),
    ))

    boolean_flag(parser, "ddpg-wrapper", default=False)
    add_options((
        ('--skip-frames', int, 1),
        ('--fail-reward', float, 0.0),
        ('--reward-scale', float, 1.),
    ))
    boolean_flag(parser, "flip-state-action", default=False)

    # The same set of options is registered once per network.
    for agent in ["actor", "critic"]:
        add_options((
            ('--{}-layers'.format(agent), str, "64-64"),
            ('--{}-activation'.format(agent), str, "relu"),
        ))
        boolean_flag(parser, "{}-layer-norm".format(agent), default=False)
        boolean_flag(parser, "{}-parameters-noise".format(agent), default=False)
        boolean_flag(parser, "{}-parameters-noise-factorised".format(agent), default=False)
        add_options((
            ('--{}-lr'.format(agent), float, 1e-3),
            ('--{}-lr-end'.format(agent), float, 5e-5),
            ('--restore-{}-from'.format(agent), str, None),
        ))

    add_options((
        ('--gamma', float, 0.96),
        ('--loss-type', str, "quadric-linear"),
        ('--grad-clip', float, 10.),
        ('--tau', float, 0.01),
        ('--train-steps', int, int(1e4)),
        ('--batch-size', int, 256),  # per worker
        ('--buffer-size', int, int(1e6)),
    ))

    boolean_flag(parser, "prioritized-replay", default=False)
    add_options((
        ('--prioritized-replay-alpha', float, 0.6),
        ('--prioritized-replay-beta0', float, 0.4),
        ('--initial-epsilon', float, 1.),
        ('--final-epsilon', float, 0.01),
        ('--max-episodes', int, int(1e4)),
        ('--max-update-steps', int, int(5e6)),
        ('--epsilon-cycle-len', int, int(2e2)),
        ('--max-train-days', int, int(1e1)),
        ('--rp-type', str, "ornstein-uhlenbeck"),
        ('--rp-theta', float, 0.15),
        ('--rp-sigma', float, 0.2),
        ('--rp-sigma-min', float, 0.15),
        ('--rp-mu', float, 0.0),
        ('--clip-delta', int, 10),
        ('--save-step', int, int(1e4)),
        ('--restore-args-from', str, None),
    ))

    return parser.parse_args()
def restore_args(args):
    """Copy the model-defining arguments of a stored run onto ``args``.

    The JSON file at ``args.restore_args_from`` is read and every key it
    holds is set as an attribute on ``args``, except the run-specific ones
    listed below, which keep their current (command-line) values.

    Keys missing from the file are simply skipped, so argument files that
    lack some of the run-specific options no longer raise ``KeyError``.

    Returns:
        The same ``args`` object, updated in place.
    """
    with open(args.restore_args_from, "r") as fin:
        params = json.load(fin)

    run_specific = {
        "seed",
        "difficulty",
        "max_obstacles",
        "logdir",
        "num_threads",
        "num_train_threads",
        "skip_frames",
        "grad_clip",
        "tau",
        "train_steps",
        "batch_size",
        "buffer_size",
        "prioritized_replay",
        "prioritized_replay_alpha",
        "prioritized_replay_beta0",
        "initial_epsilon",
        "final_epsilon",
        "max_episodes",
        "max_update_steps",
        "epsilon_cycle_len",
        "max_train_days",
        "rp_type",
        "rp_theta",
        "rp_sigma",
        "rp_sigma_min",
        "rp_mu",
        "clip_delta",
        "save_step",
        "restore_args_from",
    }
    # Learning-rate schedule and restore path are per network.
    for agent in ("actor", "critic"):
        run_specific.add("{}_lr".format(agent))
        run_specific.add("{}_lr_end".format(agent))
        run_specific.add("restore_{}_from".format(agent))

    for key, value in params.items():
        if key in run_specific:
            continue
        setattr(args, key, value)
    return args
def train(args, model_fn, act_update_fns, multi_thread, train_single, play_single):
    """Build the networks, start the worker processes and wait for them.

    When ``args.num_threads == args.num_train_threads`` every process runs
    ``multi_thread``. Otherwise the first ``args.num_train_threads`` ranks
    run ``train_single`` and the remaining ranks run ``play_single``, sharing
    two counters and an episode queue. The arguments are dumped to
    ``<logdir>/args.json`` before any derived attribute is added, and
    ``save_fn()`` is called once at the end, also after a
    ``KeyboardInterrupt``.
    """
    create_if_need(args.logdir)

    if args.restore_args_from is not None:
        args = restore_args(args)

    args_path = "{}/args.json".format(args.logdir)
    with open(args_path, "w") as fout:
        json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)

    env = create_env(args)

    if args.flip_state_action and hasattr(env, "state_transform"):
        args.flip_states = env.state_transform.flip_states
        args.batch_size = args.batch_size // 2

    args.n_action = env.action_space.shape[0]
    args.n_observation = env.observation_space.shape[0]

    # Convert the string settings: layer specs first, then activation names.
    for role in ("actor", "critic"):
        attr = "{}_layers".format(role)
        setattr(args, attr, str2params(getattr(args, attr)))
    for role in ("actor", "critic"):
        attr = "{}_activation".format(role)
        setattr(args, attr, activations[getattr(args, attr)])

    actor, critic = model_fn(args)

    if args.restore_actor_from is not None:
        actor.load_state_dict(torch.load(args.restore_actor_from))
    if args.restore_critic_from is not None:
        critic.load_state_dict(torch.load(args.restore_critic_from))

    actor.train()
    critic.train()
    actor.share_memory()
    critic.share_memory()

    target_actor = copy.deepcopy(actor)
    target_critic = copy.deepcopy(critic)

    hard_update(target_actor, actor)
    hard_update(target_critic, critic)

    target_actor.train()
    target_critic.train()
    target_actor.share_memory()
    target_critic.share_memory()

    _, _, save_fn = act_update_fns(actor, critic, target_actor, target_critic, args)

    processes = []
    best_reward = Value("f", 0.0)
    # Positional arguments common to every worker entry point.
    shared = (actor, critic, target_actor, target_critic, args, act_update_fns)

    try:
        if args.num_threads == args.num_train_threads:
            for rank in range(args.num_threads):
                args.thread = rank
                worker = mp.Process(target=multi_thread, args=shared + (best_reward,))
                worker.start()
                processes.append(worker)
        else:
            global_episode = Value("i", 0)
            global_update_step = Value("i", 0)
            episodes_queue = mp.Queue()
            counters = (global_episode, global_update_step, episodes_queue)
            for rank in range(args.num_threads):
                args.thread = rank
                if rank < args.num_train_threads:
                    entry_point, extra = train_single, counters
                else:
                    entry_point, extra = play_single, counters + (best_reward,)
                worker = mp.Process(target=entry_point, args=shared + extra)
                worker.start()
                processes.append(worker)

        for worker in processes:
            worker.join()
    except KeyboardInterrupt:
        pass

    save_fn()
if __name__ == "__main__":
    # Limit both OpenMP and torch to a single thread for this process.
    os.environ["OMP_NUM_THREADS"] = "1"
    torch.set_num_threads(1)

    cli_args = parse_args()
    train(
        cli_args,
        model_fn=create_model,
        act_update_fns=create_act_update_fns,
        multi_thread=train_multi_thread,
        train_single=train_single_thread,
        play_single=play_single_thread)
BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 15 MiB

BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 15 MiB

+2
View File
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Create the conda environment `opensim-rl` with Python 3.5.2 and the
# `opensim`, `git` and `anaconda` packages, adding the `kidzik` channel.
# `-y` answers the confirmation prompt automatically.
conda create -n opensim-rl -c kidzik opensim git python=3.5.2 anaconda -y
+7
View File
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Install the project dependencies with conda and pip: lapack and git from
# conda-forge, ipython and libgcc, pytorch and torchvision from the soumith
# channel, then tensorflow 1.3.0 and gym, and finally osim-rl from GitHub.
# The steps are chained with `&&`, so the first failing step stops the rest.
conda upgrade pip -y && \
conda install -c conda-forge lapack git -y && \
conda install ipython libgcc -y && \
conda install pytorch torchvision -c soumith -y && \
pip install tensorflow==1.3.0 gym && \
pip install git+https://github.com/stanfordnmbl/osim-rl.git
+7
View File
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Install the project dependencies with conda and pip, including mpi4py:
# lapack and git from conda-forge, ipython and libgcc, pytorch and
# torchvision from the soumith channel, then tensorflow 1.3.0, gym and
# mpi4py, and finally osim-rl from GitHub.
# The steps are chained with `&&`, so the first failing step stops the rest.
conda upgrade pip -y && \
conda install -c conda-forge lapack git -y && \
conda install ipython libgcc -y && \
conda install pytorch torchvision -c soumith -y && \
pip install tensorflow==1.3.0 gym mpi4py && \
pip install git+https://github.com/stanfordnmbl/osim-rl.git