mirror of
https://github.com/wassname/Run-Skeleton-Run.git
synced 2026-06-27 16:13:51 +08:00
pytorch version
This commit is contained in:
@@ -99,3 +99,7 @@ ENV/
|
|||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
|
||||||
|
.DS_Store
|
||||||
|
.idea
|
||||||
|
log*
|
||||||
|
|||||||
@@ -1,2 +1,76 @@
|
|||||||
# Run-Skeleton-Run
|
# Run-Skeleton-Run
|
||||||
Reason8.ai PyTorch solution for NIPS RL 2017 challenge
|
[Reason8.ai](https://reason8.ai) PyTorch solution for 3rd place [NIPS RL 2017 challenge](https://www.crowdai.org/challenges/nips-2017-learning-to-run/leaderboards?challenge_round_id=12).
|
||||||
|
|
||||||
|
Additional thanks to [Michail Pavlov](https://github.com/fgvbrt) for collaboration.
|
||||||
|
|
||||||
|
## Agent policies
|
||||||
|
|
||||||
|
### no-flip-state-action
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
### flip-state-action
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## How to set up the environment?
|
||||||
|
|
||||||
|
1. `sh setup_conda.sh`
|
||||||
|
2. `source activate opensim-rl`
|
||||||
|
|
||||||
|
Want to test the baselines? (This requires MPI support.)
|
||||||
|
3. `sudo apt-get install openmpi-bin openmpi-doc libopenmpi-dev`
|
||||||
|
3+. `sh setup_env_mpi.sh`
|
||||||
|
|
||||||
|
Or do you only want the DDPG agents?
|
||||||
|
3. `sh setup_env.sh`
|
||||||
|
|
||||||
|
4. Congrats! Now you are ready to check our agents.
|
||||||
|
|
||||||
|
|
||||||
|
## Run DDPG agent
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="" PYTHONPATH=. python ddpg/train.py \
|
||||||
|
--logdir ./logs_ddpg \
|
||||||
|
--num-threads 4 \
|
||||||
|
--ddpg-wrapper \
|
||||||
|
--skip-frames 5 \
|
||||||
|
--fail-reward -0.2 \
|
||||||
|
--reward-scale 10 \
|
||||||
|
--flip-state-action \
|
||||||
|
--actor-layers 64-64 --actor-layer-norm --actor-parameters-noise \
|
||||||
|
--actor-lr 0.001 --actor-lr-end 0.00001 \
|
||||||
|
--critic-layers 64-32 --critic-layer-norm \
|
||||||
|
--critic-lr 0.002 --critic-lr-end 0.00001 \
|
||||||
|
--initial-epsilon 0.5 --final-epsilon 0.001 \
|
||||||
|
--tau 0.0001
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Evaluate DDPG agent
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="" PYTHONPATH=./ python ddpg/submit.py \
|
||||||
|
--restore-actor-from ./logs_ddpg/actor_state_dict.pkl \
|
||||||
|
--restore-critic-from ./logs_ddpg/critic_state_dict.pkl \
|
||||||
|
--restore-args-from ./logs_ddpg/args.json \
|
||||||
|
--num-episodes 10
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Run TRPO/PPO agent
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="" PYTHONPATH=. python ddpg/train.py \
|
||||||
|
--agent ppo \
|
||||||
|
--logdir ./logs_baseline \
|
||||||
|
--baseline-wrapper \
|
||||||
|
--skip-frames 5 \
|
||||||
|
--fail-reward -0.2 \
|
||||||
|
--reward-scale 10
|
||||||
|
```
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
from baselines.baselines_common.console_util import *
|
||||||
|
from baselines.baselines_common.dataset import Dataset
|
||||||
|
from baselines.baselines_common.math_util import *
|
||||||
|
from baselines.baselines_common.misc_util import *
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
    """
    Approximately solve ``A x = b`` with the conjugate gradient method
    (Demmel p 312), starting from ``x = 0``.

    Parameters
    ----------
    f_Ax: callable
        Maps a vector ``p`` to the matrix-vector product ``A p``.
    b: 1-d ndarray
        Right-hand side.
    cg_iters: int
        Maximum number of iterations.
    callback: callable or None
        If given, called with the current iterate before every iteration
        and once more after the loop.
    verbose: bool
        Print the squared residual norm and the solution norm per iteration.
    residual_tol: float
        Stop as soon as the squared residual norm drops below this value.

    Returns
    -------
    x: ndarray with the same shape as b
    """
    p = b.copy()
    r = b.copy()
    x = np.zeros_like(b)
    rdotr = r.dot(r)

    fmtstr = "%10i %10.3g %10.3g"
    titlestr = "%10s %10s %10s"
    if verbose:
        print(titlestr % ("iter", "residual norm", "soln norm"))

    # Tracked explicitly so the final report also works when the loop body
    # never runs (cg_iters == 0).
    iters_done = 0
    for i in range(cg_iters):
        if callback is not None:
            callback(x)
        if verbose:
            print(fmtstr % (i, rdotr, np.linalg.norm(x)))
        if rdotr == 0:
            # The residual is exactly zero (e.g. b == 0): x already solves
            # the system, and continuing would evaluate 0 / 0 and return NaN.
            break
        z = f_Ax(p)
        v = rdotr / p.dot(z)
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        mu = newrdotr / rdotr
        p = r + mu * p

        rdotr = newrdotr
        iters_done = i + 1
        if rdotr < residual_tol:
            break

    if callback is not None:
        callback(x)
    if verbose:
        print(fmtstr % (iters_done, rdotr, np.linalg.norm(x)))
    return x
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
from __future__ import print_function
|
||||||
|
from contextlib import contextmanager
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Misc
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
def fmt_row(width, row, header=False):
    """Format `row` as one table line of right-aligned cells of `width` chars.

    Cells are joined with " | ". When `header` is true, a second line of
    dashes with the same length as the row is appended.
    """
    cells = [fmt_item(value, width) for value in row]
    line = " | ".join(cells)
    if not header:
        return line
    underline = "-" * len(line)
    return line + "\n" + underline
|
||||||
|
|
||||||
|
|
||||||
|
def fmt_item(x, l):
    """Render `x` as text, left-padded with spaces to at least `l` characters.

    0-d numpy arrays are unwrapped to their Python scalar first; floats are
    printed with "%g", everything else with str(). Text longer than `l` is
    returned unchanged (never truncated).
    """
    if isinstance(x, np.ndarray):
        assert x.ndim == 0
        x = x.item()
    rep = "%g" % x if isinstance(x, float) else str(x)
    return rep.rjust(l)
|
||||||
|
|
||||||
|
|
||||||
|
# ANSI SGR foreground colour codes, keyed by colour name.
color2num = {
    'gray': 30,
    'red': 31,
    'green': 32,
    'yellow': 33,
    'blue': 34,
    'magenta': 35,
    'cyan': 36,
    'white': 37,
    'crimson': 38,
}


def colorize(string, color, bold=False, highlight=False):
    """Wrap `string` in ANSI escape sequences for the named `color`.

    `highlight` adds 10 to the colour code (selecting the background
    variant); `bold` appends the bold attribute. The sequence is always
    terminated with a reset code.
    """
    code = color2num[color] + (10 if highlight else 0)
    attrs = [str(code)]
    if bold:
        attrs.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(attrs), string)
|
||||||
|
|
||||||
|
|
||||||
|
# Current nesting level of active `timed` blocks; used to indent messages.
MESSAGE_DEPTH = 0


@contextmanager
def timed(msg):
    """Context manager that prints `msg` on entry and the elapsed time on exit.

    Nested uses are indented with one tab per nesting level. If the wrapped
    block raises, the nesting level is still restored (so later messages are
    not indented too deeply) and the exception propagates without the
    "done in ..." line being printed.
    """
    global MESSAGE_DEPTH  # pylint: disable=W0603
    print(colorize('\t' * MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
    tstart = time.time()
    MESSAGE_DEPTH += 1
    try:
        yield
    finally:
        # Must run on the error path too, otherwise the depth leaks.
        MESSAGE_DEPTH -= 1
    print(colorize('\t' * MESSAGE_DEPTH + "done in %.3f seconds" % (time.time() - tstart),
                   color='magenta'))
|
||||||
@@ -0,0 +1,63 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class Dataset(object):
    """Batched access to a dict of equally long arrays.

    `data_map` maps names to arrays indexed along axis 0. Unless
    `deterministic` is set, the rows of all arrays are permuted together
    (in place in `data_map`) whenever `shuffle()` runs.
    """

    def __init__(self, data_map, deterministic=False, shuffle=True):
        self.data_map = data_map
        self.deterministic = deterministic
        self.enable_shuffle = shuffle
        first_array = next(iter(data_map.values()))
        self.n = first_array.shape[0]
        self._next_id = 0
        self.shuffle()

    def shuffle(self):
        """Apply one random row permutation to every array and rewind.

        Does nothing at all (not even rewinding) when deterministic.
        """
        if not self.deterministic:
            order = np.arange(self.n)
            np.random.shuffle(order)
            for name in list(self.data_map):
                self.data_map[name] = self.data_map[name][order]
            self._next_id = 0

    def next_batch(self, batch_size):
        """Return a dict with the next (at most) `batch_size` rows of each array."""
        exhausted = self._next_id >= self.n
        if exhausted and self.enable_shuffle:
            self.shuffle()

        start = self._next_id
        stop = start + min(batch_size, self.n - start)
        self._next_id = stop

        return {name: values[start:stop] for name, values in self.data_map.items()}

    def iterate_once(self, batch_size):
        """Yield successive full batches, then rewind; a short remainder is dropped."""
        if self.enable_shuffle:
            self.shuffle()

        last_full_start = self.n - batch_size
        while self._next_id <= last_full_start:
            yield self.next_batch(batch_size)
        self._next_id = 0

    def subset(self, num_elements, deterministic=True):
        """Return a new Dataset over the first `num_elements` rows of each array."""
        head = {name: values[:num_elements] for name, values in self.data_map.items()}
        return Dataset(head, deterministic)
|
||||||
|
|
||||||
|
|
||||||
|
def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True,
                include_final_partial_batch=True):
    """Yield aligned mini-batches from several arrays of equal length.

    Parameters
    ----------
    arrays: sequence of array-likes
        All must have the same size along axis 0.
    num_batches: int or None
        Split the data into this many (nearly equal) batches.
    batch_size: int or None
        Split the data into batches of this size. Exactly one of
        `num_batches` and `batch_size` must be given.
    shuffle: bool
        Randomly permute the rows (identically for all arrays) first.
    include_final_partial_batch: bool
        When false, batches shorter than `batch_size` are dropped. It has no
        effect together with `num_batches`, where no target size exists.

    Yields
    ------
    tuple with one batch per input array
    """
    assert (num_batches is None) != (
        batch_size is None), 'Provide num_batches or batch_size, but not both'
    arrays = tuple(map(np.asarray, arrays))
    n = arrays[0].shape[0]
    assert all(a.shape[0] == n for a in arrays[1:])
    inds = np.arange(n)
    if shuffle:
        np.random.shuffle(inds)
    sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
    for batch_inds in np.array_split(inds, sections):
        # Without a batch_size there is nothing to be "partial" against;
        # comparing the length to None would silently drop every batch.
        is_full = batch_size is None or len(batch_inds) == batch_size
        if include_final_partial_batch or is_full:
            yield tuple(a[batch_inds] for a in arrays)
|
||||||
@@ -0,0 +1,377 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
from tensorflow.python.ops import math_ops
|
||||||
|
from tensorflow.python.ops import nn
|
||||||
|
|
||||||
|
|
||||||
|
class Pd(object):
    """
    A particular probability distribution.

    Abstract interface: subclasses override everything except `logp`,
    which is derived from `neglogp`.
    """

    def flatparam(self):
        """Return the flat parameter tensor describing this distribution."""
        raise NotImplementedError

    def mode(self):
        """Return the most likely value."""
        raise NotImplementedError

    def neglogp(self, x):
        """Return the negative log-probability of `x`."""
        # Usually it's easier to define the negative logprob
        raise NotImplementedError

    def kl(self, other):
        """Return the KL divergence from this distribution to `other`."""
        raise NotImplementedError

    def entropy(self):
        """Return the entropy of this distribution."""
        raise NotImplementedError

    def sample(self):
        """Draw a sample from this distribution."""
        raise NotImplementedError

    def logp(self, x):
        """Return the log-probability of `x`, i.e. the negated `neglogp(x)`."""
        negative_log_prob = self.neglogp(x)
        return - negative_log_prob
|
||||||
|
|
||||||
|
|
||||||
|
class PdType(object):
    """
    Parametrized family of probability distributions.

    Subclasses describe which distribution class they build and the
    shapes/dtype of its parameters and samples.
    """

    def pdclass(self):
        """Return the distribution class belonging to this family."""
        raise NotImplementedError

    def pdfromflat(self, flat):
        """Build a distribution from the flat parameter tensor `flat`."""
        distribution_cls = self.pdclass()
        return distribution_cls(flat)

    def param_shape(self):
        """Return the shape (list) of the flat parameters of one distribution."""
        raise NotImplementedError

    def sample_shape(self):
        """Return the shape (list) of one sample."""
        raise NotImplementedError

    def sample_dtype(self):
        """Return the dtype of samples."""
        raise NotImplementedError

    def param_placeholder(self, prepend_shape, name=None):
        """Create a float32 placeholder of shape `prepend_shape + param_shape()`."""
        full_shape = prepend_shape + self.param_shape()
        return tf.placeholder(dtype=tf.float32, shape=full_shape, name=name)

    def sample_placeholder(self, prepend_shape, name=None):
        """Create a placeholder of shape `prepend_shape + sample_shape()`."""
        full_shape = prepend_shape + self.sample_shape()
        return tf.placeholder(dtype=self.sample_dtype(), shape=full_shape, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
class CategoricalPdType(PdType):
    """Family of categorical distributions over `ncat` categories."""

    def __init__(self, ncat):
        self.ncat = ncat

    def pdclass(self):
        """Distributions of this family are `CategoricalPd` instances."""
        return CategoricalPd

    def param_shape(self):
        """One parameter per category."""
        n_params = self.ncat
        return [n_params]

    def sample_shape(self):
        """A sample has an empty shape list."""
        return list()

    def sample_dtype(self):
        """Samples are int32."""
        return tf.int32
|
||||||
|
|
||||||
|
|
||||||
|
class MultiCategoricalPdType(PdType):
    """Family of several categorical variables bounded by `low` and `high`.

    `ncats` is computed as `high - low + 1`.
    # NOTE(review): presumably low/high are arrays with one entry per
    # variable (sum() and len() are applied to ncats) -- confirm with callers.
    """

    def __init__(self, low, high):
        self.low = low
        self.high = high
        self.ncats = high - low + 1

    def pdclass(self):
        """Distributions of this family are `MultiCategoricalPd` instances."""
        return MultiCategoricalPd

    def pdfromflat(self, flat):
        """Build the distribution; unlike the base class, the bounds are passed too."""
        return MultiCategoricalPd(self.low, self.high, flat)

    def param_shape(self):
        """The flat parameters have `sum(ncats)` entries."""
        total_params = sum(self.ncats)
        return [total_params]

    def sample_shape(self):
        """A sample has `len(ncats)` entries."""
        n_variables = len(self.ncats)
        return [n_variables]

    def sample_dtype(self):
        """Samples are int32."""
        return tf.int32
|
||||||
|
|
||||||
|
|
||||||
|
class DiagGaussianPdType(PdType):
    """Family of diagonal Gaussian distributions with `size` dimensions."""

    def __init__(self, size):
        self.size = size

    def pdclass(self):
        """Distributions of this family are `DiagGaussianPd` instances."""
        return DiagGaussianPd

    def param_shape(self):
        """Two parameters per dimension."""
        n_params = 2 * self.size
        return [n_params]

    def sample_shape(self):
        """A sample has `size` entries."""
        n_dims = self.size
        return [n_dims]

    def sample_dtype(self):
        """Samples are float32."""
        return tf.float32
|
||||||
|
|
||||||
|
|
||||||
|
class BernoulliPdType(PdType):
    """Family of Bernoulli distributions over `size` entries."""

    def __init__(self, size):
        self.size = size

    def pdclass(self):
        """Distributions of this family are `BernoulliPd` instances."""
        return BernoulliPd

    def param_shape(self):
        """One parameter per entry."""
        n_params = self.size
        return [n_params]

    def sample_shape(self):
        """A sample has `size` entries."""
        n_entries = self.size
        return [n_entries]

    def sample_dtype(self):
        """Samples are declared as int32."""
        return tf.int32
|
||||||
|
|
||||||
|
|
||||||
|
# WRONG SECOND DERIVATIVES
|
||||||
|
# class CategoricalPd(Pd):
|
||||||
|
# def __init__(self, logits):
|
||||||
|
# self.logits = logits
|
||||||
|
# self.ps = tf.nn.softmax(logits)
|
||||||
|
# @classmethod
|
||||||
|
# def fromflat(cls, flat):
|
||||||
|
# return cls(flat)
|
||||||
|
# def flatparam(self):
|
||||||
|
# return self.logits
|
||||||
|
# def mode(self):
|
||||||
|
# return U.argmax(self.logits, axis=-1)
|
||||||
|
# def logp(self, x):
|
||||||
|
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
|
||||||
|
# def kl(self, other):
|
||||||
|
# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
|
||||||
|
# - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
|
||||||
|
# def entropy(self):
|
||||||
|
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
|
||||||
|
# def sample(self):
|
||||||
|
# u = tf.random_uniform(tf.shape(self.logits))
|
||||||
|
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
|
||||||
|
|
||||||
|
class CategoricalPd(Pd):
    """Categorical distribution parametrized by unnormalized `logits`.

    All reductions run over the last axis of `logits`.
    # NOTE(review): U.argmax / U.max / U.sum come from tf_util and are
    # presumably thin wrappers around the tf reductions -- confirm.
    """

    def __init__(self, logits):
        self.logits = logits

    def flatparam(self):
        """Return the logits, which are the flat parameters."""
        return self.logits

    def mode(self):
        """Return the index of the largest logit along the last axis."""
        return U.argmax(self.logits, axis=-1)

    def neglogp(self, x):
        """Return the softmax cross-entropy between the logits and one-hot `x`."""
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        #       the implementation does not allow second-order derivatives...
        # The one-hot depth is the static size of the last logits axis.
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)

    def kl(self, other):
        """Return sum_i p0_i * (log p0_i - log p1_i), p0 = self, p1 = `other`."""
        # Shift both sets of logits by their maximum before exponentiating.
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        # Normalizers of the two softmaxes.
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        z1 = U.sum(ea1, axis=-1, keepdims=True)
        p0 = ea0 / z0
        # a - log(z) is the log-probability under the respective softmax.
        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)

    def entropy(self):
        """Return -sum_i p0_i * log p0_i."""
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)

    def sample(self):
        """Sample an index by adding -log(-log(u)) noise to the logits and taking the argmax."""
        u = tf.random_uniform(tf.shape(self.logits))
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)

    @classmethod
    def fromflat(cls, flat):
        """Build the distribution from flat parameters (the logits)."""
        return cls(flat)
|
||||||
|
|
||||||
|
|
||||||
|
class MultiCategoricalPd(Pd):
    """Several categorical distributions whose values are offset by `low`.

    `flat` is split along its last axis into one `CategoricalPd` per
    variable, using `high - low + 1` as the split argument.
    """

    def __init__(self, low, high, flat):
        self.flat = flat
        self.low = tf.constant(low, dtype=tf.int32)
        self.categoricals = list(
            map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))

    def flatparam(self):
        """Return the unsplit flat parameters."""
        return self.flat

    def mode(self):
        """Return the per-variable modes, stacked on a new last axis and shifted by `low`."""
        return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1),
                                  tf.int32)

    def neglogp(self, x):
        """Return the summed negative log-probabilities of the components of `x`."""
        # `x - low` is unstacked along the last axis, one slice per variable.
        return tf.add_n([p.neglogp(px) for p, px in zip(
            self.categoricals, tf.unstack(x - self.low,
                                          axis=len(x.get_shape()) - 1))])

    def kl(self, other):
        """Return the sum of the pairwise KL divergences of the components."""
        return tf.add_n([
            p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
        ])

    def entropy(self):
        """Return the sum of the component entropies."""
        return tf.add_n([p.entropy() for p in self.categoricals])

    def sample(self):
        """Sample every component, stack on a new last axis and shift by `low`."""
        return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1),
                                  tf.int32)

    @classmethod
    def fromflat(cls, flat):
        """Not supported: the constructor also needs `low` and `high`."""
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class DiagGaussianPd(Pd):
    """Gaussian with diagonal covariance.

    `flat` is split into two equal halves along its last axis: the mean
    and the log of the standard deviation.
    """

    def __init__(self, flat):
        self.flat = flat
        mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat)
        self.mean = mean
        self.logstd = logstd
        self.std = tf.exp(logstd)

    def flatparam(self):
        """Return the unsplit flat parameters."""
        return self.flat

    def mode(self):
        """Return the mean."""
        return self.mean

    def neglogp(self, x):
        """Return the negative log-density of `x`, reduced over the last axis."""
        # Squared standardized distance + normalization constant (scaled by
        # the size of x's last axis) + log-determinant term.
        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
               + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
               + U.sum(self.logstd, axis=-1)

    def kl(self, other):
        """Return the KL divergence to another `DiagGaussianPd`, summed over the last axis."""
        assert isinstance(other, DiagGaussianPd)
        return U.sum(other.logstd - self.logstd + (
            tf.square(self.std) + tf.square(self.mean - other.mean)) / (
                         2.0 * tf.square(other.std)) - 0.5, axis=-1)

    def entropy(self):
        """Return the sum over the last axis of logstd + 0.5 * log(2 * pi * e)."""
        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)

    def sample(self):
        """Return mean + std * noise, with standard normal noise shaped like the mean."""
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))

    @classmethod
    def fromflat(cls, flat):
        """Build the distribution from flat parameters."""
        return cls(flat)
|
||||||
|
|
||||||
|
|
||||||
|
class BernoulliPd(Pd):
    """Independent Bernoulli variables parametrized by `logits`.

    `ps` holds the sigmoid of the logits.
    # NOTE(review): mode() and sample() return float tensors, whereas
    # BernoulliPdType.sample_dtype() declares int32 -- confirm this is intended.
    """

    def __init__(self, logits):
        self.logits = logits
        self.ps = tf.sigmoid(logits)

    def flatparam(self):
        """Return the logits, which are the flat parameters."""
        return self.logits

    def mode(self):
        """Return the probabilities rounded to the nearest integer value."""
        return tf.round(self.ps)

    def neglogp(self, x):
        """Return the sigmoid cross-entropy of `x` (cast to float), summed over the last axis."""
        return U.sum(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)),
            axis=-1)

    def kl(self, other):
        """Return cross-entropy(self.ps, other) minus cross-entropy(self.ps, self)."""
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps),
                     axis=-1) - U.sum(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)

    def entropy(self):
        """Return the cross-entropy of the distribution with itself, summed over the last axis."""
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps),
                     axis=-1)

    def sample(self):
        """Return 1.0 where a uniform draw falls below `ps`, else 0.0."""
        u = tf.random_uniform(tf.shape(self.ps))
        return tf.to_float(math_ops.less(u, self.ps))

    @classmethod
    def fromflat(cls, flat):
        """Build the distribution from flat parameters (the logits)."""
        return cls(flat)
|
||||||
|
|
||||||
|
|
||||||
|
def make_pdtype(ac_space):
    """Return the distribution family matching the gym action space `ac_space`.

    Box (1-d only) -> DiagGaussianPdType, Discrete -> CategoricalPdType,
    MultiDiscrete -> MultiCategoricalPdType, MultiBinary -> BernoulliPdType.
    Any other space raises NotImplementedError.
    """
    from gym import spaces

    if isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    if isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    if isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.low, ac_space.high)
    if isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
def shape_el(v, i):
    """Return the size of dimension `i` of tensor `v`.

    The statically known size (a Python int) is returned when available;
    otherwise a tensor that evaluates to the size at run time.
    """
    # as_list() yields plain ints with None for unknown dimensions. Indexing
    # the TensorShape directly can return a Dimension object, which is never
    # `None`, so the dynamic fallback below would be unreachable.
    maybe = v.get_shape().as_list()[i]
    if maybe is not None:
        return maybe
    else:
        return tf.shape(v)[i]
|
||||||
|
|
||||||
|
|
||||||
|
@U.in_session
def test_probtypes():
    """Run `validate_probtype` on a Gaussian, a categorical and a Bernoulli family."""
    np.random.seed(0)

    # The Gaussian parameter vector holds two values per dimension.
    gauss_params = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
    gauss_type = DiagGaussianPdType(gauss_params.size // 2)  # pylint: disable=E1101
    validate_probtype(gauss_type, gauss_params)

    categorical_params = np.array([-.2, .3, .5])
    categorical_type = CategoricalPdType(categorical_params.size)  # pylint: disable=E1101
    validate_probtype(categorical_type, categorical_params)

    bernoulli_params = np.array([-.2, .3, .5])
    bernoulli_type = BernoulliPdType(bernoulli_params.size)  # pylint: disable=E1101
    validate_probtype(bernoulli_type, bernoulli_params)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_probtype(probtype, pdparam):
    """Monte-Carlo consistency check of a distribution family.

    Builds the distribution for `pdparam` (repeated N times), draws N
    samples and asserts that
    (1) the analytic entropy matches the mean negative log-likelihood, and
    (2) the analytic KL to a perturbed distribution q matches
        -entropy - E_p[log q],
    each within three standard errors of the sample estimate.
    """
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    # q: the original parameters plus small Gaussian noise.
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    # Log-likelihood under q of the samples that were drawn from p.
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
import numpy as np
|
||||||
|
import scipy.signal
|
||||||
|
|
||||||
|
|
||||||
|
def discount(x, gamma):
    """
    Compute discounted sums along the 0th dimension of x.

    inputs
    ------
    x: ndarray
    gamma: float

    outputs
    -------
    y: ndarray with same shape as x, satisfying

        y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
                where k = len(x) - t - 1

    """
    assert x.ndim >= 1
    # Running the recursive filter over the time-reversed input accumulates
    # the future terms; reversing again restores the original order.
    time_reversed = x[::-1]
    filtered = scipy.signal.lfilter([1], [1, -gamma], time_reversed, axis=0)
    return filtered[::-1]
|
||||||
|
|
||||||
|
|
||||||
|
def explained_variance(ypred, y):
    """
    Compute the fraction of the variance of y that ypred explains.
    Returns 1 - Var[y-ypred] / Var[y], or NaN when Var[y] is zero.

    interpretation:
        ev=0  =>  might as well have predicted zero
        ev=1  =>  perfect prediction
        ev<0  =>  worse than just predicting zero

    """
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    if vary == 0:
        return np.nan
    return 1 - np.var(y - ypred) / vary
|
||||||
|
|
||||||
|
|
||||||
|
def explained_variance_2d(ypred, y):
    """Column-wise explained variance, 1 - Var[y - ypred] / Var[y].

    Both inputs are 2-d arrays; variances are taken along axis 0, so one
    value is returned per column. Columns whose target variance is below
    1e-10 are reported as 0.
    """
    assert y.ndim == 2 and ypred.ndim == 2
    vary = np.var(y, axis=0)
    # The residual variance must be per column as well; without axis=0 every
    # column would be compared against the variance of the whole matrix.
    with np.errstate(divide='ignore', invalid='ignore'):
        out = 1 - np.var(y - ypred, axis=0) / vary
    out[vary < 1e-10] = 0
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def ncc(ypred, y):
    """Return the correlation coefficient between `ypred` and `y`."""
    correlation_matrix = np.corrcoef(ypred, y)
    # Off-diagonal entry: correlation between the two inputs.
    return correlation_matrix[1, 0]
|
||||||
|
|
||||||
|
|
||||||
|
def flatten_arrays(arrs):
    """Concatenate the flattened contents of all arrays into one 1-d array."""
    flat_views = tuple(arr.flat for arr in arrs)
    return np.concatenate(flat_views)
|
||||||
|
|
||||||
|
|
||||||
|
def unflatten_vector(vec, shapes):
    """Split the 1-d array `vec` into consecutive arrays of the given `shapes`.

    Inverse of `flatten_arrays`: the pieces are taken from `vec` in order
    and reshaped. Scalar shapes `()` are supported and consume one element.

    Returns a list with one array per entry of `shapes`.
    """
    arrs = []
    offset = 0
    for shape in shapes:
        # np.prod(()) is the float 1.0, which is not a valid slice index.
        size = int(np.prod(shape))
        arrs.append(vec[offset:offset + size].reshape(shape))
        offset += size
    return arrs
|
||||||
|
|
||||||
|
|
||||||
|
def discount_with_boundaries(X, New, gamma):
    """
    Discounted sums along axis 0 that do not leak across episode boundaries.

    X: 2d array of floats, time x features
    New: 2d array of bools, indicating when a new episode has started
    """
    horizon = X.shape[0]
    Y = np.zeros_like(X)
    Y[horizon - 1] = X[horizon - 1]
    # Walk backwards; the carried-over future sum is zeroed whenever the
    # next step starts a new episode.
    for t in reversed(range(horizon - 1)):
        carry = gamma * Y[t + 1] * (1 - New[t + 1])
        Y[t] = X[t] + carry
    return Y
|
||||||
|
|
||||||
|
|
||||||
|
def test_discount_with_boundaries():
    """Check `discount_with_boundaries` on a 4-step signal with a boundary at the last step."""
    gamma = 0.9
    x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
    starts = [1.0, 0.0, 0.0, 1.0]
    # A new episode starts at t=3, so nothing is carried from step 3 into step 2.
    expected = [
        1 + gamma * 2 + gamma ** 2 * 3,
        2 + gamma * 3,
        3,
        4,
    ]
    y = discount_with_boundaries(x, starts, gamma)
    assert np.allclose(y, expected)
|
||||||
@@ -0,0 +1,328 @@
|
|||||||
|
import gym
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import random
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
import zipfile
|
||||||
|
|
||||||
|
|
||||||
|
def zipsame(*seqs):
    """Like zip(), but assert first that all sequences have the same length."""
    expected = len(seqs[0])
    for other in seqs[1:]:
        assert len(other) == expected
    return zip(*seqs)
|
||||||
|
|
||||||
|
|
||||||
|
def unpack(seq, sizes):
    """
    Split 'seq' into consecutive pieces whose lengths are given by 'sizes'.
    A size of None yields one bare element instead of a list.

    Example:
    unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6])
    """
    seq = list(seq)
    it = iter(seq)
    expected_len = sum(1 if s is None else s for s in sizes)
    assert expected_len == len(seq), "Trying to unpack %s into %s" % (seq, sizes)
    for size in sizes:
        if size is not None:
            yield [next(it) for _ in range(size)]
        else:
            yield next(it)
|
||||||
|
|
||||||
|
|
||||||
|
class EzPickle(object):
    """Mixin for objects that are pickled and unpickled via their constructor
    arguments.

    Example usage:

        class Dog(Animal, EzPickle):
            def __init__(self, furcolor, tailkind="bushy"):
                Animal.__init__(self)
                EzPickle.__init__(self, furcolor, tailkind)
                ...

    When this object is unpickled, a new Dog will be constructed by passing the provided
    furcolor and tailkind into the constructor. However, philosophers are still not sure
    whether it is still the same dog.

    This is generally needed only for environments which wrap C/C++ code, such as MuJoCo
    and Atari.
    """

    def __init__(self, *args, **kwargs):
        # Remember how this object was constructed.
        self._ezpickle_args = args
        self._ezpickle_kwargs = kwargs

    def __getstate__(self):
        """Return only the recorded constructor arguments as pickled state."""
        state = {}
        state["_ezpickle_args"] = self._ezpickle_args
        state["_ezpickle_kwargs"] = self._ezpickle_kwargs
        return state

    def __setstate__(self, d):
        """Rebuild the object by calling its constructor, then adopt the new attributes."""
        ctor_args = d["_ezpickle_args"]
        ctor_kwargs = d["_ezpickle_kwargs"]
        rebuilt = type(self)(*ctor_args, **ctor_kwargs)
        self.__dict__.update(rebuilt.__dict__)
|
||||||
|
|
||||||
|
|
||||||
|
def set_global_seeds(i):
|
||||||
|
try:
|
||||||
|
import tensorflow as tf
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
tf.set_random_seed(i)
|
||||||
|
np.random.seed(i)
|
||||||
|
random.seed(i)
|
||||||
|
|
||||||
|
|
||||||
|
def pretty_eta(seconds_left):
    """Return the number of seconds in human readable format.

    Only the two largest non-zero units are reported.

    Examples:
    2 days
    2 hours and 37 minutes
    less than a minute

    Parameters
    ----------
    seconds_left: int
        Number of seconds to be converted to the ETA
    Returns
    -------
    eta: str
        String representing the pretty ETA.
    """
    minutes_left, _ = divmod(seconds_left, 60)
    hours_left, minutes_left = divmod(minutes_left, 60)
    days_left, hours_left = divmod(hours_left, 24)

    def helper(cnt, name):
        plural = 's' if cnt > 1 else ''
        return "{} {}{}".format(str(cnt), name, plural)

    if days_left > 0:
        parts = [helper(days_left, 'day')]
        if hours_left > 0:
            parts.append(helper(hours_left, 'hour'))
        return ' and '.join(parts)
    if hours_left > 0:
        parts = [helper(hours_left, 'hour')]
        if minutes_left > 0:
            parts.append(helper(minutes_left, 'minute'))
        return ' and '.join(parts)
    if minutes_left > 0:
        return helper(minutes_left, 'minute')
    return 'less than a minute'
|
||||||
|
|
||||||
|
|
||||||
|
class RunningAvg(object):
    """Exponentially weighted running estimate of a quantity."""

    def __init__(self, gamma, init_value=None):
        """Keep a running estimate of a quantity. This is a bit like a mean
        but more sensitive to recent changes.

        Parameters
        ----------
        gamma: float
            Must be between 0 and 1, where 0 is the most sensitive to recent
            changes.
        init_value: float or None
            Initial value of the estimate. If None, it will be set on the first update.
        """
        self._gamma = gamma
        self._value = init_value

    def update(self, new_val):
        """Update the estimate.

        Parameters
        ----------
        new_val: float
            newly observed value of the estimated quantity.
        """
        if self._value is None:
            # First observation: adopt it as is.
            self._value = new_val
            return
        decay = self._gamma
        self._value = decay * self._value + (1.0 - decay) * new_val

    def __float__(self):
        """Get the current estimate"""
        return self._value
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleMonitor(gym.Wrapper):
    # Environment wrapper that records per-episode rewards, lengths and end
    # times, and exposes them through `info` and get_state()/set_state().
    # NOTE(review): `_reset` / `_step` look like the underscore hooks of an
    # older gym.Wrapper API -- confirm against the installed gym version.

    def __init__(self, env):
        """Adds two quantities to the info returned by every step:

            info['steps']: int
                Number of steps taken so far
            info['rewards']: [float]
                All the cumulative rewards for the episodes completed so far.
        """
        super().__init__(env)
        # current episode state
        self._current_reward = None
        self._num_steps = None
        # temporary monitor state that we do not save
        self._time_offset = None
        self._total_steps = None
        # monitor state
        self._episode_rewards = []
        self._episode_lengths = []
        self._episode_end_times = []

    def _reset(self):
        """Reset the wrapped env, finalize the previous episode's statistics, return the observation."""
        obs = self.env.reset()
        # recompute temporary state if needed
        if self._time_offset is None:
            self._time_offset = time.time()
            # Shift the offset back by the last recorded end time so that
            # new end times continue from it.
            if len(self._episode_end_times) > 0:
                self._time_offset -= self._episode_end_times[-1]
        if self._total_steps is None:
            self._total_steps = sum(self._episode_lengths)
        # update monitor state
        # (skipped on the very first reset, when no episode has run yet)
        if self._current_reward is not None:
            self._episode_rewards.append(self._current_reward)
            self._episode_lengths.append(self._num_steps)
            self._episode_end_times.append(time.time() - self._time_offset)
        # reset episode state
        self._current_reward = 0
        self._num_steps = 0

        return obs

    def _step(self, action):
        """Step the wrapped env, update the counters and annotate `info`."""
        obs, rew, done, info = self.env.step(action)
        self._current_reward += rew
        self._num_steps += 1
        # _total_steps is only initialized in _reset, so a reset must come first.
        self._total_steps += 1
        info['steps'] = self._total_steps
        # The list object itself is handed out, not a copy.
        info['rewards'] = self._episode_rewards
        return (obs, rew, done, info)

    def get_state(self):
        """Return the monitor state (env id plus episode statistics) as a dict."""
        return {
            'env_id': self.env.unwrapped.spec.id,
            'episode_data': {
                'episode_rewards': self._episode_rewards,
                'episode_lengths': self._episode_lengths,
                'episode_end_times': self._episode_end_times,
                'initial_reset_time': 0,
            }
        }

    def set_state(self, state):
        """Restore episode statistics from a dict produced by `get_state`.

        Asserts that the state belongs to the same environment id. The
        temporary state (_time_offset, _total_steps) is left untouched.
        """
        assert state['env_id'] == self.env.unwrapped.spec.id
        ed = state['episode_data']
        self._episode_rewards = ed['episode_rewards']
        self._episode_lengths = ed['episode_lengths']
        self._episode_end_times = ed['episode_end_times']
|
||||||
|
|
||||||
|
|
||||||
|
def boolean_flag(parser, name, default=False, help=None):
    """Register a paired on/off switch on an argparse parser.

    Parameters
    ----------
    parser: argparse.Parser
        parser that receives the two options
    name: str
        --<name> turns the flag on, --no-<name> turns it off
    default: bool or None
        value used when neither option is given
    help: str
        help text attached to the --<name> option
    """
    # dashes are not valid in attribute names, so the destination uses underscores
    target = name.replace('-', '_')
    enable_option = "--" + name
    disable_option = "--no-" + name
    parser.add_argument(enable_option, action="store_true", default=default, dest=target, help=help)
    parser.add_argument(disable_option, action="store_false", dest=target)
|
||||||
|
|
||||||
|
|
||||||
|
def get_wrapper_by_name(env, classname):
    """Walk down a stack of gym wrappers and return the one named `classname`.

    Parameters
    ----------
    env: gym.Env or gym.Wrapper
        gym environment, possibly wrapped several times
    classname: str
        name of the wrapper to look for

    Returns
    -------
    wrapper: gym.Wrapper
        the wrapper whose class_name() equals classname

    Raises
    ------
    ValueError
        if the innermost environment is reached without a match
    """
    candidate = env
    while classname != candidate.class_name():
        # a non-wrapper has nothing underneath it, so the search is over
        if not isinstance(candidate, gym.Wrapper):
            raise ValueError("Couldn't find wrapper named %s" % classname)
        candidate = candidate.env
    return candidate
|
||||||
|
|
||||||
|
|
||||||
|
def relatively_safe_pickle_dump(obj, path, compression=False):
    """Pickle `obj` to `path` without ever leaving a half-written file there.

    The data is first written to a sibling temp file and only then moved
    over `path`, so compared to a plain pickle dump:

    - `path` never holds a corrupted pickle.
    - If a different file already existed at `path`, it is left unchanged
      when the dump fails (provided the filesystem replace is atomic).
    - A failed dump can leave a useless ``<path>.relatively_safe`` file
      behind; the next call for the same path overwrites it.

    The intended use case is periodic checkpoints of experiment state, such
    that a failing checkpoint never corrupts the previous one.

    Parameters
    ----------
    obj: object
        object to pickle
    path: str
        path to the output file
    compression: bool
        if true the pickle is stored as entry "data" of a deflated zip archive
    """
    temp_storage = path + ".relatively_safe"
    if compression:
        # Using gzip here would be simpler, but the size is limited to 2GB
        with tempfile.NamedTemporaryFile() as uncompressed_file:
            pickle.dump(obj, uncompressed_file)
            # The archive is built by re-reading the temp file by name, so the
            # buffered bytes must reach the disk first; without this flush a
            # small pickle is archived truncated or empty.
            uncompressed_file.flush()
            with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
                myzip.write(uncompressed_file.name, "data")
    else:
        with open(temp_storage, "wb") as f:
            pickle.dump(obj, f)
    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename raises on Windows when `path` already exists.
    os.replace(temp_storage, path)
|
||||||
|
|
||||||
|
|
||||||
|
def pickle_load(path, compression=False):
    """Load a pickle that may have been stored inside a zip archive.

    NOTE: unpickling can execute arbitrary code; only load files you trust.

    Parameters
    ----------
    path: str
        path to the pickled file
    compression: bool
        if true the file is treated as a zip archive whose "data" entry
        holds the pickle.

    Returns
    -------
    obj: object
        the unpickled object
    """
    if not compression:
        with open(path, "rb") as raw:
            return pickle.load(raw)

    with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as archive:
        with archive.open("data") as member:
            return pickle.load(member)
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
from mpi4py import MPI
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class MpiAdam(object):
    """Adam optimizer whose gradients are summed across MPI workers.

    The moment estimates live in numpy arrays on each worker; parameters are
    read from and written to the TensorFlow variables in `var_list` through
    flat get/set helpers.
    """

    def __init__(self, var_list, *,
                 beta1=0.9, beta2=0.999, epsilon=1e-08,
                 scale_grad_by_procs=True,
                 comm=None):
        """
        Parameters
        ----------
        var_list: list of variables
            variables updated by this optimizer
        beta1, beta2, epsilon: float
            Adam hyper-parameters
        scale_grad_by_procs: bool
            if true the summed gradient is divided by the number of workers
        comm: MPI communicator or None
            communicator to use; MPI.COMM_WORLD when None
        """
        self.var_list = var_list
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.scale_grad_by_procs = scale_grad_by_procs

        # one flat slot per scalar parameter for the first and second moments
        n_params = sum(U.numel(v) for v in var_list)
        self.m = np.zeros(n_params, 'float32')
        self.v = np.zeros(n_params, 'float32')

        self.t = 0
        self.setfromflat = U.SetFromFlat(var_list)
        self.getflat = U.GetFlat(var_list)
        self.comm = MPI.COMM_WORLD if comm is None else comm

    def update(self, localg, stepsize):
        """Apply one Adam step using the all-reduced flat gradient `localg`."""
        # every 100 steps, verify that the workers still agree with the root
        if self.t % 100 == 0:
            self.check_synced()

        local_grad = localg.astype('float32')
        global_grad = np.zeros_like(local_grad)
        self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM)
        if self.scale_grad_by_procs:
            global_grad /= self.comm.Get_size()

        self.t += 1
        # step size with Adam's bias correction folded in
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * global_grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * (global_grad * global_grad)
        step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
        self.setfromflat(self.getflat() + step)

    def sync(self):
        """Broadcast the root's parameters and load them on this worker."""
        theta = self.getflat()
        self.comm.Bcast(theta, root=0)
        self.setfromflat(theta)

    def check_synced(self):
        """Assert that this worker's parameters equal the root's."""
        if self.comm.Get_rank() == 0:  # this is root
            self.comm.Bcast(self.getflat(), root=0)
            return

        local_theta = self.getflat()
        root_theta = np.empty_like(local_theta)
        self.comm.Bcast(root_theta, root=0)
        assert (root_theta == local_theta).all(), (root_theta, local_theta)
|
||||||
|
|
||||||
|
|
||||||
|
@U.in_session
def test_MpiAdam():
    """Print the losses of 10 TensorFlow Adam steps, then of 10 MpiAdam steps."""
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # reference run: TensorFlow's own Adam
    tf.get_default_session().run(tf.global_variables_initializer())
    for step_idx in range(10):
        print(step_idx, do_update())

    # start again from the initial values for the MpiAdam run
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    flat_grad = U.flatgrad(loss, var_list)
    lossandgrad = U.function([], [loss, flat_grad], updates=[update_op])
    adam = MpiAdam(var_list)

    for step_idx in range(10):
        loss_value, grad_value = lossandgrad()
        adam.update(grad_value, stepsize)
        print(step_idx, loss_value)
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
import os, subprocess, sys
|
||||||
|
|
||||||
|
|
||||||
|
def mpi_fork(n, bind_to_core=False):
    """Re-launch the current script under mpirun with `n` workers.

    Returns "parent" for the original process (after mpirun has finished)
    and "child" for MPI children. When n <= 1, or when the IN_MPI
    environment variable is already set, nothing is launched and "child"
    is returned.
    """
    if n <= 1 or os.getenv("IN_MPI") is not None:
        return "child"

    # one thread per worker for MKL/OpenMP, and IN_MPI marks the children
    child_env = dict(
        os.environ,
        MKL_NUM_THREADS="1",
        OMP_NUM_THREADS="1",
        IN_MPI="1",
    )
    command = ["mpirun", "-np", str(n)]
    if bind_to_core:
        command.extend(["-bind-to", "core"])
    command.append(sys.executable)
    command.extend(sys.argv)
    subprocess.check_call(command, env=child_env)
    return "parent"
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
from mpi4py import MPI
|
||||||
|
import numpy as np
|
||||||
|
from baselines.baselines_common import zipsame
|
||||||
|
|
||||||
|
|
||||||
|
def mpi_moments(x, axis=0):
    """Compute mean and standard deviation along `axis`, pooled over all MPI workers.

    Each worker contributes its local sum, sum of squares and element count
    along `axis`; the three are combined with a single Allreduce on
    MPI.COMM_WORLD.

    Returns
    -------
    (mean, std, count)
        `count` is the pooled number of entries along `axis`. When it is
        zero, mean and std are NaN arrays shaped like `x` without `axis`;
        otherwise they are flat arrays over the remaining dimensions.
    """
    x = np.asarray(x, dtype='float64')
    reduced_shape = list(x.shape)
    reduced_shape.pop(axis)
    n = np.prod(reduced_shape, dtype=int)

    # pack [sums | sums of squares | count] so that one Allreduce is enough
    local_stats = np.concatenate([
        x.sum(axis=axis).ravel(),
        np.square(x).sum(axis=axis).ravel(),
        np.array([x.shape[axis]], dtype='float64'),
    ])
    pooled_stats = np.zeros(n * 2 + 1, 'float64')
    MPI.COMM_WORLD.Allreduce(local_stats, pooled_stats, op=MPI.SUM)

    total = pooled_stats[:n]
    total_sq = pooled_stats[n:2 * n]
    count = pooled_stats[2 * n]

    if count == 0:
        mean = np.full(reduced_shape, np.nan)
        std = np.full(reduced_shape, np.nan)
    else:
        mean = total / count
        # clamp at zero: rounding can push the variance slightly negative
        std = np.sqrt(np.maximum(total_sq / count - np.square(mean), 0))
    return mean, std, count
|
||||||
|
|
||||||
|
|
||||||
|
def test_runningmeanstd():
    """Compare mpi_moments with numpy on data split across the MPI ranks.

    Each rank feeds the slice of the triple selected by its rank index.
    """
    comm = MPI.COMM_WORLD
    np.random.seed(0)
    cases = [
        ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0),
        ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0),
        ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1),
    ]
    for (triple, axis) in cases:
        # reference statistics computed on the concatenated data
        x = np.concatenate(triple, axis=axis)
        expected = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]

        actual = mpi_moments(triple[comm.Get_rank()], axis=axis)

        for (want, got) in zipsame(expected, actual):
            print(want, got)
            assert np.allclose(want, got)
            print("ok!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Meant to be launched under MPI, e.g.: mpirun -np 3 python <script>
|
||||||
|
test_runningmeanstd()
|
||||||
@@ -0,0 +1,112 @@
|
|||||||
|
from mpi4py import MPI
|
||||||
|
import tensorflow as tf
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class RunningMeanStd(object):
    """Running mean and standard deviation kept in TensorFlow variables.

    Sums, sums of squares and counts are accumulated across MPI workers.
    """
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

    def __init__(self, epsilon=1e-2, shape=()):
        """
        Parameters
        ----------
        epsilon: float
            initial value of the sum-of-squares and count accumulators
        shape: tuple
            shape of a single sample
        """
        self._sum = tf.get_variable(
            name="runningsum", trainable=False,
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(0.0))
        self._sumsq = tf.get_variable(
            name="runningsumsq", trainable=False,
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon))
        self._count = tf.get_variable(
            name="count", trainable=False,
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon))
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        # the variance is floored at 1e-2 before the square root
        mean_of_squares = tf.to_float(self._sumsq / self._count)
        self.std = tf.sqrt(tf.maximum(mean_of_squares - tf.square(self.mean), 1e-2))

        # placeholders feeding the increments for the three accumulators
        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        increments = [
            tf.assign_add(self._sum, newsum),
            tf.assign_add(self._sumsq, newsumsq),
            tf.assign_add(self._count, newcount),
        ]
        self.incfiltparams = U.function([newsum, newsumsq, newcount], [], updates=increments)

    def update(self, x):
        """Fold the batch `x` (samples along axis 0) from every worker into the statistics."""
        x = x.astype('float64')
        n = int(np.prod(self.shape))
        # pack [sums | sums of squares | count] so that one Allreduce is enough
        local_stats = np.concatenate([
            x.sum(axis=0).ravel(),
            np.square(x).sum(axis=0).ravel(),
            np.array([len(x)], dtype='float64'),
        ])
        pooled_stats = np.zeros(n * 2 + 1, 'float64')
        MPI.COMM_WORLD.Allreduce(local_stats, pooled_stats, op=MPI.SUM)
        batch_sum = pooled_stats[0:n].reshape(self.shape)
        batch_sumsq = pooled_stats[n:2 * n].reshape(self.shape)
        self.incfiltparams(batch_sum, batch_sumsq, pooled_stats[2 * n])
|
||||||
|
|
||||||
|
|
||||||
|
@U.in_session
def test_runningmeanstd():
    """Check RunningMeanStd against numpy after three incremental updates."""
    cases = [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]
    for (x1, x2, x3) in cases:
        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
        U.initialize()

        # reference statistics over all the data at once
        x = np.concatenate([x1, x2, x3], axis=0)
        expected = [x.mean(axis=0), x.std(axis=0)]

        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        actual = U.eval([rms.mean, rms.std])

        assert np.allclose(expected, actual)
|
||||||
|
|
||||||
|
|
||||||
|
@U.in_session
def test_dist():
    """Two-rank check that RunningMeanStd pools the batches of both workers."""
    np.random.seed(0)
    p1, p2, p3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1))
    q1, q2, q3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1))

    comm = MPI.COMM_WORLD
    assert comm.Get_size() == 2
    rank = comm.Get_rank()
    # rank 0 feeds the p batches, rank 1 the q batches
    if rank == 0:
        x1, x2, x3 = p1, p2, p3
    elif rank == 1:
        x1, x2, x3 = q1, q2, q3
    else:
        assert False

    rms = RunningMeanStd(epsilon=0.0, shape=(1,))
    U.initialize()

    rms.update(x1)
    rms.update(x2)
    rms.update(x3)

    # the pooled statistics must match numpy over all six batches
    bigvec = np.concatenate([p1, p2, p3, q1, q2, q3])

    def checkallclose(x, y):
        print(x, y)
        return np.allclose(x, y)

    expected_mean = bigvec.mean(axis=0)
    assert checkallclose(expected_mean, U.eval(rms.mean))
    expected_std = bigvec.std(axis=0)
    assert checkallclose(expected_std, U.eval(rms.std))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Run with mpirun -np 2 python <filename> (test_dist asserts exactly 2 ranks)
|
||||||
|
test_dist()
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
from mpi4py import MPI
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
|
||||||
|
# Checkpoint helper built on tf.train.Saver; `t` is the counter passed to
# the saver as global_step.
class MpiSaver(object):
|
||||||
|
# var_list: variables handed to tf.train.Saver (None is passed through as-is)
# comm: MPI communicator; MPI.COMM_WORLD is used when None
# log_prefix: directory prefix of the "model.ckpt" checkpoint path
def __init__(self, var_list=None, *,
|
||||||
|
comm=None,
|
||||||
|
log_prefix="/tmp"):
|
||||||
|
self.var_list = var_list
|
||||||
|
self.t = 0
|
||||||
|
|
||||||
|
self.saver = tf.train.Saver(
|
||||||
|
var_list=var_list,
|
||||||
|
max_to_keep=100,
|
||||||
|
keep_checkpoint_every_n_hours=0.25,
|
||||||
|
pad_step_number=True,
|
||||||
|
save_relative_paths=True)
|
||||||
|
self.log_prefix = log_prefix
|
||||||
|
|
||||||
|
self.comm = MPI.COMM_WORLD if comm is None else comm
|
||||||
|
|
||||||
|
# When `restore_from` is given, loads the variables from it and advances `t`
# by the integer that follows the last "-" in that path.
# NOTE(review): indentation is lost in this view; confirm whether sync()
# runs unconditionally or only after a restore.
def restore(self, restore_from=None):
|
||||||
|
if restore_from is not None:
|
||||||
|
self.saver.restore(U.get_session(), restore_from)
|
||||||
|
self.t += int(restore_from.split("-")[-1])
|
||||||
|
self.sync()
|
||||||
|
|
||||||
|
# On rank 0, writes "<log_prefix>/model.ckpt" with global_step=t.
# NOTE(review): confirm whether `t += 1` sits inside the rank check.
def sync(self):
|
||||||
|
if self.comm.Get_rank() == 0: # this is root
|
||||||
|
self.saver.save(
|
||||||
|
U.get_session(),
|
||||||
|
"{}/model.ckpt".format(self.log_prefix),
|
||||||
|
global_step=self.t)
|
||||||
|
self.t += 1
|
||||||
@@ -0,0 +1,99 @@
|
|||||||
|
"""This file is used for specifying various schedules that evolve over
|
||||||
|
time throughout the execution of the algorithm, such as:
|
||||||
|
- learning rate for the optimizer
|
||||||
|
- exploration epsilon for the epsilon greedy exploration strategy
|
||||||
|
- beta parameter of prioritized replay
|
||||||
|
|
||||||
|
Each schedule has a function `value(t)` which returns the current value
|
||||||
|
of the parameter given the timestep t of the optimization procedure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class Schedule(object):
    """Interface shared by all schedules: a time-dependent parameter."""

    def value(self, t):
        """Value of the schedule at time t"""
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class ConstantSchedule(object):
    """Schedule that returns the same value at every timestep."""

    def __init__(self, value):
        """Store the constant.

        Parameters
        ----------
        value: float
            Constant value of the schedule
        """
        self._v = value

    def value(self, t):
        """See Schedule.value"""
        # `t` is ignored: the schedule does not change over time
        constant = self._v
        return constant
|
||||||
|
|
||||||
|
|
||||||
|
def linear_interpolation(l, r, alpha):
    """Return the point a fraction `alpha` of the way from `l` to `r`."""
    span = r - l
    return l + alpha * span
|
||||||
|
|
||||||
|
|
||||||
|
class PiecewiseSchedule(object):
    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
        """Schedule defined by interpolating between consecutive endpoints.

        endpoints: [(int, int)]
            list of pairs `(time, value)` meaning that the schedule outputs
            `value` when `t==time`. The times must be sorted in increasing
            order. For `time_a <= t < time_b` between two neighbouring pairs
            `(time_a, value_a)` and `(time_b, value_b)` the schedule outputs
            `interpolation(value_a, value_b, alpha)`, where alpha is the
            fraction of the interval that has elapsed at `t`.
        interpolation: lambda float, float, float: float
            function of the value to the left of t, the value to the right
            of t and alpha. See linear_interpolation for an example.
        outside_value: float
            value returned when t falls outside every interval given in
            `endpoints`. If None, an AssertionError is raised instead.
        """
        times = [point[0] for point in endpoints]
        assert times == sorted(times)
        self._endpoints = endpoints
        self._interpolation = interpolation
        self._outside_value = outside_value

    def value(self, t):
        """See Schedule.value"""
        neighbours = zip(self._endpoints[:-1], self._endpoints[1:])
        for (left_t, left_v), (right_t, right_v) in neighbours:
            if left_t <= t < right_t:
                alpha = float(t - left_t) / (right_t - left_t)
                return self._interpolation(left_v, right_v, alpha)

        # t lies outside every piece: fall back to the outside value
        assert self._outside_value is not None
        return self._outside_value
|
||||||
|
|
||||||
|
|
||||||
|
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Anneal linearly from initial_p to final_p over schedule_timesteps.

        Once that many timesteps have passed, final_p is returned.

        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps over which initial_p is annealed to final_p
        initial_p: float
            initial output value
        final_p: float
            final output value
        """
        self.initial_p = initial_p
        self.final_p = final_p
        self.schedule_timesteps = schedule_timesteps

    def value(self, t):
        """See Schedule.value"""
        # fraction of the schedule completed, capped at 1.0
        progress = float(t) / self.schedule_timesteps
        if progress > 1.0:
            progress = 1.0
        delta = self.final_p - self.initial_p
        return self.initial_p + progress * delta
|
||||||
@@ -0,0 +1,146 @@
|
|||||||
|
import operator
|
||||||
|
|
||||||
|
|
||||||
|
class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):
        """Build a Segment Tree data structure.

        https://en.wikipedia.org/wiki/Segment_tree

        It behaves like a regular array, with two important differences:

            a) setting an item is O(lg capacity) instead of O(1).
            b) an efficient `reduce` is available, which folds `operation`
               over a contiguous range of items.

        Parameters
        ----------
        capacity: int
            Total size of the array - must be a power of two.
        operation: lambda obj, obj -> obj
            operation used to combine elements (eg. sum, max)
        neutral_element: obj
            neutral element for the operation above, eg. float('-inf')
            for max and 0 for sum.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        self._operation = operation
        # node 1 is the root; leaves occupy capacity .. 2 * capacity - 1
        self._value = [neutral_element] * (2 * capacity)

    def _reduce_helper(self, start, end, node, node_start, node_end):
        """Reduce the inclusive range [start, end] inside the subtree rooted at `node`."""
        if start == node_start and end == node_end:
            return self._value[node]

        mid = (node_start + node_end) // 2
        left_child = 2 * node
        right_child = left_child + 1
        if end <= mid:
            # the range lies entirely in the left half
            return self._reduce_helper(start, end, left_child, node_start, mid)
        if mid + 1 <= start:
            # the range lies entirely in the right half
            return self._reduce_helper(start, end, right_child, mid + 1, node_end)

        # the range straddles the midpoint: combine both halves
        left_part = self._reduce_helper(start, mid, left_child, node_start, mid)
        right_part = self._reduce_helper(mid + 1, end, right_child, mid + 1, node_end)
        return self._operation(left_part, right_part)

    def reduce(self, start=0, end=None):
        """Apply `self.operation` over a contiguous slice of the array.

        Parameters
        ----------
        start: int
            first index of the slice
        end: int
            index one past the last element of the slice; None means the
            whole capacity, and a negative value counts from the end

        Returns
        -------
        reduced: obj
            result of folding self.operation over the selected elements.
        """
        stop = self._capacity if end is None else end
        if stop < 0:
            stop += self._capacity
        # the helper works on inclusive bounds
        return self._reduce_helper(start, stop - 1, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # write the leaf, then refresh every ancestor up to the root
        node = idx + self._capacity
        self._value[node] = val
        node //= 2
        while node >= 1:
            left_child = 2 * node
            self._value[node] = self._operation(
                self._value[left_child],
                self._value[left_child + 1]
            )
            node //= 2

    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        leaf = self._capacity + idx
        return self._value[leaf]
|
||||||
|
|
||||||
|
|
||||||
|
class SumSegmentTree(SegmentTree):
    """Segment tree whose reduce operation is addition."""

    def __init__(self, capacity):
        super().__init__(
            capacity=capacity,
            operation=operator.add,
            neutral_element=0.0
        )

    def sum(self, start=0, end=None):
        """Returns the sum of arr[start], ..., arr[end - 1] (see SegmentTree.reduce)"""
        return super().reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum

        If the array values are probabilities, this samples indexes
        according to that discrete distribution efficiently.

        Parameters
        ----------
        prefixsum: float
            upper bound on the sum of the array prefix

        Returns
        -------
        idx: int
            highest index satisfying the prefixsum constraint
        """
        assert 0 <= prefixsum <= self.sum() + 1e-5
        node = 1
        while node < self._capacity:  # while non-leaf
            left_child = 2 * node
            left_total = self._value[left_child]
            if left_total > prefixsum:
                node = left_child
            else:
                # skip the whole left subtree and keep looking on the right
                prefixsum -= left_total
                node = left_child + 1
        return node - self._capacity
|
||||||
|
|
||||||
|
|
||||||
|
class MinSegmentTree(SegmentTree):
    """Segment tree whose reduce operation is the minimum."""

    def __init__(self, capacity):
        super().__init__(
            capacity=capacity,
            operation=min,
            neutral_element=float('inf')
        )

    def min(self, start=0, end=None):
        """Returns the min of arr[start], ..., arr[end - 1] (see SegmentTree.reduce)"""
        return super().reduce(start, end)
|
||||||
@@ -0,0 +1,753 @@
|
|||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf # pylint: ignore-module
|
||||||
|
import builtins
|
||||||
|
import functools
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
import collections
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Make consistent with numpy
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
# numpy-style alias for tf.clip_by_value
clip = tf.clip_by_value
|
||||||
|
|
||||||
|
|
||||||
|
def sum(x, axis=None, keepdims=False):
    """numpy-style wrapper around tf.reduce_sum; `axis` is a single int or None (reduce over all dims)."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
|
||||||
|
|
||||||
|
|
||||||
|
def mean(x, axis=None, keepdims=False):
    """numpy-style wrapper around tf.reduce_mean; `axis` is a single int or None (reduce over all dims)."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
|
||||||
|
|
||||||
|
|
||||||
|
def var(x, axis=None, keepdims=False):
    """Variance of `x`: the mean squared deviation from the mean along `axis`."""
    center = mean(x, axis=axis, keepdims=keepdims)
    squared_deviation = tf.square(x - center)
    return mean(squared_deviation, axis=axis, keepdims=keepdims)
|
||||||
|
|
||||||
|
|
||||||
|
def std(x, axis=None, keepdims=False):
    """Standard deviation of `x`: the square root of `var` along `axis`."""
    variance = var(x, axis=axis, keepdims=keepdims)
    return tf.sqrt(variance)
|
||||||
|
|
||||||
|
|
||||||
|
def max(x, axis=None, keepdims=False):
    """numpy-style wrapper around tf.reduce_max; `axis` is a single int or None (reduce over all dims)."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
|
||||||
|
|
||||||
|
|
||||||
|
def min(x, axis=None, keepdims=False):
    """numpy-style wrapper around tf.reduce_min; `axis` is a single int or None (reduce over all dims)."""
    if axis is not None:
        axis = [axis]
    return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
|
||||||
|
|
||||||
|
|
||||||
|
def concatenate(arrs, axis=0):
    """numpy-style argument order for tf.concat: join `arrs` along `axis`."""
    return tf.concat(values=arrs, axis=axis)
|
||||||
|
|
||||||
|
|
||||||
|
def argmax(x, axis=None):
    """numpy-style wrapper that forwards `x` and `axis` to tf.argmax."""
    indices = tf.argmax(x, axis=axis)
    return indices
|
||||||
|
|
||||||
|
|
||||||
|
def switch(condition, then_expression, else_expression):
    """Pick one of two operations based on a scalar value (int or bool).

    Both `then_expression` and `else_expression` should be symbolic
    tensors of the *same shape*.

    # Arguments
        condition: scalar tensor.
        then_expression: TensorFlow operation.
        else_expression: TensorFlow operation.
    """
    # remember the static shape so it can be re-applied to the tf.cond output
    expected_shape = copy.copy(then_expression.get_shape())
    predicate = tf.cast(condition, 'bool')
    chosen = tf.cond(predicate,
                     lambda: then_expression,
                     lambda: else_expression)
    chosen.set_shape(expected_shape)
    return chosen
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Extras
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def l2loss(params):
    """Sum of squared entries over all tensors in `params` (0.0 for an empty list)."""
    if len(params) == 0:
        return tf.constant(0.0)
    # `sum` is this module's reduce-sum wrapper
    squared_norms = [sum(tf.square(p)) for p in params]
    return tf.add_n(squared_norms)
|
||||||
|
|
||||||
|
|
||||||
|
def lrelu(x, leak=0.2):
    """Leaky ReLU written as a blend of `x` and `abs(x)`.

    For x >= 0 the result is x, for x < 0 it is leak * x.
    """
    linear_weight = 0.5 * (1 + leak)
    abs_weight = 0.5 * (1 - leak)
    return linear_weight * x + abs_weight * abs(x)
|
||||||
|
|
||||||
|
|
||||||
|
def categorical_sample_logits(X):
    """Perturb the logits `X` with -log(-log(U)), U uniform, and take the argmax along axis 1."""
    # https://github.com/tensorflow/tensorflow/issues/456
    uniform_noise = tf.random_uniform(tf.shape(X))
    perturbed = X - tf.log(-tf.log(uniform_noise))
    return argmax(perturbed, axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Inputs
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def is_placeholder(x):
    """True when `x` is exactly a tf.Tensor whose producing op has no inputs."""
    if type(x) is not tf.Tensor:
        return False
    return len(x.op.inputs) == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TfInput(object):
    def __init__(self, name="(unnamed)"):
        """Generalized Tensorflow placeholder. The main differences are:
            - possibly uses multiple placeholders internally and returns multiple values
            - can apply light postprocessing to the value fed to the placeholder.
        """
        self.name = name

    def get(self):
        """Return the tf variable(s) representing the possibly postprocessed value
        of placeholder(s).

        Subclasses must override this method.

        Raises
        ------
        NotImplementedError
            always, in this base class
        """
        # NotImplemented is a comparison sentinel, not an exception, and is not
        # callable; NotImplementedError is the intended signal.
        raise NotImplementedError()

    def make_feed_dict(self, data):
        """Given data input it to the placeholder(s).

        Subclasses must override this method.

        Raises
        ------
        NotImplementedError
            always, in this base class
        """
        raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
class PlacholderTfInput(TfInput):
    def __init__(self, placeholder):
        """Wrapper for regular tensorflow placeholder."""
        # the wrapper takes its name from the placeholder
        super().__init__(placeholder.name)
        self._placeholder = placeholder

    def get(self):
        """Return the wrapped placeholder unchanged."""
        return self._placeholder

    def make_feed_dict(self, data):
        """Return a feed dict that maps the placeholder to `data`."""
        feed = {self._placeholder: data}
        return feed
|
||||||
|
|
||||||
|
|
||||||
|
class BatchInput(PlacholderTfInput):
    def __init__(self, shape, dtype=tf.float32, name=None):
        """Create a placeholder for a batch of tensors of a given shape and dtype.

        Parameters
        ----------
        shape: [int]
            shape of a single element of the batch
        dtype: tf.dtype
            number representation used for tensor contents
        name: str
            name of the underlying placeholder
        """
        # the leading None leaves the batch dimension unspecified
        batched_shape = [None] + list(shape)
        super().__init__(tf.placeholder(dtype, batched_shape, name=name))
|
||||||
|
|
||||||
|
|
||||||
|
class Uint8Input(PlacholderTfInput):
    def __init__(self, shape, name=None):
        """Take input in uint8 format, cast it to float32 and divide it by 255
        before passing it to the model.

        On GPU this ensures lower data transfer times.

        Parameters
        ----------
        shape: [int]
            shape of the tensor.
        name: str
            name of the underlying placeholder
        """
        # the leading None leaves the batch dimension unspecified
        batched_shape = [None] + list(shape)
        super().__init__(tf.placeholder(tf.uint8, batched_shape, name=name))
        self._shape = shape
        as_float = tf.cast(super().get(), tf.float32)
        self._output = as_float / 255.0

    def get(self):
        """Return the scaled float32 tensor rather than the raw uint8 placeholder."""
        return self._output
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_tf_input(thing):
    """Takes either tf.placeholder or TfInput and outputs equivalent TfInput

    Raises ValueError when `thing` is neither.
    """
    if isinstance(thing, TfInput):
        return thing
    if is_placeholder(thing):
        return PlacholderTfInput(thing)
    raise ValueError("Must be a placeholder or TfInput")
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Mathematical utils
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def huber_loss(x, delta=1.0):
    """Elementwise Huber loss of `x` with threshold `delta`.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    """
    within_delta = tf.abs(x) < delta
    quadratic_part = tf.square(x) * 0.5
    linear_part = delta * (tf.abs(x) - 0.5 * delta)
    return tf.where(within_delta, quadratic_part, linear_part)
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Optimizer utils
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` with `optimizer` w.r.t. the variables in `var_list`,
    clipping each variable's gradient to norm `clip_val` before it is applied.

    Variables whose gradient is None are passed through untouched.
    """
    grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list)
    clipped = [
        (grad if grad is None else tf.clip_by_norm(grad, clip_val), var)
        for grad, var in grads_and_vars
    ]
    return optimizer.apply_gradients(clipped)
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Global session
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
def get_session():
    """Return whatever `tf.get_default_session()` currently reports."""
    session = tf.get_default_session()
    return session
|
||||||
|
|
||||||
|
|
||||||
|
def make_session(num_cpu):
    """Create a session whose inter-op and intra-op thread pools are both sized to <num_cpu>."""
    thread_settings = {
        "inter_op_parallelism_threads": num_cpu,
        "intra_op_parallelism_threads": num_cpu,
    }
    tf_config = tf.ConfigProto(**thread_settings)
    return tf.Session(config=tf_config)
|
||||||
|
|
||||||
|
|
||||||
|
def single_threaded_session():
    """Shortcut for `make_session` with a single CPU thread."""
    return make_session(num_cpu=1)
|
||||||
|
|
||||||
|
|
||||||
|
# Variables that a previous `initialize()` call has already run initializers for.
ALREADY_INITIALIZED = set()


def initialize():
    """Run initializers for every global variable not yet handled by this function."""
    pending = set(tf.global_variables()).difference(ALREADY_INITIALIZED)
    init_op = tf.variables_initializer(pending)
    get_session().run(init_op)
    ALREADY_INITIALIZED.update(pending)
|
||||||
|
|
||||||
|
|
||||||
|
def eval(expr, feed_dict=None):
    """Run `expr` in the default session; a missing `feed_dict` is treated as empty."""
    feeds = {} if feed_dict is None else feed_dict
    return get_session().run(expr, feed_dict=feeds)
|
||||||
|
|
||||||
|
|
||||||
|
# variable -> (assign op, placeholder feeding that op); built lazily by set_value.
VALUE_SETTERS = collections.OrderedDict()


def set_value(v, val):
    """Assign `val` to variable `v`, reusing one cached assign op per variable."""
    cached = VALUE_SETTERS.get(v)
    if cached is None:
        # First assignment to this variable: build the placeholder/assign pair once.
        set_endpoint = tf.placeholder(v.dtype)
        set_op = v.assign(set_endpoint)
        VALUE_SETTERS[v] = (set_op, set_endpoint)
    else:
        set_op, set_endpoint = cached
    get_session().run(set_op, feed_dict={set_endpoint: val})
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Saving variables
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def load_state(fname):
    """Restore variables from checkpoint `fname` into the default session."""
    tf.train.Saver().restore(get_session(), fname)
|
||||||
|
|
||||||
|
|
||||||
|
def save_state(fname):
    """Save variables of the default session to checkpoint `fname`.

    The parent directory of `fname` is created when missing. A bare file name
    (no directory component) is saved relative to the current directory.
    """
    directory = os.path.dirname(fname)
    # os.makedirs('') raises FileNotFoundError, so only create a real directory part.
    if directory:
        os.makedirs(directory, exist_ok=True)
    saver = tf.train.Saver()
    saver.save(get_session(), fname)
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Model components
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def normc_initializer(std=1.0):
    """Build an initializer that draws Gaussian weights and rescales them so the
    L2 norm taken along axis 0 equals `std`.
    """
    def _initializer(shape, dtype=None, partition_info=None):  # pylint: disable=W0613
        sample = np.random.randn(*shape).astype(np.float32)
        axis0_norms = np.sqrt(np.square(sample).sum(axis=0, keepdims=True))
        sample *= std / axis0_norms
        return tf.constant(sample)

    return _initializer
|
||||||
|
|
||||||
|
|
||||||
|
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None,
           summary_tag=None):
    """2-D convolution layer with bias, created under variable scope `name`.

    Parameters
    ----------
    x: tensor
        input; axis 3 is read as the number of input channels
        (assumes NHWC layout -- TODO confirm against callers).
    num_filters: int
        number of output feature maps.
    name: str
        variable scope holding the "W" and "b" variables.
    filter_size, stride: (int, int)
        spatial size of the kernel and the stride along the two middle axes.
    pad: str
        padding mode forwarded to `tf.nn.conv2d`.
    dtype: tf.dtype
        dtype of the weight variable.
    collections:
        forwarded to `tf.get_variable` for both variables.
    summary_tag: str or None
        when given, an image summary of the filters is registered under this tag.

    Returns
    -------
    The convolution output plus bias.
    """
    with tf.variable_scope(name):
        stride_shape = [1, stride[0], stride[1], 1]
        filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters]

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = intprod(filter_shape[:3])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        # pooling size
        fan_out = intprod(filter_shape[:2]) * num_filters
        # initialize weights with random weights
        w_bound = np.sqrt(6. / (fan_in + fan_out))

        w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound),
                            collections=collections)
        b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(),
                            collections=collections)

        if summary_tag is not None:
            # tf.summary.image names this argument `max_outputs`; `max_images`
            # belonged to the removed tf.image_summary and raises TypeError here.
            tf.summary.image(summary_tag,
                             tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]),
                                          [2, 0, 1, 3]),
                             max_outputs=10)

        return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||||
|
|
||||||
|
|
||||||
|
def dense(x, size, name, weight_init=None, bias=True):
    """Fully connected layer: `x @ w`, plus a zero-initialised bias when `bias` is true.

    Variables are created as `name + "/w"` and `name + "/b"`.
    """
    weights = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
    product = tf.matmul(x, weights)
    if not bias:
        return product
    offset = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
    return product + offset
|
||||||
|
|
||||||
|
|
||||||
|
def wndense(x, size, name, init_scale=1.0):
    """Weight-normalized fully connected layer.

    Creates three variables: direction matrix `name + "/V"`, per-output gain
    `name + "/g"` (initialised to `init_scale`) and bias `name + "/b"`
    (initialised to 0). Returns `(g / ||V||) * (x @ V) + b`, with the norm
    taken over axis 0 of V.
    """
    v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
                        initializer=tf.random_normal_initializer(0, 0.05))
    g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))

    # use weight normalization (Salimans & Kingma, 2016)
    x = tf.matmul(x, v)
    # NOTE(review): `sum` is called with axis/keepdims, so it must be a
    # module-level reduction helper defined elsewhere in this file, not the
    # builtin -- confirm.
    scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
    return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
|
||||||
|
|
||||||
|
|
||||||
|
def densenobias(x, size, name, weight_init=None):
    """Same as `dense`, with the bias term switched off."""
    return dense(x, size, name, weight_init, False)
|
||||||
|
|
||||||
|
|
||||||
|
def dropout(x, pkeep, phase=None, mask=None):
    """Multiply `x` by a 0/1 mask drawn with keep-probability `pkeep`.

    A caller-supplied `mask` is used as-is. When `phase` is given, the result
    is `switch(phase, mask * x, pkeep * x)`; otherwise it is `mask * x`.
    """
    if mask is None:
        mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x)))
    masked = mask * x
    if phase is None:
        return masked
    return switch(phase, masked, pkeep * x)
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Theano-like Function
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def function(inputs, outputs, updates=None, givens=None):
    """Theano-style function factory: turn placeholders and graph expressions
    into a Python callable f(inputs) -> outputs.

    The callable takes the values to feed to the inputs and returns the
    evaluated `outputs`. Values may be passed positionally, in the order of
    `inputs`, or as keyword arguments keyed by placeholder name (passed to the
    constructor or accessible via placeholder.op.name).

    Example:
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(x=3) == 9
            assert lin(2, 2) == 10
            assert lin(x=2, y=3) == 12

    Parameters
    ----------
    inputs: [tf.placeholder or TfInput]
        list of input arguments
    outputs: [tf.Variable] or tf.Variable
        list of outputs or a single output to be returned from function. The
        returned value has the same structure: a list for a list, a mapping of
        the same type for a dict, a single value otherwise.
    updates:
        ops run together with the outputs on every call.
    givens:
        default feed values for inputs that the caller does not supply.
    """
    if isinstance(outputs, list):
        return _Function(inputs, outputs, updates, givens=givens)

    if isinstance(outputs, (dict, collections.OrderedDict)):
        f = _Function(inputs, outputs.values(), updates, givens=givens)

        def call_as_mapping(*args, **kwargs):
            # Re-attach the original keys to the positional results.
            return type(outputs)(zip(outputs.keys(), f(*args, **kwargs)))

        return call_as_mapping

    f = _Function(inputs, [outputs], updates, givens=givens)

    def call_single(*args, **kwargs):
        return f(*args, **kwargs)[0]

    return call_single
|
||||||
|
|
||||||
|
|
||||||
|
class _Function(object):
    """Callable that feeds values to graph inputs and runs the outputs together
    with a group of update ops in the default session.
    """

    def __init__(self, inputs, outputs, updates, givens, check_nan=False):
        """Validate `inputs` and bundle `outputs` with the grouped `updates`."""
        for candidate in inputs:
            if issubclass(type(candidate), TfInput):
                continue
            assert len(candidate.op.inputs) == 0, "inputs should all be placeholders of src.baselines_common.TfInput"
        self.inputs = inputs
        self.update_group = tf.group(*(updates or []))
        # The update group is appended last so __call__ can strip it off again.
        self.outputs_update = list(outputs) + [self.update_group]
        self.givens = givens if givens is not None else {}
        self.check_nan = check_nan

    def _feed_input(self, feed_dict, inpt, value):
        """Record `value` for `inpt` in `feed_dict`; other input kinds are ignored."""
        if issubclass(type(inpt), TfInput):
            feed_dict.update(inpt.make_feed_dict(value))
            return
        if is_placeholder(inpt):
            feed_dict[inpt] = value

    def __call__(self, *args, **kwargs):
        """Feed positional then keyword values, fill in givens, run, and return
        the outputs as a list (without the update group's result).
        """
        assert len(args) <= len(self.inputs), "Too many arguments provided"
        feed_dict = {}
        # Update the args
        for inpt, value in zip(self.inputs, args):
            self._feed_input(feed_dict, inpt, value)
        # Update the kwargs
        seen_names = set()
        for inpt in self.inputs[len(args):]:
            # "scope/name:0" -> "name"
            short_name = inpt.name.partition(':')[0].rpartition('/')[2]
            assert short_name not in seen_names, \
                "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(short_name)
            if short_name not in kwargs:
                assert inpt in self.givens, "Missing argument " + short_name
                continue
            seen_names.add(short_name)
            self._feed_input(feed_dict, inpt, kwargs.pop(short_name))
        assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
        # Update feed dict with givens.
        for inpt in self.givens:
            feed_dict.setdefault(inpt, self.givens[inpt])
        fetched = get_session().run(self.outputs_update, feed_dict=feed_dict)
        results = fetched[:-1]
        if self.check_nan and any(np.isnan(r).any() for r in results):
            raise RuntimeError("Nan detected")
        return results
|
||||||
|
|
||||||
|
|
||||||
|
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
    """Wrap `_MemFriendlyFunction`; a non-list `outputs` yields a callable
    returning a single value instead of a list.
    """
    if isinstance(outputs, list):
        return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)

    f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)

    def call_single(*inputs):
        return f(*inputs)[0]

    return call_single
|
||||||
|
|
||||||
|
|
||||||
|
class _MemFriendlyFunction(object):
    """Evaluate outputs over the data inputs in slices of `batch_size` rows,
    add up the per-slice results and divide the totals by the row count.
    """

    def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
        self.nondata_inputs = nondata_inputs
        self.data_inputs = data_inputs
        self.outputs = list(outputs)
        self.batch_size = batch_size

    def __call__(self, *inputvals):
        """`inputvals` holds the non-data values first, then the data arrays,
        which must all share the same leading dimension.
        """
        n_nondata = len(self.nondata_inputs)
        assert len(inputvals) == n_nondata + len(self.data_inputs)
        nondata_vals = inputvals[:n_nondata]
        data_vals = inputvals[n_nondata:]
        feed_dict = dict(zip(self.nondata_inputs, nondata_vals))
        n = data_vals[0].shape[0]
        for v in data_vals[1:]:
            assert v.shape[0] == n
        for i_start in range(0, n, self.batch_size):
            i_stop = builtins.min(i_start + self.batch_size, n)
            for var, v in zip(self.data_inputs, data_vals):
                feed_dict[var] = v[i_start:i_stop]
            results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
            if i_start == 0:
                sum_results = results
            else:
                for i, partial in enumerate(results):
                    sum_results[i] = sum_results[i] + partial
        for i in range(len(results)):
            sum_results[i] = sum_results[i] / n
        return sum_results
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Modules
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class Module(object):
    """Graph-building callable that owns a variable scope, creates its variables
    on the first call, reuses them afterwards and caches results per argument tuple.

    Subclasses implement `_call`.
    """

    def __init__(self, name):
        self.name = name
        self.first_time = True
        self.scope = None
        self.cache = {}

    def __call__(self, *args):
        """Return the cached output for `args`, or build it inside the module's scope."""
        if args in self.cache:
            print("(%s) retrieving value from cache" % (self.name,))
            return self.cache[args]

        reuse_variables = not self.first_time
        with tf.variable_scope(self.name, reuse=reuse_variables):
            scope = tf.get_variable_scope().name
            if not self.first_time:
                assert self.scope == scope, "Tried calling function with a different scope"
                print("(%s) running function on new inputs" % (self.name,))
            else:
                self.scope = scope
                print("(%s) running function for the first time" % (self.name,))
            self.first_time = False
            out = self._call(*args)

        self.cache[args] = out
        return out

    def _call(self, *args):
        """Build and return the module's output; must be overridden."""
        raise NotImplementedError

    @property
    def trainable_variables(self):
        """Trainable variables collected under this module's scope."""
        assert self.scope is not None, "need to call module once before getting variables"
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    @property
    def variables(self):
        """Global variables collected under this module's scope."""
        assert self.scope is not None, "need to call module once before getting variables"
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
|
||||||
|
|
||||||
|
|
||||||
|
def module(name):
    """Decorator factory turning a graph-building function into a `Module`.

    Usage: `@module("scope_name")` above `def f(*args)`. The decorated name is
    then a `Module` instance named `name` whose `_call` forwards to `f`.
    """
    # NOTE: the previous `@functools.wraps` directly on `wrapper` (with no
    # wrapped function given) replaced `wrapper` by a partial of
    # `update_wrapper`, so decorating returned the original function and
    # never built a Module.
    def wrapper(f):
        class WrapperModule(Module):
            def _call(self, *args):
                return f(*args)

        return WrapperModule(name)

    return wrapper
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Graph traversal
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level registry; cleared by reset().
VARIABLES = {}


def get_parents(node):
    """Return the inputs of the op that produced `node`."""
    producer = node.op
    return producer.inputs
|
||||||
|
|
||||||
|
|
||||||
|
def topsorted(outputs):
    """
    Topological sort via non-recursive depth-first search

    `outputs` must be a list or tuple of nodes. Parents (as reported by
    `get_parents`) are appended to the result before the node itself.
    Raises ValueError("not a dag") when a node is reached again while its
    own parents are still being visited.
    """
    assert isinstance(outputs, (list, tuple))
    marks = {}
    out = []
    stack = []  # pylint: disable=W0621
    # i: node
    # jidx = number of children visited so far from that node
    # marks: state of each node, which is one of
    #   0: haven't visited
    #   1: have visited, but not done visiting children
    #   2: done visiting children
    for x in outputs:
        stack.append((x, 0))
        while stack:
            (i, jidx) = stack.pop()
            if jidx == 0:
                # First time this stack entry is seen: check/advance the node state.
                m = marks.get(i, 0)
                if m == 0:
                    marks[i] = 1
                elif m == 1:
                    raise ValueError("not a dag")
                else:
                    # Already emitted via another path.
                    continue
            ps = get_parents(i)
            if jidx == len(ps):
                # All parents handled: the node can be emitted.
                marks[i] = 2
                out.append(i)
            else:
                # Come back to this node for its next parent, then descend.
                stack.append((i, jidx + 1))
                j = ps[jidx]
                stack.append((j, 0))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Flat vectors
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
def var_shape(x):
    """Return the static shape of `x` as a list of ints; every dimension must be known."""
    dims = x.get_shape().as_list()
    fully_known = all(isinstance(dim, int) for dim in dims)
    assert fully_known, \
        "shape function assumes that shape is fully known"
    return dims
|
||||||
|
|
||||||
|
|
||||||
|
def numel(x):
    """Number of elements of `x`, i.e. the product of its fully known static shape."""
    shape = var_shape(x)
    return intprod(shape)
|
||||||
|
|
||||||
|
|
||||||
|
def intprod(x):
    """Product of the entries of `x`, converted to a Python int."""
    product = np.prod(x)
    return int(product)
|
||||||
|
|
||||||
|
|
||||||
|
def flatgrad(loss, var_list, clip_norm=None):
    """Gradient of `loss` w.r.t. `var_list`, flattened and concatenated into one vector.

    Variables that `loss` does not depend on (gradient None) contribute zeros.
    When `clip_norm` is given, each existing gradient is clipped to that norm.
    """
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        # Missing gradients stay None here (clip_by_norm cannot take None);
        # they are replaced by zeros below.
        grads = [
            grad if grad is None else tf.clip_by_norm(grad, clip_norm=clip_norm)
            for grad in grads
        ]
    return tf.concat(axis=0, values=[
        tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)])
        for (v, grad) in zip(var_list, grads)
    ])
|
||||||
|
|
||||||
|
|
||||||
|
class SetFromFlat(object):
    """Assign a list of variables from one flat vector, sliced in list order."""

    def __init__(self, var_list, dtype=tf.float32):
        """Build the flat placeholder `theta` and one grouped assign op."""
        shapes = [var_shape(v) for v in var_list]
        total_size = np.sum([intprod(shape) for shape in shapes])

        self.theta = theta = tf.placeholder(dtype, [total_size])
        offset = 0
        assigns = []
        for v, shape in zip(var_list, shapes):
            size = intprod(shape)
            chunk = theta[offset:offset + size]
            assigns.append(tf.assign(v, tf.reshape(chunk, shape)))
            offset += size
        self.op = tf.group(*assigns)

    def __call__(self, theta):
        """Run the grouped assignment, feeding `theta` as the flat vector."""
        get_session().run(self.op, feed_dict={self.theta: theta})
|
||||||
|
|
||||||
|
|
||||||
|
class GetFlat(object):
    """Read a list of variables as one flat vector, concatenated in list order."""

    def __init__(self, var_list):
        flat_parts = [tf.reshape(v, [numel(v)]) for v in var_list]
        self.op = tf.concat(axis=0, values=flat_parts)

    def __call__(self):
        """Evaluate the concatenated vector in the default session."""
        return get_session().run(self.op)
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Misc
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def fancy_slice_2d(X, inds0, inds1):
    """
    like numpy X[inds0, inds1]
    XXX this implementation is bad

    Flattens X and gathers at the linear index `inds0 * ncols + inds1`.
    """
    rows = tf.cast(inds0, tf.int64)
    cols = tf.cast(inds1, tf.int64)
    dims = tf.cast(tf.shape(X), tf.int64)
    ncols = dims[1]
    flattened = tf.reshape(X, [-1])
    linear_index = rows * ncols + cols
    return tf.gather(flattened, linear_index)
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# Scopes
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def scope_vars(scope, trainable_only=False):
    """
    Collect the variables registered under a scope.

    Parameters
    ----------
    scope: str or VariableScope
        scope in which the variables reside; a non-string is resolved via its `.name`.
    trainable_only: bool
        whether or not to return only the variables that were marked as trainable.

    Returns
    -------
    vars: [tf.Variable]
        list of variables in `scope`.
    """
    if trainable_only:
        collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    else:
        collection_key = tf.GraphKeys.GLOBAL_VARIABLES
    scope_str = scope if isinstance(scope, str) else scope.name
    return tf.get_collection(collection_key, scope=scope_str)
|
||||||
|
|
||||||
|
|
||||||
|
def scope_name():
    """Name of the current variable scope as a string, e.g. deepq/q_func"""
    current = tf.get_variable_scope()
    return current.name
|
||||||
|
|
||||||
|
|
||||||
|
def absolute_scope_name(relative_scope_name):
    """Prefix `relative_scope_name` with the current scope name and a "/"."""
    parent = scope_name()
    return parent + "/" + relative_scope_name
|
||||||
|
|
||||||
|
|
||||||
|
def lengths_to_mask(lengths_b, max_length):
    """
    Turns a vector of lengths into a boolean mask

    Args:
        lengths_b: an integer vector of lengths (must be rank 1)
        max_length: maximum length to fill the mask

    Returns:
        a boolean array of shape (batch_size, max_length)
        row[i] consists of True repeated lengths_b[i] times, followed by False
    """
    lengths_b = tf.convert_to_tensor(lengths_b)
    assert lengths_b.get_shape().ndims == 1
    positions_1t = tf.expand_dims(tf.range(max_length), 0)
    lengths_b1 = tf.expand_dims(lengths_b, 1)
    # Broadcast comparison: position t is valid for row b iff t < lengths_b[b].
    return positions_1t < lengths_b1
|
||||||
|
|
||||||
|
|
||||||
|
def in_session(f):
    """Decorator running `f` inside a fresh `tf.Session` context.

    The wrapped function's return value is passed through (it was previously
    dropped, so decorated functions always returned None).
    """
    @functools.wraps(f)
    def newfunc(*args, **kwargs):
        with tf.Session():
            return f(*args, **kwargs)

    return newfunc
|
||||||
|
|
||||||
|
|
||||||
|
_PLACEHOLDER_CACHE = {}  # name -> (placeholder, dtype, shape)


def get_placeholder(name, dtype, shape):
    """Return the placeholder registered under `name`, creating it on first use.

    A repeated request must use the same `dtype` and `shape` as the first one.
    """
    if name not in _PLACEHOLDER_CACHE:
        created = tf.placeholder(dtype=dtype, shape=shape, name=name)
        _PLACEHOLDER_CACHE[name] = (created, dtype, shape)
        return created

    cached, cached_dtype, cached_shape = _PLACEHOLDER_CACHE[name]
    assert cached_dtype == dtype and cached_shape == shape
    return cached
|
||||||
|
|
||||||
|
|
||||||
|
def get_placeholder_cached(name):
    """Look up an already-created placeholder by name (KeyError when unknown)."""
    entry = _PLACEHOLDER_CACHE[name]
    return entry[0]
|
||||||
|
|
||||||
|
|
||||||
|
def flattenallbut0(x):
    """Reshape `x` to two dimensions, merging every axis after the first."""
    trailing_dims = x.get_shape().as_list()[1:]
    return tf.reshape(x, [-1, intprod(trailing_dims)])
|
||||||
|
|
||||||
|
|
||||||
|
def reset():
    """Clear the placeholder cache and VARIABLES registry, then reset the default graph."""
    global _PLACEHOLDER_CACHE, VARIABLES
    _PLACEHOLDER_CACHE = {}
    VARIABLES = {}
    tf.reset_default_graph()
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
class VecEnv(object):
    """
    Base class for vectorized environments; subclasses implement `step` and `reset`.
    """

    def step(self, vac):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Reset all environments
        """
        raise NotImplementedError()

    def close(self):
        """Release resources; the base implementation does nothing."""
        return None
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
import numpy as np
|
||||||
|
from multiprocessing import Process, Pipe
|
||||||
|
from src.common.vec_env import VecEnv
|
||||||
|
|
||||||
|
|
||||||
|
def worker(remote, env_fn_wrapper):
    """Serve one environment over `remote` until a 'close' command arrives.

    The environment is built by calling `env_fn_wrapper.x()`. Each message is
    a `(cmd, data)` pair:
      - 'step': step with `data`, reset the env when done, reply (ob, reward, done, info)
      - 'reset': reply with the initial observation
      - 'get_spaces': reply (action_space, observation_space)
      - 'close': close the connection and return
    Any other command raises NotImplementedError.
    """
    env = env_fn_wrapper.x()
    running = True
    while running:
        command, payload = remote.recv()
        if command == 'close':
            remote.close()
            running = False
        elif command == 'get_spaces':
            remote.send((env.action_space, env.observation_space))
        elif command == 'reset':
            remote.send(env.reset())
        elif command == 'step':
            observation, reward, done, info = env.step(payload)
            if done:
                # The finished episode's last observation is replaced by the first of the next.
                observation = env.reset()
            remote.send((observation, reward, done, info))
        else:
            raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class CloudpickleWrapper(object):
    """
    Holder that serializes its payload with cloudpickle when pickled
    (otherwise multiprocessing tries to use pickle)
    """

    def __init__(self, x):
        self.x = x

    def __getstate__(self):
        """Serialize the payload with cloudpickle."""
        from cloudpickle import dumps
        return dumps(self.x)

    def __setstate__(self, ob):
        """Restore the payload from the bytes produced by `__getstate__`."""
        from pickle import loads
        self.x = loads(ob)
|
||||||
|
|
||||||
|
|
||||||
|
class SubprocVecEnv(VecEnv):
    """Vectorized environment running each environment in its own subprocess."""

    def __init__(self, env_fns):
        """
        env_fns: list of callables, each creating one gym environment in a subprocess
        """
        nenvs = len(env_fns)
        pipes = [Pipe() for _ in range(nenvs)]
        self.remotes, self.work_remotes = zip(*pipes)
        self.ps = []
        for work_remote, env_fn in zip(self.work_remotes, env_fns):
            process = Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
            self.ps.append(process)
        for process in self.ps:
            process.start()

        # Spaces are queried from the first worker only.
        first_remote = self.remotes[0]
        first_remote.send(('get_spaces', None))
        self.action_space, self.observation_space = first_remote.recv()

    def step(self, actions):
        """Send one action to each worker; return stacked obs, rewards, dones and the infos tuple."""
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        replies = [remote.recv() for remote in self.remotes]
        obs, rews, dones, infos = zip(*replies)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        """Reset every worker's environment and return the stacked observations."""
        for remote in self.remotes:
            remote.send(('reset', None))
        first_observations = [remote.recv() for remote in self.remotes]
        return np.stack(first_observations)

    def close(self):
        """Ask every worker to stop, then wait for the processes to exit."""
        for remote in self.remotes:
            remote.send(('close', None))
        for process in self.ps:
            process.join()

    @property
    def num_envs(self):
        """Number of environments, one per parent-side pipe end."""
        return len(self.remotes)
|
||||||
@@ -0,0 +1,95 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
from baselines.baselines_common.mpi_running_mean_std import RunningMeanStd
|
||||||
|
from baselines.baselines_common.distributions import make_pdtype, DiagGaussianPdType, BernoulliPdType
|
||||||
|
|
||||||
|
|
||||||
|
def mlp_block(x, name, num_hid_layers, hid_size, activation_fn=tf.nn.tanh):
    """Stack `num_hid_layers` dense layers of width `hid_size`, each followed by
    `activation_fn`, under variable scope `name`. Layers are named fc1, fc2, ...
    """
    with tf.variable_scope(name_or_scope=name):
        hidden = x
        for layer_index in range(num_hid_layers):
            layer_name = "fc%i" % (layer_index + 1)
            hidden = U.dense(
                hidden, hid_size,
                name=layer_name, weight_init=U.normc_initializer(1.0))
            hidden = activation_fn(hidden)
        return hidden
|
||||||
|
|
||||||
|
|
||||||
|
def feature_net(x, name, num_hid_layers, hid_size, activation_fn=tf.nn.tanh):
    """Feature extractor: a single `mlp_block` named "mlp" nested under scope `name`."""
    with tf.variable_scope(name_or_scope=name):
        features = mlp_block(
            x, name="mlp",
            hid_size=hid_size, num_hid_layers=num_hid_layers, activation_fn=activation_fn)
    return features
|
||||||
|
|
||||||
|
|
||||||
|
class Actor(object):
    """Policy object holding both a value network ("vf") and a policy network
    ("pol") built on a shared, normalized observation placeholder.
    """

    def __init__(self, name, *args, **kwargs):
        """Build the networks under variable scope `name`; remaining arguments go to `_init`."""
        with tf.variable_scope(name):
            self._init(*args, **kwargs)
            # Remembered so the get_*variables methods can filter collections by scope.
            self.scope = tf.get_variable_scope().name

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, noise_type=None):
        """Create the observation filter, value head, policy head and the act function.

        ob_space / ac_space: spaces whose `.shape` is read here.
        hid_size / num_hid_layers: width and depth of both feature nets.
        gaussian_fixed_var: for a diagonal Gaussian policy, use a state-independent
            log-std variable instead of predicting it from the observation.
        noise_type: "gaussian" forces DiagGaussianPdType; otherwise the pdtype
            comes from make_pdtype(ac_space).
        """
        if noise_type == "gaussian":
            self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])
        else:
            self.pdtype = pdtype = make_pdtype(ac_space)

        # Cached by name, so the same "ob" placeholder is returned on repeated requests.
        ob = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        # NOTE(review): source indentation was lost; the two obz lines are placed
        # outside the "obfilter" scope -- confirm against the original file.
        obz = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(obz, -5.0, 5.0)

        # critic net (value network)
        last_out = feature_net(
            obz, name="vf",
            num_hid_layers=num_hid_layers, hid_size=hid_size,
            activation_fn=tf.nn.tanh)
        # [:, 0] drops the size-1 output axis so vpred is one value per batch row.
        self.vpred = U.dense(
            last_out, 1,
            name="vf_final", weight_init=U.normc_initializer(1.0))[:, 0]

        # actor net (policy network)
        last_out = feature_net(
            obz, name="pol",
            num_hid_layers=num_hid_layers, hid_size=hid_size,
            activation_fn=tf.nn.tanh)

        if gaussian_fixed_var and isinstance(self.pdtype, DiagGaussianPdType):
            # Half of the flat parameter vector is the mean, the other half the log-std.
            mean = U.dense(
                last_out, pdtype.param_shape()[0] // 2,
                name="pol_final", weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts the [1, k] log-std to the batch shape of `mean`.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(
                last_out, pdtype.param_shape()[0],
                name="pol_final", weight_init=U.normc_initializer(0.01))

        # pd - probability distribution
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Scalar switch between sampling from the distribution and taking its mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

    def act(self, stochastic, ob):
        """Return (action, value prediction) for a single observation `ob`."""
        # ob[None] adds a leading batch axis of size 1; [0] removes it again.
        ac1, vpred1 = self._act(stochastic, ob[None])
        return ac1[0], vpred1[0]

    def get_variables(self):
        """Global variables collected under this object's scope."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        """Trainable variables collected under this object's scope."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_initial_state(self):
        """Return an empty list."""
        return []
|
||||||
@@ -0,0 +1,171 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
from mpi4py import MPI
|
||||||
|
from collections import deque
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
from common.logger import Logger
|
||||||
|
from baselines.baselines_common import Dataset, explained_variance, fmt_row, zipsame
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
from baselines.baselines_common.mpi_adam import MpiAdam
|
||||||
|
from baselines.baselines_common.mpi_saver import MpiSaver
|
||||||
|
from baselines.baselines_common.mpi_moments import mpi_moments
|
||||||
|
|
||||||
|
from baselines.trajectories import traj_segment_generator, add_vtarg_and_adv
|
||||||
|
|
||||||
|
|
||||||
|
def learn(env, policy_func, args, *,
          timesteps_per_batch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          adam_epsilon=1e-5,
          schedule='constant'):  # accepted for compatibility; annealing is disabled below
    """Train a policy with clipped-surrogate PPO until the time budget runs out.

    Parameters
    ----------
    env
        Environment; its ``observation_space`` / ``action_space`` are passed to
        ``policy_func`` and the environment itself to ``traj_segment_generator``.
    policy_func
        Callable ``(name, ob_space, ac_space) -> policy``. It is called twice,
        for the trained policy ``"pi"`` and the reference copy ``"oldpi"``.
    args
        Namespace read for ``logdir``, ``restore_actor_from``, ``thread`` and
        ``max_train_days``. NOTE: ``args.logdir`` is rewritten in place to
        ``"<logdir>/thread_<thread>"`` before the logger is created.
    timesteps_per_batch
        Rollout length requested from the trajectory generator per iteration.
    clip_param, entcoeff
        PPO clipping epsilon and entropy coefficient.
    optim_epochs, optim_stepsize, optim_batchsize
        Passes over each rollout, Adam step size and minibatch size. A falsy
        ``optim_batchsize`` means "use the whole rollout as one batch".
    gamma, lam
        Discount and GAE lambda forwarded to ``add_vtarg_and_adv``.
    adam_epsilon
        Epsilon forwarded to ``MpiAdam``.
    schedule
        Unused: the learning-rate multiplier stays at 1.0 because the
        annealing code needs a ``max_timesteps`` this function does not get.

    The loop runs while less than ``86400 * args.max_train_days`` seconds have
    elapsed since the rollout generator was created. Nothing is returned.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    policy_var_list = [v for v in var_list if v.name.split("/")[0].startswith("pi")]
    saver = MpiSaver(policy_var_list, log_prefix=args.logdir)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(),
                                                            pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    saver.restore(restore_from=args.restore_actor_from)
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, args, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    cur_lrmult = 1.0  # constant: see the note on ``schedule`` in the docstring

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    logger = Logger(args.logdir)

    while time.time() - tstart < 86400 * args.max_train_days:
        seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # Standardize the advantages. The floor on the divisor keeps a
        # constant-advantage batch from turning into NaN/inf and poisoning
        # the parameters; for any non-degenerate batch the result is unchanged.
        atarg = (atarg - atarg.mean()) / max(atarg.std(), 1e-8)
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            epoch_losses = []  # per-minibatch loss values (kept for debugging)
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                epoch_losses.append(newlosses)

        saver.sync()

        # Re-evaluate the losses on the rollout after the update, for logging.
        eval_losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            eval_losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(eval_losses, axis=0)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # Logging
        logger.scalar_summary("episodes", len(lens), iters_so_far)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.scalar_summary(lossname, lossval, episodes_so_far)

        logger.scalar_summary("ev_tdlam_before", explained_variance(vpredbefore, tdlamret),
                              episodes_so_far)

        # Until the first episode finishes the rolling buffers are empty;
        # np.max of an empty sequence raises ValueError, so skip these stats.
        if rewbuffer:
            logger.scalar_summary("step", np.mean(lenbuffer), episodes_so_far)
            logger.scalar_summary("reward", np.mean(rewbuffer), episodes_so_far)
            logger.scalar_summary("best reward", np.max(rewbuffer), episodes_so_far)

        elapsed_time = time.time() - tstart

        logger.scalar_summary(
            "episode per minute",
            episodes_so_far / elapsed_time * 60,
            episodes_so_far)
        logger.scalar_summary(
            "step per second",
            timesteps_so_far / elapsed_time,
            episodes_so_far)
|
||||||
|
|
||||||
|
|
||||||
|
def flatten_lists(listoflists):
    """Concatenate the elements of every inner iterable into one flat list."""
    flat = []
    for inner in listoflists:
        flat.extend(inner)
    return flat
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from mpi4py import MPI
|
||||||
|
|
||||||
|
from common.misc_util import boolean_flag, str2params, create_if_need
|
||||||
|
from common.misc_util import set_global_seeds
|
||||||
|
from common.env_wrappers import create_env
|
||||||
|
|
||||||
|
from baselines.nets import Actor
|
||||||
|
from baselines import trpo, ppo
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line of the TRPO/PPO baseline trainer.

    Returns the ``argparse.Namespace`` produced by ``parse_args()``. Options
    are registered in the same order as before, so ``--help`` is unchanged.
    """
    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    def add_options(specs):
        # Register plain ``--flag <value>`` options from (flag, type, default) rows.
        for flag, kind, default in specs:
            cli.add_argument(flag, type=kind, default=default)

    cli.add_argument(
        '--agent', type=str, default="trpo", choices=["trpo", "ppo"],
        help='Which agent to use. (default: %(default)s)')

    add_options([
        ('--seed', int, 42),
        ('--difficulty', int, 2),
        ('--max-obstacles', int, 3),
        ('--logdir', str, "./logs"),
    ])

    boolean_flag(cli, "baseline-wrapper", default=False)

    add_options([
        ('--skip-frames', int, 1),
        ('--reward-scale', float, 1.),
        ('--fail-reward', float, 0.0),
        ('--hid-size', int, 64),
        ('--num-hid-layers', int, 2),
        ('--gamma', float, 0.96),
        ('--restore-args-from', str, None),
        ('--restore-actor-from', str, None),
        ('--max-train-days', int, int(1e1)),
    ])

    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def restore_params(args):
    """Overlay hyper-parameters saved in a JSON file onto ``args``.

    The file at ``args.restore_args_from`` is loaded as a JSON object and every
    entry is copied onto ``args`` with ``setattr``, except for the keys listed
    below, which always keep their current (command-line) value.

    Keys that are absent from the file are simply skipped, so an args file
    written by a different version of the script no longer raises ``KeyError``.

    Returns the same ``args`` object, mutated in place.
    """
    with open(args.restore_args_from, "r") as fin:
        params = json.load(fin)

    # Settings that describe *this* run and must not be taken from the file.
    protected_keys = (
        "seed",
        "difficulty",
        "max_obstacles",
        "skip_frames",
        "restore_args_from",
        "restore_actor_from",
    )
    for key in protected_keys:
        params.pop(key, None)

    for key, value in params.items():
        setattr(args, key, value)
    return args
|
||||||
|
|
||||||
|
|
||||||
|
def train(args):
    """Set up one MPI worker and run TRPO or PPO training on it.

    Steps, in order: enter a single-threaded TF session, optionally overlay
    saved hyper-parameters (``args.restore_args_from``), seed the worker with
    ``args.seed + 241 * rank``, create and seed the environment, dump the
    effective arguments to ``<logdir>/args.json`` on rank 0, then call
    ``trpo.learn`` or ``ppo.learn`` according to ``args.agent``.

    ``args.thread`` is set to the MPI rank before learning starts. The
    environment is closed on every exit path; a ``KeyboardInterrupt`` is
    swallowed after printing a message, any other exception propagates.

    Raises
    ------
    NotImplementedError
        If ``args.agent`` is neither ``"trpo"`` nor ``"ppo"``.
    """
    import baselines.baselines_common.tf_util as U

    # NOTE: the session is entered and deliberately left open for the
    # lifetime of the process, as in the original code.
    sess = U.single_threaded_session()
    sess.__enter__()

    if args.restore_args_from is not None:
        args = restore_params(args)

    rank = MPI.COMM_WORLD.Get_rank()

    workerseed = args.seed + 241 * rank
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):
        # Factory handed to the learners; they build "pi" and "oldpi" with it.
        return Actor(
            name=name,
            ob_space=ob_space, ac_space=ac_space,
            hid_size=args.hid_size, num_hid_layers=args.num_hid_layers,
            noise_type=args.noise_type)

    env = create_env(args)
    env.seed(workerseed)

    if rank == 0:
        create_if_need(args.logdir)
        with open("{}/args.json".format(args.logdir), "w") as fout:
            json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)

    try:
        args.thread = rank
        if args.agent == "trpo":
            trpo.learn(
                env, policy_fn, args,
                timesteps_per_batch=1024,
                gamma=args.gamma,
                lam=0.98,
                max_kl=0.01,
                cg_iters=10,
                cg_damping=0.1,
                vf_iters=5,
                vf_stepsize=1e-3)
        elif args.agent == "ppo":
            # optimal settings:
            # timesteps_per_batch = optim_epochs * optim_batchsize
            ppo.learn(
                env, policy_fn, args,
                timesteps_per_batch=256,
                gamma=args.gamma,
                lam=0.95,
                clip_param=0.2,
                entcoeff=0.0,
                optim_epochs=4,
                optim_stepsize=3e-4,
                optim_batchsize=64,
                schedule='constant')
        else:
            raise NotImplementedError
    except KeyboardInterrupt:
        print("closing envs...")
    finally:
        # Release the simulator even when training dies with an error.
        env.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: parse the CLI, pin the noise type that ``train``
    # reads as ``args.noise_type`` (it has no command-line option), and train.
    args = parse_args()
    setattr(args, "noise_type", "gaussian")
    train(args)
|
||||||
@@ -0,0 +1,76 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def traj_segment_generator(pi, env, args, horizon, stochastic):
    """Endlessly roll out ``pi`` in ``env`` and yield fixed-length segments.

    Every ``horizon`` steps a dict is yielded with the keys ``ob``, ``rew``,
    ``vpred``, ``new``, ``ac``, ``prevac`` (arrays of length ``horizon``),
    ``nextvpred`` (value prediction for the step after the segment, zeroed
    when that step starts a new episode) and ``ep_rets`` / ``ep_lens``
    (returns and lengths of the episodes that finished inside the segment).

    The arrays are reused between segments, so a consumer that keeps several
    segments around must copy them. The environment is reset with
    ``difficulty=args.difficulty``.
    """
    # Initialize state variables
    t = 0
    ac = env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode
    ob = env.reset(difficulty=args.difficulty)

    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode
    ep_rets = []  # returns of completed episodes in this segment
    ep_lens = []  # lengths of completed episodes in this segment

    # Initialize history arrays (templates only fix shape and dtype)
    obs = np.array([ob] * horizon)
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac] * horizon)
    prevacs = acs.copy()

    while True:
        prevac = ac
        ac, vpred = pi.act(stochastic, ob)
        # The value at time T is needed before segment [0, T-1] can be
        # returned, which is why the yield happens after acting.
        segment_complete = t > 0 and t % horizon == 0
        if segment_complete:
            yield dict(ob=obs, rew=rews, vpred=vpreds, new=news,
                       ac=acs, prevac=prevacs, nextvpred=vpred * (1 - new),
                       ep_rets=ep_rets, ep_lens=ep_lens)
            # @TODO: TRPO & PPO implementation diff
            # _, vpred = pi.act(stochastic, ob)  # @TODO: uncomment??? IMPORTANT!!
            # Fresh lists: the ones just yielded now belong to the consumer.
            ep_rets, ep_lens = [], []

        slot = t % horizon
        obs[slot] = ob
        vpreds[slot] = vpred
        news[slot] = new
        acs[slot] = ac
        prevacs[slot] = prevac

        ob, rew, new, _ = env.step(ac)
        rews[slot] = rew

        cur_ep_ret += rew
        cur_ep_len += 1
        if new:
            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret = 0
            cur_ep_len = 0
            ob = env.reset(difficulty=args.difficulty)
        t += 1
|
||||||
|
|
||||||
|
|
||||||
|
def add_vtarg_and_adv(seg, gamma, lam):
    """Add GAE(lambda) advantages and TD(lambda) value targets to ``seg``.

    Reads ``seg["new"]``, ``seg["vpred"]``, ``seg["nextvpred"]`` and
    ``seg["rew"]``; writes ``seg["adv"]`` (float32, one entry per reward) and
    ``seg["tdlamret"] = seg["adv"] + seg["vpred"]``. Returns nothing.
    """
    # Pad by one step so index t + 1 is valid at the end of the segment.
    # The padded ``new`` flag is 0 because ``nextvpred`` was already zeroed
    # by the generator if the following step starts a new episode.
    episode_starts = np.append(seg["new"], 0)
    values = np.append(seg["vpred"], seg["nextvpred"])
    n_steps = len(seg["rew"])
    advantages = np.empty(n_steps, 'float32')
    seg["adv"] = advantages
    rewards = seg["rew"]

    running_adv = 0
    for step in range(n_steps - 1, -1, -1):
        not_done = 1 - episode_starts[step + 1]
        td_error = rewards[step] + gamma * values[step + 1] * not_done - values[step]
        running_adv = td_error + gamma * lam * not_done * running_adv
        advantages[step] = running_adv

    seg["tdlamret"] = seg["adv"] + seg["vpred"]
|
||||||
@@ -0,0 +1,243 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
from mpi4py import MPI
|
||||||
|
from collections import deque
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
from common.logger import Logger
|
||||||
|
|
||||||
|
from baselines.baselines_common import explained_variance, zipsame, dataset
|
||||||
|
import baselines.baselines_common.tf_util as U
|
||||||
|
from baselines.baselines_common import colorize
|
||||||
|
from baselines.baselines_common.mpi_adam import MpiAdam
|
||||||
|
from baselines.baselines_common.mpi_saver import MpiSaver
|
||||||
|
from baselines.baselines_common.cg import cg
|
||||||
|
|
||||||
|
from baselines.trajectories import traj_segment_generator, add_vtarg_and_adv
|
||||||
|
|
||||||
|
|
||||||
|
def learn(env, policy_func, args, *,
          timesteps_per_batch,  # what to train on
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3):
    """Train a policy with MPI-parallel TRPO until the time budget runs out.

    Parameters
    ----------
    env
        Environment; its ``observation_space`` / ``action_space`` are passed to
        ``policy_func`` and the environment itself to ``traj_segment_generator``.
    policy_func
        Callable ``(name, ob_space, ac_space) -> policy``, called for ``"pi"``
        and ``"oldpi"``.
    args
        Namespace read for ``logdir``, ``restore_actor_from``, ``thread`` and
        ``max_train_days``. NOTE: ``args.logdir`` is rewritten in place to
        ``"<logdir>/thread_<thread>"`` before the logger is created.
    timesteps_per_batch
        Rollout length requested from the trajectory generator per iteration.
    max_kl
        KL budget of one policy step; a candidate step is rejected when the
        measured KL exceeds ``1.5 * max_kl``.
    cg_iters, cg_damping
        Conjugate-gradient iterations and the damping added to the
        Fisher-vector product.
    gamma, lam
        Discount and GAE lambda forwarded to ``add_vtarg_and_adv``.
    entcoeff
        Entropy bonus coefficient.
    vf_stepsize, vf_iters
        Adam step size and number of passes for the value-function fit.

    Policy variables are those whose second name component starts with
    ``"pol"``, value-function variables those starting with ``"vf"``.
    The loop runs while less than ``86400 * args.max_train_days`` seconds have
    elapsed. Nothing is returned.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    policy_var_list = [v for v in all_var_list if v.name.split("/")[0].startswith("pi")]
    saver = MpiSaver(policy_var_list, log_prefix=args.logdir)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    # Slice the flat tangent vector back into one tensor per policy variable.
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in
                    zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in
                 zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Print the wall-clock time of the wrapped block, on rank 0 only.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        # Element-wise mean of ``x`` across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    saver.restore(restore_from=args.restore_actor_from)
    # Start every worker from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, args, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    logger = Logger(args.logdir)

    while time.time() - tstart < 86400 * args.max_train_days:
        meanlosses = [0] * len(loss_names)
        with timed("sampling"):
            seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # Standardize the advantages; the floor on the divisor avoids NaN/inf
        # when every advantage in the batch is identical.
        atarg = (atarg - atarg.mean()) / max(atarg.std(), 1e-8)

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        # Feed the *standardized* advantages to the policy step. The previous
        # code passed the raw seg["adv"], leaving the standardization above
        # without any effect.
        segargs = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in segargs]  # every 5th sample for the FVP

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*segargs)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            # Zero gradient: nothing to update this iteration.
            pass
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            fullstep = stepdir / lm
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            # Backtracking line search: halve the step until the losses are
            # finite, the KL constraint holds and the surrogate did not get
            # worse. Without the ``break`` the loop always ran to exhaustion
            # and the ``else`` branch reverted every update.
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*segargs)))
                improve = surr - surrbefore
                step_ok = (np.isfinite(meanlosses).all()
                           and kl <= max_kl * 1.5
                           and improve >= 0)
                if step_ok:
                    break
                stepsize *= .5
            else:
                # No acceptable step found: keep the previous parameters.
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                # Sanity check that all workers still hold identical parameters.
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        saver.sync()

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # Logging
        logger.scalar_summary("episodes", len(lens), iters_so_far)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.scalar_summary(lossname, lossval, episodes_so_far)

        logger.scalar_summary("ev_tdlam_before", explained_variance(vpredbefore, tdlamret),
                              episodes_so_far)

        # Until the first episode finishes the rolling buffers are empty;
        # np.max of an empty sequence raises ValueError, so skip these stats.
        if rewbuffer:
            logger.scalar_summary("step", np.mean(lenbuffer), episodes_so_far)
            logger.scalar_summary("reward", np.mean(rewbuffer), episodes_so_far)
            logger.scalar_summary("best reward", np.max(rewbuffer), episodes_so_far)

        elapsed_time = time.time() - tstart

        logger.scalar_summary(
            "episode per minute",
            episodes_so_far / elapsed_time * 60,
            episodes_so_far)
        logger.scalar_summary(
            "step per second",
            timesteps_so_far / elapsed_time,
            episodes_so_far)
|
||||||
|
|
||||||
|
|
||||||
|
def flatten_lists(listoflists):
    """Return one list holding the items of each inner iterable, in order."""
    merged = []
    for inner in listoflists:
        for item in inner:
            merged.append(item)
    return merged
|
||||||
@@ -0,0 +1,215 @@
|
|||||||
|
import random
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from common.segment_tree import SumSegmentTree, MinSegmentTree
|
||||||
|
|
||||||
|
|
||||||
|
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions with uniform random sampling."""

    def __init__(self, size):
        """Create an empty replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0  # slot the next transition will be written to

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest one once full."""
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Stack the transitions at ``idxes`` into five arrays.

        Returns ``(obses_t, actions, rewards, obses_tp1, dones)``, each with
        one leading entry per index.
        """
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            # np.asarray converts without copying when possible. The former
            # ``np.array(x, copy=False)`` raises under NumPy 2 whenever a copy
            # is unavoidable, e.g. for observations stored as Python lists.
            obses_t.append(np.asarray(obs_t))
            actions.append(np.asarray(action))
            rewards.append(reward)
            obses_tp1.append(np.asarray(obs_tp1))
            dones.append(done)
        return np.array(obses_t), \
            np.array(actions), \
            np.array(rewards), \
            np.array(obses_tp1), \
            np.array(dones)

    def sample(self, batch_size):
        """Sample a batch of experiences uniformly, with replacement.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)
|
||||||
|
|
||||||
|
|
||||||
|
class PrioritizedReplayBuffer(ReplayBuffer):
|
||||||
|
def __init__(self, size, alpha=0.5):
|
||||||
|
"""Create Prioritized Replay buffer.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
size: int
|
||||||
|
Max number of transitions to store in the buffer. When the buffer
|
||||||
|
overflows the old memories are dropped.
|
||||||
|
alpha: float
|
||||||
|
how much prioritization is used
|
||||||
|
(0 - no prioritization, 1 - full prioritization)
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
ReplayBuffer.__init__
|
||||||
|
"""
|
||||||
|
super(PrioritizedReplayBuffer, self).__init__(size)
|
||||||
|
assert alpha > 0
|
||||||
|
self._alpha = alpha
|
||||||
|
|
||||||
|
it_capacity = 1
|
||||||
|
while it_capacity < size:
|
||||||
|
it_capacity *= 2
|
||||||
|
|
||||||
|
self._it_sum = SumSegmentTree(it_capacity)
|
||||||
|
self._it_min = MinSegmentTree(it_capacity)
|
||||||
|
self._max_priority = 1.0
|
||||||
|
|
||||||
|
def add(self, *args, **kwargs):
|
||||||
|
"""See ReplayBuffer.store_effect"""
|
||||||
|
idx = self._next_idx
|
||||||
|
super().add(*args, **kwargs)
|
||||||
|
self._it_sum[idx] = self._max_priority ** self._alpha
|
||||||
|
self._it_min[idx] = self._max_priority ** self._alpha
|
||||||
|
|
||||||
|
def _sample_proportional(self, batch_size):
|
||||||
|
res = []
|
||||||
|
for _ in range(batch_size):
|
||||||
|
# TODO(szymon): should we ensure no repeats?
|
||||||
|
mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
|
||||||
|
idx = self._it_sum.find_prefixsum_idx(mass)
|
||||||
|
res.append(idx)
|
||||||
|
return res
|
||||||
|
|
||||||
|
def sample(self, batch_size, beta=0.5):
|
||||||
|
"""Sample a batch of experiences.
|
||||||
|
|
||||||
|
compared to ReplayBuffer.sample
|
||||||
|
it also returns importance weights and idxes
|
||||||
|
of sampled experiences.
|
||||||
|
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
batch_size: int
|
||||||
|
How many transitions to sample.
|
||||||
|
beta: float
|
||||||
|
To what degree to use importance weights
|
||||||
|
(0 - no corrections, 1 - full correction)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
obs_batch: np.array
|
||||||
|
batch of observations
|
||||||
|
act_batch: np.array
|
||||||
|
batch of actions executed given obs_batch
|
||||||
|
rew_batch: np.array
|
||||||
|
rewards received as results of executing act_batch
|
||||||
|
next_obs_batch: np.array
|
||||||
|
next set of observations seen after executing act_batch
|
||||||
|
done_mask: np.array
|
||||||
|
done_mask[i] = 1 if executing act_batch[i] resulted in
|
||||||
|
the end of an episode and 0 otherwise.
|
||||||
|
weights: np.array
|
||||||
|
Array of shape (batch_size,) and dtype np.float32
|
||||||
|
denoting importance weight of each sampled transition
|
||||||
|
idxes: np.array
|
||||||
|
Array of shape (batch_size,) and dtype np.int32
|
||||||
|
idexes in buffer of sampled experiences
|
||||||
|
"""
|
||||||
|
assert beta > 0
|
||||||
|
|
||||||
|
idxes = self._sample_proportional(batch_size)
|
||||||
|
|
||||||
|
weights = []
|
||||||
|
p_min = self._it_min.min() / self._it_sum.sum()
|
||||||
|
max_weight = (p_min * len(self._storage)) ** (-beta)
|
||||||
|
|
||||||
|
for idx in idxes:
|
||||||
|
p_sample = self._it_sum[idx] / self._it_sum.sum()
|
||||||
|
weight = (p_sample * len(self._storage)) ** (-beta)
|
||||||
|
weights.append(weight / max_weight)
|
||||||
|
weights = np.array(weights)
|
||||||
|
encoded_sample = self._encode_sample(idxes)
|
||||||
|
return tuple(list(encoded_sample) + [weights, idxes])
|
||||||
|
|
||||||
|
def update_priorities(self, idxes, priorities):
    """Update priorities of sampled transitions.

    Sets the priority of the transition at index idxes[i] in the buffer
    to priorities[i].

    Parameters
    ----------
    idxes: [int]
        List of idxes of sampled transitions
    priorities: [float]
        List of updated priorities corresponding to
        transitions at the sampled idxes denoted by
        variable `idxes`.
    """
    assert len(idxes) == len(priorities)
    for idx, priority in zip(idxes, priorities):
        assert priority > 0
        assert 0 <= idx < len(self._storage)

        # Both trees store the same alpha-scaled priority; compute it once.
        scaled_priority = priority ** self._alpha
        self._it_sum[idx] = scaled_priority
        self._it_min[idx] = scaled_priority

        # The running maximum tracks the raw (unscaled) priority.
        self._max_priority = max(self._max_priority, priority)
|
||||||
|
|
||||||
|
# Registry mapping a buffer-type name to the class implementing it.
buffers = {
    "simple": ReplayBuffer,
    "prioritized": PrioritizedReplayBuffer,
}
|
||||||
|
|
||||||
|
|
||||||
|
def buffer_generator(buffer, batch_size=32):
    """Coroutine that stores each transition sent to it and yields a fresh batch.

    The first ``next()`` yields ``None``. Every later ``send`` must pass a
    5-tuple ``(observation, action, reward, next_observation, done)``; the
    tuple is added to `buffer` and the result of
    ``buffer.sample(batch_size=batch_size)`` is yielded back.

    Parameters
    ----------
    buffer: object
        Anything exposing ``add(obs, act, rew, next_obs, done)`` and
        ``sample(batch_size=...)``.
    batch_size: int
        Batch size requested from the buffer after every insertion.
    """
    batch = None
    while True:
        transition = yield batch
        observation, action, reward, next_observation, done = transition
        buffer.add(observation, action, reward, next_observation, done)
        batch = buffer.sample(batch_size=batch_size)
|
||||||
|
|
||||||
|
|
||||||
|
def create_buffer(args):
    """Build the replay buffer selected by the parsed arguments.

    Parameters
    ----------
    args: namespace
        Must provide ``buffer_size`` and ``prioritized_replay``; when the
        latter is truthy ``prioritized_replay_alpha`` is read as well.

    Returns
    -------
    A PrioritizedReplayBuffer when ``args.prioritized_replay`` is truthy,
    otherwise a plain ReplayBuffer.
    """
    if args.prioritized_replay:
        return PrioritizedReplayBuffer(
            args.buffer_size, alpha=args.prioritized_replay_alpha)
    return ReplayBuffer(args.buffer_size)
|
||||||
@@ -0,0 +1,95 @@
|
|||||||
|
import numpy as np
|
||||||
|
import gym
|
||||||
|
from gym.spaces import Box
|
||||||
|
from osim.env import RunEnv
|
||||||
|
|
||||||
|
from common.state_transform import StateVelCentr
|
||||||
|
|
||||||
|
|
||||||
|
class DdpgWrapper(gym.Wrapper):
    """Environment wrapper used by the DDPG agents.

    It transforms raw observations with StateVelCentr, repeats every action
    for ``args.skip_frames`` environment steps, adds ``args.fail_reward`` when
    an episode ends early, scales the accumulated reward by
    ``args.reward_scale`` and maps agent actions from [-1, 1] to [0, 1].
    """

    # An episode that terminates before this many env steps gets the fail reward.
    MAX_EPISODE_STEPS = 1000

    # [-1, 1] <-> [0, 1]
    ACTION_MEAN = .5
    ACTION_STD = .5

    def __init__(self, env, args):
        gym.Wrapper.__init__(self, env)
        self.state_transform = StateVelCentr(
            obstacles_mode='standard',
            exclude_centr=True,
            vel_states=[])
        self.observation_space = Box(-1000, 1000, self.state_transform.state_size)
        self.skip_frames = args.skip_frames
        self.reward_scale = args.reward_scale
        self.fail_reward = args.fail_reward
        # Defined here so the attribute exists even before the first reset.
        self.env_step = 0

    def normalize_action(self, x):
        """Map an environment action in [0, 1] to the agent range [-1, 1]."""
        return (x - self.ACTION_MEAN) / self.ACTION_STD

    def denormalise_action(self, x):
        """Map an agent action in [-1, 1] to the environment range [0, 1]."""
        return x * self.ACTION_STD + self.ACTION_MEAN

    def reset(self, **kwargs):
        return self._reset(**kwargs)

    def _reset(self, **kwargs):
        """Reset the env and the state transform; return the first observation."""
        observation = self.env.reset(**kwargs)
        self.env_step = 0
        self.state_transform.reset()
        observation, _ = self.state_transform.process(observation)
        observation = self.observation(observation)
        return observation

    def step(self, action):
        # Explicit public entry point, mirroring reset/_reset and
        # observation/_observation, so frame skipping does not depend on the
        # base class dispatching step() to _step().
        return self._step(action)

    def _step(self, action):
        """Repeat `action` for up to `skip_frames` env steps.

        Returns
        -------
        (observation, total_reward, done, None) where total_reward is the sum
        of env rewards and state-transform rewards over the repeated frames
        (plus the fail reward on early termination), times reward_scale.
        """
        action = self.denormalise_action(action)
        total_reward = 0.
        for _ in range(self.skip_frames):
            observation, reward, done, _ = self.env.step(action)
            observation, obst_rew = self.state_transform.process(observation)
            total_reward += reward + obst_rew
            self.env_step += 1
            if done:
                if self.env_step < self.MAX_EPISODE_STEPS:
                    total_reward += self.fail_reward
                break

        observation = self.observation(observation)
        total_reward *= self.reward_scale
        # NOTE(review): info is returned as None, as in the original code.
        return observation, total_reward, done, None

    def observation(self, observation):
        return self._observation(observation)

    def _observation(self, observation):
        """Convert an observation to a float32 numpy array."""
        observation = np.array(observation, dtype=np.float32)
        return observation
|
||||||
|
|
||||||
|
|
||||||
|
def create_env(args):
    """Create a RunEnv, optionally wrapped in DdpgWrapper.

    Parameters
    ----------
    args: namespace
        Must provide ``max_obstacles``. If it has a ``baseline_wrapper`` or
        ``ddpg_wrapper`` attribute the environment is wrapped, in which case
        the attributes read by DdpgWrapper are required too.
    """
    env = RunEnv(visualize=False, max_obstacles=args.max_obstacles)

    # NOTE(review): this tests for the *presence* of the attribute, not its
    # value, so a flag that is defined but False still enables the wrapper.
    # Confirm against the argument parsers before changing it.
    if hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper"):
        env = DdpgWrapper(env, args)

    return env
|
||||||
|
|
||||||
|
|
||||||
|
def create_observation_handler(args):
    """Return a function converting raw observations into agent inputs.

    When `args` has a ``baseline_wrapper`` or ``ddpg_wrapper`` attribute the
    handler casts the observation to float32 and runs it through a
    StateVelCentr transform; otherwise it only casts to float32.

    The returned callable has the signature
    ``observation_handler(observation, previous_action=None)``;
    ``previous_action`` is accepted but not used.
    """
    # NOTE(review): attribute presence, not its value, selects the branch.
    if hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper"):
        state_transform = StateVelCentr(
            obstacles_mode='standard',
            exclude_centr=True,
            vel_states=[])

        def observation_handler(observation, previous_action=None):
            observation = np.array(observation, dtype=np.float32)
            observation, _ = state_transform.process(observation)
            return observation
    else:
        def observation_handler(observation, previous_action=None):
            observation = np.array(observation, dtype=np.float32)
            return observation

    return observation_handler
|
||||||
|
|
||||||
|
|
||||||
|
def create_action_handler(args):
    """Return a function mapping agent actions from [-1, 1] to [0, 1].

    `args` is accepted for symmetry with the other factory functions and is
    not read.
    """
    action_mean = .5
    action_std = .5

    def action_handler(x):
        return x * action_std + action_mean

    return action_handler
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
def create_linear_decay_fn(initial_value, final_value, max_step):
    """Build a schedule interpolating linearly between two values.

    Parameters
    ----------
    initial_value: float
        Value returned at step 0.
    final_value: float
        Value returned at `max_step` and for every later step.
    max_step: int
        Number of steps over which the interpolation happens.

    Returns
    -------
    decay_fn: callable
        ``decay_fn(step)`` giving the scheduled value for `step`.
    """
    def decay_fn(step):
        relative = 1. - step / max_step
        # Clamp so the schedule holds its end points instead of extrapolating
        # past final_value (and possibly below zero) once step > max_step.
        relative = min(1., max(0., relative))
        return initial_value * relative + final_value * (1. - relative)

    return decay_fn
|
||||||
|
|
||||||
|
|
||||||
|
def create_cycle_decay_fn(initial_value, final_value, cycle_len, num_cycles):
    """Build a cosine-cycled schedule with a linearly shrinking amplitude.

    Within each cycle of `cycle_len` steps the value follows half a cosine
    wave from the top of the current envelope down to `final_value`. The
    envelope itself shrinks linearly over ``cycle_len * num_cycles`` steps.

    Parameters
    ----------
    initial_value: float
        Value returned at step 0.
    final_value: float
        Lower bound of the schedule.
    cycle_len: int
        Number of steps in one cosine cycle.
    num_cycles: int
        Number of cycles over which the envelope decays.

    Returns
    -------
    decay_fn: callable
        ``decay_fn(step)`` giving the scheduled value for `step`.
    """
    max_step = cycle_len * num_cycles

    def decay_fn(step):
        relative = 1. - step / max_step
        # Clamp the envelope so the value never drops below final_value once
        # step exceeds the scheduled horizon.
        relative = min(1., max(0., relative))
        relative_cosine = 0.5 * (np.cos(np.pi * np.mod(step, cycle_len) / cycle_len) + 1.0)
        return relative_cosine * (initial_value - final_value) * relative + final_value

    return decay_fn
|
||||||
|
|
||||||
|
|
||||||
|
def create_decay_fn(decay_type, **kwargs):
    """Create a decay schedule by name.

    Parameters
    ----------
    decay_type: str
        ``"linear"`` or ``"cycle"``.
    **kwargs:
        Forwarded unchanged to the matching factory function.

    Raises
    ------
    NotImplementedError
        If `decay_type` is not one of the supported names.
    """
    if decay_type == "linear":
        return create_linear_decay_fn(**kwargs)
    if decay_type == "cycle":
        return create_cycle_decay_fn(**kwargs)
    raise NotImplementedError(
        "unknown decay type: {!r}".format(decay_type))
|
||||||
|
|
||||||
|
|
||||||
|
class QuadricLinearLoss(nn.Module):
    """Weighted loss that is quadratic for small errors and linear for large ones.

    For an absolute error ``e`` and threshold ``clip_delta`` the per-element
    loss is ``0.5 * min(e, clip_delta) ** 2 + clip_delta * max(e - clip_delta, 0)``.
    """

    def __init__(self, clip_delta):
        super(QuadricLinearLoss, self).__init__()
        self.clip_delta = clip_delta

    def forward(self, y_pred, y_true, weights=None):
        """Return the mean of the (optionally weighted) per-element loss.

        Parameters
        ----------
        y_pred, y_true: tensors
            Predictions and targets.
        weights: tensor or None
            Per-element multipliers applied before averaging. ``None`` means
            unweighted, which lets the module be called with two arguments.
        """
        td_error = y_true - y_pred
        td_error_abs = torch.abs(td_error)
        # Part of the error inside the quadratic region; the rest is linear.
        quadratic_part = torch.clamp(td_error_abs, max=self.clip_delta)
        linear_part = td_error_abs - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        if weights is not None:
            loss = loss * weights
        return torch.mean(loss)
|
||||||
|
|
||||||
|
# Registry mapping a loss-type name to the class implementing it.
losses = {
    "mse": nn.MSELoss,
    "quadric-linear": QuadricLinearLoss,
}
|
||||||
|
|
||||||
|
|
||||||
|
def create_loss(args):
    """Instantiate the loss selected by ``args.loss_type``.

    Parameters
    ----------
    args: namespace
        Must provide ``loss_type`` (``"mse"`` or ``"quadric-linear"``); for
        the latter ``clip_delta`` is read as well.

    Raises
    ------
    NotImplementedError
        If ``args.loss_type`` is not a supported name.
    """
    if args.loss_type == "mse":
        return nn.MSELoss()
    if args.loss_type == "quadric-linear":
        return QuadricLinearLoss(clip_delta=args.clip_delta)
    raise NotImplementedError(
        "unknown loss type: {!r}".format(args.loss_type))
|
||||||
@@ -0,0 +1,88 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def create_if_need(path):
    """Create directory `path` (and missing parents) unless it already exists."""
    if not os.path.exists(path):
        # exist_ok guards against another process creating the directory
        # between the existence check and this call.
        os.makedirs(path, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def boolean_flag(parser, name, default=False, help=None):
    """Add a boolean flag to argparse parser.

    Parameters
    ----------
    parser: argparse.Parser
        parser to add the flag to
    name: str
        --<name> will enable the flag, while --no-<name> will disable it
    default: bool or None
        default value of the flag
    help: str
        help string for the flag
    """
    # Dashes are not valid in attribute names, so store under underscores.
    dest = name.replace('-', '_')
    parser.add_argument(
        "--" + name, action="store_true", default=default, dest=dest, help=help)
    parser.add_argument("--no-" + name, action="store_false", dest=dest)
|
||||||
|
|
||||||
|
|
||||||
|
def str2params(string, delimeter="-"):
    """Parse a delimited string of integers such as ``"64-32"``.

    Parameters
    ----------
    string: str
        Text to parse.
    delimeter: str
        Separator between the integers.

    Returns
    -------
    list of int, or None when `string` cannot be parsed (for example it is
    not a string or one of the pieces is not an integer).
    """
    try:
        return [int(piece) for piece in string.split(delimeter)]
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures are swallowed; things like KeyboardInterrupt
        # are no longer hidden by a bare except.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def set_global_seeds(i):
    """Seed every random number generator the project may use.

    Seeds torch and tensorflow when they can be imported, and always seeds
    numpy and the standard-library ``random`` module.

    Parameters
    ----------
    i: int
        Seed value.
    """
    try:
        import torch
    except ImportError:
        pass
    else:
        torch.manual_seed(i)

    try:
        import tensorflow as tf
    except ImportError:
        pass
    else:
        # TensorFlow 1.x exposes tf.set_random_seed; TensorFlow 2.x replaced
        # it with tf.random.set_seed.
        seed_fn = getattr(tf, "set_random_seed", None)
        if seed_fn is None:
            seed_fn = tf.random.set_seed
        seed_fn(i)

    np.random.seed(i)
    random.seed(i)
|
||||||
|
|
||||||
|
|
||||||
|
def query_yes_no(question, default="no"):
    """Ask a yes/no question via input() and return their answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
        It must be "yes", "no" (the default) or None (meaning
        an answer is required of the user).

    The "answer" return value is True for "yes" or False for "no".

    Raises ValueError when "default" is none of the accepted values.
    """
    valid = {
        "yes": True, "y": True, "ye": True,
        "no": False, "n": False
    }
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    # Keep asking until the reply is recognised.
    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()
        if default is not None and choice == '':
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' "
                             "(or 'y' or 'n').\n")
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
class LayerNorm(nn.Module):
    """Normalise the last dimension of the input, then scale and shift it.

    Parameters
    ----------
    features: int
        Size of the last dimension of the input.
    eps: float
        Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # Learnable per-feature scale (gamma) and shift (beta).
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
import math
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch.nn.parameter import Parameter
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch.nn.modules.module import Module
|
||||||
|
from torch.autograd import Variable
|
||||||
|
|
||||||
|
|
||||||
|
class NoisyLinear(Module):
    r"""Applies a noisy linear transformation to the incoming data:
    :math:`y = (mu_w + sigma_w \cdot epsilon_w)x + mu_b + sigma_b \cdot epsilon_b`
    More details can be found in the paper `Noisy Networks for Exploration` _ .

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer will not learn an additive bias. Default: True
        factorised: whether or not to use factorised noise. Default: True
        std_init: initialization constant for standard deviation component of weights. If None,
            defaults to 0.017 for independent and 0.4 for factorised. Default: None

    Shape:
        - Input: :math:`(N, in\_features)`
        - Output: :math:`(N, out\_features)`

    Attributes:
        weight_mu, weight_sigma: learnable mean and noise scale of the weights,
            each of shape (out_features x in_features)
        bias_mu, bias_sigma: learnable mean and noise scale of the bias, each
            of shape (out_features); both are None when ``bias=False``

    Examples::
        >>> m = NoisyLinear(20, 30)
        >>> input = autograd.Variable(torch.randn(128, 20))
        >>> output = m(input)
        >>> print(output.size())
    """

    def __init__(self, in_features, out_features, bias=True, factorised=True, std_init=None):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.factorised = factorised
        self.weight_mu = Parameter(torch.Tensor(out_features, in_features))
        self.weight_sigma = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias_mu = Parameter(torch.Tensor(out_features))
            self.bias_sigma = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
            # forward() reads bias_mu / bias_sigma, so they must exist (as
            # None) even when the layer has no bias.
            self.register_parameter('bias_mu', None)
            self.register_parameter('bias_sigma', None)
        if not std_init:
            if self.factorised:
                self.std_init = 0.4
            else:
                self.std_init = 0.017
        else:
            self.std_init = std_init
        self.reset_parameters(bias)

    def reset_parameters(self, bias):
        """Initialise the mean and sigma parameters; `bias` says whether bias terms exist."""
        if self.factorised:
            mu_range = 1. / math.sqrt(self.weight_mu.size(1))
            self.weight_mu.data.uniform_(-mu_range, mu_range)
            self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
            if bias:
                self.bias_mu.data.uniform_(-mu_range, mu_range)
                self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
        else:
            mu_range = math.sqrt(3. / self.weight_mu.size(1))
            self.weight_mu.data.uniform_(-mu_range, mu_range)
            self.weight_sigma.data.fill_(self.std_init)
            if bias:
                self.bias_mu.data.uniform_(-mu_range, mu_range)
                self.bias_sigma.data.fill_(self.std_init)

    def _normal_noise(self, *size):
        """Draw standard-normal noise with the same type and device as the weights."""
        return self.weight_mu.data.new(*size).normal_()

    def scale_noise(self, size):
        """Return `size` samples of sign(x) * sqrt(|x|) with x standard normal."""
        x = self._normal_noise(size)
        x = x.sign().mul(x.abs().sqrt())
        return x

    def forward(self, input):
        has_bias = self.bias_mu is not None
        bias_epsilon = None
        if self.factorised:
            epsilon_in = self.scale_noise(self.in_features)
            epsilon_out = self.scale_noise(self.out_features)
            # Outer product gives the (out_features x in_features) weight noise.
            weight_epsilon = Variable(epsilon_out.ger(epsilon_in))
            if has_bias:
                bias_epsilon = Variable(self.scale_noise(self.out_features))
        else:
            weight_epsilon = Variable(self._normal_noise(self.out_features, self.in_features))
            if has_bias:
                bias_epsilon = Variable(self._normal_noise(self.out_features))

        weight = self.weight_mu + self.weight_sigma.mul(weight_epsilon)
        if not has_bias:
            return F.linear(input, weight)
        return F.linear(input,
                        weight,
                        self.bias_mu + self.bias_sigma.mul(bias_epsilon))

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + str(self.in_features) + ' -> ' \
            + str(self.out_features) + ')'
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
from collections import OrderedDict
|
||||||
|
from itertools import tee
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from common.modules.LayerNorm import LayerNorm
|
||||||
|
|
||||||
|
|
||||||
|
def pairwise(iterable):
    """Yield overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ..."""
    first, second = tee(iterable)
    # Advance the second copy by one so the two iterators are offset.
    next(second, None)
    return zip(first, second)
|
||||||
|
|
||||||
|
|
||||||
|
class LinearNet(nn.Module):
    """Stack of linear layers, each followed by an activation.

    Parameters
    ----------
    layers: sequence of int
        Layer sizes; consecutive pairs give the (in, out) size of each linear
        layer.
    activation: callable
        Zero-argument factory for the activation module placed after every
        linear layer.
    layer_norm: bool
        When True a LayerNorm is inserted between each linear layer and its
        activation.
    linear_layer: callable
        Factory ``linear_layer(in_features, out_features)`` for the linear
        modules.
    """

    def __init__(self, layers, activation=torch.nn.ELU,
                 layer_norm=False, linear_layer=nn.Linear):
        super(LinearNet, self).__init__()
        self.input_shape = layers[0]
        self.output_shape = layers[-1]

        # Named sub-modules in execution order: linear_i, [layer_norm_i], act_i.
        modules = OrderedDict()
        for i, (n_in, n_out) in enumerate(pairwise(layers)):
            modules["linear_{}".format(i)] = linear_layer(n_in, n_out)
            if layer_norm:
                modules["layer_norm_{}".format(i)] = LayerNorm(n_out)
            modules["act_{}".format(i)] = activation()

        self.net = torch.nn.Sequential(modules)

    def forward(self, x):
        x = self.net.forward(x)
        return x
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class RandomProcess(object):
    """Base class for exploration-noise processes."""

    def reset_states(self):
        """Reset internal state; the base implementation does nothing."""
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class AnnealedGaussianProcess(RandomProcess):
    """Random process whose sigma is annealed linearly with the step count.

    Parameters
    ----------
    mu: float
        Mean of the process.
    sigma: float
        Initial standard deviation.
    sigma_min: float or None
        Final standard deviation; ``None`` disables annealing.
    n_steps_annealing: int
        Number of steps over which sigma goes from `sigma` to `sigma_min`.
    """

    def __init__(self, mu, sigma, sigma_min, n_steps_annealing=int(1e5)):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        # current_sigma is the line m * n_steps + c, floored at sigma_min.
        if sigma_min is not None:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.c = sigma
            self.sigma_min = sigma_min
        else:
            self.m = 0.
            self.c = sigma
            self.sigma_min = sigma

    @property
    def current_sigma(self):
        """Sigma for the current step count, never below sigma_min."""
        sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
        return sigma
|
||||||
|
|
||||||
|
|
||||||
|
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """Ornstein-Uhlenbeck noise process with annealed sigma.

    Parameters
    ----------
    theta: float
        Strength of the pull towards `mu`.
    mu: float
        Mean the process reverts to.
    sigma: float
        Initial noise scale.
    dt: float
        Time step used in the update.
    x0: array or None
        Initial state; ``None`` means zeros of shape `size`.
    size: int
        Number of noise dimensions.
    sigma_min, n_steps_annealing:
        Annealing settings forwarded to AnnealedGaussianProcess.
    """

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2,
                 x0=None, size=1, sigma_min=None, n_steps_annealing=int(1e5)):
        # The parent constructor stores mu, sigma and the annealing schedule.
        super(OrnsteinUhlenbeckProcess, self).__init__(
            mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        """Advance the process by one step and return the new state."""
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        """Return the state to x0 (or zeros); the step counter is left untouched."""
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)
|
||||||
|
|
||||||
|
|
||||||
|
def create_random_process(args):
    """Build the exploration-noise process selected by ``args.rp_type``.

    Parameters
    ----------
    args: namespace
        Must provide ``rp_type``; for ``"ornstein-uhlenbeck"`` it must also
        provide ``n_action``, ``rp_theta``, ``rp_mu``, ``rp_sigma`` and
        ``rp_sigma_min``.

    Raises
    ------
    NotImplementedError
        If ``args.rp_type`` is not a supported name.
    """
    if args.rp_type == "ornstein-uhlenbeck":
        return OrnsteinUhlenbeckProcess(
            size=args.n_action,
            theta=args.rp_theta,
            mu=args.rp_mu,
            sigma=args.rp_sigma,
            sigma_min=args.rp_sigma_min)
    raise NotImplementedError(
        "unknown random process type: {!r}".format(args.rp_type))
|
||||||
@@ -0,0 +1,146 @@
|
|||||||
|
import operator
|
||||||
|
|
||||||
|
|
||||||
|
class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):
        """Build a Segment Tree data structure.

        https://en.wikipedia.org/wiki/Segment_tree

        Can be used as regular array, but with two
        important differences:

            a) setting item's value is slightly slower.
               It is O(lg capacity) instead of O(1).
            b) user has access to an efficient `reduce`
               operation which reduces `operation` over
               a contiguous subsequence of items in the
               array.

        Parameters
        ----------
        capacity: int
            Total size of the array - must be a power of two.
        operation: lambda obj, obj -> obj
            an operation for combining elements (eg. sum, max);
            it must form a mathematical group together with the set of
            possible values for array elements.
        neutral_element: obj
            neutral element for the operation above. eg. float('-inf')
            for max and 0 for sum.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        # Node 1 is the root, node i has children 2i and 2i + 1, and the
        # leaves occupy positions capacity .. 2 * capacity - 1.
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation

    def _reduce_helper(self, start, end, node, node_start, node_end):
        """Reduce arr[start..end] (inclusive) inside the subtree rooted at `node`."""
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            # Query lies entirely in the left child.
            return self._reduce_helper(start, end, 2 * node, node_start, mid)
        if mid + 1 <= start:
            # Query lies entirely in the right child.
            return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
        # Query straddles both children.
        return self._operation(
            self._reduce_helper(start, mid, 2 * node, node_start, mid),
            self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
        )

    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
        to a contiguous subsequence of the array.

            self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1])))

        Parameters
        ----------
        start: int
            beginning of the subsequence
        end: int
            end of the subsequence (exclusive); None means the capacity and
            a negative value counts from the end

        Returns
        -------
        reduced: obj
            result of reducing self.operation over the specified range of array elements.
        """
        if end is None:
            end = self._capacity
        if end < 0:
            end += self._capacity
        # The helper works with an inclusive upper bound.
        end -= 1
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # Same bounds check as __getitem__: an out-of-range write would
        # otherwise silently overwrite internal nodes of the tree.
        assert 0 <= idx < self._capacity
        # index of the leaf
        idx += self._capacity
        self._value[idx] = val
        # Recompute every ancestor up to the root.
        idx //= 2
        while idx >= 1:
            self._value[idx] = self._operation(
                self._value[2 * idx],
                self._value[2 * idx + 1]
            )
            idx //= 2

    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]
|
||||||
|
|
||||||
|
|
||||||
|
class SumSegmentTree(SegmentTree):
    """Segment tree whose reduce operation is addition."""

    def __init__(self, capacity):
        super(SumSegmentTree, self).__init__(
            capacity=capacity,
            operation=operator.add,
            neutral_element=0.0
        )

    def sum(self, start=0, end=None):
        """Returns arr[start] + ... + arr[end - 1]"""
        return super(SumSegmentTree, self).reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum

        if array values are probabilities, this function
        allows to sample indexes according to the discrete
        probability efficiently.

        Parameters
        ----------
        prefixsum: float
            upperbound on the sum of array prefix

        Returns
        -------
        idx: int
            highest index satisfying the prefixsum constraint
        """
        assert 0 <= prefixsum <= self.sum() + 1e-5
        idx = 1
        while idx < self._capacity:  # while non-leaf
            if self._value[2 * idx] > prefixsum:
                # Target falls inside the left subtree.
                idx = 2 * idx
            else:
                # Skip the whole left subtree and continue on the right.
                prefixsum -= self._value[2 * idx]
                idx = 2 * idx + 1
        return idx - self._capacity
|
||||||
|
|
||||||
|
|
||||||
|
class MinSegmentTree(SegmentTree):
    """Segment tree whose reduce operation is the minimum."""

    def __init__(self, capacity):
        super(MinSegmentTree, self).__init__(
            capacity=capacity,
            operation=min,
            neutral_element=float('inf')
        )

    def min(self, start=0, end=None):
        """Returns min(arr[start], ..., arr[end - 1])"""
        return super(MinSegmentTree, self).reduce(start, end)
|
||||||
@@ -0,0 +1,336 @@
|
|||||||
|
from __future__ import division
|
||||||
|
import numpy as np
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
|
||||||
|
def get_state_names(all=False, obst=False):
    """Return the ordered list of state component names.

    Parameters
    ----------
    all: bool
        When True the ``pelvis2_x`` / ``pelvis2_y`` entries are included
        among the body positions.
    obst: bool
        When True the three obstacle entries are appended at the end.
    """
    joints = ['hip_right', 'knee_right', 'ankle_right',
              'hip_left', 'knee_left', 'ankle_left']
    bodies = ['head', 'pelvis2', 'torso', 'toes_left',
              'toes_right', 'talus_left', 'talus_right']
    if not all:
        bodies.remove('pelvis2')

    names = ['pelvis_' + n for n in ('rot', 'x', 'y')]
    names += ['pelvis_vel_' + n for n in ('rot', 'x', 'y')]
    names += joints
    names += [joint + '_vel' for joint in joints]
    names += ['mass_x', 'mass_y']
    names += ['mass_x_vel', 'mass_y_vel']
    names += [b + '_' + i for b in bodies for i in ['x', 'y']]
    names += ['muscle_left', 'muscle_right']
    if obst:
        names += ['obst_dist', 'obst_y', 'obst_r']
    return names
|
||||||
|
|
||||||
|
|
||||||
|
def get_names_to_center(centr):
    """Return the x-coordinate state names for the given centre choice.

    Parameters
    ----------
    centr: str
        ``'pelvis'`` or ``'mass'``. The returned list contains the *other*
        one of the two (``mass_x`` for ``'pelvis'``, ``pelvis_x`` for
        ``'mass'``) together with the body x-coordinates.

    Raises
    ------
    ValueError
        If `centr` is neither ``'pelvis'`` nor ``'mass'``.
    """
    if centr == 'pelvis':
        pelvis_or_mass = 'mass'
    elif centr == 'mass':
        pelvis_or_mass = 'pelvis'
    else:
        raise ValueError('centr should be in [mass or pelvis], not {}'.format(centr))
    bodies = ['head', pelvis_or_mass, 'torso', 'toes_left',
              'toes_right', 'talus_left', 'talus_right']
    return [b + '_x' for b in bodies]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bodies_names():
    """Return the x/y state names of the tracked bodies, x before y per body."""
    bodies = ['head', 'torso', 'toes_left', 'toes_right', 'talus_left', 'talus_right']
    return [b + '_' + i for b in bodies for i in ['x', 'y']]
|
||||||
|
|
||||||
|
|
||||||
|
def get_names_obstacles():
    """Return the body names used when building obstacle features."""
    return ['toes_left', 'toes_right', 'talus_left', 'talus_right']
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_velocity(cur, prev):
    """Return the difference between `cur` and `prev` scaled by 100.

    When `prev` is None (no previous value) zeros shaped like `cur` are
    returned.
    """
    if prev is None:
        return np.zeros_like(cur)
    # NOTE(review): the factor 100 presumably is 1 / time step - confirm.
    return 100.*(cur - prev)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pattern_idxs(lst, pattern):
|
||||||
|
idxs = [i for i, x in enumerate(lst) if pattern in x]
|
||||||
|
return idxs
|
||||||
|
|
||||||
|
|
||||||
|
class State(object):
|
||||||
|
def __init__(self, obstacles_mode='bodies_dist', obst_grid_dist=1,
|
||||||
|
grid_points=100, predict_bodies=True, add_step=True, osb_first=False):
|
||||||
|
assert obstacles_mode in ['exclude', 'grid', 'bodies_dist', 'standard']
|
||||||
|
|
||||||
|
self.state_idxs = [i for i, n in enumerate(get_state_names(True, True)) if n not in ['pelvis2_x', 'pelvis2_y']]
|
||||||
|
self.state_names = get_state_names()
|
||||||
|
self.step = 0
|
||||||
|
self.add_step = add_step
|
||||||
|
self.osb_first = osb_first
|
||||||
|
self.obstacles_mode = obstacles_mode
|
||||||
|
self.obstacles = OrderedDict()
|
||||||
|
|
||||||
|
self.obst_names = []
|
||||||
|
if obstacles_mode == 'standard':
|
||||||
|
self.obst_names = ['obst_dist', 'obst_y', 'obst_r']
|
||||||
|
elif obstacles_mode == 'grid':
|
||||||
|
self.obst_names = ['obst_grid_{}'.format(i) for i in range(grid_points)]
|
||||||
|
self.obst_grid_dist = obst_grid_dist
|
||||||
|
self.obst_grid_points = grid_points
|
||||||
|
self.obst_grid_size = obst_grid_dist * 2 / grid_points
|
||||||
|
elif obstacles_mode == 'bodies_dist':
|
||||||
|
self._obst_names = get_names_obstacles()
|
||||||
|
for i in range(3):
|
||||||
|
for n in self._obst_names:
|
||||||
|
self.obst_names.append('{}_{}_obst_x_start'.format(n, i))
|
||||||
|
self.obst_names.append('{}_{}_obst_x_end'.format(n, i))
|
||||||
|
self.obst_names.append('{}_{}_obst_y'.format(n, i))
|
||||||
|
self.obst_names.append('is_obstacle')
|
||||||
|
|
||||||
|
if self.add_step:
|
||||||
|
self.state_names.append('step')
|
||||||
|
|
||||||
|
self.predict_bodies = predict_bodies
|
||||||
|
self.bodies_idxs_x = [self.state_names.index(n) for n in get_bodies_names() if n.endswith('_x')]
|
||||||
|
self.bodies_idxs_y = [self.state_names.index(n) for n in get_bodies_names() if n.endswith('_y')]
|
||||||
|
self.bodies_idxs = self.bodies_idxs_x + self.bodies_idxs_y
|
||||||
|
self.mass_x_idx = self.state_names.index('mass_x')
|
||||||
|
self.mass_y_idx = self.state_names.index('mass_y')
|
||||||
|
|
||||||
|
self.state_names_out = self.state_names
|
||||||
|
self._set_left_right()
|
||||||
|
|
||||||
|
def _set_left_right(self):
|
||||||
|
self.left_idxs = _get_pattern_idxs(self.state_names, '_left')
|
||||||
|
self.right_idxs = _get_pattern_idxs(self.state_names, '_right')
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.step = 0
|
||||||
|
self.prev_orig = None
|
||||||
|
self.prev_pred = None
|
||||||
|
self.obstacles = OrderedDict()
|
||||||
|
|
||||||
|
def _predict_bodies(self, state):
|
||||||
|
state = np.copy(state)
|
||||||
|
|
||||||
|
if self.step > 0:
|
||||||
|
|
||||||
|
def update_bodies(cur, prev_orig, prev_pred, d):
|
||||||
|
flt = cur == prev_orig
|
||||||
|
cur[flt] = prev_pred[flt] + d
|
||||||
|
|
||||||
|
# does not matter orig or pred
|
||||||
|
dx = state[self.mass_x_idx] - self.prev_orig[self.mass_x_idx]
|
||||||
|
dy = state[self.mass_y_idx] - self.prev_orig[self.mass_y_idx]
|
||||||
|
|
||||||
|
cur_bodies_x = state[self.bodies_idxs_x]
|
||||||
|
cur_bodies_y = state[self.bodies_idxs_y]
|
||||||
|
|
||||||
|
# need for filter
|
||||||
|
prev_orig_bodies_x = self.prev_orig[self.bodies_idxs_x]
|
||||||
|
prev_orig_bodies_y = self.prev_orig[self.bodies_idxs_y]
|
||||||
|
|
||||||
|
# need for updating
|
||||||
|
prev_pred_bodies_x = self.prev_pred[self.bodies_idxs_x]
|
||||||
|
prev_pred_bodies_y = self.prev_pred[self.bodies_idxs_y]
|
||||||
|
|
||||||
|
update_bodies(cur_bodies_x, prev_orig_bodies_x, prev_pred_bodies_x, dx)
|
||||||
|
update_bodies(cur_bodies_y, prev_orig_bodies_y, prev_pred_bodies_y, dy)
|
||||||
|
|
||||||
|
state[self.bodies_idxs_x] = cur_bodies_x
|
||||||
|
state[self.bodies_idxs_y] = cur_bodies_y
|
||||||
|
return state
|
||||||
|
|
||||||
|
def _add_obstacle(self, state):
|
||||||
|
pelvis_x = state[1]
|
||||||
|
obstacle_x = state[-3]
|
||||||
|
|
||||||
|
if obstacle_x != 100:
|
||||||
|
obstacle_x += pelvis_x
|
||||||
|
if round(obstacle_x, 5) not in self.obstacles:
|
||||||
|
self.obstacles[round(obstacle_x, 5)] = [obstacle_x, state[-2], state[-1]]
|
||||||
|
#print('obstacles {}, step {}'.format(self.obstacles.keys(), self.step))
|
||||||
|
if len(self.obstacles) > 3:
|
||||||
|
Warning('more than 3 obstacles')
|
||||||
|
|
||||||
|
def _get_obstacle_state_reward(self, state):
|
||||||
|
is_obst = float(state[-3] != 100)
|
||||||
|
|
||||||
|
if self.obstacles_mode == 'exclude':
|
||||||
|
return [is_obst], 0.
|
||||||
|
elif self.obstacles_mode == 'standard':
|
||||||
|
if not is_obst:
|
||||||
|
return [-1., 0., 0., is_obst], 0.
|
||||||
|
obst_features = np.clip(state[-3:], -10., 10.)
|
||||||
|
return np.append(obst_features, is_obst), 0.
|
||||||
|
elif self.obstacles_mode == 'gird':
|
||||||
|
mass_x = state[self.state_names.index('mass_x')]
|
||||||
|
obst_grid = np.zeros(self.obst_grid_points)
|
||||||
|
for k, v in self.obstacles.iteritems():
|
||||||
|
obst_x, obst_y, obst_r = v
|
||||||
|
obst_h = obst_y + obst_r
|
||||||
|
obst_left = int(np.ceil((obst_x - mass_x - obst_r) / self.obst_grid_size) + self.obst_grid_points // 2)
|
||||||
|
obst_right = int(np.ceil((obst_x - mass_x + obst_r) / self.obst_grid_size) + self.obst_grid_points // 2)
|
||||||
|
obst_left = max(obst_left, 0)
|
||||||
|
obst_right = max(obst_right, -1)
|
||||||
|
obst_grid[obst_left:obst_right + 1] = obst_h
|
||||||
|
obst_features = np.append(obst_grid, is_obst)
|
||||||
|
return obst_features, 0
|
||||||
|
else:
|
||||||
|
obst_state = []
|
||||||
|
obst_reward = 0
|
||||||
|
for i in range(3):
|
||||||
|
if i >= len(self.obstacles):
|
||||||
|
for n in self._obst_names:
|
||||||
|
body_y = state[self.state_names.index(n + '_y')]
|
||||||
|
obst_state.extend([10, 10, body_y])
|
||||||
|
else:
|
||||||
|
v = self.obstacles.values()[i]
|
||||||
|
obst_x, obst_y, obst_r = v
|
||||||
|
obst_h = obst_y + obst_r
|
||||||
|
obst_x_start = obst_x - obst_r
|
||||||
|
obst_x_end = obst_x + obst_r
|
||||||
|
for n in self._obst_names:
|
||||||
|
body_x = state[self.state_names.index(n + '_x')]
|
||||||
|
body_y = state[self.state_names.index(n + '_y')]
|
||||||
|
obst_state.append(obst_x_start - body_x)
|
||||||
|
obst_state.append(obst_x_end - body_x)
|
||||||
|
obst_state.append(body_y - obst_h)
|
||||||
|
if obst_reward >= 0 and body_x >= (obst_x_start - obst_r/2) \
|
||||||
|
and (body_x <= obst_x_end+obst_r/2) and (obst_h + obst_r/2) >= body_y:
|
||||||
|
obst_reward = -0.5
|
||||||
|
obst_state.append(is_obst)
|
||||||
|
return np.asarray(obst_state), obst_reward
|
||||||
|
|
||||||
|
def process(self, state):
    """Transform one raw observation.

    Selects ``self.state_idxs`` from the observation, registers the
    obstacle carried in its last three entries, computes the obstacle
    features/reward and strips those three entries from the body state.
    Optionally appends the scaled step counter and runs body prediction.
    Also advances ``self.step`` and caches the raw and predicted states
    for the next call.

    :param state: raw observation (any sequence accepted by numpy).
    :return: ``((body_state, obstacle_features), obstacle_reward)``.
    """
    selected = np.asarray(state)[self.state_idxs]

    if self.osb_first and self.step == 0:
        # Overwrite the obstacle entries with the "no obstacle" sentinel.
        selected[-3:] = [100, 0, 0]

    self._add_obstacle(selected)
    obst_state, obst_reward = self._get_obstacle_state_reward(selected)

    state_orig = selected[:-3]
    if self.add_step:
        state_orig = np.append(state_orig, 1. * self.step / 1000)

    result = self._predict_bodies(state_orig) if self.predict_bodies else state_orig

    self.step += 1
    self.prev_orig = state_orig
    self.prev_pred = np.copy(result)

    return (result, obst_state), obst_reward
|
||||||
|
|
||||||
|
def flip_state(self, state, copy=True):
    """Mirror a single 1-D state by delegating to ``flip_states``.

    :param state: 1-D state (asserted).
    :param copy: forwarded to ``flip_states``.
    :return: flattened result of ``flip_states`` on the one-row batch.
    """
    assert np.ndim(state) == 1
    single_row = np.asarray(state).reshape(1, -1)
    return self.flip_states(single_row, copy).ravel()
|
||||||
|
|
||||||
|
def flip_states(self, states, copy=True):
    """Swap the left-side and right-side columns of a batch of states.

    :param states: 2-D batch (asserted), one state per row.
    :param copy: when true the swap is done on a copy; otherwise the
        array obtained from ``np.asarray(states)`` is modified in place.
    :return: the batch with columns ``self.left_idxs`` and
        ``self.right_idxs`` exchanged.
    """
    assert np.ndim(states) == 2
    batch = np.asarray(states)
    if copy:
        batch = batch.copy()
    # Both sides are read before either side is written.
    batch[:, self.left_idxs], batch[:, self.right_idxs] = \
        batch[:, self.right_idxs], batch[:, self.left_idxs]
    return batch
|
||||||
|
|
||||||
|
@property
def state_size(self):
    """Number of output state names plus number of obstacle names."""
    n_state = len(self.state_names_out)
    n_obstacle = len(self.obst_names)
    return n_state + n_obstacle
|
||||||
|
|
||||||
|
|
||||||
|
class StateVel(State):
    """State transform that appends velocities of selected entries.

    On top of the base ``State`` processing, the values named in
    ``vel_states`` are passed (together with their values from the
    previous step) to ``calculate_velocity`` and the result is appended
    to the state, followed by the obstacle features.
    """

    def __init__(self, vel_states=get_bodies_names(), obstacles_mode='bodies_dist',
                 add_step=True, predict_bodies=True, osb_first=False):
        super(StateVel, self).__init__(
            obstacles_mode=obstacles_mode,
            predict_bodies=predict_bodies,
            add_step=add_step,
            osb_first=osb_first,
        )
        # Positions of the tracked entries, resolved before the name list
        # is extended with the velocity names.
        self.vel_idxs = [self.state_names.index(name) for name in vel_states]
        self.prev_vals = None
        self.state_names += [name + '_vel' for name in vel_states]
        self.state_names_out = self.state_names
        # Recompute left/right indices now that the names changed.
        self._set_left_right()

    def reset(self):
        """Reset the base state and forget the previous tracked values."""
        super(StateVel, self).reset()
        self.prev_vals = None

    def process(self, state):
        """Return ``(state + velocities + obstacle features, reward)``."""
        (base_state, obst_state), obst_reward = super(StateVel, self).process(state)
        tracked = base_state[self.vel_idxs]
        velocities = calculate_velocity(tracked, self.prev_vals)
        self.prev_vals = tracked
        full_state = np.concatenate((base_state, velocities, obst_state))
        return full_state, obst_reward
|
||||||
|
|
||||||
|
|
||||||
|
class StateVelCentr(State):
    """State transform with velocities and centring on a reference entry.

    The value at ``centr_state`` is subtracted from every entry named in
    ``states_to_center``. Velocities of ``vel_states`` are computed with
    ``calculate_velocity`` either before or after that subtraction
    (``vel_before_centr``). With ``exclude_centr`` the reference entry
    itself is removed from the output.
    """

    def __init__(self, centr_state='pelvis_x', vel_states=get_bodies_names(),
                 states_to_center=get_names_to_center('pelvis'),
                 vel_before_centr=True, obstacles_mode='bodies_dist',
                 exclude_centr=False, predict_bodies=True,
                 add_step=True, osb_first=False):
        super(StateVelCentr, self).__init__(
            obstacles_mode=obstacles_mode,
            predict_bodies=predict_bodies,
            add_step=add_step,
            osb_first=osb_first,
        )

        names = self.state_names

        # Centring: reference entry and the entries shifted by it.
        self.centr_idx = names.index(centr_state)
        self.states_to_center = [names.index(name) for name in states_to_center]

        # Velocities: indices are resolved before the names are extended.
        self.prev_vals = None
        self.vel_idxs = [names.index(name) for name in vel_states]
        self.vel_before_centr = vel_before_centr
        self.state_names += [name + '_vel' for name in vel_states]
        self.exclude_centr = exclude_centr

        if not self.exclude_centr:
            self.state_names_out = self.state_names
        else:
            head = self.state_names[:max(0, self.centr_idx)]
            tail = self.state_names[self.centr_idx + 1:]
            self.state_names_out = head + tail

        # Left/right indices refer to the output names.
        self._set_left_right()

    def _set_left_right(self):
        """Locate the '_left' / '_right' entries among the output names."""
        out_names = self.state_names_out
        self.left_idxs = _get_pattern_idxs(out_names, '_left')
        self.right_idxs = _get_pattern_idxs(out_names, '_right')

    def reset(self):
        """Reset the base state and forget the previous tracked values."""
        super(StateVelCentr, self).reset()
        self.prev_vals = None

    def process(self, state):
        """Return ``(centred state + velocities + obstacle features, reward)``."""
        (base_state, obst_state), obst_reward = super(StateVelCentr, self).process(state)

        if not self.vel_before_centr:
            # Centre first so velocities are measured on centred values.
            base_state[self.states_to_center] -= base_state[self.centr_idx]

        tracked = base_state[self.vel_idxs]
        velocities = calculate_velocity(tracked, self.prev_vals)
        self.prev_vals = tracked

        if self.vel_before_centr:
            # Velocities were taken on raw values; centre afterwards.
            base_state[self.states_to_center] -= base_state[self.centr_idx]

        if self.exclude_centr:
            base_state = np.concatenate(
                [base_state[:max(0, self.centr_idx)], base_state[self.centr_idx+1:]])

        full_state = np.concatenate((base_state, velocities, obst_state))
        return full_state, obst_reward
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
import torch
|
||||||
|
from torch.autograd import Variable
|
||||||
|
|
||||||
|
# Whether a CUDA device is available in this process.
USE_CUDA = torch.cuda.is_available()

# Default float tensor type: the CUDA variant when CUDA is available.
if USE_CUDA:
    FLOAT = torch.cuda.FloatTensor
else:
    FLOAT = torch.FloatTensor
|
||||||
|
|
||||||
|
|
||||||
|
def to_numpy(var):
    """Return the data of ``var`` as a numpy array.

    When CUDA is in use the variable is moved to the CPU first.
    """
    if USE_CUDA:
        return var.cpu().data.numpy()
    return var.data.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def to_tensor(ndarray, volatile=False, requires_grad=False, dtype=FLOAT):
    """Wrap a numpy array in a ``Variable`` of the given tensor type.

    :param ndarray: source numpy array.
    :param volatile: forwarded to ``Variable``.
    :param requires_grad: forwarded to ``Variable``.
    :param dtype: tensor type the variable is converted to.
    """
    wrapped = Variable(
        torch.from_numpy(ndarray),
        volatile=volatile,
        requires_grad=requires_grad,
    )
    return wrapped.type(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def soft_update(target, source, tau):
    """Blend ``source`` parameters into ``target`` in place.

    Every target parameter becomes
    ``target * (1 - tau) + source * tau``.
    """
    paired = zip(target.parameters(), source.parameters())
    for dst, src in paired:
        blended = dst.data * (1.0 - tau) + src.data * tau
        dst.data.copy_(blended)
|
||||||
|
|
||||||
|
|
||||||
|
def hard_update(target, source):
    """Copy every parameter of ``source`` into ``target`` in place."""
    paired = zip(target.parameters(), source.parameters())
    for dst, src in paired:
        dst.data.copy_(src.data)
|
||||||
|
|
||||||
|
|
||||||
|
# Lookup table from an activation name to the torch.nn module class.
activations = dict(
    relu=torch.nn.ReLU,
    elu=torch.nn.ELU,
    leakyrelu=torch.nn.LeakyReLU,
    selu=torch.nn.SELU,
    sigmoid=torch.nn.Sigmoid,
    tanh=torch.nn.Tanh,
)
|
||||||
@@ -0,0 +1,70 @@
|
|||||||
|
import os
|
||||||
|
import torch
|
||||||
|
import copy
|
||||||
|
from multiprocessing import Value
|
||||||
|
|
||||||
|
from common.misc_util import str2params, create_if_need
|
||||||
|
from common.env_wrappers import create_env
|
||||||
|
from common.torch_util import activations, hard_update
|
||||||
|
|
||||||
|
from ddpg.model import create_model, create_act_update_fns, train_multi_thread
|
||||||
|
from ddpg.train import parse_args
|
||||||
|
|
||||||
|
|
||||||
|
def debug(args, model_fn, act_update_fns, multi_thread):
    """Run one training worker in the current process.

    Completes ``args`` from the environment (action/observation sizes,
    parsed layer specs, activation classes, optional state flipping),
    builds the actor/critic and their target copies, places all four
    networks in shared memory and training mode, and hands them to
    ``multi_thread`` as thread 0.

    :param args: parsed command-line namespace; mutated in place.
    :param model_fn: callable ``args -> (actor, critic)``.
    :param act_update_fns: callable returning ``(act_fn, update_fn, save_fn)``.
    :param multi_thread: training routine to run.
    """
    create_if_need(args.logdir)
    env = create_env(args)

    if args.flip_state_action and hasattr(env, "state_transform"):
        args.flip_states = env.state_transform.flip_states

    args.n_action = env.action_space.shape[0]
    args.n_observation = env.observation_space.shape[0]

    args.actor_layers = str2params(args.actor_layers)
    args.critic_layers = str2params(args.critic_layers)

    args.actor_activation = activations[args.actor_activation]
    args.critic_activation = activations[args.critic_activation]

    actor, critic = model_fn(args)

    if args.restore_actor_from is not None:
        actor.load_state_dict(torch.load(args.restore_actor_from))
    if args.restore_critic_from is not None:
        critic.load_state_dict(torch.load(args.restore_critic_from))

    actor.train()
    critic.train()
    actor.share_memory()
    critic.share_memory()

    target_actor = copy.deepcopy(actor)
    target_critic = copy.deepcopy(critic)

    hard_update(target_actor, actor)
    hard_update(target_critic, critic)

    target_actor.train()
    # Was `critic.train()` a second time; the target critic is the
    # network that still needed its mode set here.
    target_critic.train()
    target_actor.share_memory()
    target_critic.share_memory()

    _, _, save_fn = act_update_fns(actor, critic, target_actor, target_critic, args)

    args.thread = 0
    best_reward = Value("f", 0.0)
    multi_thread(actor, critic, target_actor, target_critic, args, act_update_fns, best_reward)

    save_fn()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Restrict OpenMP and torch to a single thread each.
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.set_num_threads(1)
    debug(parse_args(), create_model, create_act_update_fns, train_multi_thread)
|
||||||
+477
@@ -0,0 +1,477 @@
|
|||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import queue as py_queue
|
||||||
|
import time
|
||||||
|
import torch.nn as nn
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
from ddpg.nets import Actor, Critic
|
||||||
|
from common.torch_util import to_numpy, to_tensor, soft_update
|
||||||
|
from common.misc_util import create_if_need, set_global_seeds
|
||||||
|
from common.logger import Logger
|
||||||
|
from common.buffers import create_buffer
|
||||||
|
from common.loss import create_loss, create_decay_fn
|
||||||
|
from common.env_wrappers import create_env
|
||||||
|
from common.random_process import create_random_process
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(args):
    """Build the actor and critic networks described by ``args``.

    Both networks are pretty-printed before being returned.

    :param args: namespace providing observation/action sizes and the
        per-network layer, activation, layer-norm and parameter-noise
        settings.
    :return: ``(actor, critic)``.
    """
    n_obs, n_act = args.n_observation, args.n_action

    actor = Actor(
        n_obs, n_act, args.actor_layers,
        activation=args.actor_activation,
        layer_norm=args.actor_layer_norm,
        parameters_noise=args.actor_parameters_noise,
        parameters_noise_factorised=args.actor_parameters_noise_factorised,
        last_activation=nn.Tanh,
    )
    critic = Critic(
        n_obs, n_act, args.critic_layers,
        activation=args.critic_activation,
        layer_norm=args.critic_layer_norm,
        parameters_noise=args.critic_parameters_noise,
        parameters_noise_factorised=args.critic_parameters_noise_factorised,
    )

    for network in (actor, critic):
        pprint(network)

    return actor, critic
|
||||||
|
|
||||||
|
|
||||||
|
def create_act_update_fns(actor, critic, target_actor, target_critic, args):
    """Create the act / update / save closures for a DDPG agent.

    :param actor: policy network.
    :param critic: value network taking ``(observation, action)``.
    :param target_actor: slowly updated copy of ``actor``.
    :param target_critic: slowly updated copy of ``critic``.
    :param args: namespace with learning rates, ``gamma``, ``tau``,
        ``grad_clip``, ``n_action``, ``logdir`` and, optionally,
        a ``flip_states`` callable.
    :return: ``(act_fn, update_fn, save_fn)``.
    """
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)

    criterion = create_loss(args)

    low_action_boundary = -1.
    high_action_boundary = 1.

    def act_fn(observation, noise=0):
        """Return the clipped, noise-perturbed action for one observation."""
        action = to_numpy(actor(to_tensor(np.array([observation], dtype=np.float32)))).squeeze(0)
        action += noise
        action = np.clip(action, low_action_boundary, high_action_boundary)
        return action

    def update_fn(
            observations, actions, rewards, next_observations, dones, weights,
            actor_lr=1e-4, critic_lr=1e-3):
        """Run one critic step, one actor step and a soft target update.

        :return: ``(metrics, info)`` where ``metrics`` holds the value and
            policy losses and ``info["td_error"]`` the TD errors as numpy.
        """
        if hasattr(args, "flip_states"):
            # Augment the batch with its mirrored copy: states are flipped
            # by the supplied callable, actions by swapping their halves.
            observations_flip = args.flip_states(observations)
            next_observations_flip = args.flip_states(next_observations)
            actions_flip = np.zeros_like(actions)
            actions_flip[:, :args.n_action // 2] = actions[:, args.n_action // 2:]
            actions_flip[:, args.n_action // 2:] = actions[:, :args.n_action // 2]

            observations = np.concatenate((observations, observations_flip))
            actions = np.concatenate((actions, actions_flip))
            rewards = np.tile(rewards.ravel(), 2)
            next_observations = np.concatenate((next_observations, next_observations_flip))
            dones = np.tile(dones.ravel(), 2)
            # NOTE(review): `weights` is not tiled with the rest of the
            # batch -- confirm the shapes expected by the loss and by the
            # prioritized-replay caller.

        # Builtin `bool` instead of the `np.bool` alias, which NumPy 1.24
        # removed.
        dones = dones[:, None].astype(bool)
        rewards = rewards[:, None].astype(np.float32)

        # After inversion the tensor is 1 for non-terminal transitions.
        dones = to_tensor(np.invert(dones).astype(np.float32))
        rewards = to_tensor(rewards)
        weights = to_tensor(weights, requires_grad=False)

        next_v_values = target_critic(
            to_tensor(next_observations, volatile=True),
            target_actor(to_tensor(next_observations, volatile=True)),
        )
        next_v_values.volatile = False

        reward_predicted = dones * args.gamma * next_v_values
        td_target = rewards + reward_predicted

        # Critic update
        critic.zero_grad()

        v_values = critic(to_tensor(observations), to_tensor(actions))
        value_loss = criterion(v_values, td_target, weights=weights)
        value_loss.backward()

        torch.nn.utils.clip_grad_norm(critic.parameters(), args.grad_clip)
        for param_group in critic_optim.param_groups:
            param_group["lr"] = critic_lr

        critic_optim.step()

        # Actor update
        actor.zero_grad()

        policy_loss = -critic(
            to_tensor(observations),
            actor(to_tensor(observations))
        )

        policy_loss = torch.mean(policy_loss * weights)
        policy_loss.backward()

        torch.nn.utils.clip_grad_norm(actor.parameters(), args.grad_clip)
        for param_group in actor_optim.param_groups:
            param_group["lr"] = actor_lr

        actor_optim.step()

        # Target update
        soft_update(target_actor, actor, args.tau)
        soft_update(target_critic, critic, args.tau)

        metrics = {
            "value_loss": value_loss,
            "policy_loss": policy_loss
        }

        # TD error is evaluated with the freshly updated critic.
        td_v_values = critic(
            to_tensor(observations, volatile=True, requires_grad=False),
            to_tensor(actions, volatile=True, requires_grad=False))
        td_error = td_target - td_v_values

        info = {
            "td_error": to_numpy(td_error)
        }

        return metrics, info

    def save_fn(episode=None):
        """Save all four state dicts under ``args.logdir``.

        With ``episode`` given, an ``episode_<n>`` sub-directory is used.
        """
        if episode is None:
            save_path = args.logdir
        else:
            save_path = "{}/episode_{}".format(args.logdir, episode)
            create_if_need(save_path)
        torch.save(actor.state_dict(), "{}/actor_state_dict.pkl".format(save_path))
        torch.save(critic.state_dict(), "{}/critic_state_dict.pkl".format(save_path))
        torch.save(target_actor.state_dict(), "{}/target_actor_state_dict.pkl".format(save_path))
        torch.save(target_critic.state_dict(), "{}/target_critic_state_dict.pkl".format(save_path))

    return act_fn, update_fn, save_fn
|
||||||
|
|
||||||
|
|
||||||
|
def train_multi_thread(actor, critic, target_actor, target_critic, args, prepare_fn, best_reward):
    """Worker that both plays episodes and trains on its own buffer.

    Runs until ``args.max_episodes`` episodes are done or the time budget
    ``args.max_train_days`` is exceeded, saves the networks and then
    raises ``KeyboardInterrupt``.

    :param prepare_fn: callable returning ``(act_fn, update_fn, save_fn)``.
    :param best_reward: shared value holding the best episode reward.
    """
    set_global_seeds(args.seed + 241 * args.thread)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)

    def linear_decay(start, end):
        # Linear schedule over the whole episode budget.
        return create_decay_fn(
            "linear",
            initial_value=start,
            final_value=end,
            max_step=args.max_episodes)

    buffer = create_buffer(args)
    if args.prioritized_replay:
        beta_decay_fn = linear_decay(args.prioritized_replay_beta0, 1.0)

    env = create_env(args)
    random_process = create_random_process(args)

    actor_lr_schedule = linear_decay(args.actor_lr, args.actor_lr_end)
    critic_lr_schedule = linear_decay(args.critic_lr, args.critic_lr_end)

    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 0
    step = 0
    start_time = time.time()
    while episode < args.max_episodes:
        # A fresh environment is created every 100 episodes.
        if episode % 100 == 0:
            env = create_env(args)
        seed = random.randrange(2 ** 32 - 2)

        actor_lr = actor_lr_schedule(episode)
        critic_lr = critic_lr_schedule(episode)
        epsilon = epsilon_decay_fn(episode)
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon))

        episode_metrics = {
            "value_loss": 0.0,
            "policy_loss": 0.0,
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        done = False

        while not done:
            action = act_fn(observation, noise=epsilon*random_process.sample())
            next_observation, reward, done, _ = env.step(action)

            buffer.add(observation, action, reward, next_observation, done)
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            if len(buffer) >= args.train_steps:
                if args.prioritized_replay:
                    batch = buffer.sample(
                        batch_size=args.batch_size, beta=beta_decay_fn(episode))
                    (tr_observations, tr_actions, tr_rewards, tr_next_observations,
                     tr_dones, weights, batch_idxes) = batch
                else:
                    batch = buffer.sample(batch_size=args.batch_size)
                    (tr_observations, tr_actions, tr_rewards, tr_next_observations,
                     tr_dones) = batch
                    weights, batch_idxes = np.ones_like(tr_rewards), None

                step_metrics, step_info = update_fn(
                    tr_observations, tr_actions, tr_rewards,
                    tr_next_observations, tr_dones,
                    weights, actor_lr, critic_lr)

                if args.prioritized_replay:
                    new_priorities = np.abs(step_info["td_error"]) + 1e-6
                    buffer.update_priorities(batch_idxes, new_priorities)

                for name, metric in step_metrics.items():
                    episode_metrics[name] += to_numpy(metric)[0]

            observation = next_observation

        episode += 1

        total_reward = episode_metrics["reward"]
        if total_reward > 15.0 * args.reward_scale and total_reward > best_reward.value:
            best_reward.value = total_reward
            logger.scalar_summary("best reward", best_reward.value, episode)
            save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for name, metric in episode_metrics.items():
            if "loss" in name:
                # Losses are reported as per-step averages.
                metric = metric / episode_metrics["step"]
            logger.scalar_summary(name, metric, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)
        logger.scalar_summary("actor lr", actor_lr, episode)
        logger.scalar_summary("critic lr", critic_lr, episode)

        if episode % args.save_step == 0:
            save_fn(episode)

        if elapsed_time > 86400 * args.max_train_days:
            # Time budget exhausted: force the loop condition to fail.
            episode = args.max_episodes + 1

    save_fn(episode)

    raise KeyboardInterrupt
|
||||||
|
|
||||||
|
|
||||||
|
def train_single_thread(
        actor, critic, target_actor, target_critic, args, prepare_fn,
        global_episode, global_update_step, episodes_queue):
    """Trainer worker: consumes episodes from a queue and updates the nets.

    Loops while both shared counters are below their limits, draining
    ``episodes_queue`` into a local replay buffer and performing one
    update per iteration once the buffer holds ``args.train_steps``
    transitions (sleeping one second otherwise). Saves the networks at
    the end and raises ``KeyboardInterrupt``.

    :param prepare_fn: callable returning ``(act_fn, update_fn, save_fn)``.
    :param global_episode: shared counter of played episodes.
    :param global_update_step: shared counter of performed updates.
    :param episodes_queue: queue of episodes, each a list of transitions.
    """
    set_global_seeds(args.seed + 241 * args.thread)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    _, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)

    logger = Logger(args.logdir)

    buffer = create_buffer(args)

    def linear_decay(start, end):
        # Linear schedule over the update-step budget.
        return create_decay_fn(
            "linear",
            initial_value=start,
            final_value=end,
            max_step=args.max_update_steps)

    if args.prioritized_replay:
        beta_decay_fn = linear_decay(args.prioritized_replay_beta0, 1.0)

    actor_lr_schedule = linear_decay(args.actor_lr, args.actor_lr_end)
    critic_lr_schedule = linear_decay(args.critic_lr, args.critic_lr_end)

    max_global_episodes = args.max_episodes * (args.num_threads - args.num_train_threads)
    max_global_updates = args.max_update_steps * args.num_train_threads

    update_step = 0
    received_examples = 1  # starts at 1 so the ratio below never divides by zero
    while global_episode.value < max_global_episodes \
            and global_update_step.value < max_global_updates:
        # Schedules are clamped to the [end, start] range.
        actor_lr = min(args.actor_lr, max(args.actor_lr_end, actor_lr_schedule(update_step)))
        critic_lr = min(args.critic_lr, max(args.critic_lr_end, critic_lr_schedule(update_step)))

        # Drain everything currently waiting in the queue.
        while True:
            try:
                replay = episodes_queue.get_nowait()
                for transition in replay:
                    observation, action, reward, next_observation, done = transition
                    buffer.add(observation, action, reward, next_observation, done)
                received_examples += len(replay)
            except py_queue.Empty:
                break

        if len(buffer) < args.train_steps:
            time.sleep(1)
        else:
            if args.prioritized_replay:
                beta = beta_decay_fn(update_step)
                beta = min(1.0, max(args.prioritized_replay_beta0, beta))
                batch = buffer.sample(
                    batch_size=args.batch_size,
                    beta=beta)
                (tr_observations, tr_actions, tr_rewards, tr_next_observations,
                 tr_dones, weights, batch_idxes) = batch
            else:
                batch = buffer.sample(batch_size=args.batch_size)
                (tr_observations, tr_actions, tr_rewards, tr_next_observations,
                 tr_dones) = batch
                weights, batch_idxes = np.ones_like(tr_rewards), None

            step_metrics, step_info = update_fn(
                tr_observations, tr_actions, tr_rewards,
                tr_next_observations, tr_dones,
                weights, actor_lr, critic_lr)

            update_step += 1
            global_update_step.value += 1

            if args.prioritized_replay:
                new_priorities = np.abs(step_info["td_error"]) + 1e-6
                buffer.update_priorities(batch_idxes, new_priorities)

            for name, metric in step_metrics.items():
                logger.scalar_summary(name, to_numpy(metric)[0], update_step)

            logger.scalar_summary("actor lr", actor_lr, update_step)
            logger.scalar_summary("critic lr", critic_lr, update_step)

            if update_step % args.save_step == 0:
                save_fn(update_step)

        logger.scalar_summary("buffer size", len(buffer), global_episode.value)
        logger.scalar_summary(
            "updates per example",
            update_step * args.batch_size / received_examples,
            global_episode.value)

    save_fn(update_step)

    raise KeyboardInterrupt
|
||||||
|
|
||||||
|
|
||||||
|
def play_single_thread(
        actor, critic, target_actor, target_critic, args, prepare_fn,
        global_episode, global_update_step, episodes_queue,
        best_reward):
    """Player worker: runs episodes and ships them to the trainers.

    Loops while both shared counters are below their limits. Each played
    episode is put on ``episodes_queue`` as a list of
    ``(observation, action, reward, next_observation, done)`` tuples.
    Raises ``KeyboardInterrupt`` when the loop ends.

    :param prepare_fn: callable returning ``(act_fn, update_fn, save_fn)``.
    :param global_episode: shared counter of played episodes.
    :param global_update_step: shared counter of performed updates.
    :param episodes_queue: queue receiving the played episodes.
    :param best_reward: shared value holding the best episode reward.
    """
    set_global_seeds(args.seed + 241 * args.thread)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, _, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)

    logger = Logger(args.logdir)
    env = create_env(args)
    random_process = create_random_process(args)

    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    max_global_episodes = args.max_episodes * (args.num_threads - args.num_train_threads)
    max_global_updates = args.max_update_steps * args.num_train_threads

    episode = 1
    step = 0
    start_time = time.time()
    while global_episode.value < max_global_episodes \
            and global_update_step.value < max_global_updates:
        # A fresh environment is created every 100 episodes.
        if episode % 100 == 0:
            env = create_env(args)
        seed = random.randrange(2 ** 32 - 2)

        epsilon = epsilon_decay_fn(episode)
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon))

        episode_metrics = {
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        done = False

        replay = []
        while not done:
            action = act_fn(observation, noise=epsilon * random_process.sample())
            next_observation, reward, done, _ = env.step(action)

            replay.append((observation, action, reward, next_observation, done))
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            observation = next_observation

        episodes_queue.put(replay)

        episode += 1
        global_episode.value += 1

        total_reward = episode_metrics["reward"]
        if total_reward > best_reward.value:
            best_reward.value = total_reward
            logger.scalar_summary("best reward", best_reward.value, episode)

            # Only a new best that also clears the threshold is saved.
            if total_reward > 15.0 * args.reward_scale:
                save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for name, metric in episode_metrics.items():
            logger.scalar_summary(name, metric, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)

        if elapsed_time > 86400 * args.max_train_days:
            # Time budget exhausted: push the shared counter past its limit.
            global_episode.value = max_global_episodes + 1

    raise KeyboardInterrupt
|
||||||
@@ -0,0 +1,90 @@
|
|||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from common.nets import LinearNet
|
||||||
|
from common.modules.NoisyLinear import NoisyLinear
|
||||||
|
|
||||||
|
|
||||||
|
def fanin_init(size, fanin=None):
    """Return a new tensor sampled uniformly from ``[-v, v]``.

    ``v`` is ``1 / sqrt(fanin)``; ``fanin`` falls back to ``size[0]``
    when it is not given (or is zero/None).

    :param size: passed unchanged to ``torch.Tensor``.
    :param fanin: optional fan-in used for the bound.
    """
    if not fanin:
        fanin = size[0]
    bound = 1. / np.sqrt(fanin)
    weights = torch.Tensor(size)
    weights.uniform_(-bound, bound)
    return weights
|
||||||
|
|
||||||
|
|
||||||
|
class Actor(nn.Module):
    """Policy network: feature layers followed by an action head.

    :param n_observation: size of the observation vector.
    :param n_action: size of the action vector.
    :param layers: list of hidden layer sizes for the feature net.
    :param activation: activation class for the feature net.
    :param layer_norm: forwarded to the feature ``LinearNet``.
    :param parameters_noise: use ``NoisyLinear`` layers in the feature net.
    :param parameters_noise_factorised: forwarded to ``NoisyLinear``.
    :param last_activation: activation class of the action head.
    :param init_w: bound of the uniform init applied to the action head.
    """

    def __init__(self, n_observation, n_action,
                 layers, activation=torch.nn.ELU,
                 layer_norm=False,
                 parameters_noise=False, parameters_noise_factorised=False,
                 last_activation=torch.nn.Tanh, init_w=3e-3):
        super(Actor, self).__init__()

        if parameters_noise:
            def linear_layer(x_in, x_out):
                return NoisyLinear(x_in, x_out, factorised=parameters_noise_factorised)
        else:
            linear_layer = nn.Linear

        self.feature_net = LinearNet(
            layers=[n_observation] + layers,
            activation=activation,
            layer_norm=layer_norm,
            linear_layer=linear_layer)
        self.policy_net = LinearNet(
            layers=[self.feature_net.output_shape, n_action],
            activation=last_activation,
            layer_norm=False
        )
        self.init_weights(init_w)

    def init_weights(self, init_w):
        """Initialise the linear layers.

        Feature layers get the fan-in init; the action head gets a small
        uniform init in ``[-init_w, init_w]``. The second loop used to
        iterate over ``feature_net`` again, which overwrote the fan-in
        init and left the action head at its default initialisation.
        """
        for layer in self.feature_net.net:
            if isinstance(layer, nn.Linear):
                layer.weight.data = fanin_init(layer.weight.data.size())

        for layer in self.policy_net.net:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-init_w, init_w)

    def forward(self, observation):
        """Map a batch of observations to a batch of actions."""
        x = observation
        x = self.feature_net.forward(x)
        x = self.policy_net.forward(x)
        return x
|
||||||
|
|
||||||
|
|
||||||
|
class Critic(nn.Module):
    """Value network: maps an (observation, action) pair to a single output.

    The observation and the action are concatenated, passed through a
    ``LinearNet`` feature extractor and then through one ``nn.Linear`` layer
    with a single output unit.

    Args:
        n_observation: size of the observation vector.
        n_action: size of the action vector.
        layers: list with the hidden layer sizes of the feature extractor.
        activation: activation class used by the feature extractor.
        layer_norm: forwarded to the feature extractor ``LinearNet``.
        parameters_noise: when true, the feature extractor is built from
            ``NoisyLinear`` layers instead of ``nn.Linear``.
        parameters_noise_factorised: forwarded to ``NoisyLinear`` as
            ``factorised``.
        init_w: bound of the uniform initialisation of the output layer.
    """

    def __init__(self, n_observation, n_action,
                 layers, activation=torch.nn.ELU,
                 layer_norm=False,
                 parameters_noise=False, parameters_noise_factorised=False,
                 init_w=3e-3):
        super(Critic, self).__init__()

        def make_noisy_layer(x_in, x_out):
            return NoisyLinear(x_in, x_out, factorised=parameters_noise_factorised)

        layer_factory = make_noisy_layer if parameters_noise else nn.Linear

        input_size = n_observation + n_action
        self.feature_net = LinearNet(
            layers=[input_size] + layers,
            activation=activation,
            layer_norm=layer_norm,
            linear_layer=layer_factory)
        self.value_net = nn.Linear(self.feature_net.output_shape, 1)
        self.init_weights(init_w)

    def init_weights(self, init_w):
        """Fan-in initialise the feature layers; output layer gets U(-init_w, init_w)."""
        linear_layers = (
            module for module in self.feature_net.net
            if isinstance(module, nn.Linear))
        for module in linear_layers:
            module.weight.data = fanin_init(module.weight.data.size())

        self.value_net.weight.data.uniform_(-init_w, init_w)

    def forward(self, observation, action):
        """Return the network output for the given observation/action batch."""
        # Inputs are joined along dimension 1.
        joined = torch.cat((observation, action), dim=1)
        features = self.feature_net.forward(joined)
        return self.value_net.forward(features)
|
||||||
+186
@@ -0,0 +1,186 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import torch
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
from osim.env import RunEnv
|
||||||
|
from osim.http.client import Client
|
||||||
|
|
||||||
|
from common.misc_util import boolean_flag, query_yes_no
|
||||||
|
from common.env_wrappers import create_observation_handler, create_action_handler, create_env
|
||||||
|
|
||||||
|
from ddpg.train import str2params, activations
|
||||||
|
from ddpg.model import create_model, create_act_update_fns
|
||||||
|
|
||||||
|
|
||||||
|
REMOTE_BASE = 'http://grader.crowdai.org:1729'
|
||||||
|
ACTION_SHAPE = 18
|
||||||
|
SEEDS = [
|
||||||
|
3834825972, 3049289152, 3538742899, 2904257823, 4011088434,
|
||||||
|
2684066875, 781202090, 1691535473, 898088606, 1301477286
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line of the submit/test script.

    Returns:
        The ``argparse`` namespace with the restore paths, the evaluation
        settings and the ``visualize`` / ``submit`` switches.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # Where to load the saved arguments and network weights from.
    for source in ("args", "actor", "critic"):
        add('--restore-{}-from'.format(source), default=None, type=str)

    add('--max-obstacles', default=3, type=int)
    add('--num-episodes', default=1, type=int)
    add('--token', default=None, type=str)

    # Switches registered through the project's boolean_flag helper.
    for switch in ("visualize", "submit"):
        boolean_flag(parser, switch, default=False)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def restore_args(args):
    """Overwrite attributes of ``args`` with values saved in a JSON file.

    The file is read from ``args.restore_args_from``. Keys that must keep
    their command-line values (obstacle count and the restore paths) are
    dropped from the loaded dictionary; every remaining key is set on
    ``args``.

    Args:
        args: namespace with a ``restore_args_from`` path attribute.

    Returns:
        The same ``args`` object, updated in place.
    """
    with open(args.restore_args_from, "r") as fin:
        params = json.load(fin)

    unwanted = [
        "max_obstacles",
        "restore_args_from",
        "restore_actor_from",
        "restore_critic_from"
    ]

    # pop() with a default tolerates files that lack some of these keys.
    for unwanted_key in unwanted:
        params.pop(unwanted_key, None)

    for key, value in params.items():
        setattr(args, key, value)
    return args
|
||||||
|
|
||||||
|
|
||||||
|
def submit(actor, critic, args, act_update_fn):
    """Run the agent against the remote grader and optionally submit the result.

    The agent is stepped through the remote environment until the grader
    stops handing out new episodes. Per-episode reward and step counts are
    collected and summarised with ``DataFrame.describe`` before the user is
    asked whether to submit.

    Args:
        actor: policy network passed to ``act_update_fn``.
        critic: value network passed to ``act_update_fn``.
        args: namespace; ``args.token`` is the grader token, and the whole
            namespace is forwarded to the handler factories.
        act_update_fn: factory whose first returned item is the acting
            function used here.
    """
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)

    client = Client(REMOTE_BASE)

    all_episode_metrics = []

    episode_metrics = {
        "reward": 0.0,
        "step": 0,
    }

    observation_handler = create_observation_handler(args)
    action_handler = create_action_handler(args)
    observation = client.env_create(args.token)
    action = np.zeros(ACTION_SHAPE, dtype=np.float32)
    observation = observation_handler(observation, action)

    while True:
        print(episode_metrics["reward"])
        action = act_fn(observation)

        observation, reward, done, _ = client.env_step(action_handler(action).tolist())

        episode_metrics["reward"] += reward
        episode_metrics["step"] += 1

        if done:
            all_episode_metrics.append(episode_metrics)

            episode_metrics = {
                "reward": 0.0,
                "step": 0,
            }

            # Fresh handlers for the next episode.
            observation_handler = create_observation_handler(args)
            action_handler = create_action_handler(args)

            # Fixed: the next episode must be requested with env_reset().
            # Calling env_create() again opened a brand new grader instance,
            # so the end-of-evaluation check below could never trigger.
            observation = client.env_reset()

            # A falsy observation means the grader has no more episodes.
            if not observation:
                break

            action = np.zeros(ACTION_SHAPE, dtype=np.float32)

        observation = observation_handler(observation, action)

    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())

    if query_yes_no("Submit?"):
        client.submit()
|
||||||
|
|
||||||
|
|
||||||
|
def test(actor, critic, args, act_update_fn):
    """Evaluate the agent locally in ``RunEnv`` and print summary statistics.

    ``args.num_episodes`` episodes are played; episode ``i`` is reset with
    ``SEEDS[i % len(SEEDS)]`` and difficulty 2. The reward and step count of
    every episode are collected and summarised with ``DataFrame.describe``.

    Args:
        actor: policy network passed to ``act_update_fn``.
        critic: value network passed to ``act_update_fn``.
        args: namespace providing ``visualize``, ``max_obstacles`` and
            ``num_episodes``; also forwarded to the handler factories.
        act_update_fn: factory whose first returned item is the acting
            function used here.
    """
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)

    all_episode_metrics = []
    for episode in range(args.num_episodes):
        episode_metrics = {"reward": 0.0, "step": 0}

        # Handlers are re-created for every episode.
        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)

        seed = SEEDS[episode % len(SEEDS)]
        raw_observation = env.reset(difficulty=2, seed=seed)
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(raw_observation, action)

        finished = False
        while not finished:
            print(episode_metrics["reward"])
            action = act_fn(observation)

            observation, reward, finished, _ = env.step(action_handler(action))

            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            # The final observation of an episode is not post-processed.
            if not finished:
                observation = observation_handler(observation, action)

        all_episode_metrics.append(episode_metrics)

    summary = pd.DataFrame(all_episode_metrics).describe()
    pprint(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def submit_or_test(args, model_fn, act_update_fn, submit_fn, test_fn):
    """Rebuild the trained networks and hand them to the submit or test routine.

    Args:
        args: parsed command-line namespace; it is first updated from the
            saved arguments file by ``restore_args``.
        model_fn: callable returning ``(actor, critic)`` for ``args``.
        act_update_fn: forwarded unchanged to ``submit_fn`` / ``test_fn``.
        submit_fn: called when ``args.submit`` is true.
        test_fn: called otherwise.
    """
    args = restore_args(args)

    # The environment is only inspected for the sizes of its spaces.
    probe_env = create_env(args)
    args.n_action = probe_env.action_space.shape[0]
    args.n_observation = probe_env.observation_space.shape[0]

    # Convert the string settings into the objects the model factory uses.
    for agent in ("actor", "critic"):
        layers_key = "{}_layers".format(agent)
        setattr(args, layers_key, str2params(getattr(args, layers_key)))
    for agent in ("actor", "critic"):
        activation_key = "{}_activation".format(agent)
        setattr(args, activation_key, activations[getattr(args, activation_key)])

    actor, critic = model_fn(args)
    checkpoints = (
        (actor, args.restore_actor_from),
        (critic, args.restore_critic_from),
    )
    for network, checkpoint_path in checkpoints:
        network.load_state_dict(torch.load(checkpoint_path))

    runner = submit_fn if args.submit else test_fn
    runner(actor, critic, args, act_update_fn)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Limit OpenMP and torch to one thread each.
    os.environ.update({'OMP_NUM_THREADS': '1'})
    torch.set_num_threads(1)

    cli_args = parse_args()
    submit_or_test(
        cli_args,
        create_model,
        create_act_update_fns,
        submit,
        test)
|
||||||
+237
@@ -0,0 +1,237 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import copy
|
||||||
|
import torch
|
||||||
|
import torch.multiprocessing as mp
|
||||||
|
from multiprocessing import Value
|
||||||
|
|
||||||
|
from common.misc_util import boolean_flag, str2params, create_if_need
|
||||||
|
from common.env_wrappers import create_env
|
||||||
|
from common.torch_util import activations, hard_update
|
||||||
|
|
||||||
|
from ddpg.model import create_model, create_act_update_fns, train_multi_thread, \
|
||||||
|
train_single_thread, play_single_thread
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line of the DDPG training script.

    Returns:
        The ``argparse`` namespace holding environment, network, optimisation,
        replay-buffer, exploration and bookkeeping settings.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # Environment.
    add('--seed', default=42, type=int)
    add('--difficulty', default=2, type=int)
    add('--max-obstacles', default=3, type=int)

    # Logging and worker processes.
    add('--logdir', default="./logs", type=str)
    add('--num-threads', default=1, type=int)
    add('--num-train-threads', default=1, type=int)

    # Environment wrapper options.
    boolean_flag(parser, "ddpg-wrapper", default=False)
    add('--skip-frames', default=1, type=int)
    add('--fail-reward', default=0.0, type=float)
    add('--reward-scale', default=1., type=float)
    boolean_flag(parser, "flip-state-action", default=False)

    # The same set of options is registered once per network.
    for agent in ["actor", "critic"]:
        add('--{}-layers'.format(agent), default="64-64", type=str)
        add('--{}-activation'.format(agent), default="relu", type=str)
        boolean_flag(parser, "{}-layer-norm".format(agent), default=False)
        boolean_flag(parser, "{}-parameters-noise".format(agent), default=False)
        boolean_flag(parser, "{}-parameters-noise-factorised".format(agent), default=False)

        add('--{}-lr'.format(agent), default=1e-3, type=float)
        add('--{}-lr-end'.format(agent), default=5e-5, type=float)

        add('--restore-{}-from'.format(agent), default=None, type=str)

    # Optimisation.
    add('--gamma', default=0.96, type=float)
    add('--loss-type', default="quadric-linear", type=str)
    add('--grad-clip', default=10., type=float)

    add('--tau', type=float, default=0.01)

    add('--train-steps', default=int(1e4), type=int)
    add('--batch-size', default=256, type=int)  # per worker

    # Replay buffer.
    add('--buffer-size', default=int(1e6), type=int)

    boolean_flag(parser, "prioritized-replay", default=False)
    add('--prioritized-replay-alpha', type=float, default=0.6)
    add('--prioritized-replay-beta0', type=float, default=0.4)

    # Exploration schedule and training limits.
    add('--initial-epsilon', type=float, default=1.)
    add('--final-epsilon', type=float, default=0.01)
    add('--max-episodes', type=int, default=int(1e4))
    add('--max-update-steps', type=int, default=int(5e6))
    add('--epsilon-cycle-len', type=int, default=int(2e2))

    add('--max-train-days', type=int, default=int(1e1))

    # Random process ("rp") parameters.
    add('--rp-type', type=str, default="ornstein-uhlenbeck")
    add('--rp-theta', type=float, default=0.15)
    add('--rp-sigma', type=float, default=0.2)
    add('--rp-sigma-min', type=float, default=0.15)
    add('--rp-mu', type=float, default=0.0)

    add('--clip-delta', default=10, type=int)
    add('--save-step', default=int(1e4), type=int)

    add('--restore-args-from', default=None, type=str)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def restore_args(args):
    """Overwrite attributes of ``args`` with values saved in a JSON file.

    The file is read from ``args.restore_args_from``. Run-specific keys
    (seed, environment difficulty, logging, worker counts, learning rates,
    restore paths, optimisation/replay/exploration schedule settings, ...)
    are dropped so the current command line keeps control of them; every
    remaining key is set on ``args``.

    Keys missing from the file are ignored, so argument files that lack
    some of the listed options can still be restored.

    Args:
        args: namespace with a ``restore_args_from`` path attribute.

    Returns:
        The same ``args`` object, updated in place.
    """
    with open(args.restore_args_from, "r") as fin:
        params = json.load(fin)

    run_specific = [
        "seed",
        "difficulty",
        "max_obstacles",
        "logdir",
        "num_threads",
        "num_train_threads",
        "skip_frames",
        "grad_clip",
        "tau",
        "train_steps",
        "batch_size",
        "buffer_size",
        "prioritized_replay",
        "prioritized_replay_alpha",
        "prioritized_replay_beta0",
        "initial_epsilon",
        "final_epsilon",
        "max_episodes",
        "max_update_steps",
        "epsilon_cycle_len",
        "max_train_days",
        "rp_type",
        "rp_theta",
        "rp_sigma",
        "rp_sigma_min",
        "rp_mu",
        "clip_delta",
        "save_step",
        "restore_args_from",
    ]
    for agent in ["actor", "critic"]:
        run_specific.append("{}_lr".format(agent))
        run_specific.append("{}_lr_end".format(agent))
        run_specific.append("restore_{}_from".format(agent))

    # pop() with a default instead of ``del``: a key absent from the file
    # must not abort the restore with a KeyError.
    for key in run_specific:
        params.pop(key, None)

    for key, value in params.items():
        setattr(args, key, value)
    return args
|
||||||
|
|
||||||
|
|
||||||
|
def train(args, model_fn, act_update_fns, multi_thread, train_single, play_single):
    """Set up the networks and run the training worker processes.

    Steps performed here:
      1. create the log directory, optionally restore saved arguments and
         dump the effective arguments to ``<logdir>/args.json``;
      2. derive network sizes from the environment and build actor/critic
         plus their target copies, all placed in shared memory;
      3. start ``args.num_threads`` processes and wait for them;
      4. call the save function, also after a ``KeyboardInterrupt``.

    Args:
        args: parsed command-line namespace (mutated in place).
        model_fn: callable returning ``(actor, critic)`` for ``args``.
        act_update_fns: factory; its third returned item is the save
            function, and it is also passed on to every worker.
        multi_thread: worker target used when every process trains.
        train_single: worker target for training-only processes.
        play_single: worker target for the remaining processes.
    """
    create_if_need(args.logdir)

    if args.restore_args_from is not None:
        args = restore_args(args)

    args_path = "{}/args.json".format(args.logdir)
    with open(args_path, "w") as fout:
        json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)

    env = create_env(args)

    if args.flip_state_action and hasattr(env, "state_transform"):
        args.flip_states = env.state_transform.flip_states
        # The configured batch size is halved when flipping is enabled.
        args.batch_size = args.batch_size // 2

    args.n_action = env.action_space.shape[0]
    args.n_observation = env.observation_space.shape[0]

    # Convert the string settings into the objects the model factory uses.
    args.actor_layers = str2params(args.actor_layers)
    args.critic_layers = str2params(args.critic_layers)

    args.actor_activation = activations[args.actor_activation]
    args.critic_activation = activations[args.critic_activation]

    actor, critic = model_fn(args)

    if args.restore_actor_from is not None:
        actor.load_state_dict(torch.load(args.restore_actor_from))
    if args.restore_critic_from is not None:
        critic.load_state_dict(torch.load(args.restore_critic_from))

    actor.train()
    critic.train()
    actor.share_memory()
    critic.share_memory()

    target_actor = copy.deepcopy(actor)
    target_critic = copy.deepcopy(critic)

    hard_update(target_actor, actor)
    hard_update(target_critic, critic)

    target_actor.train()
    target_critic.train()
    target_actor.share_memory()
    target_critic.share_memory()

    _, _, save_fn = act_update_fns(actor, critic, target_actor, target_critic, args)

    workers = []
    best_reward = Value("f", 0.0)
    # Arguments common to every worker target.
    common = (actor, critic, target_actor, target_critic, args, act_update_fns)
    try:
        if args.num_threads == args.num_train_threads:
            # Every process runs the combined worker.
            for rank in range(args.num_threads):
                args.thread = rank
                worker = mp.Process(
                    target=multi_thread,
                    args=common + (best_reward,))
                worker.start()
                workers.append(worker)
        else:
            # The first num_train_threads processes use train_single, the
            # rest use play_single; they share two counters and a queue.
            global_episode = Value("i", 0)
            global_update_step = Value("i", 0)
            episodes_queue = mp.Queue()
            shared_state = (global_episode, global_update_step, episodes_queue)
            for rank in range(args.num_threads):
                args.thread = rank
                if rank < args.num_train_threads:
                    target_fn = train_single
                    worker_args = common + shared_state
                else:
                    target_fn = play_single
                    worker_args = common + shared_state + (best_reward,)
                worker = mp.Process(target=target_fn, args=worker_args)
                worker.start()
                workers.append(worker)

        for worker in workers:
            worker.join()
    except KeyboardInterrupt:
        # An interrupt falls through to the save below.
        pass

    save_fn()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Limit OpenMP and torch to one thread each.
    os.environ.update({'OMP_NUM_THREADS': '1'})
    torch.set_num_threads(1)

    cli_args = parse_args()
    worker_targets = (train_multi_thread, train_single_thread, play_single_thread)
    train(cli_args, create_model, create_act_update_fns, *worker_targets)
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 15 MiB |
Binary file not shown.
|
After Width: | Height: | Size: 15 MiB |
@@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env bash
# Create the "opensim-rl" conda environment (Python 3.5.2, full anaconda
# distribution) with opensim and git taken from the "kidzik" channel.
conda create -n opensim-rl -c kidzik opensim git python=3.5.2 anaconda -y
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Install the dependencies of the DDPG agents into the active conda
# environment. The commands are chained with &&, so the script stops at the
# first failing step.
conda upgrade pip -y && \
conda install -c conda-forge lapack git -y && \
conda install ipython libgcc -y && \
conda install pytorch torchvision -c soumith -y && \
pip install tensorflow==1.3.0 gym && \
pip install git+https://github.com/stanfordnmbl/osim-rl.git
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Same installation steps as the non-MPI setup script, with mpi4py added to
# the pip packages. The commands are chained with &&, so the script stops at
# the first failing step.
conda upgrade pip -y && \
conda install -c conda-forge lapack git -y && \
conda install ipython libgcc -y && \
conda install pytorch torchvision -c soumith -y && \
pip install tensorflow==1.3.0 gym mpi4py && \
pip install git+https://github.com/stanfordnmbl/osim-rl.git
|
||||||
Reference in New Issue
Block a user