diff --git a/.gitignore b/.gitignore
index 7bbc71c..72c7d1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -99,3 +99,7 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+.DS_Store
+.idea
+log*
diff --git a/README.md b/README.md
index 669c4e8..3c828b7 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,76 @@
 # Run-Skeleton-Run
-Reason8.ai PyTorch solution for NIPS RL 2017 challenge
+[Reason8.ai](https://reason8.ai) PyTorch solution that took 3rd place in the [NIPS RL 2017 challenge](https://www.crowdai.org/challenges/nips-2017-learning-to-run/leaderboards?challenge_round_id=12).
+
+Additional thanks to [Michail Pavlov](https://github.com/fgvbrt) for collaboration.
+
+## Agent policies
+
+### no-flip-state-action
+
+![Alt Text](http://www.sheawong.com/wp-content/uploads/2013/08/keephatin.gif)
+
+![alt text](https://github.com/Scitator/Run-Skeleton-Run/blob/master/gifs/noflip.gif)
+
+### flip-state-action
+
+![alt text](https://github.com/Scitator/Run-Skeleton-Run/blob/master/gifs/flip.gif)
+
+
+## How to set up the environment?
+
+1. `sh setup_conda.sh`
+2. `source activate opensim-rl`
+
+Want to test the baselines? (Requires MPI support.)
+3. `sudo apt-get install openmpi-bin openmpi-doc libopenmpi-dev`
+3+. `sh setup_env_mpi.sh`
+
+Or do you only want the DDPG agents?
+3. `sh setup_env.sh`
+
+4. Congrats! Now you are ready to check our agents.
+
+
+## Run DDPG agent
+
+```
+CUDA_VISIBLE_DEVICES="" PYTHONPATH=. python ddpg/train.py \
+    --logdir ./logs_ddpg \
+    --num-threads 4 \
+    --ddpg-wrapper \
+    --skip-frames 5 \
+    --fail-reward -0.2 \
+    --reward-scale 10 \
+    --flip-state-action \
+    --actor-layers 64-64 --actor-layer-norm --actor-parameters-noise \
+    --actor-lr 0.001 --actor-lr-end 0.00001 \
+    --critic-layers 64-32 --critic-layer-norm \
+    --critic-lr 0.002 --critic-lr-end 0.00001 \
+    --initial-epsilon 0.5 --final-epsilon 0.001 \
+    --tau 0.0001
+```
+
+
+## Evaluate DDPG agent
+
+```
+CUDA_VISIBLE_DEVICES="" PYTHONPATH=./ python ddpg/submit.py \
+    --restore-actor-from ./logs_ddpg/actor_state_dict.pkl \
+    --restore-critic-from ./logs_ddpg/critic_state_dict.pkl \
+    --restore-args-from ./logs_ddpg/args.json \
+    --num-episodes 10
+
+```
+
+
+## Run TRPO/PPO agent
+
+```
+CUDA_VISIBLE_DEVICES="" PYTHONPATH=. python ddpg/train.py \
+    --agent ppo \
+    --logdir ./logs_baseline \
+    --baseline-wrapper \
+    --skip-frames 5 \
+    --fail-reward -0.2 \
+    --reward-scale 10
+```
diff --git a/baselines/__init__.py b/baselines/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/baselines/baselines_common/__init__.py b/baselines/baselines_common/__init__.py
new file mode 100644
index 0000000..a07d7df
--- /dev/null
+++ b/baselines/baselines_common/__init__.py
@@ -0,0 +1,4 @@
+from baselines.baselines_common.console_util import *
+from baselines.baselines_common.dataset import Dataset
+from baselines.baselines_common.math_util import *
+from baselines.baselines_common.misc_util import *
diff --git a/baselines/baselines_common/cg.py b/baselines/baselines_common/cg.py
new file mode 100644
index 0000000..59fda0e
--- /dev/null
+++ b/baselines/baselines_common/cg.py
@@ -0,0 +1,38 @@
+import numpy as np
+
+
+def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
+    """
+    Approximately solve A x = b with the conjugate gradient method
+    (Demmel p 312), starting from x = 0.
+
+    inputs
+    ------
+    f_Ax: callable mapping a vector p to the product A p
+    b: 1-d ndarray, the right-hand side (it is copied, not modified)
+    cg_iters: maximum number of iterations
+    callback: optional callable, invoked with the current iterate at the
+        start of every iteration and once more with the final iterate
+    verbose: if True, print the squared residual norm and the solution norm
+    residual_tol: stop as soon as the squared residual norm r.dot(r) drops
+        below this value
+
+    outputs
+    -------
+    x: ndarray shaped like b
+    """
+    p = b.copy()
+    r = b.copy()
+    x = np.zeros_like(b)
+    rdotr = r.dot(r)
+
+    fmtstr = "%10i %10.3g %10.3g"
+    titlestr = "%10s %10s %10s"
+    if verbose:
+        print(titlestr % ("iter", "residual norm", "soln norm"))
+
+    # Number of completed steps. Tracked separately from the loop variable so
+    # the final report also works when the loop never runs (cg_iters == 0).
+    steps_done = 0
+    for i in range(cg_iters):
+        if callback is not None:
+            callback(x)
+        if verbose:
+            print(fmtstr % (i, rdotr, np.linalg.norm(x)))
+        z = f_Ax(p)
+        pdotz = p.dot(z)
+        if pdotz == 0:
+            # No curvature along p (e.g. b == 0): the step size would be a
+            # division by zero and fill x with nan/inf, so keep the current x.
+            break
+        v = rdotr / pdotz
+        x += v * p
+        r -= v * z
+        newrdotr = r.dot(r)
+        mu = newrdotr / rdotr
+        p = r + mu * p
+
+        rdotr = newrdotr
+        steps_done = i + 1
+        if rdotr < residual_tol:
+            break
+
+    if callback is not None:
+        callback(x)
+    if verbose:
+        print(fmtstr % (steps_done, rdotr, np.linalg.norm(x)))
+    return x
diff --git a/baselines/baselines_common/console_util.py b/baselines/baselines_common/console_util.py
new file mode 100644
index 0000000..6def0c9
--- /dev/null
+++ b/baselines/baselines_common/console_util.py
@@ -0,0 +1,62 @@
+from __future__ import print_function
+from contextlib import contextmanager
+import numpy as np
+import time
+
+
+# ================================================================
+# Misc
+# ================================================================
+
+def fmt_row(width, row, header=False):
+    """
+    Format the items of `row` as right-aligned cells of `width` characters
+    joined by " | ". With header=True a dashed underline of the same length
+    is appended on a second line.
+    """
+    cells = [fmt_item(x, width) for x in row]
+    line = " | ".join(cells)
+    if not header:
+        return line
+    return line + "\n" + "-" * len(line)
+
+
+def fmt_item(x, l):
+    """
+    Render x as text left-padded with spaces to at least l characters.
+    0-d ndarrays are unwrapped to their Python scalar first and floats are
+    printed with "%g"; everything else goes through str().
+    """
+    if isinstance(x, np.ndarray):
+        assert x.ndim == 0
+        x = x.item()
+    rep = ("%g" % x) if isinstance(x, float) else str(x)
+    return rep.rjust(l)
+
+
+# ANSI foreground colour codes by name.
+color2num = {
+    'gray': 30,
+    'red': 31,
+    'green': 32,
+    'yellow': 33,
+    'blue': 34,
+    'magenta': 35,
+    'cyan': 36,
+    'white': 37,
+    'crimson': 38,
+}
+
+
+def colorize(string, color, bold=False, highlight=False):
+    """
+    Wrap `string` in ANSI escape sequences for the named colour.
+
+    color: a key of color2num (KeyError otherwise)
+    bold: additionally emit the bold attribute
+    highlight: add 10 to the colour code, i.e. use the background variant
+    """
+    code = color2num[color] + (10 if highlight else 0)
+    attrs = [str(code)]
+    if bold:
+        attrs.append('1')
+    return '\x1b[%sm%s\x1b[0m' % (';'.join(attrs), string)
+
+
+# Current nesting level of active `timed` blocks; used to indent messages.
+MESSAGE_DEPTH = 0
+
+
+@contextmanager
+def timed(msg):
+    """
+    Context manager that prints `msg` on entry and the elapsed wall-clock
+    time on normal exit, indented by one tab per enclosing `timed` block.
+
+    If the wrapped body raises, the exception propagates and no "done"
+    line is printed, but the nesting depth is still restored.
+    """
+    global MESSAGE_DEPTH  # pylint: disable=W0603
+    print(colorize('\t' * MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
+    tstart = time.time()
+    MESSAGE_DEPTH += 1
+    try:
+        yield
+    finally:
+        # Undo the indent even on error; otherwise every failure inside a
+        # timed block would shift all later messages one level deeper.
+        MESSAGE_DEPTH -= 1
+    print(colorize('\t' * MESSAGE_DEPTH + "done in %.3f seconds" % (time.time() - tstart),
+                   color='magenta'))
diff --git a/baselines/baselines_common/dataset.py b/baselines/baselines_common/dataset.py
new file mode 100644
index 0000000..85b5e55
--- /dev/null
+++ b/baselines/baselines_common/dataset.py
@@ -0,0 +1,63 @@
+import numpy as np
+
+
+class Dataset(object):
+    """
+    Mini-batch access to a dict of arrays that share their first dimension.
+
+    data_map: dict mapping names to arrays; the length of the first value
+        defines the number of rows `n`. Shuffling rebinds the entries of
+        this very dict, so the caller's dict sees the permuted arrays too.
+    deterministic: if True, `shuffle()` never permutes the data.
+    shuffle: if True, `next_batch` starts a new pass when the data is
+        exhausted and `iterate_once` reshuffles before every pass.
+    """
+
+    def __init__(self, data_map, deterministic=False, shuffle=True):
+        self.data_map = data_map
+        self.deterministic = deterministic
+        self.enable_shuffle = shuffle
+        self.n = next(iter(data_map.values())).shape[0]
+        self._next_id = 0
+        self.shuffle()
+
+    def shuffle(self):
+        """Rewind to the first row and, unless deterministic, permute all arrays alike."""
+        # Rewind unconditionally: a deterministic dataset used to keep its
+        # exhausted cursor and returned empty batches from then on.
+        self._next_id = 0
+        if self.deterministic:
+            return
+        perm = np.arange(self.n)
+        np.random.shuffle(perm)
+
+        for key in self.data_map:
+            self.data_map[key] = self.data_map[key][perm]
+
+    def next_batch(self, batch_size):
+        """
+        Return a dict with the next (at most) `batch_size` rows of every array.
+        The last batch of a pass may be shorter. Once the data is exhausted a
+        new pass is started if shuffling is enabled; otherwise the returned
+        batches are empty.
+        """
+        if self._next_id >= self.n and self.enable_shuffle:
+            self.shuffle()
+
+        cur_id = self._next_id
+        cur_batch_size = min(batch_size, self.n - self._next_id)
+        self._next_id += cur_batch_size
+
+        return {key: values[cur_id:cur_id + cur_batch_size]
+                for key, values in self.data_map.items()}
+
+    def iterate_once(self, batch_size):
+        """Yield full batches of `batch_size` rows for one pass; a trailing partial batch is dropped."""
+        if self.enable_shuffle:
+            self.shuffle()
+
+        while self._next_id <= self.n - batch_size:
+            yield self.next_batch(batch_size)
+        self._next_id = 0
+
+    def subset(self, num_elements, deterministic=True):
+        """Return a new Dataset over the first `num_elements` rows of every array."""
+        data_map = {key: values[:num_elements] for key, values in self.data_map.items()}
+        return Dataset(data_map, deterministic)
+
+
+def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True,
+                include_final_partial_batch=True):
+    """
+    Yield tuples of aligned mini-batches, one entry per input array.
+
+    arrays: sequence of array-likes sharing their first dimension
+    num_batches: split the rows into this many nearly equal batches
+    batch_size: alternatively, split into batches of this many rows
+        (exactly one of num_batches / batch_size must be given)
+    shuffle: visit the rows in a random order (uses np.random)
+    include_final_partial_batch: only meaningful with batch_size; if False,
+        a trailing batch shorter than batch_size is dropped
+    """
+    assert (num_batches is None) != (
+    batch_size is None), 'Provide num_batches or batch_size, but not both'
+    arrays = tuple(map(np.asarray, arrays))
+    n = arrays[0].shape[0]
+    assert all(a.shape[0] == n for a in arrays[1:])
+    inds = np.arange(n)
+    if shuffle:
+        np.random.shuffle(inds)
+    sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
+    for batch_inds in np.array_split(inds, sections):
+        # With num_batches there is no batch_size to compare against; the
+        # length check used to compare with None and dropped every batch.
+        if (include_final_partial_batch or batch_size is None
+                or len(batch_inds) == batch_size):
+            yield tuple(a[batch_inds] for a in arrays)
diff --git a/baselines/baselines_common/distributions.py b/baselines/baselines_common/distributions.py
new file mode 100644
index 0000000..1d42559
--- /dev/null
+++ b/baselines/baselines_common/distributions.py
@@ -0,0 +1,377 @@
+import tensorflow as tf
+import numpy as np
+import baselines.baselines_common.tf_util as U
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+
+
+class Pd(object):
+    """
+    A particular probability distribution
+
+    Abstract interface: subclasses implement every method except logp,
+    which is derived from neglogp.
+    """
+
+    def flatparam(self):
+        # The parameters of the distribution as one flat tensor.
+        raise NotImplementedError
+
+    def mode(self):
+        # The mode of the distribution.
+        raise NotImplementedError
+
+    def neglogp(self, x):
+        # Usually it's easier to define the negative logprob
+        raise NotImplementedError
+
+    def kl(self, other):
+        # KL divergence between this distribution and `other`.
+        raise NotImplementedError
+
+    def entropy(self):
+        # Entropy of the distribution.
+        raise NotImplementedError
+
+    def sample(self):
+        # A sample drawn from the distribution.
+        raise NotImplementedError
+
+    def logp(self, x):
+        # Log-probability of x: the negated neglogp.
+        return - self.neglogp(x)
+
+
+class PdType(object):
+    """
+    Parametrized family of probability distributions
+
+    Describes the parameter/sample shapes and the sample dtype of a family
+    and builds Pd instances and TensorFlow placeholders for it.
+    """
+
+    def pdclass(self):
+        # The Pd subclass that implements this family.
+        raise NotImplementedError
+
+    def pdfromflat(self, flat):
+        # Build the distribution from a flat parameter tensor.
+        return self.pdclass()(flat)
+
+    def param_shape(self):
+        # Shape (as a list) of the flat parameters of one distribution.
+        raise NotImplementedError
+
+    def sample_shape(self):
+        # Shape (as a list) of one sample.
+        raise NotImplementedError
+
+    def sample_dtype(self):
+        # TensorFlow dtype of the samples.
+        raise NotImplementedError
+
+    def param_placeholder(self, prepend_shape, name=None):
+        # float32 placeholder of shape prepend_shape + param_shape().
+        return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name)
+
+    def sample_placeholder(self, prepend_shape, name=None):
+        # Placeholder of shape prepend_shape + sample_shape() with the sample dtype.
+        return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(),
+                              name=name)
+
+
+class CategoricalPdType(PdType):
+    # Family of categorical distributions over `ncat` categories:
+    # `ncat` parameters per distribution, scalar int32 samples.
+    def __init__(self, ncat):
+        self.ncat = ncat
+
+    def pdclass(self):
+        return CategoricalPd
+
+    def param_shape(self):
+        return [self.ncat]
+
+    def sample_shape(self):
+        return []
+
+    def sample_dtype(self):
+        return tf.int32
+
+
+class MultiCategoricalPdType(PdType):
+    # Family of several independent categorical distributions, component k
+    # ranging over the integers low[k]..high[k] (inclusive).
+    # NOTE(review): presumably `low` and `high` are equally long integer numpy
+    # arrays, since `high - low + 1` is summed and measured with len() -- confirm.
+    def __init__(self, low, high):
+        self.low = low
+        self.high = high
+        # Number of categories of every component.
+        self.ncats = high - low + 1
+
+    def pdclass(self):
+        return MultiCategoricalPd
+
+    def pdfromflat(self, flat):
+        # Overridden because MultiCategoricalPd also needs the bounds.
+        return MultiCategoricalPd(self.low, self.high, flat)
+
+    def param_shape(self):
+        return [sum(self.ncats)]
+
+    def sample_shape(self):
+        return [len(self.ncats)]
+
+    def sample_dtype(self):
+        return tf.int32
+
+
+class DiagGaussianPdType(PdType):
+    # Family of diagonal Gaussians with `size` dimensions. A distribution has
+    # 2 * size parameters, which DiagGaussianPd splits into mean and log-std;
+    # samples are float32 vectors of length `size`.
+    def __init__(self, size):
+        self.size = size
+
+    def pdclass(self):
+        return DiagGaussianPd
+
+    def param_shape(self):
+        return [2 * self.size]
+
+    def sample_shape(self):
+        return [self.size]
+
+    def sample_dtype(self):
+        return tf.float32
+
+
+class BernoulliPdType(PdType):
+    # Family of `size` independent Bernoulli variables: one logit per
+    # variable, samples declared as int32 vectors of length `size`.
+    # NOTE(review): BernoulliPd.sample() and mode() return float tensors,
+    # which does not match the int32 dtype declared here -- confirm intent.
+    def __init__(self, size):
+        self.size = size
+
+    def pdclass(self):
+        return BernoulliPd
+
+    def param_shape(self):
+        return [self.size]
+
+    def sample_shape(self):
+        return [self.size]
+
+    def sample_dtype(self):
+        return tf.int32
+
+
+# WRONG SECOND DERIVATIVES
+# class CategoricalPd(Pd):
+#     def __init__(self, logits):
+#         self.logits = logits
+#         self.ps = tf.nn.softmax(logits)
+#     @classmethod
+#     def fromflat(cls, flat):
+#         return cls(flat)
+#     def flatparam(self):
+#         return self.logits
+#     def mode(self):
+#         return U.argmax(self.logits, axis=-1)
+#     def logp(self, x):
+#         return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
+#     def kl(self, other):
+#         return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
+#                 - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
+#     def entropy(self):
+#         return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
+#     def sample(self):
+#         u = tf.random_uniform(tf.shape(self.logits))
+#         return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
+
+class CategoricalPd(Pd):
+    # Categorical distribution parametrized by unnormalized log-probabilities
+    # (logits) along the last axis.
+    def __init__(self, logits):
+        self.logits = logits
+
+    def flatparam(self):
+        return self.logits
+
+    def mode(self):
+        # Index of the largest logit.
+        return U.argmax(self.logits, axis=-1)
+
+    def neglogp(self, x):
+        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
+        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
+        #       the implementation does not allow second-order derivatives...
+        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
+        return tf.nn.softmax_cross_entropy_with_logits(
+            logits=self.logits,
+            labels=one_hot_actions)
+
+    def kl(self, other):
+        # Shift both logit vectors by their maximum before exponentiating;
+        # the softmax is unchanged by the shift.
+        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
+        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
+        ea0 = tf.exp(a0)
+        ea1 = tf.exp(a1)
+        z0 = U.sum(ea0, axis=-1, keepdims=True)
+        z1 = U.sum(ea1, axis=-1, keepdims=True)
+        p0 = ea0 / z0
+        # sum_i p0_i * (log p0_i - log p1_i), with log p = a - log z.
+        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
+
+    def entropy(self):
+        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
+        ea0 = tf.exp(a0)
+        z0 = U.sum(ea0, axis=-1, keepdims=True)
+        p0 = ea0 / z0
+        # -sum_i p0_i * log p0_i, with log p0 = a0 - log z0.
+        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
+
+    def sample(self):
+        # Gumbel-max sampling: add -log(-log(u)) noise to the logits and take
+        # the argmax.
+        u = tf.random_uniform(tf.shape(self.logits))
+        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
+
+    @classmethod
+    def fromflat(cls, flat):
+        return cls(flat)
+
+
+class MultiCategoricalPd(Pd):
+    # Product of independent categorical distributions. `flat` holds the
+    # logits of all components concatenated along the last axis; samples
+    # are offset by `low`.
+    def __init__(self, low, high, flat):
+        self.flat = flat
+        self.low = tf.constant(low, dtype=tf.int32)
+        # One CategoricalPd per piece of `flat`, split along the last axis
+        # according to high - low + 1.
+        self.categoricals = list(
+            map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
+
+    def flatparam(self):
+        return self.flat
+
+    def mode(self):
+        # Per-component modes stacked on a new last axis and shifted by low.
+        return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1),
+                                  tf.int32)
+
+    def neglogp(self, x):
+        # Components are independent, so their negative log-probs add up;
+        # x is shifted back by low before it is split per component.
+        return tf.add_n([p.neglogp(px) for p, px in zip(
+            self.categoricals, tf.unstack(x - self.low,
+                                          axis=len(x.get_shape()) - 1))])
+
+    def kl(self, other):
+        # Sum of the component-wise KL divergences.
+        return tf.add_n([
+            p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
+        ])
+
+    def entropy(self):
+        # Sum of the component entropies.
+        return tf.add_n([p.entropy() for p in self.categoricals])
+
+    def sample(self):
+        return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1),
+                                  tf.int32)
+
+    @classmethod
+    def fromflat(cls, flat):
+        # Not constructible from flat parameters alone: low/high are required.
+        raise NotImplementedError
+
+
+class DiagGaussianPd(Pd):
+    # Gaussian with diagonal covariance. The last axis of `flat` is split
+    # into two halves: the mean and the log of the standard deviation.
+    def __init__(self, flat):
+        self.flat = flat
+        mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat)
+        self.mean = mean
+        self.logstd = logstd
+        self.std = tf.exp(logstd)
+
+    def flatparam(self):
+        return self.flat
+
+    def mode(self):
+        return self.mean
+
+    def neglogp(self, x):
+        # 0.5 * sum(((x - mean) / std)^2) + 0.5 * log(2 pi) * dim + sum(logstd),
+        # where dim is the size of the last axis of x.
+        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
+               + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
+               + U.sum(self.logstd, axis=-1)
+
+    def kl(self, other):
+        assert isinstance(other, DiagGaussianPd)
+        # Per dimension: log(std_q / std_p) + (std_p^2 + (mean_p - mean_q)^2)
+        # / (2 std_q^2) - 0.5, summed over the last axis (p = self, q = other).
+        return U.sum(other.logstd - self.logstd + (
+            tf.square(self.std) + tf.square(self.mean - other.mean)) / (
+                         2.0 * tf.square(other.std)) - 0.5, axis=-1)
+
+    def entropy(self):
+        # Per dimension: logstd + 0.5 * log(2 pi e).
+        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
+
+    def sample(self):
+        # mean + std * standard normal noise.
+        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
+
+    @classmethod
+    def fromflat(cls, flat):
+        return cls(flat)
+
+
+class BernoulliPd(Pd):
+    # Independent Bernoulli variables parametrized by logits; `ps` holds the
+    # corresponding probabilities sigmoid(logits).
+    def __init__(self, logits):
+        self.logits = logits
+        self.ps = tf.sigmoid(logits)
+
+    def flatparam(self):
+        return self.logits
+
+    def mode(self):
+        # Probabilities rounded to 0 or 1 (a float tensor).
+        return tf.round(self.ps)
+
+    def neglogp(self, x):
+        # Sigmoid cross-entropy of the logits against the labels x, summed
+        # over the last axis.
+        return U.sum(
+            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)),
+            axis=-1)
+
+    def kl(self, other):
+        # Cross-entropy of self against other minus the entropy of self.
+        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps),
+                     axis=-1) - U.sum(
+            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
+
+    def entropy(self):
+        # Cross-entropy of the distribution with itself.
+        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps),
+                     axis=-1)
+
+    def sample(self):
+        # 1.0 where a uniform draw falls below the probability, else 0.0.
+        u = tf.random_uniform(tf.shape(self.ps))
+        return tf.to_float(math_ops.less(u, self.ps))
+
+    @classmethod
+    def fromflat(cls, flat):
+        return cls(flat)
+
+
+def make_pdtype(ac_space):
+    """
+    Pick the distribution family matching a gym action space:
+    Box (1-d only) -> DiagGaussianPdType, Discrete -> CategoricalPdType,
+    MultiDiscrete -> MultiCategoricalPdType, MultiBinary -> BernoulliPdType.
+    Any other space raises NotImplementedError.
+    """
+    from gym import spaces
+
+    if isinstance(ac_space, spaces.Box):
+        assert len(ac_space.shape) == 1
+        return DiagGaussianPdType(ac_space.shape[0])
+    if isinstance(ac_space, spaces.Discrete):
+        return CategoricalPdType(ac_space.n)
+    if isinstance(ac_space, spaces.MultiDiscrete):
+        return MultiCategoricalPdType(ac_space.low, ac_space.high)
+    if isinstance(ac_space, spaces.MultiBinary):
+        return BernoulliPdType(ac_space.n)
+    raise NotImplementedError
+
+
+def shape_el(v, i):
+    """
+    Return entry i of the static shape of v, falling back to the dynamic
+    tf.shape(v)[i] when that entry is None.
+    """
+    # NOTE(review): depending on the TensorFlow version the static entry may
+    # be a Dimension object rather than None for unknown sizes, in which case
+    # the fallback never triggers -- confirm against the TF version in use.
+    static_dim = v.get_shape()[i]
+    return static_dim if static_dim is not None else tf.shape(v)[i]
+
+
+@U.in_session
+def test_probtypes():
+    # Runs validate_probtype on one fixed parameter vector for each of the
+    # diagonal Gaussian, categorical and Bernoulli families.
+    np.random.seed(0)
+
+    # 8 parameters: 4 means followed by 4 log-stds.
+    pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
+    diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2)  # pylint: disable=E1101
+    validate_probtype(diag_gauss, pdparam_diag_gauss)
+
+    pdparam_categorical = np.array([-.2, .3, .5])
+    categorical = CategoricalPdType(pdparam_categorical.size)  # pylint: disable=E1101
+    validate_probtype(categorical, pdparam_categorical)
+
+    pdparam_bernoulli = np.array([-.2, .3, .5])
+    bernoulli = BernoulliPdType(pdparam_bernoulli.size)  # pylint: disable=E1101
+    validate_probtype(bernoulli, pdparam_bernoulli)
+
+
+def validate_probtype(probtype, pdparam):
+    # Monte-Carlo consistency check of a distribution family at the 1-d
+    # parameter vector `pdparam`, using N samples. Asserts that
+    #   * the analytic entropy matches the mean negative log-likelihood, and
+    #   * the analytic KL to a perturbed parameter vector matches its
+    #     sample estimate,
+    # each within three standard errors.
+    N = 100000
+    # Check to see if mean negative log likelihood == differential entropy
+    Mval = np.repeat(pdparam[None, :], N, axis=0)
+    M = probtype.param_placeholder([N])
+    X = probtype.sample_placeholder([N])
+    pd = probtype.pdclass()(M)
+    calcloglik = U.function([X, M], pd.logp(X))
+    calcent = U.function([M], pd.entropy())
+    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
+    logliks = calcloglik(Xval, Mval)
+    entval_ll = - logliks.mean()  # pylint: disable=E1101
+    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
+    entval = calcent(Mval).mean()  # pylint: disable=E1101
+    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas
+
+    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
+    M2 = probtype.param_placeholder([N])
+    pd2 = probtype.pdclass()(M2)
+    # q: the parameters perturbed by Gaussian noise of scale 0.1.
+    q = pdparam + np.random.randn(pdparam.size) * 0.1
+    Mval2 = np.repeat(q[None, :], N, axis=0)
+    calckl = U.function([M, M2], pd.kl(pd2))
+    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
+    # Log-likelihood under q of the samples that were drawn from p.
+    logliks = calcloglik(Xval, Mval2)
+    klval_ll = - entval - logliks.mean()  # pylint: disable=E1101
+    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
+    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
diff --git a/baselines/baselines_common/math_util.py b/baselines/baselines_common/math_util.py
new file mode 100644
index 0000000..476927b
--- /dev/null
+++ b/baselines/baselines_common/math_util.py
@@ -0,0 +1,92 @@
+import numpy as np
+import scipy.signal
+
+
+def discount(x, gamma):
+    """
+    Discounted cumulative sums along the 0th dimension of x.
+
+    inputs
+    ------
+    x: ndarray with at least one dimension
+    gamma: float, discount factor
+
+    outputs
+    -------
+    y: ndarray shaped like x with
+
+        y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
+                where k = len(x) - t - 1
+
+    """
+    assert x.ndim >= 1
+    # Filtering the time-reversed signal with y[n] = x[n] + gamma * y[n-1]
+    # accumulates the discounted future; reversing again restores the order.
+    backwards = x[::-1]
+    filtered = scipy.signal.lfilter([1], [1, -gamma], backwards, axis=0)
+    return filtered[::-1]
+
+
+def explained_variance(ypred, y):
+    """
+    Fraction of the variance of y that ypred explains,
+    1 - Var[y-ypred] / Var[y], for 1-d arrays.
+    Returns nan when y has zero variance.
+
+    interpretation:
+        ev=0  =>  might as well have predicted zero
+        ev=1  =>  perfect prediction
+        ev<0  =>  worse than just predicting zero
+
+    """
+    assert y.ndim == 1 and ypred.ndim == 1
+    vary = np.var(y)
+    if vary == 0:
+        return np.nan
+    residual_var = np.var(y - ypred)
+    return 1 - residual_var / vary
+
+
+def explained_variance_2d(ypred, y):
+    """
+    Column-wise explained variance of 2-d arrays (rows are samples):
+    out[j] = 1 - Var[y[:, j] - ypred[:, j]] / Var[y[:, j]].
+    Columns of y with (nearly) zero variance get 0.
+    """
+    assert y.ndim == 2 and ypred.ndim == 2
+    vary = np.var(y, axis=0)
+    # The residual variance must be taken per column as well; without the
+    # axis argument every column was compared with the variance of the
+    # whole residual matrix.
+    resid_var = np.var(y - ypred, axis=0)
+    constant = vary < 1e-10
+    # Divide by 1 in constant columns to avoid 0-division warnings; those
+    # entries are overwritten with 0 right below.
+    out = 1 - resid_var / np.where(constant, 1.0, vary)
+    out[constant] = 0
+    return out
+
+
+def ncc(ypred, y):
+    """Correlation coefficient between ypred and y, read off np.corrcoef."""
+    corr_matrix = np.corrcoef(ypred, y)
+    return corr_matrix[1, 0]
+
+
+def flatten_arrays(arrs):
+    """Concatenate the elements of all arrays in `arrs` into one 1-d array."""
+    flat_views = [arr.flat for arr in arrs]
+    return np.concatenate(flat_views)
+
+
+def unflatten_vector(vec, shapes):
+    """
+    Cut the 1-d array `vec` into consecutive pieces and reshape them to
+    `shapes` (a sequence of shape tuples, scalar shape () included).
+    Returns the list of arrays in the order of `shapes`.
+    """
+    i = 0
+    arrs = []
+    for shape in shapes:
+        # np.prod(()) is the float 1.0, which is not a valid slice index,
+        # so force a plain int.
+        size = int(np.prod(shape))
+        arr = vec[i:i + size].reshape(shape)
+        arrs.append(arr)
+        i += size
+    return arrs
+
+
+def discount_with_boundaries(X, New, gamma):
+    """
+    Discounted sums along time that do not leak across episode boundaries.
+
+    X: array of floats indexed by time along the first axis
+    New: flags indexed like X along time, non-zero where a new episode starts
+    gamma: discount factor
+
+    Returns Y shaped like X with Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]).
+    """
+    Y = np.zeros_like(X)
+    last = X.shape[0] - 1
+    Y[last] = X[last]
+    # Walk backwards so the already discounted future Y[t + 1] is available.
+    for t in reversed(range(last)):
+        carry = gamma * Y[t + 1] * (1 - New[t + 1])
+        Y[t] = X[t] + carry
+    return Y
+
+
+def test_discount_with_boundaries():
+    # Two episodes: steps 0-2 and step 3, so nothing is carried from step 3
+    # back into step 2.
+    gamma = 0.9
+    rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
+    episode_starts = [1.0, 0.0, 0.0, 1.0]
+    expected = [
+        1 + gamma * 2 + gamma ** 2 * 3,
+        2 + gamma * 3,
+        3,
+        4
+    ]
+    discounted = discount_with_boundaries(rewards, episode_starts, gamma)
+    assert np.allclose(discounted, expected)
diff --git a/baselines/baselines_common/misc_util.py b/baselines/baselines_common/misc_util.py
new file mode 100644
index 0000000..4e45ce7
--- /dev/null
+++ b/baselines/baselines_common/misc_util.py
@@ -0,0 +1,328 @@
+import gym
+import numpy as np
+import os
+import pickle
+import random
+import tempfile
+import time
+import zipfile
+
+
+def zipsame(*seqs):
+    """zip() that asserts that all sequences have the length of the first one."""
+    expected_len = len(seqs[0])
+    for seq in seqs[1:]:
+        assert len(seq) == expected_len
+    return zip(*seqs)
+
+
+def unpack(seq, sizes):
+    """
+    Lazily split 'seq' into consecutive groups whose lengths are given by
+    'sizes'. An integer size yields a list of that many elements; None
+    yields the next element bare, not wrapped in a list. The sizes must
+    account for every element of 'seq'.
+
+    Example:
+    list(unpack([1,2,3,4,5,6], [3,None,2])) -> [[1,2,3], 4, [5,6]]
+    """
+    seq = list(seq)
+    needed = sum(1 if s is None else s for s in sizes)
+    assert needed == len(seq), "Trying to unpack %s into %s" % (seq, sizes)
+    cursor = iter(seq)
+    for size in sizes:
+        if size is None:
+            yield next(cursor)
+            continue
+        yield [next(cursor) for _ in range(size)]
+
+
+class EzPickle(object):
+    """Mixin for objects that are pickled and unpickled via their
+    constructor arguments.
+
+    Example usage:
+
+        class Dog(Animal, EzPickle):
+            def __init__(self, furcolor, tailkind="bushy"):
+                Animal.__init__(self)
+                EzPickle.__init__(self, furcolor, tailkind)
+                ...
+
+    Only the recorded constructor arguments are pickled. On unpickling a
+    fresh Dog is built from them and its attributes are copied into the
+    object being restored. However, philosophers are still not sure
+    whether it is still the same dog.
+
+    This is generally needed only for environments which wrap C/C++ code, such as MuJoCo
+    and Atari.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Remember how the object was constructed.
+        self._ezpickle_args = args
+        self._ezpickle_kwargs = kwargs
+
+    def __getstate__(self):
+        return dict(_ezpickle_args=self._ezpickle_args,
+                    _ezpickle_kwargs=self._ezpickle_kwargs)
+
+    def __setstate__(self, d):
+        ctor_args = d["_ezpickle_args"]
+        ctor_kwargs = d["_ezpickle_kwargs"]
+        # Re-run the constructor and adopt the resulting attributes.
+        rebuilt = type(self)(*ctor_args, **ctor_kwargs)
+        self.__dict__.update(rebuilt.__dict__)
+
+
+def set_global_seeds(i):
+    """
+    Seed TensorFlow (only if it can be imported), numpy and the random
+    module with `i`.
+    """
+    try:
+        import tensorflow as tf
+    except ImportError:
+        tf = None
+    if tf is not None:
+        tf.set_random_seed(i)
+    np.random.seed(i)
+    random.seed(i)
+
+
+def pretty_eta(seconds_left):
+    """Return the number of seconds in human readable format.
+
+    Only the two most significant units are reported.
+
+    Examples:
+    2 days
+    2 hours and 37 minutes
+    less than a minute
+
+    Parameters
+    ----------
+    seconds_left: int
+        Number of seconds to be converted to the ETA
+    Returns
+    -------
+    eta: str
+        String representing the pretty ETA.
+    """
+    minutes_left, seconds_left = divmod(seconds_left, 60)
+    hours_left, minutes_left = divmod(minutes_left, 60)
+    days_left, hours_left = divmod(hours_left, 24)
+
+    def plural(cnt, name):
+        return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else ''))
+
+    # (major count, major unit, minor count, minor unit), coarsest first.
+    unit_pairs = (
+        (days_left, 'day', hours_left, 'hour'),
+        (hours_left, 'hour', minutes_left, 'minute'),
+    )
+    for major, major_name, minor, minor_name in unit_pairs:
+        if major > 0:
+            msg = plural(major, major_name)
+            if minor > 0:
+                msg += ' and ' + plural(minor, minor_name)
+            return msg
+    if minutes_left > 0:
+        return plural(minutes_left, 'minute')
+    return 'less than a minute'
+
+
+class RunningAvg(object):
+    def __init__(self, gamma, init_value=None):
+        """Keep a running estimate of a quantity. This is a bit like mean
+        but more sensitive to recent changes.
+
+        Parameters
+        ----------
+        gamma: float
+            Must be between 0 and 1, where 0 is the most sensitive to recent
+            changes.
+        init_value: float or None
+            Initial value of the estimate. If None, it will be set on the first update.
+        """
+        self._value = init_value
+        self._gamma = gamma
+
+    def update(self, new_val):
+        """Update the estimate.
+
+        Parameters
+        ----------
+        new_val: float
+            new observed value of the estimated quantity.
+        """
+        if self._value is None:
+            self._value = new_val
+        else:
+            self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val
+
+    def __float__(self):
+        """Get the current estimate.
+
+        Raises TypeError while no value has been set yet.
+        """
+        # __float__ must return a real float; handing back the raw value made
+        # float() fail for e.g. an int init_value or first observation.
+        return float(self._value)
+
+
+class SimpleMonitor(gym.Wrapper):
+    def __init__(self, env):
+        """Adds two quantities to info returned by every step:
+
+            steps: int
+                Number of steps taken so far
+            rewards: [float]
+                All the cumulative rewards for the episodes completed so far.
+        """
+        super().__init__(env)
+        # current episode state
+        self._current_reward = None
+        self._num_steps = None
+        # temporary monitor state that we do not save
+        self._time_offset = None
+        self._total_steps = None
+        # monitor state
+        self._episode_rewards = []
+        self._episode_lengths = []
+        self._episode_end_times = []
+
+    def _reset(self):
+        # Resets the wrapped env and, if an episode was in progress, records
+        # its reward, length and end time before starting a new one.
+        obs = self.env.reset()
+        # recompute temporary state if needed
+        if self._time_offset is None:
+            self._time_offset = time.time()
+            # Continue the clock from the last recorded episode end time.
+            if len(self._episode_end_times) > 0:
+                self._time_offset -= self._episode_end_times[-1]
+        if self._total_steps is None:
+            self._total_steps = sum(self._episode_lengths)
+        # update monitor state
+        if self._current_reward is not None:
+            self._episode_rewards.append(self._current_reward)
+            self._episode_lengths.append(self._num_steps)
+            self._episode_end_times.append(time.time() - self._time_offset)
+        # reset episode state
+        self._current_reward = 0
+        self._num_steps = 0
+
+        return obs
+
+    def _step(self, action):
+        # Steps the wrapped env, updates the counters and adds 'steps' and
+        # 'rewards' to info. The counters are None until the first reset, so
+        # stepping before a reset fails.
+        obs, rew, done, info = self.env.step(action)
+        self._current_reward += rew
+        self._num_steps += 1
+        self._total_steps += 1
+        info['steps'] = self._total_steps
+        # The list object itself is exposed, not a copy.
+        info['rewards'] = self._episode_rewards
+        return (obs, rew, done, info)
+
+    def get_state(self):
+        # Monitor state (completed episodes only) as a plain dict.
+        return {
+            'env_id': self.env.unwrapped.spec.id,
+            'episode_data': {
+                'episode_rewards': self._episode_rewards,
+                'episode_lengths': self._episode_lengths,
+                'episode_end_times': self._episode_end_times,
+                'initial_reset_time': 0,
+            }
+        }
+
+    def set_state(self, state):
+        # Restores what get_state produced; the env id must match.
+        # NOTE(review): the temporary _time_offset/_total_steps are not
+        # cleared here, so they are only recomputed if no reset happened
+        # before this call -- confirm that this is the intended usage.
+        assert state['env_id'] == self.env.unwrapped.spec.id
+        ed = state['episode_data']
+        self._episode_rewards = ed['episode_rewards']
+        self._episode_lengths = ed['episode_lengths']
+        self._episode_end_times = ed['episode_end_times']
+
+
+def boolean_flag(parser, name, default=False, help=None):
+    """Register a pair of on/off switches on an argparse parser.
+
+    Both switches write to the same destination, `name` with dashes
+    replaced by underscores.
+
+    Parameters
+    ----------
+    parser: argparse.Parser
+        parser to add the flag to
+    name: str
+        --<name> will enable the flag, while --no-<name> will disable it
+    default: bool or None
+        default value of the flag
+    help: str
+        help string for the flag
+    """
+    flag_dest = name.replace('-', '_')
+    enable_switch = "--" + name
+    disable_switch = "--no-" + name
+    parser.add_argument(enable_switch, action="store_true", default=default,
+                        dest=flag_dest, help=help)
+    parser.add_argument(disable_switch, action="store_false", dest=flag_dest)
+
+
+def get_wrapper_by_name(env, classname):
+    """Given a gym environment possibly wrapped multiple times, returns the
+    outermost wrapper whose class_name() equals classname, or raises
+    ValueError if no such wrapper was applied.
+
+    Parameters
+    ----------
+    env: gym.Env or gym.Wrapper
+        gym environment
+    classname: str
+        name of the wrapper
+
+    Returns
+    -------
+    wrapper: gym.Wrapper
+        wrapper named classname
+    """
+    current = env
+    while classname != current.class_name():
+        if not isinstance(current, gym.Wrapper):
+            # Reached the innermost environment without a match.
+            raise ValueError("Couldn't find wrapper named %s" % classname)
+        current = current.env
+    return current
+
+
+def relatively_safe_pickle_dump(obj, path, compression=False):
+    """This is just like regular pickle dump, except that the failure cases
+    are different:
+
+        - It's never possible that we end up with a pickle in corrupted state.
+        - If there was a different file at the path, that file will remain unchanged in the
+          event of failure (provided that filesystem rename is atomic).
+        - It is sometimes possible that we end up with a useless temp file which needs to be
+          deleted manually (it will be removed automatically on the next function call)
+
+    The intended use case is periodic checkpoints of experiment state, such that we never
+    corrupt previous checkpoints if the current one fails.
+
+    Parameters
+    ----------
+    obj: object
+        object to pickle
+    path: str
+        path to the output file
+    compression: bool
+        if true the pickle is stored as the member "data" of a zip archive
+    """
+    temp_storage = path + ".relatively_safe"
+    if compression:
+        # Using gzip here would be simpler, but the size is limited to 2GB
+        with tempfile.NamedTemporaryFile() as uncompressed_file:
+            pickle.dump(obj, uncompressed_file)
+            # The zip writer re-reads the file by name, so buffered bytes must
+            # reach the disk first; otherwise the archive ends up truncated.
+            uncompressed_file.flush()
+            with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
+                myzip.write(uncompressed_file.name, "data")
+    else:
+        with open(temp_storage, "wb") as f:
+            pickle.dump(obj, f)
+    # os.replace overwrites an existing checkpoint on every platform;
+    # os.rename refuses to do so on Windows.
+    os.replace(temp_storage, path)
+
+
+def pickle_load(path, compression=False):
+    """Unpickle a possibly compressed pickle.
+
+    Parameters
+    ----------
+    path: str
+        path to the pickle file
+    compression: bool
+        if true assumes the file is a zip archive holding the pickle as
+        member "data" and decompresses it.
+
+    Returns
+    -------
+    obj: object
+        the unpickled object
+    """
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
+    if not compression:
+        with open(path, "rb") as f:
+            return pickle.load(f)
+
+    with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as archive:
+        with archive.open("data") as member:
+            return pickle.load(member)
diff --git a/baselines/baselines_common/mpi_adam.py b/baselines/baselines_common/mpi_adam.py
new file mode 100644
index 0000000..70fb22f
--- /dev/null
+++ b/baselines/baselines_common/mpi_adam.py
@@ -0,0 +1,85 @@
+from mpi4py import MPI
+import baselines.baselines_common.tf_util as U
+import tensorflow as tf
+import numpy as np
+
+
+class MpiAdam(object):
+    def __init__(self, var_list, *,
+                 beta1=0.9, beta2=0.999, epsilon=1e-08,
+                 scale_grad_by_procs=True,
+                 comm=None):
+        self.var_list = var_list
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.scale_grad_by_procs = scale_grad_by_procs
+        size = sum(U.numel(v) for v in var_list)
+        self.m = np.zeros(size, 'float32')
+        self.v = np.zeros(size, 'float32')
+
+        self.t = 0
+        self.setfromflat = U.SetFromFlat(var_list)
+        self.getflat = U.GetFlat(var_list)
+        self.comm = MPI.COMM_WORLD if comm is None else comm
+
+    def update(self, localg, stepsize):
+        if self.t % 100 == 0:
+            self.check_synced()
+        localg = localg.astype('float32')
+        globalg = np.zeros_like(localg)
+        self.comm.Allreduce(localg, globalg, op=MPI.SUM)
+        if self.scale_grad_by_procs:
+            globalg /= self.comm.Get_size()
+
+        self.t += 1
+        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
+        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
+        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
+        step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
+        self.setfromflat(self.getflat() + step)
+
+    def sync(self):
+        theta = self.getflat()
+        self.comm.Bcast(theta, root=0)
+        self.setfromflat(theta)
+
+    def check_synced(self):
+        if self.comm.Get_rank() == 0:  # this is root
+            theta = self.getflat()
+            self.comm.Bcast(theta, root=0)
+        else:
+            thetalocal = self.getflat()
+            thetaroot = np.empty_like(thetalocal)
+            self.comm.Bcast(thetaroot, root=0)
+            assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)
+
+
+@U.in_session
+def test_MpiAdam():
+    np.random.seed(0)
+    tf.set_random_seed(0)
+
+    a = tf.Variable(np.random.randn(3).astype('float32'))
+    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
+    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
+
+    stepsize = 1e-2
+    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
+    do_update = U.function([], loss, updates=[update_op])
+
+    tf.get_default_session().run(tf.global_variables_initializer())
+    for i in range(10):
+        print(i, do_update())
+
+    tf.set_random_seed(0)
+    tf.get_default_session().run(tf.global_variables_initializer())
+
+    var_list = [a, b]
+    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op])
+    adam = MpiAdam(var_list)
+
+    for i in range(10):
+        l, g = lossandgrad()
+        adam.update(g, stepsize)
+        print(i, l)
diff --git a/baselines/baselines_common/mpi_fork.py b/baselines/baselines_common/mpi_fork.py
new file mode 100644
index 0000000..c92bc16
--- /dev/null
+++ b/baselines/baselines_common/mpi_fork.py
@@ -0,0 +1,24 @@
+import os, subprocess, sys
+
+
+def mpi_fork(n, bind_to_core=False):
+    """Re-launches the current script with workers
+    Returns "parent" for original parent, "child" for MPI children
+    """
+    if n <= 1:
+        return "child"
+    if os.getenv("IN_MPI") is None:
+        env = os.environ.copy()
+        env.update(
+            MKL_NUM_THREADS="1",
+            OMP_NUM_THREADS="1",
+            IN_MPI="1"
+        )
+        args = ["mpirun", "-np", str(n)]
+        if bind_to_core:
+            args += ["-bind-to", "core"]
+        args += [sys.executable] + sys.argv
+        subprocess.check_call(args, env=env)
+        return "parent"
+    else:
+        return "child"
diff --git a/baselines/baselines_common/mpi_moments.py b/baselines/baselines_common/mpi_moments.py
new file mode 100644
index 0000000..0b8473a
--- /dev/null
+++ b/baselines/baselines_common/mpi_moments.py
@@ -0,0 +1,52 @@
+from mpi4py import MPI
+import numpy as np
+from baselines.baselines_common import zipsame
+
+
+def mpi_moments(x, axis=0):
+    """Return (mean, std, count) of x along axis, summed across MPI.COMM_WORLD ranks."""
+    x = np.asarray(x, dtype='float64')
+    newshape = list(x.shape)
+    newshape.pop(axis)
+    n = np.prod(newshape, dtype=int)
+    totalvec = np.zeros(n * 2 + 1, 'float64')
+    addvec = np.concatenate([x.sum(axis=axis).ravel(),
+                             np.square(x).sum(axis=axis).ravel(),
+                             np.array([x.shape[axis]], dtype='float64')])
+    MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
+    total = totalvec[:n]
+    sumsq = totalvec[n:2 * n]
+    count = totalvec[2 * n]
+    if count == 0:
+        # No samples on any rank: moments are NaN. np.full also works for a 0-d shape.
+        mean = np.full(newshape, np.nan)
+        std = np.full(newshape, np.nan)
+    else:
+        mean = total / count
+        std = np.sqrt(np.maximum(sumsq / count - np.square(mean), 0))
+    return mean, std, count
+
+
+def test_runningmeanstd():
+    comm = MPI.COMM_WORLD
+    np.random.seed(0)
+    for (triple, axis) in [
+        ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0),
+        ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0),
+        ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1),
+    ]:
+
+        x = np.concatenate(triple, axis=axis)
+        ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
+
+        ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis)
+
+        for (a1, a2) in zipsame(ms1, ms2):
+            print(a1, a2)
+            assert np.allclose(a1, a2)
+            print("ok!")
+
+
+if __name__ == "__main__":
+    # mpirun -np 3 python <script>
+    test_runningmeanstd()
diff --git a/baselines/baselines_common/mpi_running_mean_std.py b/baselines/baselines_common/mpi_running_mean_std.py
new file mode 100644
index 0000000..8f45846
--- /dev/null
+++ b/baselines/baselines_common/mpi_running_mean_std.py
@@ -0,0 +1,112 @@
+from mpi4py import MPI
+import tensorflow as tf
+import baselines.baselines_common.tf_util as U
+import numpy as np
+
+
+class RunningMeanStd(object):
+    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+    def __init__(self, epsilon=1e-2, shape=()):
+        self._sum = tf.get_variable(
+            dtype=tf.float64,
+            shape=shape,
+            initializer=tf.constant_initializer(0.0),
+            name="runningsum", trainable=False)
+        self._sumsq = tf.get_variable(
+            dtype=tf.float64,
+            shape=shape,
+            initializer=tf.constant_initializer(epsilon),
+            name="runningsumsq", trainable=False)
+        self._count = tf.get_variable(
+            dtype=tf.float64,
+            shape=(),
+            initializer=tf.constant_initializer(epsilon),
+            name="count", trainable=False)
+        self.shape = shape
+
+        self.mean = tf.to_float(self._sum / self._count)
+        self.std = tf.sqrt(
+            tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))
+
+        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
+        newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
+        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
+        self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
+                                        updates=[tf.assign_add(self._sum, newsum),
+                                                 tf.assign_add(self._sumsq, newsumsq),
+                                                 tf.assign_add(self._count, newcount)])
+
+    def update(self, x):
+        x = x.astype('float64')
+        n = int(np.prod(self.shape))
+        totalvec = np.zeros(n * 2 + 1, 'float64')
+        addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(),
+                                 np.array([len(x)], dtype='float64')])
+        MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
+        self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2 * n].reshape(self.shape),
+                           totalvec[2 * n])
+
+
+@U.in_session
+def test_runningmeanstd():
+    for (x1, x2, x3) in [
+        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
+        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
+    ]:
+        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
+        U.initialize()
+
+        x = np.concatenate([x1, x2, x3], axis=0)
+        ms1 = [x.mean(axis=0), x.std(axis=0)]
+        rms.update(x1)
+        rms.update(x2)
+        rms.update(x3)
+        ms2 = U.eval([rms.mean, rms.std])
+
+        assert np.allclose(ms1, ms2)
+
+
+@U.in_session
+def test_dist():
+    np.random.seed(0)
+    p1, p2, p3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1))
+    q1, q2, q3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1))
+
+    # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5))
+    # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8))
+
+    comm = MPI.COMM_WORLD
+    assert comm.Get_size() == 2
+    if comm.Get_rank() == 0:
+        x1, x2, x3 = p1, p2, p3
+    elif comm.Get_rank() == 1:
+        x1, x2, x3 = q1, q2, q3
+    else:
+        assert False
+
+    rms = RunningMeanStd(epsilon=0.0, shape=(1,))
+    U.initialize()
+
+    rms.update(x1)
+    rms.update(x2)
+    rms.update(x3)
+
+    bigvec = np.concatenate([p1, p2, p3, q1, q2, q3])
+
+    def checkallclose(x, y):
+        print(x, y)
+        return np.allclose(x, y)
+
+    assert checkallclose(
+        bigvec.mean(axis=0),
+        U.eval(rms.mean)
+    )
+    assert checkallclose(
+        bigvec.std(axis=0),
+        U.eval(rms.std)
+    )
+
+
+if __name__ == "__main__":
+    # Run with mpirun -np 2 python <filename>
+    test_dist()
diff --git a/baselines/baselines_common/mpi_saver.py b/baselines/baselines_common/mpi_saver.py
new file mode 100644
index 0000000..3c4651a
--- /dev/null
+++ b/baselines/baselines_common/mpi_saver.py
@@ -0,0 +1,35 @@
+from mpi4py import MPI
+import baselines.baselines_common.tf_util as U
+import tensorflow as tf
+
+
+class MpiSaver(object):
+    def __init__(self, var_list=None, *,
+                 comm=None,
+                 log_prefix="/tmp"):
+        self.var_list = var_list
+        self.t = 0
+
+        self.saver = tf.train.Saver(
+            var_list=var_list,
+            max_to_keep=100,
+            keep_checkpoint_every_n_hours=0.25,
+            pad_step_number=True,
+            save_relative_paths=True)
+        self.log_prefix = log_prefix
+
+        self.comm = MPI.COMM_WORLD if comm is None else comm
+
+    def restore(self, restore_from=None):
+        if restore_from is not None:
+            self.saver.restore(U.get_session(), restore_from)
+            self.t += int(restore_from.split("-")[-1])
+        self.sync()
+
+    def sync(self):
+        if self.comm.Get_rank() == 0:  # this is root
+            self.saver.save(
+                U.get_session(),
+                "{}/model.ckpt".format(self.log_prefix),
+                global_step=self.t)
+            self.t += 1
diff --git a/baselines/baselines_common/schedules.py b/baselines/baselines_common/schedules.py
new file mode 100644
index 0000000..9dfff50
--- /dev/null
+++ b/baselines/baselines_common/schedules.py
@@ -0,0 +1,99 @@
+"""This file is used for specifying various schedules that evolve over
+time throughout the execution of the algorithm, such as:
+ - learning rate for the optimizer
+ - exploration epsilon for the epsilon greedy exploration strategy
+ - beta parameter for prioritized replay
+
+Each schedule has a function `value(t)` which returns the current value
+of the parameter given the timestep t of the optimization procedure.
+"""
+
+
+class Schedule(object):
+    def value(self, t):
+        """Value of the schedule at time t"""
+        raise NotImplementedError()
+
+
+class ConstantSchedule(object):
+    def __init__(self, value):
+        """Value remains constant over time.
+
+        Parameters
+        ----------
+        value: float
+            Constant value of the schedule
+        """
+        self._v = value
+
+    def value(self, t):
+        """See Schedule.value"""
+        return self._v
+
+
+def linear_interpolation(l, r, alpha):
+    return l + alpha * (r - l)
+
+
+class PiecewiseSchedule(object):
+    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
+        """Piecewise schedule.
+
+        endpoints: [(int, int)]
+            list of pairs `(time, value)` meaning that schedule should output
+            `value` when `t==time`. All the values for time must be sorted in
+            an increasing order. When t is between two times, e.g. `(time_a, value_a)`
+            and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
+            `interpolation(value_a, value_b, alpha)` where alpha is a fraction of
+            time passed between `time_a` and `time_b` for time `t`.
+        interpolation: lambda float, float, float: float
+            a function that takes value to the left and to the right of t according
+            to the `endpoints`. Alpha is the fraction of distance from left endpoint to
+            right endpoint that t has covered. See linear_interpolation for example.
+        outside_value: float
+            if the value is requested outside of all the intervals specified in
+            `endpoints` this value is returned. If None then AssertionError is
+            raised when outside value is requested.
+        """
+        idxes = [e[0] for e in endpoints]
+        assert idxes == sorted(idxes)
+        self._interpolation = interpolation
+        self._outside_value = outside_value
+        self._endpoints = endpoints
+
+    def value(self, t):
+        """See Schedule.value"""
+        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
+            if l_t <= t and t < r_t:
+                alpha = float(t - l_t) / (r_t - l_t)
+                return self._interpolation(l, r, alpha)
+
+        # t does not belong to any of the pieces, so doom.
+        assert self._outside_value is not None
+        return self._outside_value
+
+
+class LinearSchedule(object):
+    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
+        """Linear interpolation between initial_p and final_p over
+        schedule_timesteps. After this many timesteps pass final_p is
+        returned.
+
+        Parameters
+        ----------
+        schedule_timesteps: int
+            Number of timesteps for which to linearly anneal initial_p
+            to final_p
+        initial_p: float
+            initial output value
+        final_p: float
+            final output value
+        """
+        self.schedule_timesteps = schedule_timesteps
+        self.final_p = final_p
+        self.initial_p = initial_p
+
+    def value(self, t):
+        """See Schedule.value"""
+        fraction = min(float(t) / self.schedule_timesteps, 1.0)
+        return self.initial_p + fraction * (self.final_p - self.initial_p)
diff --git a/baselines/baselines_common/segment_tree.py b/baselines/baselines_common/segment_tree.py
new file mode 100644
index 0000000..a5a7dfc
--- /dev/null
+++ b/baselines/baselines_common/segment_tree.py
@@ -0,0 +1,146 @@
+import operator
+
+
+class SegmentTree(object):
+    def __init__(self, capacity, operation, neutral_element):
+        """Build a Segment Tree data structure.
+
+        https://en.wikipedia.org/wiki/Segment_tree
+
+        Can be used as regular array, but with two
+        important differences:
+
+            a) setting item's value is slightly slower.
+               It is O(lg capacity) instead of O(1).
+            b) user has access to an efficient `reduce`
+               operation which reduces `operation` over
+               a contiguous subsequence of items in the
+               array.
+
+        Parameters
+        ----------
+        capacity: int
+            Total size of the array - must be a power of two.
+        operation: lambda obj, obj -> obj
+            an operation for combining elements (eg. sum, max)
+            must form a mathematical group together with the set of
+            possible values for array elements.
+        neutral_element: obj
+            neutral element for the operation above. eg. float('-inf')
+            for max and 0 for sum.
+        """
+        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
+        self._capacity = capacity
+        self._value = [neutral_element for _ in range(2 * capacity)]
+        self._operation = operation
+
+    def _reduce_helper(self, start, end, node, node_start, node_end):
+        if start == node_start and end == node_end:
+            return self._value[node]
+        mid = (node_start + node_end) // 2
+        if end <= mid:
+            return self._reduce_helper(start, end, 2 * node, node_start, mid)
+        else:
+            if mid + 1 <= start:
+                return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
+            else:
+                return self._operation(
+                    self._reduce_helper(start, mid, 2 * node, node_start, mid),
+                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
+                )
+
+    def reduce(self, start=0, end=None):
+        """Returns result of applying `self.operation`
+        to a contiguous subsequence of the array.
+
+            self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1])))
+
+        Parameters
+        ----------
+        start: int
+            beginning of the subsequence
+        end: int
+            end of the subsequences
+
+        Returns
+        -------
+        reduced: obj
+            result of reducing self.operation over the specified range of array elements.
+        """
+        if end is None:
+            end = self._capacity
+        if end < 0:
+            end += self._capacity
+        end -= 1
+        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)
+
+    def __setitem__(self, idx, val):
+        # index of the leaf
+        idx += self._capacity
+        self._value[idx] = val
+        idx //= 2
+        while idx >= 1:
+            self._value[idx] = self._operation(
+                self._value[2 * idx],
+                self._value[2 * idx + 1]
+            )
+            idx //= 2
+
+    def __getitem__(self, idx):
+        assert 0 <= idx < self._capacity
+        return self._value[self._capacity + idx]
+
+
+class SumSegmentTree(SegmentTree):
+    def __init__(self, capacity):
+        super(SumSegmentTree, self).__init__(
+            capacity=capacity,
+            operation=operator.add,
+            neutral_element=0.0
+        )
+
+    def sum(self, start=0, end=None):
+        """Returns arr[start] + ... + arr[end - 1]"""
+        return super(SumSegmentTree, self).reduce(start, end)
+
+    def find_prefixsum_idx(self, prefixsum):
+        """Find the highest index `i` in the array such that
+            sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum
+
+        if array values are probabilities, this function
+        allows to sample indexes according to the discrete
+        probability efficiently.
+
+        Parameters
+        ----------
+        prefixsum: float
+            upperbound on the sum of array prefix
+
+        Returns
+        -------
+        idx: int
+            highest index satisfying the prefixsum constraint
+        """
+        assert 0 <= prefixsum <= self.sum() + 1e-5
+        idx = 1
+        while idx < self._capacity:  # while non-leaf
+            if self._value[2 * idx] > prefixsum:
+                idx = 2 * idx
+            else:
+                prefixsum -= self._value[2 * idx]
+                idx = 2 * idx + 1
+        return idx - self._capacity
+
+
+class MinSegmentTree(SegmentTree):
+    def __init__(self, capacity):
+        super(MinSegmentTree, self).__init__(
+            capacity=capacity,
+            operation=min,
+            neutral_element=float('inf')
+        )
+
+    def min(self, start=0, end=None):
+        """Returns min(arr[start], ...,  arr[end - 1])"""
+
+        return super(MinSegmentTree, self).reduce(start, end)
diff --git a/baselines/baselines_common/tf_util.py b/baselines/baselines_common/tf_util.py
new file mode 100644
index 0000000..cb7096b
--- /dev/null
+++ b/baselines/baselines_common/tf_util.py
@@ -0,0 +1,753 @@
+import numpy as np
+import tensorflow as tf  # pylint: ignore-module
+import builtins
+import functools
+import copy
+import os
+import collections
+
+
+# ================================================================
+# Make consistent with numpy
+# ================================================================
+
+clip = tf.clip_by_value
+
+
+def sum(x, axis=None, keepdims=False):
+    axis = None if axis is None else [axis]
+    return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
+
+
+def mean(x, axis=None, keepdims=False):
+    axis = None if axis is None else [axis]
+    return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
+
+
+def var(x, axis=None, keepdims=False):
+    meanx = mean(x, axis=axis, keepdims=keepdims)
+    return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
+
+
+def std(x, axis=None, keepdims=False):
+    return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
+
+
+def max(x, axis=None, keepdims=False):
+    axis = None if axis is None else [axis]
+    return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
+
+
+def min(x, axis=None, keepdims=False):
+    axis = None if axis is None else [axis]
+    return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
+
+
+def concatenate(arrs, axis=0):
+    return tf.concat(axis=axis, values=arrs)
+
+
+def argmax(x, axis=None):
+    return tf.argmax(x, axis=axis)
+
+
+def switch(condition, then_expression, else_expression):
+    """Switches between two operations depending on a scalar value (int or bool).
+    Note that both `then_expression` and `else_expression`
+    should be symbolic tensors of the *same shape*.
+
+    # Arguments
+        condition: scalar tensor.
+        then_expression: TensorFlow operation.
+        else_expression: TensorFlow operation.
+    """
+    x_shape = copy.copy(then_expression.get_shape())
+    x = tf.cond(tf.cast(condition, 'bool'),
+                lambda: then_expression,
+                lambda: else_expression)
+    x.set_shape(x_shape)
+    return x
+
+# ================================================================
+# Extras
+# ================================================================
+
+
+def l2loss(params):
+    if len(params) == 0:
+        return tf.constant(0.0)
+    else:
+        return tf.add_n([sum(tf.square(p)) for p in params])
+
+
+def lrelu(x, leak=0.2):
+    f1 = 0.5 * (1 + leak)
+    f2 = 0.5 * (1 - leak)
+    return f1 * x + f2 * abs(x)
+
+
+def categorical_sample_logits(X):
+    # https://github.com/tensorflow/tensorflow/issues/456
+    U = tf.random_uniform(tf.shape(X))
+    return argmax(X - tf.log(-tf.log(U)), axis=1)
+
+
+# ================================================================
+# Inputs
+# ================================================================
+
+
+def is_placeholder(x):
+    return type(x) is tf.Tensor and len(x.op.inputs) == 0
+
+
+class TfInput(object):
+    def __init__(self, name="(unnamed)"):
+        """Generalized Tensorflow placeholder. The main differences are:
+            - possibly uses multiple placeholders internally and returns multiple values
+            - can apply light postprocessing to the value fed to the placeholder.
+        """
+        self.name = name
+
+    def get(self):
+        """Return the tf variable(s) representing the possibly postprocessed value
+        of placeholder(s). Subclasses must override this.
+        """
+        raise NotImplementedError()
+
+    def make_feed_dict(self, data):
+        """Return a feed dict that inputs `data` to the placeholder(s); subclasses must override."""
+        raise NotImplementedError()
+
+
+class PlacholderTfInput(TfInput):
+    def __init__(self, placeholder):
+        """Wrapper for regular tensorflow placeholder."""
+        super().__init__(placeholder.name)
+        self._placeholder = placeholder
+
+    def get(self):
+        return self._placeholder
+
+    def make_feed_dict(self, data):
+        return {self._placeholder: data}
+
+
+class BatchInput(PlacholderTfInput):
+    def __init__(self, shape, dtype=tf.float32, name=None):
+        """Creates a placeholder for a batch of tensors of a given shape and dtype
+
+        Parameters
+        ----------
+        shape: [int]
+            shape of a single element of the batch
+        dtype: tf.dtype
+            number representation used for tensor contents
+        name: str
+            name of the underlying placeholder
+        """
+        super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
+
+
+class Uint8Input(PlacholderTfInput):
+    def __init__(self, shape, name=None):
+        """Takes input in uint8 format which is cast to float32 and divided by 255
+        before passing it to the model.
+
+        On GPU this ensures lower data transfer times.
+
+        Parameters
+        ----------
+        shape: [int]
+            shape of the tensor.
+        name: str
+            name of the underlying placeholder
+        """
+
+        super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
+        self._shape = shape
+        self._output = tf.cast(super().get(), tf.float32) / 255.0
+
+    def get(self):
+        return self._output
+
+
+def ensure_tf_input(thing):
+    """Takes either tf.placeholder or TfInput and outputs equivalent TfInput"""
+    if isinstance(thing, TfInput):
+        return thing
+    elif is_placeholder(thing):
+        return PlacholderTfInput(thing)
+    else:
+        raise ValueError("Must be a placeholder or TfInput")
+
+# ================================================================
+# Mathematical utils
+# ================================================================
+
+
+def huber_loss(x, delta=1.0):
+    """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
+    return tf.where(
+        tf.abs(x) < delta,
+        tf.square(x) * 0.5,
+        delta * (tf.abs(x) - 0.5 * delta)
+    )
+
+# ================================================================
+# Optimizer utils
+# ================================================================
+
+
+def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
+    """Minimize `objective` using `optimizer` w.r.t. variables in
+    `var_list` while ensuring the norm of the gradients for each
+    variable is clipped to `clip_val`
+    """
+    gradients = optimizer.compute_gradients(objective, var_list=var_list)
+    for i, (grad, var) in enumerate(gradients):
+        if grad is not None:
+            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
+    return optimizer.apply_gradients(gradients)
+
+
+# ================================================================
+# Global session
+# ================================================================
+
+def get_session():
+    """Returns recently made Tensorflow session"""
+    return tf.get_default_session()
+
+
+def make_session(num_cpu):
+    """Returns a session that will use <num_cpu> CPU's only"""
+    tf_config = tf.ConfigProto(
+        inter_op_parallelism_threads=num_cpu,
+        intra_op_parallelism_threads=num_cpu)
+    return tf.Session(config=tf_config)
+
+
+def single_threaded_session():
+    """Returns a session which will only use a single CPU"""
+    return make_session(1)
+
+
+ALREADY_INITIALIZED = set()
+
+
+def initialize():
+    """Initialize all the uninitialized variables in the global scope."""
+    new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
+    get_session().run(tf.variables_initializer(new_variables))
+    ALREADY_INITIALIZED.update(new_variables)
+
+
+def eval(expr, feed_dict=None):
+    if feed_dict is None:
+        feed_dict = {}
+    return get_session().run(expr, feed_dict=feed_dict)
+
+
+VALUE_SETTERS = collections.OrderedDict()
+
+
+def set_value(v, val):
+    global VALUE_SETTERS
+    if v in VALUE_SETTERS:
+        set_op, set_endpoint = VALUE_SETTERS[v]
+    else:
+        set_endpoint = tf.placeholder(v.dtype)
+        set_op = v.assign(set_endpoint)
+        VALUE_SETTERS[v] = (set_op, set_endpoint)
+    get_session().run(set_op, feed_dict={set_endpoint: val})
+
+
+# ================================================================
+# Saving variables
+# ================================================================
+
+
+def load_state(fname):
+    saver = tf.train.Saver()
+    saver.restore(get_session(), fname)
+
+
+def save_state(fname):
+    """Save variables via a fresh tf.train.Saver to `fname`, creating its directory if needed."""
+    os.makedirs(os.path.dirname(fname) or ".", exist_ok=True)  # bare filename -> dirname is ""
+    tf.train.Saver().save(get_session(), fname)
+
+# ================================================================
+# Model components
+# ================================================================
+
+
+def normc_initializer(std=1.0):
+    def _initializer(shape, dtype=None, partition_info=None):  # pylint: disable=W0613
+        out = np.random.randn(*shape).astype(np.float32)
+        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
+        return tf.constant(out)
+    return _initializer
+
+
+def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None,
+           summary_tag=None):
+    """Create "W"/"b" under scope `name`, return conv2d(x, W) + b; log filters if summary_tag is set."""
+    with tf.variable_scope(name):
+        stride_shape = [1, stride[0], stride[1], 1]
+        filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters]
+        # there are "num input feature maps * filter height * filter width"
+        # inputs to each hidden unit
+        fan_in = intprod(filter_shape[:3])
+        # each unit in the lower layer receives a gradient from:
+        # "num output feature maps * filter height * filter width" /
+        #   pooling size
+        fan_out = intprod(filter_shape[:2]) * num_filters
+        # initialize weights with random weights
+        w_bound = np.sqrt(6. / (fan_in + fan_out))
+
+        w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound),
+                            collections=collections)
+        b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(),
+                            collections=collections)
+
+        if summary_tag is not None:
+            tf.summary.image(summary_tag,
+                             tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]),
+                                          [2, 0, 1, 3]),
+                             max_outputs=10)  # tf.summary.image has no max_images kwarg
+
+        return tf.nn.conv2d(x, w, stride_shape, pad) + b
+
+
+def dense(x, size, name, weight_init=None, bias=True):
+    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
+    ret = tf.matmul(x, w)
+    if bias:
+        b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
+        return ret + b
+    else:
+        return ret
+
+
+def wndense(x, size, name, init_scale=1.0):
+    v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
+                        initializer=tf.random_normal_initializer(0, 0.05))
+    g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
+    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))
+
+    # use weight normalization (Salimans & Kingma, 2016)
+    x = tf.matmul(x, v)
+    scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
+    return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
+
+
+def densenobias(x, size, name, weight_init=None):
+    return dense(x, size, name, bias=False, weight_init=weight_init)  # `dense` without the bias term
+
+
+def dropout(x, pkeep, phase=None, mask=None):
+    """Multiply x by a keep-mask; when `phase` is given, switch between that and pkeep * x."""
+    if mask is None:
+        mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x)))
+    dropped = mask * x
+    return dropped if phase is None else switch(phase, dropped, pkeep * x)
+
+
+# ================================================================
+# Theano-like Function
+# ================================================================
+
+
+
+def function(inputs, outputs, updates=None, givens=None):
+    """Build a Theano-style callable f(inputs) -> outputs: calling f feeds values into the
+    input placeholders and returns the evaluated `outputs` expressions.
+
+    Input values can be passed positionally (same order as `inputs`) or as kwargs keyed by
+    placeholder name (last "/"-separated component of the name, without the ":<index>" suffix).
+
+    Example:
+        x = tf.placeholder(tf.int32, (), name="x")
+        y = tf.placeholder(tf.int32, (), name="y")
+        z = 3 * x + 2 * y
+        lin = function([x, y], z, givens={y: 0})
+
+        with single_threaded_session():
+            initialize()
+            assert lin(2) == 6
+            assert lin(x=3) == 9
+            assert lin(2, 2) == 10
+            assert lin(x=2, y=3) == 12
+
+    Parameters
+    ----------
+    inputs: [tf.placeholder or TfInput]
+        list of input arguments
+    outputs: [tf.Variable], dict or tf.Variable
+        list, dict or single output; the result has the same structure (same dict type/keys).
+    updates: list of ops or None
+        ops grouped together and run on every call; their values are not returned.
+    givens: dict or None
+        feed values used for placeholders the caller does not supply.
+    """
+    if isinstance(outputs, list):
+        return _Function(inputs, outputs, updates, givens=givens)
+    elif isinstance(outputs, (dict, collections.OrderedDict)):
+        f = _Function(inputs, outputs.values(), updates, givens=givens)
+        return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs)))
+    else:
+        f = _Function(inputs, [outputs], updates, givens=givens)
+        return lambda *args, **kwargs: f(*args, **kwargs)[0]
+
+
+class _Function(object):
+    def __init__(self, inputs, outputs, updates, givens, check_nan=False):
+        for inpt in inputs:
+            if not isinstance(inpt, TfInput):
+                assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of src.baselines_common.TfInput"
+        self.inputs = inputs
+        self.update_group = tf.group(*(updates or []))
+        # the grouped update op rides along with every run; its result is sliced off in __call__
+        self.outputs_update = list(outputs) + [self.update_group]
+        self.givens = givens if givens is not None else {}
+        self.check_nan = check_nan
+
+    def _feed_input(self, feed_dict, inpt, value):
+        if isinstance(inpt, TfInput):
+            feed_dict.update(inpt.make_feed_dict(value))
+        elif is_placeholder(inpt):
+            feed_dict[inpt] = value
+
+    def __call__(self, *args, **kwargs):
+        assert len(args) <= len(self.inputs), "Too many arguments provided"
+        feed_dict = {}
+        # positional values pair up with the leading inputs
+        for inpt, value in zip(self.inputs, args):
+            self._feed_input(feed_dict, inpt, value)
+        # every remaining input must come from kwargs (by short name) or from givens
+        seen_names = set()
+        for inpt in self.inputs[len(args):]:
+            short_name = inpt.name.split(':')[0]  # strip the ":<index>" suffix
+            short_name = short_name.split('/')[-1]  # keep the last scope component
+            assert short_name not in seen_names, \
+                "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(short_name)
+            if short_name not in kwargs:
+                assert inpt in self.givens, "Missing argument " + short_name
+                continue
+            seen_names.add(short_name)
+            self._feed_input(feed_dict, inpt, kwargs.pop(short_name))
+        assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
+        # givens only fill in placeholders the caller did not feed
+        for inpt, default in self.givens.items():
+            feed_dict.setdefault(inpt, default)
+        outcome = get_session().run(self.outputs_update, feed_dict=feed_dict)
+        results = outcome[:-1]  # drop the trailing update-group result
+        if self.check_nan and any(np.isnan(r).any() for r in results):
+            raise RuntimeError("Nan detected")
+        return results
+
+
+def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
+    """Wrap _MemFriendlyFunction; a non-list `outputs` yields a callable returning one value."""
+    if not isinstance(outputs, list):
+        single = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
+        return lambda *inputs: single(*inputs)[0]
+    return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
+
+
+class _MemFriendlyFunction(object):
+    def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
+        self.batch_size = batch_size
+        self.outputs = list(outputs)
+        self.data_inputs = data_inputs
+        self.nondata_inputs = nondata_inputs
+
+    def __call__(self, *inputvals):
+        n_nondata = len(self.nondata_inputs)
+        assert len(inputvals) == n_nondata + len(self.data_inputs)
+        data_vals = inputvals[n_nondata:]
+        feed_dict = dict(zip(self.nondata_inputs, inputvals[:n_nondata]))
+        n = data_vals[0].shape[0]
+        assert all(v.shape[0] == n for v in data_vals[1:])
+        # evaluate the outputs on slices of at most batch_size rows, summing across slices
+        for i_start in range(0, n, self.batch_size):
+            i_stop = builtins.min(i_start + self.batch_size, n)
+            for var, v in zip(self.data_inputs, data_vals):
+                feed_dict[var] = v[i_start:i_stop]
+            results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
+            if i_start == 0:
+                sum_results = results
+            else:
+                for i, r in enumerate(results):
+                    sum_results[i] = sum_results[i] + r
+        for i in range(len(results)):
+            sum_results[i] = sum_results[i] / n
+        return sum_results
+
+# ================================================================
+# Modules
+# ================================================================
+
+
+class Module(object):
+    def __init__(self, name):
+        self.name = name  # also the variable scope opened in __call__
+        self.first_time = True  # cleared during the first __call__ that builds the graph
+        self.scope = None  # scope name recorded on the first call
+        self.cache = {}  # args tuple -> output returned for those args
+
+    def __call__(self, *args):
+        if args in self.cache:  # identical args: hand back the stored output, build nothing
+            print("(%s) retrieving value from cache" % (self.name,))
+            return self.cache[args]
+        with tf.variable_scope(self.name, reuse=not self.first_time):  # reuse variables after the first build
+            scope = tf.get_variable_scope().name
+            if self.first_time:
+                self.scope = scope
+                print("(%s) running function for the first time" % (self.name,))
+            else:
+                assert self.scope == scope, "Tried calling function with a different scope"
+                print("(%s) running function on new inputs" % (self.name,))
+            self.first_time = False
+            out = self._call(*args)
+        self.cache[args] = out
+        return out
+
+    def _call(self, *args):
+        raise NotImplementedError  # subclasses build their outputs from `args` here
+
+    @property
+    def trainable_variables(self):
+        assert self.scope is not None, "need to call module once before getting variables"
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
+
+    @property
+    def variables(self):
+        assert self.scope is not None, "need to call module once before getting variables"
+        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
+
+
+def module(name):
+    """Decorator factory: `@module(name)` replaces a graph-building function by a `Module` named `name`."""
+    def wrapper(f):
+        class WrapperModule(Module):
+            def _call(self, *args):
+                return f(*args)
+        return WrapperModule(name)
+    return wrapper
+
+# ================================================================
+# Graph traversal
+# ================================================================
+
+
+VARIABLES = {}  # module-level registry; in the visible code it is only re-created by reset()
+
+
+def get_parents(node):
+    return node.op.inputs  # the inputs of the op that produces `node`
+
+
+def topsorted(outputs):
+    """
+    Topological sort via non-recursive depth-first search; a node is emitted after all its parents.
+    """
+    assert isinstance(outputs, (list, tuple))
+    marks = {}
+    out = []
+    stack = []  # pylint: disable=W0621
+    # i: node
+    # jidx = number of parents (see get_parents) of that node already pushed for visiting
+    # marks: state of each node, which is one of
+    #   0: haven't visited
+    #   1: have visited, but not done visiting its parents
+    #   2: done visiting its parents (node is already in `out`)
+    for x in outputs:
+        stack.append((x, 0))
+        while stack:
+            (i, jidx) = stack.pop()
+            if jidx == 0:  # first time this stack entry is examined
+                m = marks.get(i, 0)
+                if m == 0:
+                    marks[i] = 1
+                elif m == 1:  # reached again while still in progress: a cycle
+                    raise ValueError("not a dag")
+                else:
+                    continue  # already emitted via another path
+            ps = get_parents(i)
+            if jidx == len(ps):  # every parent handled: emit the node
+                marks[i] = 2
+                out.append(i)
+            else:
+                stack.append((i, jidx + 1))  # come back to `i` for its next parent
+                j = ps[jidx]
+                stack.append((j, 0))
+    return out
+
+
+# ================================================================
+# Flat vectors
+# ================================================================
+
+def var_shape(x):
+    dims = x.get_shape().as_list()  # static shape as a plain Python list
+    assert all(isinstance(d, int) for d in dims), \
+        "shape function assumes that shape is fully known"
+    return dims
+
+
+def numel(x):
+    return intprod(var_shape(x))  # number of elements; var_shape asserts the shape is fully known
+
+
+def intprod(x):
+    return int(np.prod(x))  # product of the entries as a Python int
+
+
+def flatgrad(loss, var_list, clip_norm=None):
+    """Gradient of `loss` w.r.t. `var_list` as one flat vector (zeros where the gradient is None)."""
+    grads = tf.gradients(loss, var_list)
+    if clip_norm is not None:
+        # tf.gradients yields None for variables `loss` does not depend on; those must skip clipping
+        grads = [g if g is None else tf.clip_by_norm(g, clip_norm=clip_norm) for g in grads]
+    flat = [tf.reshape(g if g is not None else tf.zeros_like(v), [numel(v)]) for v, g in zip(var_list, grads)]
+    return tf.concat(axis=0, values=flat)
+
+
+class SetFromFlat(object):
+    def __init__(self, var_list, dtype=tf.float32):
+        shapes = [var_shape(v) for v in var_list]
+        sizes = [intprod(shape) for shape in shapes]
+        # one flat placeholder holds the values for every variable, laid out back to back
+        self.theta = theta = tf.placeholder(dtype, [np.sum(sizes)])
+        assigns = []
+        offset = 0
+        for v, shape, size in zip(var_list, shapes, sizes):
+            chunk = tf.reshape(theta[offset:offset + size], shape)
+            assigns.append(tf.assign(v, chunk))
+            offset += size
+        self.op = tf.group(*assigns)
+
+    def __call__(self, theta):
+        get_session().run(self.op, feed_dict={self.theta: theta})
+
+
+class GetFlat(object):
+    def __init__(self, var_list):
+        self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])  # each var flattened, then joined
+
+    def __call__(self):
+        return get_session().run(self.op)  # evaluate the concatenation in the current session
+
+# ================================================================
+# Misc
+# ================================================================
+
+
+def fancy_slice_2d(X, inds0, inds1):
+    """
+    Equivalent of numpy's X[inds0, inds1], done as a gather on the flattened tensor
+    XXX this implementation is bad
+    """
+    rows = tf.cast(inds0, tf.int64)
+    cols = tf.cast(inds1, tf.int64)
+    ncols = tf.cast(tf.shape(X), tf.int64)[1]
+    flat_index = rows * ncols + cols  # row-major offset into the flattened matrix
+    Xflat = tf.reshape(X, [-1])
+    return tf.gather(Xflat, flat_index)
+
+
+# ================================================================
+# Scopes
+# ================================================================
+
+
+def scope_vars(scope, trainable_only=False):
+    """
+    Collect the variables that live under a variable scope.
+    The scope may be given by name or as a scope object.
+
+    Parameters
+    ----------
+    scope: str or VariableScope
+        scope whose variables are wanted; for a non-str value its `.name` is used.
+    trainable_only: bool
+        if true, look in TRAINABLE_VARIABLES instead of GLOBAL_VARIABLES.
+
+    Returns
+    -------
+    vars: [tf.Variable]
+        variables of the chosen collection that fall under `scope`.
+    """
+    keys = tf.GraphKeys
+    collection = keys.TRAINABLE_VARIABLES if trainable_only else keys.GLOBAL_VARIABLES
+    scope_str = scope if isinstance(scope, str) else scope.name
+    return tf.get_collection(collection, scope=scope_str)
+
+
+def scope_name():
+    """Returns the name of the current variable scope as a string, e.g. deepq/q_func"""
+    return tf.get_variable_scope().name
+
+
+def absolute_scope_name(relative_scope_name):
+    """Returns `scope_name() + "/" + relative_scope_name`, i.e. the name prefixed by the current scope."""
+    return "/".join((scope_name(), relative_scope_name))
+
+
+def lengths_to_mask(lengths_b, max_length):
+    """
+    Build a boolean mask from a vector of sequence lengths
+
+    Args:
+        lengths_b: an integer vector of lengths (must be rank 1)
+        max_length: number of columns in the mask
+
+    Returns:
+        a boolean tensor of shape (batch_size, max_length)
+        row[i] consists of True repeated lengths_b[i] times, followed by False
+    """
+    lengths_b = tf.convert_to_tensor(lengths_b)
+    assert lengths_b.get_shape().ndims == 1
+    positions_1t = tf.expand_dims(tf.range(max_length), 0)
+    return positions_1t < tf.expand_dims(lengths_b, 1)
+
+
+def in_session(f):
+    @functools.wraps(f)
+    def newfunc(*args, **kwargs):
+        with tf.Session():  # the session is closed again when the call finishes
+            return f(*args, **kwargs)  # hand f's result back instead of dropping it
+    return newfunc
+
+
+_PLACEHOLDER_CACHE = {}  # name -> (placeholder, dtype, shape); filled by get_placeholder, emptied by reset
+
+
+def get_placeholder(name, dtype, shape):
+    """Return the cached placeholder for `name`, creating and caching it on first use."""
+    if name not in _PLACEHOLDER_CACHE:
+        out = tf.placeholder(dtype=dtype, shape=shape, name=name)
+        _PLACEHOLDER_CACHE[name] = (out, dtype, shape)
+        return out
+    out, dtype1, shape1 = _PLACEHOLDER_CACHE[name]
+    assert dtype1 == dtype and shape1 == shape
+    return out
+
+
+def get_placeholder_cached(name):
+    return _PLACEHOLDER_CACHE[name][0]  # KeyError if no placeholder was registered under `name`
+
+
+def flattenallbut0(x):
+    return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])  # keep axis 0, merge all remaining axes
+
+
+def reset():
+    """Drop the placeholder cache and VARIABLES, then reset the default TF graph."""
+    global _PLACEHOLDER_CACHE, VARIABLES
+    _PLACEHOLDER_CACHE = {}
+    VARIABLES = {}
+    tf.reset_default_graph()
diff --git a/baselines/baselines_common/vec_env/__init__.py b/baselines/baselines_common/vec_env/__init__.py
new file mode 100644
index 0000000..34c50d7
--- /dev/null
+++ b/baselines/baselines_common/vec_env/__init__.py
@@ -0,0 +1,19 @@
+class VecEnv(object):
+    """
+    Vectorized environment base class: subclasses must implement `step` and `reset`.
+    """
+    def step(self, vac):
+        """
+        Apply a sequence of actions (`vac`) to a sequence of environments
+        actions -> (observations, rewards, news), where 'news' is a boolean vector indicating
+        whether each element is new.
+        NOTE(review): SubprocVecEnv.step returns (obs, rews, dones, infos) -- confirm the contract.
+        """
+        raise NotImplementedError
+    def reset(self):
+        """
+        Reset all environments
+        """
+        raise NotImplementedError
+    def close(self):
+        pass  # nothing to release in the base class
\ No newline at end of file
diff --git a/baselines/baselines_common/vec_env/subproc_vec_env.py b/baselines/baselines_common/vec_env/subproc_vec_env.py
new file mode 100644
index 0000000..7e5a888
--- /dev/null
+++ b/baselines/baselines_common/vec_env/subproc_vec_env.py
@@ -0,0 +1,79 @@
+import numpy as np
+from multiprocessing import Process, Pipe
+from src.common.vec_env import VecEnv
+
+
+def worker(remote, env_fn_wrapper):
+    env = env_fn_wrapper.x()  # build the environment from the wrapped factory
+    while True:
+        cmd, data = remote.recv()  # wait for the next (command, payload) pair
+        if cmd == 'step':
+            ob, reward, done, info = env.step(data)
+            if done:
+                ob = env.reset()  # auto-reset: the observation sent back is the reset one
+            remote.send((ob, reward, done, info))
+        elif cmd == 'reset':
+            ob = env.reset()
+            remote.send(ob)
+        elif cmd == 'close':
+            remote.close()
+            break  # leave the loop so the function returns
+        elif cmd == 'get_spaces':
+            remote.send((env.action_space, env.observation_space))
+        else:
+            raise NotImplementedError  # unknown command
+
+
+class CloudpickleWrapper(object):
+    """
+    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
+    """
+
+    def __init__(self, x):
+        self.x = x  # the wrapped object; `worker` calls it to create the environment
+
+    def __getstate__(self):
+        import cloudpickle  # imported lazily, only when the wrapper is pickled
+        return cloudpickle.dumps(self.x)
+
+    def __setstate__(self, ob):
+        import pickle  # the bytes produced by __getstate__ are loaded with plain pickle
+        self.x = pickle.loads(ob)
+
+
+class SubprocVecEnv(VecEnv):
+    def __init__(self, env_fns):
+        """
+        env_fns: list of callables; each one is called in its own subprocess to create an environment
+        """
+        nenvs = len(env_fns)
+        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])  # one pipe per environment
+        self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
+                   for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
+        for p in self.ps:
+            p.start()
+
+        self.remotes[0].send(('get_spaces', None))  # spaces are read from the first worker only
+        self.action_space, self.observation_space = self.remotes[0].recv()
+
+    def step(self, actions):
+        for remote, action in zip(self.remotes, actions):  # send every action before reading any reply
+            remote.send(('step', action))
+        results = [remote.recv() for remote in self.remotes]
+        obs, rews, dones, infos = zip(*results)
+        return np.stack(obs), np.stack(rews), np.stack(dones), infos  # infos stays a tuple
+
+    def reset(self):
+        for remote in self.remotes:
+            remote.send(('reset', None))
+        return np.stack([remote.recv() for remote in self.remotes])  # stacked initial observations
+
+    def close(self):
+        for remote in self.remotes:
+            remote.send(('close', None))
+        for p in self.ps:
+            p.join()  # wait for every worker process to exit
+
+    @property
+    def num_envs(self):
+        return len(self.remotes)
diff --git a/baselines/nets.py b/baselines/nets.py
new file mode 100644
index 0000000..bf39868
--- /dev/null
+++ b/baselines/nets.py
@@ -0,0 +1,95 @@
+import tensorflow as tf
+import baselines.baselines_common.tf_util as U
+from baselines.baselines_common.mpi_running_mean_std import RunningMeanStd
+from baselines.baselines_common.distributions import make_pdtype, DiagGaussianPdType, BernoulliPdType
+
+
+def mlp_block(x, name, num_hid_layers, hid_size, activation_fn=tf.nn.tanh):
+    """Stack `num_hid_layers` dense + activation layers (fc1, fc2, ...) under scope `name`."""
+    with tf.variable_scope(name_or_scope=name):
+        for layer_idx in range(1, num_hid_layers + 1):
+            layer_name = "fc%i" % layer_idx
+            x = U.dense(x, hid_size, name=layer_name, weight_init=U.normc_initializer(1.0))
+            x = activation_fn(x)
+        return x
+
+
+def feature_net(x, name, num_hid_layers, hid_size, activation_fn=tf.nn.tanh):
+    """Run `mlp_block` (scope "mlp") nested inside variable scope `name`."""
+    with tf.variable_scope(name_or_scope=name):
+        return mlp_block(
+            x, name="mlp",
+            num_hid_layers=num_hid_layers, hid_size=hid_size, activation_fn=activation_fn)
+
+
+class Actor(object):
+    def __init__(self, name, *args, **kwargs):
+        with tf.variable_scope(name):
+            self._init(*args, **kwargs)
+            self.scope = tf.get_variable_scope().name  # used by get_variables / get_trainable_variables
+
+    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, noise_type=None):
+        if noise_type == "gaussian":  # explicit diagonal Gaussian sized by ac_space.shape[0]
+            self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])
+        else:
+            self.pdtype = pdtype = make_pdtype(ac_space)
+
+        ob = U.get_placeholder(
+            name="ob", dtype=tf.float32,
+            shape=[None] + list(ob_space.shape))
+
+        with tf.variable_scope("obfilter"):
+            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
+        obz = (ob - self.ob_rms.mean) / self.ob_rms.std  # normalise with the running statistics
+        obz = tf.clip_by_value(obz, -5.0, 5.0)
+
+        # value head ("vf"): separate feature net followed by a 1-unit dense layer
+        last_out = feature_net(
+            obz, name="vf",
+            num_hid_layers=num_hid_layers, hid_size=hid_size,
+            activation_fn=tf.nn.tanh)
+        self.vpred = U.dense(
+            last_out, 1,
+            name="vf_final", weight_init=U.normc_initializer(1.0))[:, 0]
+
+        # policy head ("pol"): its own feature net on the same normalised observation
+        last_out = feature_net(
+            obz, name="pol",
+            num_hid_layers=num_hid_layers, hid_size=hid_size,
+            activation_fn=tf.nn.tanh)
+
+        if gaussian_fixed_var and isinstance(self.pdtype, DiagGaussianPdType):
+            mean = U.dense(
+                last_out, pdtype.param_shape()[0] // 2,
+                name="pol_final", weight_init=U.normc_initializer(0.01))
+            logstd = tf.get_variable(  # a free variable: log-std does not depend on the observation
+                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
+                initializer=tf.zeros_initializer())
+            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)  # mean * 0.0 broadcasts logstd per row
+        else:
+            pdparam = U.dense(
+                last_out, pdtype.param_shape()[0],
+                name="pol_final", weight_init=U.normc_initializer(0.01))
+
+        # pd: the action distribution built from the flat parameter tensor
+        self.pd = pdtype.pdfromflat(pdparam)
+
+        self.state_in = []
+        self.state_out = []
+
+        stochastic = tf.placeholder(dtype=tf.bool, shape=())
+        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())  # sample when stochastic, else the mode
+        self._act = U.function([stochastic, ob], [ac, self.vpred])
+
+    def act(self, stochastic, ob):
+        ac1, vpred1 = self._act(stochastic, ob[None])  # ob[None] adds a leading batch axis of size 1
+        return ac1[0], vpred1[0]
+
+    def get_variables(self):
+        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
+
+    def get_trainable_variables(self):
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
+
+    def get_initial_state(self):
+        return []
diff --git a/baselines/ppo.py b/baselines/ppo.py
new file mode 100644
index 0000000..2823db4
--- /dev/null
+++ b/baselines/ppo.py
@@ -0,0 +1,171 @@
+import tensorflow as tf
+import numpy as np
+import time
+from mpi4py import MPI
+from collections import deque
+from contextlib import contextmanager
+
+from common.logger import Logger
+from baselines.baselines_common import Dataset, explained_variance, fmt_row, zipsame
+import baselines.baselines_common.tf_util as U
+from baselines.baselines_common.mpi_adam import MpiAdam
+from baselines.baselines_common.mpi_saver import MpiSaver
+from baselines.baselines_common.mpi_moments import mpi_moments
+
+from baselines.trajectories import traj_segment_generator, add_vtarg_and_adv
+
+
+def learn(env, policy_func, args, *,
+          timesteps_per_batch,  # timesteps per actor per update
+          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
+          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
+          gamma, lam,  # advantage estimation
+          adam_epsilon=1e-5,
+          schedule='constant'):  # annealing for stepsize parameters (epsilon and adam),
+    """Run clipped-surrogate PPO training on `env` until `args.max_train_days` have elapsed.
+
+    `policy_func(name, ob_space, ac_space)` must build a policy; it is called twice ("pi" and
+    "oldpi"). `args` supplies `logdir`, `restore_actor_from`, `thread` and `max_train_days`.
+
+    Keyword-only hyper-parameters: `timesteps_per_batch` (rollout length per update),
+    `clip_param` (clipping epsilon), `entcoeff` (entropy coefficient), `optim_epochs`,
+    `optim_stepsize`, `optim_batchsize` (optimisation settings), `gamma`, `lam` (advantage
+    estimation) and `adam_epsilon`. `schedule` is accepted but currently not applied: the
+    learning-rate multiplier stays at 1.0. Scalars are logged under `<logdir>/thread_<thread>`.
+    Returns None.
+    """
+    # Setup losses and stuff
+    # ----------------------------------------
+    ob_space = env.observation_space
+    ac_space = env.action_space
+    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
+    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
+    atarg = tf.placeholder(dtype=tf.float32,
+                           shape=[None])  # Target advantage function (if applicable)
+    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
+
+    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
+                            shape=[])  # learning rate multiplier, updated with schedule
+    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon
+
+    ob = U.get_placeholder_cached(name="ob")
+    ac = pi.pdtype.sample_placeholder([None])
+
+    kloldnew = oldpi.pd.kl(pi.pd)
+    ent = pi.pd.entropy()
+    meankl = U.mean(kloldnew)
+    meanent = U.mean(ent)
+    pol_entpen = (-entcoeff) * meanent
+
+    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
+    surr1 = ratio * atarg  # surrogate from conservative policy iteration
+    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
+    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
+    vf_loss = U.mean(tf.square(pi.vpred - ret))
+    total_loss = pol_surr + pol_entpen + vf_loss
+    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
+    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
+
+    var_list = pi.get_trainable_variables()
+    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
+                             losses + [U.flatgrad(total_loss, var_list)])
+    adam = MpiAdam(var_list, epsilon=adam_epsilon)
+    policy_var_list = [v for v in var_list if v.name.split("/")[0].startswith("pi")]
+    saver = MpiSaver(policy_var_list, log_prefix=args.logdir)
+
+    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
+                                                    for (oldv, newv) in
+                                                    zipsame(oldpi.get_variables(),
+                                                            pi.get_variables())])
+    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
+
+    U.initialize()
+    saver.restore(restore_from=args.restore_actor_from)
+    adam.sync()
+
+    # Prepare for rollouts
+    # ----------------------------------------
+    seg_gen = traj_segment_generator(pi, env, args, timesteps_per_batch, stochastic=True)
+
+    episodes_so_far = 0
+    timesteps_so_far = 0
+    iters_so_far = 0
+    tstart = time.time()
+    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
+    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
+
+    cur_lrmult = 1.0  # constant multiplier: `schedule` is not applied
+
+    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
+    logger = Logger(args.logdir)
+
+    while time.time() - tstart < 86400 * args.max_train_days:
+        seg = next(seg_gen)
+        add_vtarg_and_adv(seg, gamma, lam)
+
+        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
+        vpredbefore = seg["vpred"]  # predicted value function before update
+        # NOTE(review): a batch of identical advantages gives std == 0 and NaNs here
+        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
+        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
+        optim_batchsize = optim_batchsize or ob.shape[0]
+
+        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy
+
+        assign_old_eq_new()  # set old parameter values to new parameter values
+        # Here we do a bunch of optimization epochs over the data
+        for _ in range(optim_epochs):
+            losses = []  # list of tuples, each of which gives the loss for a minibatch
+            for batch in d.iterate_once(optim_batchsize):
+                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
+                                            batch["vtarg"], cur_lrmult)
+                adam.update(g, optim_stepsize * cur_lrmult)
+                losses.append(newlosses)
+
+        saver.sync()
+        losses = []
+        for batch in d.iterate_once(optim_batchsize):
+            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
+                                       cur_lrmult)
+            losses.append(newlosses)
+        meanlosses, _, _ = mpi_moments(losses, axis=0)
+
+        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
+        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
+        lens, rews = map(flatten_lists, zip(*listoflrpairs))
+        lenbuffer.extend(lens)
+        rewbuffer.extend(rews)
+
+        episodes_so_far += len(lens)
+        timesteps_so_far += sum(lens)
+        iters_so_far += 1
+
+        # Logging
+        logger.scalar_summary("episodes", len(lens), iters_so_far)
+
+        for (lossname, lossval) in zip(loss_names, meanlosses):
+            logger.scalar_summary(lossname, lossval, episodes_so_far)
+
+        logger.scalar_summary("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), episodes_so_far)
+
+        # Until the first episode finishes the buffers are empty: np.max would raise
+        # ValueError and np.mean would yield NaN, so episode statistics are skipped.
+        if rewbuffer:
+            logger.scalar_summary("step", np.mean(lenbuffer), episodes_so_far)
+            logger.scalar_summary("reward", np.mean(rewbuffer), episodes_so_far)
+            logger.scalar_summary("best reward", np.max(rewbuffer), episodes_so_far)
+
+        elapsed_time = time.time() - tstart
+
+        logger.scalar_summary(
+            "episode per minute",
+            episodes_so_far / elapsed_time * 60,
+            episodes_so_far)
+        logger.scalar_summary(
+            "step per second",
+            timesteps_so_far / elapsed_time,
+            episodes_so_far)
+
+
+def flatten_lists(listoflists):
+    return [item for sub in listoflists for item in sub]  # one level of flattening, order preserved
diff --git a/baselines/train.py b/baselines/train.py
new file mode 100644
index 0000000..80bf696
--- /dev/null
+++ b/baselines/train.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# noinspection PyUnresolvedReferences
+
+import os
+import json
+import argparse
+from mpi4py import MPI
+
+from common.misc_util import boolean_flag, str2params, create_if_need
+from common.misc_util import set_global_seeds
+from common.env_wrappers import create_env
+
+from baselines.nets import Actor
+from baselines import trpo, ppo
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        '--agent',
+        type=str,
+        default="trpo",
+        choices=["trpo", "ppo"],
+        help='Which agent to use. (default: %(default)s)')
+
+    parser.add_argument('--seed', type=int, default=42)  # base seed; train() offsets it per MPI rank
+    parser.add_argument('--difficulty', type=int, default=2)
+    parser.add_argument('--max-obstacles', type=int, default=3)
+
+    parser.add_argument('--logdir', type=str, default="./logs")
+
+    boolean_flag(parser, "baseline-wrapper", default=False)
+    parser.add_argument('--skip-frames', type=int, default=1)
+    parser.add_argument('--reward-scale', type=float, default=1.)
+    parser.add_argument('--fail-reward', type=float, default=0.0)
+
+    parser.add_argument('--hid-size', type=int, default=64)  # passed to Actor as hid_size
+    parser.add_argument('--num-hid-layers', type=int, default=2)  # passed to Actor as num_hid_layers
+
+    parser.add_argument('--gamma', type=float, default=0.96)
+
+    parser.add_argument('--restore-args-from', type=str, default=None)  # JSON file read by restore_params
+    parser.add_argument('--restore-actor-from', type=str, default=None)
+
+    parser.add_argument(
+        '--max-train-days',
+        default=int(1e1),  # i.e. 10 days
+        type=int)
+
+    args = parser.parse_args()
+    return args
+
+
+def restore_params(args):
+    """Overwrite fields of `args` with values from the JSON file at `args.restore_args_from`.
+
+    Run-specific keys (seed, difficulty, max_obstacles, skip_frames and the two restore_*
+    paths) are never taken from the file, so the values already on `args` win.
+    Keys missing from the file are tolerated instead of raising KeyError.
+    """
+    with open(args.restore_args_from, "r") as fin:
+        params = json.load(fin)
+    run_specific = ("seed", "difficulty", "max_obstacles", "skip_frames",
+                    "restore_args_from", "restore_actor_from")
+    for key in run_specific:
+        params.pop(key, None)
+    for key, value in params.items():
+        setattr(args, key, value)
+    return args
+        
+
+def train(args):
+    """Build the env and policy factory for this MPI rank, then run the selected agent (trpo or ppo)."""
+    import baselines.baselines_common.tf_util as U
+
+    sess = U.single_threaded_session()
+    sess.__enter__()  # entered for the rest of the process; never exited explicitly
+
+    if args.restore_args_from is not None:
+        args = restore_params(args)
+
+    rank = MPI.COMM_WORLD.Get_rank()
+    workerseed = args.seed + 241 * rank  # distinct seed per MPI worker
+    set_global_seeds(workerseed)
+
+    def policy_fn(name, ob_space, ac_space):
+        return Actor(
+            name=name,
+            ob_space=ob_space, ac_space=ac_space,
+            hid_size=args.hid_size, num_hid_layers=args.num_hid_layers,
+            noise_type=args.noise_type)
+
+    env = create_env(args)
+    env.seed(workerseed)
+
+    if rank == 0:
+        create_if_need(args.logdir)
+        with open("{}/args.json".format(args.logdir), "w") as fout:
+            json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)
+
+    try:
+        args.thread = rank
+        if args.agent == "trpo":
+            trpo.learn(
+                env, policy_fn, args,
+                timesteps_per_batch=1024,
+                gamma=args.gamma,
+                lam=0.98,
+                max_kl=0.01,
+                cg_iters=10,
+                cg_damping=0.1,
+                vf_iters=5,
+                vf_stepsize=1e-3)
+        elif args.agent == "ppo":
+            # optimal settings:
+            # timesteps_per_batch = optim_epochs *  optim_batchsize
+            ppo.learn(
+                env, policy_fn, args,
+                timesteps_per_batch=256,
+                gamma=args.gamma,
+                lam=0.95,
+                clip_param=0.2,
+                entcoeff=0.0,
+                optim_epochs=4,
+                optim_stepsize=3e-4,
+                optim_batchsize=64,
+                schedule='constant')
+        else:
+            raise NotImplementedError
+    except KeyboardInterrupt:
+        print("closing envs...")
+    finally:
+        env.close()  # release the environment even when training raises
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    args.noise_type = "gaussian"  # not a CLI flag; train() forwards it to Actor through policy_fn
+    train(args)
diff --git a/baselines/trajectories.py b/baselines/trajectories.py
new file mode 100644
index 0000000..47482b0
--- /dev/null
+++ b/baselines/trajectories.py
@@ -0,0 +1,76 @@
+import numpy as np
+
+
+def traj_segment_generator(pi, env, args, horizon, stochastic):
+    """Endlessly step ``env`` with ``pi``, yielding a dict of rollout arrays every ``horizon`` steps."""
+    t = 0
+    ac = env.action_space.sample()  # not used, just so we have the datatype
+    new = True  # marks if we're on first timestep of an episode
+    ob = env.reset(difficulty=args.difficulty)
+
+    cur_ep_ret = 0  # return in current episode
+    cur_ep_len = 0  # len of current episode
+    ep_rets = []  # returns of completed episodes in this segment
+    ep_lens = []  # lengths of completed episodes in this segment
+
+    # Initialize history arrays (allocated once and overwritten in place for every segment)
+    obs = np.array([ob for _ in range(horizon)])
+    rews = np.zeros(horizon, 'float32')
+    vpreds = np.zeros(horizon, 'float32')
+    news = np.zeros(horizon, 'int32')
+    acs = np.array([ac for _ in range(horizon)])
+    prevacs = acs.copy()
+
+    while True:
+        prevac = ac
+        ac, vpred = pi.act(stochastic, ob)
+        # Slight weirdness here because we need value function at time T
+        # before returning segment [0, T-1] so we get the correct
+        # terminal value
+        if t > 0 and t % horizon == 0:
+            yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
+                   "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
+                   "ep_rets": ep_rets, "ep_lens": ep_lens}
+            # @TODO: TRPO & PPO implementation diff
+            # _, vpred = pi.act(stochastic, ob)  # @TODO: uncomment??? IMPORTANT!!
+            # Be careful!!! if you change the downstream algorithm to aggregate
+            # several of these batches, then be sure to do a deepcopy
+            ep_rets = []  # fresh lists: the dict yielded above keeps the previous ones
+            ep_lens = []
+        i = t % horizon  # slot of this step inside the reused history arrays
+        obs[i] = ob
+        vpreds[i] = vpred
+        news[i] = new
+        acs[i] = ac
+        prevacs[i] = prevac
+
+        ob, rew, new, _ = env.step(ac)  # the info element of the step result is ignored
+        rews[i] = rew
+
+        cur_ep_ret += rew
+        cur_ep_len += 1
+        if new:  # episode finished: record its totals and start a new one
+            ep_rets.append(cur_ep_ret)
+            ep_lens.append(cur_ep_len)
+            cur_ep_ret = 0
+            cur_ep_len = 0
+            ob = env.reset(difficulty=args.difficulty)
+        t += 1
+
+
+def add_vtarg_and_adv(seg, gamma, lam):
+    """
+    Fill seg["adv"] with GAE(lambda) advantages and seg["tdlamret"] with TD(lambda) value targets
+    """
+    rewards = seg["rew"]
+    horizon = len(rewards)
+    # one extra trailing slot each; it only feeds the last step and is expected to be zeroed on episode end
+    values = np.append(seg["vpred"], seg["nextvpred"])
+    starts = np.append(seg["new"], 0)
+    advantages = seg["adv"] = np.empty(horizon, 'float32')
+    running = 0
+    for t in range(horizon - 1, -1, -1):
+        keep = 1 - starts[t + 1]
+        td_residual = rewards[t] + gamma * values[t + 1] * keep - values[t]
+        advantages[t] = running = td_residual + gamma * lam * keep * running
+    seg["tdlamret"] = seg["adv"] + seg["vpred"]
diff --git a/baselines/trpo.py b/baselines/trpo.py
new file mode 100644
index 0000000..bf3d465
--- /dev/null
+++ b/baselines/trpo.py
@@ -0,0 +1,243 @@
+import tensorflow as tf
+import numpy as np
+import time
+from mpi4py import MPI
+from collections import deque
+from contextlib import contextmanager
+
+from common.logger import Logger
+
+from baselines.baselines_common import explained_variance, zipsame, dataset
+import baselines.baselines_common.tf_util as U
+from baselines.baselines_common import colorize
+from baselines.baselines_common.mpi_adam import MpiAdam
+from baselines.baselines_common.mpi_saver import MpiSaver
+from baselines.baselines_common.cg import cg
+
+from baselines.trajectories import traj_segment_generator, add_vtarg_and_adv
+
+
+def learn(env, policy_func, args, *,
+          timesteps_per_batch,  # what to train on
+          max_kl, cg_iters,
+          gamma, lam,  # advantage estimation
+          entcoeff=0.0,
+          cg_damping=1e-2,
+          vf_stepsize=3e-4,
+          vf_iters=3):
+    """Run MPI-averaged TRPO on ``env`` until ``args.max_train_days`` days have elapsed.
+
+    Each iteration samples ``timesteps_per_batch`` steps, takes a KL-constrained policy step
+    (conjugate gradient + backtracking line search), then fits the value function with MpiAdam.
+    Reads ``args.logdir``, ``args.restore_actor_from`` and ``args.thread``; forwards ``args`` to
+    the rollout generator; rewrites ``args.logdir`` to the per-thread log directory.
+    """
+    nworkers = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    np.set_printoptions(precision=3)
+    # Setup losses and stuff
+    # ----------------------------------------
+    ob_space = env.observation_space
+    ac_space = env.action_space
+    pi = policy_func("pi", ob_space, ac_space)
+    oldpi = policy_func("oldpi", ob_space, ac_space)
+    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
+    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
+
+    ob = U.get_placeholder_cached(name="ob")
+    ac = pi.pdtype.sample_placeholder([None])
+
+    kloldnew = oldpi.pd.kl(pi.pd)
+    ent = pi.pd.entropy()
+    meankl = U.mean(kloldnew)
+    meanent = U.mean(ent)
+    entbonus = entcoeff * meanent
+
+    vferr = U.mean(tf.square(pi.vpred - ret))
+
+    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
+    surrgain = U.mean(ratio * atarg)
+
+    optimgain = surrgain + entbonus
+    losses = [optimgain, meankl, entbonus, surrgain, meanent]
+    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
+
+    dist = meankl
+
+    all_var_list = pi.get_trainable_variables()
+    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
+    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    vfadam = MpiAdam(vf_var_list)
+
+    policy_var_list = [v for v in all_var_list if v.name.split("/")[0].startswith("pi")]
+    saver = MpiSaver(policy_var_list, log_prefix=args.logdir)
+
+    get_flat = U.GetFlat(var_list)
+    set_from_flat = U.SetFromFlat(var_list)
+    klgrads = tf.gradients(dist, var_list)
+    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
+    shapes = [var.get_shape().as_list() for var in var_list]
+    start = 0
+    tangents = []
+    for shape in shapes:
+        sz = U.intprod(shape)
+        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
+        start += sz
+    gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in
+                    zipsame(klgrads, tangents)])  # pylint: disable=E1111
+    fvp = U.flatgrad(gvp, var_list)
+
+    assign_old_eq_new = U.function(
+        [], [],
+        updates=[tf.assign(oldv, newv)
+                 for (oldv, newv) in
+                 zipsame(oldpi.get_variables(), pi.get_variables())])
+    compute_losses = U.function([ob, ac, atarg], losses)
+    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
+    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
+    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))
+
+    @contextmanager
+    def timed(msg):
+        if rank == 0:
+            print(colorize(msg, color='magenta'))
+            tstart = time.time()
+            yield
+            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
+        else:
+            yield
+
+    def allmean(x):
+        assert isinstance(x, np.ndarray)
+        out = np.empty_like(x)
+        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
+        out /= nworkers
+        return out
+
+    U.initialize()
+    saver.restore(restore_from=args.restore_actor_from)
+    th_init = get_flat()
+    MPI.COMM_WORLD.Bcast(th_init, root=0)
+    set_from_flat(th_init)
+    vfadam.sync()
+    print("Init param sum", th_init.sum(), flush=True)
+
+    # Prepare for rollouts
+    # ----------------------------------------
+    seg_gen = traj_segment_generator(pi, env, args, timesteps_per_batch, stochastic=True)
+
+    episodes_so_far = 0
+    timesteps_so_far = 0
+    iters_so_far = 0
+    tstart = time.time()
+    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
+    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
+
+    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
+    logger = Logger(args.logdir)
+
+    while time.time() - tstart < 86400 * args.max_train_days:
+        meanlosses = [0] * len(loss_names)
+        with timed("sampling"):
+            seg = next(seg_gen)
+        add_vtarg_and_adv(seg, gamma, lam)
+
+        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
+        vpredbefore = seg["vpred"]  # predicted value function before update
+        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
+
+        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
+        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy
+
+        segargs = seg["ob"], seg["ac"], atarg  # feed the standardized advantages, not the raw seg["adv"]
+        fvpargs = [arr[::5] for arr in segargs]
+
+        def fisher_vector_product(p):
+            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
+
+        assign_old_eq_new()  # set old parameter values to new parameter values
+        with timed("computegrad"):
+            *lossbefore, g = compute_lossandgrad(*segargs)
+        lossbefore = allmean(np.array(lossbefore))
+        g = allmean(g)
+        if np.allclose(g, 0):
+            pass  # zero gradient: skip the policy update for this batch
+        else:
+            with timed("cg"):
+                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
+            assert np.isfinite(stepdir).all()
+            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
+            lm = np.sqrt(shs / max_kl)
+            fullstep = stepdir / lm
+            expectedimprove = g.dot(fullstep)
+            surrbefore = lossbefore[0]
+            stepsize = 1.0
+            thbefore = get_flat()
+            for _ in range(10):
+                thnew = thbefore + fullstep * stepsize
+                set_from_flat(thnew)
+                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*segargs)))
+                improve = surr - surrbefore
+                if not np.isfinite(meanlosses).all():
+                    pass  # non-finite losses: shrink the step and retry
+                elif kl > max_kl * 1.5:
+                    pass  # KL constraint violated: shrink the step and retry
+                elif improve < 0:
+                    pass  # surrogate did not improve: shrink the step and retry
+                else:
+                    # step accepted: thnew is already written to the policy variables
+                    break
+                stepsize *= .5
+            else:
+                set_from_flat(thbefore)  # no acceptable step in 10 tries: restore the old parameters
+            if nworkers > 1 and iters_so_far % 20 == 0:
+                paramsums = MPI.COMM_WORLD.allgather(
+                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
+                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
+
+        with timed("vf"):
+            for _ in range(vf_iters):
+                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
+                                                         include_final_partial_batch=False,
+                                                         batch_size=64):
+                    g = allmean(compute_vflossandgrad(mbob, mbret))
+                    vfadam.update(g, vf_stepsize)
+
+        saver.sync()
+
+        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
+        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
+        lens, rews = map(flatten_lists, zip(*listoflrpairs))
+        lenbuffer.extend(lens)
+        rewbuffer.extend(rews)
+
+        episodes_so_far += len(lens)
+        timesteps_so_far += sum(lens)
+        iters_so_far += 1
+
+        # Logging
+        logger.scalar_summary("episodes", len(lens), iters_so_far)
+
+        for (lossname, lossval) in zip(loss_names, meanlosses):
+            logger.scalar_summary(lossname, lossval, episodes_so_far)
+
+        logger.scalar_summary("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), episodes_so_far)
+
+        logger.scalar_summary("step", np.mean(lenbuffer), episodes_so_far)
+        logger.scalar_summary("reward", np.mean(rewbuffer), episodes_so_far)
+        logger.scalar_summary("best reward", np.max(rewbuffer), episodes_so_far)
+
+        elapsed_time = time.time() - tstart
+
+        logger.scalar_summary(
+            "episode per minute",
+            episodes_so_far / elapsed_time * 60,
+            episodes_so_far)
+        logger.scalar_summary(
+            "step per second",
+            timesteps_so_far / elapsed_time,
+            episodes_so_far)
+
+
+def flatten_lists(listoflists):
+    return [item for sublist in listoflists for item in sublist]  # concatenate one nesting level
diff --git a/common/__init__.py b/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/common/buffers.py b/common/buffers.py
new file mode 100644
index 0000000..8bc3c44
--- /dev/null
+++ b/common/buffers.py
@@ -0,0 +1,215 @@
+import random
+
+import numpy as np
+
+from common.segment_tree import SumSegmentTree, MinSegmentTree
+
+
+class ReplayBuffer(object):
+    """Fixed-capacity ring buffer of (obs_t, action, reward, obs_tp1, done) transitions."""
+    def __init__(self, size):
+        """Create Replay buffer.
+
+        Parameters
+        ----------
+        size: int
+            Max number of transitions to store in the buffer. When the buffer
+            overflows the old memories are dropped.
+        """
+        self._storage = []
+        self._maxsize = size
+        self._next_idx = 0
+
+    def __len__(self):
+        return len(self._storage)
+
+    def add(self, obs_t, action, reward, obs_tp1, done):
+        """Store one transition, overwriting the oldest slot once ``size`` entries are held."""
+        data = (obs_t, action, reward, obs_tp1, done)
+
+        if self._next_idx >= len(self._storage):
+            self._storage.append(data)
+        else:
+            self._storage[self._next_idx] = data
+        self._next_idx = (self._next_idx + 1) % self._maxsize
+
+    def _encode_sample(self, idxes):
+        """Stack the transitions stored at ``idxes`` into a tuple of five batch arrays."""
+        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
+        for i in idxes:
+            obs_t, action, reward, obs_tp1, done = self._storage[i]
+            # np.asarray copies only when needed; np.array(..., copy=False) raises on NumPy >= 2
+            obses_t.append(np.asarray(obs_t))
+            actions.append(np.asarray(action))
+            rewards.append(reward)
+            obses_tp1.append(np.asarray(obs_tp1))
+            dones.append(done)
+        return (np.array(obses_t), np.array(actions), np.array(rewards),
+                np.array(obses_tp1), np.array(dones))
+
+    def sample(self, batch_size):
+        """Sample a batch of experiences.
+
+        Parameters
+        ----------
+        batch_size: int
+            How many transitions to sample.
+
+        Returns
+        -------
+        obs_batch: np.array
+            batch of observations
+        act_batch: np.array
+            batch of actions executed given obs_batch
+        rew_batch: np.array
+            rewards received as results of executing act_batch
+        next_obs_batch: np.array
+            next set of observations seen after executing act_batch
+        done_mask: np.array
+            done_mask[i] = 1 if executing act_batch[i] resulted in
+            the end of an episode and 0 otherwise.
+        """
+        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
+        return self._encode_sample(idxes)
+
+
+class PrioritizedReplayBuffer(ReplayBuffer):
+    def __init__(self, size, alpha=0.5):
+        """Create Prioritized Replay buffer.
+
+        Parameters
+        ----------
+        size: int
+            Max number of transitions to store in the buffer. When the buffer
+            overflows the old memories are dropped.
+        alpha: float
+            how much prioritization is used
+            (must be > 0; 1 - full prioritization)
+
+        See Also
+        --------
+        ReplayBuffer.__init__
+        """
+        super(PrioritizedReplayBuffer, self).__init__(size)
+        assert alpha > 0
+        self._alpha = alpha
+
+        it_capacity = 1
+        while it_capacity < size:
+            it_capacity *= 2
+
+        self._it_sum = SumSegmentTree(it_capacity)
+        self._it_min = MinSegmentTree(it_capacity)
+        self._max_priority = 1.0
+
+    def add(self, *args, **kwargs):
+        """See ReplayBuffer.add"""
+        idx = self._next_idx
+        super().add(*args, **kwargs)
+        self._it_sum[idx] = self._max_priority ** self._alpha
+        self._it_min[idx] = self._max_priority ** self._alpha
+
+    def _sample_proportional(self, batch_size):
+        res = []
+        for _ in range(batch_size):
+            # TODO(szymon): should we ensure no repeats?
+            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
+            idx = self._it_sum.find_prefixsum_idx(mass)
+            res.append(idx)
+        return res
+
+    def sample(self, batch_size, beta=0.5):
+        """Sample a batch of experiences.
+
+        compared to ReplayBuffer.sample
+        it also returns importance weights and idxes
+        of sampled experiences.
+
+
+        Parameters
+        ----------
+        batch_size: int
+            How many transitions to sample.
+        beta: float
+            To what degree to use importance weights
+            (must be > 0; 1 - full correction)
+
+        Returns
+        -------
+        obs_batch: np.array
+            batch of observations
+        act_batch: np.array
+            batch of actions executed given obs_batch
+        rew_batch: np.array
+            rewards received as results of executing act_batch
+        next_obs_batch: np.array
+            next set of observations seen after executing act_batch
+        done_mask: np.array
+            done_mask[i] = 1 if executing act_batch[i] resulted in
+            the end of an episode and 0 otherwise.
+        weights: np.array
+            Array of shape (batch_size,)
+            denoting importance weight of each sampled transition
+        idxes: [int]
+            List of length batch_size with the
+            indexes in buffer of sampled experiences
+        """
+        assert beta > 0
+
+        idxes = self._sample_proportional(batch_size)
+
+        weights = []
+        p_min = self._it_min.min() / self._it_sum.sum()
+        max_weight = (p_min * len(self._storage)) ** (-beta)
+
+        for idx in idxes:
+            p_sample = self._it_sum[idx] / self._it_sum.sum()
+            weight = (p_sample * len(self._storage)) ** (-beta)
+            weights.append(weight / max_weight)
+        weights = np.array(weights)
+        encoded_sample = self._encode_sample(idxes)
+        return tuple(list(encoded_sample) + [weights, idxes])
+
+    def update_priorities(self, idxes, priorities):
+        """Update priorities of sampled transitions.
+
+        sets priority of transition at index idxes[i] in buffer
+        to priorities[i].
+
+        Parameters
+        ----------
+        idxes: [int]
+            List of idxes of sampled transitions
+        priorities: [float]
+            List of updated priorities corresponding to
+            transitions at the sampled idxes denoted by
+            variable `idxes`.
+        """
+        assert len(idxes) == len(priorities)
+        for idx, priority in zip(idxes, priorities):
+            assert priority > 0
+            assert 0 <= idx < len(self._storage)
+            self._it_sum[idx] = priority ** self._alpha
+            self._it_min[idx] = priority ** self._alpha
+
+            self._max_priority = max(self._max_priority, priority)
+
+buffers = {  # replay buffer classes keyed by name
+    "simple": ReplayBuffer,
+    "prioritized": PrioritizedReplayBuffer
+}
+
+
+def buffer_generator(buffer, batch_size=32):  # coroutine: send() a transition, get back a sampled batch
+    result = None  # value produced by the first yield, before any transition has been stored
+    while True:
+        observation, action, reward, next_observation, done = yield result
+        buffer.add(observation, action, reward, next_observation, done)
+        result = buffer.sample(batch_size=batch_size)
+
+
+def create_buffer(args):
+    """Build the replay buffer selected by ``args.prioritized_replay``, sized by ``args.buffer_size``."""
+    if not args.prioritized_replay:
+        return ReplayBuffer(args.buffer_size)
+    return PrioritizedReplayBuffer(args.buffer_size, alpha=args.prioritized_replay_alpha)
\ No newline at end of file
diff --git a/common/env_wrappers.py b/common/env_wrappers.py
new file mode 100644
index 0000000..b323757
--- /dev/null
+++ b/common/env_wrappers.py
@@ -0,0 +1,95 @@
+import numpy as np
+import gym
+from gym.spaces import Box
+from osim.env import RunEnv
+
+from common.state_transform import StateVelCentr
+
+
+class DdpgWrapper(gym.Wrapper):  # frame-skipping, reward-shaping wrapper; actions are given in [-1, 1]
+    def __init__(self, env, args):
+        gym.Wrapper.__init__(self, env)
+        self.state_transform = StateVelCentr(
+            obstacles_mode='standard',
+            exclude_centr=True,
+            vel_states=[])
+        self.observation_space = Box(-1000, 1000, self.state_transform.state_size)
+        self.skip_frames = args.skip_frames  # max env steps taken per wrapper step
+        self.reward_scale = args.reward_scale  # multiplier applied to the summed reward
+        self.fail_reward = args.fail_reward  # added when an episode ends before env step 1000
+        # agent actions live in [-1, 1], the wrapped env receives [0, 1]
+        action_mean = .5
+        action_std = .5
+        self.normalize_action = lambda x: (x - action_mean) / action_std
+        self.denormalise_action = lambda x: x * action_std + action_mean
+
+    def reset(self, **kwargs):
+        return self._reset(**kwargs)
+
+    def _reset(self, **kwargs):
+        observation = self.env.reset(**kwargs)
+        self.env_step = 0  # env steps taken in the current episode
+        self.state_transform.reset()
+        observation, _ = self.state_transform.process(observation)
+        observation = self.observation(observation)
+        return observation
+
+    def _step(self, action):
+        action = self.denormalise_action(action)
+        total_reward = 0.
+        for _ in range(self.skip_frames):  # repeat the same action, summing the rewards
+            observation, reward, done, _ = self.env.step(action)
+            observation, obst_rew = self.state_transform.process(observation)
+            total_reward += reward + obst_rew
+            self.env_step += 1
+            if done:
+                if self.env_step < 1000:  # hardcoded; presumably the env's episode step limit - TODO confirm
+                    total_reward += self.fail_reward
+                break
+
+        observation = self.observation(observation)
+        total_reward *= self.reward_scale
+        return observation, total_reward, done, None  # the info slot is always None
+
+    def observation(self, observation):
+        return self._observation(observation)
+
+    def _observation(self, observation):
+        observation = np.array(observation, dtype=np.float32)  # cast to a float32 array
+        return observation
+
+
+def create_env(args):
+    env = RunEnv(visualize=False, max_obstacles=args.max_obstacles)
+    # NOTE(review): hasattr() is true whenever the attribute exists, even when its value is False - confirm intent
+    if hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper"):
+        env = DdpgWrapper(env, args)
+
+    return env
+
+
+def create_observation_handler(args):
+    """Return a callable that casts observations to float32, plus StateVelCentr if a wrapper attribute exists."""
+    if hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper"):
+        state_transform = StateVelCentr(
+            obstacles_mode='standard',
+            exclude_centr=True,
+            vel_states=[])
+
+        def observation_handler(observation, previous_action=None):  # previous_action is unused
+            observation = np.array(observation, dtype=np.float32)
+            observation, _ = state_transform.process(observation)  # second return value is discarded
+            return observation
+    else:
+        def observation_handler(observation, previous_action=None):  # previous_action is unused
+            observation = np.array(observation, dtype=np.float32)
+            return observation
+
+    return observation_handler
+
+
+def create_action_handler(args):
+    """Return a callable mapping an action ``x`` to ``x * 0.5 + 0.5``; ``args`` is not used."""
+    def action_handler(x):
+        return x * .5 + .5
+    return action_handler
diff --git a/common/loss.py b/common/loss.py
new file mode 100644
index 0000000..2aee7e8
--- /dev/null
+++ b/common/loss.py
@@ -0,0 +1,60 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def create_linear_decay_fn(initial_value, final_value, max_step):
+    """Linear schedule from ``initial_value`` to ``final_value``, held at ``final_value`` after ``max_step``."""
+    def decay_fn(step):
+        relative = 1. - min(step, max_step) / max_step  # clamp: never extrapolate past final_value
+        return initial_value * relative + final_value * (1. - relative)
+    return decay_fn
+
+
+def create_cycle_decay_fn(initial_value, final_value, cycle_len, num_cycles):
+    """Cosine cycles of length ``cycle_len`` whose amplitude shrinks linearly to zero above ``final_value``."""
+    max_step = cycle_len * num_cycles
+
+    def decay_fn(step):
+        relative = 1. - min(step, max_step) / max_step  # clamp: stay at final_value after the last cycle
+        relative_cosine = 0.5 * (np.cos(np.pi * np.mod(step, cycle_len) / cycle_len) + 1.0)
+        return relative_cosine * (initial_value - final_value) * relative + final_value
+    return decay_fn
+
+
+def create_decay_fn(decay_type, **kwargs):
+    """Build a "linear" or "cycle" decay function from ``kwargs``; other types raise NotImplementedError."""
+    if decay_type == "linear":
+        return create_linear_decay_fn(**kwargs)
+    if decay_type == "cycle":
+        return create_cycle_decay_fn(**kwargs)
+    raise NotImplementedError()
+
+
+class QuadricLinearLoss(nn.Module):
+    """Weighted Huber-style loss: quadratic for |error| <= clip_delta, linear beyond it."""
+
+    def __init__(self, clip_delta):
+        super(QuadricLinearLoss, self).__init__()
+        self.clip_delta = clip_delta
+
+    def forward(self, y_pred, y_true, weights):
+        abs_error = torch.abs(y_true - y_pred)
+        clipped = torch.clamp(abs_error, max=self.clip_delta)
+        overshoot = abs_error - clipped
+        per_sample = 0.5 * clipped ** 2 + self.clip_delta * overshoot
+        return torch.mean(per_sample * weights)
+
+losses = {  # loss classes keyed by name
+    "mse": nn.MSELoss,
+    "quadric-linear": QuadricLinearLoss
+}
+
+
+def create_loss(args):
+    """Instantiate the loss selected by ``args.loss_type`` ("mse" or "quadric-linear")."""
+    if args.loss_type == "mse":
+        return nn.MSELoss()
+    if args.loss_type == "quadric-linear":
+        return QuadricLinearLoss(clip_delta=args.clip_delta)
+    raise NotImplementedError()
diff --git a/common/misc_util.py b/common/misc_util.py
new file mode 100644
index 0000000..ff4cf6e
--- /dev/null
+++ b/common/misc_util.py
@@ -0,0 +1,88 @@
+import os
+import sys
+import random
+import numpy as np
+
+
+def create_if_need(path):
+    """Create directory ``path`` (and missing parents) unless it already exists."""
+    os.makedirs(path, exist_ok=True)  # no exists()/makedirs() race between concurrent workers
+
+
+def boolean_flag(parser, name, default=False, help=None):
+    """Register a paired ``--<name>`` / ``--no-<name>`` switch on an argparse parser.
+
+    Parameters
+    ----------
+    parser: argparse.Parser
+        parser that receives both options
+    name: str
+        --<name> stores True, --no-<name> stores False (dashes become underscores in the attribute)
+    default: bool or None
+        value used when neither option is given
+    help: str
+        help text attached to --<name>
+    """
+    target = name.replace('-', '_')
+    parser.add_argument("--{}".format(name), dest=target, action="store_true", default=default, help=help)
+    parser.add_argument("--no-{}".format(name), dest=target, action="store_false")
+
+
+def str2params(string, delimeter="-"):
+    """Parse e.g. "64-32" into [64, 32]; return None when ``string`` cannot be parsed."""
+    try:
+        return list(map(int, string.split(delimeter)))
+    except (AttributeError, TypeError, ValueError):  # not a bare except: let KeyboardInterrupt through
+        return None
+
+
+def set_global_seeds(i):  # seed torch and tensorflow (when importable), numpy and random with i
+    try:
+        import torch
+    except ImportError:
+        pass  # torch is optional here
+    else:
+        torch.manual_seed(i)
+    try:
+        import tensorflow as tf
+    except ImportError:
+        pass  # tensorflow is optional here
+    else:
+        tf.set_random_seed(i)
+    np.random.seed(i)
+    random.seed(i)
+
+
+def query_yes_no(question, default="no"):
+    """Ask a yes/no question via input() and return their answer.
+
+    "question" is a string that is presented to the user.
+    "default" is the presumed answer if the user just hits <Enter>.
+        It must be "yes", "no" (the default) or None (meaning
+        an answer is required of the user).
+
+    The "answer" return value is True for "yes" or False for "no".
+    """
+    valid = {
+        "yes": True, "y": True, "ye": True,
+        "no": False, "n": False
+    }
+    if default is None:
+        prompt = " [y/n] "
+    elif default == "yes":
+        prompt = " [Y/n] "
+    elif default == "no":
+        prompt = " [y/N] "
+    else:
+        raise ValueError("invalid default answer: '%s'" % default)
+
+    while True:  # keep asking until a recognised answer (or an empty one with a default) arrives
+        sys.stdout.write(question + prompt)
+        choice = input().lower()
+        if default is not None and choice == '':
+            return valid[default]
+        elif choice in valid:
+            return valid[choice]
+        else:
+            sys.stdout.write("Please respond with 'yes' or 'no' "
+                             "(or 'y' or 'n').\n")
\ No newline at end of file
diff --git a/common/modules/LayerNorm.py b/common/modules/LayerNorm.py
new file mode 100644
index 0000000..34b1e74
--- /dev/null
+++ b/common/modules/LayerNorm.py
@@ -0,0 +1,15 @@
+import torch
+import torch.nn as nn
+
+
+class LayerNorm(nn.Module):
+    """Normalise the last dimension by its mean and std, then apply learnable gamma and beta."""
+    def __init__(self, features, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(features))
+        self.beta = nn.Parameter(torch.zeros(features))
+
+    def forward(self, x):
+        centered = x - x.mean(-1, keepdim=True)
+        return self.gamma * centered / (x.std(-1, keepdim=True) + self.eps) + self.beta
diff --git a/common/modules/NoisyLinear.py b/common/modules/NoisyLinear.py
new file mode 100644
index 0000000..c9e33a6
--- /dev/null
+++ b/common/modules/NoisyLinear.py
@@ -0,0 +1,92 @@
+import math
+
+import torch
+from torch.nn.parameter import Parameter
+import torch.nn.functional as F
+from torch.nn.modules.module import Module
+from torch.autograd import Variable
+
+
+class NoisyLinear(Module):
+    r"""Applies a noisy linear transformation to the incoming data:
+    :math:`y = (mu_w + sigma_w \cdot epsilon_w)x + mu_b + sigma_b \cdot epsilon_b`
+    More details can be found in the paper `Noisy Networks for Exploration` _ .
+    Args:
+        in_features: size of each input sample
+        out_features: size of each output sample
+        bias: If set to False, the layer will not learn an additive bias. Default: True
+        factorised: whether or not to use factorised noise. Default: True
+        std_init: initialization constant for standard deviation component of weights. If None,
+            defaults to 0.017 for independent and 0.4 for factorised. Default: None
+    Shape:
+        - Input: :math:`(N, in\_features)`
+        - Output: :math:`(N, out\_features)`
+    Attributes:
+        weight_mu, weight_sigma: learnable mean / noise scale of the weights (out_features x in_features)
+        bias_mu, bias_sigma:     learnable mean / noise scale of the bias (out_features); None if bias=False
+    Examples::
+        >>> m = NoisyLinear(20, 30)
+        >>> input = Variable(torch.randn(128, 20))
+        >>> output = m(input)
+        >>> print(output.size())
+    """
+
+    def __init__(self, in_features, out_features, bias=True, factorised=True, std_init=None):
+        super(NoisyLinear, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.factorised = factorised
+        self.weight_mu = Parameter(torch.Tensor(out_features, in_features))
+        self.weight_sigma = Parameter(torch.Tensor(out_features, in_features))
+        if bias:
+            self.bias_mu = Parameter(torch.Tensor(out_features))
+            self.bias_sigma = Parameter(torch.Tensor(out_features))
+        else:
+            self.register_parameter('bias', None)
+            self.register_parameter('bias_mu', None)
+            self.register_parameter('bias_sigma', None)
+        # default std: 0.4 for factorised noise, 0.017 for independent noise
+        self.std_init = std_init if std_init else (0.4 if self.factorised else 0.017)
+        self.reset_parameters(bias)
+
+    def reset_parameters(self, bias):
+        """Initialise mu uniformly and sigma to a constant; ``bias`` says whether bias parameters exist."""
+        if self.factorised:
+            mu_range = 1. / math.sqrt(self.weight_mu.size(1))
+            self.weight_mu.data.uniform_(-mu_range, mu_range)
+            self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
+            if bias:
+                self.bias_mu.data.uniform_(-mu_range, mu_range)
+                self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
+        else:
+            mu_range = math.sqrt(3. / self.weight_mu.size(1))
+            self.weight_mu.data.uniform_(-mu_range, mu_range)
+            self.weight_sigma.data.fill_(self.std_init)
+            if bias:
+                self.bias_mu.data.uniform_(-mu_range, mu_range)
+                self.bias_sigma.data.fill_(self.std_init)
+
+    def scale_noise(self, size):
+        """Return ``size`` standard-normal samples transformed by sign(x) * sqrt(|x|)."""
+        x = torch.Tensor(size).normal_()
+        x = x.sign().mul(x.abs().sqrt())
+        return x
+
+    def forward(self, input):
+        if self.factorised:
+            epsilon_in = self.scale_noise(self.in_features)
+            epsilon_out = self.scale_noise(self.out_features)
+            weight_epsilon = Variable(epsilon_out.ger(epsilon_in))
+            bias_epsilon = Variable(self.scale_noise(self.out_features))
+        else:
+            weight_epsilon = Variable(torch.Tensor(self.out_features, self.in_features).normal_())
+            bias_epsilon = Variable(torch.Tensor(self.out_features).normal_())
+        weight = self.weight_mu + self.weight_sigma.mul(weight_epsilon)
+        # bias=False leaves bias_mu / bias_sigma as None: apply the noisy weights without a bias term
+        bias = None if self.bias_mu is None else self.bias_mu + self.bias_sigma.mul(bias_epsilon)
+        return F.linear(input, weight, bias)
+
+    def __repr__(self):
+        return self.__class__.__name__ + ' (' \
+               + str(self.in_features) + ' -> ' \
+               + str(self.out_features) + ')'
diff --git a/common/modules/__init__.py b/common/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/common/nets.py b/common/nets.py
new file mode 100644
index 0000000..ab619c9
--- /dev/null
+++ b/common/nets.py
@@ -0,0 +1,42 @@
+from collections import OrderedDict
+from itertools import tee
+
+import torch
+import torch.nn as nn
+
+from common.modules.LayerNorm import LayerNorm
+
+
+def pairwise(iterable):
+    """Return overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ..."""
+    lead, follow = tee(iterable)
+    next(follow, None)  # shift the second copy forward by one element
+    return zip(lead, follow)
+
+
+class LinearNet(nn.Module):
+    """Feed-forward stack of ``linear -> [layer norm] -> activation`` stages.
+
+    `layers` lists the layer widths; every consecutive pair (n_in, n_out)
+    becomes one stage, so the activation also follows the last linear layer.
+    """
+
+    def __init__(self, layers, activation=torch.nn.ELU,
+                 layer_norm=False, linear_layer=nn.Linear):
+        super(LinearNet, self).__init__()
+        self.input_shape = layers[0]
+        self.output_shape = layers[-1]
+
+        # Sub-module names carry the stage index: linear_0, layer_norm_0, act_0, ...
+        stages = OrderedDict()
+        for idx, (n_in, n_out) in enumerate(pairwise(layers)):
+            stages["linear_{}".format(idx)] = linear_layer(n_in, n_out)
+            if layer_norm:
+                stages["layer_norm_{}".format(idx)] = LayerNorm(n_out)
+            stages["act_{}".format(idx)] = activation()
+        self.net = torch.nn.Sequential(stages)
+
+    def forward(self, x):
+        # Delegate to the sequential container.
+        out = self.net.forward(x)
+        return out
diff --git a/common/random_process.py b/common/random_process.py
new file mode 100644
index 0000000..6a61a4e
--- /dev/null
+++ b/common/random_process.py
@@ -0,0 +1,62 @@
+import numpy as np
+
+
+class RandomProcess(object):
+    def reset_states(self):
+        """Reset internal state; the base process keeps none, so this does nothing."""
+
+
+class AnnealedGaussianProcess(RandomProcess):
+    def __init__(self, mu, sigma, sigma_min, n_steps_annealing=int(1e5)):
+        # current_sigma follows the line m * n_steps + c, floored at sigma_min.
+        self.mu = mu
+        self.sigma = sigma
+        self.n_steps = 0
+        self.c = sigma
+
+        if sigma_min is None:
+            # No floor given: the slope is zero and sigma stays constant.
+            self.m = 0.
+            self.sigma_min = sigma
+        else:
+            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
+            self.sigma_min = sigma_min
+
+    @property
+    def current_sigma(self):
+        return max(self.sigma_min, self.m * float(self.n_steps) + self.c)
+
+
+class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
+    """Discretised OU noise: x += theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""
+
+    def __init__(self, theta, mu=0., sigma=1., dt=1e-2,
+                 x0=None, size=1, sigma_min=None, n_steps_annealing=int(1e5)):
+        super(OrnsteinUhlenbeckProcess, self).__init__(
+            mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
+        self.theta, self.mu, self.dt = theta, mu, dt
+        self.x0, self.size = x0, size
+        self.reset_states()
+
+    def sample(self):
+        drift = self.theta * (self.mu - self.x_prev) * self.dt
+        diffusion = self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
+        self.x_prev = self.x_prev + drift + diffusion
+        self.n_steps += 1
+        return self.x_prev
+
+    def reset_states(self):
+        # Restart from x0 when one was given, otherwise from zeros.
+        self.x_prev = np.zeros(self.size) if self.x0 is None else self.x0
+
+
+def create_random_process(args):
+    """Build the exploration-noise process selected by `args.rp_type`."""
+    if args.rp_type != "ornstein-uhlenbeck":
+        # Ornstein-Uhlenbeck is the only process type handled here.
+        raise NotImplementedError()
+    return OrnsteinUhlenbeckProcess(
+        size=args.n_action,
+        theta=args.rp_theta,
+        mu=args.rp_mu,
+        sigma=args.rp_sigma, sigma_min=args.rp_sigma_min)
diff --git a/common/segment_tree.py b/common/segment_tree.py
new file mode 100644
index 0000000..a5a7dfc
--- /dev/null
+++ b/common/segment_tree.py
@@ -0,0 +1,146 @@
+import operator
+
+
+class SegmentTree(object):
+    def __init__(self, capacity, operation, neutral_element):
+        """Build a Segment Tree data structure.
+
+        https://en.wikipedia.org/wiki/Segment_tree
+
+        It behaves like a fixed-size array with two differences:
+
+            a) assigning an item costs O(lg capacity) instead
+               of O(1), because the ancestors of the leaf are
+               recomputed;
+            b) `reduce` efficiently folds `operation` over a
+               contiguous range of items.
+
+        Storage is a flat list of 2 * capacity slots: slot 1 is
+        the root, slots 2 * i and 2 * i + 1 are the children of
+        slot i, and the leaves start at slot `capacity`.
+
+        Parameters
+        ----------
+        capacity: int
+            Total size of the array - must be a power of two.
+        operation: lambda obj, obj -> obj
+            binary operation used to combine elements (eg. sum, max).
+        neutral_element: obj
+            neutral element for the operation above. eg. float('-inf')
+            for max and 0 for sum.
+        """
+        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
+        self._capacity = capacity
+        self._operation = operation
+        self._value = [neutral_element] * (2 * capacity)
+
+    def _reduce_helper(self, start, end, node, node_start, node_end):
+        # Fold the inclusive range [start, end] inside the span covered by `node`.
+        if start == node_start and end == node_end:
+            return self._value[node]
+        left, right = 2 * node, 2 * node + 1
+        mid = (node_start + node_end) // 2
+        if end <= mid:  # the range lies entirely in the left child
+            return self._reduce_helper(start, end, left, node_start, mid)
+        if start >= mid + 1:  # the range lies entirely in the right child
+            return self._reduce_helper(start, end, right, mid + 1, node_end)
+        # The range straddles the midpoint: combine both halves, left first.
+        lower = self._reduce_helper(start, mid, left, node_start, mid)
+        upper = self._reduce_helper(mid + 1, end, right, mid + 1, node_end)
+        return self._operation(lower, upper)
+
+    def reduce(self, start=0, end=None):
+        """Fold `self.operation` over the half-open range [start, end).
+
+            i.e. arr[start] op arr[start + 1] op ... op arr[end - 1]
+
+        Parameters
+        ----------
+        start: int
+            first index of the range
+        end: int
+            one past the last index; None means the capacity and a
+            negative value counts back from the capacity
+
+        Returns
+        -------
+        reduced: obj
+            result of folding self.operation over the selected items.
+        """
+        if end is None:
+            end = self._capacity
+        elif end < 0:
+            end += self._capacity
+        last = end - 1  # the helper works on inclusive bounds
+        return self._reduce_helper(start, last, 1, 0, self._capacity - 1)
+
+    def __setitem__(self, idx, val):
+        # Store the value in its leaf slot ...
+        node = idx + self._capacity
+        self._value[node] = val
+        # ... then recompute every ancestor up to the root (slot 1).
+        node //= 2
+        while node >= 1:
+            left = self._value[2 * node]
+            right = self._value[2 * node + 1]
+            self._value[node] = self._operation(left, right)
+            node //= 2
+
+    def __getitem__(self, idx):
+        assert 0 <= idx < self._capacity
+        return self._value[idx + self._capacity]
+
+
+class SumSegmentTree(SegmentTree):
+    def __init__(self, capacity):
+        # Sum tree: items are combined by addition, identity element 0.0.
+        super(SumSegmentTree, self).__init__(
+            capacity=capacity,
+            operation=operator.add,
+            neutral_element=0.0)
+
+    def sum(self, start=0, end=None):
+        """Returns arr[start] + ... + arr[end - 1]"""
+        return super(SumSegmentTree, self).reduce(start, end)
+
+    def find_prefixsum_idx(self, prefixsum):
+        """Return the highest index `i` such that
+            arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum
+
+        When the stored values are (unnormalised) probabilities,
+        this allows sampling indexes from that distribution.
+
+        Parameters
+        ----------
+        prefixsum: float
+            upperbound on the sum of array prefix
+
+        Returns
+        -------
+        idx: int
+            highest index satisfying the prefixsum constraint
+        """
+        assert 0 <= prefixsum <= self.sum() + 1e-5
+        node = 1
+        while node < self._capacity:  # descend until a leaf slot is reached
+            left_total = self._value[2 * node]
+            if left_total > prefixsum:
+                node = 2 * node
+            else:
+                prefixsum -= left_total
+                node = 2 * node + 1
+        return node - self._capacity
+
+
+class MinSegmentTree(SegmentTree):
+    def __init__(self, capacity):
+        # Min tree: items are combined with min(), identity element +inf.
+        super(MinSegmentTree, self).__init__(
+            capacity=capacity,
+            operation=min,
+            neutral_element=float('inf'))
+
+    def min(self, start=0, end=None):
+        """Returns min(arr[start], ...,  arr[end - 1])"""
+        lowest = super(MinSegmentTree, self).reduce(start, end)
+        return lowest
diff --git a/common/state_transform.py b/common/state_transform.py
new file mode 100644
index 0000000..746f012
--- /dev/null
+++ b/common/state_transform.py
@@ -0,0 +1,336 @@
+from __future__ import division
+import numpy as np
+from collections import OrderedDict
+
+
+def get_state_names(all=False, obst=False):
+    """List the feature names of the observation vector, in order.
+
+    `all` inserts the 'pelvis2' body right after 'head'; `obst` appends
+    the three obstacle features.
+    """
+    joints = ['hip_right', 'knee_right', 'ankle_right', 'hip_left', 'knee_left', 'ankle_left']
+    bodies = ['head', 'torso', 'toes_left', 'toes_right', 'talus_left', 'talus_right']
+    if all:
+        bodies.insert(1, 'pelvis2')
+
+    names = ['pelvis_' + axis for axis in ('rot', 'x', 'y')]
+    names.extend('pelvis_vel_' + axis for axis in ('rot', 'x', 'y'))
+    names.extend(joints)
+    names.extend(joint + '_vel' for joint in joints)
+    names.extend(['mass_x', 'mass_y', 'mass_x_vel', 'mass_y_vel'])
+    names.extend(body + '_' + axis for body in bodies for axis in ('x', 'y'))
+    names.extend(['muscle_left', 'muscle_right'])
+    if obst:
+        names.extend(['obst_dist', 'obst_y', 'obst_r'])
+    return names
+
+
+def get_names_to_center(centr):
+    """Names of the x-coordinates that get shifted when centring on `centr`."""
+    if centr not in ('pelvis', 'mass'):
+        raise ValueError('centr should be in [mass or pelvis], not {}'.format(centr))
+    # The reference point itself is skipped; its counterpart is listed instead.
+    counterpart = 'mass' if centr == 'pelvis' else 'pelvis'
+    bodies = ['head', counterpart, 'torso', 'toes_left',
+              'toes_right', 'talus_left', 'talus_right']
+    return [body + '_x' for body in bodies]
+
+
+def get_bodies_names():
+    bodies = ('head', 'torso', 'toes_left', 'toes_right', 'talus_left', 'talus_right')
+    return ['{}_{}'.format(body, axis) for body in bodies for axis in ('x', 'y')]
+
+
+def get_names_obstacles():
+    return [part + '_' + side for part in ('toes', 'talus') for side in ('left', 'right')]
+
+
+def calculate_velocity(cur, prev):
+    # Finite difference scaled by 100; zeros when there is no previous value yet.
+    delta = np.zeros_like(cur) if prev is None else 100.*(cur - prev)
+    return delta
+
+
+def _get_pattern_idxs(lst, pattern):
+    # Positions of the entries that contain `pattern` as a substring.
+    return [pos for pos, name in enumerate(lst) if pattern in name]
+
+
+class State(object):
+    """Turns raw simulator observations into feature vectors plus obstacle features."""
+
+    def __init__(self, obstacles_mode='bodies_dist', obst_grid_dist=1,
+                 grid_points=100, predict_bodies=True, add_step=True, osb_first=False):
+        assert obstacles_mode in ['exclude', 'grid', 'bodies_dist', 'standard']
+
+        # Positions kept from the raw observation: everything except the pelvis2 body.
+        self.state_idxs = [i for i, n in enumerate(get_state_names(True, True)) if n not in ['pelvis2_x', 'pelvis2_y']]
+        self.state_names = get_state_names()
+        self.step = 0
+        self.add_step = add_step
+        self.osb_first = osb_first
+        self.obstacles_mode = obstacles_mode
+        self.obstacles = OrderedDict()
+
+        self.obst_names = []
+        if obstacles_mode == 'standard':
+            self.obst_names = ['obst_dist', 'obst_y', 'obst_r']
+        elif obstacles_mode == 'grid':
+            self.obst_names = ['obst_grid_{}'.format(i) for i in range(grid_points)]
+            self.obst_grid_dist = obst_grid_dist
+            self.obst_grid_points = grid_points
+            self.obst_grid_size = obst_grid_dist * 2 / grid_points
+        elif obstacles_mode == 'bodies_dist':
+            self._obst_names = get_names_obstacles()
+            for i in range(3):
+                for n in self._obst_names:
+                    self.obst_names.append('{}_{}_obst_x_start'.format(n, i))
+                    self.obst_names.append('{}_{}_obst_x_end'.format(n, i))
+                    self.obst_names.append('{}_{}_obst_y'.format(n, i))
+        self.obst_names.append('is_obstacle')
+
+        if self.add_step:
+            self.state_names.append('step')
+
+        self.predict_bodies = predict_bodies
+        self.bodies_idxs_x = [self.state_names.index(n) for n in get_bodies_names() if n.endswith('_x')]
+        self.bodies_idxs_y = [self.state_names.index(n) for n in get_bodies_names() if n.endswith('_y')]
+        self.bodies_idxs = self.bodies_idxs_x + self.bodies_idxs_y
+        self.mass_x_idx = self.state_names.index('mass_x')
+        self.mass_y_idx = self.state_names.index('mass_y')
+
+        self.state_names_out = self.state_names
+        self._set_left_right()
+
+    def _set_left_right(self):
+        self.left_idxs = _get_pattern_idxs(self.state_names, '_left')
+        self.right_idxs = _get_pattern_idxs(self.state_names, '_right')
+
+    def reset(self):
+        self.step = 0
+        self.prev_orig = None
+        self.prev_pred = None
+        self.obstacles = OrderedDict()
+
+    def _predict_bodies(self, state):
+        """Replace body coordinates that equal the previous raw value by the previous estimate plus the mass displacement."""
+        state = np.copy(state)
+
+        if self.step > 0:
+            def update_bodies(cur, prev_orig, prev_pred, d):
+                flt = cur == prev_orig
+                cur[flt] = prev_pred[flt] + d
+
+            # mass_* is not a body index, so prev_orig and prev_pred hold the same value for it
+            dx = state[self.mass_x_idx] - self.prev_orig[self.mass_x_idx]
+            dy = state[self.mass_y_idx] - self.prev_orig[self.mass_y_idx]
+            cur_bodies_x = state[self.bodies_idxs_x]
+            cur_bodies_y = state[self.bodies_idxs_y]
+            # need for filter
+            prev_orig_bodies_x = self.prev_orig[self.bodies_idxs_x]
+            prev_orig_bodies_y = self.prev_orig[self.bodies_idxs_y]
+            # need for updating
+            prev_pred_bodies_x = self.prev_pred[self.bodies_idxs_x]
+            prev_pred_bodies_y = self.prev_pred[self.bodies_idxs_y]
+            update_bodies(cur_bodies_x, prev_orig_bodies_x, prev_pred_bodies_x, dx)
+            update_bodies(cur_bodies_y, prev_orig_bodies_y, prev_pred_bodies_y, dy)
+
+            state[self.bodies_idxs_x] = cur_bodies_x
+            state[self.bodies_idxs_y] = cur_bodies_y
+        return state
+
+    def _add_obstacle(self, state):
+        """Remember a newly reported obstacle, keyed by its rounded absolute x position."""
+        import warnings
+        pelvis_x = state[1]
+        obstacle_x = state[-3]
+
+        if obstacle_x != 100:  # 100 in this slot is treated as "no obstacle"
+            obstacle_x += pelvis_x
+            if round(obstacle_x, 5) not in self.obstacles:
+                self.obstacles[round(obstacle_x, 5)] = [obstacle_x, state[-2], state[-1]]
+        if len(self.obstacles) > 3:
+            warnings.warn('more than 3 obstacles')
+
+    def _get_obstacle_state_reward(self, state):
+        """Return (obstacle features, shaping reward) for the configured obstacles_mode."""
+        is_obst = float(state[-3] != 100)
+
+        if self.obstacles_mode == 'exclude':
+            return [is_obst], 0.
+        elif self.obstacles_mode == 'standard':
+            if not is_obst:
+                return [-1., 0., 0., is_obst], 0.
+            obst_features = np.clip(state[-3:], -10., 10.)
+            return np.append(obst_features, is_obst), 0.
+        elif self.obstacles_mode == 'grid':
+            mass_x = state[self.state_names.index('mass_x')]
+            obst_grid = np.zeros(self.obst_grid_points)
+            for obst_x, obst_y, obst_r in self.obstacles.values():
+                obst_h = obst_y + obst_r
+                obst_left = int(np.ceil((obst_x - mass_x - obst_r) / self.obst_grid_size) + self.obst_grid_points // 2)
+                obst_right = int(np.ceil((obst_x - mass_x + obst_r) / self.obst_grid_size) + self.obst_grid_points // 2)
+                obst_left = max(obst_left, 0)
+                obst_right = max(obst_right, -1)
+                obst_grid[obst_left:obst_right + 1] = obst_h
+            obst_features = np.append(obst_grid, is_obst)
+            return obst_features, 0
+        else:
+            obst_state = []
+            obst_reward = 0
+            obstacles = list(self.obstacles.values())  # dict views cannot be indexed on Python 3
+            for i in range(3):
+                if i >= len(obstacles):
+                    for n in self._obst_names:
+                        body_y = state[self.state_names.index(n + '_y')]
+                        obst_state.extend([10, 10, body_y])
+                else:
+                    obst_x, obst_y, obst_r = obstacles[i]
+                    obst_h = obst_y + obst_r
+                    obst_x_start = obst_x - obst_r
+                    obst_x_end = obst_x + obst_r
+                    for n in self._obst_names:
+                        body_x = state[self.state_names.index(n + '_x')]
+                        body_y = state[self.state_names.index(n + '_y')]
+                        obst_state.append(obst_x_start - body_x)
+                        obst_state.append(obst_x_end - body_x)
+                        obst_state.append(body_y - obst_h)
+                        if obst_reward >= 0 and body_x >= (obst_x_start - obst_r/2) \
+                                and (body_x <= obst_x_end+obst_r/2) and (obst_h + obst_r/2) >= body_y:
+                            obst_reward = -0.5
+            obst_state.append(is_obst)
+            return np.asarray(obst_state), obst_reward
+
+    def process(self, state):
+        """Transform one raw observation; returns ((state, obst_state), obst_reward)."""
+        state = np.asarray(state)
+        state = state[self.state_idxs]
+
+        if self.osb_first and self.step == 0:
+            state[-3:] = [100, 0, 0]  # overwrite the obstacle slots with the no-obstacle marker
+        self._add_obstacle(state)
+        obst_state, obst_reward = self._get_obstacle_state_reward(state)
+        state_orig = state[:-3]
+
+        if self.add_step:
+            state_orig = np.append(state_orig, 1. * self.step / 1000)
+
+        if self.predict_bodies:
+            state = self._predict_bodies(state_orig)
+        else:
+            state = state_orig
+
+        self.step += 1
+        self.prev_orig = state_orig
+        self.prev_pred = np.copy(state)
+
+        return (state, obst_state), obst_reward
+
+    def flip_state(self, state, copy=True):
+        assert np.ndim(state) == 1
+        state = np.asarray(state)
+        state = self.flip_states(state.reshape(1, -1), copy)
+        return state.ravel()
+
+    def flip_states(self, states, copy=True):
+        assert np.ndim(states) == 2
+        states = np.asarray(states)
+        if copy:
+            states = states.copy()
+        left = states[:, self.left_idxs]  # list indexing copies, so the swap below is safe
+        right = states[:, self.right_idxs]
+        states[:, self.left_idxs] = right
+        states[:, self.right_idxs] = left
+        return states
+
+    @property
+    def state_size(self):
+        return len(self.state_names_out) + len(self.obst_names)
+
+
+class StateVel(State):
+    """State that additionally reports finite-difference velocities of `vel_states`."""
+
+    def __init__(self, vel_states=get_bodies_names(), obstacles_mode='bodies_dist',
+                 add_step=True, predict_bodies=True, osb_first=False):
+        super(StateVel, self).__init__(
+            obstacles_mode=obstacles_mode, predict_bodies=predict_bodies,
+            add_step=add_step, osb_first=osb_first)
+        self.prev_vals = None
+        self.vel_idxs = [self.state_names.index(name) for name in vel_states]
+        self.state_names += [name + '_vel' for name in vel_states]
+        self.state_names_out = self.state_names
+        # The appended names change which entries match '_left' / '_right'.
+        self._set_left_right()
+
+    def reset(self):
+        super(StateVel, self).reset()
+        self.prev_vals = None
+
+    def process(self, state):
+        (features, obst_state), obst_reward = super(StateVel, self).process(state)
+        tracked = features[self.vel_idxs]
+        vel = calculate_velocity(tracked, self.prev_vals)
+        self.prev_vals = tracked
+        return np.concatenate((features, vel, obst_state)), obst_reward
+
+
+class StateVelCentr(State):
+    """State with velocity features whose selected x-coordinates are made
+    relative to a reference coordinate (`centr_state`).
+    """
+
+    def __init__(self, centr_state='pelvis_x', vel_states=get_bodies_names(),
+                 states_to_center=get_names_to_center('pelvis'),
+                 vel_before_centr=True, obstacles_mode='bodies_dist',
+                 exclude_centr=False, predict_bodies=True,
+                 add_step=True, osb_first=False):
+        super(StateVelCentr, self).__init__(
+            obstacles_mode=obstacles_mode, predict_bodies=predict_bodies,
+            add_step=add_step, osb_first=osb_first)
+        self.vel_before_centr = vel_before_centr
+        self.exclude_centr = exclude_centr
+        self.prev_vals = None
+
+        # Indices are resolved before the velocity names are appended.
+        self.centr_idx = self.state_names.index(centr_state)
+        self.states_to_center = [self.state_names.index(name) for name in states_to_center]
+        self.vel_idxs = [self.state_names.index(name) for name in vel_states]
+        self.state_names += [name + '_vel' for name in vel_states]
+
+        names = self.state_names
+        if exclude_centr:
+            # The reference coordinate itself is left out of the emitted features.
+            names = names[:max(0, self.centr_idx)] + names[self.centr_idx + 1:]
+        self.state_names_out = names
+        self._set_left_right()
+
+    def _set_left_right(self):
+        # Match against the emitted names, which may lack the reference coordinate.
+        self.left_idxs = _get_pattern_idxs(self.state_names_out, '_left')
+        self.right_idxs = _get_pattern_idxs(self.state_names_out, '_right')
+
+    def reset(self):
+        super(StateVelCentr, self).reset()
+        self.prev_vals = None
+
+    def process(self, state):
+        (state, obst_state), obst_reward = super(StateVelCentr, self).process(state)
+
+        if self.vel_before_centr:
+            # List indexing copies, so these values keep their un-centred form.
+            cur_vals = state[self.vel_idxs]
+            state[self.states_to_center] -= state[self.centr_idx]
+        else:
+            state[self.states_to_center] -= state[self.centr_idx]
+            cur_vals = state[self.vel_idxs]
+        vel = calculate_velocity(cur_vals, self.prev_vals)
+        self.prev_vals = cur_vals
+
+        if self.exclude_centr:
+            head, tail = state[:max(0, self.centr_idx)], state[self.centr_idx + 1:]
+            state = np.concatenate([head, tail])
+
+        features = np.concatenate((state, vel, obst_state))
+        return features, obst_reward
diff --git a/common/torch_util.py b/common/torch_util.py
new file mode 100644
index 0000000..88f64dd
--- /dev/null
+++ b/common/torch_util.py
@@ -0,0 +1,37 @@
+import torch
+from torch.autograd import Variable
+
+USE_CUDA = torch.cuda.is_available()  # evaluated once, when this module is imported
+FLOAT = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor  # default dtype used by to_tensor
+
+
+def to_numpy(var):
+    return (var.cpu() if USE_CUDA else var).data.numpy()
+
+
+def to_tensor(ndarray, volatile=False, requires_grad=False, dtype=FLOAT):
+    # numpy array -> autograd Variable, cast to `dtype` after wrapping.
+    var = Variable(torch.from_numpy(ndarray), volatile=volatile, requires_grad=requires_grad)
+    return var.type(dtype)
+
+
+def soft_update(target, source, tau):
+    # Polyak averaging: target <- (1 - tau) * target + tau * source.
+    for dst, src in zip(target.parameters(), source.parameters()):
+        blended = dst.data * (1.0 - tau) + src.data * tau
+        dst.data.copy_(blended)
+
+
+def hard_update(target, source):
+    for dst, src in zip(target.parameters(), source.parameters()):  # paired in declaration order
+        dst.data.copy_(src.data)
+
+
+activations = {  # lookup table: activation name -> torch.nn module class (not an instance)
+    "relu": torch.nn.ReLU,
+    "elu": torch.nn.ELU,
+    "leakyrelu": torch.nn.LeakyReLU,
+    "selu": torch.nn.SELU,
+    "sigmoid": torch.nn.Sigmoid,
+    "tanh": torch.nn.Tanh
+}
diff --git a/ddpg/__init__.py b/ddpg/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ddpg/debug.py b/ddpg/debug.py
new file mode 100644
index 0000000..2b9ebc3
--- /dev/null
+++ b/ddpg/debug.py
@@ -0,0 +1,70 @@
+import os
+import torch
+import copy
+from multiprocessing import Value
+
+from common.misc_util import str2params, create_if_need
+from common.env_wrappers import create_env
+from common.torch_util import activations, hard_update
+
+from ddpg.model import create_model, create_act_update_fns, train_multi_thread
+from ddpg.train import parse_args
+
+
+def debug(args, model_fn, act_update_fns, multi_thread):
+    """Single-process run: build env and networks, call `multi_thread` once as thread 0, then save."""
+    create_if_need(args.logdir)
+    env = create_env(args)
+    if args.flip_state_action and hasattr(env, "state_transform"):
+        args.flip_states = env.state_transform.flip_states
+
+    args.n_action = env.action_space.shape[0]
+    args.n_observation = env.observation_space.shape[0]
+
+    args.actor_layers = str2params(args.actor_layers)
+    args.critic_layers = str2params(args.critic_layers)
+
+    args.actor_activation = activations[args.actor_activation]
+    args.critic_activation = activations[args.critic_activation]
+
+    actor, critic = model_fn(args)
+
+    if args.restore_actor_from is not None:
+        actor.load_state_dict(torch.load(args.restore_actor_from))
+    if args.restore_critic_from is not None:
+        critic.load_state_dict(torch.load(args.restore_critic_from))
+
+    actor.train()
+    critic.train()
+    actor.share_memory()
+    critic.share_memory()
+
+    target_actor = copy.deepcopy(actor)
+    target_critic = copy.deepcopy(critic)
+
+    hard_update(target_actor, actor)
+    hard_update(target_critic, critic)
+
+    target_actor.train()
+    target_critic.train()
+    target_actor.share_memory()
+    target_critic.share_memory()
+
+    _, _, save_fn = act_update_fns(actor, critic, target_actor, target_critic, args)
+
+    args.thread = 0
+    best_reward = Value("f", 0.0)
+    multi_thread(actor, critic, target_actor, target_critic, args, act_update_fns, best_reward)
+
+    save_fn()
+
+
+if __name__ == '__main__':
+    # Limit both OpenMP and PyTorch to a single thread before anything
+    # else runs in this process.
+    os.environ['OMP_NUM_THREADS'] = '1'
+    torch.set_num_threads(1)
+    debug(
+        parse_args(),
+        create_model, create_act_update_fns,
+        train_multi_thread)
diff --git a/ddpg/model.py b/ddpg/model.py
new file mode 100644
index 0000000..d23110d
--- /dev/null
+++ b/ddpg/model.py
@@ -0,0 +1,477 @@
+import random
+import numpy as np
+import torch
+import queue as py_queue
+import time
+import torch.nn as nn
+from pprint import pprint
+
+from ddpg.nets import Actor, Critic
+from common.torch_util import to_numpy, to_tensor, soft_update
+from common.misc_util import create_if_need, set_global_seeds
+from common.logger import Logger
+from common.buffers import create_buffer
+from common.loss import create_loss, create_decay_fn
+from common.env_wrappers import create_env
+from common.random_process import create_random_process
+
+
+def create_model(args):
+    """Build the actor and critic from ``args`` and pretty-print both.
+
+    Returns the pair ``(actor, critic)``.
+    """
+    actor = Actor(
+        args.n_observation, args.n_action, args.actor_layers,
+        activation=args.actor_activation, layer_norm=args.actor_layer_norm,
+        parameters_noise=args.actor_parameters_noise,
+        parameters_noise_factorised=args.actor_parameters_noise_factorised,
+        last_activation=nn.Tanh)
+    critic = Critic(
+        args.n_observation, args.n_action, args.critic_layers,
+        activation=args.critic_activation, layer_norm=args.critic_layer_norm,
+        parameters_noise=args.critic_parameters_noise,
+        parameters_noise_factorised=args.critic_parameters_noise_factorised)
+    for net in (actor, critic):
+        pprint(net)
+    return actor, critic
+
+
+def create_act_update_fns(actor, critic, target_actor, target_critic, args):
+    """Build the ``(act_fn, update_fn, save_fn)`` closures over the given networks.
+
+    A new pair of Adam optimisers and the loss from ``create_loss(args)`` are
+    created on every call, so each caller owns its own optimiser state.
+    """
+    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
+    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
+    criterion = create_loss(args)
+    low_action_boundary = -1.
+    high_action_boundary = 1.
+
+    def act_fn(observation, noise=0):
+        """Return the actor's action for one observation, plus ``noise``, clipped."""
+        nonlocal actor
+        action = to_numpy(actor(to_tensor(np.array([observation], dtype=np.float32)))).squeeze(0)
+        action += noise
+        action = np.clip(action, low_action_boundary, high_action_boundary)
+        return action
+
+    def update_fn(
+            observations, actions, rewards, next_observations, dones, weights,
+            actor_lr=1e-4, critic_lr=1e-3):
+        """Run one critic step, one actor step and a soft target update.
+
+        Returns ``(metrics, info)``: the two losses, and the per-sample
+        ``td_error`` (as numpy) recomputed with the updated critic.
+        """
+        nonlocal actor, critic, target_actor, target_critic, actor_optim, critic_optim
+        # Optional augmentation: append a flipped copy of every transition,
+        # swapping the two halves of the action vector.
+        # NOTE(review): ``weights`` is not duplicated here although the batch
+        # doubles -- confirm this is intended when prioritized replay is on.
+        if hasattr(args, "flip_states"):
+            observations_flip = args.flip_states(observations)
+            next_observations_flip = args.flip_states(next_observations)
+            actions_flip = np.zeros_like(actions)
+            actions_flip[:, :args.n_action // 2] = actions[:, args.n_action // 2:]
+            actions_flip[:, args.n_action // 2:] = actions[:, :args.n_action // 2]
+            observations = np.concatenate((observations, observations_flip))
+            actions = np.concatenate((actions, actions_flip))
+            rewards = np.tile(rewards.ravel(), 2)
+            next_observations = np.concatenate((next_observations, next_observations_flip))
+            dones = np.tile(dones.ravel(), 2)
+
+        # Builtin ``bool`` replaces the ``np.bool`` alias, which newer NumPy removed.
+        dones = dones[:, None].astype(bool)
+        rewards = rewards[:, None].astype(np.float32)
+        # After inversion ``dones`` is 1.0 for non-terminal transitions, 0.0 otherwise.
+        dones = to_tensor(np.invert(dones).astype(np.float32))
+        rewards = to_tensor(rewards)
+        weights = to_tensor(weights, requires_grad=False)
+        next_v_values = target_critic(
+            to_tensor(next_observations, volatile=True),
+            target_actor(to_tensor(next_observations, volatile=True)),
+        )
+        next_v_values.volatile = False
+        # One-step TD target; the bootstrap term is zeroed on terminal transitions.
+        reward_predicted = dones * args.gamma * next_v_values
+        td_target = rewards + reward_predicted
+        # Critic update
+        critic.zero_grad()
+        v_values = critic(to_tensor(observations), to_tensor(actions))
+        value_loss = criterion(v_values, td_target, weights=weights)
+        value_loss.backward()
+        torch.nn.utils.clip_grad_norm(critic.parameters(), args.grad_clip)
+        for param_group in critic_optim.param_groups:
+            param_group["lr"] = critic_lr
+        critic_optim.step()
+
+        # Actor update
+        actor.zero_grad()
+        policy_loss = -critic(
+            to_tensor(observations),
+            actor(to_tensor(observations))
+        )
+        policy_loss = torch.mean(policy_loss * weights)
+        policy_loss.backward()
+        torch.nn.utils.clip_grad_norm(actor.parameters(), args.grad_clip)
+        for param_group in actor_optim.param_groups:
+            param_group["lr"] = actor_lr
+        actor_optim.step()
+
+        # Target update
+        soft_update(target_actor, actor, args.tau)
+        soft_update(target_critic, critic, args.tau)
+
+        metrics = {
+            "value_loss": value_loss,
+            "policy_loss": policy_loss
+        }
+        td_v_values = critic(
+            to_tensor(observations, volatile=True, requires_grad=False),
+            to_tensor(actions, volatile=True, requires_grad=False))
+        td_error = td_target - td_v_values
+        info = {
+            "td_error": to_numpy(td_error)
+        }
+        return metrics, info
+
+    def save_fn(episode=None):
+        """Save all four state dicts to ``args.logdir`` or its ``episode_<n>`` subfolder."""
+        nonlocal actor, critic
+        if episode is None:
+            save_path = args.logdir
+        else:
+            save_path = "{}/episode_{}".format(args.logdir, episode)
+            create_if_need(save_path)
+        torch.save(actor.state_dict(), "{}/actor_state_dict.pkl".format(save_path))
+        torch.save(critic.state_dict(), "{}/critic_state_dict.pkl".format(save_path))
+        torch.save(target_actor.state_dict(), "{}/target_actor_state_dict.pkl".format(save_path))
+        torch.save(target_critic.state_dict(), "{}/target_critic_state_dict.pkl".format(save_path))
+    return act_fn, update_fn, save_fn
+
+
+def train_multi_thread(actor, critic, target_actor, target_critic, args, prepare_fn, best_reward):
+    """Worker loop that both collects experience and trains, in one process.
+
+    Runs episodes until ``args.max_episodes`` is reached or the
+    ``args.max_train_days`` wall-clock budget is exceeded, saves a final
+    checkpoint and then raises ``KeyboardInterrupt`` instead of returning.
+    ``args.logdir`` is rewritten to this worker's ``thread_<n>`` subdirectory.
+    """
+    workerseed = args.seed + 241 * args.thread
+    set_global_seeds(workerseed)
+    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
+    create_if_need(args.logdir)
+    act_fn, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
+    logger = Logger(args.logdir)
+    buffer = create_buffer(args)
+    if args.prioritized_replay:
+        beta_deacy_fn = create_decay_fn(
+            "linear",
+            initial_value=args.prioritized_replay_beta0,
+            final_value=1.0,
+            max_step=args.max_episodes)
+
+    env = create_env(args)
+    random_process = create_random_process(args)
+    # Both learning rates follow a linear schedule indexed by the episode number.
+    actor_learning_rate_decay_fn = create_decay_fn(
+        "linear",
+        initial_value=args.actor_lr,
+        final_value=args.actor_lr_end,
+        max_step=args.max_episodes)
+    critic_learning_rate_decay_fn = create_decay_fn(
+        "linear",
+        initial_value=args.critic_lr,
+        final_value=args.critic_lr_end,
+        max_step=args.max_episodes)
+    # The epsilon cycle length is drawn once per worker, between half and twice the configured value.
+    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
+    epsilon_decay_fn = create_decay_fn(
+        "cycle",
+        initial_value=args.initial_epsilon,
+        final_value=args.final_epsilon,
+        cycle_len=epsilon_cycle_len,
+        num_cycles=args.max_episodes // epsilon_cycle_len)
+
+    episode = 0
+    step = 0
+    start_time = time.time()
+    while episode < args.max_episodes:
+        if episode % 100 == 0:
+            env = create_env(args)  # a new environment every 100 episodes (episode 0 included)
+        seed = random.randrange(2 ** 32 - 2)
+
+        actor_lr = actor_learning_rate_decay_fn(episode)
+        critic_lr = critic_learning_rate_decay_fn(episode)
+        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))
+
+        episode_metrics = {
+            "value_loss": 0.0,
+            "policy_loss": 0.0,
+            "reward": 0.0,
+            "step": 0,
+            "epsilon": epsilon
+        }
+
+        observation = env.reset(seed=seed, difficulty=args.difficulty)
+        random_process.reset_states()
+        done = False
+
+        while not done:
+            action = act_fn(observation, noise=epsilon*random_process.sample())
+            next_observation, reward, done, _ = env.step(action)
+
+            buffer.add(observation, action, reward, next_observation, done)
+            episode_metrics["reward"] += reward
+            episode_metrics["step"] += 1
+            # One update per environment step once the buffer holds args.train_steps transitions.
+            if len(buffer) >= args.train_steps:
+                if args.prioritized_replay:
+                    (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
+                     weights, batch_idxes) = \
+                        buffer.sample(batch_size=args.batch_size, beta=beta_deacy_fn(episode))
+                else:
+                    (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones) = \
+                        buffer.sample(batch_size=args.batch_size)
+                    weights, batch_idxes = np.ones_like(tr_rewards), None
+                step_metrics, step_info = update_fn(
+                    tr_observations, tr_actions, tr_rewards,
+                    tr_next_observations, tr_dones,
+                    weights, actor_lr, critic_lr)
+
+                if args.prioritized_replay:
+                    new_priorities = np.abs(step_info["td_error"]) + 1e-6
+                    buffer.update_priorities(batch_idxes, new_priorities)
+                # Losses are summed here and divided by the step count when logged.
+                for key, value in step_metrics.items():
+                    value = to_numpy(value)[0]
+                    episode_metrics[key] += value
+
+            observation = next_observation
+
+        episode += 1
+        # Checkpoint on a new shared best, but only above 15 * args.reward_scale.
+        if episode_metrics["reward"] > 15.0 * args.reward_scale \
+                and episode_metrics["reward"] > best_reward.value:
+            best_reward.value = episode_metrics["reward"]
+            logger.scalar_summary("best reward", best_reward.value, episode)
+            save_fn(episode)
+
+        step += episode_metrics["step"]
+        elapsed_time = time.time() - start_time
+
+        for key, value in episode_metrics.items():
+            value = value if "loss" not in key else value / episode_metrics["step"]
+            logger.scalar_summary(key, value, episode)
+        logger.scalar_summary(
+            "episode per minute",
+            episode / elapsed_time * 60,
+            episode)
+        logger.scalar_summary(
+            "step per second",
+            step / elapsed_time,
+            episode)
+        logger.scalar_summary("actor lr", actor_lr, episode)
+        logger.scalar_summary("critic lr", critic_lr, episode)
+        if episode % args.save_step == 0:
+            save_fn(episode)
+
+        if elapsed_time > 86400 * args.max_train_days:
+            episode = args.max_episodes + 1  # forces the loop condition to fail
+
+    save_fn(episode)
+    # Deliberate: this worker ends by raising rather than returning.
+    raise KeyboardInterrupt
+
+
+def train_single_thread(
+        actor, critic, target_actor, target_critic, args, prepare_fn,
+        global_episode, global_update_step, episodes_queue):
+    """Trainer worker: consume episodes from ``episodes_queue`` and run updates.
+
+    Stops when the shared episode or update budget is spent, saves a checkpoint
+    and then deliberately raises ``KeyboardInterrupt``.
+    """
+    workerseed = args.seed + 241 * args.thread
+    set_global_seeds(workerseed)
+    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
+    create_if_need(args.logdir)
+    _, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
+    logger = Logger(args.logdir)
+    buffer = create_buffer(args)
+
+    if args.prioritized_replay:
+        beta_decay_fn = create_decay_fn(
+            "linear",
+            initial_value=args.prioritized_replay_beta0,
+            final_value=1.0,
+            max_step=args.max_update_steps)
+    actor_learning_rate_decay_fn = create_decay_fn(
+        "linear",
+        initial_value=args.actor_lr,
+        final_value=args.actor_lr_end,
+        max_step=args.max_update_steps)
+    critic_learning_rate_decay_fn = create_decay_fn(
+        "linear",
+        initial_value=args.critic_lr,
+        final_value=args.critic_lr_end,
+        max_step=args.max_update_steps)
+
+    update_step = 0
+    received_examples = 1  # starts at 1 so the ratio logged below never divides by zero
+    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
+            and global_update_step.value < args.max_update_steps * args.num_train_threads:
+        actor_lr = actor_learning_rate_decay_fn(update_step)
+        critic_lr = critic_learning_rate_decay_fn(update_step)
+        actor_lr = min(args.actor_lr, max(args.actor_lr_end, actor_lr))
+        critic_lr = min(args.critic_lr, max(args.critic_lr_end, critic_lr))
+
+        # Drain every episode currently waiting in the queue into the replay buffer.
+        while True:
+            try:
+                replay = episodes_queue.get_nowait()
+            except py_queue.Empty:
+                break
+            for (observation, action, reward, next_observation, done) in replay:
+                buffer.add(observation, action, reward, next_observation, done)
+            received_examples += len(replay)
+
+        if len(buffer) >= args.train_steps:
+            if args.prioritized_replay:
+                beta = beta_decay_fn(update_step)
+                beta = min(1.0, max(args.prioritized_replay_beta0, beta))
+                (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
+                 weights, batch_idxes) = \
+                    buffer.sample(
+                        batch_size=args.batch_size,
+                        beta=beta)
+            else:
+                (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones) = \
+                    buffer.sample(batch_size=args.batch_size)
+                weights, batch_idxes = np.ones_like(tr_rewards), None
+
+            step_metrics, step_info = update_fn(
+                tr_observations, tr_actions, tr_rewards,
+                tr_next_observations, tr_dones,
+                weights, actor_lr, critic_lr)
+
+            update_step += 1
+            # ``+=`` on a shared Value is not atomic; hold its lock so increments are not lost.
+            with global_update_step.get_lock():
+                global_update_step.value += 1
+
+            if args.prioritized_replay:
+                new_priorities = np.abs(step_info["td_error"]) + 1e-6
+                buffer.update_priorities(batch_idxes, new_priorities)
+            for key, value in step_metrics.items():
+                value = to_numpy(value)[0]
+                logger.scalar_summary(key, value, update_step)
+            logger.scalar_summary("actor lr", actor_lr, update_step)
+            logger.scalar_summary("critic lr", critic_lr, update_step)
+
+            if update_step % args.save_step == 0:
+                save_fn(update_step)
+        else:
+            time.sleep(1)  # buffer still below args.train_steps; wait for more episodes
+
+        logger.scalar_summary("buffer size", len(buffer), global_episode.value)
+        logger.scalar_summary(
+            "updates per example",
+            update_step * args.batch_size / received_examples,
+            global_episode.value)
+
+    save_fn(update_step)
+
+    raise KeyboardInterrupt
+
+
+def play_single_thread(
+        actor, critic, target_actor, target_critic, args, prepare_fn,
+        global_episode, global_update_step, episodes_queue,
+        best_reward):
+    """Player worker: roll out episodes and push them onto ``episodes_queue``.
+
+    Stops when the shared episode or update budget is spent (or the wall-clock
+    limit is hit) and then deliberately raises ``KeyboardInterrupt``.
+    """
+    workerseed = args.seed + 241 * args.thread
+    set_global_seeds(workerseed)
+    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
+    create_if_need(args.logdir)
+    act_fn, _, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
+    logger = Logger(args.logdir)
+    env = create_env(args)
+    random_process = create_random_process(args)
+
+    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
+    epsilon_decay_fn = create_decay_fn(
+        "cycle",
+        initial_value=args.initial_epsilon,
+        final_value=args.final_epsilon,
+        cycle_len=epsilon_cycle_len,
+        num_cycles=args.max_episodes // epsilon_cycle_len)
+
+    episode = 1
+    step = 0
+    start_time = time.time()
+    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
+            and global_update_step.value < args.max_update_steps * args.num_train_threads:
+        if episode % 100 == 0:
+            env = create_env(args)
+        seed = random.randrange(2 ** 32 - 2)
+
+        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))
+        episode_metrics = {
+            "reward": 0.0,
+            "step": 0,
+            "epsilon": epsilon
+        }
+
+        observation = env.reset(seed=seed, difficulty=args.difficulty)
+        random_process.reset_states()
+        done = False
+
+        replay = []
+        while not done:
+            action = act_fn(observation, noise=epsilon * random_process.sample())
+            next_observation, reward, done, _ = env.step(action)
+
+            replay.append((observation, action, reward, next_observation, done))
+            episode_metrics["reward"] += reward
+            episode_metrics["step"] += 1
+
+            observation = next_observation
+        episodes_queue.put(replay)
+
+        episode += 1
+        # ``+=`` on a shared Value is not atomic; hold its lock so increments are not lost.
+        with global_episode.get_lock():
+            global_episode.value += 1
+
+        if episode_metrics["reward"] > best_reward.value:
+            best_reward.value = episode_metrics["reward"]
+            logger.scalar_summary("best reward", best_reward.value, episode)
+            if episode_metrics["reward"] > 15.0 * args.reward_scale:
+                save_fn(episode)
+
+        step += episode_metrics["step"]
+        elapsed_time = time.time() - start_time
+
+        for key, value in episode_metrics.items():
+            logger.scalar_summary(key, value, episode)
+        logger.scalar_summary(
+            "episode per minute",
+            episode / elapsed_time * 60,
+            episode)
+        logger.scalar_summary(
+            "step per second",
+            step / elapsed_time,
+            episode)
+
+        if elapsed_time > 86400 * args.max_train_days:
+            global_episode.value = args.max_episodes * (args.num_threads - args.num_train_threads) + 1
+
+    raise KeyboardInterrupt
diff --git a/ddpg/nets.py b/ddpg/nets.py
new file mode 100644
index 0000000..06b2807
--- /dev/null
+++ b/ddpg/nets.py
@@ -0,0 +1,90 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+from common.nets import LinearNet
+from common.modules.NoisyLinear import NoisyLinear
+
+
+def fanin_init(size, fanin=None):
+    """Sample a ``size``-shaped tensor from U(-b, b) with b = 1/sqrt(fanin or size[0])."""
+    bound = 1. / np.sqrt(fanin or size[0])
+    return torch.Tensor(size).uniform_(-bound, bound)
+
+
+class Actor(nn.Module):
+    """Policy network: passes an observation through feature_net, then policy_net."""
+    def __init__(self, n_observation, n_action,
+                 layers, activation=torch.nn.ELU,
+                 layer_norm=False,
+                 parameters_noise=False, parameters_noise_factorised=False,
+                 last_activation=torch.nn.Tanh, init_w=3e-3):
+        super(Actor, self).__init__()
+        # Hidden layers are built from NoisyLinear when parameter noise is requested.
+        if parameters_noise:
+            def linear_layer(x_in, x_out):
+                return NoisyLinear(x_in, x_out, factorised=parameters_noise_factorised)
+        else:
+            linear_layer = nn.Linear
+
+        self.feature_net = LinearNet(
+            layers=[n_observation] + layers,
+            activation=activation,
+            layer_norm=layer_norm,
+            linear_layer=linear_layer)
+        self.policy_net = LinearNet(
+            layers=[self.feature_net.output_shape, n_action],
+            activation=last_activation,
+            layer_norm=False)
+        self.init_weights(init_w)
+
+    def init_weights(self, init_w):
+        """Fan-in init for feature_net; U(-init_w, init_w) for the policy_net output layer."""
+        for layer in self.feature_net.net:
+            if isinstance(layer, nn.Linear):
+                layer.weight.data = fanin_init(layer.weight.data.size())
+        # Fixed: this loop used to revisit feature_net, undoing the fan-in init above.
+        for layer in self.policy_net.net:
+            if isinstance(layer, nn.Linear):
+                layer.weight.data.uniform_(-init_w, init_w)
+
+    def forward(self, observation):
+        """Return the policy_net output computed from ``observation``."""
+        features = self.feature_net.forward(observation)
+        return self.policy_net.forward(features)
+
+
+class Critic(nn.Module):
+    """Value network: maps an (observation, action) pair to a single output."""
+
+    def __init__(self, n_observation, n_action,
+                 layers, activation=torch.nn.ELU,
+                 layer_norm=False,
+                 parameters_noise=False, parameters_noise_factorised=False,
+                 init_w=3e-3):
+        super(Critic, self).__init__()
+
+        def noisy_layer(x_in, x_out):
+            return NoisyLinear(x_in, x_out, factorised=parameters_noise_factorised)
+
+        # Hidden layers use NoisyLinear only when parameter noise is enabled.
+        self.feature_net = LinearNet(
+            layers=[n_observation + n_action] + layers,
+            activation=activation,
+            layer_norm=layer_norm,
+            linear_layer=noisy_layer if parameters_noise else nn.Linear)
+        self.value_net = nn.Linear(self.feature_net.output_shape, 1)
+        self.init_weights(init_w)
+
+    def init_weights(self, init_w):
+        """Fan-in initialise ``feature_net``; draw ``value_net`` from U(-init_w, init_w)."""
+        hidden = [layer for layer in self.feature_net.net if isinstance(layer, nn.Linear)]
+        for layer in hidden:
+            layer.weight.data = fanin_init(layer.weight.data.size())
+        self.value_net.weight.data.uniform_(-init_w, init_w)
+
+    def forward(self, observation, action):
+        """Concatenate the inputs along dim 1 and return the value_net output."""
+        joint = torch.cat((observation, action), dim=1)
+        features = self.feature_net.forward(joint)
+        return self.value_net.forward(features)
diff --git a/ddpg/submit.py b/ddpg/submit.py
new file mode 100644
index 0000000..e071ae7
--- /dev/null
+++ b/ddpg/submit.py
@@ -0,0 +1,186 @@
+import os
+import json
+import argparse
+import numpy as np
+import pandas as pd
+import torch
+from pprint import pprint
+
+from osim.env import RunEnv
+from osim.http.client import Client
+
+from common.misc_util import boolean_flag, query_yes_no
+from common.env_wrappers import create_observation_handler, create_action_handler, create_env
+
+from ddpg.train import str2params, activations
+from ddpg.model import create_model, create_act_update_fns
+
+
+REMOTE_BASE = 'http://grader.crowdai.org:1729'
+ACTION_SHAPE = 18
+SEEDS = [
+    3834825972, 3049289152, 3538742899, 2904257823, 4011088434,
+    2684066875, 781202090, 1691535473, 898088606, 1301477286
+]
+
+
+def parse_args():
+    """Parse the command line of the evaluation / submission script."""
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    # Paths of the saved argument file and network weights to restore.
+    for source in ("args", "actor", "critic"):
+        parser.add_argument('--restore-{}-from'.format(source), type=str, default=None)
+
+    parser.add_argument('--max-obstacles', type=int, default=3)
+    parser.add_argument('--num-episodes', type=int, default=1)
+    parser.add_argument('--token', type=str, default=None)
+
+    for flag in ("visualize", "submit"):
+        boolean_flag(parser, flag, default=False)
+
+    return parser.parse_args()
+
+
+def restore_args(args):
+    """Copy the options saved in ``args.restore_args_from`` (JSON) onto ``args``."""
+    with open(args.restore_args_from, "r") as fin:
+        params = json.load(fin)
+
+    # These keys keep the value given on the current command line.
+    unwanted = (
+        "max_obstacles",
+        "restore_args_from",
+        "restore_actor_from",
+        "restore_critic_from",
+    )
+    for unwanted_key in unwanted:
+        # pop() with a default already tolerates a missing key.
+        params.pop(unwanted_key, None)
+
+    for key, value in params.items():
+        setattr(args, key, value)
+    return args
+
+
+def submit(actor, critic, args, act_update_fn):
+    """Play the remote grader's episodes with the actor, then optionally submit.
+
+    Per-episode reward and step counts are collected and summarised with
+    ``DataFrame.describe`` before the user is asked whether to submit.
+    """
+    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
+    client = Client(REMOTE_BASE)
+
+    all_episode_metrics = []
+    episode_metrics = {
+        "reward": 0.0,
+        "step": 0,
+    }
+
+    observation_handler = create_observation_handler(args)
+    action_handler = create_action_handler(args)
+    observation = client.env_create(args.token)
+    action = np.zeros(ACTION_SHAPE, dtype=np.float32)
+    observation = observation_handler(observation, action)
+
+    while True:
+        print(episode_metrics["reward"])
+        action = act_fn(observation)
+        observation, reward, done, _ = client.env_step(action_handler(action).tolist())
+
+        episode_metrics["reward"] += reward
+        episode_metrics["step"] += 1
+
+        if not done:
+            observation = observation_handler(observation, action)
+            continue
+
+        all_episode_metrics.append(episode_metrics)
+        episode_metrics = {
+            "reward": 0.0,
+            "step": 0,
+        }
+
+        observation_handler = create_observation_handler(args)
+        action_handler = create_action_handler(args)
+        # Per the osim-rl client usage, env_reset() starts the next episode and returns an
+        # empty observation once the run is over; this used to call env_create() again.
+        observation = client.env_reset()
+        if not observation:
+            break
+        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
+        observation = observation_handler(observation, action)
+
+    df = pd.DataFrame(all_episode_metrics)
+    pprint(df.describe())
+
+    if query_yes_no("Submit?"):
+        client.submit()
+
+
+def test(actor, critic, args, act_update_fn):
+    """Roll out ``args.num_episodes`` local RunEnv episodes and print statistics.
+
+    Episodes cycle through the fixed ``SEEDS`` list at difficulty 2.
+    """
+    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
+    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)
+
+    all_episode_metrics = []
+    for episode in range(args.num_episodes):
+        total_reward, n_steps = 0.0, 0
+
+        # Handlers are re-created for every episode.
+        observation_handler = create_observation_handler(args)
+        action_handler = create_action_handler(args)
+        observation = env.reset(difficulty=2, seed=SEEDS[episode % len(SEEDS)])
+        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
+        observation = observation_handler(observation, action)
+
+        while True:
+            print(total_reward)
+            action = act_fn(observation)
+            observation, reward, done, _ = env.step(action_handler(action))
+
+            total_reward += reward
+            n_steps += 1
+
+            if done:
+                break
+            # Only non-terminal observations are passed through the handler.
+            observation = observation_handler(observation, action)
+
+        all_episode_metrics.append({"reward": total_reward, "step": n_steps})
+
+    df = pd.DataFrame(all_episode_metrics)
+    pprint(df.describe())
+
+
+def submit_or_test(args, model_fn, act_update_fn, submit_fn, test_fn):
+    """Rebuild the saved agent, then hand it to ``submit_fn`` or ``test_fn``."""
+    args = restore_args(args)
+    env = create_env(args)
+    args.n_action = env.action_space.shape[0]
+    args.n_observation = env.observation_space.shape[0]
+
+    args.actor_layers = str2params(args.actor_layers)
+    args.critic_layers = str2params(args.critic_layers)
+    args.actor_activation = activations[args.actor_activation]
+    args.critic_activation = activations[args.critic_activation]
+
+    actor, critic = model_fn(args)
+    actor.load_state_dict(torch.load(args.restore_actor_from))
+    # --restore-critic-from defaults to None; load only when given, as train()
+    # does, instead of passing None to torch.load.
+    if args.restore_critic_from is not None:
+        critic.load_state_dict(torch.load(args.restore_critic_from))
+
+    run = submit_fn if args.submit else test_fn
+    run(actor, critic, args, act_update_fn)
+
+
+if __name__ == '__main__':
+    os.environ['OMP_NUM_THREADS'] = '1'
+    torch.set_num_threads(1)
+    args = parse_args()
+    submit_or_test(args, create_model, create_act_update_fns, submit, test)
diff --git a/ddpg/train.py b/ddpg/train.py
new file mode 100644
index 0000000..91a5281
--- /dev/null
+++ b/ddpg/train.py
@@ -0,0 +1,237 @@
+import argparse
+import os
+import json
+import copy
+import torch
+import torch.multiprocessing as mp
+from multiprocessing import Value
+
+from common.misc_util import boolean_flag, str2params, create_if_need
+from common.env_wrappers import create_env
+from common.torch_util import activations, hard_update
+
+from ddpg.model import create_model, create_act_update_fns, train_multi_thread, \
+    train_single_thread, play_single_thread
+
+
+def parse_args():
+    """Define and parse the command-line options of the DDPG training script."""
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    # Seeding and environment configuration.
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--difficulty', type=int, default=2)
+    parser.add_argument('--max-obstacles', type=int, default=3)
+    # Output directory and number of worker processes.
+    parser.add_argument('--logdir', type=str, default="./logs")
+    parser.add_argument('--num-threads', type=int, default=1)
+    parser.add_argument('--num-train-threads', type=int, default=1)
+    # Environment wrapper options.
+    boolean_flag(parser, "ddpg-wrapper", default=False)
+    parser.add_argument('--skip-frames', type=int, default=1)
+    parser.add_argument('--fail-reward', type=float, default=0.0)
+    parser.add_argument('--reward-scale', type=float, default=1.)
+    boolean_flag(parser, "flip-state-action", default=False)
+    # Each option below is registered twice: once for the actor, once for the critic.
+    for agent in ["actor", "critic"]:
+        parser.add_argument('--{}-layers'.format(agent), type=str, default="64-64")
+        parser.add_argument('--{}-activation'.format(agent), type=str, default="relu")
+        boolean_flag(parser, "{}-layer-norm".format(agent), default=False)
+        boolean_flag(parser, "{}-parameters-noise".format(agent), default=False)
+        boolean_flag(parser, "{}-parameters-noise-factorised".format(agent), default=False)
+
+        parser.add_argument('--{}-lr'.format(agent), type=float, default=1e-3)
+        parser.add_argument('--{}-lr-end'.format(agent), type=float, default=5e-5)
+
+        parser.add_argument('--restore-{}-from'.format(agent), type=str, default=None)
+
+    parser.add_argument('--gamma', type=float, default=0.96)
+    parser.add_argument('--loss-type', type=str, default="quadric-linear")
+    parser.add_argument('--grad-clip', type=float, default=10.)
+
+    parser.add_argument('--tau', default=0.01, type=float)
+
+    parser.add_argument('--train-steps', type=int, default=int(1e4))
+    parser.add_argument('--batch-size', type=int, default=256)  # per worker
+
+    parser.add_argument('--buffer-size', type=int, default=int(1e6))
+
+    boolean_flag(parser, "prioritized-replay", default=False)
+    parser.add_argument('--prioritized-replay-alpha', default=0.6, type=float)
+    parser.add_argument('--prioritized-replay-beta0', default=0.4, type=float)
+    # Exploration schedule and training length limits.
+    parser.add_argument('--initial-epsilon', default=1., type=float)
+    parser.add_argument('--final-epsilon', default=0.01, type=float)
+    parser.add_argument('--max-episodes', default=int(1e4), type=int)
+    parser.add_argument('--max-update-steps', default=int(5e6), type=int)
+    parser.add_argument('--epsilon-cycle-len', default=int(2e2), type=int)
+
+    parser.add_argument('--max-train-days', default=int(1e1), type=int)
+    # "rp" options: presumably consumed by create_random_process -- TODO confirm.
+    parser.add_argument('--rp-type', default="ornstein-uhlenbeck", type=str)
+    parser.add_argument('--rp-theta', default=0.15, type=float)
+    parser.add_argument('--rp-sigma', default=0.2, type=float)
+    parser.add_argument('--rp-sigma-min', default=0.15, type=float)
+    parser.add_argument('--rp-mu', default=0.0, type=float)
+
+    parser.add_argument('--clip-delta', type=int, default=10)
+    parser.add_argument('--save-step', type=int, default=int(1e4))
+    parser.add_argument('--restore-args-from', type=str, default=None)
+
+    return parser.parse_args()
+
+
+def restore_args(args):
+    """Overlay the options saved in ``args.restore_args_from`` (JSON) onto ``args``.
+
+    Run-specific keys (listed below) are dropped from the loaded file first, so
+    the values parsed from the current command line are kept for them. Keys
+    missing from the file are ignored rather than raising ``KeyError``.
+    """
+    with open(args.restore_args_from, "r") as fin:
+        params = json.load(fin)
+
+    # Keys whose value must come from the current command line, not the file.
+    cli_only = [
+        "seed",
+        "difficulty",
+        "max_obstacles",
+        "logdir",
+        "num_threads",
+        "num_train_threads",
+        "skip_frames",
+        "grad_clip",
+        "tau",
+        "train_steps",
+        "batch_size",
+        "buffer_size",
+        "prioritized_replay",
+        "prioritized_replay_alpha",
+        "prioritized_replay_beta0",
+        "initial_epsilon",
+        "final_epsilon",
+        "max_episodes",
+        "max_update_steps",
+        "epsilon_cycle_len",
+        "max_train_days",
+        "rp_type",
+        "rp_theta",
+        "rp_sigma",
+        "rp_sigma_min",
+        "rp_mu",
+        "clip_delta",
+        "save_step",
+        "restore_args_from",
+    ]
+    for agent in ["actor", "critic"]:
+        for template in ("{}_lr", "{}_lr_end", "restore_{}_from"):
+            cli_only.append(template.format(agent))
+
+    # pop() with a default tolerates argument files written before an option
+    # existed, which the previous ``del params[key]`` statements did not.
+    for key in cli_only:
+        params.pop(key, None)
+
+    for key, value in params.items():
+        setattr(args, key, value)
+    return args
+
+
+def train(args, model_fn, act_update_fns, multi_thread, train_single, play_single):
+    """Set up the networks and launch the worker processes.
+
+    When ``num_threads == num_train_threads`` every process runs
+    ``multi_thread``; otherwise the first ``num_train_threads`` processes run
+    ``train_single`` and the rest run ``play_single``, connected by a queue.
+    """
+    create_if_need(args.logdir)
+    if args.restore_args_from is not None:
+        args = restore_args(args)
+    with open("{}/args.json".format(args.logdir), "w") as fout:
+        json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)
+    # args.json is written here, before the fields below are added or converted.
+    env = create_env(args)
+
+    if args.flip_state_action and hasattr(env, "state_transform"):
+        args.flip_states = env.state_transform.flip_states
+        args.batch_size = args.batch_size // 2
+
+    args.n_action = env.action_space.shape[0]
+    args.n_observation = env.observation_space.shape[0]
+    args.actor_layers = str2params(args.actor_layers)
+    args.critic_layers = str2params(args.critic_layers)
+    args.actor_activation = activations[args.actor_activation]
+    args.critic_activation = activations[args.critic_activation]
+
+    actor, critic = model_fn(args)
+
+    if args.restore_actor_from is not None:
+        actor.load_state_dict(torch.load(args.restore_actor_from))
+    if args.restore_critic_from is not None:
+        critic.load_state_dict(torch.load(args.restore_critic_from))
+
+    actor.train()
+    critic.train()
+    actor.share_memory()
+    critic.share_memory()
+
+    target_actor = copy.deepcopy(actor)
+    target_critic = copy.deepcopy(critic)
+    hard_update(target_actor, actor)
+    hard_update(target_critic, critic)
+    target_actor.train()
+    target_critic.train()
+    target_actor.share_memory()
+    target_critic.share_memory()
+
+    _, _, save_fn = act_update_fns(actor, critic, target_actor, target_critic, args)
+
+    processes = []
+    best_reward = Value("f", 0.0)
+    try:
+        if args.num_threads == args.num_train_threads:
+            for rank in range(args.num_threads):
+                args.thread = rank
+                p = mp.Process(
+                    target=multi_thread,
+                    args=(actor, critic, target_actor, target_critic, args, act_update_fns,
+                          best_reward))
+                p.start()
+                processes.append(p)
+        else:
+            global_episode = Value("i", 0)
+            global_update_step = Value("i", 0)
+            episodes_queue = mp.Queue()
+            for rank in range(args.num_threads):
+                args.thread = rank
+                if rank < args.num_train_threads:
+                    p = mp.Process(
+                        target=train_single,
+                        args=(actor, critic, target_actor, target_critic, args, act_update_fns,
+                              global_episode, global_update_step, episodes_queue))
+                else:
+                    p = mp.Process(
+                        target=play_single,
+                        args=(actor, critic, target_actor, target_critic, args, act_update_fns,
+                              global_episode, global_update_step, episodes_queue,
+                              best_reward))
+                p.start()
+                processes.append(p)
+
+        for p in processes:
+            p.join()
+    except KeyboardInterrupt:
+        pass  # interrupted: stop waiting and still run the final save below
+
+    save_fn()
+
+
+if __name__ == '__main__':
+    # Restrict OpenMP and PyTorch to a single thread each before any work starts.
+    os.environ['OMP_NUM_THREADS'] = '1'
+    torch.set_num_threads(1)
+    args = parse_args()
+    train(
+        args, create_model, create_act_update_fns,
+        multi_thread=train_multi_thread,
+        train_single=train_single_thread,
+        play_single=play_single_thread)
diff --git a/gifs/flip.gif b/gifs/flip.gif
new file mode 100644
index 0000000..89e32a5
Binary files /dev/null and b/gifs/flip.gif differ
diff --git a/gifs/noflip.gif b/gifs/noflip.gif
new file mode 100644
index 0000000..a9ae09c
Binary files /dev/null and b/gifs/noflip.gif differ
diff --git a/setup_conda.sh b/setup_conda.sh
new file mode 100644
index 0000000..80b4ad7
--- /dev/null
+++ b/setup_conda.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+conda create -n opensim-rl -c kidzik opensim git python=3.5.2 anaconda -y  # creates the conda environment named opensim-rl
diff --git a/setup_env.sh b/setup_env.sh
new file mode 100644
index 0000000..5864895
--- /dev/null
+++ b/setup_env.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+conda upgrade pip -y && \
+	conda install -c conda-forge lapack git -y && \
+	conda install ipython libgcc -y && \
+	conda install pytorch torchvision -c soumith -y && \
+	pip install tensorflow==1.3.0 gym && \
+	pip install git+https://github.com/stanfordnmbl/osim-rl.git  # the && chain stops at the first failing step
\ No newline at end of file
diff --git a/setup_env_mpi.sh b/setup_env_mpi.sh
new file mode 100644
index 0000000..f3070d0
--- /dev/null
+++ b/setup_env_mpi.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+conda upgrade pip -y && \
+	conda install -c conda-forge lapack git -y && \
+	conda install ipython libgcc -y && \
+	conda install pytorch torchvision -c soumith -y && \
+	pip install tensorflow==1.3.0 gym mpi4py && \
+	pip install git+https://github.com/stanfordnmbl/osim-rl.git  # the && chain stops at the first failing step
\ No newline at end of file