Add MuJoCo

This commit is contained in:
Ilya Kostrikov
2017-09-27 08:20:19 -04:00
parent 54a0f98180
commit 09e75e26ae
9 changed files with 392 additions and 40 deletions
+30 -4
View File
@@ -1,5 +1,7 @@
# pytorch-a2c-ppo-acktr
## Update 09/27/2017: now supports both Atari and MuJoCo/Roboschool!
This is a PyTorch implementation of
* Advantage Actor Critic (A2C), a synchronous deterministic version of [A3C](https://arxiv.org/pdf/1602.01783v1.pdf)
* Proximal Policy Optimization [PPO](https://arxiv.org/pdf/1707.06347.pdf)
@@ -13,31 +15,55 @@ This implementation is inspired by the OpenAI baselines for [A2C](https://github
Contributions are very welcome. If you know how to make this code better, don't hesitate to send a pull request. Also see a todo list below.
Also I'm searching for volunteers to run all experiments on Atari and MuJoCo (with multiple random seeds).
## Disclaimer
It's extremely difficult to reproduce results for Reinforcement Learning methods. See ["Deep Reinforcement Learning that Matters"](https://arxiv.org/abs/1709.06560) for more information. I tried to reproduce OpenAI results as closely as possible. However, major differences in performance can be caused even by minor differences between the TensorFlow and PyTorch libraries.
### TODO
* Add MuJoCo and continuous actions
* Improve this README file. Rearrange images.
* Improve performance of KFAC, see kfac.py for more information
* Run evaluation for all games and algorithms
## Usage
### A2C
### Atari
#### A2C
```
python main.py --env-name "PongNoFrameskip-v4"
```
### PPO
#### PPO
```
python main.py --env-name "PongNoFrameskip-v4" --algo ppo --use-gae --num-processes 8 --num-steps 256 --vis-interval 1 --log-interval 1
```
### ACKTR
#### ACKTR
```
python main.py --env-name "PongNoFrameskip-v4" --algo acktr --num-processes 32 --num-steps 20
```
### MuJoCo
#### A2C
```
python main.py --env-name "Reacher-v1" --num-stack 1 --num-frames 1000000
```
#### PPO
```
python main.py --env-name "Reacher-v1" --algo ppo --use-gae --vis-interval 1 --log-interval 1 --num-stack 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --ppo-epoch 10 --batch-size 64 --gamma 0.99 --tau 0.95 --num-frames 1000000
```
#### ACKTR
ACKTR requires some modifications to be made specifically for MuJoCo. But at the moment, I want to keep this code as unified as possible. Thus, I'm looking for better ways to integrate it into the codebase.
## Results
### A2C
+3 -1
View File
@@ -4,7 +4,7 @@ import gym
from gym.spaces.box import Box
from baselines import bench
from baselines.common.atari_wrappers import *
from baselines.common.atari_wrappers import wrap_deepmind
def make_env(env_id, seed, rank, log_dir):
@@ -14,6 +14,8 @@ def make_env(env_id, seed, rank, log_dir):
env = bench.Monitor(env,
os.path.join(log_dir,
"{}.monitor.json".format(rank)))
# Ugly hack to detect atari.
if env.action_space.__class__.__name__ == 'Discrete':
env = wrap_deepmind(env)
env = WrapPyTorch(env)
return env
+6 -7
View File
@@ -11,8 +11,6 @@ import torch.optim as optim
def _extract_patches(x, kernel_size, stride, padding):
#result = P.im2col(Variable(x), kernel_size, stride, padding).data
#return result.view(result.size(0), -1, result.size(-2), result.size(-1))
if padding[0] + padding[1] > 0:
x = F.pad(x, (padding[1], padding[1], padding[0],
padding[0])).data # Actually check dims
@@ -164,7 +162,6 @@ class KFACOptimizer(optim.Optimizer):
raise NotImplementedError(
'Layer {} is not supported'.format(classname))
#@profile
def step(self):
# Add weight decay
if self.weight_decay > 0:
@@ -187,10 +184,12 @@ class KFACOptimizer(optim.Optimizer):
self.m_aa[m].cpu().double(), eigenvectors=True)
self.d_g[m], self.Q_g[m] = torch.symeig(
self.m_gg[m].cpu().double(), eigenvectors=True)
self.d_a[m], self.Q_a[m] = self.d_a[
m].float().cuda(), self.Q_a[m].float().cuda()
self.d_g[m], self.Q_g[m] = self.d_g[
m].float().cuda(), self.Q_g[m].float().cuda()
self.d_a[m], self.Q_a[m] = self.d_a[m].float(), self.Q_a[m].float()
self.d_g[m], self.Q_g[m] = self.d_g[m].float(), self.Q_g[m].float()
if self.m_aa[m].is_cuda:
self.d_a[m], self.Q_a[m] = self.d_a[m].cuda(), self.Q_a[m].cuda()
self.d_g[m], self.Q_g[m] = self.d_g[m].cuda(), self.Q_g[m].cuda()
self.d_a[m].mul_((self.d_a[m] > 1e-6).float())
self.d_g[m].mul_((self.d_g[m] > 1e-6).float())
+28 -13
View File
@@ -15,9 +15,9 @@ from arguments import get_args
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from envs import make_env
from kfac import KFACOptimizer
from model import ActorCritic
from model import CNNPolicy, MLPPolicy
from storage import RolloutStorage
from vizualize_atari import visdom_plot
from visualize import visdom_plot
args = get_args()
@@ -59,7 +59,12 @@ def main():
obs_shape = envs.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
actor_critic = ActorCritic(obs_shape[0], envs.action_space)
if envs.action_space.__class__.__name__ == 'Discrete':
actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
action_shape = 1
else:
actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
action_shape = envs.action_space.shape[0]
if args.cuda:
actor_critic.cuda()
@@ -71,13 +76,15 @@ def main():
elif args.algo == 'acktr':
optimizer = KFACOptimizer(actor_critic)
rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space.n)
rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
current_state = torch.zeros(args.num_processes, *obs_shape)
def update_current_state(state):
state = torch.from_numpy(np.stack(state)).float()
current_state[:, :-1] = current_state[:, 1:]
current_state[:, -1] = state
shape_dim0 = envs.observation_space.shape[0]
state = torch.from_numpy(state).float()
if args.num_stack > 1:
current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
current_state[:, -shape_dim0:] = state
state = envs.reset()
update_current_state(state)
@@ -103,7 +110,6 @@ def main():
# Observe reward and next state
state, reward, done, info = envs.step(cpu_actions)
reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
episode_rewards += reward
@@ -115,17 +121,24 @@ def main():
if args.cuda:
masks = masks.cuda()
if current_state.dim() == 4:
current_state *= masks.unsqueeze(2).unsqueeze(2)
else:
current_state *= masks
update_current_state(state)
rollouts.insert(step, current_state, action.data, value.data, reward, masks)
next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
if hasattr(actor_critic, 'obs_filter'):
actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
if args.algo in ['a2c', 'acktr']:
values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, 1)))
values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
values = values.view(args.num_steps, args.num_processes, 1)
action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)
@@ -164,6 +177,8 @@ def main():
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
old_model.load_state_dict(actor_critic.state_dict())
if hasattr(actor_critic, 'obs_filter'):
old_model.obs_filter = actor_critic.obs_filter
for _ in range(args.ppo_epoch):
sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
@@ -171,8 +186,8 @@ def main():
indices = torch.LongTensor(indices)
if args.cuda:
indices = indices.cuda()
states_batch = rollouts.states[:-1].view(-1, *rollouts.states.size()[-3:])[indices]
actions_batch = rollouts.actions.view(-1, 1)[indices]
states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
actions_batch = rollouts.actions.view(-1, action_shape)[indices]
return_batch = rollouts.returns[:-1].view(-1, 1)[indices]
# Reshape to do in a single forward pass for all steps
@@ -183,7 +198,7 @@ def main():
ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
adv_targ = Variable(advantages.view(-1, 1)[indices])
surr1 = ratio * adv_targ
surr2 = ratio.clamp(1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)
value_loss = (Variable(return_batch) - values).pow(2).mean()
+119 -7
View File
@@ -3,6 +3,8 @@ import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from running_stat import ObsNorm
def weights_init(m):
@@ -28,9 +30,9 @@ class AddBias(nn.Module):
return x + bias
class ActorCritic(torch.nn.Module):
class CNNPolicy(torch.nn.Module):
def __init__(self, num_inputs, action_space):
super(ActorCritic, self).__init__()
super(CNNPolicy, self).__init__()
self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4, bias=False)
self.ab1 = AddBias(32)
self.conv2 = nn.Conv2d(32, 64, 4, stride=2, bias=False)
@@ -41,19 +43,20 @@ class ActorCritic(torch.nn.Module):
self.linear1 = nn.Linear(32 * 7 * 7, 512, bias=False)
self.ab_fc1 = AddBias(512)
num_outputs = action_space.n
self.critic_linear = nn.Linear(512, 1, bias=False)
self.ab_fc2 = AddBias(1)
num_outputs = action_space.n
self.actor_linear = nn.Linear(512, num_outputs, bias=False)
self.ab_fc3 = AddBias(num_outputs)
self.apply(weights_init)
self.conv1.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
self.conv2.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
self.conv3.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
self.linear1.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
relu_gain = nn.init.calculate_gain('relu')
self.conv1.weight.data.mul_(relu_gain)
self.conv2.weight.data.mul_(relu_gain)
self.conv3.weight.data.mul_(relu_gain)
self.linear1.weight.data.mul_(relu_gain)
self.train()
@@ -97,3 +100,112 @@ class ActorCritic(torch.nn.Module):
dist_entropy = -(log_probs * probs).sum(-1).mean()
return values, action_log_probs, dist_entropy
def weights_init_mlp(m):
    """Initialise a Linear-like module with unit-norm weight rows and zero bias.

    Modules whose class name does not contain ``'Linear'`` are left untouched,
    so the function can be passed to ``nn.Module.apply``.
    """
    if 'Linear' not in m.__class__.__name__:
        return

    weight = m.weight.data
    # Draw from N(0, 1), then rescale every row to unit L2 norm.
    weight.normal_(0, 1)
    row_norm = torch.sqrt(weight.pow(2).sum(1, keepdim=True))
    weight.mul_(1 / row_norm)

    bias = m.bias
    if bias is not None:
        bias.data.fill_(0)
class MLPPolicy(torch.nn.Module):
    """Actor-critic policy for continuous (Box) action spaces.

    The actor and the critic are two separate MLPs, each with two 64-unit
    tanh hidden layers.  The actor outputs the mean of a diagonal Gaussian;
    the log standard deviation comes from a separate ``AddBias`` module
    applied to zeros.  Observations are passed through ``self.obs_filter``
    (an ``ObsNorm``) before entering either network.
    """

    def __init__(self, num_inputs, action_space):
        """Build the networks.

        Args:
            num_inputs: size of the flat observation vector.
            action_space: action space object; only ``shape[0]`` is read here.
        """
        super(MLPPolicy, self).__init__()

        # Observation normaliser; its statistics are not updated inside this
        # class, only applied in ``forward``.
        self.obs_filter = ObsNorm((1, num_inputs), clip=5)
        self.action_space = action_space

        # Actor branch.
        self.a_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.a_ab1 = AddBias(64)
        self.a_fc2 = nn.Linear(64, 64, bias=False)
        self.a_ab2 = AddBias(64)
        self.a_fc_mean = nn.Linear(64, action_space.shape[0], bias=False)
        self.a_ab_mean = AddBias(action_space.shape[0])
        self.a_ab_logstd = AddBias(action_space.shape[0])

        # Critic branch.
        self.v_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.v_ab1 = AddBias(64)
        self.v_fc2 = nn.Linear(64, 64, bias=False)
        self.v_ab2 = AddBias(64)
        self.v_fc3 = nn.Linear(64, 1, bias=False)
        self.v_ab3 = AddBias(1)

        self.apply(weights_init_mlp)

        # Hidden layers keep the initialisation from weights_init_mlp as is
        # (no tanh gain is applied); only the mean head is scaled down.
        self.a_fc_mean.weight.data.mul_(0.01)

        self.train()

    def cuda(self, **args):
        """Move the parameters and the observation filter to the GPU.

        Returns ``self`` so the call can be chained like ``nn.Module.cuda``.
        """
        super(MLPPolicy, self).cuda(**args)
        self.obs_filter.cuda()
        return self

    def forward(self, inputs):
        """Return ``(value, action_mean, action_logstd)`` for a batch of inputs."""
        # NOTE(review): this replaces the data of the caller's Variable with the
        # normalised observations; passing the same Variable to forward twice
        # would normalise it twice -- confirm callers always wrap fresh data.
        inputs.data = self.obs_filter(inputs.data)

        x = self.v_fc1(inputs)
        x = self.v_ab1(x)
        x = F.tanh(x)

        x = self.v_fc2(x)
        x = self.v_ab2(x)
        x = F.tanh(x)

        x = self.v_fc3(x)
        x = self.v_ab3(x)
        value = x

        x = self.a_fc1(inputs)
        x = self.a_ab1(x)
        x = F.tanh(x)

        x = self.a_fc2(x)
        x = self.a_ab2(x)
        x = F.tanh(x)

        x = self.a_fc_mean(x)
        x = self.a_ab_mean(x)
        action_mean = x

        # An ugly hack for my KFAC implementation: the log-std is produced by
        # feeding zeros through an AddBias module rather than by a bare parameter.
        zeros = Variable(torch.zeros(x.size()), volatile=x.volatile)
        if x.is_cuda:
            zeros = zeros.cuda()

        x = self.a_ab_logstd(zeros)
        action_logstd = x

        return value, action_mean, action_logstd

    def act(self, inputs):
        """Sample an action: ``mean + std * N(0, 1)``. Returns ``(value, action)``."""
        value, action_mean, action_logstd = self(inputs)

        action_std = action_logstd.exp()

        noise = Variable(torch.randn(action_std.size()))
        if action_std.is_cuda:
            noise = noise.cuda()

        action = action_mean + action_std * noise
        return value, action

    def evaluate_actions(self, inputs, actions):
        """Evaluate given actions under the current policy.

        Args:
            inputs: 2-D batch of observations.
            actions: batch of actions with one column per action dimension.

        Returns:
            ``(value, action_log_probs, dist_entropy)`` where
            ``action_log_probs`` is summed over action dimensions (one column
            per sample) and ``dist_entropy`` is the batch-mean entropy.
        """
        assert inputs.dim() == 2, "Expect to have inputs in num_processes * num_steps x ... format"

        value, action_mean, action_logstd = self(inputs)

        action_std = action_logstd.exp()

        # Per-dimension log density of a Gaussian, then summed over dimensions.
        action_log_probs = -0.5 * ((actions - action_mean) / action_std).pow(2) - 0.5 * math.log(2 * math.pi) - action_logstd
        action_log_probs = action_log_probs.sum(1, keepdim=True)

        # Entropy of a diagonal Gaussian: sum_i (0.5 + 0.5 * log(2 * pi) + log_std_i).
        # It depends only on the log-std, not on the sampled actions.
        dist_entropy = 0.5 + 0.5 * math.log(2 * math.pi) + action_logstd
        dist_entropy = dist_entropy.sum(-1).mean()

        return value, action_log_probs, dist_entropy
+8
View File
@@ -0,0 +1,8 @@
#!/usr/bin/env fish
# Launch one background PPO run per MuJoCo environment listed in $envs.
# Each run gets its own seed (starting at 42) and its own log directory
# /tmp/gym/<index>.

set envs "Reacher-v1" "HalfCheetah-v1" "Hopper-v1" "Walker2d-v1"
set seed 42

# Iterate over however many environments are listed instead of a fixed count.
for x in (seq (count $envs))
    python main.py --env-name "$envs[$x]" --seed $seed --algo ppo --use-gae --vis-interval 1 --log-interval 1 --num-stack 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --ppo-epoch 10 --batch-size 64 --log-dir "/tmp/gym/$x" --gamma 0.99 --tau 0.95&
    set seed (math $seed + 1)
end
+44
View File
@@ -0,0 +1,44 @@
import random
import torch
class ObsNorm(object):
    """Running mean/std normaliser for observations.

    Sums and counts are accumulated in double precision; the derived ``mean``
    and ``std`` used for normalisation are stored as float tensors.
    """

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean = demean
        self.destd = destd
        self.clip = clip

        # Accumulators start slightly above zero (count and sum of squares
        # are both 1e-2).
        self.count = torch.zeros(1).double() + 1e-2
        self.sum = torch.zeros(shape).double()
        self.sum_sqr = torch.zeros(shape).double() + 1e-2

        self.mean = torch.zeros(shape)
        self.std = torch.ones(shape)

    def cuda(self):
        """Move every statistics tensor to the GPU (returns nothing)."""
        for attr in ('count', 'sum', 'sum_sqr', 'mean', 'std'):
            setattr(self, attr, getattr(self, attr).cuda())

    def update(self, x):
        """Fold a batch ``x`` (samples along dim 0) into the running statistics."""
        self.count += x.size(0)
        self.sum += x.sum(0, keepdim=True).double()
        self.sum_sqr += x.pow(2).sum(0, keepdim=True).double()

        mean64 = self.sum / self.count
        var64 = self.sum_sqr / self.count - mean64.pow(2)

        # Variance is clamped to [1e-2, 1e9] before taking the square root.
        self.mean = mean64.float()
        self.std = var64.clamp(1e-2, 1e9).sqrt().float()

    def __call__(self, x):
        """Return ``x`` after the enabled steps: subtract mean, divide by std, clip."""
        out = x
        if self.demean:
            out = out - self.mean
        if self.destd:
            out = out / self.std
        if self.clip:
            out = out.clamp(-self.clip, self.clip)
        return out
+10 -5
View File
@@ -2,13 +2,19 @@ import torch
class RolloutStorage(object):
def __init__(self, num_steps, num_processes, obs_shape, action_shape):
def __init__(self, num_steps, num_processes, obs_shape, action_space):
self.states = torch.zeros(num_steps + 1, num_processes, *obs_shape)
self.rewards = torch.zeros(num_steps, num_processes, 1)
self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
self.returns = torch.zeros(num_steps + 1, num_processes, 1)
self.actions = torch.LongTensor(num_steps, num_processes, 1)
self.masks = torch.zeros(num_steps, num_processes, 1)
if action_space.__class__.__name__ == 'Discrete':
action_shape = 1
else:
action_shape = action_space.shape[0]
self.actions = torch.zeros(num_steps, num_processes, action_shape)
if action_space.__class__.__name__ == 'Discrete':
self.actions = self.actions.long()
self.masks = torch.ones(num_steps + 1, num_processes, 1)
def cuda(self):
self.states = self.states.cuda()
@@ -30,8 +36,7 @@ class RolloutStorage(object):
self.value_preds[-1] = next_value
gae = 0
for step in reversed(range(self.rewards.size(0))):
delta = self.rewards[step] + gamma * self.value_preds[step +
1] * self.masks[step] - self.value_preds[step]
delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step] - self.value_preds[step]
gae = delta + gamma * tau * self.masks[step] * gae
self.returns[step] = gae + self.value_preds[step]
else:
+141
View File
@@ -0,0 +1,141 @@
# Copied from https://github.com/emansim/baselines-mansimov/blob/master/baselines/a2c/visualize_atari.py
# and https://github.com/emansim/baselines-mansimov/blob/master/baselines/a2c/load.py
# Thanks to the author and OpenAI team!
import glob
import json
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import medfilt
matplotlib.rcParams.update({'font.size': 8})
def smooth_reward_curve(x, y):
    """Smooth ``y`` with a centred moving average and downsample the curve.

    The window half-width is ``min(31, ceil(len(x) / 30))``.  ``x`` is trimmed
    by the half-width on both sides, and both outputs are thinned with a
    stride of ``max(floor(len(trimmed_x) / 1000), 1)``.

    Returns:
        ``(x_smoothed, y_smoothed)``.
    """
    # Halfwidth of our smoothing convolution
    k = min(31, int(np.ceil(len(x) / 30)))
    window = np.ones(2 * k + 1)

    # Windowed sum divided by the number of samples in each window.
    windowed_sum = np.convolve(y, window, mode='valid')
    windowed_count = np.convolve(np.ones_like(y), window, mode='valid')

    xsmoo = x[k:-k]
    ysmoo = windowed_sum / windowed_count

    stride = max(int(np.floor(len(xsmoo) / 1e3)), 1)
    return xsmoo[::stride], ysmoo[::stride]
def fix_point(x, y, interval):
    """Resample the curve ``(x, y)`` onto a regular grid of step ``interval``.

    Grid points are ``0, interval, 2 * interval, ...`` up to ``max(x)``.  The
    value at each grid point is obtained by linear interpolation along the
    segment ``(x[p], y[p]) -> (x[p + 1], y[p + 1])`` selected by the scan
    below; grid points before ``x[0]`` are extrapolated from the first segment.

    Args:
        x: increasing sample positions.
        y: sample values, same length as ``x``.
        interval: grid spacing.

    Returns:
        ``(fx, fy)`` as two Python lists.  Both are empty when the input is
        empty or has a single point (no segment to interpolate on).
    """
    # The previous version called np.insert(x, 0, 0) / np.insert(y, 0, 0) and
    # discarded the results (np.insert returns a new array), so no origin
    # point was ever prepended.  The dead calls are dropped rather than
    # assigned, which keeps the produced curves unchanged.
    if len(x) == 0:
        # max() of an empty sequence would raise; there is nothing to resample.
        return [], []

    fx, fy = [], []
    pointer = 0
    last = len(x) - 1

    ninterval = int(max(x) / interval + 1)

    for i in range(ninterval):
        tmpx = interval * i

        # Advance to the segment whose right end is not left of tmpx.
        while pointer < last and tmpx > x[pointer + 1]:
            pointer += 1

        if pointer < last:
            alpha = (y[pointer + 1] - y[pointer]) / \
                (x[pointer + 1] - x[pointer])
            tmpy = y[pointer] + alpha * (tmpx - x[pointer])
            fx.append(tmpx)
            fy.append(tmpy)

    return fx, fy
def load_data(indir, smooth, bin_size):
    """Load episode rewards from ``*monitor.json`` files and build a reward curve.

    The first line of each file is a JSON header with ``t_start``; every
    following line is a JSON record with ``t``, ``l`` and ``r``.  Episodes from
    all files are ordered by ``t + t_start``, and the x coordinate of each
    episode is the cumulative sum of ``l`` over the preceding episodes.

    Args:
        indir: directory searched for ``*monitor.json`` files.
        smooth: ``1`` applies ``smooth_reward_curve``, ``2`` applies a median
            filter of kernel size 9, any other value leaves the curve as is.
        bin_size: minimum number of episodes required, and the grid step
            passed to ``fix_point``.

    Returns:
        ``[x, y]``, or ``[None, None]`` when fewer than ``bin_size`` episodes
        were found.
    """
    episodes = []
    for path in glob.glob(os.path.join(indir, '*monitor.json')):
        with open(path, 'r') as f:
            t_start = float(json.loads(f.readline())['t_start'])
            for line in f:
                record = json.loads(line)
                t_time = float(record['t']) + t_start
                episodes.append([t_time, int(record['l']), float(record['r'])])

    episodes.sort(key=lambda entry: entry[0])

    # Pair each episode's reward with the number of timesteps elapsed before it.
    result = []
    timesteps = 0
    for _, length, reward in episodes:
        result.append([timesteps, reward])
        timesteps += length

    if len(result) < bin_size:
        return [None, None]

    curve = np.array(result)
    x, y = curve[:, 0], curve[:, 1]

    if smooth == 1:
        x, y = smooth_reward_curve(x, y)
    elif smooth == 2:
        y = medfilt(y, kernel_size=9)

    x, y = fix_point(x, y, bin_size)
    return [x, y]
# Palette of ten hex colour strings; each entry carries its colour name.
# NOTE(review): not referenced by the other definitions visible in this file.
color_defaults = [
'#1f77b4', # muted blue
'#ff7f0e', # safety orange
'#2ca02c', # cooked asparagus green
'#d62728', # brick red
'#9467bd', # muted purple
'#8c564b', # chestnut brown
'#e377c2', # raspberry yogurt pink
'#7f7f7f', # middle gray
'#bcbd22', # curry yellow-green
'#17becf' # blue-teal
]
def visdom_plot(viz, win, folder, game, name, bin_size=100, smooth=1):
    """Render the reward curve found in ``folder`` and send it to visdom as an image.

    Args:
        viz: object exposing ``image(array, win=...)`` (a visdom client).
        win: window handle to update; returned unchanged when there is not
            enough data to plot.
        folder: directory handed to ``load_data``.
        game: environment name; used as the plot title and to pick the x axis.
        name: legend label of the curve.
        bin_size: forwarded to ``load_data``.
        smooth: forwarded to ``load_data``.

    Returns:
        The result of ``viz.image`` or, when ``load_data`` yields no curve,
        ``win``.
    """
    tx, ty = load_data(folder, smooth, bin_size)
    if tx is None or ty is None:
        return win

    fig = plt.figure()
    plt.plot(tx, ty, label="{}".format(name))

    # Ugly hack to detect atari
    if game.find('NoFrameskip') > -1:
        # Tick positions are 4x the labelled values -- presumably to account
        # for frame skipping; confirm against the environment wrappers.
        plt.xticks([4*1e6, 4*2e6, 4*4e6, 4*6e6, 4*8e6, 4*10e6],
                   ["1M", "2M", "4M", "6M", "8M", "10M"])
        plt.xlim(0, 40e6)
    else:
        # The last tick used to be 1e5, which put the "1M" label on top of
        # the "0.1M" one; it belongs at 1e6.
        plt.xticks([1e5, 2e5, 4e5, 6e5, 8e5, 1e6],
                   ["0.1M", "0.2M", "0.4M", "0.6M", "0.8M", "1M"])
        plt.xlim(0, 1e6)

    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')

    plt.title(game)
    plt.legend(loc=4)
    plt.show()
    plt.draw()

    # Grab the rendered canvas as a (height, width, 3) uint8 array.
    image = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    image = image.reshape(fig.canvas.get_width_height()[::-1] + (3, ))
    plt.close(fig)

    # Show it in visdom (channels moved to the first axis).
    image = np.transpose(image, (2, 0, 1))
    return viz.image(image, win=win)
if __name__ == "__main__":
    # Manual check: plot whatever monitor files are in /tmp/gym/ to a
    # visdom server using the default connection settings.
    from visdom import Visdom

    viz = Visdom()
    visdom_plot(
        viz,
        None,
        '/tmp/gym/',
        'BreakOut',
        'a2c',
        bin_size=100,
        smooth=1,
    )