From ec47ca7ed98e8b8f9d48de591f15e799f0cabab8 Mon Sep 17 00:00:00 2001 From: Ilya Kostrikov Date: Sun, 17 Sep 2017 23:08:50 -0400 Subject: [PATCH] Add KFAC --- README.md | 29 ++++++- kfac.py | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 30 +++++++- model.py | 43 +++++++++-- 4 files changed, 309 insertions(+), 16 deletions(-) create mode 100644 kfac.py diff --git a/README.md b/README.md index 3147aca..5e2c36a 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,27 @@ -# pytorch-a2c-ppo +# pytorch-a2c-ppo-acktr -This is a PyTorch implementation of Advantage Actor Critic (A2C), a synchronous deterministic version of A3C ["Asynchronous Methods for Deep Reinforcement Learning"](https://arxiv.org/pdf/1602.01783v1.pdf) and [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf). Also see the OpenAI posts: [A2C/A3C](https://blog.openai.com/baselines-acktr-a2c/) and [PPO](https://blog.openai.com/openai-baselines-ppo/) for more information. +This is a PyTorch implementation of +* Advantage Actor Critic (A2C), a synchronous deterministic version of [A3C](https://arxiv.org/pdf/1602.01783v1.pdf) +* Proximal Policy Optimization [PPO](https://arxiv.org/pdf/1707.06347.pdf) +* Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation [ACKTR](https://arxiv.org/abs/1708.05144) -This implementation is inspired by the OpenAI baselines for [A2C](https://github.com/openai/baselines/tree/master/baselines/a2c) and [PPO](https://github.com/openai/baselines/tree/master/baselines/ppo1). It uses the same hyper parameters and the model since they were well tuned for Atari games. +Also see the OpenAI posts: [A2C/ACKTR](https://blog.openai.com/baselines-acktr-a2c/) and [PPO](https://blog.openai.com/openai-baselines-ppo/) for more information. 
+ +This implementation is inspired by the OpenAI baselines for [A2C](https://github.com/openai/baselines/tree/master/baselines/a2c), [ACKTR](https://github.com/openai/baselines/tree/master/baselines/acktr) and [PPO](https://github.com/openai/baselines/tree/master/baselines/ppo1). It uses the same hyperparameters and model since they were well tuned for Atari games. ## Contributions -Contributions are very welcome. If you know how to make this code better, don't hesitate to send a pull request. +Contributions are very welcome. If you know how to make this code better, don't hesitate to send a pull request. Also see the TODO list below. + +### TODO +* Add MuJoCo and continuous actions +* Improve performance of KFAC; see kfac.py for more information +* Run evaluation for all games and algorithms ## Usage ### A2C + ``` python main.py --env-name "PongNoFrameskip-v4" ``` @@ -21,6 +32,12 @@ python main.py --env-name "PongNoFrameskip-v4" python main.py --env-name "PongNoFrameskip-v4" --algo ppo --use-gae --num-processes 8 --num-steps 256 --vis-interval 1 --log-interval 1 ``` +### ACKTR + +``` +python main.py --env-name "PongNoFrameskip-v4" --algo acktr --num-processes 32 --num-steps 20 +``` + ## Results ### A2C @@ -36,3 +53,7 @@ python main.py --env-name "PongNoFrameskip-v4" --algo ppo --use-gae --num-proces ### PPO Coming soon. + +### ACKTR + +Coming soon. 
"""KFAC optimizer and the covariance helpers it relies on."""
import math

import torch
import torch.optim as optim

# TODO: In order to make this code faster:
# 1) Implement _extract_patches as a single CUDA kernel
# 2) Compute QR decomposition in a separate process
# 3) Actually make a general KFAC optimizer so it fits PyTorch


def _extract_patches(x, kernel_size, stride, padding):
    """Cut a 4-D tensor into flattened sliding-window patches.

    Args:
        x: 4-D tensor; windows slide over dimensions 2 and 3.
        kernel_size: pair giving the window size along dims 2 and 3.
        stride: pair giving the window step along dims 2 and 3.
        padding: pair giving the zero padding added on both sides of
            dims 2 and 3.

    Returns:
        A contiguous tensor of size
        ``(x.size(0), out_2, out_3, x.size(1) * kernel_size[0] * kernel_size[1])``
        where ``out_2``/``out_3`` are the number of window positions.
    """
    pad_h, pad_w = padding[0], padding[1]
    if pad_h + pad_w > 0:
        # Pad by hand with plain tensor ops so no functional-API import is
        # needed and the input tensor itself is left untouched.
        height, width = x.size(2), x.size(3)
        padded = x.new(x.size(0), x.size(1), height + 2 * pad_h,
                       width + 2 * pad_w).zero_()
        padded.narrow(2, pad_h, height).narrow(3, pad_w, width).copy_(x)
        x = padded
    x = x.unfold(2, kernel_size[0], stride[0])
    x = x.unfold(3, kernel_size[1], stride[1])
    # Move the channel dimension next to the two window dimensions.
    x = x.transpose(1, 2).transpose(2, 3).contiguous()
    return x.view(
        x.size(0), x.size(1), x.size(2), x.size(3) * x.size(4) * x.size(5))


def compute_cov_a(a, classname, layer_info, fast_cnn):
    """Return the input-side covariance factor ``a^T a / batch_size``.

    Args:
        a: layer input tensor; dimension 0 is treated as the batch.
        classname: class name of the layer ('Linear', 'Conv2d' or 'AddBias').
        layer_info: ``(kernel_size, stride, padding)`` for 'Conv2d',
            ignored otherwise.
        fast_cnn: for 'Conv2d', average the patches of each sample instead
            of keeping every patch as its own row.

    Returns:
        A square 2-D tensor.
    """
    batch_size = a.size(0)

    if classname == 'Conv2d':
        patches = _extract_patches(a, *layer_info)
        if fast_cnn:
            a = patches.view(patches.size(0), -1, patches.size(-1)).mean(1)
        else:
            # One row per patch, scaled down by the number of window positions.
            rows, cols = patches.size(1), patches.size(2)
            a = patches.view(-1, patches.size(-1)).div_(rows).div_(cols)
    elif classname == 'AddBias':
        # A bias behaves like a weight fed by a constant input of one.
        a = a.new(a.size(0), 1).fill_(1)

    return a.t() @ (a / batch_size)


def compute_cov_g(g, classname, layer_info, fast_cnn):
    """Return the output-gradient covariance factor.

    Args:
        g: gradient with respect to the layer output; dimension 0 is
            treated as the batch.
        classname: class name of the layer ('Linear', 'Conv2d' or 'AddBias').
        layer_info: unused here; kept so the signature mirrors
            ``compute_cov_a``.
        fast_cnn: for 'Conv2d', sum the gradient over all trailing
            dimensions instead of keeping one row per position.

    Returns:
        A square 2-D tensor equal to ``g_^T g_ / rows`` with
        ``g_ = g * batch_size`` and ``rows`` the row count of the
        reshaped gradient.
    """
    batch_size = g.size(0)

    if classname == 'Conv2d':
        if fast_cnn:
            g = g.view(g.size(0), g.size(1), -1).sum(-1)
        else:
            g = g.transpose(1, 2).transpose(2, 3).contiguous()
            g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2))
    elif classname == 'AddBias':
        g = g.view(g.size(0), g.size(1), -1).sum(-1)

    g_ = g * batch_size
    return g_.t() @ (g_ / g.size(0))


def update_running_stat(aa, m_aa, momentum):
    """Update ``m_aa`` in place to ``momentum * m_aa + (1 - momentum) * aa``.

    The three in-place steps leave ``aa`` unchanged and avoid allocating
    temporary tensors.
    """
    m_aa *= momentum / (1 - momentum)
    m_aa += aa
    m_aa *= (1 - momentum)


def _symmetric_eig(mat):
    """Return ``(eigenvalues, eigenvectors)`` of a symmetric matrix.

    Uses ``torch.linalg.eigh`` when this PyTorch build provides it and
    falls back to ``torch.symeig`` otherwise.
    """
    linalg = getattr(torch, 'linalg', None)
    if linalg is not None and hasattr(linalg, 'eigh'):
        eigenvalues, eigenvectors = linalg.eigh(mat, UPLO='U')
        return eigenvalues, eigenvectors
    return torch.symeig(mat, eigenvectors=True)


class KFACOptimizer(optim.Optimizer):
    """Optimizer that preconditions gradients with Kronecker factors.

    Forward-pre and backward hooks on every supported direct child of
    ``model`` keep running averages of the input and output-gradient
    covariances. ``step`` preconditions each gradient with their
    eigendecompositions, rescales the result according to ``kl_clip`` and
    hands it to an inner SGD-with-momentum optimizer.

    Output-gradient statistics are recorded only while ``acc_stats`` is
    True; the caller toggles that flag around the backward pass it wants
    sampled.
    """

    def __init__(self,
                 model,
                 lr=0.25,
                 momentum=0.9,
                 stat_decay=0.99,
                 kl_clip=0.001,
                 damping=1e-2,
                 weight_decay=0,
                 fast_cnn=False,
                 Ts=1,
                 Tf=10):
        """Hook ``model`` and set up the inner SGD optimizer.

        Args:
            model: module whose direct children are 'Linear', 'Conv2d' or
                'AddBias' layers (or hold no parameters).
            lr: learning rate; also used in the ``kl_clip`` rescaling.
            momentum: momentum of the inner SGD optimizer.
            stat_decay: decay of the running covariance averages.
            kl_clip: bound used to rescale the preconditioned update.
            damping: added to the eigenvalue products before inversion.
            weight_decay: L2 coefficient added to gradients and damping.
            fast_cnn: passed through to the covariance helpers.
            Ts: input statistics are refreshed every ``Ts`` steps.
            Tf: eigendecompositions are refreshed every ``Tf`` steps.
        """
        defaults = dict()
        super(KFACOptimizer, self).__init__(model.parameters(), defaults)

        self.known_modules = {'Linear', 'Conv2d', 'AddBias'}

        self.modules = []
        self.grad_outputs = {}

        # Must exist before any backward pass reaches the hooks.
        self.acc_stats = False

        self.model = model
        self._prepare_model()

        self.steps = 0

        self.m_aa, self.m_gg = {}, {}
        self.Q_a, self.Q_g = {}, {}
        self.d_a, self.d_g = {}, {}

        self.momentum = momentum
        self.stat_decay = stat_decay

        self.lr = lr
        self.kl_clip = kl_clip
        self.damping = damping
        self.weight_decay = weight_decay

        self.fast_cnn = fast_cnn

        self.Ts = Ts
        self.Tf = Tf

        self.optim = optim.SGD(
            model.parameters(),
            lr=self.lr * (1 - self.momentum),
            momentum=self.momentum)

    @staticmethod
    def _layer_info(module, classname):
        """Return ``(kernel_size, stride, padding)`` for Conv2d, else None."""
        if classname == 'Conv2d':
            return (module.kernel_size, module.stride, module.padding)
        return None

    def _save_input(self, module, input):
        """Forward-pre hook: fold the input covariance into ``m_aa``."""
        if not input[0].volatile and self.steps % self.Ts == 0:
            classname = module.__class__.__name__
            layer_info = self._layer_info(module, classname)

            aa = compute_cov_a(input[0].data, classname, layer_info,
                               self.fast_cnn)

            # Initialize buffers (also when a module is first seen late).
            if self.steps == 0 or module not in self.m_aa:
                self.m_aa[module] = aa.clone()

            update_running_stat(aa, self.m_aa[module], self.stat_decay)

    def _save_grad_output(self, module, grad_input, grad_output):
        """Backward hook: fold the output-gradient covariance into ``m_gg``."""
        if self.acc_stats:
            classname = module.__class__.__name__
            layer_info = self._layer_info(module, classname)

            gg = compute_cov_g(grad_output[0].data, classname,
                               layer_info, self.fast_cnn)

            # Initialize buffers (also when a module is first seen late).
            if self.steps == 0 or module not in self.m_gg:
                self.m_gg[module] = gg.clone()

            update_running_stat(gg, self.m_gg[module], self.stat_decay)

    def _prepare_model(self):
        """Register the statistics hooks on every supported child module.

        Raises:
            AssertionError: a Linear/Conv2d child carries its own bias.
            NotImplementedError: an unsupported child owns parameters.
        """
        for module in self.model.children():
            classname = module.__class__.__name__
            if classname in self.known_modules:
                assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \
                    "You must have a bias as a separate layer"

                self.modules.append(module)
                module.register_forward_pre_hook(self._save_input)
                module.register_backward_hook(self._save_grad_output)
            elif len(list(module.parameters())) > 0:
                raise NotImplementedError(
                    'Layer {} is not supported'.format(classname))

    def _refresh_eigen(self, m):
        """Recompute the eigendecompositions of both factors of module ``m``."""
        for stats, d_store, q_store in ((self.m_aa, self.d_a, self.Q_a),
                                        (self.m_gg, self.d_g, self.Q_g)):
            # Decompose on the CPU in double precision, then return the
            # result to wherever (and in whatever type) the statistics live.
            d, q = _symmetric_eig(stats[m].cpu().double())
            d = d.type_as(stats[m])
            q = q.type_as(stats[m])
            # Zero out eigenvalues that are numerically negligible.
            d.mul_((d > 1e-6).type_as(d))
            d_store[m], q_store[m] = d, q

    def step(self):
        """Precondition the current gradients and apply one SGD step."""
        # Add weight decay
        if self.weight_decay > 0:
            for p in self.model.parameters():
                p.grad.data.add_(p.data * self.weight_decay)

        updates = {}
        for m in self.modules:
            assert len(list(m.parameters())
                       ) == 1, "Can handle only one parameter at the moment"
            classname = m.__class__.__name__
            p = next(m.parameters())

            la = self.damping + self.weight_decay

            if self.steps % self.Tf == 0:
                self._refresh_eigen(m)

            if classname == 'Conv2d':
                p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1)
            else:
                p_grad_mat = p.grad.data

            # Rotate into the eigenbasis, divide by the damped eigenvalue
            # products, rotate back.
            v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m]
            v2 = v1 / (
                self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la)
            v = self.Q_g[m] @ v2 @ self.Q_a[m].t()

            updates[p] = v.view(p.grad.data.size())

        vg_sum = 0
        for p in self.model.parameters():
            v = updates[p]
            vg_sum += (v * p.grad.data * self.lr * self.lr).sum()

        # A non-positive sum (e.g. all-zero gradients) leaves the update
        # unscaled instead of dividing by zero.
        vg_sum = float(vg_sum)
        nu = min(1, math.sqrt(self.kl_clip / vg_sum)) if vg_sum > 0 else 1

        for p in self.model.parameters():
            v = updates[p]
            p.grad.data.copy_(v)
            p.grad.data.mul_(nu)

        self.optim.step()
        self.steps += 1
'ppo', 'acktr'] if args.algo == 'ppo': assert args.num_processes * args.num_steps % args.batch_size == 0 args.cuda = not args.no_cuda and torch.cuda.is_available() @@ -117,6 +118,8 @@ def main(): optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), eps=args.eps) + elif args.algo == 'acktr': + optimizer = KFACOptimizer(actor_critic) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, obs_shape[1], obs_shape[2]) @@ -205,7 +208,7 @@ def main(): returns[step] = returns[step + 1] * \ args.gamma * masks[step] + rewards[step] - if args.algo == 'a2c': + if args.algo in ['a2c', 'acktr']: # Reshape to do in a single forward pass for all steps values, logits = actor_critic(Variable(states[:-1].view(-1, *states.size()[-3:]))) log_probs = F.log_softmax(logits) @@ -228,10 +231,29 @@ def main(): action_loss = -(Variable(advantages.data) * action_log_probs).mean() + if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: + # Sampled fisher, see Martens 2014 + actor_critic.zero_grad() + pg_fisher_loss = -action_log_probs.mean() + + value_noise = Variable(torch.randn(values[:-1].size())) + if args.cuda: + value_noise = value_noise.cuda() + + sample_values = values[:-1] + value_noise + vf_fisher_loss = - (values[:-1] - Variable(sample_values.data)).pow(2).mean() + + fisher_loss = pg_fisher_loss + vf_fisher_loss + optimizer.acc_stats = True + fisher_loss.backward(retain_graph=True) + optimizer.acc_stats = False + optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() - nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) + if args.algo == 'a2c': + nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) + optimizer.step() elif args.algo == 'ppo': advantages = returns[:-1] - value_preds[:-1] diff --git a/model.py b/model.py index 
dbcd4e8..c4e6565 100755 --- a/model.py +++ b/model.py @@ -13,18 +13,40 @@ def weights_init(m): m.bias.data.fill_(0) +# Necessary for my KFAC implementation. +class AddBias(nn.Module): + def __init__(self, out_features): + super(AddBias, self).__init__() + self.bias = nn.Parameter(torch.zeros(out_features, 1)) + + def forward(self, x): + if x.dim() == 2: + bias = self.bias.t().view(1, -1) + else: + bias = self.bias.t().view(1, -1, 1, 1) + + return x + bias + + class ActorCritic(torch.nn.Module): def __init__(self, num_inputs, action_space): super(ActorCritic, self).__init__() - self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4) - self.conv2 = nn.Conv2d(32, 64, 4, stride=2) - self.conv3 = nn.Conv2d(64, 64, 3, stride=1) + self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4, bias=False) + self.ab1 = AddBias(32) + self.conv2 = nn.Conv2d(32, 64, 4, stride=2, bias=False) + self.ab2 = AddBias(64) + self.conv3 = nn.Conv2d(64, 32, 3, stride=1, bias=False) + self.ab3 = AddBias(32) - self.linear1 = nn.Linear(64 * 7 * 7, 512) + self.linear1 = nn.Linear(32 * 7 * 7, 512, bias=False) + self.ab_fc1 = AddBias(512) num_outputs = action_space.n - self.critic_linear = nn.Linear(512, 1) - self.actor_linear = nn.Linear(512, num_outputs) + self.critic_linear = nn.Linear(512, 1, bias=False) + self.ab_fc2 = AddBias(1) + + self.actor_linear = nn.Linear(512, num_outputs, bias=False) + self.ab_fc3 = AddBias(num_outputs) self.apply(weights_init) @@ -37,16 +59,21 @@ class ActorCritic(torch.nn.Module): def forward(self, inputs): x = self.conv1(inputs / 255.0) + x = self.ab1(x) x = F.relu(x) x = self.conv2(x) + x = self.ab2(x) x = F.relu(x) x = self.conv3(x) + x = self.ab3(x) x = F.relu(x) - x = x.view(-1, 64 * 7 * 7) + x = x.view(-1, 32 * 7 * 7) x = self.linear1(x) + x = self.ab_fc1(x) x = F.relu(x) - return self.critic_linear(x), self.actor_linear(x) + return self.ab_fc2(self.critic_linear(x)), self.ab_fc3( + self.actor_linear(x))