Added OpenAIAdam optimizer

This commit is contained in:
thomwolf
2018-06-14 13:19:07 +02:00
parent 0b730579c5
commit 0704c84ab9
4 changed files with 112 additions and 29 deletions
-1
View File
@@ -6,7 +6,6 @@ import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
def gelu(x):
+90 -9
View File
@@ -1,23 +1,104 @@
import math
import numpy as np
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm
def warmup_cosine(x, warmup=0.002):
    """Learning-rate multiplier: linear warm-up followed by cosine decay.

    Args:
        x: Training progress as a fraction of the total number of updates
            (``OpenAIAdam.step`` passes ``step / t_total``).
        warmup: Fraction of training spent warming up. A value of ``0``
            (or less) disables the warm-up phase.

    Returns:
        ``x / warmup`` while ``x <= warmup``; afterwards
        ``0.5 * (1 + cos(pi * x))``.
    """
    # Only divide while actually warming up, so ``warmup=0`` cannot trigger a
    # ZeroDivisionError once the schedule is past the warm-up phase.
    if warmup > 0 and x <= warmup:
        return x / warmup
    # ``math.cos`` rather than ``torch.cos``: ``x`` is a plain Python number
    # here and ``torch.cos`` only accepts tensors.
    return 0.5 * (1 + math.cos(math.pi * x))
def warmup_constant(x, warmup=0.002):
    """Learning-rate multiplier: linear warm-up, then constant at 1.

    Args:
        x: Training progress as a fraction of the total number of updates
            (``OpenAIAdam.step`` passes ``step / t_total``).
        warmup: Fraction of training spent warming up. A value of ``0``
            (or less) disables the warm-up phase.

    Returns:
        ``x / warmup`` while ``x <= warmup``, otherwise ``1.0``.
    """
    # Only divide while actually warming up, so ``warmup=0`` cannot trigger a
    # ZeroDivisionError once the schedule is past the warm-up phase.
    if warmup > 0 and x <= warmup:
        return x / warmup
    return 1.0
def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier: linear warm-up combined with linear decay.

    Args:
        x: Training progress as a fraction of the total number of updates
            (``OpenAIAdam.step`` passes ``step / t_total``).
        warmup: Fraction of training spent warming up. A value of ``0``
            (or less) disables the warm-up phase.

    Returns:
        ``(x / warmup) * (1 - x)`` while ``x <= warmup``, otherwise
        ``1 - x``.
    """
    decay = 1 - x
    # Only divide while actually warming up, so ``warmup=0`` cannot trigger a
    # ZeroDivisionError once the schedule is past the warm-up phase.
    if warmup > 0 and x <= warmup:
        return (x / warmup) * decay
    return decay
schedules = {
# Registry of learning-rate schedules, keyed by the name that
# ``OpenAIAdam`` accepts as its ``schedule`` argument.
SCHEDULES = {
    'warmup_cosine': warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear': warmup_linear,
}
def adam(params, grads, lr, schedule, t_total, b1=0.9, b2=0.999, e=1e-8, l2=0, vector_l2=False, max_grad_norm=-1, **kwargs):
class OpenAIAdam(Optimizer):
    """Implements Open AI version of Adam algorithm with weight decay fix.

    The effective learning rate at every step is
    ``lr * SCHEDULES[schedule](step / t_total, warmup)``. Weight decay is
    applied directly to the parameters after the Adam update (scaled by the
    scheduled learning rate) instead of being added to the gradient.

    Arguments:
        params: Iterable of parameters (or parameter-group dicts) to optimize.
        lr: Base learning rate; must be non-negative.
        schedule: Name of the schedule to use; must be a key of ``SCHEDULES``.
        warmup: Warm-up argument forwarded to the schedule function; must be
            non-negative.
        t_total: Total number of update steps, used to turn the step count
            into the progress value given to the schedule.
        b1: Decay coefficient of the first-moment running average, in [0, 1).
        b2: Decay coefficient of the second-moment running average, in [0, 1).
        e: Term added to the denominator; must be non-negative.
        l2: Weight-decay coefficient; decay is skipped unless it is positive.
        vector_l2: If true, also decay parameters that have a single
            dimension; otherwise only parameters with more than one
            dimension are decayed.
        max_grad_norm: If positive, each parameter's gradient is clipped to
            this norm before the update.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: If ``lr``, ``schedule``, ``warmup``, ``b1``, ``b2`` or
            ``e`` is outside its valid range.
    """

    def __init__(self, params, lr, schedule, warmup, t_total,
                 b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0 <= warmup:
            raise ValueError("Invalid warmup: {}".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {}".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {}".format(b2))
        if not 0.0 <= e:
            raise ValueError("Invalid epsilon value: {}".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                        max_grad_norm=max_grad_norm)
        super(OpenAIAdam, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            is given.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['b1'], group['b2']

                state['step'] += 1

                # Add grad clipping. The call receives this single parameter,
                # so the norm is computed per parameter, not over the group.
                if group['max_grad_norm'] > 0:
                    clip_grad_norm(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                # The epsilon default is stored under the key 'e' (see
                # __init__); looking up 'eps' here would raise KeyError.
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Scale the base learning rate by the schedule evaluated at
                # the current training progress.
                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

                # Add weight decay at the end (fixed version)
                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(-lr_scheduled * group['l2'], p.data)

        return loss
+3 -3
View File
@@ -27,9 +27,9 @@ def text_standardize(text):
text = text.replace('', '-')
text = text.replace('', '...')
text = text.replace('´', "'")
text = re.sub('''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
text = re.sub('\s*\n\s*', ' \n ', text)
text = re.sub('[^\S\n]+', ' ', text)
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
text = re.sub(r'\s*\n\s*', ' \n ', text)
text = re.sub(r'[^\S\n]+', ' ', text)
return text.strip()
class TextEncoder(object):
+19 -16
View File
@@ -18,29 +18,20 @@ from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from model_py import Model, LMHead, ClfHead, load_openai_pretrained_model
from opt import adam, warmup_cosine, warmup_linear, warmup_constant
from opt import OpenAIAdam
from datasets import rocstories
from analysis import rocstories as rocstories_analysis
from text_utils import TextEncoder
from utils import (encode_dataset, flatten, iter_data,
ResultLogger, make_path)
OPT_FNS = {
'adam':adam,
}
LR_SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_linear':warmup_linear,
'warmup_constant':warmup_constant,
}
class LossCompute:
"A Loss compute and train function."
def __init__(self, lm_criterion, clf_criterion, lm_coef):
def __init__(self, lm_criterion, clf_criterion, lm_coef, opt=None):
self.lm_criterion = lm_criterion
self.clf_criterion = clf_criterion
self.lm_coef = lm_coef
self.opt = opt
def __call__(self, X, Y, M, lm_logits, clf_logits):
# Language modeling loss
@@ -53,11 +44,18 @@ class LossCompute:
# Classification loss
clf_losses = self.clf_criterion(clf_logits, Y)
if self.lm_coef > 0:
train_loss = clf_losses.sum() + self.lm_coef * lm_losses.sum()
else:
train_loss = clf_losses.sum()
return train_loss
train_loss.backward()
if self.opt is not None:
self.opt.step()
self.opt.optimizer.zero_grad()
return train_loss.item()
def transform_roc(X1, X2, X3):
n_batch = len(X1)
@@ -229,7 +227,14 @@ if __name__ == '__main__':
model = Model(vocab, args)
lm_head = LMHead(model, args)
clf_head = ClfHead(clf_token, args)
compute_loss = LossCompute(nn.CrossEntropyLoss(reduce=False), nn.CrossEntropyLoss(reduce=False), lm_coef) # TODO check loss functions
criterion = nn.CrossEntropyLoss(reduce=False) # TODO check loss functions
model_opt = OpenAIAdam(model.parameters(), lr=lr, schedule=lr_schedule,
warmup=lr_warmup, t_total=n_updates_total, b1=b1,
b2=b2, e=e, l2=l2, vector_l2=vector_l2,
max_grad_norm=max_grad_norm)
compute_loss = LossCompute(criterion, criterion, lm_coef, model_opt)
# TODO Initialize model (?)
# TODO add train() and eval()
load_openai_pretrained_model(model, n_ctx, n_special, args)
@@ -258,8 +263,6 @@ if __name__ == '__main__':
lm_logits = lm_head(h)
clf_logits = clf_head(h, XMB)
loss = compute_loss(XMB, YMB, MMB, lm_logits, clf_logits)
loss.backward()
n_updates += 1
#if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
# log()