mirror of
https://github.com/wassname/openai-transformer-lm-gutenberg-erotic.git
synced 2026-06-27 16:10:19 +08:00
added openAIAdam optimizer
This commit is contained in:
@@ -6,7 +6,6 @@ import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
def gelu(x):
|
||||
|
||||
@@ -1,23 +1,104 @@
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
from torch.nn.utils import clip_grad_norm
|
||||
|
||||
def warmup_cosine(x, warmup=0.002):
    """Learning-rate multiplier: linear warmup followed by cosine decay.

    Args:
        x: Training progress as a plain number; the optimizer in this file
            passes ``step / t_total``.
        warmup: Progress value up to which the multiplier ramps linearly
            as ``x / warmup``.

    Returns:
        ``x / warmup`` while ``x <= warmup``, otherwise
        ``0.5 * (1 + cos(pi * x))``.

    Raises:
        ZeroDivisionError: only if ``warmup == 0`` and ``x <= 0``.
    """
    # Branch instead of blending both terms arithmetically so that the
    # division is only evaluated during the warmup phase (warmup == 0 is a
    # legal "no warmup" setting and must not divide by zero afterwards).
    if x <= warmup:
        return x / warmup
    # math.cos, not torch.cos: x is a Python number here and torch.cos only
    # accepts tensors.
    return 0.5 * (1 + math.cos(math.pi * x))
|
||||
|
||||
def warmup_constant(x, warmup=0.002):
    """Learning-rate multiplier: linear warmup, then constant at 1.

    Args:
        x: Training progress as a plain number; the optimizer in this file
            passes ``step / t_total``.
        warmup: Progress value up to which the multiplier ramps linearly
            as ``x / warmup``.

    Returns:
        ``x / warmup`` while ``x <= warmup``, otherwise ``1.0``.

    Raises:
        ZeroDivisionError: only if ``warmup == 0`` and ``x <= 0``.
    """
    # Only divide while still warming up, so warmup == 0 ("no warmup") does
    # not raise once x is past the warmup point.
    if x <= warmup:
        return x / warmup
    return 1.0
|
||||
|
||||
def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier: linear warmup, then linear decay to zero.

    Args:
        x: Training progress as a plain number; the optimizer in this file
            passes ``step / t_total``.
        warmup: Progress value up to which the ramp factor is
            ``x / warmup``.

    Returns:
        ``(x / warmup) * (1 - x)`` while ``x <= warmup``, otherwise
        ``1 - x``.

    Raises:
        ZeroDivisionError: only if ``warmup == 0`` and ``x <= 0``.
    """
    # Only divide while still warming up, so warmup == 0 ("no warmup") does
    # not raise once x is past the warmup point.
    ramp = x / warmup if x <= warmup else 1.0
    return ramp * (1 - x)
|
||||
|
||||
schedules = {
|
||||
# Maps the schedule name accepted by the optimizer to the function that
# turns training progress into a learning-rate multiplier.
SCHEDULES = {
    'warmup_cosine': warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear': warmup_linear,
}
|
||||
|
||||
def adam(params, grads, lr, schedule, t_total, b1=0.9, b2=0.999, e=1e-8, l2=0, vector_l2=False, max_grad_norm=-1, **kwargs):
|
||||
|
||||
class OpenAIAdam(Optimizer):
    """Implements Open AI version of Adam algorithm with weight decay fix.

    The learning rate is multiplied by a schedule function (looked up by
    name in ``SCHEDULES``) evaluated at ``step / t_total``. Weight decay is
    applied directly to the parameters after the Adam update rather than
    being added to the gradient.

    Arguments:
        params: iterable of parameters or dicts defining parameter groups.
        lr: base learning rate (must be >= 0).
        schedule: key into ``SCHEDULES`` selecting the schedule function.
        warmup: warmup value forwarded to the schedule function (>= 0).
        t_total: divisor turning the per-parameter step count into the
            progress value handed to the schedule function.
        b1, b2: decay rates of the first / second moment running averages,
            each in [0, 1).
        e: term added to the denominator of the update (must be >= 0).
        l2: weight decay coefficient; decay is skipped unless it is > 0.
        vector_l2: if true, also decay parameters with at most one
            dimension (they are exempt otherwise).
        max_grad_norm: if > 0, each parameter's gradient is clipped to
            this norm before the update.
        **kwargs: accepted and ignored.

    Raises:
        ValueError: if ``lr``, ``schedule``, ``warmup``, ``b1``, ``b2`` or
            ``e`` is outside its valid range.
    """

    def __init__(self, params, lr, schedule, warmup, t_total,
                 b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0 <= warmup:
            raise ValueError("Invalid warmup: {}".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {}".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {}".format(b2))
        if not 0.0 <= e:
            raise ValueError("Invalid epsilon value: {}".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                        max_grad_norm=max_grad_norm)
        super(OpenAIAdam, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None when no closure is
            given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            # Per-group settings do not change across parameters; read once.
            beta1, beta2 = group['b1'], group['b2']
            schedule_fct = SCHEDULES[group['schedule']]

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                state['step'] += 1

                # Add grad clipping (per parameter; rescales p.grad in place,
                # which `grad` shares storage with)
                if group['max_grad_norm'] > 0:
                    clip_grad_norm(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                # The epsilon hyper-parameter is stored under 'e' in the
                # group defaults (there is no 'eps' key).
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                lr_scheduled = group['lr'] * schedule_fct(state['step'] / group['t_total'], group['warmup'])
                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

                # Add weight decay at the end (fixed version)
                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(p.data, alpha=-lr_scheduled * group['l2'])

        return loss
|
||||
|
||||
+3
-3
@@ -27,9 +27,9 @@ def text_standardize(text):
|
||||
text = text.replace('―', '-')
|
||||
text = text.replace('…', '...')
|
||||
text = text.replace('´', "'")
|
||||
text = re.sub('''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
|
||||
text = re.sub('\s*\n\s*', ' \n ', text)
|
||||
text = re.sub('[^\S\n]+', ' ', text)
|
||||
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
|
||||
text = re.sub(r'\s*\n\s*', ' \n ', text)
|
||||
text = re.sub(r'[^\S\n]+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
class TextEncoder(object):
|
||||
|
||||
@@ -18,29 +18,20 @@ from sklearn.utils import shuffle
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
from model_py import Model, LMHead, ClfHead, load_openai_pretrained_model
|
||||
from opt import adam, warmup_cosine, warmup_linear, warmup_constant
|
||||
from opt import OpenAIAdam
|
||||
from datasets import rocstories
|
||||
from analysis import rocstories as rocstories_analysis
|
||||
from text_utils import TextEncoder
|
||||
from utils import (encode_dataset, flatten, iter_data,
|
||||
ResultLogger, make_path)
|
||||
|
||||
OPT_FNS = {
|
||||
'adam':adam,
|
||||
}
|
||||
|
||||
LR_SCHEDULES = {
|
||||
'warmup_cosine':warmup_cosine,
|
||||
'warmup_linear':warmup_linear,
|
||||
'warmup_constant':warmup_constant,
|
||||
}
|
||||
|
||||
class LossCompute:
|
||||
"A Loss compute and train function."
|
||||
def __init__(self, lm_criterion, clf_criterion, lm_coef):
|
||||
def __init__(self, lm_criterion, clf_criterion, lm_coef, opt=None):
|
||||
self.lm_criterion = lm_criterion
|
||||
self.clf_criterion = clf_criterion
|
||||
self.lm_coef = lm_coef
|
||||
self.opt = opt
|
||||
|
||||
def __call__(self, X, Y, M, lm_logits, clf_logits):
|
||||
# Language modeling loss
|
||||
@@ -53,11 +44,18 @@ class LossCompute:
|
||||
|
||||
# Classification loss
|
||||
clf_losses = self.clf_criterion(clf_logits, Y)
|
||||
|
||||
if self.lm_coef > 0:
|
||||
train_loss = clf_losses.sum() + self.lm_coef * lm_losses.sum()
|
||||
else:
|
||||
train_loss = clf_losses.sum()
|
||||
return train_loss
|
||||
|
||||
train_loss.backward()
|
||||
if self.opt is not None:
|
||||
self.opt.step()
|
||||
self.opt.optimizer.zero_grad()
|
||||
return train_loss.item()
|
||||
|
||||
|
||||
def transform_roc(X1, X2, X3):
|
||||
n_batch = len(X1)
|
||||
@@ -229,7 +227,14 @@ if __name__ == '__main__':
|
||||
model = Model(vocab, args)
|
||||
lm_head = LMHead(model, args)
|
||||
clf_head = ClfHead(clf_token, args)
|
||||
compute_loss = LossCompute(nn.CrossEntropyLoss(reduce=False), nn.CrossEntropyLoss(reduce=False), lm_coef) # TODO check loss functions
|
||||
|
||||
criterion = nn.CrossEntropyLoss(reduce=False) # TODO check loss functions
|
||||
model_opt = OpenAIAdam(model.parameters(), lr=lr, schedule=lr_schedule,
|
||||
warmup=lr_warmup, t_total=n_updates_total, b1=b1,
|
||||
b2=b2, e=e, l2=l2, vector_l2=vector_l2,
|
||||
max_grad_norm=max_grad_norm)
|
||||
|
||||
compute_loss = LossCompute(criterion, criterion, lm_coef, model_opt)
|
||||
# TODO Initialize model (?)
|
||||
# TODO add train() and eval()
|
||||
load_openai_pretrained_model(model, n_ctx, n_special, args)
|
||||
@@ -258,8 +263,6 @@ if __name__ == '__main__':
|
||||
lm_logits = lm_head(h)
|
||||
clf_logits = clf_head(h, XMB)
|
||||
loss = compute_loss(XMB, YMB, MMB, lm_logits, clf_logits)
|
||||
loss.backward()
|
||||
|
||||
n_updates += 1
|
||||
#if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
|
||||
# log()
|
||||
|
||||
Reference in New Issue
Block a user