mirror of
https://github.com/wassname/openai-transformer-lm-gutenberg-erotic.git
synced 2026-06-27 16:10:19 +08:00
code
This commit is contained in:
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2018 OpenAI
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,9 @@
|
||||
# finetune-transformer-lm
|
||||
Code and model for the paper "Improving Language Understanding by Generative Pre-Training"
|
||||
|
||||
Currently this code implements the ROCStories Cloze Test result reported in the paper by running:
|
||||
`python train.py --dataset rocstories --desc rocstories --submit --analysis --data_dir [path to data here]`
|
||||
|
||||
Note: The code is currently non-deterministic due to various GPU ops. The median accuracy of 10 runs with this codebase (using default hyperparameters) is 85.8% - slightly lower than the reported single run of 86.5% from the paper.
|
||||
|
||||
The ROCStories dataset can be downloaded from the associated [website](http://cs.rochester.edu/nlp/rocstories/).
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
from datasets import _rocstories
|
||||
|
||||
def rocstories(data_dir, pred_path, log_path):
    """Print the best validation accuracy and the test accuracy for ROCStories.

    Args:
        data_dir: Directory containing the ROCStories cloze test CSV.
        pred_path: Tab-separated predictions file with a ``prediction`` column.
        log_path: JSON-lines training log. The first line is skipped and every
            remaining record must contain a ``va_acc`` entry.
    """
    preds = pd.read_csv(pred_path, delimiter='\t')['prediction'].values.tolist()
    _, _, _, labels = _rocstories(os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
    test_accuracy = accuracy_score(labels, preds)*100.
    # Read the log inside a context manager so the handle is always closed.
    with open(log_path) as log_file:
        logs = [json.loads(line) for line in log_file][1:]
    best_validation_index = np.argmax([log['va_acc'] for log in logs])
    valid_accuracy = logs[best_validation_index]['va_acc']
    print('ROCStories Valid Accuracy: %.2f'%(valid_accuracy))
    print('ROCStories Test Accuracy: %.2f'%(test_accuracy))
|
||||
+51
@@ -0,0 +1,51 @@
|
||||
import os
|
||||
import csv
|
||||
import numpy as np
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
# Fixed random_state passed to train_test_split in rocstories() below.
seed = 3535999445
|
||||
|
||||
def _rocstories(path):
    """Parse a ROCStories cloze CSV file.

    The first row is treated as a header and skipped. For every other row,
    columns 1-4 are joined with spaces to form the story, columns 5 and 6 are
    the two candidate endings, and the last column is the 1-based index of the
    correct ending.

    Args:
        path: Path of the CSV file.

    Returns:
        A tuple ``(stories, endings1, endings2, labels)`` of equally long
        lists; ``labels`` holds 0-based ints.
    """
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    st = []
    ct1 = []
    ct2 = []
    y = []
    for line in tqdm(rows[1:], ncols=80, leave=False):
        st.append(' '.join(line[1:5]))
        ct1.append(line[5])
        ct2.append(line[6])
        y.append(int(line[-1])-1)
    return st, ct1, ct2, y
|
||||
|
||||
def rocstories(data_dir, n_train=1497, n_valid=374):
    """Load ROCStories and split the labelled file into train and validation.

    Args:
        data_dir: Directory holding the two cloze test CSV files.
        n_train: Accepted for interface compatibility; not used by the body.
        n_valid: Number of examples held out for validation.

    Returns:
        ``(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)``
        where the X entries are lists of strings and the Y entries are int32
        numpy arrays.
    """
    val_csv = os.path.join(data_dir, 'cloze_test_val__spring2016 - cloze_test_ALL_val.csv')
    test_csv = os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv')
    storys, comps1, comps2, ys = _rocstories(val_csv)
    teX1, teX2, teX3, _ = _rocstories(test_csv)

    parts = train_test_split(storys, comps1, comps2, ys, test_size=n_valid, random_state=seed)
    tr_storys, va_storys, tr_comps1, va_comps1, tr_comps2, va_comps2, tr_ys, va_ys = parts

    # Fresh list copies of each split, as the element-wise append loops produced.
    trX1, trX2, trX3 = list(tr_storys), list(tr_comps1), list(tr_comps2)
    vaX1, vaX2, vaX3 = list(va_storys), list(va_comps1), list(va_comps2)
    trY = np.asarray(list(tr_ys), dtype=np.int32)
    vaY = np.asarray(list(va_ys), dtype=np.int32)
    return (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)
|
||||
@@ -0,0 +1,163 @@
|
||||
import numpy as np
|
||||
import math
|
||||
import copy
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
# NOTE(review): n_vocab, n_special and n_ctx are not defined or imported in the
# visible part of this module, so this line looks like it raises NameError at
# import time — TODO confirm where these names are expected to come from.
vocab = n_vocab + n_special + n_ctx
|
||||
|
||||
def gelu(x):
    """Gaussian Error Linear Unit, tanh approximation, applied element-wise.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.
    """
    return 0.5*x*(1+torch.tanh(math.sqrt(2/math.pi)*(x+0.044715*torch.pow(x, 3))))


def swish(x):
    """Swish activation, ``x * sigmoid(x)``, applied element-wise."""
    return x*torch.sigmoid(x)


# Activation functions by name. ``torch.nn`` exposes the ``ReLU`` module class
# but no ``relu`` function, so the functional form is used here.
ACT_FNS = {
    'relu': F.relu,
    'swish': swish,
    'gelu': gelu
}
|
||||
|
||||
def clones(module, N):
    """Return an ``nn.ModuleList`` of N independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learned gain and bias."""

    def __init__(self, n_state, eps=1e-6):
        """Create gain ``g`` (ones) and bias ``b`` (zeros) of size ``n_state``."""
        super(LayerNorm, self).__init__()
        self.eps = eps
        self.g = nn.Parameter(torch.ones(n_state))
        self.b = nn.Parameter(torch.zeros(n_state))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        # One difference with the TF version here: we add epsilon outside of sqrt
        spread = x.std(-1, keepdim=True) + self.eps
        return self.g * centered / spread + self.b
|
||||
|
||||
|
||||
class Conv1D(nn.Module):
    """Position-wise affine map (a "1x1 convolution") over the last dimension.

    Args:
        nf: Number of output features.
        rf: Receptive field; only ``1`` is implemented.
        nx: Number of input features.

    Raises:
        NotImplementedError: If ``rf`` is not 1.
    """

    def __init__(self, nf, rf, nx):
        super(Conv1D, self).__init__()
        self.rf = rf
        # forward() needs the output width to rebuild the result shape.
        self.nf = nf
        if rf == 1:  # faster 1x1 conv
            # Random normal init: all-ones weights would make every output
            # unit identical. std 0.02 mirrors the embedding initialisers used
            # elsewhere in this code base.
            w = torch.empty(nx, nf)
            nn.init.normal_(w, std=0.02)
            self.w = Parameter(w)
            self.b = Parameter(torch.zeros(nf))
        else:  # was used to train LM
            raise NotImplementedError

    def forward(self, x):
        """Apply ``x @ w + b`` to every position; all leading dims are kept."""
        if self.rf == 1:
            # torch.Size is a tuple, so it must be extended with a tuple.
            size_out = x.size()[:-1] + (self.nf,)
            x = torch.addmm(self.b, x.reshape(-1, x.size(-1)), self.w)
            x = x.view(*size_out)
        else:
            raise NotImplementedError
        return x
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """Masked (causal) multi-head self-attention.

    Args:
        nx: Number of input features.
        n_state: Total width of the query/key/value projections; must be
            divisible by ``n_head``.
        n_head: Number of attention heads.
        attn_pdrop: Dropout probability applied to the attention weights.
        resid_pdrop: Dropout probability applied to the projected output.
        scale: If true, divide attention logits by ``sqrt(head_dim)``.
    """

    def __init__(self, nx, n_state, n_head, attn_pdrop, resid_pdrop, scale=False):
        super(Attention, self).__init__()
        self.c_attn = Conv1D(n_state*3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.scale = scale
        self.n_head = n_head
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)

    @staticmethod
    def mask_attn_weights(w):
        """Push logits for future positions (above the diagonal) to -1e9."""
        n = w.size(-1)
        # Build the lower-triangular mask as a tensor matching w's dtype and
        # device so it broadcasts against the (batch, head, n, n) logits.
        b = torch.tril(torch.ones(n, n, dtype=w.dtype, device=w.device)).view(1, 1, n, n)
        return w * b + -1e9*(1-b)

    def _attn(self, q, k, v):
        """Attention over pre-split heads; ``k`` is already transposed."""
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        w = self.mask_attn_weights(w)
        # Normalise over the key axis; the implicit-dim softmax would pick
        # another axis for 4-D input.
        w = F.softmax(w, dim=-1)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        """Inverse of ``split_heads``: (batch, head, seq, d) -> (batch, seq, head*d)."""
        # Heads must sit next to their features before they can be flattened.
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow version: merge_states

    def split_heads(self, x, k=False):
        """Reshape (batch, seq, n_state) into per-head tensors.

        Returns (batch, head, seq, d), or (batch, head, d, seq) when ``k`` is
        true so keys are ready for ``matmul`` with the queries.
        """
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1)//self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow version: split_states
        if k:
            return x.permute(0, 2, 3, 1)
        return x.permute(0, 2, 1, 3)

    def forward(self, x):
        """Run self-attention over ``x`` and return the projected result."""
        x = self.c_attn(x)
        # Three equal slices along the feature axis; split(3, ...) would
        # instead yield many chunks of width 3.
        query, key, value = x.chunk(3, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Position-wise feed-forward layer: expand, activate, project, dropout.

    Args:
        nx: Input and output feature size.
        n_state: Hidden feature size.
        afn: Key into ``ACT_FNS`` selecting the activation.
        resid_pdrop: Dropout probability applied to the output.
    """

    def __init__(self, nx, n_state, afn, resid_pdrop):
        super(MLP, self).__init__()
        self.c_fc = Conv1D(n_state, 1, nx)
        # c_proj consumes the n_state-wide hidden activation and maps it back
        # to nx features, so its input width is n_state, not nx.
        self.c_proj = Conv1D(nx, 1, n_state)
        self.act = ACT_FNS[afn]
        self.dropout = nn.Dropout(resid_pdrop)

    def forward(self, x):
        """Return ``dropout(c_proj(act(c_fc(x))))``."""
        h = self.act(self.c_fc(x))
        h = self.c_proj(h)
        return self.dropout(h)
|
||||
|
||||
|
||||
class Block(nn.Module):
    """One transformer layer: self-attention and MLP, each followed by a
    residual connection and layer normalisation.

    Args:
        nx: Feature size of the layer input and output.
        n_head: Number of attention heads.
        attn_pdrop: Dropout probability on the attention weights.
        resid_pdrop: Dropout probability on the sub-layer outputs.
        afn: Key into ``ACT_FNS`` selecting the MLP activation.
        scale: Whether to scale attention logits.
    """

    def __init__(self, nx, n_head, attn_pdrop, resid_pdrop, afn, scale=False):
        super(Block, self).__init__()
        self.attn = Attention(nx, nx, n_head, attn_pdrop, resid_pdrop, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(nx, nx*4, afn, resid_pdrop)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x):
        """Apply the layer to ``x``.

        Each sub-layer output is added to its input before normalisation, so
        no intermediate result is thrown away.
        """
        a = self.attn(x)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h
|
||||
|
||||
|
||||
class Model(nn.Module):
    """ Transformer model

    Args:
        vocab: Number of rows in the embedding table.
        n_embd: Embedding size.
        pdrop: Dropout probability for ``self.drop``.
        n_layers: Number of stacked ``Block`` layers.
        nx, n_head, attn_pdrop, resid_pdrop, afn: Forwarded to each ``Block``.
    """

    def __init__(self, vocab, n_embd, pdrop, n_layers,
                 nx, n_head, attn_pdrop, resid_pdrop, afn):
        super(Model, self).__init__()
        self.embed = nn.Embedding(vocab, n_embd)
        self.drop = nn.Dropout(pdrop)
        self.blocks = clones(Block(nx, n_head, attn_pdrop,
                                   resid_pdrop, afn, scale=True), n_layers)
        # The decoder maps hidden states back onto the vocabulary and shares
        # its weight with the embedding, so its input width is n_embd.
        self.decoder = nn.Linear(n_embd, vocab, bias=False)
        self.decoder.weight = self.embed.weight

    def forward(self, x, m):
        """Embed ``x`` and run it through every block.

        ``x`` is viewed as (-1, x.size(2), x.size(3)); the embeddings of the
        entries along its last axis are summed per position. ``m`` is reshaped
        to (-1, m.size(2)) but not used further in this method.
        """
        x = x.view(-1, x.size(2), x.size(3))
        m = m.view(-1, m.size(2))
        e = self.embed(x)
        h = e.sum(dim=2)
        for block in self.blocks:
            h = block(h)
        return h
|
||||
@@ -0,0 +1,49 @@
|
||||
import math
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
def warmup_cosine(x, warmup=0.002):
    """Linear ramp ``x/warmup`` while ``x <= warmup``, then ``0.5*(1+cos(pi*x))``."""
    in_warmup = tf.cast(x <= warmup, tf.float32)
    ramp = x/warmup
    decay = 0.5 * (1 + tf.cos(math.pi * x))
    return in_warmup*ramp + (1-in_warmup)*decay
|
||||
|
||||
def warmup_constant(x, warmup=0.002):
    """Linear ramp ``x/warmup`` while ``x <= warmup``, then a constant 1."""
    in_warmup = tf.cast(x <= warmup, tf.float32)
    ramp = x/warmup
    return in_warmup*ramp + (1-in_warmup)*1
|
||||
|
||||
def warmup_linear(x, warmup=0.002):
    """Linear ramp ``x/warmup`` while ``x <= warmup`` (1 afterwards), times ``1-x``."""
    in_warmup = tf.cast(x <= warmup, tf.float32)
    ramp = x/warmup
    return (in_warmup*ramp + (1-in_warmup))*(1-x)
|
||||
|
||||
# Learning-rate schedule functions by name.
schedules = dict(
    warmup_cosine=warmup_cosine,
    warmup_constant=warmup_constant,
    warmup_linear=warmup_linear,
)
|
||||
|
||||
def adam(params, grads, lr, schedule, t_total, b1=0.9, b2=0.999, e=1e-8, l2=0, vector_l2=False, max_grad_norm=-1, **kwargs):
    """
    adam with weight decay fix

    Args:
        params: Variables to update.
        grads: Gradients, aligned with ``params``.
        lr: Base learning rate.
        schedule: Callable mapping training progress ``t/t_total`` to a
            learning-rate multiplier.
        t_total: Total number of updates, used to compute the progress.
        b1, b2, e: Adam moment decay rates and epsilon.
        l2: Weight-decay coefficient, applied inside the update step.
        vector_l2: If true, also decay variables of rank <= 1.
        max_grad_norm: If > 0, clip gradients to this global norm.

    Returns:
        A grouped op that performs one update step when run.
    """
    t = tf.Variable(0, dtype=tf.float32, trainable=False)
    tt = t+1
    updates = [t.assign(tt)]
    if max_grad_norm > 0:
        grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    # The bias-corrected, scheduled step size does not depend on the
    # parameter, so it is built once instead of once per variable.
    lrt = lr*tf.sqrt(1-b2**tt)/(1-b1**tt)
    lrt *= schedule(t/t_total)
    for p, g in zip(params, grads):
        if p is None or g is None:
            # p itself may be None here, so its name cannot be read blindly.
            print("can't train", getattr(p, 'name', p), g)
            continue
        if isinstance(g, tf.IndexedSlices):
            g = tf.convert_to_tensor(g)
        m = tf.Variable(p*0, dtype=tf.float32, trainable=False)
        v = tf.Variable(p*0, dtype=tf.float32, trainable=False)
        mt = b1*m + (1-b1)*g
        vt = b2*v + (1-b2)*g*g
        if (len(p.get_shape()) > 1 or vector_l2) and l2 > 0:
            pt = p - lrt * (mt / (tf.sqrt(vt) + e) + l2*p)
        else:
            pt = p - lrt * (mt / (tf.sqrt(vt) + e))
        updates.extend([m.assign(mt), v.assign(vt), p.assign(pt)])
    return tf.group(*updates)
|
||||
+108
@@ -0,0 +1,108 @@
|
||||
import re
|
||||
import ftfy
|
||||
import json
|
||||
import spacy
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
def get_pairs(word):
    """
    Return set of symbol pairs in a word.
    word is represented as tuple of symbols (symbols being variable-length strings)

    An empty or single-symbol word has no adjacent pairs and yields an empty
    set.
    """
    return set(zip(word, word[1:]))
|
||||
|
||||
def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus
    also does some whitespace standardization

    Dashes, the ellipsis character and the acute accent are mapped to ASCII,
    runs of selected punctuation are surrounded by spaces, every newline gets
    exactly one space on each side, and other whitespace runs collapse to a
    single space.
    """
    text = text.replace('—', '-')
    text = text.replace('–', '-')
    text = text.replace('―', '-')
    text = text.replace('…', '...')
    text = text.replace('´', "'")
    # Raw strings avoid invalid-escape warnings. The compiled patterns are
    # deliberately identical to the previous ones: the lone ``\+`` alternative
    # is what the old ``\\+`` in a non-raw literal evaluated to, so
    # backslashes in the text are still left untouched.
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    text = re.sub(r'\s*\n\s*', ' \n ', text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    return text.strip()
|
||||
|
||||
class TextEncoder(object):
    """
    mostly a wrapper for a public python bpe tokenizer
    """

    def __init__(self, encoder_path, bpe_path):
        """Load the spaCy tokenizer, the token->id map and the BPE merge table.

        Args:
            encoder_path: JSON file mapping BPE tokens to integer ids.
            bpe_path: Text file of merges, one per line; the first and last
                lines of the file are ignored.
        """
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        # Context managers close both vocabulary files after reading.
        with open(encoder_path) as f:
            self.encoder = json.load(f)
        self.decoder = {v:k for k,v in self.encoder.items()}
        with open(bpe_path) as f:
            merges = f.read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    def bpe(self, token):
        """Return the space-separated BPE segmentation of ``token``.

        The last symbol carries the ``</w>`` end-of-word marker. Results are
        memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Merge the adjacent pair with the lowest (best) rank first.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of ``first``: copy the tail.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def encode(self, texts, verbose=True):
        """Encode each text into a list of token ids.

        Args:
            texts: Iterable of strings.
            verbose: If true, show a tqdm progress bar.

        Returns:
            One list of ints per input text; tokens missing from the encoder
            map to id 0.
        """
        texts_tokens = []
        iterator = tqdm(texts, ncols=80, leave=False) if verbose else texts
        for text in iterator:
            doc = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in doc:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
        return texts_tokens
|
||||
@@ -0,0 +1,349 @@
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
import json
|
||||
import joblib
|
||||
import random
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
from opt import adam, warmup_cosine, warmup_linear, warmup_constant
|
||||
from datasets import rocstories
|
||||
from analysis import rocstories as rocstories_analysis
|
||||
from text_utils import TextEncoder
|
||||
from utils import encode_dataset, flatten, iter_data, find_trainable_variables, get_ema_vars, convert_gradient_to_tensor, shape_list, ResultLogger, assign_to_gpu, average_grads, make_path
|
||||
|
||||
# Optimiser functions by name.
OPT_FNS = dict(adam=adam)

# Learning-rate schedule functions by name.
LR_SCHEDULES = dict(
    warmup_cosine=warmup_cosine,
    warmup_linear=warmup_linear,
    warmup_constant=warmup_constant,
)
|
||||
|
||||
class LossCompute:
    "A Loss compute and train function."

    def __init__(self, generator, lm_criterion, n_embed, opt=None):
        """Store the pieces needed to compute the language-modelling loss.

        Args:
            generator: Callable mapping hidden states to vocabulary logits.
            lm_criterion: Per-element loss taking ``(logits, targets)``.
            n_embed: Size of the hidden states.
            opt: Optional optimiser; stored but not used yet.
        """
        self.generator = generator
        self.lm_criterion = lm_criterion
        self.opt = opt
        self.n_embed = n_embed

    def __call__(self, X, Y, M, h, norm):
        """Return the masked, per-sequence mean language-modelling loss.

        ``X[:, 1:, 0]`` supplies the next-token targets for ``h[:, :-1]`` and
        ``M[:, 1:]`` masks the positions that count. ``Y`` and ``norm`` are
        accepted for interface compatibility and not used here.
        """
        # Language modeling loss
        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embed)
        x_shifted = X[:, 1:, 0].contiguous().view(-1)
        lm_logits = self.generator(h_trunc)
        # The criterion scores vocabulary logits, not raw hidden states.
        lm_losses = self.lm_criterion(lm_logits, x_shifted)
        # One loss per predicted position: the sequence length minus one.
        lm_losses = lm_losses.view(X.size(0), X.size(1) - 1)
        lm_losses = lm_losses * M[:, 1:]
        lm_losses = lm_losses.sum(1) / torch.sum(M[:, 1:], 1)

        # TODO: classification loss and the optimiser step are not
        # implemented in this port yet.
        return lm_losses
|
||||
|
||||
def model(X, M, Y, train=False, reuse=False):
    """Build the TensorFlow graph for the two-choice classifier.

    Args:
        X: Token/position id tensor, reshaped here to [-1, n_ctx, 2].
        M: Mask tensor, reshaped here to [-1, n_ctx].
        Y: Integer labels for the classification loss.
        train: Whether dropout is active.
        reuse: Accepted by the signature; not read by the body.

    Returns:
        ``(clf_logits, clf_losses, lm_losses)``.

    NOTE(review): ``dropout``, ``embed``, ``block`` and ``clf`` are not defined
    or imported in the visible part of this file — confirm where they live.
    """
    embedding_shape = [n_vocab+n_special+n_ctx, n_embd]
    we = tf.get_variable("we", embedding_shape,
                         initializer=tf.random_normal_initializer(stddev=0.02))
    we = dropout(we, embd_pdrop, train)

    X = tf.reshape(X, [-1, n_ctx, 2])
    M = tf.reshape(M, [-1, n_ctx])

    h = embed(X, we)
    for layer in range(n_layer):
        h = block(h, 'h%d'%layer, train=train, scale=True)

    # Language-modelling loss: predict token t+1 from the state at t.
    lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
    lm_logits = tf.matmul(lm_h, we, transpose_b=True)
    lm_labels = tf.reshape(X[:, 1:, 0], [-1])
    lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lm_logits, labels=lm_labels)
    lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1]-1])
    masked_total = tf.reduce_sum(lm_losses*M[:, 1:], 1)
    lm_losses = masked_total/tf.reduce_sum(M[:, 1:], 1)

    # Classification head: pick the hidden state at the clf_token position.
    clf_h = tf.reshape(h, [-1, n_embd])
    is_clf = tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32)
    pool_idx = tf.cast(tf.argmax(is_clf, 1), tf.int32)
    clf_h = tf.gather(clf_h, tf.range(shape_list(X)[0], dtype=tf.int32)*n_ctx+pool_idx)

    clf_h = tf.reshape(clf_h, [-1, 2, n_embd])
    if train and clf_pdrop > 0:
        # noise_shape with axis 1 set to 1 shares the mask across both choices.
        shape = shape_list(clf_h)
        shape[1] = 1
        clf_h = tf.nn.dropout(clf_h, 1-clf_pdrop, shape)
    clf_h = tf.reshape(clf_h, [-1, n_embd])
    clf_logits = clf(clf_h, 1, train=train)
    clf_logits = tf.reshape(clf_logits, [-1, 2])

    clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=clf_logits, labels=Y)
    return clf_logits, clf_losses, lm_losses
|
||||
|
||||
def mgpu_train(*xs):
    """Build the multi-GPU training graph.

    Each input tensor is split into ``n_gpu`` shards along axis 0; the model
    is built once per shard, and the per-GPU gradients are averaged and fed
    to the configured optimiser.

    Returns:
        ``[train_op, clf_logits, clf_losses, lm_losses]`` with the three
        tensors concatenated across GPUs.
    """
    gpu_ops = []
    gpu_grads = []
    shards = [tf.split(x, n_gpu, 0) for x in xs]
    for i, shard in enumerate(zip(*shards)):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
            clf_logits, clf_losses, lm_losses = model(*shard, train=True, reuse=do_reuse)
            if lm_coef > 0:
                train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    # The registries in this module are named OPT_FNS and LR_SCHEDULES.
    schedule = partial(LR_SCHEDULES[lr_schedule], warmup=lr_warmup)
    train = OPT_FNS[opt](params, grads, lr, schedule, n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
    return [train]+ops
|
||||
|
||||
def mgpu_predict(*xs):
    """Build the multi-GPU evaluation graph with variable reuse.

    Returns:
        ``[clf_logits, clf_losses, lm_losses]`` concatenated across GPUs.
    """
    per_gpu = []
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    for gpu_index, shard in enumerate(zip(*shards)):
        with tf.device(assign_to_gpu(gpu_index, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=True):
            clf_logits, clf_losses, lm_losses = model(*shard, train=False, reuse=True)
            per_gpu.append([clf_logits, clf_losses, lm_losses])
    return [tf.concat(op, 0) for op in zip(*per_gpu)]
|
||||
|
||||
def transform_roc(X1, X2, X3):
    """Pack encoded stories and both endings into padded id and mask arrays.

    For each example two sequences are built,
    ``[start] + story + [delimiter] + ending + [clf_token]``, one per
    candidate ending, with story and ending each cut to ``max_len`` tokens.

    Returns:
        ``(xmb, mmb)``: ``xmb`` is int32 of shape (n, 2, n_ctx, 2) holding
        token ids in channel 0 and position ids in channel 1; ``mmb`` is
        float32 of shape (n, 2, n_ctx) with 1 at filled positions.
    """
    n_batch = len(X1)
    xmb = np.zeros((n_batch, 2, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, 2, n_ctx), dtype=np.float32)
    start = encoder['_start_']
    delimiter = encoder['_delimiter_']
    for row, (story, ending_a, ending_b) in enumerate(zip(X1, X2, X3)):
        prefix = [start]+story[:max_len]+[delimiter]
        candidates = (
            prefix+ending_a[:max_len]+[clf_token],
            prefix+ending_b[:max_len]+[clf_token],
        )
        for choice, tokens in enumerate(candidates):
            length = len(tokens)
            xmb[row, choice, :length, 0] = tokens
            mmb[row, choice, :length] = 1
    # Position ids occupy the id range directly after vocabulary and specials.
    xmb[:, :, :, 1] = np.arange(n_vocab+n_special, n_vocab+n_special+n_ctx)
    return xmb, mmb
|
||||
|
||||
def iter_apply(Xs, Ms, Ys):
    """Evaluate logits and classification loss over a dataset in batches.

    Full batches run through the multi-GPU eval graph, a shorter final batch
    through the single-graph eval ops. Every fetched value is multiplied by
    the batch size before aggregation.

    Returns:
        ``[logits, cost]``: the concatenated (scaled) logits and the float sum
        of the scaled losses.
    """
    reducers = [lambda x:np.concatenate(x, 0), lambda x:float(np.sum(x))]
    collected = []
    for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
        n = len(xmb)
        if n == n_batch_train:
            fetches = [eval_mgpu_logits, eval_mgpu_clf_loss]
            feed = {X_train:xmb, M_train:mmb, Y_train:ymb}
        else:
            fetches = [eval_logits, eval_clf_loss]
            feed = {X:xmb, M:mmb, Y:ymb}
        res = sess.run(fetches, feed)
        collected.append([r*n for r in res])
    columns = zip(*collected)
    return [fn(res) for res, fn in zip(columns, reducers)]
|
||||
|
||||
def iter_predict(Xs, Ms):
    """Return the logits for a dataset, computed batch by batch.

    Full batches use the multi-GPU eval graph; a shorter final batch uses the
    single-graph eval op.
    """
    chunks = []
    for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
        if len(xmb) == n_batch_train:
            batch_logits = sess.run(eval_mgpu_logits, {X_train:xmb, M_train:mmb})
        else:
            batch_logits = sess.run(eval_logits, {X:xmb, M:mmb})
        chunks.append(batch_logits)
    return np.concatenate(chunks, 0)
|
||||
|
||||
def save(path):
    """Fetch the current values of ``params`` and dump them to ``path`` with joblib."""
    values = sess.run(params)
    target = make_path(path)
    joblib.dump(values, target)
|
||||
|
||||
def log():
    """Evaluate on a training slice and the validation set, then log the result.

    Writes one record to ``logger``, prints a summary line and, when
    ``submit`` is set and validation accuracy beats ``best_score``, saves the
    current parameters as the new best.
    """
    global best_score
    train_labels = trY[:n_valid]
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], train_labels)
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost/len(train_labels)
    va_cost = va_cost/n_valid
    tr_acc = accuracy_score(train_labels, np.argmax(tr_logits, 1))*100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1))*100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f'%(n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit and va_acc > best_score:
        best_score = va_acc
        save(os.path.join(save_dir, desc, 'best_params.jl'))
|
||||
|
||||
def argmax(x):
    """Return the index of the largest entry along axis 1."""
    return np.argmax(x, 1)


# Per-dataset function turning logits into predictions.
pred_fns = dict(rocstories=argmax)

# Per-dataset name of the submission file.
filenames = dict(rocstories='ROCStories.tsv')

# Per-dataset mapping from predicted index to label; None leaves indices as-is.
label_decoders = dict(rocstories=None)
|
||||
|
||||
def predict():
    """Write test-set predictions to the dataset's submission file.

    The output is tab-separated with an ``index``/``prediction`` header.
    """
    decode = label_decoders[dataset]
    predictions = pred_fns[dataset](iter_predict(teX, teM))
    if decode is not None:
        predictions = [decode[prediction] for prediction in predictions]
    path = os.path.join(submission_dir, filenames[dataset])
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as out:
        out.write('{}\t{}\n'.format('index', 'prediction'))
        for index, prediction in enumerate(predictions):
            out.write('{}\t{}\n'.format(index, prediction))
|
||||
|
||||
if __name__ == '__main__':
    # Command-line options; every option becomes a module-level global below.
    parser = argparse.ArgumentParser()
    parser.add_argument('--desc', type=str)
    parser.add_argument('--dataset', type=str)
    parser.add_argument('--log_dir', type=str, default='log/')
    parser.add_argument('--save_dir', type=str, default='save/')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--submission_dir', type=str, default='submission/')
    parser.add_argument('--submit', action='store_true')
    parser.add_argument('--analysis', action='store_true')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--n_iter', type=int, default=3)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--lr', type=float, default=6.25e-5)
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--n_ctx', type=int, default=512)
    parser.add_argument('--n_embd', type=int, default=768)
    parser.add_argument('--n_head', type=int, default=12)
    parser.add_argument('--n_layer', type=int, default=12)
    parser.add_argument('--embd_pdrop', type=float, default=0.1)
    parser.add_argument('--attn_pdrop', type=float, default=0.1)
    parser.add_argument('--resid_pdrop', type=float, default=0.1)
    parser.add_argument('--clf_pdrop', type=float, default=0.1)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--n_gpu', type=int, default=4)
    parser.add_argument('--opt', type=str, default='adam')
    parser.add_argument('--afn', type=str, default='gelu')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--n_transfer', type=int, default=12)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    # The helper functions in this module read these options as globals.
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    log_path = os.path.join(log_dir, '{}.jsonl'.format(desc))
    logger = ResultLogger(path=log_path, **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    n_y = 2
    # Special tokens get the ids directly after the BPE vocabulary.
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx//2-2
    # Shrink the context to the longest packed example (+3 special tokens),
    # capped at the requested n_ctx.
    n_ctx = min(
        max(
            [len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
            +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
            +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
        )+3, n_ctx
    )
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = n_batch*n_gpu
    n_updates_total = (n_train//n_batch_train)*n_iter

    X_train = tf.placeholder(tf.int32, [n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])

    Y_train = tf.placeholder(tf.int32, [n_batch_train])
    Y = tf.placeholder(tf.int32, [None])

    train, logits, clf_losses, lm_losses = mgpu_train(X_train, M_train, Y_train)
    clf_loss = tf.reduce_mean(clf_losses)

    params = find_trainable_variables('model')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(tf.global_variables_initializer())

    # Rebuild the parameter list from the ten flat .npy shards.
    with open('model/params_shapes.json') as shapes_file:
        shapes = json.load(shapes_file)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load('model/params_{}.npy'.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
    # Merge entry 1, freshly initialised special-token rows and the first
    # n_ctx rows of entry 0 into one embedding matrix.
    init_params[0] = init_params[0][:n_ctx]
    init_params[0] = np.concatenate([init_params[1], (np.random.randn(n_special, n_embd)*0.02).astype(np.float32), init_params[0]], 0)
    del init_params[1]

    if n_transfer == -1:
        n_transfer = 0
    else:
        n_transfer = 1+n_transfer*12
    sess.run([p.assign(ip) for p, ip in zip(params[:n_transfer], init_params[:n_transfer])])

    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=True)
    eval_clf_loss = tf.reduce_mean(eval_clf_losses)
    eval_mgpu_clf_loss = tf.reduce_mean(eval_mgpu_clf_losses)

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        save(os.path.join(save_dir, desc, 'best_params.jl'))
    best_score = 0
    for i in range(n_iter):
        for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random), n_batch=n_batch_train, truncate=True, verbose=True):
            cost, _ = sess.run([clf_loss, train], {X_train:xmb, M_train:mmb, Y_train:ymb})
            n_updates += 1
            # Extra evaluations at fixed update counts during the first epoch.
            if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
                log()
        n_epochs += 1
        log()
    if submit:
        sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
        predict()
        if analysis:
            # Read the log this run actually wrote ('{desc}.jsonl'), not a
            # hard-coded 'rocstories.jsonl'.
            rocstories_analysis(data_dir, os.path.join(submission_dir, 'ROCStories.tsv'), log_path)
|
||||
@@ -0,0 +1,186 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import math
|
||||
import time
|
||||
import unicodedata
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.framework import function
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
|
||||
def encode_dataset(*splits, encoder):
    """Encode every text field of every split with ``encoder.encode``.

    Only the first positional argument is read; it is an iterable of splits,
    each an iterable of fields. A field whose first element is a ``str`` is
    encoded; any other field is passed through unchanged.

    Returns:
        A list with one list of fields per split.
    """
    return [
        [encoder.encode(field) if isinstance(field[0], str) else field
         for field in split]
        for split in splits[0]
    ]
|
||||
|
||||
def stsb_label_encoding(labels, nclass=6):
    """
    Label encoding from Tree LSTM paper (Tai, Socher, Manning)

    Each real-valued label ``y`` is spread over the two neighbouring integer
    classes: ``floor(y)`` gets ``floor(y) - y + 1`` and ``floor(y) + 1`` gets
    ``y - floor(y)``. Classes outside ``[0, nclass)`` are dropped.

    Returns:
        A float32 array of shape ``(len(labels), nclass)``.
    """
    Y = np.zeros((len(labels), nclass)).astype(np.float32)
    for row, y in enumerate(labels):
        lower = np.floor(y)
        upper = lower + 1
        if 0 <= upper < nclass:
            Y[row, int(upper)] = y - lower
        if 0 <= lower < nclass:
            Y[row, int(lower)] = lower - y + 1
    return Y
|
||||
|
||||
def shape_list(x):
    """
    deal with dynamic shape in tensorflow cleanly

    Returns one entry per dimension of x: the static size when it is known,
    otherwise the matching element of the runtime ``tf.shape(x)`` tensor.
    """
    static_dims = x.get_shape().as_list()
    dynamic_dims = tf.shape(x)
    return [dynamic_dims[axis] if size is None else size
            for axis, size in enumerate(static_dims)]
|
||||
|
||||
def np_softmax(x, t=1):
    """Softmax over the last axis of x, with x first divided by temperature t.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    scaled = x/t
    shifted = scaled - np.max(scaled, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights/np.sum(weights, axis=-1, keepdims=True)
|
||||
|
||||
def make_path(f):
    """Ensure the parent directory of path f exists, then return f unchanged.

    A bare filename (empty directory component) creates nothing.
    ``exist_ok=True`` is used instead of a separate existence check so that
    a directory created concurrently by another process does not make
    ``os.makedirs`` fail.
    """
    parent = os.path.dirname(f)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return f
|
||||
|
||||
def _identity_init(shape, dtype, partition_info, scale):
|
||||
n = shape[-1]
|
||||
w = np.eye(n)*scale
|
||||
if len([s for s in shape if s != 1]) == 2:
|
||||
w = w.reshape(shape)
|
||||
return w.astype(np.float32)
|
||||
|
||||
def identity_init(scale=1.0):
    """Return ``_identity_init`` with its ``scale`` argument pre-bound."""
    initializer = partial(_identity_init, scale=scale)
    return initializer
|
||||
|
||||
def _np_init(shape, dtype, partition_info, w):
|
||||
return w
|
||||
|
||||
def np_init(w):
    """Return ``_np_init`` with its ``w`` argument pre-bound to the given value."""
    initializer = partial(_np_init, w=w)
    return initializer
|
||||
|
||||
class ResultLogger(object):
    """Write run results to a file as one JSON object per line.

    The first line holds the keyword arguments given to the constructor;
    every later line comes from a ``log`` call. Each record gets a ``time``
    key (``time.time()``) unless the caller supplies one.

    Instances can be used as context managers so the file is closed even
    when the body raises.
    """

    def __init__(self, path, *args, **kwargs):
        """Open path for writing (truncating it) and write the header record.

        Positional ``*args`` are accepted and ignored.
        """
        if 'time' not in kwargs:
            kwargs['time'] = time.time()
        # Serialize before opening: if kwargs are not JSON-serializable we
        # fail without leaking an open handle or truncating an existing log.
        header = json.dumps(kwargs)
        self.f_log = open(make_path(path), 'w')
        self.f_log.write(header+'\n')
        # Flush so the header reaches disk even if the process dies before
        # the first log() call.
        self.f_log.flush()

    def log(self, **kwargs):
        """Append one JSON record built from kwargs and flush it."""
        if 'time' not in kwargs:
            kwargs['time'] = time.time()
        self.f_log.write(json.dumps(kwargs)+'\n')
        self.f_log.flush()

    def close(self):
        """Close the underlying file."""
        self.f_log.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
|
||||
|
||||
def find_trainable_variables(key):
    """Return the trainable variables whose collection scope matches ``.*<key>.*``.

    key is interpolated into the pattern verbatim (it is not escaped).
    """
    pattern = ".*{}.*".format(key)
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, pattern)
|
||||
|
||||
def flatten(outer):
    """Concatenate the elements of every inner iterable into one flat list (one level only)."""
    flat = []
    for inner in outer:
        flat.extend(inner)
    return flat
|
||||
|
||||
def remove_none(l):
    """Return a new list with every ``None`` entry dropped; other falsy values are kept."""
    return list(filter(lambda item: item is not None, l))
|
||||
|
||||
def iter_data(*datas, n_batch=128, truncate=False, verbose=False, max_batches=float("inf")):
    """Yield consecutive mini-batches sliced from one or more sequences.

    The length of ``datas[0]`` determines how many items are covered; every
    sequence is sliced with the same ``[i:i+n_batch]`` window.

    Args:
        *datas: sliceable sequences to batch in lockstep.
        n_batch: number of items per batch.
        truncate: when true, drop the trailing partial batch.
        verbose: when true, show a tqdm progress bar on stderr; otherwise
            the bar is written to os.devnull.
        max_batches: upper bound on the number of batches produced.

    Yields:
        The slice itself when a single sequence was given, otherwise a
        tuple with one slice per sequence.
    """
    n = len(datas[0])
    if truncate:
        n = (n//n_batch)*n_batch
    n = min(n, max_batches*n_batch)
    # Ceiling division so a trailing partial batch is counted in the bar.
    total = -(-n//n_batch)
    sink = None if verbose else open(os.devnull, 'w')
    progress = tqdm(range(0, n, n_batch), total=total,
                    file=sys.stderr if verbose else sink, ncols=80, leave=False)
    try:
        n_batches = 0
        for i in progress:
            if n_batches >= max_batches:
                # A plain return ends the generator; raising StopIteration
                # here would be turned into RuntimeError under PEP 479.
                return
            if len(datas) == 1:
                yield datas[0][i:i+n_batch]
            else:
                # Materialize the slices now: a lazy generator would close
                # over ``i`` and produce the wrong window if it were consumed
                # after the loop had advanced.
                yield tuple(d[i:i+n_batch] for d in datas)
            n_batches += 1
    finally:
        # Close the bar before its output stream, then release the devnull
        # handle that the silent mode opened.
        progress.close()
        if sink is not None:
            sink.close()
|
||||
|
||||
def get_ema_if_exists(v, gvs):
    """Return the entry of gvs that is the exponential-moving-average twin of v.

    The twin is the first element of gvs named
    ``<v's name without the ':N' suffix>/ExponentialMovingAverage:0``.
    When no such element exists, v itself is returned.
    """
    base_name = v.name.split(':')[0]
    wanted = base_name+'/ExponentialMovingAverage:0'
    for candidate in gvs:
        if candidate.name == wanted:
            return candidate
    return v
|
||||
|
||||
def get_ema_vars(*vs):
    """Swap each variable for its EMA counterpart when the current variable scope is reusing.

    Outside a reusing scope the arguments are returned untouched. A single
    argument is returned bare; several are returned as a sequence (a list
    after substitution, the original argument tuple otherwise).
    """
    if tf.get_variable_scope().reuse:
        all_globals = tf.global_variables()
        vs = [get_ema_if_exists(var, all_globals) for var in vs]
    return vs[0] if len(vs) == 1 else vs
|
||||
|
||||
# Registered through function.Defun so that a custom gradient function and
# shape function can be attached to what is otherwise an identity.
@function.Defun(
    # Gradient: pass the incoming gradient dy through tf.convert_to_tensor.
    python_grad_func=lambda x, dy: tf.convert_to_tensor(dy),
    # Shape: the single output has the same static shape as the first input.
    shape_func=lambda op: [op.inputs[0].get_shape()])
def convert_gradient_to_tensor(x):
    """force gradient to be a dense tensor

    it's often faster to do dense embedding gradient on GPU than sparse on CPU

    The forward value is x, returned unchanged.
    """
    return x
|
||||
|
||||
def assign_to_gpu(gpu=0, ps_dev="/device:CPU:0"):
    """Build a device-chooser callable that maps an op to a device string.

    The returned function sends ops whose node-def type is "Variable" to
    ps_dev and every other op to ``/gpu:<gpu>``. It accepts either a
    ``tf.NodeDef`` or an object exposing ``.node_def``.
    """
    def _assign(op):
        node_def = op.node_def if not isinstance(op, tf.NodeDef) else op
        return ps_dev if node_def.op == "Variable" else "/gpu:%d" % gpu
    return _assign
|
||||
|
||||
def average_grads(tower_grads):
    """Combine per-tower (gradient, variable) lists into a single list.

    tower_grads holds one list of (grad, var) pairs per tower, all in the
    same variable order. For each variable the kind of the first tower's
    gradient decides the handling: ``None`` stays ``None``,
    ``tf.IndexedSlices`` go through the sparse path, anything else through
    the dense path. The variable of the first tower is paired with the result.
    """
    def average_dense(grad_and_vars):
        # Arithmetic mean of the towers' gradients.
        first = grad_and_vars[0][0]
        if len(grad_and_vars) == 1:
            return first
        total = first
        for tower_grad, _ in grad_and_vars[1:]:
            total += tower_grad
        return total / len(grad_and_vars)

    def average_sparse(grad_and_vars):
        # NOTE(review): indices and values are concatenated without dividing
        # by the number of towers, so this presumably yields a sum rather
        # than a mean for the sparse case -- confirm whether that is intended.
        first = grad_and_vars[0][0]
        if len(grad_and_vars) == 1:
            return first
        indices = tf.concat([g.indices for g, _ in grad_and_vars], 0)
        values = tf.concat([g.values for g, _ in grad_and_vars], 0)
        return tf.IndexedSlices(values, indices, first.dense_shape)

    combined = []
    for grad_and_vars in zip(*tower_grads):
        head_grad = grad_and_vars[0][0]
        if head_grad is None:
            grad = None
        elif isinstance(head_grad, tf.IndexedSlices):
            grad = average_sparse(grad_and_vars)
        else:
            grad = average_dense(grad_and_vars)
        combined.append((grad, grad_and_vars[0][1]))
    return combined
|
||||
Reference in New Issue
Block a user