This commit is contained in:
thomwolf
2018-06-13 16:07:58 +02:00
parent 83202ae27e
commit 8858f99438
9 changed files with 954 additions and 0 deletions
+21
View File
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 OpenAI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+9
View File
@@ -0,0 +1,9 @@
# finetune-transformer-lm
Code and model for the paper "Improving Language Understanding by Generative Pre-Training"
Currently this code implements the ROCStories Cloze Test result reported in the paper by running:
`python train.py --dataset rocstories --desc rocstories --submit --analysis --data_dir [path to data here]`
Note: The code is currently non-deterministic due to various GPU ops. The median accuracy of 10 runs with this codebase (using default hyperparameters) is 85.8% - slightly lower than the reported single run of 86.5% from the paper.
The ROCStories dataset can be downloaded from the associated [website](http://cs.rochester.edu/nlp/rocstories/).
+18
View File
@@ -0,0 +1,18 @@
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from datasets import _rocstories
def rocstories(data_dir, pred_path, log_path):
    """Print the best validation accuracy and the test accuracy for ROCStories.

    Args:
        data_dir: Directory containing the ROCStories cloze-test CSV files.
        pred_path: Tab-separated predictions file with a ``prediction`` column.
        log_path: JSON-lines training log. The first line is skipped; every
            following line must contain a ``va_acc`` field.
    """
    preds = pd.read_csv(pred_path, delimiter='\t')['prediction'].values.tolist()
    _, _, _, labels = _rocstories(os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
    test_accuracy = accuracy_score(labels, preds)*100.
    # Use a context manager so the log file handle is released deterministically.
    with open(log_path) as f:
        logs = [json.loads(line) for line in f][1:]
    # Report the validation accuracy of the best logged evaluation.
    best_validation_index = np.argmax([log['va_acc'] for log in logs])
    valid_accuracy = logs[best_validation_index]['va_acc']
    print('ROCStories Valid Accuracy: %.2f'%(valid_accuracy))
    print('ROCStories Test Accuracy: %.2f'%(test_accuracy))
+51
View File
@@ -0,0 +1,51 @@
import os
import csv
import numpy as np
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Fixed value passed as random_state to train_test_split in rocstories().
seed = 3535999445
def _rocstories(path):
    """Parse one ROCStories cloze-test CSV file.

    The first row is treated as a header and skipped. For every other row,
    columns 1-4 are joined with single spaces into the story context, columns
    5 and 6 are the two candidate endings, and the last column is the 1-based
    index of the correct ending.

    Returns:
        A tuple ``(st, ct1, ct2, y)`` of parallel lists: story contexts, first
        endings, second endings and 0-based integer labels.
    """
    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    st = []
    ct1 = []
    ct2 = []
    y = []
    for i, line in enumerate(tqdm(rows, ncols=80, leave=False)):
        if i > 0:  # row 0 is the header
            st.append(' '.join(line[1:5]))
            ct1.append(line[5])
            ct2.append(line[6])
            # Labels in the file are 1-based; shift them to 0-based.
            y.append(int(line[-1])-1)
    return st, ct1, ct2, y
def rocstories(data_dir, n_train=1497, n_valid=374):
    """Load ROCStories and split the labelled file into train/validation sets.

    Args:
        data_dir: Directory holding the two cloze-test CSV files.
        n_train: Accepted but not read by this function.
        n_valid: Number of examples held out for validation.

    Returns:
        ``(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)``
        where the X entries are lists of strings (story, ending 1, ending 2)
        and the Y entries are int32 arrays of 0-based labels.
    """
    val_csv = os.path.join(data_dir, 'cloze_test_val__spring2016 - cloze_test_ALL_val.csv')
    test_csv = os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv')
    storys, comps1, comps2, ys = _rocstories(val_csv)
    teX1, teX2, teX3, _ = _rocstories(test_csv)

    # train_test_split returns a (train, valid) pair for each input sequence.
    split = train_test_split(storys, comps1, comps2, ys, test_size=n_valid, random_state=seed)
    tr_storys, va_storys, tr_comps1, va_comps1, tr_comps2, va_comps2, tr_ys, va_ys = split

    trX1, trX2, trX3 = list(tr_storys), list(tr_comps1), list(tr_comps2)
    vaX1, vaX2, vaX3 = list(va_storys), list(va_comps1), list(va_comps2)
    trY = np.asarray(list(tr_ys), dtype=np.int32)
    vaY = np.asarray(list(va_ys), dtype=np.int32)
    return (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)
+163
View File
@@ -0,0 +1,163 @@
import numpy as np
import math
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
# Sum of the three id ranges; presumably the number of rows of the embedding table.
# NOTE(review): n_vocab, n_special and n_ctx are not defined or imported in the
# visible part of this module, so this line looks like it raises NameError at
# import time — confirm, and pass the value in from the caller instead.
vocab = n_vocab + n_special + n_ctx
def gelu(x):
    """Apply the tanh approximation of the GELU activation element-wise."""
    inner = math.sqrt(2/math.pi)*(x+0.044715*torch.pow(x, 3))
    return 0.5*x*(1+torch.tanh(inner))


def swish(x):
    """Apply the swish activation ``x * sigmoid(x)`` element-wise."""
    return x*torch.sigmoid(x)


# Activation functions selectable by name.
# `torch.nn` has no `relu` attribute (only the `ReLU` module class); the
# functional form lives in `torch.nn.functional`, imported above as `F`.
ACT_FNS = {
    'relu': F.relu,
    'swish': swish,
    'gelu': gelu
}
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learned gain and bias.

    Args:
        n_state: Size of the last (normalised) dimension.
        eps: Constant added to the standard deviation for numerical stability.
    """

    def __init__(self, n_state, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.g = nn.Parameter(torch.ones(n_state))
        self.b = nn.Parameter(torch.zeros(n_state))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension and apply gain and bias."""
        mean = x.mean(-1, keepdim=True)
        # Layer normalisation uses the biased (population) variance. Tensor.std
        # defaults to the Bessel-corrected estimator, which rescales every
        # activation by sqrt((n-1)/n), so compute the variance explicitly.
        centered = x - mean
        std = torch.sqrt(centered.pow(2).mean(-1, keepdim=True))
        # One difference with the TF version here: we add epsilon outside of sqrt
        return self.g * centered / (std + self.eps) + self.b
class Conv1D(nn.Module):
    """Affine map applied to the last dimension (a convolution of width 1).

    Args:
        nf: Number of output features.
        rf: Receptive field; only ``1`` is implemented.
        nx: Number of input features.

    Raises:
        NotImplementedError: If ``rf`` is not 1.
    """

    def __init__(self, nf, rf, nx):
        super(Conv1D, self).__init__()
        self.rf = rf
        # Stored because forward() needs it to rebuild the output shape.
        self.nf = nf
        if rf == 1:  # faster 1x1 conv
            self.w = Parameter(torch.ones(nx, nf))  # TODO change to random normal
            self.b = Parameter(torch.zeros(nf))
        else:  # was used to train LM
            raise NotImplementedError

    def forward(self, x):
        """Return ``x @ w + b`` computed over the last dimension of ``x``."""
        if self.rf == 1:
            # torch.Size is a tuple subclass, so it must be extended with a
            # tuple (not a list) to build the output shape.
            size_out = x.size()[:-1] + (self.nf,)
            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
            x = x.view(*size_out)
        else:
            raise NotImplementedError
        return x
class Attention(nn.Module):
    """Masked multi-head self-attention over a rank-3 input.

    Args:
        nx: Number of input features.
        n_state: Total width of the query/key/value projections; split evenly
            across ``n_head`` heads.
        n_head: Number of attention heads.
        attn_pdrop: Dropout probability applied to the attention weights.
        resid_pdrop: Dropout probability applied to the projected output.
        scale: If true, divide the attention scores by ``sqrt(head_width)``.
    """

    def __init__(self, nx, n_state, n_head, attn_pdrop, resid_pdrop, scale=False):
        super(Attention, self).__init__()
        self.c_attn = Conv1D(n_state*3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.scale = scale
        self.n_head = n_head
        # Width of each of the query/key/value chunks produced by c_attn.
        self.split_size = n_state
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)

    @staticmethod
    def mask_attn_weights(w):
        """Push scores above the diagonal to -1e9 so softmax ignores them."""
        n = w.size(-1)
        # Lower-triangular mask of ones, built on the same dtype/device as w.
        b = torch.tril(torch.ones(n, n, dtype=w.dtype, device=w.device)).view(1, 1, n, n)
        return w * b + -1e9*(1-b)

    def _attn(self, q, k, v):
        """Return the attention-weighted sum of ``v``; ``k`` is pre-transposed."""
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        w = self.mask_attn_weights(w)
        # Normalise over the key positions (last dimension). Without an explicit
        # dim, softmax on a 4-D tensor would act on dimension 1 (the heads).
        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        """Inverse of ``split_heads``: (batch, head, seq, width) -> (batch, seq, head*width)."""
        # Move the head axis next to the feature axis first, then flatten them.
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow version: merge_states

    def split_heads(self, x, k=False):
        """Split the last dimension into heads.

        Returns (batch, head, seq, width), or (batch, head, width, seq) when
        ``k`` is true so keys can be matrix-multiplied with queries directly.
        """
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1)//self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow version: split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x):
        """Apply masked self-attention followed by the output projection."""
        x = self.c_attn(x)
        # Tensor.split takes the chunk *size*; each of q/k/v is split_size wide.
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a
class MLP(nn.Module):
    """Two-layer position-wise feed-forward network.

    Args:
        nx: Number of input and output features.
        n_state: Width of the hidden layer.
        afn: Key into ``ACT_FNS`` selecting the activation function.
        resid_pdrop: Dropout probability applied to the output.
    """

    def __init__(self, nx, n_state, afn, resid_pdrop):
        super(MLP, self).__init__()
        self.c_fc = Conv1D(n_state, 1, nx)
        # The projection consumes the hidden layer, so its input width is
        # n_state (not nx); it maps back to nx features.
        self.c_proj = Conv1D(nx, 1, n_state)
        self.act = ACT_FNS[afn]
        self.dropout = nn.Dropout(resid_pdrop)

    def forward(self, x):
        """Expand to ``n_state`` features, apply the activation, project back."""
        h = self.act(self.c_fc(x))
        h = self.c_proj(h)
        return self.dropout(h)
class Block(nn.Module):
    """One transformer block: self-attention and MLP, each followed by a
    residual connection and layer normalisation.

    Args:
        nx: Number of features.
        n_head: Number of attention heads.
        attn_pdrop: Dropout probability for the attention weights.
        resid_pdrop: Dropout probability for the sub-layer outputs.
        afn: Key into ``ACT_FNS`` selecting the MLP activation.
        scale: Forwarded to ``Attention``.
    """

    def __init__(self, nx, n_head, attn_pdrop, resid_pdrop, afn, scale=False):
        super(Block, self).__init__()
        self.attn = Attention(nx, nx, n_head, attn_pdrop, resid_pdrop, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(nx, nx*4, afn, resid_pdrop)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Each stage must consume the previous stage's output; the sub-layer
        # result is added to its input before normalisation.
        a = self.attn(x)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h
class Model(nn.Module):
    """Transformer model.

    Args:
        vocab: Number of rows in the embedding table.
        n_embd: Embedding width.
        pdrop: Dropout probability of ``self.drop``.
        n_layers: Number of stacked ``Block`` modules.
        nx: Feature width of each block.
        n_head: Number of attention heads per block.
        attn_pdrop: Attention dropout probability.
        resid_pdrop: Residual dropout probability.
        afn: Key into ``ACT_FNS`` selecting the MLP activation.
    """

    def __init__(self, vocab, n_embd, pdrop, n_layers,
                 nx, n_head, attn_pdrop, resid_pdrop, afn):
        super(Model, self).__init__()
        self.embed = nn.Embedding(vocab, n_embd)
        # NOTE(review): self.drop is created but not applied in forward().
        self.drop = nn.Dropout(pdrop)
        self.blocks = clones(Block(nx, n_head, attn_pdrop,
                                   resid_pdrop, afn, scale=True), n_layers)
        # The decoder maps hidden states of width n_embd back to the
        # vocabulary and shares its weight matrix with the embedding.
        self.decoder = nn.Linear(n_embd, vocab, bias=False)
        self.decoder.weight = self.embed.weight

    def forward(self, x, m):
        """Embed ``x`` and run it through every block.

        Args:
            x: Integer tensor of rank 4; the first two dimensions are merged
                and the embeddings of the last dimension are summed.
            m: Accepted for interface compatibility; not used here.

        Returns:
            The hidden states produced by the last block.
        """
        x = x.view(-1, x.size(2), x.size(3))
        e = self.embed(x)
        # Sum the embeddings looked up for each entry of the last index axis.
        h = e.sum(dim=2)
        for block in self.blocks:
            h = block(h)
        return h
+49
View File
@@ -0,0 +1,49 @@
import math
import numpy as np
import tensorflow as tf
def warmup_cosine(x, warmup=0.002):
    """Learning-rate multiplier: linear ramp up to ``warmup``, then cosine decay.

    ``x`` is the training progress; the multiplier is ``x/warmup`` while
    ``x <= warmup`` and ``0.5 * (1 + cos(pi * x))`` afterwards.
    """
    in_warmup = tf.cast(x <= warmup, tf.float32)
    ramp = x/warmup
    decay = 0.5 * (1 + tf.cos(math.pi * x))
    return in_warmup*ramp + (1-in_warmup)*decay
def warmup_constant(x, warmup=0.002):
    """Learning-rate multiplier: linear ramp up to ``warmup``, then constant 1."""
    in_warmup = tf.cast(x <= warmup, tf.float32)
    ramp = x/warmup
    return in_warmup*ramp + (1-in_warmup)*1
def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier: linear ramp up to ``warmup`` combined with a
    linear decay factor ``1 - x`` applied throughout."""
    in_warmup = tf.cast(x <= warmup, tf.float32)
    ramp_or_one = in_warmup*(x/warmup) + (1-in_warmup)
    return ramp_or_one*(1-x)
# Learning-rate schedules keyed by their function name.
schedules = {
    fn.__name__: fn
    for fn in (warmup_cosine, warmup_constant, warmup_linear)
}
def adam(params, grads, lr, schedule, t_total, b1=0.9, b2=0.999, e=1e-8, l2=0, vector_l2=False, max_grad_norm=-1, **kwargs):
    """
    adam with weight decay fix

    Builds the TensorFlow ops for one Adam step in which the l2 penalty is
    added to the update itself rather than to the gradient.

    Args:
        params: Variables to update.
        grads: Gradients, parallel to ``params``; entries may be None.
        lr: Base learning rate.
        schedule: Callable mapping training progress ``t/t_total`` to a
            learning-rate multiplier.
        t_total: Total number of updates, used to compute the progress.
        b1, b2, e: Adam moment decay rates and epsilon.
        l2: Weight-decay coefficient.
        vector_l2: If true, also decay parameters of rank <= 1.
        max_grad_norm: If positive, clip gradients to this global norm.
        **kwargs: Ignored.

    Returns:
        A single grouped op that applies every update.
    """
    t = tf.Variable(0, dtype=tf.float32, trainable=False)
    tt = t+1
    updates = [t.assign(tt)]
    if max_grad_norm > 0:
        grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    for p, g in zip(params, grads):
        if p is None or g is None:
            # p itself may be None here, so do not dereference p.name blindly.
            print("can't train", getattr(p, 'name', p), g)
        else:
            if isinstance(g, tf.IndexedSlices):
                g = tf.convert_to_tensor(g)
            # First and second moment accumulators, initialised to zeros of p's shape.
            m = tf.Variable(p*0, dtype=tf.float32, trainable=False)
            v = tf.Variable(p*0, dtype=tf.float32, trainable=False)
            # Bias-corrected step size, scaled by the schedule.
            lrt = lr*tf.sqrt(1-b2**tt)/(1-b1**tt)
            lrt *= schedule(t/t_total)
            mt = b1*m + (1-b1)*g
            vt = b2*v + (1-b2)*g*g
            if (len(p.get_shape()) > 1 or vector_l2) and l2 > 0:
                pt = p - lrt * (mt / (tf.sqrt(vt) + e) + l2*p)
            else:
                pt = p - lrt * (mt / (tf.sqrt(vt) + e))
            updates.extend([m.assign(mt), v.assign(vt), p.assign(pt)])
    return tf.group(*updates)
+108
View File
@@ -0,0 +1,108 @@
import re
import ftfy
import json
import spacy
from tqdm import tqdm
def get_pairs(word):
    """
    Return set of symbol pairs in a word.
    word is represented as tuple of symbols (symbols being variable-length strings)

    An empty or single-symbol word has no adjacent pairs and yields an empty set.
    """
    # Pair every symbol with its successor; zip stops at the shorter sequence,
    # so empty and one-symbol words need no special casing.
    return set(zip(word, word[1:]))
def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus
    also does some whitespace standardization

    Dash-like characters become '-', the ellipsis character becomes '...',
    the acute accent becomes an apostrophe, runs of the listed punctuation
    are padded with spaces, and horizontal whitespace is collapsed.
    """
    # The characters are written as escapes so they survive re-encoding of the
    # source file; an empty search string would insert the replacement between
    # every character of the text.
    text = text.replace('\u2014', '-')    # em dash
    text = text.replace('\u2013', '-')    # en dash
    text = text.replace('\u2015', '-')    # horizontal bar
    text = text.replace('\u2026', '...')  # horizontal ellipsis
    text = text.replace('\u00b4', "'")    # acute accent
    # Raw strings avoid invalid-escape warnings. The lone `\+` alternative
    # reproduces the pattern the original non-raw literal compiled to.
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    text = re.sub(r'\s*\n\s*', ' \n ', text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    return text.strip()
class TextEncoder(object):
    """
    mostly a wrapper for a public python bpe tokenizer

    Args:
        encoder_path: JSON file mapping BPE symbols to integer ids.
        bpe_path: Text file of merge rules, one whitespace-separated pair per
            line; the first and last lines are ignored.
    """

    def __init__(self, encoder_path, bpe_path):
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        # Read both vocabulary files explicitly as UTF-8 and close them promptly.
        with open(encoder_path, encoding='utf-8') as f:
            self.encoder = json.load(f)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(bpe_path, encoding='utf-8') as f:
            merges = f.read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        # Earlier merge rules have lower rank and are applied first.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    def bpe(self, token):
        """Return ``token`` as space-separated BPE symbols ending in ``</w>``."""
        if token in self.cache:
            return self.cache[token]
        # Start from single characters, marking the last one as word-final.
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the lowest merge rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the remainder as is.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def encode(self, texts, verbose=True):
        """Encode each text into a list of integer BPE ids.

        Args:
            texts: Iterable of strings.
            verbose: If true, show a tqdm progress bar.

        Returns:
            A list with one list of ids per input text. Symbols missing from
            the encoder map to id 0.
        """
        if verbose:
            texts = tqdm(texts, ncols=80, leave=False)
        texts_tokens = []
        for text in texts:
            doc = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in doc:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
        return texts_tokens
+349
View File
@@ -0,0 +1,349 @@
import os
import time
import math
import json
import joblib
import random
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from functools import partial
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from opt import adam, warmup_cosine, warmup_linear, warmup_constant
from datasets import rocstories
from analysis import rocstories as rocstories_analysis
from text_utils import TextEncoder
from utils import encode_dataset, flatten, iter_data, find_trainable_variables, get_ema_vars, convert_gradient_to_tensor, shape_list, ResultLogger, assign_to_gpu, average_grads, make_path
# Optimizer functions keyed by name.
OPT_FNS = dict(adam=adam)

# Learning-rate schedule functions keyed by name.
LR_SCHEDULES = dict(
    warmup_cosine=warmup_cosine,
    warmup_linear=warmup_linear,
    warmup_constant=warmup_constant,
)
class LossCompute:
    """A Loss compute and train function.

    Args:
        generator: Callable mapping hidden states of width ``n_embed`` to
            vocabulary logits.
        lm_criterion: Callable ``(logits, targets) -> per-element losses``; it
            must not reduce, because the losses are masked afterwards.
        n_embed: Width of the hidden states.
        opt: Optional optimizer; stored but not used yet.
    """

    def __init__(self, generator, lm_criterion, n_embed, opt=None):
        self.generator = generator
        self.lm_criterion = lm_criterion
        self.opt = opt
        self.n_embed = n_embed

    def __call__(self, X, Y, M, h, norm):
        """Return the masked mean language-modelling loss of every sequence.

        Args:
            X: Integer tensor indexed as ``X[:, position, 0]`` for token ids.
            Y: Unused (classification targets).
            M: Mask with ones on real positions, indexed as ``M[:, position]``.
            h: Hidden states, indexed as ``h[:, position]``.
            norm: Unused.
        """
        # Language modeling loss: position t predicts the token at t+1.
        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embed)
        x_shifted = X[:, 1:, 0].contiguous().view(-1)
        lm_logits = self.generator(h_trunc)
        # The criterion scores the logits, not the raw hidden states.
        lm_losses = self.lm_criterion(lm_logits, x_shifted)
        # One position was dropped by the shift, hence the width size(1) - 1.
        lm_losses = lm_losses.view(X.size(0), X.size(1) - 1)
        lm_losses = lm_losses * M[:, 1:]
        lm_losses = lm_losses.sum(1) / torch.sum(M[:, 1:], 1)

        # TODO: the classification loss and the optimizer step are not
        # implemented yet; only the language-modelling loss is returned.
        return lm_losses
# Build the TensorFlow graph for one model replica and return
# (clf_logits, clf_losses, lm_losses).
# X is reshaped to [-1, n_ctx, 2], M to [-1, n_ctx]; Y holds the class labels.
# NOTE(review): `tf`, `dropout`, `embed`, `block` and `clf` are not imported or
# defined in the visible part of this file — confirm where they come from.
# NOTE(review): `reuse` is accepted but never read inside this body.
def model(X, M, Y, train=False, reuse=False):
# Embedding table with n_vocab+n_special+n_ctx rows of width n_embd.
we = tf.get_variable("we", [n_vocab+n_special+n_ctx, n_embd],
initializer=tf.random_normal_initializer(stddev=0.02))
we = dropout(we, embd_pdrop, train)
X = tf.reshape(X, [-1, n_ctx, 2])
M = tf.reshape(M, [-1, n_ctx])
h = embed(X, we)
# Stack n_layer blocks, each under its own name 'h<layer>'.
for layer in range(n_layer):
h = block(h, 'h%d'%layer, train=train, scale=True)
# Language-modelling loss: hidden state at position t scores the token at t+1,
# with the output projection tied to the embedding table (transpose_b=True).
lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
lm_logits = tf.matmul(lm_h, we, transpose_b=True)
lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1]-1])
# Masked mean over the positions of each sequence.
lm_losses = tf.reduce_sum(lm_losses*M[:, 1:], 1)/tf.reduce_sum(M[:, 1:], 1)
# Classification head: pick the hidden state at the first position whose token
# equals clf_token (argmax over the equality mask) in every sequence.
clf_h = tf.reshape(h, [-1, n_embd])
pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
clf_h = tf.gather(clf_h, tf.range(shape_list(X)[0], dtype=tf.int32)*n_ctx+pool_idx)
# Group the two candidate sequences of each example together.
clf_h = tf.reshape(clf_h, [-1, 2, n_embd])
if train and clf_pdrop > 0:
# noise_shape with axis 1 set to 1 shares one dropout mask across both candidates.
shape = shape_list(clf_h)
shape[1] = 1
clf_h = tf.nn.dropout(clf_h, 1-clf_pdrop, shape)
clf_h = tf.reshape(clf_h, [-1, n_embd])
# One score per candidate, then one row of two scores per example.
clf_logits = clf(clf_h, 1, train=train)
clf_logits = tf.reshape(clf_logits, [-1, 2])
clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=clf_logits, labels=Y)
return clf_logits, clf_losses, lm_losses
def mgpu_train(*xs):
    """Build the multi-GPU training graph.

    Every input tensor is split into ``n_gpu`` shards along axis 0, one model
    replica is built per shard, and the per-replica gradients are averaged
    before being handed to the optimizer.

    Args:
        *xs: Input tensors forwarded positionally to ``model``.

    Returns:
        ``[train_op, clf_logits, clf_losses, lm_losses]`` where the last three
        are the replica outputs concatenated along axis 0.
    """
    gpu_ops = []
    gpu_grads = []
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    for i, shard in enumerate(zip(*shards)):
        # Variables are created by the first replica and reused by the others.
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
            clf_logits, clf_losses, lm_losses = model(*shard, train=True, reuse=do_reuse)
            if lm_coef > 0:
                train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    # The lookup tables in this file are named OPT_FNS and LR_SCHEDULES.
    train = OPT_FNS[opt](params, grads, lr, partial(LR_SCHEDULES[lr_schedule], warmup=lr_warmup), n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
    return [train]+ops
def mgpu_predict(*xs):
    """Build the multi-GPU evaluation graph.

    Splits every input into ``n_gpu`` shards along axis 0, runs a
    variable-reusing, non-training ``model`` replica on each shard and returns
    the three model outputs concatenated along axis 0.
    """
    per_gpu = []
    shards = zip(*(tf.split(x, n_gpu, 0) for x in xs))
    for gpu_id, shard in enumerate(shards):
        device = tf.device(assign_to_gpu(gpu_id, "/gpu:0"))
        scope = tf.variable_scope(tf.get_variable_scope(), reuse=True)
        with device, scope:
            clf_logits, clf_losses, lm_losses = model(*shard, train=False, reuse=True)
            per_gpu.append([clf_logits, clf_losses, lm_losses])
    return [tf.concat(op, 0) for op in zip(*per_gpu)]
def transform_roc(X1, X2, X3):
    """Pack encoded stories and their two endings into model input arrays.

    For every example two sequences are built:
    ``[start] + story + [delimiter] + ending + [clf_token]``, with story and
    ending each truncated to ``max_len`` ids.

    Returns:
        ``(xmb, mmb)``: an int32 array of shape (n, 2, n_ctx, 2) whose channel
        0 holds the token ids and channel 1 the ids
        ``n_vocab+n_special .. n_vocab+n_special+n_ctx-1``, and a float32 mask
        of shape (n, 2, n_ctx) with ones on the filled positions.
    """
    n_batch = len(X1)
    xmb = np.zeros((n_batch, 2, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, 2, n_ctx), dtype=np.float32)
    start = encoder['_start_']
    delimiter = encoder['_delimiter_']
    for i, (story, ending_a, ending_b) in enumerate(zip(X1, X2, X3)):
        prefix = [start]+story[:max_len]+[delimiter]
        for slot, ending in enumerate((ending_a, ending_b)):
            seq = prefix+ending[:max_len]+[clf_token]
            length = len(seq)
            xmb[i, slot, :length, 0] = seq
            mmb[i, slot, :length] = 1
    xmb[:, :, :, 1] = np.arange(n_vocab+n_special, n_vocab+n_special+n_ctx)
    return xmb, mmb
def iter_apply(Xs, Ms, Ys):
    """Evaluate the model over a dataset in batches.

    Full batches run through the multi-GPU graph, a smaller last batch through
    the single graph. Both fetched values of every batch are multiplied by the
    batch size.

    Returns:
        ``[logits, cost]``: the concatenated (scaled) logits and the float sum
        of the scaled costs; an empty list when there are no batches.
    """
    logits_parts = []
    cost_parts = []
    for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
        n = len(xmb)
        if n == n_batch_train:
            fetches = [eval_mgpu_logits, eval_mgpu_clf_loss]
            feed = {X_train:xmb, M_train:mmb, Y_train:ymb}
        else:
            fetches = [eval_logits, eval_clf_loss]
            feed = {X:xmb, M:mmb, Y:ymb}
        batch_logits, batch_cost = sess.run(fetches, feed)
        logits_parts.append(batch_logits*n)
        cost_parts.append(batch_cost*n)
    if not logits_parts:
        # Nothing was evaluated, so there is nothing to aggregate.
        return []
    return [np.concatenate(tuple(logits_parts), 0), float(np.sum(tuple(cost_parts)))]
def iter_predict(Xs, Ms):
    """Return the logits for a dataset, computed batch by batch.

    Full batches run through the multi-GPU graph; a smaller last batch runs
    through the single graph.
    """
    batches = []
    for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
        if len(xmb) == n_batch_train:
            batch_logits = sess.run(eval_mgpu_logits, {X_train:xmb, M_train:mmb})
        else:
            batch_logits = sess.run(eval_logits, {X:xmb, M:mmb})
        batches.append(batch_logits)
    return np.concatenate(batches, 0)
def save(path):
    """Fetch the current values of ``params`` and dump them to ``path`` with joblib."""
    values = sess.run(params)
    target = make_path(path)
    joblib.dump(values, target)
def log():
    """Evaluate on the first ``n_valid`` training examples and the validation
    set, log and print the metrics, and when ``submit`` is set save the
    parameters whenever validation accuracy improves on ``best_score``."""
    global best_score
    tr_x, tr_m, tr_y = trX[:n_valid], trM[:n_valid], trY[:n_valid]
    tr_logits, tr_cost = iter_apply(tr_x, tr_m, tr_y)
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost/len(tr_y)
    va_cost = va_cost/n_valid
    tr_acc = accuracy_score(tr_y, np.argmax(tr_logits, 1))*100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1))*100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f'%(n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if not submit:
        return
    if va_acc > best_score:
        best_score = va_acc
        save(os.path.join(save_dir, desc, 'best_params.jl'))
def argmax(x):
    """Return the index of the largest value along axis 1 of ``x``."""
    return np.argmax(x, 1)


# Per-dataset function turning logits into predictions.
pred_fns = dict(rocstories=argmax)

# Per-dataset name of the submission file.
filenames = dict(rocstories='ROCStories.tsv')

# Per-dataset mapping from predicted index to label; None keeps the index.
label_decoders = dict(rocstories=None)
def predict():
    """Predict on the test set and write a tab-separated submission file.

    The file has an ``index``/``prediction`` header followed by one row per
    test example, and is written into ``submission_dir``.
    """
    filename = filenames[dataset]
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]
    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    row = '{}\t{}\n'
    with open(path, 'w') as out:
        out.write(row.format('index', 'prediction'))
        for index, prediction in enumerate(predictions):
            out.write(row.format(index, prediction))
# Script entry point: parse options, encode ROCStories, build the graph, load
# the pre-trained parameters, fine-tune, and optionally predict and analyse.
# NOTE(review): this block calls `tf.*`, but the visible imports of this script
# do not include tensorflow — confirm before running.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--desc', type=str)
    parser.add_argument('--dataset', type=str)
    parser.add_argument('--log_dir', type=str, default='log/')
    parser.add_argument('--save_dir', type=str, default='save/')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--submission_dir', type=str, default='submission/')
    parser.add_argument('--submit', action='store_true')
    parser.add_argument('--analysis', action='store_true')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--n_iter', type=int, default=3)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--lr', type=float, default=6.25e-5)
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--n_ctx', type=int, default=512)
    parser.add_argument('--n_embd', type=int, default=768)
    parser.add_argument('--n_head', type=int, default=12)
    parser.add_argument('--n_layer', type=int, default=12)
    parser.add_argument('--embd_pdrop', type=float, default=0.1)
    parser.add_argument('--attn_pdrop', type=float, default=0.1)
    parser.add_argument('--resid_pdrop', type=float, default=0.1)
    parser.add_argument('--clf_pdrop', type=float, default=0.1)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--n_gpu', type=int, default=4)
    parser.add_argument('--opt', type=str, default='adam')
    parser.add_argument('--afn', type=str, default='gelu')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--n_transfer', type=int, default=12)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    # Every option becomes a module-level global read by the functions above.
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    n_y = 2
    # Append the three special tokens after the regular vocabulary.
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx//2-2
    # Shrink the context to the longest packed example (+3 special tokens),
    # capped at the requested n_ctx.
    n_ctx = min(
        max(
            [len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
            +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
            +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
        )+3, n_ctx
    )
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = n_batch*n_gpu
    n_updates_total = (n_train//n_batch_train)*n_iter

    X_train = tf.placeholder(tf.int32, [n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])

    Y_train = tf.placeholder(tf.int32, [n_batch_train])
    Y = tf.placeholder(tf.int32, [None])

    train, logits, clf_losses, lm_losses = mgpu_train(X_train, M_train, Y_train)
    clf_loss = tf.reduce_mean(clf_losses)

    params = find_trainable_variables('model')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(tf.global_variables_initializer())

    # Rebuild the pre-trained parameters from the ten flat shards.
    with open('model/params_shapes.json') as shapes_file:
        shapes = json.load(shapes_file)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load('model/params_{}.npy'.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
    # Keep the first n_ctx rows of array 0, then merge arrays 1 and 0 with
    # freshly initialised rows for the special tokens in between.
    init_params[0] = init_params[0][:n_ctx]
    init_params[0] = np.concatenate([init_params[1], (np.random.randn(n_special, n_embd)*0.02).astype(np.float32), init_params[0]], 0)
    del init_params[1]

    if n_transfer == -1:
        n_transfer = 0
    else:
        n_transfer = 1+n_transfer*12
    sess.run([p.assign(ip) for p, ip in zip(params[:n_transfer], init_params[:n_transfer])])

    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=True)
    eval_clf_loss = tf.reduce_mean(eval_clf_losses)
    eval_mgpu_clf_loss = tf.reduce_mean(eval_mgpu_clf_losses)

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        save(os.path.join(save_dir, desc, 'best_params.jl'))
    best_score = 0
    for i in range(n_iter):
        for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random), n_batch=n_batch_train, truncate=True, verbose=True):
            cost, _ = sess.run([clf_loss, train], {X_train:xmb, M_train:mmb, Y_train:ymb})
            n_updates += 1
            # Extra evaluations at fixed update counts during the first epoch.
            if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
                log()
        n_epochs += 1
        log()
    if submit:
        # Restore the best saved parameters before predicting.
        sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
        predict()
        if analysis:
            # Read the log this run wrote ('<desc>.jsonl'), not a fixed name.
            rocstories_analysis(data_dir, os.path.join(submission_dir, 'ROCStories.tsv'), os.path.join(log_dir, '{}.jsonl'.format(desc)))
+186
View File
@@ -0,0 +1,186 @@
import os
import re
import sys
import json
import math
import time
import unicodedata
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import function
from tqdm import tqdm
from functools import partial
def encode_dataset(*splits, encoder):
    """Encode the text fields of every split in ``splits[0]``.

    Only the first positional argument is read. A field whose first element is
    a ``str`` is replaced by ``encoder.encode(field)``; any other field is kept
    unchanged.

    Returns:
        A list with one list of fields per split.
    """
    def _encode_field(field):
        if isinstance(field[0], str):
            return encoder.encode(field)
        return field

    return [[_encode_field(field) for field in split] for split in splits[0]]
def stsb_label_encoding(labels, nclass=6):
    """
    Label encoding from Tree LSTM paper (Tai, Socher, Manning)

    Each real-valued label ``y`` spreads its mass over the two neighbouring
    integer classes: ``floor(y)`` receives ``floor(y) - y + 1`` and
    ``floor(y) + 1`` receives ``y - floor(y)``. Classes outside
    ``[0, nclass)`` are dropped.

    Returns:
        A float32 array of shape ``(len(labels), nclass)``.
    """
    Y = np.zeros((len(labels), nclass)).astype(np.float32)
    for row, y in enumerate(labels):
        lower = np.floor(y)
        upper = lower + 1
        # Write the upper class first and the lower one second.
        if 0 <= upper < nclass:
            Y[row, int(upper)] = y - lower
        if 0 <= lower < nclass:
            Y[row, int(lower)] = lower - y + 1
    return Y
def shape_list(x):
    """
    deal with dynamic shape in tensorflow cleanly

    Returns one entry per dimension of ``x``: the static size when it is
    known, otherwise the matching element of ``tf.shape(x)``.
    """
    static = x.get_shape().as_list()
    dynamic = tf.shape(x)
    return [dynamic[axis] if dim is None else dim for axis, dim in enumerate(static)]
def np_softmax(x, t=1):
    """Return the softmax of ``x / t`` along the last axis.

    The row maximum is subtracted before exponentiating to avoid overflow.
    """
    scaled = x/t
    shifted = scaled - np.max(scaled, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights/np.sum(weights, axis=-1, keepdims=True)
def make_path(f):
    """Ensure the parent directory of ``f`` exists and return ``f`` unchanged.

    A path without a directory component needs nothing created.
    """
    d = os.path.dirname(f)
    if d:
        # exist_ok avoids the race between checking for the directory and
        # creating it when several processes write to the same location.
        os.makedirs(d, exist_ok=True)
    return f
def _identity_init(shape, dtype, partition_info, scale):
    """Return ``scale`` times an identity matrix of size ``shape[-1]`` as float32.

    The matrix is reshaped to ``shape`` when exactly two of its dimensions
    differ from 1. ``dtype`` and ``partition_info`` are accepted but not read.
    """
    size = shape[-1]
    weights = np.eye(size)*scale
    non_unit_dims = sum(1 for s in shape if s != 1)
    if non_unit_dims == 2:
        weights = weights.reshape(shape)
    return weights.astype(np.float32)


def identity_init(scale=1.0):
    """Return an initializer ``f(shape, dtype, partition_info)`` with ``scale`` bound."""
    initializer = partial(_identity_init, scale=scale)
    return initializer
def _np_init(shape, dtype, partition_info, w):
    """Return ``w`` unchanged; the other arguments are accepted but not read."""
    return w


def np_init(w):
    """Return an initializer ``f(shape, dtype, partition_info)`` that yields ``w``."""
    initializer = partial(_np_init, w=w)
    return initializer
class ResultLogger(object):
    """Append JSON records, one per line, to a log file.

    The constructor writes its keyword arguments as the first record. Every
    record receives a ``time`` field unless the caller supplies one.
    """

    def __init__(self, path, *args, **kwargs):
        kwargs.setdefault('time', time.time())
        self.f_log = open(make_path(path), 'w')
        self.f_log.write(json.dumps(kwargs)+'\n')

    def log(self, **kwargs):
        """Write ``kwargs`` as one JSON line and flush the file."""
        kwargs.setdefault('time', time.time())
        line = json.dumps(kwargs)+'\n'
        self.f_log.write(line)
        self.f_log.flush()

    def close(self):
        """Close the underlying log file."""
        self.f_log.close()
def find_trainable_variables(key):
    """Return the trainable variables from the TRAINABLE_VARIABLES collection,
    filtered with the scope pattern ``.*<key>.*``."""
    pattern = ".*{}.*".format(key)
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, pattern)
def flatten(outer):
    """Concatenate the elements of every iterable in ``outer`` into one list."""
    flat = []
    for inner in outer:
        flat.extend(inner)
    return flat
def remove_none(l):
    """Return a list of the elements of ``l`` that are not ``None``."""
    kept = []
    for element in l:
        if element is None:
            continue
        kept.append(element)
    return kept
def iter_data(*datas, n_batch=128, truncate=False, verbose=False, max_batches=float("inf")):
    """Yield consecutive batches of ``n_batch`` items from parallel sequences.

    Args:
        *datas: One or more sliceable sequences of equal length.
        n_batch: Batch size.
        truncate: If true, drop the final incomplete batch.
        verbose: If true, show a tqdm progress bar on stderr.
        max_batches: Upper bound on the number of batches yielded.

    Yields:
        A slice of the sequence when a single sequence is given, otherwise a
        tuple with one slice per sequence.
    """
    n = len(datas[0])
    if truncate:
        n = (n//n_batch)*n_batch
    n = min(n, max_batches*n_batch)
    n_batches = 0
    # Ceiling division so that a trailing partial batch is counted as well.
    total = -(-n//n_batch)
    # disable= silences the bar without opening (and leaking) a devnull handle.
    for i in tqdm(range(0, n, n_batch), total=total, file=sys.stderr, ncols=80, leave=False, disable=not verbose):
        if n_batches >= max_batches:
            # Raising StopIteration inside a generator is turned into a
            # RuntimeError by PEP 479; returning ends the iteration cleanly.
            return
        if len(datas) == 1:
            yield datas[0][i:i+n_batch]
        else:
            # Materialise the slices now so a stored batch does not change
            # when the loop index advances.
            yield tuple(d[i:i+n_batch] for d in datas)
        n_batches += 1
def get_ema_if_exists(v, gvs):
    """Return the first variable in ``gvs`` named
    ``<v's name without ':N'>/ExponentialMovingAverage:0``, or ``v`` itself
    when no such variable exists."""
    base_name = v.name.split(':')[0]
    ema_name = base_name+'/ExponentialMovingAverage:0'
    for candidate in gvs:
        if candidate.name == ema_name:
            return candidate
    return v
def get_ema_vars(*vs):
    """Swap variables for their exponential-moving-average versions.

    The swap only happens when the current variable scope has ``reuse`` set.
    A single argument yields a single variable; several yield a sequence.
    """
    if tf.get_variable_scope().reuse:
        global_vars = tf.global_variables()
        vs = [get_ema_if_exists(v, global_vars) for v in vs]
    return vs[0] if len(vs) == 1 else vs
# Identity function registered through function.Defun with a custom gradient:
# the forward pass returns x unchanged, the gradient function passes the
# incoming gradient dy through tf.convert_to_tensor, and the shape function
# reports the shape of the first input.
@function.Defun(
python_grad_func=lambda x, dy: tf.convert_to_tensor(dy),
shape_func=lambda op: [op.inputs[0].get_shape()])
def convert_gradient_to_tensor(x):
"""force gradient to be a dense tensor
it's often faster to do dense embedding gradient on GPU than sparse on CPU
"""
return x
def assign_to_gpu(gpu=0, ps_dev="/device:CPU:0"):
    """Return a device function for ``tf.device``.

    Ops whose type is ``"Variable"`` are placed on ``ps_dev``; every other op
    is placed on ``/gpu:<gpu>``.
    """
    def _assign(op):
        if isinstance(op, tf.NodeDef):
            node_def = op
        else:
            node_def = op.node_def
        return ps_dev if node_def.op == "Variable" else "/gpu:%d" % gpu
    return _assign
def average_grads(tower_grads):
    """Average gradients across towers.

    Args:
        tower_grads: One list per tower of ``(gradient, variable)`` pairs, all
            in the same variable order.

    Returns:
        A list of ``(averaged_gradient, variable)`` pairs, taking the variable
        from the first tower. A gradient that is None on the first tower stays
        None.
    """
    def average_dense(grad_and_vars):
        """Return the element-wise mean of dense gradients."""
        if len(grad_and_vars) == 1:
            return grad_and_vars[0][0]
        grad = grad_and_vars[0][0]
        for g, _ in grad_and_vars[1:]:
            # Build a new value rather than using +=, which would modify the
            # first tower's gradient in place for mutable array types.
            grad = grad + g
        return grad / len(grad_and_vars)

    def average_sparse(grad_and_vars):
        """Return the mean of IndexedSlices gradients as one IndexedSlices."""
        if len(grad_and_vars) == 1:
            return grad_and_vars[0][0]
        indices = []
        values = []
        for g, _ in grad_and_vars:
            indices += [g.indices]
            values += [g.values]
        indices = tf.concat(indices, 0)
        # Concatenated slices with repeated indices add up when applied, so
        # divide by the tower count to obtain an average instead of a sum,
        # matching the dense branch.
        values = tf.concat(values, 0) / len(grad_and_vars)
        return tf.IndexedSlices(values, indices, grad_and_vars[0][0].dense_shape)

    averaged = []
    for grad_and_vars in zip(*tower_grads):
        first_grad = grad_and_vars[0][0]
        if first_grad is None:
            grad = None
        elif isinstance(first_grad, tf.IndexedSlices):
            grad = average_sparse(grad_and_vars)
        else:
            grad = average_dense(grad_and_vars)
        v = grad_and_vars[0][1]
        averaged.append((grad, v))
    return averaged