use pytorch lightning remove potential leakage, misc

This commit is contained in:
wassname
2022-02-11 20:05:08 +08:00
parent e8ab8fc1f4
commit 399896bd52
10 changed files with 3413 additions and 1025 deletions
+2 -2
View File
@@ -1,3 +1,4 @@
/default/
# Created by https://www.gitignore.io/api/linux,python,windows,jupyternotebook
@@ -147,5 +148,4 @@ $RECYCLE.BIN/
*.lnk
# End of https://www.gitignore.io/api/linux,python,windows,jupyternotebook
.demo_cache.sqlite
demo_cache.sqlite
-533
View File
File diff suppressed because one or more lines are too long
-153
View File
@@ -1,153 +0,0 @@
# %reload_ext autoreload
# %autoreload 2
import matplotlib.pyplot as plt
# %matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12.0, 3)
import numpy as np
import tqdm
import torch
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from utils import read_timeseries,generate_sequence, plt_lmbda
from module import GTPP
from run import get_parser
# +
parser = get_parser()
config = parser.parse_args([])
path = 'data/'
# Load the train/validation/test splits for the selected dataset.
# Only the 'exponential_hawkes' CSV triplet under `path` is handled here.
if config.data == 'exponential_hawkes':
    train_data = read_timeseries(path + config.data + '_training.csv')
    val_data = read_timeseries(path + config.data + '_validation.csv')
    test_data = read_timeseries(path + config.data + '_testing.csv')
else:
    # `NotImplemented` is the binary-operator sentinel and is not callable;
    # calling it raises TypeError. NotImplementedError is the exception type.
    raise NotImplementedError('only exponential_hawkes')
train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
model = GTPP(config)
# Manual training loop with early stopping on the validation loss.
# NOTE(review): indentation was lost in this view, so the nesting of the
# statements below (in particular whether the early-stopping check sits inside
# the validation loop) cannot be confirmed from here.
# Early-stopping state: lowest validation loss seen so far, the number of
# non-improving checks, and the count at which training stops.
best_loss = 1e3
patients = 0
tol = 333
for epoch in range(config.epochs):
model.train()
# Running sums, over this epoch's batches, of the first three values returned
# by model.train_batch (used below as NLL, log-lambda and integral-lambda).
loss1 = loss2 = loss3 = 0
for batch in train_loader:
loss, log_lmbda, int_lmbda, lmbda = model.train_batch(batch)
loss1 += loss
loss2 += log_lmbda
loss3 += int_lmbda
model.eval()
# Only the values from the last validation batch survive this loop.
for batch in val_loader:
val_loss, val_log_lmbda, val_int_lmbda, _ = model(batch)
# NOTE(review): `patients` is never reset when the loss improves, so `tol`
# bounds the total number of non-improving checks, not a consecutive streak.
if best_loss > val_loss:
best_loss = val_loss.item()
else:
patients += 1
if patients >= tol:
print("Early Stop")
print("epoch", epoch)
plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
break
# Periodic progress report: sums are divided by the number of sequences
# (size(0) of the time-sequence tensors) and the log-lambda term is negated.
if epoch % config.prt_evry == 0:
print("Epochs:{}".format(epoch))
print("Training : Negative Log Likelihood:{:2.6f} Log Lambda:{:2.6f}: Integral Lambda:{:2.6f}".format(loss1/train_timeseq.size(0), -loss2 / train_timeseq.size(0), loss3 / train_timeseq.size(0)))
print("Validation: Negative Log Likelihood:{:2.6f} Log Lambda:{:2.6f}: Integral Lambda:{:2.6f}".format(val_loss / val_timeseq.size(0),
-val_log_lmbda / val_timeseq.size(0),
val_int_lmbda/val_timeseq.size(0)))
plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
# Final plot on the first test series.
plt_lmbda(test_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
print("end")
# -
# +
# class CryptoTraderPL_NLL(pl.LightningModule):
# def __init__(self, config):
# super().__init__()
# self.config = config
# self._model = GTPP(config)
# def forward(self, x):
# return self._model(x)
# def training_step(self, batch, batch_idx, phase='train'):
# """
# Training step which runs for N steps, and get loss over all of them
# """
# x, l, r = batch
# y_pred = self._model(x)
# # we have multiple targets. So move them to batch
# l2 = l.reshape(-1)
# y_pred2 = y_pred.reshape((*l2.shape, 3))
# loss = F.nll_loss(y_pred2, l2)
# # record weights
# self.log_dict({
# f'loss/{phase}': loss,
# }, prog_bar=True)
# assert torch.isfinite(loss)
# return loss
# def validation_step(self, batch, batch_idx):
# return self.training_step(batch, batch_idx, phase='val')
# def predict_step(self, batch, batch_idx):
# x, y, r = batch
# y_pred = self.forward(x)
# return y_pred, y, r
# def configure_optimizers(self):
# optim = Ranger21(self.parameters(),
# lr=self.train_kwargs['lr'],
# num_epochs=num_epochs,
# num_batches_per_epoch=num_batches_per_epoch,
# weight_decay=self.train_kwargs['weight_decay'])
# return {'optimizer': optim, 'monitor': 'loss/val'}
# -
+2885
View File
File diff suppressed because one or more lines are too long
+181
View File
@@ -0,0 +1,181 @@
# %reload_ext autoreload
# %autoreload 2
import matplotlib.pyplot as plt
# %matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12.0, 3)
import numpy as np
import tqdm
import torch
from torch.utils.data import DataLoader
from pathlib import Path
import pandas as pd
from utils import read_timeseries,generate_sequence, plt_lmbda
from module import GTPP
from run import get_parser
parser = get_parser()
argv = """
--epochs=100
""".replace('\n', '').split()
config = parser.parse_args(argv)
config
# # Data
# +
path = 'data/'
# Load the train/validation/test splits for the selected dataset.
# Only the 'exponential_hawkes' CSV triplet under `path` is handled here.
if config.data == 'exponential_hawkes':
    train_data = read_timeseries(path + config.data + '_training.csv')
    val_data = read_timeseries(path + config.data + '_validation.csv')
    test_data = read_timeseries(path + config.data + '_testing.csv')
else:
    # `NotImplemented` is the binary-operator sentinel and is not callable;
    # calling it raises TypeError. NotImplementedError is the exception type.
    raise NotImplementedError('only exponential_hawkes')
train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
# -
# # Model
import torch.optim
# +
import pytorch_lightning as pl
class CryptoTraderNPP(pl.LightningModule):
    """PyTorch Lightning wrapper around the GTPP model.

    The wrapped model is called with a whole batch and its first return value
    is used as the loss. Training and validation share one step
    implementation; the loss is logged as ``loss/train`` or ``loss/val``.
    """

    def __init__(self, config):
        """Store ``config`` and build the wrapped ``GTPP(config)`` model."""
        super().__init__()
        self.config = config
        self._model = GTPP(config)

    def forward(self, x):
        """Delegate to the wrapped model and return its output unchanged."""
        return self._model(x)

    def training_step(self, batch, batch_idx, phase='train'):
        """Run the model on ``batch``, log the loss under ``loss/{phase}`` and return it.

        Outside the 'train' phase the returned loss is detached.
        """
        # We need grad even in val and test, so force it on here.
        # NOTE(review): presumably because the model differentiates w.r.t. its
        # time input inside forward -- confirm against module.py.
        torch.set_grad_enabled(True)
        loss, *_ = self._model(batch)
        if phase != 'train':
            # free the graph, free mem
            loss = loss.detach()
        self.log_dict({
            f'loss/{phase}': loss,
        }, prog_bar=True)
        assert torch.isfinite(loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Same as ``training_step`` but logged under the 'val' phase."""
        return self.training_step(batch, batch_idx, phase='val')

    def predict_step(self, batch, batch_idx):
        """Return the model output followed by the elements of ``batch``."""
        y_pred = self.forward(batch)
        # on predict we want to return multiple values, not just the loss
        return (y_pred, *batch)

    def on_phase_end(self) -> None:
        """Clear gradients and the CUDA cache; shared by the ``on_*_end`` hooks."""
        # this seems to help with cuda memory
        self._model.zero_grad()
        torch.cuda.empty_cache()

    def on_train_end(self):
        self.on_phase_end()

    def on_validation_end(self):
        self.on_phase_end()

    def on_predict_end(self):
        self.on_phase_end()

    def on_epoch_end(self):
        """Every 5th epoch, plot the first train and validation series.

        The module is moved to CPU in eval mode for plotting and then moved
        back to its previous device in train mode. ``train_data`` and
        ``val_data`` are module-level globals of the enclosing script.
        """
        if self.trainer.current_epoch % 5 == 0:
            i = 0
            device = self.device
            self.eval().cpu()
            plt.title(f'train {i} e={self.trainer.current_epoch}')
            plt_lmbda(train_data[i], model=self, seq_len=self.config.seq_len, log_mode=self.config.log_mode)
            plt.show()
            plt.title(f'val {i} e={self.trainer.current_epoch}')
            plt_lmbda(val_data[i], model=self, seq_len=self.config.seq_len, log_mode=self.config.log_mode)
            plt.show()
            # Restore *this* module. The global `model` is only defined after
            # the class and need not be the instance being trained.
            self.to(device).train()

    def configure_optimizers(self):
        """Use Adam with the learning rate from this module's own config."""
        optim = torch.optim.Adam(self.parameters(), lr=self.config.lr)
        return {'optimizer': optim, 'monitor': 'loss/val'}
# -
model = CryptoTraderNPP(config)
model
# # Train
import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger
# Build the Lightning trainer; metrics are written as CSV under ../outputs/logs.
trainer = pl.Trainer(
    max_epochs=config.epochs,
    # Use one GPU when CUDA is available, otherwise fall back to CPU instead of
    # failing at start-up on machines without a GPU.
    gpus=1 if torch.cuda.is_available() else 0,
    logger=[
        CSVLogger('../outputs/logs')
    ],
)
trainer.fit(model, train_loader, val_loader)
# # Hist
csv_logger = trainer.logger[0]
hp = Path(csv_logger.experiment.metrics_file_path)
df = pd.read_csv(hp).groupby('epoch').min()[['loss/train', 'loss/val']]
df.plot(logy=True)
plt.show()
df.plot()
# # Plot
# +
i=0
plt.title(f'train {i}')
plt_lmbda(train_data[i], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
plt.show()
plt.title(f'val {i}')
plt_lmbda(val_data[i], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
plt.show()
# -
# Plot the first training series with the opposite log_mode setting.
plt.title(f'train {i}')
# `~` is bitwise NOT: ~True == -2 and ~False == -1, both truthy, so it never
# flips the flag. `not` gives the logical inverse.
# NOTE(review): assumes config.log_mode is a boolean flag -- confirm in get_parser.
plt_lmbda(train_data[i], model=model, seq_len=config.seq_len, log_mode=not config.log_mode)
plt.show()
plt.title(f'train {i}')
plt_lmbda(train_data[i], model=model, alpha=0.01, lmbda0=0, seq_len=config.seq_len, log_mode=config.log_mode)
plt.show()
+33 -28
View File
@@ -3,7 +3,7 @@ from torch import nn
from torch.autograd import grad
from torch.optim import Adam
from torch.nn import functional as F
from optimization import BertAdam
# from optimization import BertAdam
from matplotlib import pyplot as plt
@@ -21,12 +21,20 @@ class IntensityNet(nn.Module):
self.mean_first = config.mean_first
self.log_t = config.log_t
self.init_weights_positive()
def init_weights_positive(self):
eps = 1e-10
for p in self.parameters():
p.data = torch.abs(p.data)
p.data = torch.clamp(p.data, min=eps)
def forward(self, hidden_state, target_time):
eps = 1e-10
for p in self.parameters():
p.data *= (p.data>=0)
p.data = torch.clamp(p.data, min=eps)
target_time.requires_grad_(True)
if self.log_t:
@@ -53,6 +61,7 @@ class IntensityNet(nn.Module):
return [nll, log_lmbda_mean, int_lmbda_mean, lmbda]
LEAK=1
class GTPP(nn.Module):
def __init__(self, config):
@@ -66,22 +75,22 @@ class GTPP(nn.Module):
self.embedding = nn.Embedding(num_embeddings=config.event_class, embedding_dim=config.emb_dim)
self.emb_drop = nn.Dropout(p=config.dropout)
self.lstm = nn.LSTM(input_size=1+config.emb_dim,
self.lstm = nn.LSTM(input_size=LEAK+config.emb_dim,
hidden_size=config.hid_dim,
batch_first=True,
bidirectional=False)
self.intensity_net = IntensityNet(config)
self.set_optimizer(total_step=1)
# self.set_optimizer(total_step=1)
def set_optimizer(self, total_step, use_bert=False):
if use_bert:
self.set_optimizer = BertAdam(params=self.parameters(),
lr=self.lr,
warmup=0.1,
t_total=total_step)
else:
self.set_optimizer = Adam(self.parameters(), lr=self.lr)
# def set_optimizer(self, total_step, use_bert=False):
# if use_bert:
# self.set_optimizer = BertAdam(params=self.parameters(),
# lr=self.lr,
# warmup=0.1,
# t_total=total_step)
# else:
# self.set_optimizer = Adam(self.parameters(), lr=self.lr)
def forward(self, batch):
@@ -89,31 +98,27 @@ class GTPP(nn.Module):
event_seq = event_seq.long()
emb = self.embedding(event_seq)
emb = self.emb_drop(emb)
lstm_input = torch.cat([emb, time_seq.unsqueeze(-1)], dim=-1)
if LEAK:
lstm_input = torch.cat([emb[:, :-1], time_seq[:, :-1].unsqueeze(-1)], dim=-1)
else:
lstm_input = emb
hidden_state, _ = self.lstm(lstm_input)
# FIXME wait we pass the target time into the LSTM. Is this data leakage?
nll, log_lmbda, int_lmbda, lmbda = self.intensity_net(hidden_state, time_seq[:, -1])
return [nll, log_lmbda.detach(), int_lmbda.detach(), lmbda.detach()]
def train_batch(self, batch):
self.set_optimizer.zero_grad()
nll, log_lmbda, int_lmbda, lmbda = self.forward(batch)
loss = nll
loss.backward()
self.set_optimizer.step()
return nll.item(), log_lmbda.item(), int_lmbda.item(), lmbda
# def train_batch(self, batch):
# self.set_optimizer.zero_grad()
# nll, log_lmbda, int_lmbda, lmbda = self.forward(batch)
# loss = nll
# loss.backward()
# self.set_optimizer.step()
# return nll.item(), log_lmbda.item(), int_lmbda.item(), lmbda
+260 -260
View File
@@ -1,304 +1,304 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""
# # coding=utf-8
# # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
# """PyTorch optimization for BERT model."""
import math
import torch
# import math
# import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
import logging
import abc
import sys
# from torch.optim import Optimizer
# from torch.optim.optimizer import required
# from torch.nn.utils import clip_grad_norm_
# import logging
# import abc
# import sys
logger = logging.getLogger(__name__)
# logger = logging.getLogger(__name__)
if sys.version_info >= (3, 4):
ABC = abc.ABC
else:
ABC = abc.ABCMeta('ABC', (), {})
# if sys.version_info >= (3, 4):
# ABC = abc.ABC
# else:
# ABC = abc.ABCMeta('ABC', (), {})
class _LRSchedule(ABC):
""" Parent of all LRSchedules here. """
warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense
def __init__(self, warmup=0.002, t_total=-1, **kw):
"""
:param warmup: what fraction of t_total steps will be used for linear warmup
:param t_total: how many training steps (updates) are planned
:param kw:
"""
super(_LRSchedule, self).__init__(**kw)
if t_total < 0:
logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
warmup = max(warmup, 0.)
self.warmup, self.t_total = float(warmup), float(t_total)
self.warned_for_t_total_at_progress = -1
# class _LRSchedule(ABC):
# """ Parent of all LRSchedules here. """
# warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense
# def __init__(self, warmup=0.002, t_total=-1, **kw):
# """
# :param warmup: what fraction of t_total steps will be used for linear warmup
# :param t_total: how many training steps (updates) are planned
# :param kw:
# """
# super(_LRSchedule, self).__init__(**kw)
# if t_total < 0:
# logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
# if not 0.0 <= warmup < 1.0 and not warmup == -1:
# raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
# warmup = max(warmup, 0.)
# self.warmup, self.t_total = float(warmup), float(t_total)
# self.warned_for_t_total_at_progress = -1
def get_lr(self, step, nowarn=False):
"""
:param step: which of t_total steps we're on
:param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
:return: learning rate multiplier for current update
"""
if self.t_total < 0:
return 1.
progress = float(step) / self.t_total
ret = self.get_lr_(progress)
# warning for exceeding t_total (only active with warmup_linear
if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
logger.warning(
"Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
.format(ret, self.__class__.__name__))
self.warned_for_t_total_at_progress = progress
# end warning
return ret
# def get_lr(self, step, nowarn=False):
# """
# :param step: which of t_total steps we're on
# :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
# :return: learning rate multiplier for current update
# """
# if self.t_total < 0:
# return 1.
# progress = float(step) / self.t_total
# ret = self.get_lr_(progress)
# # warning for exceeding t_total (only active with warmup_linear
# if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
# logger.warning(
# "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
# .format(ret, self.__class__.__name__))
# self.warned_for_t_total_at_progress = progress
# # end warning
# return ret
@abc.abstractmethod
def get_lr_(self, progress):
"""
:param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
:return: learning rate multiplier for current update
"""
return 1.
# @abc.abstractmethod
# def get_lr_(self, progress):
# """
# :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
# :return: learning rate multiplier for current update
# """
# return 1.
class ConstantLR(_LRSchedule):
def get_lr_(self, progress):
return 1.
# class ConstantLR(_LRSchedule):
# def get_lr_(self, progress):
# return 1.
class WarmupCosineSchedule(_LRSchedule):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
"""
warn_t_total = True
def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
"""
:param warmup: see LRSchedule
:param t_total: see LRSchedule
:param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
:param kw:
"""
super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
self.cycles = cycles
# class WarmupCosineSchedule(_LRSchedule):
# """
# Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
# Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
# If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
# """
# warn_t_total = True
# def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
# """
# :param warmup: see LRSchedule
# :param t_total: see LRSchedule
# :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
# :param kw:
# """
# super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
# self.cycles = cycles
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
else:
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
# def get_lr_(self, progress):
# if progress < self.warmup:
# return progress / self.warmup
# else:
# progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
# return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
learning rate (with hard restarts).
"""
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
assert(cycles >= 1.)
# class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
# """
# Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
# If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
# learning rate (with hard restarts).
# """
# def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
# super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
# assert(cycles >= 1.)
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
else:
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
return ret
# def get_lr_(self, progress):
# if progress < self.warmup:
# return progress / self.warmup
# else:
# progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
# ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
# return ret
class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
"""
All training progress is divided in `cycles` (default=1.) parts of equal length.
Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
followed by a learning rate decreasing from 1. to 0. following a cosine curve.
"""
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
assert(warmup * cycles < 1.)
warmup = warmup * cycles if warmup >= 0 else warmup
super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
# class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
# """
# All training progress is divided in `cycles` (default=1.) parts of equal length.
# Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
# followed by a learning rate decreasing from 1. to 0. following a cosine curve.
# """
# def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
# assert(warmup * cycles < 1.)
# warmup = warmup * cycles if warmup >= 0 else warmup
# super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
def get_lr_(self, progress):
progress = progress * self.cycles % 1.
if progress < self.warmup:
return progress / self.warmup
else:
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
ret = 0.5 * (1. + math.cos(math.pi * progress))
return ret
# def get_lr_(self, progress):
# progress = progress * self.cycles % 1.
# if progress < self.warmup:
# return progress / self.warmup
# else:
# progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
# ret = 0.5 * (1. + math.cos(math.pi * progress))
# return ret
class WarmupConstantSchedule(_LRSchedule):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
Keeps learning rate equal to 1. after warmup.
"""
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
return 1.
# class WarmupConstantSchedule(_LRSchedule):
# """
# Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
# Keeps learning rate equal to 1. after warmup.
# """
# def get_lr_(self, progress):
# if progress < self.warmup:
# return progress / self.warmup
# return 1.
class WarmupLinearSchedule(_LRSchedule):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
"""
warn_t_total = True
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
return max((progress - 1.) / (self.warmup - 1.), 0.)
# class WarmupLinearSchedule(_LRSchedule):
# """
# Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
# Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
# """
# warn_t_total = True
# def get_lr_(self, progress):
# if progress < self.warmup:
# return progress / self.warmup
# return max((progress - 1.) / (self.warmup - 1.), 0.)
SCHEDULES = {
None: ConstantLR,
"none": ConstantLR,
"warmup_cosine": WarmupCosineSchedule,
"warmup_constant": WarmupConstantSchedule,
"warmup_linear": WarmupLinearSchedule
}
# SCHEDULES = {
# None: ConstantLR,
# "none": ConstantLR,
# "warmup_cosine": WarmupCosineSchedule,
# "warmup_constant": WarmupConstantSchedule,
# "warmup_linear": WarmupLinearSchedule
# }
class BertAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
schedule: schedule to use for the warmup (see above).
Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
If `None` or `'none'`, learning rate is always kept constant.
Default : `'warmup_linear'`
b1: Adams b1. Default: 0.9
b2: Adams b2. Default: 0.999
e: Adams epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# initialize schedule object
if not isinstance(schedule, _LRSchedule):
schedule_type = SCHEDULES[schedule]
schedule = schedule_type(warmup=warmup, t_total=t_total)
else:
if warmup != -1 or t_total != -1:
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
"Please specify custom warmup and t_total in _LRSchedule object.")
defaults = dict(lr=lr, schedule=schedule,
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(BertAdam, self).__init__(params, defaults)
# class BertAdam(Optimizer):
# """Implements BERT version of Adam algorithm with weight decay fix.
# Params:
# lr: learning rate
# warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
# t_total: total number of training steps for the learning
# rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
# schedule: schedule to use for the warmup (see above).
# Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
# If `None` or `'none'`, learning rate is always kept constant.
# Default : `'warmup_linear'`
# b1: Adams b1. Default: 0.9
# b2: Adams b2. Default: 0.999
# e: Adams epsilon. Default: 1e-6
# weight_decay: Weight decay. Default: 0.01
# max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
# """
# def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
# b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
# if lr is not required and lr < 0.0:
# raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
# if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
# raise ValueError("Invalid schedule parameter: {}".format(schedule))
# if not 0.0 <= b1 < 1.0:
# raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
# if not 0.0 <= b2 < 1.0:
# raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
# if not e >= 0.0:
# raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# # initialize schedule object
# if not isinstance(schedule, _LRSchedule):
# schedule_type = SCHEDULES[schedule]
# schedule = schedule_type(warmup=warmup, t_total=t_total)
# else:
# if warmup != -1 or t_total != -1:
# logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
# "Please specify custom warmup and t_total in _LRSchedule object.")
# defaults = dict(lr=lr, schedule=schedule,
# b1=b1, b2=b2, e=e, weight_decay=weight_decay,
# max_grad_norm=max_grad_norm)
# super(BertAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
lr.append(lr_scheduled)
return lr
# def get_lr(self):
# lr = []
# for group in self.param_groups:
# for p in group['params']:
# state = self.state[p]
# if len(state) == 0:
# return [0]
# lr_scheduled = group['lr']
# lr_scheduled *= group['schedule'].get_lr(state['step'])
# lr.append(lr_scheduled)
# return lr
def step(self, closure=None):
"""Performs a single optimization step.
# def step(self, closure=None):
# """Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
# Arguments:
# closure (callable, optional): A closure that reevaluates the model
# and returns the loss.
# """
# loss = None
# if closure is not None:
# loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
# for group in self.param_groups:
# for p in group['params']:
# if p.grad is None:
# continue
# grad = p.grad.data
# if grad.is_sparse:
# raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['next_m'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['next_v'] = torch.zeros_like(p.data)
# # State initialization
# if len(state) == 0:
# state['step'] = 0
# # Exponential moving average of gradient values
# state['next_m'] = torch.zeros_like(p.data)
# # Exponential moving average of squared gradient values
# state['next_v'] = torch.zeros_like(p.data)
next_m, next_v = state['next_m'], state['next_v']
beta1, beta2 = group['b1'], group['b2']
# next_m, next_v = state['next_m'], state['next_v']
# beta1, beta2 = group['b1'], group['b2']
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# # Add grad clipping
# if group['max_grad_norm'] > 0:
# clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
# In-place operations to update the averages at the same time
next_m.mul_(beta1).add_(1 - beta1, grad)
next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
update = next_m / (next_v.sqrt() + group['e'])
# # Decay the first and second moment running average coefficient
# # In-place operations to update the averages at the same time
# next_m.mul_(beta1).add_(1 - beta1, grad)
# next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
# update = next_m / (next_v.sqrt() + group['e'])
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
# # Just adding the square of the weights to the loss function is *not*
# # the correct way of using L2 regularization/weight decay with Adam,
# # since that will interact with the m and v parameters in strange ways.
# #
# # Instead we want to decay the weights in a manner that doesn't interact
# # with the m/v parameters. This is equivalent to adding the square
# # of the weights to the loss with plain (non-momentum) SGD.
# if group['weight_decay'] > 0.0:
# update += group['weight_decay'] * p.data
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
# lr_scheduled = group['lr']
# lr_scheduled *= group['schedule'].get_lr(state['step'])
update_with_lr = lr_scheduled * update
p.data.add_(-update_with_lr)
# update_with_lr = lr_scheduled * update
# p.data.add_(-update_with_lr)
state['step'] += 1
# state['step'] += 1
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# No bias correction
# bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step']
# # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# # No bias correction
# # bias_correction1 = 1 - beta1 ** state['step']
# # bias_correction2 = 1 - beta2 ** state['step']
return loss
# return loss
+3
View File
@@ -1,6 +1,9 @@
Modified by wassname from the below:
Changes:
- [ ] clamp weight with epsilon for stability
- from `p.data *= (p.data>=0)`
- to `p.data = torch.clamp(p.data, min=eps)`
- [ ] try log t
- [ ] try not mean as much in intensity layer
- [ ] use pytorch lightning
+47 -47
View File
@@ -38,76 +38,76 @@ def get_parser():
parser.add_argument("--mean_first", action="store_true", help="in model take mean first")
return parser
if __name__ == "__main__":
# if __name__ == "__main__":
parser = get_parser()
# parser = get_parser()
config = parser.parse_args()
# config = parser.parse_args()
path = 'data/'
# path = 'data/'
# Load the train/validation/test splits for the selected dataset.
# File names are built as `<path><config.data>_{training,validation,testing}.csv`.
if config.data == 'exponential_hawkes':
# if config.data == 'exponential_hawkes':
    train_data = read_timeseries(path + config.data + '_training.csv')
    val_data = read_timeseries(path + config.data + '_validation.csv')
    test_data = read_timeseries(path + config.data + '_testing.csv')
else:
    # `NotImplemented` is the sentinel returned by binary-operator methods and
    # is not callable; calling it raised a confusing TypeError. The exception
    # class intended here is NotImplementedError.
    raise NotImplementedError('only exponential_hawkes')
# train_data = read_timeseries(path + config.data + '_training.csv')
# val_data = read_timeseries(path + config.data + '_validation.csv')
# test_data = read_timeseries(path + config.data + '_testing.csv')
# else:
# raise NotImplemented('only exponential_hawkes')
train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
# train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
# train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
# val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
# val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
model = GTPP(config)
# model = GTPP(config)
best_loss = 1e3
patients = 0
tol = 30
# best_loss = 1e3
# patients = 0
# tol = 30
for epoch in range(config.epochs):
# for epoch in range(config.epochs):
model.train()
# model.train()
loss1 = loss2 = loss3 = 0
# loss1 = loss2 = loss3 = 0
for batch in train_loader:
loss, log_lmbda, int_lmbda, lmbda = model.train_batch(batch)
# for batch in train_loader:
# loss, log_lmbda, int_lmbda, lmbda = model.train_batch(batch)
loss1 += loss
loss2 += log_lmbda
loss3 += int_lmbda
# loss1 += loss
# loss2 += log_lmbda
# loss3 += int_lmbda
model.eval()
# model.eval()
for batch in val_loader:
val_loss, val_log_lmbda, val_int_lmbda, _ = model(batch)
# for batch in val_loader:
# val_loss, val_log_lmbda, val_int_lmbda, _ = model(batch)
if best_loss > val_loss:
best_loss = val_loss.item()
else:
patients += 1
if patients >= tol:
print("Early Stop")
print("epoch", epoch)
plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
break
# if best_loss > val_loss:
# best_loss = val_loss.item()
# else:
# patients += 1
# if patients >= tol:
# print("Early Stop")
# print("epoch", epoch)
# plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
# break
if epoch % config.prt_evry == 0:
print("Epochs:{}".format(epoch))
print("Training Negative Log Likelihood:{} Log Lambda:{}: Integral Lambda:{}".format(loss1/train_timeseq.size(0), -loss2 / train_timeseq.size(0), loss3 / train_timeseq.size(0)))
print("Validation Negative Log Likelihood:{} Log Lambda:{}: Integral Lambda:{}".format(val_loss / val_timeseq.size(0),
-val_log_lmbda / val_timeseq.size(0),
val_int_lmbda/val_timeseq.size(0)))
plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
# plt_lmbda(test_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
# if epoch % config.prt_evry == 0:
# print("Epochs:{}".format(epoch))
# print("Training Negative Log Likelihood:{} Log Lambda:{}: Integral Lambda:{}".format(loss1/train_timeseq.size(0), -loss2 / train_timeseq.size(0), loss3 / train_timeseq.size(0)))
# print("Validation Negative Log Likelihood:{} Log Lambda:{}: Integral Lambda:{}".format(val_loss / val_timeseq.size(0),
# -val_log_lmbda / val_timeseq.size(0),
# val_int_lmbda/val_timeseq.size(0)))
# plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
# # plt_lmbda(test_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
print("end")
# print("end")
+2 -2
View File
@@ -47,7 +47,7 @@ def generate_sequence(timeseries, seq_len, log_mode=False):
def plt_lmbda(timeseries, model, seq_len, log_mode=False, dt=0.01, lmbda0=0.2, alpha=0.8, beta=1.0):
def plt_lmbda(timeseries, model, seq_len, log_mode=False, dt=0.01, lmbda0=0., alpha=0.01, beta=1.0):
lmbda_dict = dict()
pred_dict = dict()
@@ -71,6 +71,6 @@ def plt_lmbda(timeseries, model, seq_len, log_mode=False, dt=0.01, lmbda0=0.2, a
plt.plot(t_span, lmbda_dict[0], color='green', label='true prob')
plt.plot([t for t, e in timeseries][seq_len-1:], np.array(pred_dict[0].detach()), color='olive', label='pred prob')
plt.scatter([t for t, e in timeseries], [-1 for _ in timeseries], color='blue', label='events')
plt.scatter([t for t, e in timeseries], [-.01 for _ in timeseries], color='blue', label='events')
plt.legend()
plt.show()