use pytorch lightning remove potential leakage, misc

2026-06-27 17:02:55 +08:00 · 2022-02-11 20:05:08 +08:00
parent e8ab8fc1f4
commit 399896bd52
10 changed files with 3413 additions and 1025 deletions
@@ -1,3 +1,4 @@
+/default/

 # Created by https://www.gitignore.io/api/linux,python,windows,jupyternotebook

@@ -147,5 +148,4 @@ $RECYCLE.BIN/
 *.lnk

 # End of https://www.gitignore.io/api/linux,python,windows,jupyternotebook
-.demo_cache.sqlite
-demo_cache.sqlite
+
@@ -1,153 +0,0 @@
-# %reload_ext autoreload
-# %autoreload 2
-
-import matplotlib.pyplot as plt
-# %matplotlib inline
-plt.style.use('ggplot')
-plt.rcParams['figure.figsize'] = (12.0, 3)
-
-import numpy as np
-import tqdm
-import torch
-
-from argparse import ArgumentParser
-
-from torch.utils.data import DataLoader
-
-from utils import read_timeseries,generate_sequence, plt_lmbda
-from module import GTPP
-from run import get_parser
-
-
-
-
-
-# +
-parser = get_parser()
-config = parser.parse_args([])
-
-path = 'data/'
-
-if config.data == 'exponential_hawkes':
-
-    train_data = read_timeseries(path + config.data + '_training.csv')
-    val_data = read_timeseries(path + config.data + '_validation.csv')
-    test_data = read_timeseries(path + config.data + '_testing.csv')
-else:
-    raise NotImplemented('only exponential_hawkes')
-
-
-
-train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
-train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
-val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
-val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
-
-model = GTPP(config)
-
-best_loss = 1e3
-patients = 0
-tol = 333
-
-for epoch in range(config.epochs):
-
-    model.train()
-
-    loss1 = loss2 = loss3 = 0
-
-    for batch in train_loader:
-        loss, log_lmbda, int_lmbda, lmbda = model.train_batch(batch)
-
-        loss1 += loss
-        loss2 += log_lmbda
-        loss3 += int_lmbda
-
-
-    model.eval()
-
-    for batch in val_loader:
-        val_loss, val_log_lmbda, val_int_lmbda, _ = model(batch)
-
-    if best_loss > val_loss:
-        best_loss = val_loss.item()
-    else:
-        patients += 1
-        if patients >= tol:
-            print("Early Stop")
-            print("epoch", epoch)
-            plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
-            break
-
-    if epoch % config.prt_evry == 0:
-        print("Epochs:{}".format(epoch))
-        print("Training  : Negative Log Likelihood:{:2.6f}   Log Lambda:{:2.6f}:   Integral Lambda:{:2.6f}".format(loss1/train_timeseq.size(0), -loss2 / train_timeseq.size(0), loss3 / train_timeseq.size(0)))
-        print("Validation: Negative Log Likelihood:{:2.6f}   Log Lambda:{:2.6f}:   Integral Lambda:{:2.6f}".format(val_loss / val_timeseq.size(0),
-                                                                                        -val_log_lmbda / val_timeseq.size(0),
-                                                                                        val_int_lmbda/val_timeseq.size(0)))
-        plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
-        plt_lmbda(test_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
-
-
-print("end")
-# -
-
-
-# +
-# class CryptoTraderPL_NLL(pl.LightningModule):
-#     def __init__(self, config):
-#         super().__init__()
-#         self.config = config
-#         self._model = GTPP(config)
-
-#     def forward(self, x):
-#         return self._model(x)
-
-#     def training_step(self, batch, batch_idx, phase='train'):
-#         """
-#         Training step which runs for N steps, and get loss over all of them
-#         """
-#         x, l, r = batch
-#         y_pred = self._model(x)
-        
-#         # we have multiple targets. So move them to batch
-#         l2 = l.reshape(-1)
-#         y_pred2 = y_pred.reshape((*l2.shape, 3))
-#         loss = F.nll_loss(y_pred2, l2)
-
-#         # record weights
-#         self.log_dict({
-#             f'loss/{phase}': loss,
-#         }, prog_bar=True)
-
-#         assert torch.isfinite(loss)
-#         return loss
-
-#     def validation_step(self, batch, batch_idx):
-#         return self.training_step(batch, batch_idx, phase='val')
-    
-#     def predict_step(self, batch, batch_idx):
-#         x, y, r = batch
-#         y_pred = self.forward(x)
-#         return y_pred, y, r
-
-#     def configure_optimizers(self):
-#         optim = Ranger21(self.parameters(),
-#                          lr=self.train_kwargs['lr'],
-#                          num_epochs=num_epochs,
-#                          num_batches_per_epoch=num_batches_per_epoch,
-#                          weight_decay=self.train_kwargs['weight_decay'])
-#         return {'optimizer': optim, 'monitor': 'loss/val'}
-# -
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -0,0 +1,181 @@
+# %reload_ext autoreload
+# %autoreload 2
+
+import matplotlib.pyplot as plt
+# %matplotlib inline
+plt.style.use('ggplot')
+plt.rcParams['figure.figsize'] = (12.0, 3)
+
+import numpy as np
+import tqdm
+import torch
+from torch.utils.data import DataLoader
+from pathlib import Path
+import pandas as pd
+
+from utils import read_timeseries,generate_sequence, plt_lmbda
+from module import GTPP
+from run import get_parser
+
+parser = get_parser()
+# Notebook-style CLI overrides, one flag per line. str.split() already splits
+# on any whitespace (newlines included); stripping the newlines first would
+# glue consecutive flags into a single unparseable token.
+argv = """
+--epochs=100
+""".split()
+config = parser.parse_args(argv)
+config
+
+# # Data
+
+# +
+
+path = 'data/'
+
+# Only the 'exponential_hawkes' dataset is wired up; each split is read from
+# its own CSV file under `path`.
+if config.data == 'exponential_hawkes':
+
+    train_data = read_timeseries(path + config.data + '_training.csv')
+    val_data = read_timeseries(path + config.data + '_validation.csv')
+    test_data = read_timeseries(path + config.data + '_testing.csv')
+else:
+    # NotImplemented is the comparison sentinel, not an exception class;
+    # calling/raising it fails with a TypeError instead of the intended error.
+    raise NotImplementedError('only exponential_hawkes')
+
+
+# Turn the raw series into (time, event) sequence tensors and wrap them in loaders.
+train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
+train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
+
+val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
+val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
+
+# -
+
+# # Model
+
+import torch.optim
+
+
+# +
+import pytorch_lightning as pl
+
+class CryptoTraderNPP(pl.LightningModule):
+    """Lightning wrapper around a GTPP model.
+
+    The wrapped model returns ``(loss, log_lmbda, int_lmbda, lmbda)`` for a
+    batch; only the first element is optimised, and it is logged as
+    ``loss/<phase>``.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self._model = GTPP(config)
+
+    def forward(self, x):
+        """Delegate to the wrapped GTPP model and return its full output."""
+        return self._model(x)
+
+    def training_step(self, batch, batch_idx, phase='train'):
+        """Run one batch, log ``loss/<phase>`` and return the loss.
+
+        Also used by ``validation_step`` with ``phase='val'``.
+        """
+        torch.set_grad_enabled(True) # we need grad even in val and test
+
+        loss, log_lmbda, int_lmbda, lmbda = self._model(batch)
+
+        if phase!='train':
+            # free the graph, free mem
+            loss = loss.detach()
+
+        # record the loss for this phase
+        self.log_dict({
+            f'loss/{phase}': loss,
+        }, prog_bar=True)
+
+        assert torch.isfinite(loss)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        return self.training_step(batch, batch_idx, phase='val')
+
+    def predict_step(self, batch, batch_idx):
+        y_pred = self.forward(batch)
+        # on predict we want to return multiple values, not just the loss
+        return (y_pred, *batch)
+
+    def on_phase_end(self) -> None:
+        # this seems to help with cuda memory
+        self._model.zero_grad()
+        torch.cuda.empty_cache()
+
+    def on_train_end(self):
+        self.on_phase_end()
+
+    def on_validation_end(self):
+        self.on_phase_end()
+
+    def on_predict_end(self):
+        self.on_phase_end()
+
+    def on_epoch_end(self):
+        """Every 5th epoch, plot the intensity for the first train/val series.
+
+        Plotting runs on the CPU in eval mode. The module is moved back to its
+        previous device and put in train mode afterwards, even if plotting
+        raises. Reads the notebook-level ``train_data`` / ``val_data``.
+        """
+        if self.trainer.current_epoch%5 != 0:
+            return
+
+        i=0
+        device = self.device
+        self.eval().cpu()
+        try:
+            plt.title(f'train {i} e={self.trainer.current_epoch}')
+            plt_lmbda(train_data[i], model=self, seq_len=self.config.seq_len, log_mode=self.config.log_mode)
+            plt.show()
+
+            plt.title(f'val {i} e={self.trainer.current_epoch}')
+            plt_lmbda(val_data[i], model=self, seq_len=self.config.seq_len, log_mode=self.config.log_mode)
+            plt.show()
+        finally:
+            # Restore this module itself rather than whatever global happens
+            # to be called `model`.
+            self.to(device).train()
+
+    def configure_optimizers(self):
+        # Use the config this module was built with, not the notebook global.
+        optim = torch.optim.Adam(self.parameters(), lr=self.config.lr)
+        return {'optimizer': optim, 'monitor': 'loss/val'}
+
+
+# -
+model = CryptoTraderNPP(config)
+model
+
+# # Train
+
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import CSVLogger
+
+trainer = pl.Trainer(
+    max_epochs=config.epochs,
+    # One GPU when CUDA is available, otherwise 0 (CPU) so the notebook still
+    # runs on GPU-less machines instead of failing at Trainer construction.
+    gpus=int(torch.cuda.is_available()),
+    logger=[
+        CSVLogger('../outputs/logs')
+    ],
+)
+
+trainer.fit(model, train_loader, val_loader)
+
+# # Hist
+
+# Load the metrics CSV written by the CSVLogger and reduce it to one row per
+# epoch (minimum logged value) for the train and validation loss columns.
+csv_logger = trainer.logger[0]
+hp = Path(csv_logger.experiment.metrics_file_path)
+df = pd.read_csv(hp).groupby('epoch').min()[['loss/train', 'loss/val']]
+# Same curves twice: log-scaled y axis first, then linear.
+df.plot(logy=True)
+plt.show()
+df.plot()
+
+# # Plot
+
+
+# +
+# Index of the series to visualise in every plot below.
+i=0
+
+plt.title(f'train {i}')
+plt_lmbda(train_data[i], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
+plt.show()
+
+plt.title(f'val {i}')
+plt_lmbda(val_data[i], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
+plt.show()
+# -
+
+# Same training series with the opposite log_mode. `not` flips the flag;
+# `~` is bitwise inversion and maps True/False to -2/-1, both of which are truthy.
+plt.title(f'train {i}')
+plt_lmbda(train_data[i], model=model, seq_len=config.seq_len, log_mode=not config.log_mode)
+plt.show()
+
+# Same training series with explicit alpha / lmbda0 arguments passed to plt_lmbda.
+plt.title(f'train {i}')
+plt_lmbda(train_data[i], model=model, alpha=0.01, lmbda0=0, seq_len=config.seq_len, log_mode=config.log_mode)
+plt.show()
+
+
+
+
+
+
@@ -3,7 +3,7 @@ from torch import nn
 from torch.autograd import grad
 from torch.optim import Adam
 from torch.nn import functional as F
-from optimization import BertAdam
+# from optimization import BertAdam

 from matplotlib import pyplot as plt

@@ -21,12 +21,20 @@ class IntensityNet(nn.Module):
        self.mean_first = config.mean_first
        self.log_t = config.log_t

+        self.init_weights_positive()
+
+    def init_weights_positive(self):
+        """Force every parameter of this module to be strictly positive.
+
+        Each parameter's data is replaced by its absolute value and then
+        clamped to a minimum of ``eps``, so no parameter starts at or below zero.
+        """
+        # Lower bound applied after taking absolute values.
+        eps = 1e-10
+        for p in self.parameters():
+            p.data = torch.abs(p.data)
+            p.data = torch.clamp(p.data, min=eps)
+

    def forward(self, hidden_state, target_time):
        eps = 1e-10

        for p in self.parameters():
-            p.data *= (p.data>=0)
+            p.data = torch.clamp(p.data, min=eps)

        target_time.requires_grad_(True)
        if self.log_t:
@@ -53,6 +61,7 @@ class IntensityNet(nn.Module):

        return [nll, log_lmbda_mean, int_lmbda_mean, lmbda]

+# Switch (0/1) for feeding time_seq to the LSTM in GTPP: it is added to the
+# LSTM input_size and, when truthy, forward() concatenates time_seq[:, :-1]
+# to the event embeddings; when falsy only the embeddings are used.
+LEAK=1
 class GTPP(nn.Module):

    def __init__(self, config):
@@ -66,22 +75,22 @@ class GTPP(nn.Module):

        self.embedding = nn.Embedding(num_embeddings=config.event_class, embedding_dim=config.emb_dim)
        self.emb_drop = nn.Dropout(p=config.dropout)
-        self.lstm = nn.LSTM(input_size=1+config.emb_dim,
+        self.lstm = nn.LSTM(input_size=LEAK+config.emb_dim,
                            hidden_size=config.hid_dim,
                            batch_first=True,
                            bidirectional=False)
        self.intensity_net = IntensityNet(config)
-        self.set_optimizer(total_step=1)
+        # self.set_optimizer(total_step=1)


-    def set_optimizer(self, total_step, use_bert=False):
-        if use_bert:
-            self.set_optimizer = BertAdam(params=self.parameters(),
-                                          lr=self.lr,
-                                          warmup=0.1,
-                                          t_total=total_step)
-        else:
-            self.set_optimizer = Adam(self.parameters(), lr=self.lr)
+    # def set_optimizer(self, total_step, use_bert=False):
+    #     if use_bert:
+    #         self.set_optimizer = BertAdam(params=self.parameters(),
+    #                                       lr=self.lr,
+    #                                       warmup=0.1,
+    #                                       t_total=total_step)
+    #     else:
+    #         self.set_optimizer = Adam(self.parameters(), lr=self.lr)


    def forward(self, batch):
@@ -89,31 +98,27 @@ class GTPP(nn.Module):
        event_seq = event_seq.long()
        emb = self.embedding(event_seq)
        emb = self.emb_drop(emb)
-        lstm_input = torch.cat([emb, time_seq.unsqueeze(-1)], dim=-1)
+        if LEAK:
+            lstm_input = torch.cat([emb[:, :-1], time_seq[:, :-1].unsqueeze(-1)], dim=-1)
+        else:
+            lstm_input = emb
        hidden_state, _ = self.lstm(lstm_input)

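+        # FIXME: the LSTM used to receive the full time_seq, including the target time (data leakage?).
+        # With LEAK set it now only sees time_seq[:, :-1]; confirm hidden_state no longer depends on the target.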
        nll, log_lmbda, int_lmbda, lmbda = self.intensity_net(hidden_state, time_seq[:, -1])

        return [nll, log_lmbda.detach(), int_lmbda.detach(), lmbda.detach()]


-    def train_batch(self, batch):
-
-        self.set_optimizer.zero_grad()
-        nll, log_lmbda, int_lmbda, lmbda = self.forward(batch)
-        loss = nll
-        loss.backward()
-        self.set_optimizer.step()
-
-        return nll.item(), log_lmbda.item(), int_lmbda.item(), lmbda
-
-
-
-
-
-
+    # def train_batch(self, batch):

+    #     self.set_optimizer.zero_grad()
+    #     nll, log_lmbda, int_lmbda, lmbda = self.forward(batch)
+    #     loss = nll
+    #     loss.backward()
+    #     self.set_optimizer.step()

+    #     return nll.item(), log_lmbda.item(), int_lmbda.item(), lmbda



@@ -1,304 +1,304 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch optimization for BERT model."""
+# # coding=utf-8
+# # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     http://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
+# """PyTorch optimization for BERT model."""

-import math
-import torch
+# import math
+# import torch

-from torch.optim import Optimizer
-from torch.optim.optimizer import required
-from torch.nn.utils import clip_grad_norm_
-import logging
-import abc
-import sys
+# from torch.optim import Optimizer
+# from torch.optim.optimizer import required
+# from torch.nn.utils import clip_grad_norm_
+# import logging
+# import abc
+# import sys


-logger = logging.getLogger(__name__)
+# logger = logging.getLogger(__name__)


-if sys.version_info >= (3, 4):
-    ABC = abc.ABC
-else:
-    ABC = abc.ABCMeta('ABC', (), {})
+# if sys.version_info >= (3, 4):
+#     ABC = abc.ABC
+# else:
+#     ABC = abc.ABCMeta('ABC', (), {})


-class _LRSchedule(ABC):
-    """ Parent of all LRSchedules here. """
-    warn_t_total = False        # is set to True for schedules where progressing beyond t_total steps doesn't make sense
-    def __init__(self, warmup=0.002, t_total=-1, **kw):
-        """
-        :param warmup:  what fraction of t_total steps will be used for linear warmup
-        :param t_total: how many training steps (updates) are planned
-        :param kw:
-        """
-        super(_LRSchedule, self).__init__(**kw)
-        if t_total < 0:
-            logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
-        warmup = max(warmup, 0.)
-        self.warmup, self.t_total = float(warmup), float(t_total)
-        self.warned_for_t_total_at_progress = -1
+# class _LRSchedule(ABC):
+#     """ Parent of all LRSchedules here. """
+#     warn_t_total = False        # is set to True for schedules where progressing beyond t_total steps doesn't make sense
+#     def __init__(self, warmup=0.002, t_total=-1, **kw):
+#         """
+#         :param warmup:  what fraction of t_total steps will be used for linear warmup
+#         :param t_total: how many training steps (updates) are planned
+#         :param kw:
+#         """
+#         super(_LRSchedule, self).__init__(**kw)
+#         if t_total < 0:
+#             logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
+#         if not 0.0 <= warmup < 1.0 and not warmup == -1:
+#             raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
+#         warmup = max(warmup, 0.)
+#         self.warmup, self.t_total = float(warmup), float(t_total)
+#         self.warned_for_t_total_at_progress = -1

-    def get_lr(self, step, nowarn=False):
-        """
-        :param step:    which of t_total steps we're on
-        :param nowarn:  set to True to suppress warning regarding training beyond specified 't_total' steps
-        :return:        learning rate multiplier for current update
-        """
-        if self.t_total < 0:
-            return 1.
-        progress = float(step) / self.t_total
-        ret = self.get_lr_(progress)
-        # warning for exceeding t_total (only active with warmup_linear
-        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
-            logger.warning(
-                "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
-                    .format(ret, self.__class__.__name__))
-            self.warned_for_t_total_at_progress = progress
-        # end warning
-        return ret
+#     def get_lr(self, step, nowarn=False):
+#         """
+#         :param step:    which of t_total steps we're on
+#         :param nowarn:  set to True to suppress warning regarding training beyond specified 't_total' steps
+#         :return:        learning rate multiplier for current update
+#         """
+#         if self.t_total < 0:
+#             return 1.
+#         progress = float(step) / self.t_total
+#         ret = self.get_lr_(progress)
+#         # warning for exceeding t_total (only active with warmup_linear
+#         if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
+#             logger.warning(
+#                 "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
+#                     .format(ret, self.__class__.__name__))
+#             self.warned_for_t_total_at_progress = progress
+#         # end warning
+#         return ret

-    @abc.abstractmethod
-    def get_lr_(self, progress):
-        """
-        :param progress:    value between 0 and 1 (unless going beyond t_total steps) specifying training progress
-        :return:            learning rate multiplier for current update
-        """
-        return 1.
+#     @abc.abstractmethod
+#     def get_lr_(self, progress):
+#         """
+#         :param progress:    value between 0 and 1 (unless going beyond t_total steps) specifying training progress
+#         :return:            learning rate multiplier for current update
+#         """
+#         return 1.


-class ConstantLR(_LRSchedule):
-    def get_lr_(self, progress):
-        return 1.
+# class ConstantLR(_LRSchedule):
+#     def get_lr_(self, progress):
+#         return 1.


-class WarmupCosineSchedule(_LRSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
-    If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    """
-    warn_t_total = True
-    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
-        """
-        :param warmup:      see LRSchedule
-        :param t_total:     see LRSchedule
-        :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
-        :param kw:
-        """
-        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
-        self.cycles = cycles
+# class WarmupCosineSchedule(_LRSchedule):
+#     """
+#     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+#     Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
+#     If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
+#     """
+#     warn_t_total = True
+#     def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
+#         """
+#         :param warmup:      see LRSchedule
+#         :param t_total:     see LRSchedule
+#         :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
+#         :param kw:
+#         """
+#         super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
+#         self.cycles = cycles

-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+#     def get_lr_(self, progress):
+#         if progress < self.warmup:
+#             return progress / self.warmup
+#         else:
+#             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
+#             return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))


-class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
-    learning rate (with hard restarts).
-    """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-        assert(cycles >= 1.)
+# class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
+#     """
+#     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+#     If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
+#     learning rate (with hard restarts).
+#     """
+#     def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
+#         super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
+#         assert(cycles >= 1.)

-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
-            return ret
+#     def get_lr_(self, progress):
+#         if progress < self.warmup:
+#             return progress / self.warmup
+#         else:
+#             progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
+#             ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
+#             return ret


-class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
-    """
-    All training progress is divided in `cycles` (default=1.) parts of equal length.
-    Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
-    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-    """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        assert(warmup * cycles < 1.)
-        warmup = warmup * cycles if warmup >= 0 else warmup
-        super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
+# class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
+#     """
+#     All training progress is divided in `cycles` (default=1.) parts of equal length.
+#     Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
+#     followed by a learning rate decreasing from 1. to 0. following a cosine curve.
+#     """
+#     def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
+#         assert(warmup * cycles < 1.)
+#         warmup = warmup * cycles if warmup >= 0 else warmup
+#         super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)

-    def get_lr_(self, progress):
-        progress = progress * self.cycles % 1.
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * progress))
-            return ret
+#     def get_lr_(self, progress):
+#         progress = progress * self.cycles % 1.
+#         if progress < self.warmup:
+#             return progress / self.warmup
+#         else:
+#             progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
+#             ret = 0.5 * (1. + math.cos(math.pi * progress))
+#             return ret


-class WarmupConstantSchedule(_LRSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Keeps learning rate equal to 1. after warmup.
-    """
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return 1.
+# class WarmupConstantSchedule(_LRSchedule):
+#     """
+#     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+#     Keeps learning rate equal to 1. after warmup.
+#     """
+#     def get_lr_(self, progress):
+#         if progress < self.warmup:
+#             return progress / self.warmup
+#         return 1.


-class WarmupLinearSchedule(_LRSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
-    """
-    warn_t_total = True
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return max((progress - 1.) / (self.warmup - 1.), 0.)
+# class WarmupLinearSchedule(_LRSchedule):
+#     """
+#     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+#     Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
+#     """
+#     warn_t_total = True
+#     def get_lr_(self, progress):
+#         if progress < self.warmup:
+#             return progress / self.warmup
+#         return max((progress - 1.) / (self.warmup - 1.), 0.)


-SCHEDULES = {
-    None:       ConstantLR,
-    "none":     ConstantLR,
-    "warmup_cosine": WarmupCosineSchedule,
-    "warmup_constant": WarmupConstantSchedule,
-    "warmup_linear": WarmupLinearSchedule
-}
+# SCHEDULES = {
+#     None:       ConstantLR,
+#     "none":     ConstantLR,
+#     "warmup_cosine": WarmupCosineSchedule,
+#     "warmup_constant": WarmupConstantSchedule,
+#     "warmup_linear": WarmupLinearSchedule
+# }


-class BertAdam(Optimizer):
-    """Implements BERT version of Adam algorithm with weight decay fix.
-    Params:
-        lr: learning rate
-        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
-        t_total: total number of training steps for the learning
-            rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
-        schedule: schedule to use for the warmup (see above).
-            Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
-            If `None` or `'none'`, learning rate is always kept constant.
-            Default : `'warmup_linear'`
-        b1: Adams b1. Default: 0.9
-        b2: Adams b2. Default: 0.999
-        e: Adams epsilon. Default: 1e-6
-        weight_decay: Weight decay. Default: 0.01
-        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
-    """
-    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
-        if lr is not required and lr < 0.0:
-            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
-        if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
-        if not e >= 0.0:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        # initialize schedule object
-        if not isinstance(schedule, _LRSchedule):
-            schedule_type = SCHEDULES[schedule]
-            schedule = schedule_type(warmup=warmup, t_total=t_total)
-        else:
-            if warmup != -1 or t_total != -1:
-                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
-                               "Please specify custom warmup and t_total in _LRSchedule object.")
-        defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
-                        max_grad_norm=max_grad_norm)
-        super(BertAdam, self).__init__(params, defaults)
+# class BertAdam(Optimizer):
+#     """Implements BERT version of Adam algorithm with weight decay fix.
+#     Params:
+#         lr: learning rate
+#         warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
+#         t_total: total number of training steps for the learning
+#             rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
+#         schedule: schedule to use for the warmup (see above).
+#             Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
+#             If `None` or `'none'`, learning rate is always kept constant.
+#             Default : `'warmup_linear'`
+#         b1: Adams b1. Default: 0.9
+#         b2: Adams b2. Default: 0.999
+#         e: Adams epsilon. Default: 1e-6
+#         weight_decay: Weight decay. Default: 0.01
+#         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
+#     """
+#     def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
+#                  b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
+#         if lr is not required and lr < 0.0:
+#             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+#         if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
+#             raise ValueError("Invalid schedule parameter: {}".format(schedule))
+#         if not 0.0 <= b1 < 1.0:
+#             raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
+#         if not 0.0 <= b2 < 1.0:
+#             raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
+#         if not e >= 0.0:
+#             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
+#         # initialize schedule object
+#         if not isinstance(schedule, _LRSchedule):
+#             schedule_type = SCHEDULES[schedule]
+#             schedule = schedule_type(warmup=warmup, t_total=t_total)
+#         else:
+#             if warmup != -1 or t_total != -1:
+#                 logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
+#                                "Please specify custom warmup and t_total in _LRSchedule object.")
+#         defaults = dict(lr=lr, schedule=schedule,
+#                         b1=b1, b2=b2, e=e, weight_decay=weight_decay,
+#                         max_grad_norm=max_grad_norm)
+#         super(BertAdam, self).__init__(params, defaults)

-    def get_lr(self):
-        lr = []
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                if len(state) == 0:
-                    return [0]
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                lr.append(lr_scheduled)
-        return lr
+#     def get_lr(self):
+#         lr = []
+#         for group in self.param_groups:
+#             for p in group['params']:
+#                 state = self.state[p]
+#                 if len(state) == 0:
+#                     return [0]
+#                 lr_scheduled = group['lr']
+#                 lr_scheduled *= group['schedule'].get_lr(state['step'])
+#                 lr.append(lr_scheduled)
+#         return lr

-    def step(self, closure=None):
-        """Performs a single optimization step.
+#     def step(self, closure=None):
+#         """Performs a single optimization step.

-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
+#         Arguments:
+#             closure (callable, optional): A closure that reevaluates the model
+#                 and returns the loss.
+#         """
+#         loss = None
+#         if closure is not None:
+#             loss = closure()

-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+#         for group in self.param_groups:
+#             for p in group['params']:
+#                 if p.grad is None:
+#                     continue
+#                 grad = p.grad.data
+#                 if grad.is_sparse:
+#                     raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

-                state = self.state[p]
+#                 state = self.state[p]

-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['next_m'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['next_v'] = torch.zeros_like(p.data)
+#                 # State initialization
+#                 if len(state) == 0:
+#                     state['step'] = 0
+#                     # Exponential moving average of gradient values
+#                     state['next_m'] = torch.zeros_like(p.data)
+#                     # Exponential moving average of squared gradient values
+#                     state['next_v'] = torch.zeros_like(p.data)

-                next_m, next_v = state['next_m'], state['next_v']
-                beta1, beta2 = group['b1'], group['b2']
+#                 next_m, next_v = state['next_m'], state['next_v']
+#                 beta1, beta2 = group['b1'], group['b2']

-                # Add grad clipping
-                if group['max_grad_norm'] > 0:
-                    clip_grad_norm_(p, group['max_grad_norm'])
+#                 # Add grad clipping
+#                 if group['max_grad_norm'] > 0:
+#                     clip_grad_norm_(p, group['max_grad_norm'])

-                # Decay the first and second moment running average coefficient
-                # In-place operations to update the averages at the same time
-                next_m.mul_(beta1).add_(1 - beta1, grad)
-                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                update = next_m / (next_v.sqrt() + group['e'])
+#                 # Decay the first and second moment running average coefficient
+#                 # In-place operations to update the averages at the same time
+#                 next_m.mul_(beta1).add_(1 - beta1, grad)
+#                 next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+#                 update = next_m / (next_v.sqrt() + group['e'])

-                # Just adding the square of the weights to the loss function is *not*
-                # the correct way of using L2 regularization/weight decay with Adam,
-                # since that will interact with the m and v parameters in strange ways.
-                #
-                # Instead we want to decay the weights in a manner that doesn't interact
-                # with the m/v parameters. This is equivalent to adding the square
-                # of the weights to the loss with plain (non-momentum) SGD.
-                if group['weight_decay'] > 0.0:
-                    update += group['weight_decay'] * p.data
+#                 # Just adding the square of the weights to the loss function is *not*
+#                 # the correct way of using L2 regularization/weight decay with Adam,
+#                 # since that will interact with the m and v parameters in strange ways.
+#                 #
+#                 # Instead we want to decay the weights in a manner that doesn't interact
+#                 # with the m/v parameters. This is equivalent to adding the square
+#                 # of the weights to the loss with plain (non-momentum) SGD.
+#                 if group['weight_decay'] > 0.0:
+#                     update += group['weight_decay'] * p.data

-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
+#                 lr_scheduled = group['lr']
+#                 lr_scheduled *= group['schedule'].get_lr(state['step'])

-                update_with_lr = lr_scheduled * update
-                p.data.add_(-update_with_lr)
+#                 update_with_lr = lr_scheduled * update
+#                 p.data.add_(-update_with_lr)

-                state['step'] += 1
+#                 state['step'] += 1

-                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-                # No bias correction
-                # bias_correction1 = 1 - beta1 ** state['step']
-                # bias_correction2 = 1 - beta2 ** state['step']
+#                 # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
+#                 # No bias correction
+#                 # bias_correction1 = 1 - beta1 ** state['step']
+#                 # bias_correction2 = 1 - beta2 ** state['step']

-        return loss
+#         return loss
@@ -1,6 +1,9 @@
 Modified by wassname from the below:

 Changes:
+- [ ] clamp weight with epsilon for stability
+  - from `p.data *= (p.data>=0)`
+  - to `p.data = torch.clamp(p.data, min=eps)`
 - [ ] try log t
 - [ ] try not mean as much in intensity layer
 - [ ] use pytorch lightning
@@ -38,76 +38,76 @@ def get_parser():
    parser.add_argument("--mean_first", action="store_true", help="in model take mean first")
    return parser

-if __name__ == "__main__":
+# if __name__ == "__main__":

-    parser = get_parser()
+#     parser = get_parser()


-    config = parser.parse_args()
+#     config = parser.parse_args()

-    path = 'data/'
+#     path = 'data/'

-    if config.data == 'exponential_hawkes':
+#     if config.data == 'exponential_hawkes':

-        train_data = read_timeseries(path + config.data + '_training.csv')
-        val_data = read_timeseries(path + config.data + '_validation.csv')
-        test_data = read_timeseries(path + config.data + '_testing.csv')
-    else:
-        raise NotImplemented('only exponential_hawkes')
+#         train_data = read_timeseries(path + config.data + '_training.csv')
+#         val_data = read_timeseries(path + config.data + '_validation.csv')
+#         test_data = read_timeseries(path + config.data + '_testing.csv')
+#     else:
+#         raise NotImplementedError('only exponential_hawkes')



-    train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
-    train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
-    val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
-    val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))
+#     train_timeseq, train_eventseq = generate_sequence(train_data, config.seq_len, log_mode=config.log_mode)
+#     train_loader = DataLoader(torch.utils.data.TensorDataset(train_timeseq, train_eventseq), shuffle=True, batch_size=config.batch_size)
+#     val_timeseq, val_eventseq = generate_sequence(val_data, config.seq_len, log_mode=config.log_mode)
+#     val_loader = DataLoader(torch.utils.data.TensorDataset(val_timeseq, val_eventseq), shuffle=False, batch_size=len(val_data))

-    model = GTPP(config)
+#     model = GTPP(config)

-    best_loss = 1e3
-    patients = 0
-    tol = 30
+#     best_loss = 1e3
+#     patients = 0
+#     tol = 30

-    for epoch in range(config.epochs):
+#     for epoch in range(config.epochs):

-        model.train()
+#         model.train()

-        loss1 = loss2 = loss3 = 0
+#         loss1 = loss2 = loss3 = 0

-        for batch in train_loader:
-            loss, log_lmbda, int_lmbda, lmbda = model.train_batch(batch)
+#         for batch in train_loader:
+#             loss, log_lmbda, int_lmbda, lmbda = model.train_batch(batch)

-            loss1 += loss
-            loss2 += log_lmbda
-            loss3 += int_lmbda
+#             loss1 += loss
+#             loss2 += log_lmbda
+#             loss3 += int_lmbda


-        model.eval()
+#         model.eval()

-        for batch in val_loader:
-            val_loss, val_log_lmbda, val_int_lmbda, _ = model(batch)
+#         for batch in val_loader:
+#             val_loss, val_log_lmbda, val_int_lmbda, _ = model(batch)

-        if best_loss > val_loss:
-            best_loss = val_loss.item()
-        else:
-            patients += 1
-            if patients >= tol:
-                print("Early Stop")
-                print("epoch", epoch)
-                plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
-                break
+#         if best_loss > val_loss:
+#             best_loss = val_loss.item()
+#         else:
+#             patients += 1
+#             if patients >= tol:
+#                 print("Early Stop")
+#                 print("epoch", epoch)
+#                 plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
+#                 break

-        if epoch % config.prt_evry == 0:
-            print("Epochs:{}".format(epoch))
-            print("Training Negative Log Likelihood:{}   Log Lambda:{}:   Integral Lambda:{}".format(loss1/train_timeseq.size(0), -loss2 / train_timeseq.size(0), loss3 / train_timeseq.size(0)))
-            print("Validation Negative Log Likelihood:{}   Log Lambda:{}:   Integral Lambda:{}".format(val_loss / val_timeseq.size(0),
-                                                                                            -val_log_lmbda / val_timeseq.size(0),
-                                                                                            val_int_lmbda/val_timeseq.size(0)))
-            plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
-            # plt_lmbda(test_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
+#         if epoch % config.prt_evry == 0:
+#             print("Epochs:{}".format(epoch))
+#             print("Training Negative Log Likelihood:{}   Log Lambda:{}:   Integral Lambda:{}".format(loss1/train_timeseq.size(0), -loss2 / train_timeseq.size(0), loss3 / train_timeseq.size(0)))
+#             print("Validation Negative Log Likelihood:{}   Log Lambda:{}:   Integral Lambda:{}".format(val_loss / val_timeseq.size(0),
+#                                                                                             -val_log_lmbda / val_timeseq.size(0),
+#                                                                                             val_int_lmbda/val_timeseq.size(0)))
+#             plt_lmbda(train_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)
+#             # plt_lmbda(test_data[0], model=model, seq_len=config.seq_len, log_mode=config.log_mode)


-    print("end")
+#     print("end")



@@ -47,7 +47,7 @@ def generate_sequence(timeseries, seq_len, log_mode=False):



-def plt_lmbda(timeseries, model, seq_len, log_mode=False, dt=0.01, lmbda0=0.2, alpha=0.8, beta=1.0):
+def plt_lmbda(timeseries, model, seq_len, log_mode=False, dt=0.01, lmbda0=0., alpha=0.01, beta=1.0):

    lmbda_dict = dict()
    pred_dict = dict()
@@ -71,6 +71,6 @@ def plt_lmbda(timeseries, model, seq_len, log_mode=False, dt=0.01, lmbda0=0.2, a

    plt.plot(t_span, lmbda_dict[0], color='green', label='true prob')
    plt.plot([t for t, e in timeseries][seq_len-1:], np.array(pred_dict[0].detach()), color='olive', label='pred prob')
-    plt.scatter([t for t, e in timeseries], [-1 for _ in timeseries], color='blue', label='events')
+    plt.scatter([t for t, e in timeseries], [-.01 for _ in timeseries], color='blue', label='events')
    plt.legend()
    plt.show()