mirror of
https://github.com/wassname/attentive-neural-processes.git
synced 2026-06-27 18:03:39 +08:00
test set, inputnorm, lstm before encoder
This commit is contained in:
+196
-1579
File diff suppressed because one or more lines are too long
+369
-287
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -155,7 +155,7 @@ def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), use_logy=Fals
|
||||
df['dayofweek'] = time.dt.dayofweek / 7.0
|
||||
|
||||
# Drop nan and 0's
|
||||
df = df[df['energy(kWh/hh)']!=0]
|
||||
df = df[df['energy(kWh/hh)'] != 0]
|
||||
df = df.dropna()
|
||||
|
||||
if use_logy:
|
||||
@@ -163,7 +163,9 @@ def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), use_logy=Fals
|
||||
df = df.sort_values('tstp')
|
||||
|
||||
# split data
|
||||
n_split = -int(len(df)*0.1)
|
||||
df_train = df[:n_split]
|
||||
df_test = df[n_split:]
|
||||
return df_train, df_test
|
||||
test_split= -int(len(df) * 0.1)
|
||||
val_split= int(len(df) * 0.15)
|
||||
df_test = df[:val_split]
|
||||
df_train = df[val_split:test_split]
|
||||
df_val = df[test_split:]
|
||||
return df_train, df_val, df_test
|
||||
|
||||
+45
-48
@@ -69,7 +69,7 @@ class LatentModelPL(pl.LightningModule):
|
||||
|
||||
# agg and print self.train_logs HACK https://github.com/PyTorchLightning/pytorch-lightning/issues/100
|
||||
train_logs = self.agg_logs(self.train_logs)
|
||||
train_logs_str = {k: f"{v.mean()}" for k, v in train_logs.items()}
|
||||
train_logs_str = {k: f"{v}" for k, v in train_logs.items()}
|
||||
self.train_logs = []
|
||||
print(f"step val {self.trainer.global_step}, {tensorboard_logs_str} {train_logs}")
|
||||
return logs
|
||||
@@ -95,10 +95,10 @@ class LatentModelPL(pl.LightningModule):
|
||||
if isinstance(outputs[0][j], dict):
|
||||
# Take mean of sub dicts
|
||||
keys = outputs[0][j].keys()
|
||||
aggs[j] = {k: torch.stack([x[j][k] for x in outputs if k in x[j]]).mean() for k in keys}
|
||||
aggs[j] = {k: torch.stack([x[j][k] for x in outputs if k in x[j]]).mean().item() for k in keys}
|
||||
else:
|
||||
# Take mean of numbers
|
||||
aggs[j] = torch.stack([x[j] for x in outputs if j in x]).mean()
|
||||
aggs[j] = torch.stack([x[j] for x in outputs if j in x]).mean().item()
|
||||
return aggs
|
||||
|
||||
# # Log hparams with metric, doesn't work
|
||||
@@ -117,15 +117,14 @@ class LatentModelPL(pl.LightningModule):
|
||||
return self.validation_end(*args, **kwargs)
|
||||
|
||||
def configure_optimizers(self):
|
||||
optim = torch.optim.Adam(self.parameters(), lr=self.hparams["learning_rate"], weight_decay=0)
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=1, verbose=True, min_lr=1e-7) # note early stopping has patience 3
|
||||
optim = torch.optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"], weight_decay=0)
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=self.hparams["patience"], verbose=True, min_lr=1e-7) # note early stopping has patience 3
|
||||
return [optim], [scheduler]
|
||||
|
||||
def _get_cache_dfs(self):
|
||||
if self._dfs is None:
|
||||
df_train, df_test = get_smartmeter_df()
|
||||
# self._dfs = dict(df_train=df_train[:600], df_test=df_test[:600])
|
||||
self._dfs = dict(df_train=df_train, df_test=df_test)
|
||||
df_train, df_val, df_test = get_smartmeter_df()
|
||||
self._dfs = dict(df_train=df_train, df_val=df_val, df_test=df_test)
|
||||
return self._dfs
|
||||
|
||||
def train_dataloader(self):
|
||||
@@ -144,7 +143,7 @@ class LatentModelPL(pl.LightningModule):
|
||||
)
|
||||
|
||||
def val_dataloader(self):
|
||||
df_test = self._get_cache_dfs()['df_test']
|
||||
df_test = self._get_cache_dfs()['df_val']
|
||||
data_test = SmartMeterDataSet(
|
||||
df_test, self.hparams["num_context"], self.hparams["num_extra_target"]
|
||||
)
|
||||
@@ -172,49 +171,47 @@ class LatentModelPL(pl.LightningModule):
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
"""
|
||||
Specify the hyperparams for this LightningModule
|
||||
"""
|
||||
# MODEL specific
|
||||
parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser], add_help=False)
|
||||
parser.opt_range("--learning_rate", default=1e-3, type=float, tunable=True, high=1e-2, low=1e-5, log_base=10)
|
||||
def add_suggest(trial):
|
||||
trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
|
||||
|
||||
trial.suggest_categorical("hidden_dim", [8*2**i for i in range(6)])
|
||||
trial.suggest_categorical("latent_dim", [8*2**i for i in range(6)])
|
||||
|
||||
parser.opt_list("--hidden_dim", default=128, type=int, tunable=True, options=[8*2**i for i in range(8)])
|
||||
parser.opt_list("--latent_dim", default=128, type=int, tunable=True, options=[8*2**i for i in range(8)])
|
||||
parser.add_argument("--num_heads", default=8, type=int)
|
||||
parser.add_argument("--attention_layers", default=1, type=int)
|
||||
parser.opt_list("--n_latent_encoder_layers", default=4, type=int, tunable=True, options=[1, 2, 4, 8, 16])
|
||||
parser.opt_list("--n_det_encoder_layers", default=4, type=int, tunable=True, options=[1, 2, 4, 8, 16])
|
||||
parser.opt_list("--n_decoder_layers", default=2, type=int, tunable=True, options=[1, 2, 4, 8, 16])
|
||||
trial.suggest_int("attention_layers", 1, 4)
|
||||
trial.suggest_categorical("n_latent_encoder_layers", [1, 2, 4, 8])
|
||||
trial.suggest_categorical("n_det_encoder_layers", [1, 2, 4, 8])
|
||||
trial.suggest_categorical("n_decoder_layers", [1, 2, 4, 8])
|
||||
trial.suggest_int("num_heads", 8, 8)
|
||||
|
||||
parser.opt_range("--dropout", default=0, type=float, tunable=True, low=0, high=0.75)
|
||||
parser.opt_range("--attention_dropout", default=0, type=float, tunable=True, low=0, high=0.75)
|
||||
parser.add_argument("--min_std", default=0.005, type=float)
|
||||
trial.suggest_uniform("dropout", 0, 0.9)
|
||||
trial.suggest_uniform("attention_dropout", 0, 0.9)
|
||||
|
||||
parser.opt_list(
|
||||
"--latent_enc_self_attn_type", default="multihead", type=str, tunable=True, options=['uniform', 'dot', 'multihead', 'ptmultihead']
|
||||
trial.suggest_categorical(
|
||||
"latent_enc_self_attn_type", ['uniform', 'multihead', 'ptmultihead']
|
||||
)
|
||||
parser.opt_list("--det_enc_self_attn_type", default="multihead", type=str, tunable=True, options=['uniform', 'dot', 'multihead', 'ptmultihead'])
|
||||
parser.opt_list("--det_enc_cross_attn_type", default="multihead", type=str, tunable=True, options=['uniform', 'dot', 'multihead', 'ptmultihead'])
|
||||
trial.suggest_categorical("det_enc_self_attn_type", ['uniform', 'multihead', 'ptmultihead'])
|
||||
trial.suggest_categorical("det_enc_cross_attn_type", ['uniform', 'multihead', 'ptmultihead'])
|
||||
|
||||
parser.opt_list("--use_lvar", default=False, type=bool, tunable=True, options=[False, True])
|
||||
parser.opt_list("--use_rnn", default=False, type=bool, tunable=True, options=[False, True])
|
||||
parser.opt_list("--use_deterministic_path", default=True, tunable=True, type=bool, options=[False, True])
|
||||
parser.opt_list("--use_self_attn", default=True, tunable=True, type=bool, options=[False, True])
|
||||
parser.opt_list("--batchnorm", default=True, tunable=True, type=bool, options=[False, True])
|
||||
|
||||
# training specific (for this model)
|
||||
parser.add_argument("--context_in_target", default=True, type=bool)
|
||||
parser.add_argument("--grad_clip", default=0, type=float)
|
||||
parser.add_argument("--num_context", type=int, default=24 * 2)
|
||||
parser.add_argument("--num_extra_target", type=int, default=24)
|
||||
parser.add_argument("--max_nb_epochs", default=20, type=int)
|
||||
parser.add_argument("--num_workers", default=4, type=int)
|
||||
trial.suggest_categorical("batchnorm", [False, True])
|
||||
trial.suggest_categorical("use_self_attn", [False, True])
|
||||
trial.suggest_categorical("use_lvar", [False, True])
|
||||
trial.suggest_categorical("use_deterministic_path", [False, True])
|
||||
trial.suggest_categorical("use_rnn", [True, False])
|
||||
|
||||
trial._user_attrs = {
|
||||
'batch_size': 16,
|
||||
'grad_clip': 40,
|
||||
'max_nb_epochs': 200,
|
||||
'num_workers': 4,
|
||||
'num_context': 24* 4,
|
||||
'vis_i': '670',
|
||||
'num_extra_target': 24*4,
|
||||
'x_dim': 18,
|
||||
'context_in_target': True,
|
||||
'y_dim': 1,
|
||||
'patience': 3,
|
||||
'min_std': 0.005,
|
||||
}
|
||||
return trial
|
||||
|
||||
parser.add_argument("--batch_size", default=16, type=int)
|
||||
parser.add_argument("--x_dim", default=16, type=int)
|
||||
parser.add_argument("--y_dim", default=1, type=int)
|
||||
parser.add_argument("--vis_i", default=670, type=int)
|
||||
return parser
|
||||
|
||||
|
||||
+37
-24
@@ -132,7 +132,7 @@ class LSTM_PL(pl.LightningModule):
|
||||
def validation_end(self, outputs):
|
||||
# TODO send an image to tensorboard, like in the lighting_anp.py file
|
||||
if int(self.hparams["vis_i"]) > 0:
|
||||
loader = self.val_dataloader()[0]
|
||||
loader = self.val_dataloader()
|
||||
vis_i = min(int(self.hparams["vis_i"]), len(loader.dataset))
|
||||
if isinstance(self.hparams["vis_i"], str):
|
||||
image = plot_from_loader(loader, self, vis_i=vis_i, window_len=self.hparams["window_length"])
|
||||
@@ -163,15 +163,14 @@ class LSTM_PL(pl.LightningModule):
|
||||
def configure_optimizers(self):
|
||||
optim = torch.optim.Adam(self.parameters(), lr=self.hparams["learning_rate"])
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
||||
optim, patience=2, verbose=True, min_lr=1e-5
|
||||
optim, patience=self.hparams["patience"], verbose=True, min_lr=1e-5
|
||||
) # note early stopping has patience 3
|
||||
return [optim], [scheduler]
|
||||
|
||||
def _get_cache_dfs(self):
|
||||
if self._dfs is None:
|
||||
df_train, df_test = get_smartmeter_df()
|
||||
# self._dfs = dict(df_train=df_train[:600], df_test=df_test[:600])
|
||||
self._dfs = dict(df_train=df_train, df_test=df_test)
|
||||
df_train, df_val, df_test = get_smartmeter_df()
|
||||
self._dfs = dict(df_train=df_train, df_val=df_val, df_test=df_test)
|
||||
return self._dfs
|
||||
|
||||
@pl.data_loader
|
||||
@@ -193,7 +192,7 @@ class LSTM_PL(pl.LightningModule):
|
||||
|
||||
@pl.data_loader
|
||||
def val_dataloader(self):
|
||||
df_test = self._get_cache_dfs()["df_test"]
|
||||
df_test = self._get_cache_dfs()["df_val"]
|
||||
dset_test = SequenceDfDataSet(
|
||||
df_test,
|
||||
self.hparams,
|
||||
@@ -216,27 +215,41 @@ class LSTM_PL(pl.LightningModule):
|
||||
return DataLoader(dset_test, batch_size=self.hparams.batch_size, shuffle=False)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
def add_suggest(trial: optuna.Trial):
|
||||
"""
|
||||
Specify the hyperparams for this LightningModule
|
||||
Add hyperparam ranges to an optuna trial and typical user attrs.
|
||||
|
||||
Usage:
|
||||
trial = optuna.trial.FixedTrial(
|
||||
params={
|
||||
'hidden_size': 128,
|
||||
}
|
||||
)
|
||||
trial = add_suggest(trial)
|
||||
trainer = pl.Trainer()
|
||||
model = LSTM_PL(dict(**trial.params, **trial.user_attrs), dataset_train,
|
||||
dataset_test, cache_base_path, norm)
|
||||
trainer.fit(model)
|
||||
"""
|
||||
# MODEL specific
|
||||
parser = HyperOptArgumentParser(parents=[parent_parser])
|
||||
parser.add_argument("--learning_rate", default=0.002, type=float)
|
||||
parser.add_argument("--batch_size", default=16, type=int)
|
||||
parser.add_argument("--lstm_dropout", default=0.5, type=float)
|
||||
parser.add_argument("--hidden_size", default=16, type=int)
|
||||
parser.add_argument("--input_size", default=8, type=int)
|
||||
parser.add_argument("--lstm_layers", default=8, type=int)
|
||||
parser.add_argument("--bidirectional", default=False, type=bool)
|
||||
trial.suggest_loguniform("learning_rate", 1e-6, 1e-2)
|
||||
trial.suggest_uniform("lstm_dropout", 0, 0.75)
|
||||
trial.suggest_categorical(
|
||||
"hidden_size", [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
|
||||
)
|
||||
trial.suggest_categorical("lstm_layers", [1, 2, 3, 4, 6, 8])
|
||||
trial.suggest_categorical("bidirectional", [False, True])
|
||||
|
||||
# training specific (for this model)
|
||||
parser.add_argument("--window_length", type=int, default=12)
|
||||
parser.add_argument("--target_length", type=int, default=2)
|
||||
parser.add_argument("--max_nb_epochs", default=10, type=int)
|
||||
parser.add_argument("--num_workers", default=4, type=int)
|
||||
|
||||
return parser
|
||||
trial._user_attrs = {
|
||||
"batch_size": 16,
|
||||
"grad_clip": 40,
|
||||
"max_nb_epochs": 200,
|
||||
"num_workers": 4,
|
||||
"vis_i": 670,
|
||||
"input_size": 6,
|
||||
"output_size": 1,
|
||||
"patience": 2,
|
||||
}
|
||||
return trial
|
||||
|
||||
|
||||
def plot_from_loader(loader, model, vis_i=670, n=1, window_len=0):
|
||||
|
||||
+47
-28
@@ -20,7 +20,7 @@ import torch
|
||||
import io
|
||||
import PIL
|
||||
from torchvision.transforms import ToTensor
|
||||
|
||||
from src.models.modules import BatchNormSequence
|
||||
from src.data.smart_meter import get_smartmeter_df
|
||||
|
||||
from src.utils import ObjectDict
|
||||
@@ -41,6 +41,9 @@ class Seq2SeqNet(nn.Module):
|
||||
self.hparams = hparams
|
||||
self._min_std = _min_std
|
||||
|
||||
|
||||
|
||||
self.norm_input = BatchNormSequence(self.hparams.input_size)
|
||||
self.encoder = nn.LSTM(
|
||||
input_size=self.hparams.input_size,
|
||||
hidden_size=self.hparams.hidden_size,
|
||||
@@ -49,6 +52,9 @@ class Seq2SeqNet(nn.Module):
|
||||
bidirectional=self.hparams.bidirectional,
|
||||
dropout=self.hparams.lstm_dropout,
|
||||
)
|
||||
self.multihead_attn = nn.MultiheadAttention(self.hparams.hidden_size, num_heads=8)
|
||||
|
||||
self.norm_target = BatchNormSequence(self.hparams.input_size_decoder)
|
||||
self.decoder = nn.LSTM(
|
||||
input_size=self.hparams.input_size_decoder,
|
||||
hidden_size=self.hparams.hidden_size,
|
||||
@@ -66,9 +72,23 @@ class Seq2SeqNet(nn.Module):
|
||||
|
||||
def forward(self, context_x, context_y, target_x, target_y=None):
|
||||
x = torch.cat([context_x, context_y], -1)
|
||||
|
||||
# Sometimes input normalisation can be important, an initial batch norm is a nice way to ensure this
|
||||
x = self.norm_input(x)
|
||||
target_x = self.norm_target(target_x)
|
||||
|
||||
_, (h_out, cell) = self.encoder(x)
|
||||
# hidden = [n layers * n directions, batch size, hid dim]
|
||||
# cell = [n layers * n directions, batch size, hid dim]
|
||||
|
||||
# context_x, d_encoded, target_x = k, v, q
|
||||
|
||||
# query, key, value = target_x, context_x, d_encoded
|
||||
attn_output, _ = self.multihead_attn(h_out.permute(1, 0, 2), h_out.permute(1, 0, 2), h_out.permute(1, 0, 2))
|
||||
h_out = attn_output.permute(1, 0, 2).contiguous()
|
||||
attn_output, _ = self.multihead_attn(cell.permute(1, 0, 2), cell.permute(1, 0, 2), cell.permute(1, 0, 2))
|
||||
cell = attn_output.permute(1, 0, 2).contiguous()
|
||||
|
||||
outputs, (_, _) = self.decoder(target_x, (h_out, cell))
|
||||
# output = [batch size, seq len, hid dim * n directions]
|
||||
|
||||
@@ -155,7 +175,7 @@ class LSTMSeq2Seq_PL(pl.LightningModule):
|
||||
|
||||
def show_image(self):
|
||||
# https://github.com/PytorchLightning/pytorch-lightning/blob/f8d9f8f/pytorch_lightning/core/lightning.py#L293
|
||||
loader = self.val_dataloader()[0]
|
||||
loader = self.val_dataloader()
|
||||
vis_i = min(int(self.hparams["vis_i"]), len(loader.dataset))
|
||||
# print('vis_i', vis_i)
|
||||
if isinstance(self.hparams["vis_i"], str):
|
||||
@@ -174,15 +194,14 @@ class LSTMSeq2Seq_PL(pl.LightningModule):
|
||||
def configure_optimizers(self):
|
||||
optim = torch.optim.Adam(self.parameters(), lr=self.hparams["learning_rate"])
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
||||
optim, patience=2, verbose=True, min_lr=1e-5
|
||||
optim, patience=self.hparams["patience"], verbose=True, min_lr=1e-5
|
||||
) # note early stopping has patience 3
|
||||
return [optim], [scheduler]
|
||||
|
||||
def _get_cache_dfs(self):
|
||||
if self._dfs is None:
|
||||
df_train, df_test = get_smartmeter_df()
|
||||
# self._dfs = dict(df_train=df_train[:600], df_test=df_test[:600])
|
||||
self._dfs = dict(df_train=df_train, df_test=df_test)
|
||||
df_train, df_val, df_test = get_smartmeter_df()
|
||||
self._dfs = dict(df_train=df_train, df_val=df_val, df_test=df_test)
|
||||
return self._dfs
|
||||
|
||||
@pl.data_loader
|
||||
@@ -203,7 +222,7 @@ class LSTMSeq2Seq_PL(pl.LightningModule):
|
||||
|
||||
@pl.data_loader
|
||||
def val_dataloader(self):
|
||||
df_test = self._get_cache_dfs()['df_test']
|
||||
df_test = self._get_cache_dfs()['df_val']
|
||||
data_test = SmartMeterDataSet(
|
||||
df_test, self.hparams["num_context"], self.hparams["num_extra_target"]
|
||||
)
|
||||
@@ -232,25 +251,25 @@ class LSTMSeq2Seq_PL(pl.LightningModule):
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
"""
|
||||
Specify the hyperparams for this LightningModule
|
||||
"""
|
||||
# MODEL specific
|
||||
parser = HyperOptArgumentParser(parents=[parent_parser])
|
||||
parser.add_argument("--learning_rate", default=0.002, type=float)
|
||||
parser.add_argument("--batch_size", default=16, type=int)
|
||||
parser.add_argument("--lstm_dropout", default=0.5, type=float)
|
||||
parser.add_argument("--hidden_size", default=16, type=int)
|
||||
parser.add_argument("--input_size", default=8, type=int)
|
||||
parser.add_argument("--input_size_decoder", default=8, type=int)
|
||||
parser.add_argument("--lstm_layers", default=8, type=int)
|
||||
parser.add_argument("--bidirectional", default=False, type=bool)
|
||||
def add_suggest(trial):
|
||||
trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
|
||||
trial.suggest_uniform("lstm_dropout", 0, 0.75)
|
||||
trial.suggest_categorical("hidden_size", [1, 2, 4, 8, 16, 32, 64, 128, 256, 512])
|
||||
trial.suggest_categorical("lstm_layers", [1, 2, 4, 8])
|
||||
trial.suggest_categorical("bidirectional", [False, True])
|
||||
|
||||
|
||||
# training specific (for this model)
|
||||
parser.add_argument("--num_context", type=int, default=12)
|
||||
parser.add_argument("--num_extra_target", type=int, default=2)
|
||||
parser.add_argument("--max_nb_epochs", default=10, type=int)
|
||||
parser.add_argument("--num_workers", default=4, type=int)
|
||||
|
||||
return parser
|
||||
trial._user_attrs = {
|
||||
'batch_size': 16,
|
||||
'grad_clip': 40,
|
||||
'max_nb_epochs': 200,
|
||||
'num_workers': 4,
|
||||
'num_extra_target': 24*4,
|
||||
'vis_i': '670',
|
||||
'num_context': 24*4,
|
||||
'input_size': 18,
|
||||
'input_size_decoder': 17,
|
||||
'context_in_target': True,
|
||||
'output_size': 1
|
||||
}
|
||||
return trial
|
||||
|
||||
+28
-27
@@ -165,7 +165,7 @@ class LSTM_PL(pl.LightningModule):
|
||||
def validation_end(self, outputs):
|
||||
# TODO send an image to tensorboard, like in the lighting_anp.py file
|
||||
if int(self.hparams["vis_i"]) > 0:
|
||||
loader = self.val_dataloader()[0]
|
||||
loader = self.val_dataloader()
|
||||
vis_i = min(int(self.hparams["vis_i"]), len(loader.dataset))
|
||||
if isinstance(self.hparams["vis_i"], str):
|
||||
image = plot_from_loader(loader, self, vis_i=vis_i)
|
||||
@@ -196,15 +196,14 @@ class LSTM_PL(pl.LightningModule):
|
||||
def configure_optimizers(self):
|
||||
optim = torch.optim.Adam(self.parameters(), lr=self.hparams["learning_rate"])
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
||||
optim, patience=2, verbose=True, min_lr=1e-5
|
||||
optim, patience=self.hparams["patience"], verbose=True, min_lr=1e-5
|
||||
) # note early stopping has patience 3
|
||||
return [optim], [scheduler]
|
||||
|
||||
def _get_cache_dfs(self):
|
||||
if self._dfs is None:
|
||||
df_train, df_test = get_smartmeter_df()
|
||||
# self._dfs = dict(df_train=df_train[:600], df_test=df_test[:600])
|
||||
self._dfs = dict(df_train=df_train, df_test=df_test)
|
||||
df_train, df_val, df_test = get_smartmeter_df()
|
||||
self._dfs = dict(df_train=df_train, df_val=df_val, df_test=df_test)
|
||||
return self._dfs
|
||||
|
||||
@pl.data_loader
|
||||
@@ -226,7 +225,7 @@ class LSTM_PL(pl.LightningModule):
|
||||
|
||||
@pl.data_loader
|
||||
def val_dataloader(self):
|
||||
df_test = self._get_cache_dfs()["df_test"]
|
||||
df_test = self._get_cache_dfs()["df_val"]
|
||||
dset_test = SequenceDfDataSet(
|
||||
df_test,
|
||||
self.hparams,
|
||||
@@ -249,27 +248,29 @@ class LSTM_PL(pl.LightningModule):
|
||||
return DataLoader(dset_test, batch_size=self.hparams.batch_size, shuffle=False)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
"""
|
||||
Specify the hyperparams for this LightningModule
|
||||
"""
|
||||
# MODEL specific
|
||||
parser = HyperOptArgumentParser(parents=[parent_parser])
|
||||
parser.add_argument("--learning_rate", default=0.002, type=float)
|
||||
parser.add_argument("--batch_size", default=16, type=int)
|
||||
parser.add_argument("--lstm_dropout", default=0.5, type=float)
|
||||
parser.add_argument("--hidden_size", default=16, type=int)
|
||||
parser.add_argument("--input_size", default=8, type=int)
|
||||
parser.add_argument("--lstm_layers", default=8, type=int)
|
||||
parser.add_argument("--bidirectional", default=False, type=bool)
|
||||
|
||||
# training specific (for this model)
|
||||
parser.add_argument("--window_length", type=int, default=12)
|
||||
parser.add_argument("--target_length", type=int, default=2)
|
||||
parser.add_argument("--max_nb_epochs", default=10, type=int)
|
||||
parser.add_argument("--num_workers", default=4, type=int)
|
||||
|
||||
return parser
|
||||
def add_suggest(trial):
|
||||
trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
|
||||
trial.suggest_uniform("lstm_dropout", 0, 0.75)
|
||||
trial.suggest_categorical("hidden_size", [1, 2, 4, 8, 16, 32, 64, 128, 256, 512])
|
||||
trial.suggest_categorical("lstm_layers", [1, 2, 4, 8])
|
||||
trial.suggest_categorical("bidirectional", [False, True])
|
||||
|
||||
# constants
|
||||
trial._user_attrs = {
|
||||
'batch_size': 16,
|
||||
'grad_clip': 40,
|
||||
'max_nb_epochs': 200,
|
||||
'num_workers': 4,
|
||||
'num_extra_target': 24*4,
|
||||
'vis_i': '670',
|
||||
'num_context': 24*4,
|
||||
'input_size': 18,
|
||||
'input_size_decoder': 17,
|
||||
'context_in_target': True,
|
||||
'output_size': 1,
|
||||
'patience': 3,
|
||||
}
|
||||
return trial
|
||||
|
||||
|
||||
def plot_from_loader(loader, model, vis_i=670, n=1, window_len=0):
|
||||
|
||||
+31
-7
@@ -5,7 +5,7 @@ from torch.utils.data import TensorDataset, DataLoader
|
||||
import math
|
||||
|
||||
from src.models.modules import LatentEncoder, DeterministicEncoder, Decoder
|
||||
|
||||
from src.models.modules import BatchNormSequence
|
||||
|
||||
def log_prob_sigma(value, loc, log_scale):
|
||||
"""A slightly more stable (not confirmed yet) log prob taking in log_var instead of scale.
|
||||
@@ -66,18 +66,32 @@ class LatentModel(nn.Module):
|
||||
self._use_rnn = use_rnn
|
||||
self.context_in_target = context_in_target
|
||||
|
||||
# Sometimes input normalisation can be important, an initial batch norm is a nice way to ensure this
|
||||
self.norm_x = BatchNormSequence(x_dim)
|
||||
self.norm_y = BatchNormSequence(y_dim)
|
||||
|
||||
if self._use_rnn:
|
||||
self._lstm = nn.LSTM(
|
||||
self._lstm_x = nn.LSTM(
|
||||
input_size=x_dim,
|
||||
hidden_size=hidden_dim,
|
||||
num_layers=attention_layers,
|
||||
dropout=dropout,
|
||||
batch_first=True
|
||||
)
|
||||
self._lstm_y = nn.LSTM(
|
||||
input_size=y_dim,
|
||||
hidden_size=hidden_dim,
|
||||
num_layers=attention_layers,
|
||||
dropout=dropout,
|
||||
batch_first=True
|
||||
)
|
||||
x_dim = hidden_dim
|
||||
y_dim2 = hidden_dim
|
||||
else:
|
||||
y_dim2 = y_dim
|
||||
|
||||
self._latent_encoder = LatentEncoder(
|
||||
x_dim + y_dim,
|
||||
x_dim + y_dim2,
|
||||
hidden_dim=hidden_dim,
|
||||
latent_dim=latent_dim,
|
||||
self_attention_type=latent_enc_self_attn_type,
|
||||
@@ -93,7 +107,7 @@ class LatentModel(nn.Module):
|
||||
)
|
||||
|
||||
self._deterministic_encoder = DeterministicEncoder(
|
||||
input_dim=x_dim + y_dim,
|
||||
input_dim=x_dim + y_dim2,
|
||||
x_dim=x_dim,
|
||||
hidden_dim=hidden_dim,
|
||||
self_attention_type=det_enc_self_attn_type,
|
||||
@@ -126,16 +140,26 @@ class LatentModel(nn.Module):
|
||||
|
||||
def forward(self, context_x, context_y, target_x, target_y=None):
|
||||
|
||||
# https://stackoverflow.com/a/46772183/221742
|
||||
target_x = self.norm_x(target_x)
|
||||
context_x = self.norm_x(context_x)
|
||||
context_y = self.norm_y(context_y)
|
||||
|
||||
if self._use_rnn:
|
||||
# see https://arxiv.org/abs/1910.09323 where x is substituted with h = RNN(x)
|
||||
# x needs to be provided as [B, T, H]
|
||||
target_x, _ = self._lstm(target_x)
|
||||
context_x, _ = self._lstm(context_x)
|
||||
target_x, _ = self._lstm_x(target_x)
|
||||
context_x, _ = self._lstm_x(context_x)
|
||||
context_y, _ = self._lstm_y(context_y)
|
||||
|
||||
|
||||
dist_prior, log_var_prior = self._latent_encoder(context_x, context_y)
|
||||
|
||||
if target_y is not None:
|
||||
dist_post, log_var_post = self._latent_encoder(target_x, target_y)
|
||||
target_y2 = self.norm_y(target_y)
|
||||
if self._use_rnn:
|
||||
target_y2, _ = self._lstm_y(target_y2)
|
||||
dist_post, log_var_post = self._latent_encoder(target_x, target_y2)
|
||||
z = dist_post.loc
|
||||
else:
|
||||
z = dist_prior.loc
|
||||
|
||||
@@ -24,6 +24,22 @@ class LSTMBlock(nn.Module):
|
||||
return self._lstm(x)[0]
|
||||
|
||||
|
||||
class BatchNormSequence(nn.Module):
|
||||
"""Applies batch norm on features of a batch first sequence."""
|
||||
def __init__(
|
||||
self, out_channels
|
||||
):
|
||||
super().__init__()
|
||||
self.norm = nn.BatchNorm1d(out_channels)
|
||||
|
||||
def forward(self, x):
|
||||
# x.shape is (Batch, Sequence, Channels)
|
||||
# Now we want to apply batchnorm to the channels. So we put it in shape
|
||||
# (Batch, Channels, Sequence) so we can use BatchNorm1d
|
||||
x = x.permute(0, 2, 1)
|
||||
x = self.norm(x)
|
||||
return x.permute(0, 2, 1)
|
||||
|
||||
class NPBlockRelu2d(nn.Module):
|
||||
"""Block for Neural Processes."""
|
||||
|
||||
|
||||
@@ -19,9 +19,11 @@ from matplotlib import pyplot as plt
|
||||
import torch
|
||||
import io
|
||||
import PIL
|
||||
import optuna
|
||||
from torchvision.transforms import ToTensor
|
||||
|
||||
from src.data.smart_meter import get_smartmeter_df
|
||||
from src.models.modules import BatchNormSequence
|
||||
|
||||
from src.utils import ObjectDict
|
||||
|
||||
@@ -41,7 +43,6 @@ class TransformerSeq2SeqNet(nn.Module):
|
||||
self.hparams = hparams
|
||||
self._min_std = _min_std
|
||||
|
||||
# TODO project to 8*nhead
|
||||
hidden_out_size = self.hparams.hidden_out_size
|
||||
self.enc_emb = nn.Linear(self.hparams.input_size, hidden_out_size)
|
||||
layer_enc = nn.TransformerEncoderLayer(
|
||||
@@ -92,7 +93,7 @@ class TransformerSeq2SeqNet(nn.Module):
|
||||
log_sigma = torch.clamp(log_sigma, math.log(self._min_std), -math.log(self._min_std))
|
||||
|
||||
sigma = torch.exp(log_sigma)
|
||||
y_dist=torch.distributions.Normal(mean, sigma)
|
||||
y_dist = torch.distributions.Normal(mean, sigma)
|
||||
|
||||
# Loss
|
||||
loss_mse = loss_p = None
|
||||
@@ -188,15 +189,14 @@ class TransformerSeq2Seq_PL(pl.LightningModule):
|
||||
def configure_optimizers(self):
|
||||
optim = torch.optim.Adam(self.parameters(), lr=self.hparams["learning_rate"])
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
||||
optim, patience=2, verbose=True, min_lr=1e-5
|
||||
) # note early stopping has patient 3
|
||||
optim, patience=self.hparams["patience"], verbose=True, min_lr=1e-7
|
||||
) # note early stopping has patience 3
|
||||
return [optim], [scheduler]
|
||||
|
||||
def _get_cache_dfs(self):
|
||||
if self._dfs is None:
|
||||
df_train, df_test = get_smartmeter_df()
|
||||
# self._dfs = dict(df_train=df_train[:600], df_test=df_test[:600])
|
||||
self._dfs = dict(df_train=df_train, df_test=df_test)
|
||||
df_train, df_val, df_test = get_smartmeter_df()
|
||||
self._dfs = dict(df_train=df_train, df_val=df_val, df_test=df_test)
|
||||
return self._dfs
|
||||
|
||||
@pl.data_loader
|
||||
@@ -217,7 +217,7 @@ class TransformerSeq2Seq_PL(pl.LightningModule):
|
||||
|
||||
@pl.data_loader
|
||||
def val_dataloader(self):
|
||||
df_test = self._get_cache_dfs()['df_test']
|
||||
df_test = self._get_cache_dfs()['df_val']
|
||||
data_test = SmartMeterDataSet(
|
||||
df_test, self.hparams["num_context"], self.hparams["num_extra_target"]
|
||||
)
|
||||
@@ -246,27 +246,41 @@ class TransformerSeq2Seq_PL(pl.LightningModule):
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parent_parser):
|
||||
def add_suggest(trial: optuna.Trial):
|
||||
"""
|
||||
Specify the hyperparams for this LightningModule
|
||||
Add hyperparam ranges to an optuna trial and typical user attrs.
|
||||
|
||||
Usage:
|
||||
trial = optuna.trial.FixedTrial(
|
||||
params={
|
||||
'hidden_size': 128,
|
||||
}
|
||||
)
|
||||
trial = add_suggest(trial)
|
||||
trainer = pl.Trainer()
|
||||
model = TransformerSeq2Seq_PL(dict(**trial.params, **trial.user_attrs), dataset_train,
|
||||
dataset_test, cache_base_path, norm)
|
||||
trainer.fit(model)
|
||||
"""
|
||||
# MODEL specific
|
||||
parser = HyperOptArgumentParser(parents=[parent_parser])
|
||||
parser.add_argument("--learning_rate", default=0.002, type=float)
|
||||
parser.add_argument("--batch_size", default=16, type=int)
|
||||
parser.add_argument("--attention_dropout", default=0.5, type=float)
|
||||
parser.add_argument("--hidden_size", default=16, type=int)
|
||||
parser.add_argument("--hidden_out_size", default=16, type=int)
|
||||
parser.add_argument("--input_size", default=8, type=int)
|
||||
parser.add_argument("--nhead", default=8, type=int)
|
||||
parser.add_argument("--input_size_decoder", default=8, type=int)
|
||||
parser.add_argument("--nlayers", default=8, type=int)
|
||||
# parser.add_argument("--bidirectional", default=False, type=bool)
|
||||
trial.suggest_loguniform("learning_rate", 1e-6, 1e-2)
|
||||
trial.suggest_uniform("attention_dropout", 0, 0.75)
|
||||
trial.suggest_categorical("hidden_size", [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048])
|
||||
trial.suggest_categorical("hidden_out_size", [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048])
|
||||
trial.suggest_categorical("nlayers", [1, 2, 4, 8])
|
||||
trial.suggest_categorical("nhead", [1, 2, 8, 16])
|
||||
|
||||
# training specific (for this model)
|
||||
parser.add_argument("--num_context", type=int, default=12)
|
||||
parser.add_argument("--num_extra_target", type=int, default=2)
|
||||
parser.add_argument("--max_nb_epochs", default=10, type=int)
|
||||
parser.add_argument("--num_workers", default=4, type=int)
|
||||
|
||||
return parser
|
||||
trial._user_attrs = {
|
||||
'batch_size': 16,
|
||||
'grad_clip': 40,
|
||||
'max_nb_epochs': 200,
|
||||
'num_workers': 4,
|
||||
'num_extra_target': 24*4,
|
||||
'vis_i': '670',
|
||||
'num_context': 24*4,
|
||||
'input_size': 18,
|
||||
'input_size_decoder': 17,
|
||||
'context_in_target': True,
|
||||
'output_size': 1,
|
||||
'patience': 3,
|
||||
}
|
||||
return trial
|
||||
|
||||
+114
@@ -0,0 +1,114 @@
|
||||
from pytorch_lightning.callbacks import EarlyStopping
|
||||
from optuna.integration.pytorch_lightning import _check_pytorch_lightning_availability
|
||||
from pathlib import Path
|
||||
import optuna
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from .dict_logger import DictLogger
|
||||
from .utils import PyTorchLightningPruningCallback
|
||||
from .plot import plot_from_loader
|
||||
|
||||
|
||||
def main(
    trial: optuna.Trial,
    PL_MODEL_CLS: pl.LightningModule,
    name: str,
    MODEL_DIR: Path = Path("./lightning_logs"),
    train=True,
    prune=True,
    PERCENT_TEST_EXAMPLES=0.5,
):
    """Build the model and trainer for one trial and optionally fit it.

    Args:
        trial: Source of hyperparameters. ``trial.params`` and
            ``trial.user_attrs`` are merged into the model's hparams, and
            ``trial.number`` names the run's version directory.
        PL_MODEL_CLS: Model class, instantiated as ``PL_MODEL_CLS(hparams)``.
        name: Experiment name, used as a sub-directory of ``MODEL_DIR``.
        MODEL_DIR: Root directory for logs and checkpoints.
        train: When true, ``trainer.fit(model)`` is called before returning.
        prune: When true, the Optuna pruning callback is installed as the
            early-stop callback; otherwise a plain ``EarlyStopping`` with
            twice ``hparams["patience"]`` is used.
        PERCENT_TEST_EXAMPLES: Passed to the trainer as ``val_percent_check``.

    Returns:
        The ``(model, trainer)`` pair.
    """
    # PyTorch Lightning will try to restore model parameters from previous
    # trials if checkpoint filenames match, so every trial writes into its
    # own version directory.
    version_dir = MODEL_DIR / name / f"version_{trial.number}"
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        version_dir / "chk", monitor="val_loss", mode="min"
    )

    # Use a logger that also keeps the logged metrics in memory, so the final
    # metrics can be read back from it once optimization has finished.
    logger = DictLogger(MODEL_DIR, name=name, version=trial.number)

    # Suggested parameters and fixed user attributes together form the hparams.
    hparams = dict(**trial.params, **trial.user_attrs)

    max_epochs = hparams["max_nb_epochs"]
    gpus = -1 if torch.cuda.is_available() else None

    if prune:
        early_stop_callback = PyTorchLightningPruningCallback(
            trial, monitor="val_loss"
        )
    else:
        early_stop_callback = EarlyStopping(
            patience=hparams["patience"] * 2, monitor="val_loss", verbose=True
        )

    trainer = pl.Trainer(
        logger=logger,
        val_percent_check=PERCENT_TEST_EXAMPLES,
        checkpoint_callback=checkpoint_callback,
        max_epochs=max_epochs,
        gpus=gpus,
        early_stop_callback=early_stop_callback,
    )

    model = PL_MODEL_CLS(hparams)
    if train:
        trainer.fit(model)
    return model, trainer
|
||||
|
||||
|
||||
def objective(trial, PL_MODEL_CLS, name=None):
    """Optuna objective: train one trial and return its final ``val_loss``.

    See https://github.com/optuna/optuna/blob/cf6f02d/examples/pytorch_lightning_simple.py

    Args:
        trial: The Optuna trial to evaluate.
        PL_MODEL_CLS: Model class; its ``add_suggest`` fills in the trial's
            parameters and it is handed to ``main`` for training.
        name: Experiment name forwarded to ``main``. Defaults to the model
            class name, so existing ``objective(trial, cls)`` calls keep
            working.

    Returns:
        The ``"val_loss"`` entry of the last metrics record in the model's
        logger.
    """
    trial = PL_MODEL_CLS.add_suggest(trial)

    print("trial", trial.number, "params", trial.params)

    if name is None:
        name = PL_MODEL_CLS.__name__

    # `main` requires the model class and an experiment name; calling it with
    # the trial alone raises a TypeError.
    model, trainer = main(trial, PL_MODEL_CLS, name=name)

    # also report to tensorboard & print
    print("logger.metrics", model.logger.metrics[-1:])
    # The logger is only reachable through the model here; there is no
    # module-level `logger` name.
    final_metrics = model.logger.metrics[-1]
    model.logger.experiment.add_hparams(trial.params, final_metrics)
    model.logger.save()

    return final_metrics["val_loss"]
|
||||
|
||||
|
||||
def add_number(trial: optuna.Trial, model_dir: Path):
    """Assign the next free negative version number to a manual trial.

    Manual experiments use negative numbers, counting downwards: the existing
    ``version_*`` entries in ``model_dir`` are scanned and the trial gets one
    less than the smallest number found. The scan is seeded with -1, so the
    first manual run is numbered -2.

    Args:
        trial: Trial whose ``number`` attribute is overwritten.
        model_dir: Directory holding the ``version_<n>`` entries.

    Returns:
        The same ``trial`` object, with ``number`` updated.
    """
    versions = [-1]
    for entry in model_dir.glob("version_*"):
        suffix = entry.stem.split("_")[-1]
        try:
            versions.append(int(suffix))
        except ValueError:
            # Not one of our numbered runs (e.g. "version_backup"); ignore it
            # instead of aborting the whole experiment.
            continue
    trial.number = min(versions) - 1
    print("trial.number", trial.number)
    return trial
|
||||
|
||||
|
||||
def run_trial(
    name: str,
    PL_MODEL_CLS: pl.LightningModule,
    params: dict = None,
    user_attrs: dict = None,
    MODEL_DIR: Path = Path("./lightning_logs"),
):
    """Run one manual experiment: train, reload the checkpoint, test and plot.

    Args:
        name: Experiment name; logs go under ``MODEL_DIR / name``.
        PL_MODEL_CLS: Model class; its ``add_suggest`` populates the trial.
        params: Fixed hyperparameter values for the ``FixedTrial``.
            ``None`` means an empty dict.
        user_attrs: Overrides merged into the trial's user attributes after
            ``add_suggest`` has run. ``None`` means no overrides.
        MODEL_DIR: Root directory for logs and checkpoints.

    Returns:
        ``(trial, trainer, model)``, where ``model`` is the instance reloaded
        from the checkpoint.
    """
    # Fresh dicts per call rather than shared mutable defaults.
    params = {} if params is None else params
    user_attrs = {} if user_attrs is None else user_attrs

    print(f"now run `tensorboard --logdir {MODEL_DIR}`")
    (MODEL_DIR / name).mkdir(parents=True, exist_ok=True)

    trial = optuna.trial.FixedTrial(params=params)
    trial = PL_MODEL_CLS.add_suggest(trial)
    trial = add_number(trial, MODEL_DIR / name)
    # Applied after add_suggest so that caller-supplied values take precedence.
    trial._user_attrs.update(user_attrs)

    model, trainer = main(
        trial, PL_MODEL_CLS, name=name, MODEL_DIR=MODEL_DIR, train=False, prune=False
    )
    trainer.fit(model)

    # Load checkpoint: the lexicographically last *.ckpt in the checkpoint dir.
    checkpoint = sorted(Path(trainer.checkpoint_callback.dirpath).glob("*.ckpt"))[-1]
    # Remember the device so the reloaded model ends up where the trained one was.
    device = next(model.parameters()).device
    print(f"Loading checkpoint {checkpoint}")
    model = model.load_from_checkpoint(checkpoint).to(device)

    trainer.test(model)

    # Plot the same example index from each split.
    vis_i = 670
    for split, get_loader in (
        ("val", model.val_dataloader),
        ("train", model.train_dataloader),
        ("test", model.test_dataloader),
    ):
        plot_from_loader(get_loader(), model, i=vis_i, title=f"{split} {vis_i}")
    return trial, trainer, model
|
||||
+21
-7
@@ -1,5 +1,18 @@
|
||||
from pytorch_lightning.callbacks import EarlyStopping
|
||||
from optuna.integration.pytorch_lightning import _check_pytorch_lightning_availability
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import torch
|
||||
import optuna
|
||||
|
||||
|
||||
def init_random_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN flags.

    See https://pytorch.org/docs/stable/notes/randomness.html

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import so this function does not depend on a module-level
    # `import random` being present.
    import random

    # Python's own generator is seeded too; otherwise code that draws from
    # `random` stays non-reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    # Ask cuDNN for deterministic kernels and turn off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
|
||||
|
||||
|
||||
class PyTorchLightningPruningCallback(EarlyStopping):
|
||||
"""Optuna PyTorch Lightning callback to prune unpromising trials.
|
||||
@@ -20,10 +33,10 @@ class PyTorchLightningPruningCallback(EarlyStopping):
|
||||
how this dictionary is formatted.
|
||||
"""
|
||||
|
||||
def __init__(self, trial, monitor):
|
||||
def __init__(self, trial, monitor, **kwargs):
|
||||
# type: (optuna.trial.Trial, str) -> None
|
||||
|
||||
super(PyTorchLightningPruningCallback, self).__init__(monitor)
|
||||
super().__init__(monitor, **kwargs)
|
||||
|
||||
_check_pytorch_lightning_availability()
|
||||
|
||||
@@ -41,25 +54,26 @@ class PyTorchLightningPruningCallback(EarlyStopping):
|
||||
message = "Trial was pruned at epoch {}.".format(epoch)
|
||||
raise optuna.exceptions.TrialPruned(message)
|
||||
|
||||
|
||||
class ObjectDict(dict):
    """
    Interface similar to an argparser

    A ``dict`` whose items can also be read and written as attributes
    (``obj.key``), and whose ``__dict__`` is a plain-dict copy of its items.
    """

    def __init__(self, *args, **kwargs):
        # Accept the same arguments as ``dict`` so an instance can be created
        # with initial contents; ``ObjectDict()`` still gives an empty mapping.
        super().__init__(*args, **kwargs)

    def __setattr__(self, attr, value):
        # Attribute assignment is stored as a dictionary item.
        self[attr] = value

    def __getattr__(self, attr):
        if attr.startswith("_"):
            # Private/dunder lookups must fail as missing attributes:
            # https://stackoverflow.com/questions/10364332/how-to-pickle-python-object-derived-from-dict
            raise AttributeError(attr)
        try:
            return self[attr]
        except KeyError:
            # Report a missing key as a missing attribute so that hasattr()
            # and getattr(obj, name, default) work as expected.
            raise AttributeError(attr) from None

    @property
    def __dict__(self):
        # A copy of the items as a plain dict.
        return dict(self)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user