diff --git a/seq2seq_time/models/inceptiontime.py b/seq2seq_time/models/inceptiontime.py index 0d33f9c..9510700 100644 --- a/seq2seq_time/models/inceptiontime.py +++ b/seq2seq_time/models/inceptiontime.py @@ -8,27 +8,35 @@ import torch import torch.nn as nn +from torch.nn import functional as F + def noop(x): return x + def shortcut(c_in, c_out): - return nn.Sequential(*[nn.Conv1d(c_in, c_out, kernel_size=1), - nn.BatchNorm1d(c_out)]) - -class Inception(nn.Module): - def __init__(self, c_in, bottleneck=32, ks=40, nb_filters=32): + return nn.Sequential( + *[nn.Conv1d(c_in, c_out, kernel_size=1), nn.BatchNorm1d(c_out)] + ) + + +class InceptionLayer(nn.Module): + def __init__(self, c_in, bottleneck=32, kernel_size=40, nb_filters=32): super().__init__() - self.bottleneck = nn.Conv1d(c_in, bottleneck, 1) if bottleneck and c_in > 1 else noop + self.bottleneck = ( + nn.Conv1d(c_in, bottleneck, 1) if bottleneck and c_in > 1 else noop + ) mts_feat = bottleneck or c_in conv_layers = [] - kss = [ks // (2**i) for i in range(3)] + kss = [kernel_size // (2 ** i) for i in range(3)] # ensure odd kss until nn.Conv1d with padding='same' is available in pytorch 1.3 - kss = [ksi if ksi % 2 != 0 else ksi - 1 for ksi in kss] + kss = [ksi if ksi % 2 != 0 else ksi - 1 for ksi in kss] for i in range(len(kss)): conv_layers.append( - nn.Conv1d(mts_feat, nb_filters, kernel_size=kss[i], padding=kss[i] // 2)) + nn.Conv1d(mts_feat, nb_filters, kernel_size=kss[i], padding=kss[i] // 2) + ) self.conv_layers = nn.ModuleList(conv_layers) self.maxpool = nn.MaxPool1d(3, stride=1, padding=1) self.conv = nn.Conv1d(c_in, nb_filters, kernel_size=1) @@ -40,40 +48,52 @@ class Inception(nn.Module): x = self.bottleneck(input_tensor) for i in range(3): out_ = self.conv_layers[i](x) - if i == 0: out = out_ - else: out = torch.cat((out, out_), 1) + if i == 0: + out = out_ + else: + out = torch.cat((out, out_), 1) mp = self.conv(self.maxpool(input_tensor)) inc_out = torch.cat((out, mp), 1) return self.act(self.bn(inc_out)) class InceptionBlock(nn.Module): - def __init__(self,c_in,bottleneck=32,ks=40,nb_filters=32,residual=True,depth=6): + def __init__( + self, c_in, bottleneck=32, kernel_size=40, nb_filters=32, residual=True, num_layers=6 + ): super().__init__() self.residual = residual - self.depth = depth + self.num_layers = num_layers - #inception & residual layers + # inception & residual layers inc_mods = [] res_layers = [] res = 0 - for d in range(depth): + for d in range(num_layers): inc_mods.append( - Inception(c_in if d == 0 else nb_filters * 4, bottleneck=bottleneck if d > 0 else 0,ks=ks, - nb_filters=nb_filters)) + InceptionLayer( + c_in if d == 0 else nb_filters * 4, + bottleneck=bottleneck if d > 0 else 0, + kernel_size=kernel_size, + nb_filters=nb_filters, + ) + ) if self.residual and d % 3 == 2: - res_layers.append(shortcut(c_in if res == 0 else nb_filters * 4, nb_filters * 4)) + res_layers.append( + shortcut(c_in if res == 0 else nb_filters * 4, nb_filters * 4) + ) res += 1 - else: res_layer = res_layers.append(None) + else: + res_layer = res_layers.append(None) self.inc_mods = nn.ModuleList(inc_mods) self.res_layers = nn.ModuleList(res_layers) self.act = nn.ReLU() - + def forward(self, x): res = x - for d, l in enumerate(range(self.depth)): + for d, l in enumerate(range(self.num_layers)): x = self.inc_mods[d](x) if self.residual and d % 3 == 2: res = self.res_layers[d](res) @@ -81,18 +101,47 @@ class InceptionBlock(nn.Module): res = x x = self.act(x) return x - -class InceptionTime(nn.Module): - def __init__(self,c_in,c_out,bottleneck=32,ks=40,nb_filters=32,residual=True,depth=6): + + + +class InceptionTimeSeq(nn.Module): + def __init__( + self, + x_dim, + y_dim, + hidden_size=32, + layers=6, + kernel_size=40, + bottleneck=16, + residual=True + ): super().__init__() - self.block = InceptionBlock(c_in,bottleneck=bottleneck,ks=ks,nb_filters=nb_filters, - residual=residual,depth=depth) - self.gap = nn.AdaptiveAvgPool1d(1) - self.fc = nn.Linear(nb_filters * 4, c_out) + self.inc_block = InceptionBlock( + x_dim + y_dim, + bottleneck=bottleneck, + kernel_size=kernel_size, + nb_filters=hidden_size, + residual=residual, + num_layers=layers, + ) + self._min_std = 0.01 + self.mean = nn.Linear(hidden_size*4, y_dim) + self.std = nn.Linear(hidden_size*4, y_dim) - def forward(self, x): - x = self.block(x) - x = self.gap(x).squeeze(-1) - x = self.fc(x) - return x + def forward(self, past_x, past_y, future_x, future_y=None): + device = next(self.parameters()).device + B, S, _ = future_x.shape + future_y_fake = past_y[:, -1:, :].repeat(1, S, 1).to(device) + context = torch.cat([past_x, past_y], -1) + target = torch.cat([future_x, future_y_fake], -1) + x = torch.cat([context, target * 1], 1).detach() + out = self.inc_block(x.permute(0, 2, 1)).permute(0, 2, 1) + + # Seems to help a little, especially with extrapolating out of bounds + steps = past_y.shape[1] + mean = self.mean(out)[:, steps:, :] + log_sigma = self.std(out)[:, steps:, :] + + sigma = self._min_std + (1 - self._min_std) * F.softplus(log_sigma) + return torch.distributions.Normal(mean, sigma), {} diff --git a/seq2seq_time/models/lstm_seq.py b/seq2seq_time/models/lstm_seq.py deleted file mode 100644 index 85c1d72..0000000 --- a/seq2seq_time/models/lstm_seq.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -from torch import nn -from torch.nn import functional as F - -class LSTMSeq(nn.Module): - def __init__(self, input_size, output_size, hidden_size=32, lstm_layers=2, lstm_dropout=0, _min_std = 0.05, nan_value=0): - super().__init__() - self._min_std = _min_std - self.nan_value = nan_value - - self.lstm = nn.LSTM( - input_size=input_size + output_size, - hidden_size=hidden_size, - batch_first=True, - num_layers=lstm_layers, - dropout=lstm_dropout, - ) - self.mean = nn.Linear(hidden_size, output_size) - self.std = nn.Linear(hidden_size, output_size) - - def forward(self, past_x, past_y, future_x, future_y=None): - device = next(self.parameters()).device - x = torch.cat([past_x, past_y], -1).detach() - - steps = future_x.shape[1] - outputs, _ = self.lstm(x) - outputs = outputs[:, -steps:, :] - - # outputs: [B, T, num_direction * H] - mean = self.mean(outputs) - log_sigma = self.std(outputs) - sigma = self._min_std + (1 - self._min_std) * F.softplus(log_sigma) - y_dist = torch.distributions.Normal(mean, sigma) - return y_dist, {} diff --git a/seq2seq_time/models/tcn.py b/seq2seq_time/models/tcn.py index 4ab5a45..99b1e33 100644 --- a/seq2seq_time/models/tcn.py +++ b/seq2seq_time/models/tcn.py @@ -145,7 +145,7 @@ class TemporalConvNet(nn.Module): return out -class TCNSeq2Seq(nn.Module): +class TCNSeq(nn.Module): """ See: - https://arxiv.org/pdf/1803.01271.pdf diff --git a/seq2seq_time/models/transformer.py b/seq2seq_time/models/transformer.py index b9f4474..ebc568a 100644 --- a/seq2seq_time/models/transformer.py +++ b/seq2seq_time/models/transformer.py @@ -40,12 +40,6 @@ class Transformer(nn.Module): target = torch.cat([future_x, future_y_fake], -1).detach() x = torch.cat([context, target * 1], 1).detach() - # Masks - x_mask = torch.isfinite(x) & (x != self.nan_value) - x[~x_mask] = 0 - x = x.detach() - x_key_padding_mask = ~x_mask.any(-1) - x = self.enc_emb(x).permute(1, 0, 2) S, B, _ = x.shape diff --git a/seq2seq_time/models/transformer_autor.py b/seq2seq_time/models/transformer_autor.py deleted file mode 100644 index 6ef9305..0000000 --- a/seq2seq_time/models/transformer_autor.py +++ /dev/null @@ -1,73 +0,0 @@ -from tqdm.auto import tqdm -from torch import nn -import torch -from torch.nn import functional as F - - -import fast_transformers -from fast_transformers.builders import TransformerEncoderBuilder - -class TransformerAutoR(nn.Module): - def __init__(self, x_dim, y_dim, hidden_out_size=256, nlayers=8, n_heads=8, use_lstm=False, attention_dropout=0, dropout=0, min_std=0.01): - super().__init__() - self._min_std = min_std - self.use_lstm = use_lstm - hidden_out_size = hidden_out_size//n_heads - - x_size = x_dim + y_dim - - # TODO embedd both X's the same - if use_lstm: - self.x_emb = LSTMBlock(x_size, x_size) - - self.enc_emb = nn.Linear(x_size, hidden_out_size*n_heads) - self.encoder = fast_transformers.builders.TransformerEncoderBuilder.from_kwargs( - attention_type="causal-linear", - n_layers=nlayers, - n_heads=n_heads, - feed_forward_dimensions=hidden_out_size*8*n_heads, - query_dimensions=hidden_out_size, - value_dimensions=hidden_out_size, - attention_dropout=attention_dropout, - dropout=dropout, - ).get() - self.mean = nn.Linear(hidden_out_size*n_heads, y_dim) - self.std = nn.Linear(hidden_out_size*n_heads, y_dim) - - def forward(self, past_x, past_y, future_x, future_y=None, mask_context=True, mask_target=True): - device = next(self.parameters()).device - B, S, _ = future_x.shape - future_y_fake = past_y[:, -1:, :].repeat(1, S, 1).to(device) - # future_y_fake = ( - # torch.ones(past_y.shape[0], future_x.shape[1], past_y.shape[2]).float().to(device) * 0 - # ) - context = torch.cat([past_x, past_y], -1) - target = torch.cat([future_x, future_y_fake], -1) - x = torch.cat([context, target * 1], 1).detach() - - # LSTM - if self.use_lstm: - x = self.x_emb(x) - # Size([B, T, Y]) -> Size([B, T, Y]) - - # Embed - x = self.enc_emb(x) - - # requires (B, C, hidden_dim) - steps = past_y.shape[1] - N = x.shape[1] - mask = fast_transformers.masking.TriangularCausalMask(N, device=device) - outputs = self.encoder(x, attn_mask=mask)[:, steps:, :] - - # Size([B, T, emb_dim]) - mean = self.mean(outputs) - log_sigma = self.std(outputs) - sigma = self._min_std + (1 - self._min_std) * F.softplus(log_sigma) - y_dist = torch.distributions.Normal(mean, sigma) - - return ( - y_dist, - {} - ) - - diff --git a/seq2seq_time/models/transformer_seq.py b/seq2seq_time/models/transformer_seq.py deleted file mode 100644 index cfb4962..0000000 --- a/seq2seq_time/models/transformer_seq.py +++ /dev/null @@ -1,55 +0,0 @@ -import torch -from torch import nn -from torch.nn import functional as F - -from ..util import mask_upper_triangular - -class TransformerSeq(nn.Module): - """ - A single transformer, masking nan or 0 - """ - def __init__(self, x_dim, y_dim, attention_dropout=0, nhead=8, nlayers=2, hidden_size=16, nan_value=0, min_std=0.01): - super().__init__() - self._min_std = min_std - self.nan_value = nan_value - enc_x_dim = x_dim + y_dim - - self.enc_emb = nn.Linear(enc_x_dim, hidden_size) - encoder_norm = nn.LayerNorm(hidden_size) - layer_enc = nn.TransformerEncoderLayer( - d_model=hidden_size, - dim_feedforward=hidden_size*4, - dropout=attention_dropout, - nhead=nhead, - # activation - ) - self.encoder = nn.TransformerEncoder( - layer_enc, num_layers=nlayers, norm=encoder_norm - ) - self.mean = nn.Linear(hidden_size, y_dim) - self.std = nn.Linear(hidden_size, y_dim) - - def forward(self, past_x, past_y, future_x, future_y=None): - device = next(self.parameters()).device - x = torch.cat([past_x, past_y], -1).detach() - - # Masks - x_mask = torch.isfinite(x) & (x != self.nan_value) - x[~x_mask] = 0 - x = x.detach() - x_key_padding_mask = ~x_mask.any(-1) - - x = self.enc_emb(x).permute(1, 0, 2) - - outputs = self.encoder(x, src_key_padding_mask=x_key_padding_mask).permute( - 1, 0, 2 - ) - - # Seems to help a little, especially with extrapolating out of bounds - steps = future_x.shape[1] - mean = self.mean(outputs)[:, -steps:, :] - log_sigma = self.std(outputs)[:, -steps:, :] - - sigma = self._min_std + (1 - self._min_std) * F.softplus(log_sigma) - return torch.distributions.Normal(mean, sigma), {} - diff --git a/seq2seq_time/models/xattention.py b/seq2seq_time/models/xattention.py index b6f302c..01fbee0 100644 --- a/seq2seq_time/models/xattention.py +++ b/seq2seq_time/models/xattention.py @@ -48,7 +48,7 @@ class CrossAttention(nn.Module): x = self.enc_emb(x).permute(1, 0, 2) - B, S, _ = x.shape + S, B, _ = x.shape mask = mask_upper_triangular(S, device) outputs = self.encoder(x, mask=mask#, src_key_padding_mask=x_key_padding_mask