misc

2026-06-27 17:31:58 +08:00 · 2020-10-27 06:43:50 +08:00
parent 6eda47b76f
commit 052fd6596c
9 changed files with 2920 additions and 21 deletions
@@ -1,6 +1,7 @@
 lightning_logs/
 dataset_folder/
 logs/
+outputs/

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -0,0 +1,409 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:light
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.6.0
+#   kernelspec:
+#     display_name: seq2seq-time
+#     language: python
+#     name: seq2seq-time
+# ---
+
+# # Sequence to Sequence Models for Timeseries Regression
+#
+#
+# In this notebook we are going to tackle a harder problem: 
+# - predicting the future on a timeseries
+# - using an LSTM
+# - with rough uncertainty (uncalibrated)
+# - outputing sequence of predictions
+#
+# <img src="../reports/figures/Seq2Seq for regression.png" />
+#
+#
+# https://medium.com/@boitemailjeanmid/smart-meters-in-london-part1-description-and-first-insights-jean-michel-d-db97af2de71b
+#
+
+# OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
+# %load_ext autoreload
+# %autoreload 2
+# %aimport -pandas
+# %aimport -torch
+# %aimport -numpy
+# %aimport -matplotlib
+# %aimport -dask
+# %aimport -tqdm
+# %matplotlib inline
+
+# +
+# Imports
+import torch
+from torch import nn, optim
+from torch.nn import functional as F
+from torch.autograd import Variable
+import torch
+import torch.utils.data
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+plt.rcParams['figure.figsize'] = (12.0, 3.0)
+plt.style.use('ggplot')
+
+from pathlib import Path
+from tqdm.auto import tqdm
+
+import pytorch_lightning as pl
+# -
+
+import warnings
+warnings.simplefilter('once')
+
+from seq2seq_time.data.dataset import Seq2SeqDataSet, Seq2SeqDataSets
+from seq2seq_time.predict import predict, predict_multi
+from seq2seq_time.util import dset_to_nc
+
+import logging, sys
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+import holoviews as hv
+from holoviews import opts
+from holoviews.operation.datashader import datashade, dynspread
+hv.extension('bokeh')
+
+
+# ## Parameters
+
+# +
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f'using {device}')
+
+columns_target=['energy(kWh/hh)']
+window_past = 48*2
+window_future = 48*2
+batch_size = 128
+num_workers = 5
+freq = '30T'
+max_rows = 5e5
+datasets_root = Path('../data/processed/')
+
+
+# -
+
+# ## Plot helpers
+
+# +
+def plot_prediction(ds_preds, i):
+    """Plot a prediction into the future, at a single point in time."""
+    d = ds_preds.isel(t_source=i)
+
+    # Get arrays
+    xf = d.t_target
+    yp = d.y_pred
+    s = d.y_pred_std
+    yt = d.y_true
+    now = d.t_source.squeeze()
+    
+    
+    plt.figure(figsize=(12, 4))
+    
+    plt.scatter(xf, yt, label='true', c='k', s=6)
+    ylim = plt.ylim()
+
+    # plot prediction
+    plt.fill_between(xf, yp-2*s, yp+2*s, alpha=0.25,
+            facecolor="b",
+            interpolate=True,
+            label="2 std",)
+    plt.plot(xf, yp, label='pred', c='b')
+
+    # plot true
+    plt.scatter(
+        d.t_past,
+        d.y_past,
+        c='k',
+        s=6
+    )
+    
+    # plot a red line for now
+    plt.vlines(x=now, ymin=0, ymax=1, label='now', color='r')
+    plt.ylim(*ylim)
+
+    now=pd.Timestamp(now.values)
+    plt.title(f'Prediction NLL={d.nll.mean().item():2.2g}')
+    plt.xlabel(f'{now.date()}')
+    plt.ylabel('energy(kWh/hh)')
+    plt.legend()
+    plt.xticks(rotation=45)
+    plt.show()
+    
+def plot_performance(ds_preds, full=False):
+    """Multiple plots using xr_preds"""
+    plot_prediction(ds_preds, 24)
+
+    ds_preds.mean('t_source').plot.scatter('t_ahead_hours', 'nll') # Mean over all predictions
+    n = len(ds_preds.t_source)
+    plt.ylabel('Negative Log Likelihood (lower is better)')
+    plt.xlabel('Hours ahead')
+    plt.title(f'NLL vs time ahead (no. samples={n})')
+    plt.show()
+
+    # Make a plot of the NLL over time. Does this solution get worse with time?
+    if full:
+        d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
+        plt.xticks(rotation=45)
+        plt.title('NLL over source time (lower is better)')
+        plt.show()
+
+    # A scatter plot is easy with xarray
+    if full:
+        plt.figure(figsize=(5, 5))
+        ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
+        plt.show()
+
+
+# -
+def plot_hist(trainer):
+    try:
+        df_hist = pd.read_csv(trainer.logger.experiment.metrics_file_path)
+        df_hist['epoch'] = df_hist['epoch'].ffill()
+        df_histe = df_hist.set_index('epoch').groupby('epoch').mean()
+        if len(df_histe)>1:
+            df_histe[['loss/train', 'loss/val']].plot(title='history')
+        return df_histe
+    except Exception:
+        pass
+
+
+# ## Lightning
+
+# +
+import pytorch_lightning as pl
+
+class PL_MODEL(pl.LightningModule):
+    def __init__(self, model, lr=3e-4, patience=None, weight_decay=0):
+        super().__init__()
+        self._model = model
+        self.lr = lr
+        self.patience = patience
+        self.weight_decay = weight_decay
+
+    def forward(self, x_past, y_past, x_future, y_future=None):
+        """Eval/Predict"""
+        y_dist, extra = self._model(x_past, y_past, x_future, y_future)
+        assert torch.isfinite(y_dist.loc).all(), 'output should be finite'
+        return y_dist, extra
+
+    def training_step(self, batch, batch_idx, phase='train'):
+        x_past, y_past, x_future, y_future = batch
+        y_dist, extra = self.forward(*batch)
+        loss = -y_dist.log_prob(y_future).mean()
+        assert torch.isfinite(loss).all(), 'loss should be finite'
+        self.log_dict({f'loss/{phase}':loss})
+        if ('loss' in extra) and (phase=='train'):
+            # some models have a special loss
+            loss = extra['loss']
+            self.log_dict({f'model_loss/{phase}':loss})
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        return self.training_step(batch, batch_idx, phase='val')
+    
+    def configure_optimizers(self):
+        optim = torch.optim.AdamW(self.parameters(), lr=self.lr,  weight_decay=self.weight_decay)
+        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            optim,
+            patience=self.patience,
+            verbose=True,
+            min_lr=1e-7,
+        ) if self.patience else None
+        return {'optimizer': optim, 'lr_scheduler': scheduler, 'monitor': 'loss/val'}
+
+
+# -
+
+# # Run
+from torch.utils.data import DataLoader
+from pytorch_lightning.loggers import CSVLogger
+from pytorch_lightning.callbacks.early_stopping import EarlyStopping
+
+
+# Models
+from seq2seq_time.models.lstm_seq2seq import LSTMSeq2Seq
+from seq2seq_time.models.lstm_seq import LSTMSeq
+from seq2seq_time.models.lstm import LSTM
+from seq2seq_time.models.baseline import BaselineLast
+from seq2seq_time.models.transformer import Transformer
+from seq2seq_time.models.transformer_autor import TransformerAutoR
+from seq2seq_time.models.transformer_seq2seq import TransformerSeq2Seq
+from seq2seq_time.models.transformer_seq import TransformerSeq
+from seq2seq_time.models.neural_process import RANP
+from seq2seq_time.models.transformer_process import TransformerProcess
+from seq2seq_time.models.tcn import TemporalConvNet
+# ## Plots
+# +
+import gc
+
+def free_mem():
+    gc.collect()
+    torch.cuda.empty_cache()
+    gc.collect()
+
+
+# -
+
+models = [
+    lambda: BaselineLast(),
+#     lambda: TransformerAutoR(input_size,
+#         output_size, hidden_out_size=32),
+    lambda: RANP(input_size,
+        output_size, hidden_dim=32, 
+         latent_dim=64, n_decoder_layers=4),
+    lambda: LSTM(input_size,
+         output_size,
+         hidden_size=80,
+         lstm_layers=3,
+         lstm_dropout=0.3),
+    lambda: LSTMSeq2Seq(input_size,
+                output_size,
+                hidden_size=64,
+                lstm_layers=2,
+                lstm_dropout=0.25),
+    lambda: TransformerSeq2Seq(input_size,
+                       output_size,
+                       hidden_size=128,
+                       nhead=8,
+                       nlayers=4,
+                       attention_dropout=0.2),
+    lambda: Transformer(input_size,
+                output_size,
+                attention_dropout=0.2,
+                nhead=8,
+                nlayers=8,
+                hidden_size=128),
+    lambda :TransformerProcess(input_size,
+                output_size, hidden_size=16,
+        latent_dim=8, dropout=0.5,
+        nlayers=4,)
+#     lambda :TemporalConvNet()
+]
+# models
+
+# +
+from seq2seq_time.data.data import IMOSCurrentsVel, AppliancesEnergyPrediction, BejingPM25, GasSensor, MetroInterstateTraffic
+
+datasets = [IMOSCurrentsVel, BejingPM25, GasSensor, AppliancesEnergyPrediction, MetroInterstateTraffic]
+datasets
+
+# +
+# GasSensor(datasets_root)
+# -
+
+# ## Train
+
+from collections import defaultdict
+results = defaultdict(dict)
+
+from seq2seq_time.metrics import rmse, smape
+
+# +
+for Dataset in datasets:
+    dataset_name = Dataset.__name__
+    dataset = Dataset(datasets_root)
+    ds_train, ds_test = dataset.to_datasets(window_past=window_past,
+                                            window_future=window_future)
+
+    # Init data
+    x_past, y_past, x_future, y_future = ds_train.get_rows(10)
+    input_size = x_past.shape[-1]
+    output_size = y_future.shape[-1]
+
+    # Loaders
+    dl_train = DataLoader(ds_train,
+                          batch_size=batch_size,
+                          shuffle=True,
+                          pin_memory=num_workers == 0,
+                          num_workers=num_workers)
+    dl_test = DataLoader(ds_test,
+                         batch_size=batch_size,
+                         num_workers=num_workers)
+
+    for m_fn in models:
+        try:
+            free_mem()
+            pt_model = m_fn()
+            model_name = type(pt_model).__name__
+            print(dataset_name, model_name)
+
+            # Wrap in lightning
+            patience = 2
+            model = PL_MODEL(pt_model,
+                             lr=3e-4, patience=patience,
+                             weight_decay=1e-5).to(device)
+
+            # Trainer            
+            trainer = pl.Trainer(
+                gpus=1,
+                min_epochs=2,
+                max_epochs=20,
+                amp_level='O1',
+                precision=16,
+                
+                limit_train_batches=1000,
+                limit_val_batches=100,
+                logger=CSVLogger("../outputs", name=f'{dataset_name}_{model_name}'),
+                callbacks=[
+                    EarlyStopping(monitor='loss/val', patience=patience * 2, verbose=True),
+                ],
+            )
+
+            # Train
+            trainer.fit(model, dl_train, dl_test)
+
+            ds_preds = predict(model.to(device),
+                               ds_test,
+                               batch_size * 2,
+                               device=device,
+                               scaler=dataset.output_scaler)
+
+            print(dataset_name, model_name)
+            print(f'mean_NLL {ds_preds.nll.mean().item():2.2f}')
+            loss = ds_preds.nll.mean().item()
+
+            # Performance
+#             print(plot_hist(trainer))
+#             plot_performance(ds_preds)
+
+            metrics = dict(
+                rmse=rmse(ds_preds.y_true, ds_preds.y_pred).item(), 
+                smape=smape(ds_preds.y_true, ds_preds.y_pred).item(), 
+                nll=ds_preds.nll.mean().item()
+                )
+            results[dataset_name][model_name] = metrics
+            df_results = pd.concat({k:pd.DataFrame(v) for k,v in results.items()})
+            display(df_results)
+            
+            dset_to_nc(ds_preds, Path(trainer.logger.experiment.log_dir)/'ds_preds.nc')
+            model.cpu()
+        except Exception as e:
+            logging.exception('failed to run model')
+            
+df_results = pd.concat({k:pd.DataFrame(v) for k,v in results.items()})
+display(df_results)
+
+# +
+# EarlyStopping?
+
+# +
+# ds_preds.to_netcdf(trainer.logger.experiment.log_dir+'/ds_preds2.nc')
+# -
+
+
+
+# # Plots
+
@@ -7,9 +7,11 @@ from sklearn_pandas import DataFrameMapper
 import xarray as xr
 import pandas as pd
 import numpy as np
+import zipfile

 from .dataset import Seq2SeqDataSet
 from .util import normalize_encode_dataframe, timeseries_split
+from ..util import dset_to_nc
 from .tidal import generate_tidal_periods


@@ -77,20 +79,18 @@ class GasSensor(RegressionForecastData):
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00487/gas-sensor-array-temperature-modulation.zip'
        
        # download if needed
-        extract_path = self.datasets_root/'GasSensor'
-        files = sorted(extract_path.glob('*.csv'))
-        if len(files)<13:
-            print('download_and_extract_archive')
-            download_and_extract_archive(url, self.datasets_root, extract_path)
+        # extract_path = self.datasets_root/'gas-sensor-array-temperature-modulation.zip'
+        download_url(url, self.datasets_root)
        
-        # Load csv's
-        files = sorted(extract_path.glob('*.csv'))
-        dfs = []
-        for f in files:
-            now = pd.to_datetime(f.stem, format='%Y%m%d_%H%M%S')
-            df = pd.read_csv(f)
-            df.index = pd.to_timedelta(df['Time (s)'], unit='s') + now
-            dfs.append(df)
+        # Load csv's from inside zip
+        zf = zipfile.ZipFile(self.datasets_root / 'gas-sensor-array-temperature-modulation.zip')
+        dfs=[]
+        for f in zf.namelist():
+            if f.endswith('.csv'):
+                now = pd.to_datetime(Pdset_to_ncath(f).stem, format='%Y%m%d_%H%M%S')
+                df = pd.read_csv(zf.open(f))
+                df.index = pd.to_timedelta(df['Time (s)'], unit='s') + now
+                dfs.append(df)
        self.df = pd.concat(dfs).dropna(subset=self.columns_target)

        df = df[[ 'CO (ppm)', 'Humidity (%r.h.)', 'Temperature (C)',
@@ -272,11 +272,7 @@ def get_current_timeseries(
        # Add tidal freqs
        xd = xd.merge(df_eta)

-        # Cache to nc
-        xd.to_netcdf(outfile)
-        print(
-            f'wrote "{outfile}" with size {outfile.stat().st_size*1e-6:2.2f} MB'
-        )
+        dset_to_nc(xd, outfile)
    return outfile


@@ -31,7 +31,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        assert df.index.freq is not None, 'should have freq'
        assert_no_objects(df)

-        self.df = df.dropna(subset=columns_target)
+        self.df = df.dropna(subset=columns_target).ffill()

        self.window_past = window_past
        self.window_future = window_future
@@ -100,7 +100,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
    
    def __repr__(self):
        t = self.df.index
-        return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]} at {t.freq.freqstr})>'
+        return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]})>'


 class Seq2SeqDataSets(torch.utils.data.Dataset):
@@ -16,4 +16,4 @@ def normalize_encode_dataframe(df, encoder=OrdinalEncoder):
 def timeseries_split(df, test_fraction=0.2):
    """Split timeseries data with test in the future"""
    i = int(len(df)*test_fraction)
-    return df.iloc[:i], df.iloc[i:]
+    return df.iloc[:-i], df.iloc[-i:]
@@ -0,0 +1,23 @@
+import numpy as np
+
+EPSILON = 1e-10
+
+def _error(actual: np.ndarray, predicted: np.ndarray):
+    """ Simple error """
+    return actual - predicted
+
+def mse(actual: np.ndarray, predicted: np.ndarray):
+    """ Mean Squared Error """
+    return np.mean(np.square(_error(actual, predicted)))
+
+def rmse(actual: np.ndarray, predicted: np.ndarray):
+    """ Root Mean Squared Error """
+    return np.sqrt(mse(actual, predicted))
+
+
+def smape(actual: np.ndarray, predicted: np.ndarray):
+    """
+    Symmetric Mean Absolute Percentage Error
+    Note: result is NOT multiplied by 100
+    """
+    return np.mean(2.0 * np.abs(actual - predicted) / ((np.abs(actual) + np.abs(predicted)) + EPSILON))
@@ -0,0 +1,146 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+
+
+class Chomp1d(nn.Module):
+    def __init__(self, chomp_size):
+        super(Chomp1d, self).__init__()
+        self.chomp_size = chomp_size
+
+    def forward(self, x):
+        return x[:, :, : -self.chomp_size].contiguous()
+
+
+class Conv(nn.Module):
+    """Causal convolution layer."""
+
+    def __init__(
+        self,
+        n_inputs,
+        n_outputs,
+        kernel_size,
+        stride,
+        dilation,
+        padding,
+        causal=True,
+    ):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            n_inputs,
+            n_outputs,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+        )
+        self.chomp = Chomp1d(padding)
+        self.causal = causal
+
+    def forward(self, x):
+        out = self.conv(x)
+        if self.causal:
+            out = self.chomp(out)
+        return out
+
+
+class TemporalBlock(nn.Module):
+    def __init__(
+        self,
+        n_inputs,
+        n_outputs,
+        kernel_size,
+        stride,
+        dilation,
+        padding,
+        dropout=0.2,
+    ):
+        super(TemporalBlock, self).__init__()
+        self.conv1 = Conv(
+            n_inputs,
+            n_outputs,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+        )
+        self.relu1 = nn.ReLU()
+        self.dropout1 = nn.Dropout(dropout)
+
+        self.conv2 = Conv(
+            n_outputs,
+            n_outputs,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+        )
+        self.relu2 = nn.ReLU()
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.net = nn.Sequential(
+            self.conv1, self.relu1, self.dropout1, self.conv2, self.relu2, self.dropout2
+        )
+        self.downsample = (
+            Conv(
+                n_inputs,
+                n_outputs,
+                1,
+                stride=1,
+                padding=0,
+                dilation=1,
+                causal=False,
+            )
+            if n_inputs != n_outputs
+            else None
+        )
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = x
+        for i, l in enumerate(self.net):
+            out = l(out)
+        res = x if self.downsample is None else self.downsample(x)
+        return self.relu(out + res)
+
+
+class TemporalConvNet(nn.Module):
+    """
+    See:
+    - https://arxiv.org/pdf/1803.01271.pdf
+    - https://github.com/locuslab/TCN
+    """
+    def __init__(
+        self,
+        num_inputs,
+        num_channels,
+        num_embeddings=0,
+        kernel_size=2,
+        dropout=0.2,
+        embedding_dim=2,
+    ):
+        super(TemporalConvNet, self).__init__()
+        layers = []
+        num_levels = len(num_channels)
+        for i in range(num_levels):
+            dilation_size = 2 ** i
+            in_channels = num_inputs if i == 0 else num_channels[i - 1]
+            out_channels = num_channels[i]
+            layers += [
+                TemporalBlock(
+                    in_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation_size,
+                    padding=(kernel_size - 1) * dilation_size,
+                    dropout=dropout,
+                )
+            ]
+        self.network = nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = x
+        for l in self.network:
+            out = l(out)
+        return out
@@ -1,6 +1,9 @@
 from pathlib import Path
 import torch
+import xarray as xr
+import logging

+logger = logging.getLogger(__file__)
 project_dir = Path(__file__).parent.parent

 def to_numpy(x):
@@ -12,3 +15,11 @@ def to_numpy(x):
 def mask_upper_triangular(N, device):
    """Causal attention."""
    return torch.triu(torch.ones(N, N), diagonal=1).to(device).bool()
+
+def dset_to_nc(dset, f, engine="netcdf4", compression={"zlib": True}):
+    if isinstance(dset, xr.DataArray):
+        dset = dset.to_dataset(name="data")
+    encoding = {k: {"zlib": True} for k in dset.data_vars}
+    logger.info(f"saving to {f}")
+    dset.to_netcdf(f, engine=engine, encoding=encoding)
+    logger.info(f"Wrote {f.stem}.nc size={f.stat().st_size/1e6} M")