misc

2026-06-27 18:06:49 +08:00 · 2020-10-26 15:33:32 +08:00
parent b40d311e0b
commit 6eda47b76f
7 changed files with 8 additions and 15866 deletions
@@ -1,725 +0,0 @@
 # -*- coding: utf-8 -*-
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:light
 #     text_representation:
 #       extension: .py
 #       format_name: light
 #       format_version: '1.5'
 #       jupytext_version: 1.6.0
 #   kernelspec:
 #     display_name: seq2seq-time
 #     language: python
 #     name: seq2seq-time
 # ---
 # # Sequence to Sequence Models for Timeseries Regression
 #
 #
 # In this notebook we are going to tackle a harder problem: 
 # - predicting the future on a timeseries
 # - using an LSTM
 # - with rough uncertainty (uncalibrated)
 # - outputing sequence of predictions
 #
 # <img src="../reports/figures/Seq2Seq for regression.png" />
 #
 #
 # https://medium.com/@boitemailjeanmid/smart-meters-in-london-part1-description-and-first-insights-jean-michel-d-db97af2de71b
 #
 # OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
 # %load_ext autoreload
 # %autoreload 2
 # %aimport -pandas
 # %aimport -torch
 # %aimport -numpy
 # %aimport -matplotlib
 # %aimport -dask
 # %aimport -tqdm
 # %matplotlib inline
 # +
 # Imports
 import torch
 from torch import nn, optim
 from torch.nn import functional as F
 from torch.autograd import Variable
 import torch
 import torch.utils.data
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 plt.rcParams['figure.figsize'] = (12.0, 3.0)
 plt.style.use('ggplot')
 from pathlib import Path
 from tqdm.auto import tqdm
 import pytorch_lightning as pl
 # -
 import warnings
 warnings.simplefilter('once')
 from seq2seq_time.data.dataset import Seq2SeqDataSet, Seq2SeqDataSets
 from seq2seq_time.predict import predict, predict_multi
 import logging, sys
 # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 # ## Parameters
 # +
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f'using {device}')
 columns_target=['energy(kWh/hh)']
 window_past = 48*2
 window_future = 48*2
 batch_size = 256
 num_workers = 5
 freq = '30T'
 max_rows = 5e5
 # -
 # ## Load data
 # +
 def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london'), max_files=8):
    """
    Data loading and cleanding is always messy, so understand this code is optional.
    """
    # Load csv files
    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:max_files]
    dfs = []
    for f in csv_files:
        df = (pd.read_csv(f, parse_dates=[1], na_values=['Null'])
              .groupby('tstp')
              .sum()
              .sort_index()
             )
        df['block'] = f.stem
        # Drop nan and 0's
        df = df[df['energy(kWh/hh)']!=0]
        df = df.dropna()
        # Add time features 
        time = df.index.to_series()
        df["month"] = time.dt.month
        df['day'] = time.dt.day
        df['week'] = time.dt.week
        df['hour'] = time.dt.hour
        df['minute'] = time.dt.minute
        df['dayofweek'] = time.dt.dayofweek
        # Load weather data
        df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
        use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
               'pressure', 'apparentTemperature', 'windSpeed', 
               'humidity']
        df_weather = df_weather[use_cols].set_index('time')
        # Resample to match energy data   
        # Use first, since we have bearing, and you can't take mean
        df_weather = df_weather.resample(freq).first().ffill()  
        # Join weather and energy data
        df = pd.merge(df, df_weather, how='inner', left_index=True, right_index=True, sort=True)
        # Holidays
        df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
        holidays = set(df_hols['Bank holidays'].dt.round('D'))  
        def is_holiday(dt):
            return dt in holidays
        days = df.index.floor('D')
        holiday_mapping = days.unique().to_series().apply(is_holiday).astype(int).to_dict()
        df['holiday'] = days.to_series().map(holiday_mapping).values
        # sort
        df.index.name = 'Date'
        df = df.loc['2012-09':] # Weird value before this
        dfs.append(df)
    return pd.concat(dfs)
 # -
 # Our dataset is the london smartmeter data. But at half hour intervals
 # +
 df = get_smartmeter_df(max_files=12)
 # # Just get the first one for now
 # dfs = list(dfs)
 # # df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean
 df = df.tail(int(max_rows)).copy() # Just use last X rows
 # df = pd.concat(dfs[:6], 0)
 # # df = dfs[0]
 print(df.block.value_counts())
 df
 # -
 # ### Plot/explore
 # +
 import holoviews as hv
 from holoviews import opts
 from holoviews.plotting.links import RangeToolLink
 import datashader as ds
 from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
 from holoviews.operation import decimate
 hv.extension('bokeh')
 # def house_curve(Name=None):
 #     if isinstance(Name, int):
 #         name = df.block.unique()[Name]
 #     d = df[df.block == Name]
 #     d_curve = hv.Curve(d, 'Date', 'energy(kWh/hh)', label=Name).opts(framewise=True)
 #     return d_curve
 # dmap = hv.DynamicMap(house_curve, kdims=['Name'])
 # dmap = dmap.redim.values(Name=list(df.block.unique()))
 # dynspread(datashade(dmap).opts(width=800,
 #                      height=300,
 #                      tools=['xwheel_zoom', 'pan'],
 #                      active_tools=['xwheel_zoom', 'pan'],
 #                      default_tools=['reset', 'save', 'hover']
 #                     ))
 # -
 # ### Profiling
 # +
 # from pandas_profiling import ProfileReport
 # profile = ProfileReport(df, title="Pandas Profiling Report", minimal=True)
 # profile
 # -
 # ### Norm
 df.describe()
 # +
 import sklearn
 from sklearn.preprocessing import StandardScaler, OrdinalEncoder
 from sklearn_pandas import DataFrameMapper
 columns_input_numeric = list(df.drop(columns=columns_target)._get_numeric_data().columns)
 columns_categorical = list(set(df.columns)-set(columns_input_numeric)-set(columns_target))
 output_scalers = [([n], StandardScaler()) for n in columns_target]
 transformers=output_scalers + \
 [([n], StandardScaler()) for n in columns_input_numeric] + \
 [([n], OrdinalEncoder()) for n in columns_categorical]
 scaler = DataFrameMapper(transformers, df_out=True)
 df_norm = scaler.fit_transform(df)
 df_norm
 # -
 output_scaler = next(filter(lambda r:r[0][0] in columns_target, scaler.features))[-1]
 output_scaler
 # ### Split
 # +
 # split data, with the test in the future
 d0 =df_norm.index.min()
 d1 = df_norm.index.max()
 split_time = d0+(d1-d0)*0.8
 split_time = split_time.round('1D')
 print(split_time)
 df_train = df_norm.groupby('block').apply(lambda d:d.loc[:split_time]).reset_index(level=0, drop=True)
 df_test = df_norm.groupby('block').apply(lambda d:d.loc[split_time:]).reset_index(level=0, drop=True)
 # df_test
 # +
 # # Show split
 # df_train['energy(kWh/hh)'].plot(label='train')
 # df_test['energy(kWh/hh)'].plot(label='test')
 # plt.ylabel('energy(kWh/hh)')
 # plt.legend()
 # -
 # # Show split
 scatter = dynspread(datashade(hv.Curve(df_train, kdims=['Date'], vdims=['energy(kWh/hh)', 'block']).groupby('block'), cmap='blue'))
 scatter *= dynspread(datashade(hv.Curve(df_test, kdims=['Date'], vdims=['energy(kWh/hh)', 'block']).groupby('block'), cmap='red'))
 scatter = scatter.opts(plot=dict(width=800))
 scatter
 # ### Dataset
 # +
 # ### Dataset
 # These are the columns that we wont know in the future
 # We need to blank them out in x_future
 columns_blank=['visibility',
       'windBearing', 'temperature', 'dewPoint', 'pressure',
       'apparentTemperature', 'windSpeed', 'humidity']
 df_trains = [d.resample(freq).first().ffill().dropna() for _,d in df_train.groupby('block')]
 df_tests = [d.resample(freq).first().ffill().dropna() for _,d in df_test.groupby('block')]
 ds_train = Seq2SeqDataSets(df_trains,
                          window_past=window_past,
                          window_future=window_future,
                          columns_blank=columns_blank)
 ds_test = Seq2SeqDataSets(df_tests,
                         window_past=window_past,
                         window_future=window_future,
                         columns_blank=columns_blank)
 print(ds_train)
 print(ds_test)
 # -
 # we can treat it like an array
 ds_train[0]
 len(ds_train)
 ds_train[-1]
 # +
 # We can get rows
 x_past, y_past, x_future, y_future = ds_train.get_rows(10)
 # Plot one instance, this is what the model sees
 y_past['energy(kWh/hh)'].plot(label='past')
 y_future['energy(kWh/hh)'].plot(ax=plt.gca(), label='future')
 plt.legend()
 plt.ylabel('energy(kWh/hh)')
 # Notice we've added on two new columns tsp (time since present) and is_past
 x_past.tail()
 # -
 # Notice we've hidden some future columns to prevent cheating
 x_future.tail()
 # ## Plot helpers
 # +
 def plot_prediction(ds_preds, i):
    """Plot a prediction into the future, at a single point in time."""
    d = ds_preds.isel(t_source=i)
    # Get arrays
    xf = d.t_target
    yp = d.y_pred
    s = d.y_pred_std
    yt = d.y_true
    now = d.t_source.squeeze()
    plt.figure(figsize=(12, 4))
    plt.scatter(xf, yt, label='true', c='k', s=6)
    ylim = plt.ylim()
    # plot prediction
    plt.fill_between(xf, yp-2*s, yp+2*s, alpha=0.25,
            facecolor="b",
            interpolate=True,
            label="2 std",)
    plt.plot(xf, yp, label='pred', c='b')
    # plot true
    plt.scatter(
        d.t_past,
        d.y_past,
        c='k',
        s=6
    )
    # plot a red line for now
    plt.vlines(x=now, ymin=0, ymax=1, label='now', color='r')
    plt.ylim(*ylim)
    now=pd.Timestamp(now.values)
    plt.title(f'Prediction NLL={d.nll.mean().item():2.2g}')
    plt.xlabel(f'{now.date()}')
    plt.ylabel('energy(kWh/hh)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()
 def plot_performance(ds_preds, full=False):
    """Multiple plots using xr_preds"""
    plot_prediction(ds_preds, 24)
    ds_preds.mean('t_source').plot.scatter('t_ahead_hours', 'nll') # Mean over all predictions
    n = len(ds_preds.t_source)
    plt.ylabel('Negative Log Likelihood (lower is better)')
    plt.xlabel('Hours ahead')
    plt.title(f'NLL vs time ahead (no. samples={n})')
    plt.show()
    # Make a plot of the NLL over time. Does this solution get worse with time?
    if full:
        d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
        plt.xticks(rotation=45)
        plt.title('NLL over source time (lower is better)')
        plt.show()
    # A scatter plot is easy with xarray
    if full:
        plt.figure(figsize=(5, 5))
        ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
        plt.show()
 # -
 def plot_hist(trainer):
    try:
        df_hist = pd.read_csv(trainer.logger.experiment.metrics_file_path)
        df_hist['epoch'] = df_hist['epoch'].ffill()
        df_histe = df_hist.set_index('epoch').groupby('epoch').mean()
        if len(df_histe)>1:
            df_histe[['loss/train', 'loss/val']].plot(title='history')
        return df_histe
    except Exception:
        pass
 # ## Lightning
 # +
 import pytorch_lightning as pl
 class PL_MODEL(pl.LightningModule):
    def __init__(self, model, lr=3e-4, patience=2):
        super().__init__()
        self._model = model
        self.lr = lr
        self.patience = patience
    def forward(self, x_past, y_past, x_future, y_future=None):
        """Eval/Predict"""
        y_dist, extra = self._model(x_past, y_past, x_future, y_future)
        return y_dist, extra
    def training_step(self, batch, batch_idx, phase='train'):
        x_past, y_past, x_future, y_future = batch
        y_dist, extra = self.forward(*batch)
        loss = -y_dist.log_prob(y_future).mean()
        self.log_dict({f'loss/{phase}':loss})
        if ('loss' in extra) and (phase=='train'):
            # some models have a special loss
            loss = extra['loss']
            self.log_dict({f'model_loss/{phase}':loss})
        return loss
    def validation_step(self, batch, batch_idx):
        return self.training_step(batch, batch_idx, phase='val')
    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optim,
            patience=self.patience,
            verbose=True,
            min_lr=1e-7,
        )
        return {'optimizer': optim, 'lr_scheduler': scheduler, 'monitor': 'loss/val'}
 # -
 # # Run
 from torch.utils.data import DataLoader
 from pytorch_lightning.loggers import CSVLogger
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 # +
 # Init data
 x_past, y_past, x_future, y_future = ds_train.get_rows(10)
 input_size = x_past.shape[-1]
 output_size = y_future.shape[-1]
 dl_train = DataLoader(ds_train,
                      batch_size=batch_size,
                      shuffle=True,
                      pin_memory=num_workers==0,
                      num_workers=num_workers)
 dl_test = DataLoader(ds_test, batch_size=batch_size, num_workers=num_workers)
 # -
 from seq2seq_time.models.lstm_seq2seq import LSTMSeq2Seq
 from seq2seq_time.models.lstm_seq import LSTMSeq
 from seq2seq_time.models.lstm import LSTM
 from seq2seq_time.models.baseline import BaselineLast
 from seq2seq_time.models.transformer import Transformer
 from seq2seq_time.models.transformer_seq2seq import TransformerSeq2Seq
 from seq2seq_time.models.transformer_seq import TransformerSeq
 from seq2seq_time.models.neural_process import RANP
 # ## Plots
 # +
 models = [
    RANP(input_size,
                output_size),
    LSTM(input_size,
         output_size,
         hidden_size=80,
         lstm_layers=3,
         lstm_dropout=0.3),
    LSTMSeq2Seq(input_size,
                output_size,
                hidden_size=64,
                lstm_layers=2,
                lstm_dropout=0.25),
    TransformerSeq2Seq(input_size,
                       output_size,
                       hidden_size=64,
                       nhead=8,
                       nlayers=4,
                       attention_dropout=0.3),
    Transformer(input_size,
                output_size,
                attention_dropout=0.3,
                nhead=8,
                nlayers=6,
                hidden_size=64),
    TransformerSeq(input_size,
                output_size),
    LSTMSeq(input_size,
                output_size),
 ]
 # -
 # Baseline model
 pt_model = BaselineLast()
 model = PL_MODEL(pt_model).to(device)
 trainer = pl.Trainer(gpus=1,
                     max_epochs=1, 
                     limit_train_batches=0.01,
                     logger=CSVLogger("logs",
                                      name=type(pt_model).__name__),
                    )
 trainer.fit(model, dl_train, dl_test)
 print(plot_hist(trainer))
 ds_predss = predict_multi(model.to(device),
                   ds_test.datasets,
                   batch_size*8,
                   device=device,
                   scaler=output_scaler)
 print(f'baseline nll: {ds_preds.nll.mean().item():2.2g}')
 for pt_model in models:
    name = type(pt_model).__name__
    print(name)
    # Wrap in lightning
    patience = 2
    model = PL_MODEL(pt_model, patience=patience, lr=3e-4).to(device)
    # Trainer    
    trainer = pl.Trainer(gpus=1,
                         min_epochs=2,
                         max_epochs=10,
                         amp_level='O1',
                         precision=16,
                         gradient_clip_val=1,
                         logger=CSVLogger("logs",
                                          name=type(pt_model).__name__),
                         callbacks=[
                             EarlyStopping(monitor='loss/val', patience=patience*2),
 #                              PrintTableMetricsCallback2()
                         ],
    )
    # Train
    trainer.fit(model, dl_train, dl_test)
    ds_predss = predict_multi(model.to(device),
                       ds_test.datasets,
                       batch_size*8,
                       device=device,
                       scaler=output_scaler)
    print(name)
    print(f'mean_NLL {ds_predss.nll.mean().item():2.2f}')
    # Performance
    ds_preds = ds_predss.isel(block=0)
    print(plot_hist(trainer))
    plot_performance(ds_preds)
 # +
 # ds_preds = predict(model.to(device),
 #                    ds_test.datasets[0],
 #                    batch_size*8,
 #                    device=device,
 #                    scaler=output_scaler)
 # -
 ds_predss = predict_multi(model.to(device),
                   ds_test.datasets,
                   batch_size*8,
                   device=device,
                   scaler=output_scaler)
 ds_pred_block = ds_predss.isel(block=1)
 # # holoviews pred
 import holoviews as hv
 from holoviews import opts
 # +
 def plot_prediction_now(t_source):
    """Plot predictions with holoviews"""
    # Let us pass in an int
    if isinstance(t_source, int):
        t_source = ds_pred_block.t_source[t_source].to_pandas()
    d = ds_pred_block.sel(t_source=t_source)
    # Sometimes there are duplicate times, take the first
    if len(d.t_source.shape) and d.t_source.shape[0] > 0:
        d = d.isel(t_source=0)
    if len(d.t_source.shape) and d.t_source.shape[0] == 0:
        return None
    now = d.t_source.to_pandas()
    # Plot true
    x = np.concatenate([d.t_past, d.t_target])
    yt = np.concatenate([d.y_past, d.y_true])
    p = hv.Scatter({
        'x': x,
        'y': yt
    }, label='true').opts(color='black')
    # Get arrays
    xf = d.t_target.values
    yp = d.y_pred
    s = d.y_pred_std
    p *= hv.Curve({
        'x': xf,
        'y': yp
    }, label='pred').opts(color='blue')
    p *= hv.Area((xf, yp - 2 * s, yp + 2 * s),
                 vdims=['y', 'y2'],
                 label='2*std').opts(alpha=0.5, line_width=0)
    # plot now line
    p *= hv.VLine(now, label='now').opts(color='red', framewise=True)
    return p.opts(title=f'Prediction at {now}. NLL={d.nll.mean().item():2.2f}')
 dmap_pred = (hv.DynamicMap(plot_prediction_now, kdims=['t_source'])
        .redim.values(t_source=ds_pred_block.t_source.to_pandas())
        .opts(width=800,
                     height=300, 
                    ))
 dmap_pred
 # -
 d = ds_preds.mean(['t_source', 'block'])['nll'].groupby('t_ahead_hours').mean()
 nll_vs_tahead = hv.Curve((d.t_ahead_hours, d)).redim(x='hours ahead', y='nll').opts(width=800)
 nll_vs_tahead
 # +
 # def plot_predictions_vs_time(it_ahead):
 #     """Plot predictions vs time with holoviews"""
 #     d = ds_pred_block.isel(t_ahead=it_ahead).groupby('t_source').first()
 # #     print(d)
 #     p = hv.Scatter({
 #         'x': d.t_source,
 #         'y': d.y_true
 #     }, label='true').opts(color='black')
 #     # Get arrays
 #     xf = d.t_source.values
 #     yp = d.y_pred
 #     s = d.y_pred_std
 #     p *= hv.Curve({
 #         'x': xf,
 #         'y': yp
 #     }, label='pred').opts(color='blue')
 #     p *= hv.Area((xf, yp - 2 * s, yp + 2 * s),
 #                  vdims=['y', 'y2'],
 #                  label='2*std').opts(alpha=0.5, line_width=0)
 #     return p.opts(title=f'Prediction at {it_ahead * pd.Timedelta(freq)} ahead. NLL={d.nll.mean().item():2.2f}')
 # dmap_preds = (hv.DynamicMap(plot_predictions_vs_time, kdims=['it_ahead'])
 #         .redim.values(it_ahead=range(ds_pred_block.t_ahead.shape[0]))
 #         .opts(width=800,
 #                      height=300, 
 #                     ))
 # dmap_preds
 # # TODO fixme
 # -
 # +
 # d = ds_preds.mean(['t_ahead', 'block'])['nll'].groupby('t_source').mean()
 # nll_vs_time = hv.Curve(d).opts(width=800)
 # nll_vs_time
 # +
 # true_vs_pred = hv.Scatter((ds_preds.y_true, ds_preds.y_pred))
 # dynspread(datashade(true_vs_pred))
 # -
 # # Summarize experiments
 # # LR finder
 # +
 # # Run learning rate finder
 # lr_finder = trainer.tuner.lr_find(model)
 # # Results can be found in
 # lr_finder.results
 # # Plot with
 # fig = lr_finder.plot(suggest=True)
 # fig.show()
 # # Pick point based on plot, or get suggestion
 # new_lr = lr_finder.suggestion()
 # -
@@ -1,805 +0,0 @@
 # -*- coding: utf-8 -*-
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:light
 #     text_representation:
 #       extension: .py
 #       format_name: light
 #       format_version: '1.5'
 #       jupytext_version: 1.6.0
 #   kernelspec:
 #     display_name: seq2seq-time
 #     language: python
 #     name: seq2seq-time
 # ---
 # # Sequence to Sequence Models for Timeseries Regression
 #
 #
 # In this notebook we are going to tackle a harder problem: 
 # - predicting the future on a timeseries
 # - using an LSTM
 # - with rough uncertainty (uncalibrated)
 # - outputing sequence of predictions
 #
 # <img src="../reports/figures/Seq2Seq for regression.png" />
 #
 #
 # https://medium.com/@boitemailjeanmid/smart-meters-in-london-part1-description-and-first-insights-jean-michel-d-db97af2de71b
 #
 # OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
 # %load_ext autoreload
 # %autoreload 2
 # %aimport -pandas
 # %aimport -torch
 # %aimport -numpy
 # %aimport -matplotlib
 # %aimport -dask
 # %aimport -tqdm
 # %matplotlib inline
 # +
 # Imports
 import torch
 from torch import nn, optim
 from torch.nn import functional as F
 from torch.autograd import Variable
 import torch
 import torch.utils.data
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 plt.rcParams['figure.figsize'] = (12.0, 3.0)
 plt.style.use('ggplot')
 from pathlib import Path
 from tqdm.auto import tqdm
 import pytorch_lightning as pl
 # -
 import warnings
 warnings.simplefilter('once')
 from seq2seq_time.data.dataset import Seq2SeqDataSet, Seq2SeqDataSets
 from seq2seq_time.predict import predict, predict_multi
 import logging, sys
 # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 # ## Parameters
 # +
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f'using {device}')
 columns_target=['energy(kWh/hh)']
 window_past = 48*2
 window_future = 48*2
 batch_size = 128
 num_workers = 5
 freq = '30T'
 max_rows = 5e5
 # -
 # ## Load data
 # +
 def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london'), max_files=8):
    """
    Data loading and cleanding is always messy, so understand this code is optional.
    """
    # Load csv files
    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:max_files]
    dfs = []
    for f in csv_files:
        df = (pd.read_csv(f, parse_dates=[1], na_values=['Null'])
              .groupby('tstp')
              .sum()
              .sort_index()
             )
        df['block'] = f.stem
        # Drop nan and 0's
        df = df[df['energy(kWh/hh)']!=0]
        df = df.dropna()
        # Add time features 
        time = df.index.to_series()
        df["month"] = time.dt.month
        df['day'] = time.dt.day
        df['week'] = time.dt.week
        df['hour'] = time.dt.hour
        df['minute'] = time.dt.minute
        df['dayofweek'] = time.dt.dayofweek
        # Load weather data
        df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
        use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
               'pressure', 'apparentTemperature', 'windSpeed', 
               'humidity']
        df_weather = df_weather[use_cols].set_index('time')
        # Resample to match energy data   
        # Use first, since we have bearing, and you can't take mean
        df_weather = df_weather.resample(freq).first().ffill()  
        # Join weather and energy data
        df = pd.merge(df, df_weather, how='inner', left_index=True, right_index=True, sort=True)
        # Holidays
        df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
        holidays = set(df_hols['Bank holidays'].dt.round('D'))  
        def is_holiday(dt):
            return dt in holidays
        days = df.index.floor('D')
        holiday_mapping = days.unique().to_series().apply(is_holiday).astype(int).to_dict()
        df['holiday'] = days.to_series().map(holiday_mapping).values
        # sort
        df.index.name = 'Date'
        df = df.loc['2012-09':] # Weird value before this
        dfs.append(df)
    return pd.concat(dfs)
 # -
 # Our dataset is the london smartmeter data. But at half hour intervals
 # +
 df = get_smartmeter_df(max_files=12)
 # # Just get the first one for now
 # dfs = list(dfs)
 # # df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean
 df = df.tail(int(max_rows)).copy() # Just use last X rows
 # df = pd.concat(dfs[:6], 0)
 # # df = dfs[0]
 print(df.block.value_counts())
 df
 # -
 # ### Plot/explore
 # +
 import holoviews as hv
 from holoviews import opts
 from holoviews.plotting.links import RangeToolLink
 import datashader as ds
 from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
 from holoviews.operation import decimate
 hv.extension('bokeh')
 # def house_curve(Name=None):
 #     if isinstance(Name, int):
 #         name = df.block.unique()[Name]
 #     d = df[df.block == Name]
 #     d_curve = hv.Curve(d, 'Date', 'energy(kWh/hh)', label=Name).opts(framewise=True)
 #     return d_curve
 # dmap = hv.DynamicMap(house_curve, kdims=['Name'])
 # dmap = dmap.redim.values(Name=list(df.block.unique()))
 # dynspread(datashade(dmap).opts(width=800,
 #                      height=300,
 #                      tools=['xwheel_zoom', 'pan'],
 #                      active_tools=['xwheel_zoom', 'pan'],
 #                      default_tools=['reset', 'save', 'hover']
 #                     ))
 # -
 # ### Profiling
 # +
 # from pandas_profiling import ProfileReport
 # profile = ProfileReport(df, title="Pandas Profiling Report", minimal=True)
 # profile
 # -
 # ### Norm
 df.describe()
 # +
 import sklearn
 from sklearn.preprocessing import StandardScaler, OrdinalEncoder
 from sklearn_pandas import DataFrameMapper
 columns_input_numeric = list(df.drop(columns=columns_target)._get_numeric_data().columns)
 columns_categorical = list(set(df.columns)-set(columns_input_numeric)-set(columns_target))
 output_scalers = [([n], StandardScaler()) for n in columns_target]
 transformers=output_scalers + \
 [([n], StandardScaler()) for n in columns_input_numeric] + \
 [([n], OrdinalEncoder()) for n in columns_categorical]
 scaler = DataFrameMapper(transformers, df_out=True)
 df_norm = scaler.fit_transform(df)
 df_norm
 # -
 output_scaler = next(filter(lambda r:r[0][0] in columns_target, scaler.features))[-1]
 output_scaler
 # ### Split
 # +
 # split data, with the test in the future
 d0 =df_norm.index.min()
 d1 = df_norm.index.max()
 split_time = d0+(d1-d0)*0.8
 split_time = split_time.round('1D')
 print(split_time)
 df_train = df_norm.groupby('block').apply(lambda d:d.loc[:split_time]).reset_index(level=0, drop=True)
 df_test = df_norm.groupby('block').apply(lambda d:d.loc[split_time:]).reset_index(level=0, drop=True)
 # df_test
 # +
 # # Show split
 # df_train['energy(kWh/hh)'].plot(label='train')
 # df_test['energy(kWh/hh)'].plot(label='test')
 # plt.ylabel('energy(kWh/hh)')
 # plt.legend()
 # -
 # # Show split
 scatter = dynspread(datashade(hv.Curve(df_train, kdims=['Date'], vdims=['energy(kWh/hh)', 'block']).groupby('block'), cmap='blue'))
 scatter *= dynspread(datashade(hv.Curve(df_test, kdims=['Date'], vdims=['energy(kWh/hh)', 'block']).groupby('block'), cmap='red'))
 scatter = scatter.opts(plot=dict(width=800))
 scatter
 # ### Dataset
 # +
 # ### Dataset
 # These are the columns that we wont know in the future
 # We need to blank them out in x_future
 columns_blank=['visibility',
       'windBearing', 'temperature', 'dewPoint', 'pressure',
       'apparentTemperature', 'windSpeed', 'humidity']
 df_trains = [d.resample(freq).first().ffill().dropna() for _,d in df_train.groupby('block')]
 df_tests = [d.resample(freq).first().ffill().dropna() for _,d in df_test.groupby('block')]
 ds_train = Seq2SeqDataSets(df_trains,
                          window_past=window_past,
                          window_future=window_future,
                          columns_blank=columns_blank)
 ds_test = Seq2SeqDataSets(df_tests,
                         window_past=window_past,
                         window_future=window_future,
                         columns_blank=columns_blank)
 print(ds_train)
 print(ds_test)
 # -
 # we can treat it like an array
 ds_train[0]
 len(ds_train)
 ds_train[-1]
 # +
 # We can get rows
 x_past, y_past, x_future, y_future = ds_train.get_rows(10)
 # Plot one instance, this is what the model sees
 y_past['energy(kWh/hh)'].plot(label='past')
 y_future['energy(kWh/hh)'].plot(ax=plt.gca(), label='future')
 plt.legend()
 plt.ylabel('energy(kWh/hh)')
 # Notice we've added on two new columns tsp (time since present) and is_past
 x_past.tail()
 # -
 # Notice we've hidden some future columns to prevent cheating
 x_future.tail()
 # ## Plot helpers
 # +
 def plot_prediction(ds_preds, i):
    """Plot a prediction into the future, at a single point in time."""
    d = ds_preds.isel(t_source=i)
    # Get arrays
    xf = d.t_target
    yp = d.y_pred
    s = d.y_pred_std
    yt = d.y_true
    now = d.t_source.squeeze()
    plt.figure(figsize=(12, 4))
    plt.scatter(xf, yt, label='true', c='k', s=6)
    ylim = plt.ylim()
    # plot prediction
    plt.fill_between(xf, yp-2*s, yp+2*s, alpha=0.25,
            facecolor="b",
            interpolate=True,
            label="2 std",)
    plt.plot(xf, yp, label='pred', c='b')
    # plot true
    plt.scatter(
        d.t_past,
        d.y_past,
        c='k',
        s=6
    )
    # plot a red line for now
    plt.vlines(x=now, ymin=0, ymax=1, label='now', color='r')
    plt.ylim(*ylim)
    now=pd.Timestamp(now.values)
    plt.title(f'Prediction NLL={d.nll.mean().item():2.2g}')
    plt.xlabel(f'{now.date()}')
    plt.ylabel('energy(kWh/hh)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()
 def plot_performance(ds_preds, full=False):
    """Multiple plots using xr_preds"""
    plot_prediction(ds_preds, 24)
    ds_preds.mean('t_source').plot.scatter('t_ahead_hours', 'nll') # Mean over all predictions
    n = len(ds_preds.t_source)
    plt.ylabel('Negative Log Likelihood (lower is better)')
    plt.xlabel('Hours ahead')
    plt.title(f'NLL vs time ahead (no. samples={n})')
    plt.show()
    # Make a plot of the NLL over time. Does this solution get worse with time?
    if full:
        d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
        plt.xticks(rotation=45)
        plt.title('NLL over source time (lower is better)')
        plt.show()
    # A scatter plot is easy with xarray
    if full:
        plt.figure(figsize=(5, 5))
        ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
        plt.show()
 # -
 def plot_hist(trainer):
    try:
        df_hist = pd.read_csv(trainer.logger.experiment.metrics_file_path)
        df_hist['epoch'] = df_hist['epoch'].ffill()
        df_histe = df_hist.set_index('epoch').groupby('epoch').mean()
        if len(df_histe)>1:
            df_histe[['loss/train', 'loss/val']].plot(title='history')
        return df_histe
    except Exception:
        pass
 # ## Lightning
 # +
 import pytorch_lightning as pl
 class PL_MODEL(pl.LightningModule):
    def __init__(self, model, lr=3e-4, patience=2, weight_decay=0):
        super().__init__()
        self._model = model
        self.lr = lr
        self.patience = patience
        self.weight_decay = weight_decay
    def forward(self, x_past, y_past, x_future, y_future=None):
        """Eval/Predict"""
        y_dist, extra = self._model(x_past, y_past, x_future, y_future)
        return y_dist, extra
    def training_step(self, batch, batch_idx, phase='train'):
        x_past, y_past, x_future, y_future = batch
        y_dist, extra = self.forward(*batch)
        loss = -y_dist.log_prob(y_future).mean()
        self.log_dict({f'loss/{phase}':loss})
        if ('loss' in extra) and (phase=='train'):
            # some models have a special loss
            loss = extra['loss']
            self.log_dict({f'model_loss/{phase}':loss})
        return loss
    def validation_step(self, batch, batch_idx):
        return self.training_step(batch, batch_idx, phase='val')
    def configure_optimizers(self):
        optim = torch.optim.AdamW(self.parameters(), lr=self.lr,  weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optim,
            patience=self.patience,
            verbose=True,
            min_lr=1e-7,
        )
        return {'optimizer': optim, 'lr_scheduler': scheduler, 'monitor': 'loss/val'}
 # -
 # # Run
 from torch.utils.data import DataLoader
 from pytorch_lightning.loggers import CSVLogger
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 # +
 # Init data
 x_past, y_past, x_future, y_future = ds_train.get_rows(10)
 input_size = x_past.shape[-1]
 output_size = y_future.shape[-1]
 dl_train = DataLoader(ds_train,
                      batch_size=batch_size,
                      shuffle=True,
                      pin_memory=num_workers==0,
                      num_workers=num_workers)
 dl_test = DataLoader(ds_test, batch_size=batch_size, num_workers=num_workers)
 # +
 import gc
 def free_mem():
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()
 # -
 from seq2seq_time.models.lstm_seq2seq import LSTMSeq2Seq
 from seq2seq_time.models.lstm_seq import LSTMSeq
 from seq2seq_time.models.lstm import LSTM
 from seq2seq_time.models.baseline import BaselineLast
 from seq2seq_time.models.transformer import Transformer
 from seq2seq_time.models.transformer_autor import TransformerAutoR
 from seq2seq_time.models.transformer_seq2seq import TransformerSeq2Seq
 from seq2seq_time.models.transformer_seq import TransformerSeq
 from seq2seq_time.models.neural_process import RANP
 from seq2seq_time.models.transformer_process import TransformerProcess
 # ## Plots
 # +
 # PL_MODEL(TransformerAutoR(input_size, output_size, hidden_out_size=32),
 #          patience=patience,
 #          lr=2e-5,
 #          weight_decay=1e-3)
 # -
 models = [
 #     TransformerAutoR2(input_size,
 #         output_size),
    lambda: TransformerAutoR(input_size,
        output_size, hidden_out_size=32),
    lambda: RANP(input_size,
        output_size, hidden_dim=32, 
         latent_dim=64, n_decoder_layers=4),
    lambda: LSTM(input_size,
         output_size,
         hidden_size=80,
         lstm_layers=3,
         lstm_dropout=0.3),
    lambda: LSTMSeq2Seq(input_size,
                output_size,
                hidden_size=64,
                lstm_layers=2,
                lstm_dropout=0.25),
    lambda: TransformerSeq2Seq(input_size,
                       output_size,
                       hidden_size=128,
                       nhead=8,
                       nlayers=4,
                       attention_dropout=0.2),
    lambda: Transformer(input_size,
                output_size,
                attention_dropout=0.2,
                nhead=8,
                nlayers=8,
                hidden_size=128),
 #     lambda: TransformerSeq(input_size,
 #                 output_size),
 #     lambda: LSTMSeq(input_size,
 #                 output_size),
    lambda :TransformerProcess(input_size,
                output_size, hidden_size=16,
        latent_dim=8, dropout=0.5,
        nlayers=4,)
 ]
 models
 # Baseline model
 pt_model = BaselineLast()
 model = PL_MODEL(pt_model).to(device)
 trainer = pl.Trainer(gpus=1,
                     max_epochs=1, 
                     limit_train_batches=0.01,
                     logger=CSVLogger("logs",
                                      name=type(pt_model).__name__),
                    )
 trainer.fit(model, dl_train, dl_test)
 print(plot_hist(trainer))
 ds_predss = predict_multi(model.to(device),
                   ds_test.datasets,
                   batch_size*8,
                   device=device,
                   scaler=output_scaler)
 print(f'baseline nll: {ds_predss.nll.mean().item():2.2g}')
 # ## Train
 for m_fn in models:
    pt_model = m_fn()
    name = type(pt_model).__name__
    print(name)
    # Wrap in lightning
    patience = 2
    model = PL_MODEL(pt_model, patience=patience, lr=2e-5, weight_decay=1e-3).to(device)
    # Trainer    
    trainer = pl.Trainer(gpus=1,
                         min_epochs=2,
                         max_epochs=30,
                         amp_level='O1',
                         precision=16,
                         gradient_clip_val=1,
                         logger=CSVLogger("logs",
                                          name=type(pt_model).__name__),
                         callbacks=[
                             EarlyStopping(monitor='loss/val', patience=patience*2),
 #                              PrintTableMetricsCallback2()
                         ],
    )
    # Train
    trainer.fit(model, dl_train, dl_test)
    ds_predss = predict_multi(model.to(device),
                       ds_test.datasets,
                       batch_size*2,
                       device=device,
                       scaler=output_scaler)
    print(name)
    print(f'mean_NLL {ds_predss.nll.mean().item():2.2f}')
    # Performance
    ds_preds = ds_predss.isel(block=0)
    print(plot_hist(trainer))
    plot_performance(ds_preds)
    model.cpu()
    free_mem()
 # # Plots
 # +
 # Get latest checkpoint for a model type...
 pt_model = models[1]()
 name = type(pt_model).__name__
 checkpoints = (Path('logs')/name).glob('version_*')
 sort_checkpoints = lambda f:int(f.stem.split('_')[-1])
 checkpoints = sorted(checkpoints, key=sort_checkpoints)
 latest_checkpoint = checkpoints[-1]
 checkpoint_f = sorted(latest_checkpoint.glob('checkpoints/*.ckpt'))[-1]
 print('pt model name', name)
 print('latest_checkpoint', checkpoint_f)
 # -
 # Load
 model = PL_MODEL(pt_model).to(device)
 model.load_from_checkpoint(str(checkpoint_f), model=pt_model)
 ds_predss = predict_multi(model.to(device),
                   ds_test.datasets,
                   batch_size*4,
                   device=device,
                   scaler=output_scaler)
 ds_predss.nll.mean().item()
 ds_pred_block = ds_predss.isel(block=1)
 # # holoviews pred
 # +
 import holoviews as hv
 from holoviews import opts
 import holoviews as hv
 from holoviews import opts
 import datashader as ds
 from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
 from holoviews.operation import decimate
 hv.extension('bokeh')
 # +
 # A few diagnostic plots
 d_source = ds_predss.mean(['t_ahead',
                           'block'])['nll'].groupby('t_source').mean()
 nll_vs_time = (hv.Curve(d_source).opts(width=600,
                                       height=200,
                                       title='Error vs time of prediction'))
 d_ahead = ds_predss.mean(['t_source',
                          'block'])['nll'].groupby('t_ahead_hours').mean()
 nll_vs_tahead = (hv.Curve(
    (d_ahead.t_ahead_hours,
     d_ahead)).redim(x='hours ahead',
                     y='nll').opts(width=600,
                                   height=200,
                                   title='Error vs time ahead'))
 true_vs_pred = datashade(hv.Scatter(
    (ds_predss.y_true,
     ds_predss.y_pred))).redim(x='true', y='pred').opts(title='Scatter plot')
 true_vs_pred = dynspread(true_vs_pred)
 l = nll_vs_time + nll_vs_tahead + true_vs_pred
 l.cols(1).opts(
    framewise=True,
    shared_axes=False,
 )
 # +
 def hv_predict_from_time(t_source):
    """Plot predictions with holoviews"""
    # Let us pass in an int
    if isinstance(t_source, int):
        t_source = ds_pred_block.t_source[t_source].to_pandas()
    d = ds_pred_block.sel(t_source=t_source)
    # Sometimes there are duplicate times, take the first
    if len(d.t_source.shape) and d.t_source.shape[0] > 0:
        d = d.isel(t_source=0)
    if len(d.t_source.shape) and d.t_source.shape[0] == 0:
        return None
    now = d.t_source.to_pandas()
    # Plot true
    x = np.concatenate([d.t_past, d.t_target])
    yt = np.concatenate([d.y_past, d.y_true])
    p = hv.Scatter({
        'x': x,
        'y': yt
    }, label='true').opts(color='black')
    # Get arrays
    xf = d.t_target.values
    yp = d.y_pred
    s = d.y_pred_std
    p *= hv.Curve({
        'x': xf,
        'y': yp
    }, label='pred').opts(color='blue')
    p *= hv.Area((xf, yp - 2 * s, yp + 2 * s),
                 vdims=['y', 'y2'],
                 label='2*std').opts(alpha=0.5, line_width=0)
    # plot now line
    p *= hv.VLine(now, label='now').opts(color='red', framewise=True)
    return p.opts(title=f'Prediction at {now}. NLL={d.nll.mean().item():2.2f}')
 dmap_hv_predict_from_time = (hv.DynamicMap(hv_predict_from_time, kdims=['t_source'])
        .redim.values(t_source=ds_pred_block.t_source.to_pandas())
        .opts(width=800,
                     height=300, 
                    ))
 dmap_hv_predict_from_time
 # +
 def hv_plot_predictions_vs_time(it_ahead=6,
                                std=False,
                                ds_pred_block=ds_pred_block):
    """Plot predictions vs time with holoviews"""
    d = ds_pred_block.isel(t_ahead=it_ahead).groupby('t_source').first()
    p = hv.Scatter({
        'x': d.t_source,
        'y': d.y_true
    }, label='true').opts(color='black', size=2)
    # Get arrays
    xf = d.t_source.values
    yp = d.y_pred
    s = d.y_pred_std
    # Mean    
    p *= hv.Curve({'x': xf, 'y': yp}, label='pred').opts(color='blue')
    if std:
        p *= hv.Spread((xf, yp, s*2),
                     label='2*std').opts(alpha=0.5, line_width=0)
    else:
        p = datashade(p)
    title = f'Prediction at {it_ahead * pd.Timedelta(freq)} ahead. NLL={d.nll.mean().item():2.2f}'
    return p.opts(
                  title=title,
                  width=800,
                  height=300,
                  tools=['xwheel_zoom'],
                  active_tools=['xwheel_zoom', 'pan'],
                 )
 p = hv_plot_predictions_vs_time(
    6, std=True, ds_pred_block=ds_pred_block.isel(t_source=slice(100, 4000)))
 p
 # -
 # # Summarize experiments
 # # LR finder
 # +
 # # Run learning rate finder
 # lr_finder = trainer.tuner.lr_find(model)
 # # Results can be found in
 # lr_finder.results
 # # Plot with
 # fig = lr_finder.plot(suggest=True)
 # fig.show()
 # # Pick point based on plot, or get suggestion
 # new_lr = lr_finder.suggestion()
 # -
@@ -28,6 +28,10 @@ class RegressionForecastData:
        # Check processing
        self.check()
    @property
    def columns_past(self):
        return set(self.df.columns)-set(self.columns_forecast)-set(self.columns_target)
    def download(self) -> pd.DataFrame:
        """Implement this method to download data and return raw df"""
@@ -54,8 +58,8 @@ class RegressionForecastData:
    def to_datasets(self, window_past: int, window_future: int, valid:bool=False) -> Tuple[Seq2SeqDataSet, Seq2SeqDataSet]:
        """Convert to torch datasets"""
-        ds_train = Seq2SeqDataSet(df_train, window_past=window_past, window_future=window_future, columns_target=self.columns_target, columns_past=self.columns_past)
+        ds_train = Seq2SeqDataSet(self.df_train, window_past=window_past, window_future=window_future, columns_target=self.columns_target, columns_past=self.columns_past)
-        ds_test = Seq2SeqDataSet(df_test, window_past=window_past, window_future=window_future, columns_target=self.columns_target, columns_past=self.columns_past)
+        ds_test = Seq2SeqDataSet(self.df_test, window_past=window_past, window_future=window_future, columns_target=self.columns_target, columns_past=self.columns_past)
        return ds_train, ds_test
    def __repr__(self):
@@ -31,7 +31,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        assert df.index.freq is not None, 'should have freq'
        assert_no_objects(df)
-        self.df = df
+        self.df = df.dropna(subset=columns_target)
        self.window_past = window_past
        self.window_future = window_future
@@ -1,5 +1,6 @@
 import uptide
 import pandas as pd
 import numpy as np
 # https://en.wikipedia.org/wiki/Theory_of_tides#Harmonic_analysis
 default_tidal_constituents = [