Files
seq2seq-time/notebooks/03.0-mike-RNN_Timeseries_Seq2Seq.py
T
2020-10-24 20:21:07 +08:00

806 lines
22 KiB
Python

# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.6.0
# kernelspec:
# display_name: seq2seq-time
# language: python
# name: seq2seq-time
# ---
# # Sequence to Sequence Models for Timeseries Regression
#
#
# In this notebook we are going to tackle a harder problem:
# - predicting the future on a timeseries
# - using an LSTM
# - with rough uncertainty (uncalibrated)
# - outputing sequence of predictions
#
# <img src="../reports/figures/Seq2Seq for regression.png" />
#
#
# https://medium.com/@boitemailjeanmid/smart-meters-in-london-part1-description-and-first-insights-jean-michel-d-db97af2de71b
#
# OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
# %load_ext autoreload
# %autoreload 2
# %aimport -pandas
# %aimport -torch
# %aimport -numpy
# %aimport -matplotlib
# %aimport -dask
# %aimport -tqdm
# %matplotlib inline
# +
# Imports
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
import torch
import torch.utils.data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12.0, 3.0)
plt.style.use('ggplot')
from pathlib import Path
from tqdm.auto import tqdm
import pytorch_lightning as pl
# -
import warnings
warnings.simplefilter('once')
from seq2seq_time.data.dataset import Seq2SeqDataSet, Seq2SeqDataSets
from seq2seq_time.predict import predict, predict_multi
import logging, sys
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# ## Parameters
# +
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'using {device}')
columns_target=['energy(kWh/hh)']
window_past = 48*2
window_future = 48*2
batch_size = 128
num_workers = 5
freq = '30T'
max_rows = 5e5
# -
# ## Load data
# +
def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london'), max_files=8):
"""
Data loading and cleanding is always messy, so understand this code is optional.
"""
# Load csv files
csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:max_files]
dfs = []
for f in csv_files:
df = (pd.read_csv(f, parse_dates=[1], na_values=['Null'])
.groupby('tstp')
.sum()
.sort_index()
)
df['block'] = f.stem
# Drop nan and 0's
df = df[df['energy(kWh/hh)']!=0]
df = df.dropna()
# Add time features
time = df.index.to_series()
df["month"] = time.dt.month
df['day'] = time.dt.day
df['week'] = time.dt.week
df['hour'] = time.dt.hour
df['minute'] = time.dt.minute
df['dayofweek'] = time.dt.dayofweek
# Load weather data
df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
'pressure', 'apparentTemperature', 'windSpeed',
'humidity']
df_weather = df_weather[use_cols].set_index('time')
# Resample to match energy data
# Use first, since we have bearing, and you can't take mean
df_weather = df_weather.resample(freq).first().ffill()
# Join weather and energy data
df = pd.merge(df, df_weather, how='inner', left_index=True, right_index=True, sort=True)
# Holidays
df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
holidays = set(df_hols['Bank holidays'].dt.round('D'))
def is_holiday(dt):
return dt in holidays
days = df.index.floor('D')
holiday_mapping = days.unique().to_series().apply(is_holiday).astype(int).to_dict()
df['holiday'] = days.to_series().map(holiday_mapping).values
# sort
df.index.name = 'Date'
df = df.loc['2012-09':] # Weird value before this
dfs.append(df)
return pd.concat(dfs)
# -
# Our dataset is the london smartmeter data. But at half hour intervals
# +
df = get_smartmeter_df(max_files=12)
# # Just get the first one for now
# dfs = list(dfs)
# # df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean
df = df.tail(int(max_rows)).copy() # Just use last X rows
# df = pd.concat(dfs[:6], 0)
# # df = dfs[0]
print(df.block.value_counts())
df
# -
# ### Plot/explore
# +
import holoviews as hv
from holoviews import opts
from holoviews.plotting.links import RangeToolLink
import datashader as ds
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
from holoviews.operation import decimate
hv.extension('bokeh')
# def house_curve(Name=None):
# if isinstance(Name, int):
# name = df.block.unique()[Name]
# d = df[df.block == Name]
# d_curve = hv.Curve(d, 'Date', 'energy(kWh/hh)', label=Name).opts(framewise=True)
# return d_curve
# dmap = hv.DynamicMap(house_curve, kdims=['Name'])
# dmap = dmap.redim.values(Name=list(df.block.unique()))
# dynspread(datashade(dmap).opts(width=800,
# height=300,
# tools=['xwheel_zoom', 'pan'],
# active_tools=['xwheel_zoom', 'pan'],
# default_tools=['reset', 'save', 'hover']
# ))
# -
# ### Profiling
# +
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df, title="Pandas Profiling Report", minimal=True)
# profile
# -
# ### Norm
df.describe()
# +
import sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
columns_input_numeric = list(df.drop(columns=columns_target)._get_numeric_data().columns)
columns_categorical = list(set(df.columns)-set(columns_input_numeric)-set(columns_target))
output_scalers = [([n], StandardScaler()) for n in columns_target]
transformers=output_scalers + \
[([n], StandardScaler()) for n in columns_input_numeric] + \
[([n], OrdinalEncoder()) for n in columns_categorical]
scaler = DataFrameMapper(transformers, df_out=True)
df_norm = scaler.fit_transform(df)
df_norm
# -
output_scaler = next(filter(lambda r:r[0][0] in columns_target, scaler.features))[-1]
output_scaler
# ### Split
# +
# split data, with the test in the future
d0 =df_norm.index.min()
d1 = df_norm.index.max()
split_time = d0+(d1-d0)*0.8
split_time = split_time.round('1D')
print(split_time)
df_train = df_norm.groupby('block').apply(lambda d:d.loc[:split_time]).reset_index(level=0, drop=True)
df_test = df_norm.groupby('block').apply(lambda d:d.loc[split_time:]).reset_index(level=0, drop=True)
# df_test
# +
# # Show split
# df_train['energy(kWh/hh)'].plot(label='train')
# df_test['energy(kWh/hh)'].plot(label='test')
# plt.ylabel('energy(kWh/hh)')
# plt.legend()
# -
# # Show split
scatter = dynspread(datashade(hv.Curve(df_train, kdims=['Date'], vdims=['energy(kWh/hh)', 'block']).groupby('block'), cmap='blue'))
scatter *= dynspread(datashade(hv.Curve(df_test, kdims=['Date'], vdims=['energy(kWh/hh)', 'block']).groupby('block'), cmap='red'))
scatter = scatter.opts(plot=dict(width=800))
scatter
# ### Dataset
# +
# ### Dataset
# These are the columns that we wont know in the future
# We need to blank them out in x_future
columns_blank=['visibility',
'windBearing', 'temperature', 'dewPoint', 'pressure',
'apparentTemperature', 'windSpeed', 'humidity']
df_trains = [d.resample(freq).first().ffill().dropna() for _,d in df_train.groupby('block')]
df_tests = [d.resample(freq).first().ffill().dropna() for _,d in df_test.groupby('block')]
ds_train = Seq2SeqDataSets(df_trains,
window_past=window_past,
window_future=window_future,
columns_blank=columns_blank)
ds_test = Seq2SeqDataSets(df_tests,
window_past=window_past,
window_future=window_future,
columns_blank=columns_blank)
print(ds_train)
print(ds_test)
# -
# we can treat it like an array
ds_train[0]
len(ds_train)
ds_train[-1]
# +
# We can get rows
x_past, y_past, x_future, y_future = ds_train.get_rows(10)
# Plot one instance, this is what the model sees
y_past['energy(kWh/hh)'].plot(label='past')
y_future['energy(kWh/hh)'].plot(ax=plt.gca(), label='future')
plt.legend()
plt.ylabel('energy(kWh/hh)')
# Notice we've added on two new columns tsp (time since present) and is_past
x_past.tail()
# -
# Notice we've hidden some future columns to prevent cheating
x_future.tail()
# ## Plot helpers
# +
def plot_prediction(ds_preds, i):
"""Plot a prediction into the future, at a single point in time."""
d = ds_preds.isel(t_source=i)
# Get arrays
xf = d.t_target
yp = d.y_pred
s = d.y_pred_std
yt = d.y_true
now = d.t_source.squeeze()
plt.figure(figsize=(12, 4))
plt.scatter(xf, yt, label='true', c='k', s=6)
ylim = plt.ylim()
# plot prediction
plt.fill_between(xf, yp-2*s, yp+2*s, alpha=0.25,
facecolor="b",
interpolate=True,
label="2 std",)
plt.plot(xf, yp, label='pred', c='b')
# plot true
plt.scatter(
d.t_past,
d.y_past,
c='k',
s=6
)
# plot a red line for now
plt.vlines(x=now, ymin=0, ymax=1, label='now', color='r')
plt.ylim(*ylim)
now=pd.Timestamp(now.values)
plt.title(f'Prediction NLL={d.nll.mean().item():2.2g}')
plt.xlabel(f'{now.date()}')
plt.ylabel('energy(kWh/hh)')
plt.legend()
plt.xticks(rotation=45)
plt.show()
def plot_performance(ds_preds, full=False):
"""Multiple plots using xr_preds"""
plot_prediction(ds_preds, 24)
ds_preds.mean('t_source').plot.scatter('t_ahead_hours', 'nll') # Mean over all predictions
n = len(ds_preds.t_source)
plt.ylabel('Negative Log Likelihood (lower is better)')
plt.xlabel('Hours ahead')
plt.title(f'NLL vs time ahead (no. samples={n})')
plt.show()
# Make a plot of the NLL over time. Does this solution get worse with time?
if full:
d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
plt.xticks(rotation=45)
plt.title('NLL over source time (lower is better)')
plt.show()
# A scatter plot is easy with xarray
if full:
plt.figure(figsize=(5, 5))
ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
plt.show()
# -
def plot_hist(trainer):
try:
df_hist = pd.read_csv(trainer.logger.experiment.metrics_file_path)
df_hist['epoch'] = df_hist['epoch'].ffill()
df_histe = df_hist.set_index('epoch').groupby('epoch').mean()
if len(df_histe)>1:
df_histe[['loss/train', 'loss/val']].plot(title='history')
return df_histe
except Exception:
pass
# ## Lightning
# +
import pytorch_lightning as pl
class PL_MODEL(pl.LightningModule):
def __init__(self, model, lr=3e-4, patience=2, weight_decay=0):
super().__init__()
self._model = model
self.lr = lr
self.patience = patience
self.weight_decay = weight_decay
def forward(self, x_past, y_past, x_future, y_future=None):
"""Eval/Predict"""
y_dist, extra = self._model(x_past, y_past, x_future, y_future)
return y_dist, extra
def training_step(self, batch, batch_idx, phase='train'):
x_past, y_past, x_future, y_future = batch
y_dist, extra = self.forward(*batch)
loss = -y_dist.log_prob(y_future).mean()
self.log_dict({f'loss/{phase}':loss})
if ('loss' in extra) and (phase=='train'):
# some models have a special loss
loss = extra['loss']
self.log_dict({f'model_loss/{phase}':loss})
return loss
def validation_step(self, batch, batch_idx):
return self.training_step(batch, batch_idx, phase='val')
def configure_optimizers(self):
optim = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optim,
patience=self.patience,
verbose=True,
min_lr=1e-7,
)
return {'optimizer': optim, 'lr_scheduler': scheduler, 'monitor': 'loss/val'}
# -
# # Run
from torch.utils.data import DataLoader
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
# +
# Init data
x_past, y_past, x_future, y_future = ds_train.get_rows(10)
input_size = x_past.shape[-1]
output_size = y_future.shape[-1]
dl_train = DataLoader(ds_train,
batch_size=batch_size,
shuffle=True,
pin_memory=num_workers==0,
num_workers=num_workers)
dl_test = DataLoader(ds_test, batch_size=batch_size, num_workers=num_workers)
# +
import gc
def free_mem():
gc.collect()
torch.cuda.empty_cache()
gc.collect()
# -
from seq2seq_time.models.lstm_seq2seq import LSTMSeq2Seq
from seq2seq_time.models.lstm_seq import LSTMSeq
from seq2seq_time.models.lstm import LSTM
from seq2seq_time.models.baseline import BaselineLast
from seq2seq_time.models.transformer import Transformer
from seq2seq_time.models.transformer_autor import TransformerAutoR
from seq2seq_time.models.transformer_seq2seq import TransformerSeq2Seq
from seq2seq_time.models.transformer_seq import TransformerSeq
from seq2seq_time.models.neural_process import RANP
from seq2seq_time.models.transformer_process import TransformerProcess
# ## Plots
# +
# PL_MODEL(TransformerAutoR(input_size, output_size, hidden_out_size=32),
# patience=patience,
# lr=2e-5,
# weight_decay=1e-3)
# -
models = [
# TransformerAutoR2(input_size,
# output_size),
lambda: TransformerAutoR(input_size,
output_size, hidden_out_size=32),
lambda: RANP(input_size,
output_size, hidden_dim=32,
latent_dim=64, n_decoder_layers=4),
lambda: LSTM(input_size,
output_size,
hidden_size=80,
lstm_layers=3,
lstm_dropout=0.3),
lambda: LSTMSeq2Seq(input_size,
output_size,
hidden_size=64,
lstm_layers=2,
lstm_dropout=0.25),
lambda: TransformerSeq2Seq(input_size,
output_size,
hidden_size=128,
nhead=8,
nlayers=4,
attention_dropout=0.2),
lambda: Transformer(input_size,
output_size,
attention_dropout=0.2,
nhead=8,
nlayers=8,
hidden_size=128),
# lambda: TransformerSeq(input_size,
# output_size),
# lambda: LSTMSeq(input_size,
# output_size),
lambda :TransformerProcess(input_size,
output_size, hidden_size=16,
latent_dim=8, dropout=0.5,
nlayers=4,)
]
models
# Baseline model
pt_model = BaselineLast()
model = PL_MODEL(pt_model).to(device)
trainer = pl.Trainer(gpus=1,
max_epochs=1,
limit_train_batches=0.01,
logger=CSVLogger("logs",
name=type(pt_model).__name__),
)
trainer.fit(model, dl_train, dl_test)
print(plot_hist(trainer))
ds_predss = predict_multi(model.to(device),
ds_test.datasets,
batch_size*8,
device=device,
scaler=output_scaler)
print(f'baseline nll: {ds_predss.nll.mean().item():2.2g}')
# ## Train
for m_fn in models:
pt_model = m_fn()
name = type(pt_model).__name__
print(name)
# Wrap in lightning
patience = 2
model = PL_MODEL(pt_model, patience=patience, lr=2e-5, weight_decay=1e-3).to(device)
# Trainer
trainer = pl.Trainer(gpus=1,
min_epochs=2,
max_epochs=30,
amp_level='O1',
precision=16,
gradient_clip_val=1,
logger=CSVLogger("logs",
name=type(pt_model).__name__),
callbacks=[
EarlyStopping(monitor='loss/val', patience=patience*2),
# PrintTableMetricsCallback2()
],
)
# Train
trainer.fit(model, dl_train, dl_test)
ds_predss = predict_multi(model.to(device),
ds_test.datasets,
batch_size*2,
device=device,
scaler=output_scaler)
print(name)
print(f'mean_NLL {ds_predss.nll.mean().item():2.2f}')
# Performance
ds_preds = ds_predss.isel(block=0)
print(plot_hist(trainer))
plot_performance(ds_preds)
model.cpu()
free_mem()
# # Plots
# +
# Get latest checkpoint for a model type...
pt_model = models[1]()
name = type(pt_model).__name__
checkpoints = (Path('logs')/name).glob('version_*')
sort_checkpoints = lambda f:int(f.stem.split('_')[-1])
checkpoints = sorted(checkpoints, key=sort_checkpoints)
latest_checkpoint = checkpoints[-1]
checkpoint_f = sorted(latest_checkpoint.glob('checkpoints/*.ckpt'))[-1]
print('pt model name', name)
print('latest_checkpoint', checkpoint_f)
# -
# Load
model = PL_MODEL(pt_model).to(device)
model.load_from_checkpoint(str(checkpoint_f), model=pt_model)
ds_predss = predict_multi(model.to(device),
ds_test.datasets,
batch_size*4,
device=device,
scaler=output_scaler)
ds_predss.nll.mean().item()
ds_pred_block = ds_predss.isel(block=1)
# # holoviews pred
# +
import holoviews as hv
from holoviews import opts
import holoviews as hv
from holoviews import opts
import datashader as ds
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
from holoviews.operation import decimate
hv.extension('bokeh')
# +
# A few diagnostic plots
d_source = ds_predss.mean(['t_ahead',
'block'])['nll'].groupby('t_source').mean()
nll_vs_time = (hv.Curve(d_source).opts(width=600,
height=200,
title='Error vs time of prediction'))
d_ahead = ds_predss.mean(['t_source',
'block'])['nll'].groupby('t_ahead_hours').mean()
nll_vs_tahead = (hv.Curve(
(d_ahead.t_ahead_hours,
d_ahead)).redim(x='hours ahead',
y='nll').opts(width=600,
height=200,
title='Error vs time ahead'))
true_vs_pred = datashade(hv.Scatter(
(ds_predss.y_true,
ds_predss.y_pred))).redim(x='true', y='pred').opts(title='Scatter plot')
true_vs_pred = dynspread(true_vs_pred)
l = nll_vs_time + nll_vs_tahead + true_vs_pred
l.cols(1).opts(
framewise=True,
shared_axes=False,
)
# +
def hv_predict_from_time(t_source):
"""Plot predictions with holoviews"""
# Let us pass in an int
if isinstance(t_source, int):
t_source = ds_pred_block.t_source[t_source].to_pandas()
d = ds_pred_block.sel(t_source=t_source)
# Sometimes there are duplicate times, take the first
if len(d.t_source.shape) and d.t_source.shape[0] > 0:
d = d.isel(t_source=0)
if len(d.t_source.shape) and d.t_source.shape[0] == 0:
return None
now = d.t_source.to_pandas()
# Plot true
x = np.concatenate([d.t_past, d.t_target])
yt = np.concatenate([d.y_past, d.y_true])
p = hv.Scatter({
'x': x,
'y': yt
}, label='true').opts(color='black')
# Get arrays
xf = d.t_target.values
yp = d.y_pred
s = d.y_pred_std
p *= hv.Curve({
'x': xf,
'y': yp
}, label='pred').opts(color='blue')
p *= hv.Area((xf, yp - 2 * s, yp + 2 * s),
vdims=['y', 'y2'],
label='2*std').opts(alpha=0.5, line_width=0)
# plot now line
p *= hv.VLine(now, label='now').opts(color='red', framewise=True)
return p.opts(title=f'Prediction at {now}. NLL={d.nll.mean().item():2.2f}')
dmap_hv_predict_from_time = (hv.DynamicMap(hv_predict_from_time, kdims=['t_source'])
.redim.values(t_source=ds_pred_block.t_source.to_pandas())
.opts(width=800,
height=300,
))
dmap_hv_predict_from_time
# +
def hv_plot_predictions_vs_time(it_ahead=6,
std=False,
ds_pred_block=ds_pred_block):
"""Plot predictions vs time with holoviews"""
d = ds_pred_block.isel(t_ahead=it_ahead).groupby('t_source').first()
p = hv.Scatter({
'x': d.t_source,
'y': d.y_true
}, label='true').opts(color='black', size=2)
# Get arrays
xf = d.t_source.values
yp = d.y_pred
s = d.y_pred_std
# Mean
p *= hv.Curve({'x': xf, 'y': yp}, label='pred').opts(color='blue')
if std:
p *= hv.Spread((xf, yp, s*2),
label='2*std').opts(alpha=0.5, line_width=0)
else:
p = datashade(p)
title = f'Prediction at {it_ahead * pd.Timedelta(freq)} ahead. NLL={d.nll.mean().item():2.2f}'
return p.opts(
title=title,
width=800,
height=300,
tools=['xwheel_zoom'],
active_tools=['xwheel_zoom', 'pan'],
)
p = hv_plot_predictions_vs_time(
6, std=True, ds_pred_block=ds_pred_block.isel(t_source=slice(100, 4000)))
p
# -
# # Summarize experiments
# # LR finder
# +
# # Run learning rate finder
# lr_finder = trainer.tuner.lr_find(model)
# # Results can be found in
# lr_finder.results
# # Plot with
# fig = lr_finder.plot(suggest=True)
# fig.show()
# # Pick point based on plot, or get suggestion
# new_lr = lr_finder.suggestion()
# -