mirror of
https://github.com/wassname/seq2seq-time.git
synced 2026-06-27 19:16:40 +08:00
dataloading
This commit is contained in:
@@ -51,7 +51,7 @@ lint:
|
||||
## Set up python interpreter environment
|
||||
create_environment:
|
||||
@echo ">>> Detected conda, creating conda environment."
|
||||
conda create --name $(PROJECT_NAME) python=3
|
||||
conda create --name $(PROJECT_NAME) python=3.7
|
||||
@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
|
||||
|
||||
## Test python environment is setup correctly
|
||||
|
||||
+1537
-2479
File diff suppressed because one or more lines are too long
@@ -0,0 +1,519 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:light
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: light
|
||||
# format_version: '1.5'
|
||||
# jupytext_version: 1.6.0
|
||||
# kernelspec:
|
||||
# display_name: seq2seq-time
|
||||
# language: python
|
||||
# name: seq2seq-time
|
||||
# ---
|
||||
|
||||
# # Sequence to Sequence Models for Timeseries Regression
|
||||
#
|
||||
#
|
||||
# In this notebook we are going to tackle a harder problem:
|
||||
# - predicting the future on a timeseries
|
||||
# - using an LSTM
|
||||
# - with rough uncertainty (uncalibrated)
|
||||
# - outputting a sequence of predictions
|
||||
#
|
||||
# <img src="../reports/figures/Seq2Seq for regression.png" />
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# - [ ] TODO mike autocorrelation baseline
|
||||
# - [ ] TODO mike acorn data
|
||||
|
||||
# OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
|
||||
# %load_ext autoreload
|
||||
# %autoreload 2
|
||||
# %aimport -pandas
|
||||
# %aimport -torch
|
||||
# %aimport -numpy
|
||||
# %aimport -matplotlib
|
||||
# %aimport -dask
|
||||
# %aimport -tqdm
|
||||
# %matplotlib inline
|
||||
|
||||
# +
|
||||
# Imports
|
||||
import torch
|
||||
from torch import nn, optim
|
||||
from torch.nn import functional as F
|
||||
from torch.autograd import Variable
|
||||
import torch
|
||||
import torch.utils.data
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
from pathlib import Path
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
import pytorch_lightning as pl
|
||||
# -
|
||||
|
||||
from seq2seq_time.data.dataset import Seq2SeqDataSet
|
||||
from seq2seq_time.predict import predict
|
||||
|
||||
import logging, sys
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
# ## Parameters
|
||||
|
||||
# +
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
print(f'using {device}')
|
||||
|
||||
columns_target=['energy(kWh/hh)']
|
||||
window_past = 48*4
|
||||
window_future = 48*4
|
||||
batch_size = 64
|
||||
num_workers = 0
|
||||
freq = '30T'
|
||||
max_rows = 1e5
|
||||
|
||||
|
||||
# -
|
||||
|
||||
# ## Load data
|
||||
|
||||
# +
|
||||
|
||||
def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london')):
    """
    Load one household of the London smart-meter data, joined with weather.

    Data loading and cleaning is always messy, so understanding this code is optional.

    Args:
        indir: directory of the raw dataset. The files read from it are
            ``halfhourly_dataset/*.csv`` (first file only),
            ``informations_households.csv``, ``weather_hourly_darksky.csv``
            and ``uk_bank_holidays.csv``.

    Returns:
        A DataFrame indexed by the ``tstp`` timestamp and sorted by time. It
        holds the energy reading, household categories, weather columns, a
        ``holiday`` flag and calendar features. Rows with missing values or
        a zero energy reading are dropped.

    Note:
        The weather is resampled with the module-level ``freq``.
    """

    # Load csv files (only the first one, to keep the example small)
    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:1]

    # import pdb; pdb.set_trace() # you can use debugging in jupyter to interact with variables inside a function

    # concatenate them
    df = pd.concat([pd.read_csv(f, parse_dates=[1], na_values=['Null']) for f in csv_files])

    # Add ACORN categories
    df_households = pd.read_csv(indir/'informations_households.csv')
    df_households = df_households[['LCLid', 'stdorToU', 'Acorn_grouped']]
    df = pd.merge(df, df_households, on='LCLid')

    # Keep a single household: the first group when grouping by LCLid
    _, df = next(iter(df.groupby('LCLid')))
    df = df.set_index('tstp')
    print(df)

    # Load weather data
    df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
    use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
                'pressure', 'apparentTemperature', 'windSpeed',
                'humidity']
    df_weather = df_weather[use_cols].set_index('time')
    df_weather = df_weather.resample(freq).first().ffill()  # Resample to match energy data

    # Join weather and energy data. `axis` is passed by keyword because the
    # positional form is deprecated and rejected by newer pandas.
    df = pd.concat([df, df_weather], axis=1).dropna()

    # Also find bank holidays
    df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
    holidays = set(df_hols['Bank holidays'].dt.round('D'))

    time = df.index.to_series()

    def is_holiday(dt):
        return dt.floor('D') in holidays

    df['holiday'] = time.apply(is_holiday).astype(int)

    # TODO pd.read_csv('../data/raw/smart-meters-in-london/acorn_details.csv', engine='python')

    # Add time features
    df["month"] = time.dt.month
    df['day'] = time.dt.day
    # ISO week number; `.dt.week` is deprecated in favour of isocalendar().
    # Cast to a plain int so the column stays an ordinary numeric dtype.
    df['week'] = time.dt.isocalendar().week.astype(int)
    df['hour'] = time.dt.hour
    df['minute'] = time.dt.minute
    df['dayofweek'] = time.dt.dayofweek

    # Drop nan and 0's
    df = df[df['energy(kWh/hh)'] != 0]
    df = df.dropna()

    # sort by time
    df = df.sort_index()

    return df
|
||||
# -
|
||||
# Our dataset is the london smartmeter data. But at half hour intervals
|
||||
|
||||
# +
|
||||
df = get_smartmeter_df()
|
||||
|
||||
# df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean
|
||||
|
||||
df = df.tail(int(max_rows)).copy() # Just use last X rows
|
||||
df
|
||||
# -
|
||||
|
||||
df.describe()
|
||||
|
||||
# +
|
||||
import sklearn
|
||||
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
|
||||
from sklearn_pandas import DataFrameMapper
|
||||
|
||||
columns_input_numeric = list(df.drop(columns=columns_target)._get_numeric_data().columns)
|
||||
columns_categorical = list(set(df.columns)-set(columns_input_numeric)-set(columns_target))
|
||||
|
||||
output_scalers = [([n], StandardScaler()) for n in columns_target]
|
||||
transformers=output_scalers + \
|
||||
[([n], StandardScaler()) for n in columns_input_numeric] + \
|
||||
[([n], OrdinalEncoder()) for n in columns_categorical]
|
||||
scaler = DataFrameMapper(transformers, df_out=True)
|
||||
df_norm = scaler.fit_transform(df)
|
||||
df_norm
|
||||
# -
|
||||
|
||||
# Pick out the fitted scaler of the target column, so predictions can later be
# mapped back to real units. The mapper fitted above is called `scaler`
# (the previous name `mapper4` is not defined anywhere in this notebook).
output_scaler = next(filter(lambda r: r[0][0] in columns_target, scaler.features))[-1]
output_scaler
|
||||
|
||||
# # Resample
|
||||
df_norm = df_norm.resample(freq).first().fillna(0)
|
||||
|
||||
# +
|
||||
# split data, with the test in the future
# Size the split from df_norm itself: the resample above can change the number
# of rows, so len(df) would no longer give a 20% test set.
n_split = -int(len(df_norm)*0.2)
df_train = df_norm[:n_split]
df_test = df_norm[n_split:]
|
||||
|
||||
# Show split
|
||||
df_train['energy(kWh/hh)'].plot(label='train')
|
||||
df_test['energy(kWh/hh)'].plot(label='test')
|
||||
plt.ylabel('energy(kWh/hh)')
|
||||
plt.legend()
|
||||
# -
|
||||
df_norm
|
||||
|
||||
|
||||
columns_blank=['visibility',
|
||||
'windBearing', 'temperature', 'dewPoint', 'pressure',
|
||||
'apparentTemperature', 'windSpeed', 'humidity']
|
||||
|
||||
ds_train = Seq2SeqDataSet(df_train,
|
||||
window_past=window_past,
|
||||
window_future=window_future,
|
||||
columns_blank=columns_blank)
|
||||
ds_test = Seq2SeqDataSet(df_test,
|
||||
window_past=window_past,
|
||||
window_future=window_future,
|
||||
columns_blank=columns_blank)
|
||||
print(ds_train)
|
||||
print(ds_test)
|
||||
|
||||
# %%timeit
|
||||
for i in range(100):
|
||||
ds_train[i]
|
||||
|
||||
# we can treat it like an array
|
||||
ds_train[0]
|
||||
len(ds_train)
|
||||
ds_train[0][2][-2]
|
||||
|
||||
# +
|
||||
# We can get rows
|
||||
x_past, y_past, x_future, y_future = ds_train.get_rows(10)
|
||||
|
||||
# Plot one instance, this is what the model sees
|
||||
y_past['energy(kWh/hh)'].plot(label='past')
|
||||
y_future['energy(kWh/hh)'].plot(ax=plt.gca(), label='future')
|
||||
plt.legend()
|
||||
plt.ylabel('energy(kWh/hh)')
|
||||
|
||||
# Notice we've added on two new columns tsp (time since present) and is_past
|
||||
x_past.tail()
|
||||
# -
|
||||
|
||||
# Notice we've hidden some future columns to prevent cheating
|
||||
x_future.tail()
|
||||
|
||||
|
||||
# ## Model
|
||||
|
||||
# +
|
||||
|
||||
class Seq2SeqNet(nn.Module):
    """
    LSTM encoder-decoder that predicts a Normal distribution for each future step.

    The encoder reads the past features concatenated with the past targets.
    Its final (hidden, cell) state initialises the decoder, which reads the
    future features. Two linear heads turn each decoder output into a mean
    and a log standard deviation.
    """

    def __init__(self, input_size, input_size_decoder, output_size, hidden_size=32, lstm_layers=2, lstm_dropout=0, _min_std = 0.05):
        super().__init__()
        self._min_std = _min_std

        # Both LSTMs share every setting except the width of their input
        lstm_kwargs = dict(
            hidden_size=hidden_size,
            batch_first=True,
            num_layers=lstm_layers,
            dropout=lstm_dropout,
        )
        # The encoder input is the features plus the known past targets
        self.encoder = nn.LSTM(input_size=input_size + output_size, **lstm_kwargs)
        self.decoder = nn.LSTM(input_size=input_size_decoder, **lstm_kwargs)

        # Output heads: mean and log-std of the predictive distribution
        self.mean = nn.Linear(hidden_size, output_size)
        self.std = nn.Linear(hidden_size, output_size)

    def forward(self, context_x, context_y, target_x, target_y=None):
        """
        Return a ``torch.distributions.Normal`` over the future targets.

        ``target_y`` is accepted but not used.
        """
        encoder_input = torch.cat([context_x, context_y], -1)

        # Only the final state is kept; it carries the summary of the past.
        # Shapes (nn.LSTM keeps the state layer-first even with batch_first):
        #   state[0], state[1] = [n layers * n directions, batch size, hid dim]
        #   decoded            = [batch size, seq len, hid dim * n directions]
        _, state = self.encoder(encoder_input)
        decoded, _ = self.decoder(target_x, state)

        loc = self.mean(decoded)

        # Keep sigma inside [_min_std, 1 / _min_std] by clamping in log space
        log_floor = np.log(self._min_std)
        log_scale = torch.clamp(self.std(decoded), log_floor, -log_floor)

        return torch.distributions.Normal(loc, torch.exp(log_scale))
|
||||
|
||||
|
||||
# -
|
||||
|
||||
|
||||
|
||||
# +
|
||||
input_size = x_past.shape[-1]
|
||||
output_size = y_future.shape[-1]
|
||||
|
||||
model = Seq2SeqNet(input_size, input_size, output_size,
|
||||
hidden_size=32,
|
||||
lstm_layers=2,
|
||||
lstm_dropout=0).to(device)
|
||||
model
|
||||
# -
|
||||
# Init the optimiser
|
||||
optimizer = optim.Adam(model.parameters(), lr=1e-3)
|
||||
|
||||
# +
|
||||
|
||||
past_x = torch.rand((batch_size, window_past, input_size)).to(device)
|
||||
future_x = torch.rand((batch_size, window_future, input_size)).to(device)
|
||||
past_y = torch.rand((batch_size, window_past, output_size)).to(device)
|
||||
future_y = torch.rand((batch_size, window_future, output_size)).to(device)
|
||||
output = model(past_x, past_y, future_x, future_y)
|
||||
print(output)
|
||||
|
||||
from torchsummaryX import summary
|
||||
summary(model, past_x, past_y, future_x, future_y )
|
||||
1
|
||||
# -
|
||||
|
||||
# ## Training
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# +
|
||||
def train_epoch(ds, model, bs=128):
    """
    Run one optimisation pass over ``ds`` and return the mean batch loss.

    Uses the notebook-level ``optimizer``, ``device`` and ``num_workers``.
    The loss is the negative log likelihood of the true future targets.
    """
    model.train()

    # Shuffled mini-batches
    loader = torch.utils.data.dataloader.DataLoader(
        ds,
        batch_size=bs,
        pin_memory=False,
        num_workers=num_workers,
        shuffle=True,
    )

    batch_losses = []
    for batch in tqdm(loader, leave=False, desc='train'):
        # Move the four tensors of the batch to the compute device
        x_past, y_past, x_future, y_future = (d.to(device) for d in batch)

        # Fresh gradients for this step
        optimizer.zero_grad()

        # Forward pass, then negative log likelihood of the observed future
        y_dist = model(x_past, y_past, x_future, y_future)
        loss = -y_dist.log_prob(y_future).mean()

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return np.mean(batch_losses)
|
||||
|
||||
|
||||
def test_epoch(ds, model, bs=512):
    """
    Evaluate ``model`` on ``ds`` without training and return the mean batch loss.

    Uses the notebook-level ``device`` and ``num_workers``. The loss is the
    negative log likelihood of the true future targets.
    """
    model.eval()

    loader = torch.utils.data.dataloader.DataLoader(
        ds,
        batch_size=bs,
        pin_memory=False,
        num_workers=num_workers,
    )

    batch_losses = []
    for batch in tqdm(loader, leave=False, desc='test'):
        # Move the four tensors of the batch to the compute device
        x_past, y_past, x_future, y_future = (d.to(device) for d in batch)

        # No gradients are needed for evaluation
        with torch.no_grad():
            y_dist = model(x_past, y_past, x_future, y_future)
            nll = -y_dist.log_prob(y_future).mean()

        batch_losses.append(nll.item())

    return np.mean(batch_losses)
|
||||
|
||||
|
||||
def training_loop(ds_train, ds_test, model, epochs=1, bs=128):
    """
    Train for ``epochs`` epochs, evaluating on ``ds_test`` after each one.

    A KeyboardInterrupt stops training early; the losses gathered so far are
    still plotted and returned.

    Args:
        ds_train: dataset used for training.
        ds_test: dataset used for evaluation.
        model: the network to train.
        epochs: number of passes over ``ds_train``.
        bs: training batch size (evaluation uses ``test_epoch``'s default).

    Returns:
        Array of shape (completed epochs, 2) holding [training loss, test loss]
        per epoch. The array has zero rows if no epoch completed.
    """
    all_losses = []
    try:
        test_loss = test_epoch(ds_test, model)
        print(f"Start: Test Loss = {test_loss:.2f}")
        for epoch in tqdm(range(epochs), desc='epochs'):
            loss = train_epoch(ds_train, model, bs=bs)
            print(f"Epoch {epoch+1}/{epochs}: Training Loss = {loss:.2f}")

            test_loss = test_epoch(ds_test, model)
            print(f"Epoch {epoch+1}/{epochs}: Test Loss = {test_loss:.2f}")
            print("-" * 50)

            all_losses.append([loss, test_loss])

    except KeyboardInterrupt:
        # This lets you stop manually, and still get the results
        pass

    # Visualising the results.
    # The reshape keeps two columns even when no epoch finished; without it
    # an empty history is 1-D and the column indexing below fails.
    all_losses = np.array(all_losses).reshape(-1, 2)
    plt.plot(all_losses[:, 0], label="Training")
    plt.plot(all_losses[:, 1], label="Test")
    plt.title("Loss")
    plt.legend()

    return all_losses
|
||||
|
||||
|
||||
# -
|
||||
|
||||
# this might take 1 minute per epoch on a gpu
|
||||
training_loop(ds_train, ds_test, model, epochs=8, bs=batch_size)
|
||||
1
|
||||
|
||||
# ## Predict
|
||||
#
|
||||
|
||||
# Find the fitted scaler of the target column by name instead of by position,
# so it stays correct if the list of transformers changes.
# NOTE(review): assumes `scaler.features` holds (columns, transformer) entries
# as passed to DataFrameMapper above; confirm against sklearn-pandas.
output_scaler = next(f[-1] for f in scaler.features if f[0][0] in columns_target)
ds_preds = predict(model, ds_test, batch_size*6, device=device, scaler=output_scaler)
|
||||
|
||||
|
||||
|
||||
# +
|
||||
# TODO Metrics... smape etc
|
||||
|
||||
# +
|
||||
def plot_prediction(ds_preds, i):
    """
    Plot the forecast made at the ``i``-th source time of ``ds_preds``.

    Draws the predicted mean with a band of two standard deviations, the
    observed past and future values as dots, and a red line at the source time.
    """
    d = ds_preds.isel(t_source=i)

    # Pull out the series to draw
    t_future = d.t_target
    pred_mean = d.y_pred
    pred_std = d.y_pred_std
    observed = d.y_true
    t_now = d.t_source.squeeze()

    # Prediction: uncertainty band first, then the mean on top of it
    lower = pred_mean - 2 * pred_std
    upper = pred_mean + 2 * pred_std
    plt.fill_between(
        t_future,
        lower,
        upper,
        alpha=0.25,
        facecolor="b",
        interpolate=True,
        label="2 std",
    )
    plt.plot(t_future, pred_mean, label='pred', c='b')

    # Observations, past then future
    plt.scatter(d.t_past, d.y_past, c='k', s=6)
    plt.scatter(t_future, observed, label='true', c='k', s=6)

    # Red line marking the moment the forecast was made
    plt.vlines(x=t_now, ymin=0, ymax=1, label='now', color='r')

    # Labels
    stamp = pd.Timestamp(t_now.values)
    plt.title(f'Prediction NLL={d.nll.mean().item():2.2g}')
    plt.xlabel(f'{stamp.date()}')
    plt.ylabel('energy(kWh/hh)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()
|
||||
|
||||
# plot_prediction(ds_preds, 0)
|
||||
# plot_prediction(ds_preds, 12) # 6 hours later
|
||||
plot_prediction(ds_preds, 24) # 12 hours later
|
||||
plot_prediction(ds_preds, 48) # 24 hours later
|
||||
# -
|
||||
|
||||
# ## Error vs time ahead
|
||||
|
||||
|
||||
|
||||
# +
|
||||
d = ds_preds.mean('t_source') # Mean over all predictions
|
||||
|
||||
# Plot with xarray, it has a pandas like interface
|
||||
d.plot.scatter('t_ahead_hours', 'nll')
|
||||
|
||||
# Tidy the graph
|
||||
n = len(ds_preds.t_source)
|
||||
plt.ylabel('Negative Log Likelihood (lower is better)')
|
||||
plt.xlabel('Hours ahead')
|
||||
plt.title(f'NLL vs time (no. samples={n})')
|
||||
# -
|
||||
|
||||
d = ds_preds.mean('t_source') # Mean over all predictions
|
||||
d['likelihood'] = np.exp(-d.nll) # get likelihood, after taking mean in log domain
|
||||
d.plot.scatter('t_ahead_hours', 'likelihood')
|
||||
|
||||
|
||||
|
||||
# Make a plot of the NLL over time. Does this solution get worse with time?
|
||||
# this is hard because we need to take the mean over t_ahead
|
||||
# then group by t_source
|
||||
d = ds_preds.mean('t_ahead').groupby('t_source').mean()
|
||||
# And even then it's clearer with smoothing
|
||||
d.plot.scatter('t_source', 'nll')
|
||||
plt.xticks(rotation=45)
|
||||
plt.title('NLL over time (lower is better)')
|
||||
1
|
||||
|
||||
# A scatter plot is easy with xarray
|
||||
ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
|
||||
|
||||
|
||||
@@ -0,0 +1,221 @@
|
||||
name: seq2seq-time
|
||||
channels:
|
||||
- pytorch
|
||||
- conda-forge
|
||||
- defaults
|
||||
dependencies:
|
||||
- _libgcc_mutex=0.1=conda_forge
|
||||
- _openmp_mutex=4.5=1_gnu
|
||||
- absl-py=0.10.0=py37hc8dfbb8_1
|
||||
- aiohttp=3.6.3=py37h7b6447c_0
|
||||
- appdirs=1.4.4=py_0
|
||||
- argon2-cffi=20.1.0=py37h8f50634_2
|
||||
- async-timeout=3.0.1=py_1000
|
||||
- async_generator=1.10=py_0
|
||||
- attrs=20.2.0=pyh9f0ad1d_0
|
||||
- awscli=1.18.159=py37hc8dfbb8_0
|
||||
- backcall=0.2.0=pyh9f0ad1d_0
|
||||
- backports=1.0=py_2
|
||||
- backports.functools_lru_cache=1.6.1=py_0
|
||||
- black=20.8b1=py_1
|
||||
- blas=1.0=mkl
|
||||
- bleach=3.2.1=pyh9f0ad1d_0
|
||||
- blinker=1.4=py_1
|
||||
- botocore=1.18.18=pyh9f0ad1d_0
|
||||
- brotlipy=0.7.0=py37hb5d75c8_1001
|
||||
- c-ares=1.16.1=h516909a_3
|
||||
- ca-certificates=2020.10.14=0
|
||||
- cachetools=4.1.1=py_0
|
||||
- certifi=2020.6.20=py37he5f6b98_2
|
||||
- cffi=1.14.3=py37h00ebd2e_1
|
||||
- chardet=3.0.4=py37he5f6b98_1008
|
||||
- click=7.1.2=pyh9f0ad1d_0
|
||||
- colorama=0.4.3=py_0
|
||||
- cryptography=3.1.1=py37hff6837a_1
|
||||
- cudatoolkit=10.2.89=hfd86e86_1
|
||||
- cycler=0.10.0=py_2
|
||||
- dataclasses=0.7=py37_0
|
||||
- dbus=1.13.18=hb2f20db_0
|
||||
- decorator=4.4.2=py_0
|
||||
- defusedxml=0.6.0=py_0
|
||||
- docutils=0.15.2=py37_0
|
||||
- entrypoints=0.3=py37hc8dfbb8_1002
|
||||
- expat=2.2.10=he6710b0_2
|
||||
- fontconfig=2.13.1=h1056068_1002
|
||||
- freetype=2.10.3=he06d7ca_0
|
||||
- fsspec=0.8.4=py_0
|
||||
- future=0.18.2=py37hc8dfbb8_2
|
||||
- gettext=0.19.8.1=hf34092f_1003
|
||||
- glib=2.66.1=he1b5a44_1
|
||||
- google-auth=1.22.1=py_0
|
||||
- google-auth-oauthlib=0.4.1=py_2
|
||||
- grpcio=1.31.0=py37hb0870dc_0
|
||||
- gst-plugins-base=1.14.5=h0935bb2_2
|
||||
- gstreamer=1.14.5=h36ae1b5_2
|
||||
- icu=67.1=he1b5a44_0
|
||||
- idna=2.10=pyh9f0ad1d_0
|
||||
- importlib-metadata=2.0.0=py37hc8dfbb8_0
|
||||
- importlib_metadata=2.0.0=1
|
||||
- iniconfig=1.1.1=py_0
|
||||
- intel-openmp=2020.2=254
|
||||
- ipykernel=5.3.4=py37hc6149b9_1
|
||||
- ipython=7.18.1=py37hc6149b9_1
|
||||
- ipython_genutils=0.2.0=py_1
|
||||
- ipywidgets=7.5.1=pyh9f0ad1d_1
|
||||
- jedi=0.17.2=py37hc8dfbb8_1
|
||||
- jinja2=2.11.2=pyh9f0ad1d_0
|
||||
- jmespath=0.10.0=pyh9f0ad1d_0
|
||||
- joblib=0.17.0=py_0
|
||||
- jpeg=9d=h516909a_0
|
||||
- jsonschema=3.2.0=py37hc8dfbb8_1
|
||||
- jupyter_client=6.1.7=py_0
|
||||
- jupyter_core=4.6.3=py37hc8dfbb8_2
|
||||
- jupyterlab_pygments=0.1.2=pyh9f0ad1d_0
|
||||
- kiwisolver=1.2.0=py37h99015e2_1
|
||||
- krb5=1.17.1=hfafb76e_3
|
||||
- lcms2=2.11=hbd6801e_0
|
||||
- ld_impl_linux-64=2.35=h769bd43_9
|
||||
- libblas=3.8.0=17_openblas
|
||||
- libcblas=3.8.0=17_openblas
|
||||
- libclang=10.0.1=default_hde54327_1
|
||||
- libedit=3.1.20191231=he28a2e2_2
|
||||
- libevent=2.1.10=hcdb4288_3
|
||||
- libffi=3.2.1=he1b5a44_1007
|
||||
- libgcc-ng=9.3.0=h5dbcf3e_17
|
||||
- libgfortran-ng=7.5.0=hae1eefd_17
|
||||
- libgfortran4=7.5.0=hae1eefd_17
|
||||
- libglib=2.66.1=h0dae87d_1
|
||||
- libgomp=9.3.0=h5dbcf3e_17
|
||||
- libiconv=1.16=h516909a_0
|
||||
- liblapack=3.8.0=17_openblas
|
||||
- libllvm10=10.0.1=he513fc3_3
|
||||
- libopenblas=0.3.10=pthreads_hb3c22a3_5
|
||||
- libpng=1.6.37=hed695b0_2
|
||||
- libpq=12.3=h1281834_2
|
||||
- libprotobuf=3.13.0.1=h8b12597_0
|
||||
- libsodium=1.0.18=h516909a_1
|
||||
- libstdcxx-ng=9.3.0=h2ae2ef3_17
|
||||
- libtiff=4.1.0=hc7e4089_6
|
||||
- libuuid=2.32.1=h14c3975_1000
|
||||
- libwebp-base=1.1.0=h516909a_3
|
||||
- libxcb=1.14=h7b6447c_0
|
||||
- libxkbcommon=0.10.0=he1b5a44_0
|
||||
- libxml2=2.9.10=h68273f3_2
|
||||
- lz4-c=1.9.2=he1b5a44_3
|
||||
- markdown=3.3.1=pyh9f0ad1d_0
|
||||
- markupsafe=1.1.1=py37hb5d75c8_2
|
||||
- matplotlib=3.3.2=py37hc8dfbb8_1
|
||||
- matplotlib-base=3.3.2=py37hc9afd2a_1
|
||||
- mccabe=0.6.1=py_1
|
||||
- mistune=0.8.4=py37h8f50634_1002
|
||||
- mkl=2020.2=256
|
||||
- more-itertools=8.5.0=py_0
|
||||
- multidict=4.7.6=py37h7b6447c_1
|
||||
- mypy=0.790=py_0
|
||||
- mypy_extensions=0.4.3=py37hc8dfbb8_1
|
||||
- mysql-common=8.0.21=2
|
||||
- mysql-libs=8.0.21=hf3661c5_2
|
||||
- nbclient=0.5.1=py_0
|
||||
- nbconvert=6.0.7=py37hc8dfbb8_1
|
||||
- nbformat=5.0.8=py_0
|
||||
- ncurses=6.2=he1b5a44_2
|
||||
- nest-asyncio=1.4.1=py_0
|
||||
- ninja=1.10.1=hfc4b9b4_2
|
||||
- notebook=6.1.4=py37hc8dfbb8_1
|
||||
- nspr=4.29=he1b5a44_1
|
||||
- nss=3.58=h27285de_1
|
||||
- numpy=1.19.2=py37h7ea13bd_1
|
||||
- oauthlib=3.1.0=py_0
|
||||
- olefile=0.46=pyh9f0ad1d_1
|
||||
- openssl=1.1.1h=h516909a_0
|
||||
- packaging=20.4=pyh9f0ad1d_0
|
||||
- pandas=1.1.3=py37h9fdb41a_2
|
||||
- pandoc=2.11.0.2=hd18ef5c_0
|
||||
- pandocfilters=1.4.2=py_1
|
||||
- parso=0.7.1=pyh9f0ad1d_0
|
||||
- pathspec=0.8.0=pyh9f0ad1d_0
|
||||
- pcre=8.44=he1b5a44_0
|
||||
- pexpect=4.8.0=py37hc8dfbb8_1
|
||||
- pickleshare=0.7.5=py37hc8dfbb8_1002
|
||||
- pillow=8.0.0=py37h718be6c_0
|
||||
- pip=20.2.4=py_0
|
||||
- pluggy=0.13.1=py37hc8dfbb8_3
|
||||
- prometheus_client=0.8.0=pyh9f0ad1d_0
|
||||
- prompt-toolkit=3.0.8=py_0
|
||||
- protobuf=3.13.0.1=py37h3340039_1
|
||||
- psutil=5.7.2=py37hb5d75c8_1
|
||||
- ptyprocess=0.6.0=py37_1000
|
||||
- py=1.9.0=pyh9f0ad1d_0
|
||||
- pyasn1=0.4.8=py_0
|
||||
- pyasn1-modules=0.2.8=py_0
|
||||
- pycodestyle=2.6.0=pyh9f0ad1d_0
|
||||
- pycparser=2.20=pyh9f0ad1d_2
|
||||
- pydocstyle=5.1.1=py_0
|
||||
- pyflakes=2.2.0=pyh9f0ad1d_0
|
||||
- pygments=2.7.1=py_0
|
||||
- pyjwt=1.7.1=py_0
|
||||
- pylama=7.7.1=py_0
|
||||
- pyopenssl=19.1.0=py37_0
|
||||
- pyparsing=2.4.7=pyh9f0ad1d_0
|
||||
- pyqt=5.12.3=py37h8685d9f_4
|
||||
- pyrsistent=0.17.3=py37h8f50634_1
|
||||
- pysocks=1.7.1=py37he5f6b98_2
|
||||
- pytest=6.1.1=py37hc8dfbb8_1
|
||||
- python=3.7.8=h425cb1d_1_cpython
|
||||
- python-dateutil=2.8.1=py_0
|
||||
- python_abi=3.7=1_cp37m
|
||||
- pytorch=1.6.0=py3.7_cuda10.2.89_cudnn7.6.5_0
|
||||
- pytorch-lightning=1.0.2=py_0
|
||||
- pytz=2020.1=pyh9f0ad1d_0
|
||||
- pyyaml=5.3.1=py37hb5d75c8_1
|
||||
- pyzmq=19.0.2=py37hac76be4_2
|
||||
- qt=5.12.9=h1f2b2cb_0
|
||||
- readline=8.0=he28a2e2_2
|
||||
- regex=2020.10.15=py37h8f50634_0
|
||||
- requests=2.24.0=pyh9f0ad1d_0
|
||||
- requests-oauthlib=1.3.0=pyh9f0ad1d_0
|
||||
- rsa=4.4.1=pyh9f0ad1d_0
|
||||
- s3transfer=0.3.3=py37hc8dfbb8_2
|
||||
- scikit-learn=0.23.2=py37h6785257_0
|
||||
- scipy=1.5.2=py37hb14ef9d_2
|
||||
- send2trash=1.5.0=py_0
|
||||
- setuptools=49.6.0=py37he5f6b98_2
|
||||
- six=1.15.0=pyh9f0ad1d_0
|
||||
- snowballstemmer=2.0.0=py_0
|
||||
- sqlite=3.33.0=h4cf870e_1
|
||||
- tensorboard=2.3.0=py_0
|
||||
- tensorboard-plugin-wit=1.6.0=pyh9f0ad1d_0
|
||||
- terminado=0.9.1=py37hc8dfbb8_1
|
||||
- testpath=0.4.4=py_0
|
||||
- threadpoolctl=2.1.0=pyh5ca1d4c_0
|
||||
- tk=8.6.10=hed695b0_1
|
||||
- toml=0.10.1=pyh9f0ad1d_0
|
||||
- torchvision=0.7.0=py37_cu102
|
||||
- tornado=6.0.4=py37h8f50634_2
|
||||
- tqdm=4.50.2=pyh9f0ad1d_0
|
||||
- traitlets=5.0.5=py_0
|
||||
- typed-ast=1.4.1=py37h516909a_0
|
||||
- typing-extensions=3.7.4.3=0
|
||||
- typing_extensions=3.7.4.3=py_0
|
||||
- urllib3=1.25.10=py_0
|
||||
- wcwidth=0.2.5=pyh9f0ad1d_2
|
||||
- webencodings=0.5.1=py_1
|
||||
- werkzeug=1.0.1=pyh9f0ad1d_0
|
||||
- wheel=0.35.1=pyh9f0ad1d_0
|
||||
- widgetsnbextension=3.5.1=py37hc8dfbb8_2
|
||||
- xarray=0.16.1=py_0
|
||||
- xz=5.2.5=h516909a_1
|
||||
- yaml=0.2.5=h516909a_0
|
||||
- yapf=0.30.0=pyh9f0ad1d_0
|
||||
- yarl=1.6.2=py37h8f50634_0
|
||||
- zeromq=4.3.3=he1b5a44_2
|
||||
- zipp=3.3.1=py_0
|
||||
- zlib=1.2.11=h516909a_1010
|
||||
- zstd=1.4.5=h6597ccf_2
|
||||
- pip:
|
||||
- pyqt5-sip==4.19.18
|
||||
- pyqtchart==5.12
|
||||
- pyqtwebengine==5.12.1
|
||||
- sklearn-pandas==2.0.2
|
||||
- torchsummaryx==1.3.0
|
||||
prefix: /home/wassname/anaconda/envs/seq2seq-time
|
||||
@@ -0,0 +1,26 @@
|
||||
name: seq2seq-time
|
||||
channels:
|
||||
- conda-forge
|
||||
- defaults
|
||||
dependencies:
|
||||
- python==3.7
|
||||
- pip
|
||||
- awscli
|
||||
- ipykernel
|
||||
- tqdm
|
||||
- xarray
|
||||
- pandas
|
||||
- pytorch
|
||||
- torchvision
|
||||
- cudatoolkit==10.2
|
||||
- black
|
||||
- pylama
|
||||
- mypy
|
||||
- pytest
|
||||
- numpy
|
||||
- matplotlib
|
||||
- scikit-learn
|
||||
- pytorch-lightning
|
||||
- yapf
|
||||
- ipywidgets
|
||||
prefix: /home/wassname/anaconda/envs/seq2seq-time
|
||||
@@ -11,5 +11,5 @@ dependencies:
|
||||
- awscli
|
||||
- pip:
|
||||
# local package
|
||||
- -e .
|
||||
# - -e .
|
||||
|
||||
|
||||
@@ -4,3 +4,13 @@ This project has multiple ways of documenting requirements
|
||||
- environment.min.yaml - This is the minimum requirements, use it to install a new test or dev environment
|
||||
- environment.max.yaml - This pins all conda packages, use for production or finding vulnerabilities
|
||||
- requirements.txt - For people or bots not using conda
|
||||
|
||||
```
|
||||
# Install requirements
|
||||
conda create --name seq2seq-time python=3.7 -f ./requirements/environment.yaml
|
||||
conda activate seq2seq-time
|
||||
# Install this package in editable mode
|
||||
python -m pip install -e .
|
||||
# Install kernel
|
||||
python -m ipykernel install --user --name seq2seq-time --display-name seq2seq-time
|
||||
```
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
import pandas as pd
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
|
||||
def assert_normalized(df):
    """
    Check that the columns summarised by ``df.describe()`` are standardised.

    Raises:
        AssertionError: if any mean is further than 0.1 from 0, or any
            standard deviation is further than 0.1 from 1.
    """
    stats = df.describe().T
    # The message must go in through err_msg; written after a comma it only
    # builds a tuple that is thrown away and never reaches the error.
    np.testing.assert_allclose(
        stats['mean'].values, 0, atol=0.1,
        err_msg='means should be normalized to ~0',
    )
    np.testing.assert_allclose(
        stats['std'].values, 1, atol=0.1,
        err_msg='standard deviations should be normalized to ~1',
    )
|
||||
|
||||
def assert_no_objects(df):
    """
    Check that no column of ``df`` has the ``object`` dtype.

    Raises:
        AssertionError: naming the first column whose dtype is ``object``.
    """
    # .items() replaces .iteritems(), which newer pandas no longer provides
    for name, dtype in df.dtypes.items():
        assert dtype.name != 'object', f'all objects should be pd.categories. {name} is not'
|
||||
|
||||
|
||||
class Seq2SeqDataSet(torch.utils.data.Dataset):
    """
    Takes in dataframe and returns sequences through time.

    Each item is a window of ``window_past + window_future`` consecutive rows,
    returned as ``x_past, y_past, x_future, y_future``. Two features are
    appended to x: the time relative to the first future row (in days) and a
    flag marking rows in the past.
    """

    def __init__(self, df: pd.DataFrame, window_past=40, window_future=10, columns_target=None, columns_blank=None,):
        """
        Args:
        - df: DataFrame with time index, already scaled
        - window_past: number of rows given as context
        - window_future: number of rows to predict
        - columns_target: the columns to predict, defaults to ['energy(kWh/hh)']
        - columns_blank: The columns we will blank, in the future
        """
        super().__init__()
        # None stands in for the defaults so that no list object is shared
        # between instances
        columns_target = ['energy(kWh/hh)'] if columns_target is None else list(columns_target)
        columns_blank = [] if columns_blank is None else list(columns_blank)

        # TODO auto categorical columns
        # TODO specify blank future columns
        assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
        assert df.index.freq is not None, 'should have freq'
        # assert_normalized(df)
        assert_no_objects(df)

        # Use numpy instead of pandas, for speed
        df_x = df.drop(columns=columns_target)
        self.x = df_x.copy().values
        self.y = df[columns_target].copy().values
        self.t = df.index.copy()
        self.columns = list(df.columns)

        # Positions, within x, of the columns to hide in the future
        x_columns = df_x.columns.tolist()
        self.icol_blank = [x_columns.index(n) for n in columns_blank]

        self.window_past = window_past
        self.window_future = window_future
        self.columns_target = columns_target

    def get_components(self, i):
        """Get past and future rows, as numpy arrays, for the window starting at row i."""
        window = self.window_past + self.window_future
        x = self.x[i:i + window].copy()
        y = self.y[i:i + window].copy()
        t = self.t[i:i + window].copy()

        # Timestamps as days since the epoch. Going through int64 nanoseconds
        # in numpy avoids DatetimeIndex.astype(int), whose behaviour depends
        # on the pandas version.
        t = t.values.astype('datetime64[ns]').astype('int64') * 1e-9 / 60 / 60 / 24  # days
        now = t[self.window_past]

        # Add features: relative days since present time, is past
        tstp = (t - now)[:, None]
        is_past = tstp < 0
        x = np.concatenate([x, tstp, is_past], -1)

        # Split into future and past
        x_past = x[:self.window_past]
        y_past = y[:self.window_past]
        x_future = x[self.window_past:]
        y_future = y[self.window_past:]

        # Stop it cheating by using future weather measurements
        x_future[:, self.icol_blank] = 0
        return x_past, y_past, x_future, y_future

    def __getitem__(self, i):
        """This is how python implements square brackets"""
        if i < 0:
            # Handle negative integers
            i = len(self) + i
        data = self.get_components(i)
        # float32 arrays, which the torch DataLoader collates into tensors
        return [d.astype(np.float32) for d in data]

    def get_rows(self, i):
        """
        Output pandas dataframes for display purposes.
        """
        # Label x with every non-target column, wherever the targets sit in
        # the frame (dropping just the first column mislabels the rest when
        # the target is not column 0)
        feature_cols = [c for c in self.columns if c not in self.columns_target]
        x_cols = feature_cols + ['tsp_days', 'is_past']
        x_past, y_past, x_future, y_future = self.get_components(i)
        t_past = self.t[i:i + self.window_past]
        t_future = self.t[i + self.window_past:i + self.window_past + self.window_future]
        x_past = pd.DataFrame(x_past, columns=x_cols, index=t_past)
        x_future = pd.DataFrame(x_future, columns=x_cols, index=t_future)
        y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
        y_future = pd.DataFrame(y_future, columns=self.columns_target, index=t_future)
        return x_past, y_past, x_future, y_future

    def __len__(self):
        # Clamp at zero: len() raises on a negative value, which would happen
        # when the data is shorter than one window
        return max(0, len(self.x) - (self.window_past + self.window_future))

    def __repr__(self):
        # t[-1] is the last timestamp (t[1] is only the second one)
        return f'<{type(self).__name__}(shape={self.x.shape}, times={self.t[0]} to {self.t[-1]} at {self.t.freq.freqstr})>'
|
||||
@@ -0,0 +1,72 @@
|
||||
import xarray as xr
|
||||
import torch
|
||||
from tqdm.auto import tqdm
|
||||
import pandas as pd
|
||||
|
||||
from .util import to_numpy
|
||||
|
||||
def predict(model, ds_test, batch_size, device='cpu', scaler=None):
    """
    Gather all predictions into xarray.

    When we generate prediction in a sequence to sequence model we start at a time then predict
    N steps into the future. So we have 2 dimensions: source time, target time.

    But we also care about how far we were predicting into the future, so we have 3 dimensions: source time, target time, time ahead.

    It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.

    Args:
        model: called as ``model(x_past, y_past, x_future, y_future)``; must
            return a distribution with ``log_prob``, ``loc`` and ``scale``.
        ds_test: dataset providing ``t``, ``window_past`` and ``window_future``.
        batch_size: number of windows predicted per batch.
        device: device the batches are moved to.
        scaler: optional fitted scaler with ``scale_`` and
            ``inverse_transform``, used to undo the scaling of y.

    Returns:
        xarray.Dataset with ``y_past``, ``nll``, ``y_pred``, ``y_pred_std``
        and ``y_true``, indexed by ``t_source`` and ``t_ahead``/``t_behind``.

    Note:
        The last axis of the targets is squeezed away, so this assumes a
        single target column.
    """
    load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
    freq = ds_test.t.freq

    # The relative time axes are the same for every batch
    t_ahead = pd.timedelta_range(0, periods=ds_test.window_future, freq=freq).values
    t_behind = pd.timedelta_range(end=-pd.Timedelta(freq), periods=ds_test.window_past, freq=freq)

    model.eval()
    xrs = []
    for i, batch in enumerate(tqdm(load_test, desc='predict')):
        with torch.no_grad():
            x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
            y_dist = model(x_past, y_past, x_future, y_future)
            nll = -y_dist.log_prob(y_future)

            # Convert to numpy
            mean = to_numpy(y_dist.loc.squeeze(-1))
            std = to_numpy(y_dist.scale.squeeze(-1))
            nll = to_numpy(nll.squeeze(-1))
            y_future = to_numpy(y_future.squeeze(-1))
            y_past = to_numpy(y_past.squeeze(-1))

        # Make an xarray.Dataset for the data
        bs = y_future.shape[0]
        # `i` counts batches, so the first window of this batch is
        # i * batch_size. Its source time is the first future row, which sits
        # window_past rows after the window start; t_ahead starts at 0 and
        # t_behind ends one step before it.
        first = i * batch_size + ds_test.window_past
        t_source = ds_test.t[first:first + bs].values
        xr_out = xr.Dataset(
            {
                # Format> name: ([dimensions,...], array),
                "y_past": (["t_source", "t_behind",], y_past),
                "nll": (["t_source", "t_ahead",], nll),
                "y_pred": (["t_source", "t_ahead",], mean),
                "y_pred_std": (["t_source", "t_ahead",], std),
                "y_true": (["t_source", "t_ahead",], y_future),
            },
            coords={"t_source": t_source, "t_ahead": t_ahead, "t_behind": t_behind},
        )
        xrs.append(xr_out)

    # Join all batches
    ds_preds = xr.concat(xrs, dim="t_source")

    # undo scaling on y
    if scaler is not None:
        ds_preds['y_pred_std'].values = ds_preds.y_pred_std * scaler.scale_
        ds_preds['y_past'].values = scaler.inverse_transform(ds_preds.y_past)
        ds_preds['y_pred'].values = scaler.inverse_transform(ds_preds.y_pred)
        ds_preds['y_true'].values = scaler.inverse_transform(ds_preds.y_true)

    # Add some derived coordinates, they will be the ones not in bold
    # The target time, is a function of the source time, and how far we predict ahead
    ds_preds = ds_preds.assign_coords(t_target=ds_preds.t_source+ds_preds.t_ahead)

    ds_preds = ds_preds.assign_coords(t_past=ds_preds.t_source+ds_preds.t_behind)

    # Some plots don't like timedeltas, so lets make a coordinate for time ahead in hours.
    # Dividing by a one-hour Timedelta gives true floats; multiplying a
    # timedelta by a float keeps the timedelta dtype and drops half hours.
    hours_ahead = (pd.to_timedelta(ds_preds.t_ahead.values) / pd.Timedelta(hours=1)).values
    ds_preds = ds_preds.assign_coords(t_ahead_hours=("t_ahead", hours_ahead))
    return ds_preds
|
||||
@@ -0,0 +1,10 @@
|
||||
from pathlib import Path
|
||||
import torch
|
||||
|
||||
project_dir = Path(__file__).parent.parent
|
||||
|
||||
def to_numpy(x):
    """Return a torch tensor as a numpy array; anything else is passed through unchanged."""
    if not isinstance(x, torch.Tensor):
        return x
    # Move to the CPU and drop the autograd graph before converting
    return x.cpu().detach().numpy()
|
||||
Reference in New Issue
Block a user