dataloading

This commit is contained in:
wassname
2020-10-18 13:12:09 +08:00
parent 7ab5c56bf2
commit 279ef54d86
11 changed files with 2497 additions and 2481 deletions
+1 -1
View File
@@ -51,7 +51,7 @@ lint:
## Set up python interpreter environment
create_environment:
@echo ">>> Detected conda, creating conda environment."
conda create --name $(PROJECT_NAME) python=3
conda create --name $(PROJECT_NAME) python=3.7
@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
## Test python environment is setup correctly
File diff suppressed because one or more lines are too long
+519
View File
@@ -0,0 +1,519 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.6.0
# kernelspec:
# display_name: seq2seq-time
# language: python
# name: seq2seq-time
# ---
# # Sequence to Sequence Models for Timeseries Regression
#
#
# In this notebook we are going to tackle a harder problem:
# - predicting the future on a timeseries
# - using an LSTM
# - with rough uncertainty (uncalibrated)
# - outputting a sequence of predictions
#
# <img src="../reports/figures/Seq2Seq for regression.png" />
#
#
#
# - [ ] TODO mike autocorrelation baseline
# - [ ] TODO mike acorn data
# OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
# %load_ext autoreload
# %autoreload 2
# %aimport -pandas
# %aimport -torch
# %aimport -numpy
# %aimport -matplotlib
# %aimport -dask
# %aimport -tqdm
# %matplotlib inline
# +
# Imports
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
import torch
import torch.utils.data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.auto import tqdm
import pytorch_lightning as pl
# -
from seq2seq_time.data.dataset import Seq2SeqDataSet
from seq2seq_time.predict import predict
import logging, sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# ## Parameters
# +
# Run on the GPU when torch can see one, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'using {device}')
# Column(s) the model learns to predict
columns_target=['energy(kWh/hh)']
# Window lengths in rows; at the 30 minute `freq` below, 48*4 rows is 4 days
window_past = 48*4
window_future = 48*4
# DataLoader settings
batch_size = 64
num_workers = 0
# Sampling interval as a pandas offset alias ('30T' is 30 minutes)
freq = '30T'
# Only the most recent `max_rows` rows are kept; this is a float, so it is cast with int() where used
max_rows = 1e5
# -
# ## Load data
# +
def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london')):
    """
    Load one household of the London smart meter data, with weather, holiday and time features.

    Data loading and cleaning is always messy, so understanding this code is optional.

    Args:
        indir: Folder containing `halfhourly_dataset/*.csv`, `informations_households.csv`,
            `weather_hourly_darksky.csv` and `uk_bank_holidays.csv`.

    Returns:
        DataFrame indexed by timestamp and sorted by time. Rows with a NaN or
        with a zero energy reading are dropped.
    """
    # Load csv files (only the first one, after sorting by name)
    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:1]

    # import pdb; pdb.set_trace() # you can use debugging in jupyter to interact with variables inside a function

    # concatenate them
    df = pd.concat([pd.read_csv(f, parse_dates=[1], na_values=['Null']) for f in csv_files])

    # Add ACORN categories
    df_households = pd.read_csv(indir/'informations_households.csv')
    df_households = df_households[['LCLid', 'stdorToU', 'Acorn_grouped']]
    df = pd.merge(df, df_households, on='LCLid')

    # Keep a single household: the first group when grouping by LCLid
    _, df = next(iter(df.groupby('LCLid')))
    df = df.set_index('tstp')
    print(df)

    # Load weather data
    df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
    use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
                'pressure', 'apparentTemperature', 'windSpeed',
                'humidity']
    df_weather = df_weather[use_cols].set_index('time')
    df_weather = df_weather.resample(freq).first().ffill()  # Resample to match energy data

    # Join weather and energy data (axis given by keyword, the positional form is deprecated)
    df = pd.concat([df, df_weather], axis=1).dropna()

    # Also find bank holidays
    df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
    holidays = set(df_hols['Bank holidays'].dt.round('D'))

    time = df.index.to_series()

    def is_holiday(dt):
        return dt.floor('D') in holidays

    df['holiday'] = time.apply(is_holiday).astype(int)

    # TODO pd.read_csv('../data/raw/smart-meters-in-london/acorn_details.csv', engine='python')

    # Add time features
    df["month"] = time.dt.month
    df['day'] = time.dt.day
    # `.dt.week` is deprecated; isocalendar() gives the same ISO week number.
    # It comes back as a nullable unsigned dtype, so cast it to a plain int column.
    df['week'] = time.dt.isocalendar().week.astype(int)
    df['hour'] = time.dt.hour
    df['minute'] = time.dt.minute
    df['dayofweek'] = time.dt.dayofweek

    # Drop nan and 0's
    df = df[df['energy(kWh/hh)']!=0]
    df = df.dropna()

    # sort by time
    df = df.sort_index()
    return df
# -
# Our dataset is the London smart meter data, at half-hour intervals
# +
# Load the cleaned frame for one household
df = get_smartmeter_df()
# df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean
# Keep only the most recent `max_rows` rows (max_rows is a float, hence the int())
df = df.tail(int(max_rows)).copy() # Just use last X rows
df
# -
df.describe()
# +
import sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
# Numeric inputs are every numeric column except the target(s)
columns_input_numeric = list(df.drop(columns=columns_target)._get_numeric_data().columns)
# Whatever is left is categorical. Keep the dataframe's own column order: a set() difference
# would order string names differently from one Python process to the next.
columns_categorical = [c for c in df.columns
                       if c not in columns_input_numeric and c not in columns_target]
output_scalers = [([n], StandardScaler()) for n in columns_target]
transformers=output_scalers + \
    [([n], StandardScaler()) for n in columns_input_numeric] + \
    [([n], OrdinalEncoder()) for n in columns_categorical]
scaler = DataFrameMapper(transformers, df_out=True)
df_norm = scaler.fit_transform(df)
df_norm
# -
# Look up the scaler of the target column by name; it is needed later to undo the scaling of predictions.
# (The mapper is called `scaler` in this notebook, `mapper4` was never defined.)
output_scaler = next(filter(lambda r:r[0][0] in columns_target, scaler.features))[-1]
output_scaler
# # Resample
# Put the rows on a regular `freq` grid so the index carries a frequency; rows missing from the grid become 0
df_norm = df_norm.resample(freq).first().fillna(0)
# +
# split data, with the test in the future
# Size the split from df_norm, the frame that is actually sliced (resampling can change the row count)
n_split = -int(len(df_norm)*0.2)
df_train = df_norm[:n_split]
df_test = df_norm[n_split:]
# Show split
df_train['energy(kWh/hh)'].plot(label='train')
df_test['energy(kWh/hh)'].plot(label='test')
plt.ylabel('energy(kWh/hh)')
plt.legend()
# -
df_norm
# Weather measurements: the dataset sets these columns to 0 in the future window
columns_blank=['visibility',
               'windBearing', 'temperature', 'dewPoint', 'pressure',
               'apparentTemperature', 'windSpeed', 'humidity']
# Wrap each split in a windowed dataset (target columns use the dataset's default)
ds_train = Seq2SeqDataSet(df_train,
                          window_past=window_past,
                          window_future=window_future,
                          columns_blank=columns_blank)
ds_test = Seq2SeqDataSet(df_test,
                         window_past=window_past,
                         window_future=window_future,
                         columns_blank=columns_blank)
print(ds_train)
print(ds_test)
# %%timeit
# Fetch the first 100 windows one by one (timed when the magic above is enabled)
for i in range(100):
    ds_train[i]
# we can treat it like an array
ds_train[0]
len(ds_train)
# Item 2 of a sample is x_future; this shows its second-to-last row
ds_train[0][2][-2]
# +
# We can get rows
# get_rows returns one window (here the one starting at row 10) as labelled DataFrames, for display
x_past, y_past, x_future, y_future = ds_train.get_rows(10)
# Plot one instance, this is what the model sees
y_past['energy(kWh/hh)'].plot(label='past')
y_future['energy(kWh/hh)'].plot(ax=plt.gca(), label='future')
plt.legend()
plt.ylabel('energy(kWh/hh)')
# Notice we've added on two new columns: tsp_days (time since present, in days) and is_past
x_past.tail()
# -
# Notice we've hidden some future columns to prevent cheating
x_future.tail()
# ## Model
# +
class Seq2SeqNet(nn.Module):
    """
    LSTM encoder-decoder that returns a Normal distribution for every future step.

    The encoder reads the past inputs concatenated with the past targets. Its final
    hidden and cell states initialise the decoder, which reads the future inputs.
    Two linear heads map the decoder outputs to a mean and to a clamped log std.
    """

    def __init__(self, input_size, input_size_decoder, output_size, hidden_size=32, lstm_layers=2, lstm_dropout=0, _min_std = 0.05):
        """
        Args:
            input_size: number of encoder input features (the targets are added on top)
            input_size_decoder: number of decoder input features
            output_size: number of target features
            hidden_size: LSTM hidden size, shared by encoder and decoder
            lstm_layers: number of stacked LSTM layers, shared by encoder and decoder
            lstm_dropout: dropout between LSTM layers
            _min_std: the predicted std is clamped to [_min_std, 1/_min_std]
        """
        super().__init__()
        self._min_std = _min_std

        # Both LSTMs share everything except the width of their input
        lstm_kwargs = dict(
            hidden_size=hidden_size,
            batch_first=True,
            num_layers=lstm_layers,
            dropout=lstm_dropout,
        )
        self.encoder = nn.LSTM(input_size=input_size + output_size, **lstm_kwargs)
        self.decoder = nn.LSTM(input_size=input_size_decoder, **lstm_kwargs)
        self.mean = nn.Linear(hidden_size, output_size)
        self.std = nn.Linear(hidden_size, output_size)

    def forward(self, context_x, context_y, target_x, target_y=None):
        """
        Encode the past, decode over the future inputs, return `torch.distributions.Normal`.

        `target_y` is accepted for interface symmetry but is not used.
        """
        encoder_input = torch.cat([context_x, context_y], -1)
        # The LSTM returns (output, (h_n, c_n)); only the final states are needed here.
        # With batch_first=True the output is [batch, seq len, hidden], while the
        # states stay [n layers, batch, hidden].
        _, state = self.encoder(encoder_input)

        # decoded: [batch, future seq len, hidden]
        decoded, _ = self.decoder(target_x, state)

        loc = self.mean(decoded)
        # Clamp in the log domain, symmetric around 0: std stays within [_min_std, 1/_min_std]
        log_floor = np.log(self._min_std)
        log_scale = torch.clamp(self.std(decoded), log_floor, -log_floor)
        return torch.distributions.Normal(loc, torch.exp(log_scale))
# -
# +
# Feature counts are read from the example frames returned by get_rows
input_size = x_past.shape[-1]
output_size = y_future.shape[-1]
# The same input width is passed for the encoder and the decoder
model = Seq2SeqNet(input_size, input_size, output_size,
                   hidden_size=32,
                   lstm_layers=2,
                   lstm_dropout=0).to(device)
model
# -
# Init the optimiser
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# +
# Smoke test: push random tensors shaped [batch, window, features] through the model
past_x = torch.rand((batch_size, window_past, input_size)).to(device)
future_x = torch.rand((batch_size, window_future, input_size)).to(device)
past_y = torch.rand((batch_size, window_past, output_size)).to(device)
future_y = torch.rand((batch_size, window_future, output_size)).to(device)
output = model(past_x, past_y, future_x, future_y)
print(output)
# Summarise the model on the same dummy inputs
from torchsummaryX import summary
summary(model, past_x, past_y, future_x, future_y )
# presumably a dummy last expression so the cell does not echo the summary's return value
1
# -
# ## Training
# +
def train_epoch(ds, model, bs=128):
    """
    Train `model` for one pass over `ds` and return the mean of the batch losses.

    Relies on the notebook globals `optimizer`, `device` and `num_workers`.

    Args:
        ds: dataset yielding (x_past, y_past, x_future, y_future)
        model: returns a distribution with `log_prob` when called on a batch
        bs: batch size
    """
    model.train()

    # Shuffled batches for training
    loader = torch.utils.data.dataloader.DataLoader(
        ds,
        batch_size=bs,
        pin_memory=False,
        num_workers=num_workers,
        shuffle=True,
    )

    batch_losses = []
    for batch in tqdm(loader, leave=False, desc='train'):
        # Move the four tensors to the training device
        x_past, y_past, x_future, y_future = (tensor.to(device) for tensor in batch)

        # Gradients accumulate in torch, so clear the previous step's first
        optimizer.zero_grad()

        # The loss is the mean negative log likelihood of the true future
        nll = -model(x_past, y_past, x_future, y_future).log_prob(y_future).mean()

        nll.backward()
        optimizer.step()

        batch_losses.append(nll.item())

    return np.mean(batch_losses)
def test_epoch(ds, model, bs=512):
    """
    Evaluate `model` on `ds` without gradients and return the mean of the batch losses.

    Relies on the notebook globals `device` and `num_workers`.

    Args:
        ds: dataset yielding (x_past, y_past, x_future, y_future)
        model: returns a distribution with `log_prob` when called on a batch
        bs: batch size
    """
    model.eval()

    # No shuffling here: the data is read in dataset order
    loader = torch.utils.data.dataloader.DataLoader(
        ds, batch_size=bs, pin_memory=False, num_workers=num_workers
    )

    batch_losses = []
    for batch in tqdm(loader, leave=False, desc='test'):
        x_past, y_past, x_future, y_future = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            # Mean negative log likelihood of the true future
            nll = -model(x_past, y_past, x_future, y_future).log_prob(y_future).mean()
        batch_losses.append(nll.item())

    return np.mean(batch_losses)
def training_loop(ds_train, ds_test, model, epochs=1, bs=128):
    """
    Train for `epochs` epochs, print and plot the losses, and return them.

    Args:
        ds_train: dataset used for training
        ds_test: dataset used for evaluation, before training and after every epoch
        model: the model to train
        epochs: number of passes over `ds_train`
        bs: training batch size (evaluation uses the default of `test_epoch`)

    Returns:
        Array of shape (finished epochs, 2) with columns [training loss, test loss].
        It has zero rows when no epoch finished.
    """
    all_losses = []
    try:
        test_loss = test_epoch(ds_test, model)
        print(f"Start: Test Loss = {test_loss:.2f}")
        for epoch in tqdm(range(epochs), desc='epochs'):
            loss = train_epoch(ds_train, model, bs=bs)
            print(f"Epoch {epoch+1}/{epochs}: Training Loss = {loss:.2f}")

            test_loss = test_epoch(ds_test, model)
            print(f"Epoch {epoch+1}/{epochs}: Test Loss = {test_loss:.2f}")
            print("-" * 50)

            all_losses.append([loss, test_loss])

    except KeyboardInterrupt:
        # This lets you stop manually. and still get the results
        pass

    # Visualising the results
    # reshape(-1, 2) keeps two columns even when the list is empty (epochs=0 or an
    # interrupt before the first epoch finished), so the column indexing below cannot fail
    all_losses = np.array(all_losses).reshape(-1, 2)
    plt.plot(all_losses[:, 0], label="Training")
    plt.plot(all_losses[:, 1], label="Test")
    plt.title("Loss")
    plt.legend()

    return all_losses
# -
# this might take 1 minute per epoch on a gpu
training_loop(ds_train, ds_test, model, epochs=8, bs=batch_size)
1
# ## Predict
#
# TODO get working
# Pick the target's fitted scaler by column name. The mapper keeps its (columns, transformer)
# pairs in `features`; indexing by position breaks as soon as the column list changes.
output_scaler = next(f[-1] for f in scaler.features if f[0][0] in columns_target)
ds_preds = predict(model, ds_test, batch_size*6, device=device, scaler=output_scaler)
# +
# TODO Metrics... smape etc
# +
def plot_prediction(ds_preds, i):
    """
    Plot a prediction into the future, at a single point in time.

    Args:
        ds_preds: the dataset returned by `predict`
        i: integer position along `t_source` of the prediction to draw
    """
    sample = ds_preds.isel(t_source=i)

    # Pull out the arrays we draw
    t_future = sample.t_target
    pred = sample.y_pred
    pred_std = sample.y_pred_std
    truth = sample.y_true
    t_now = sample.t_source.squeeze()

    # Prediction: mean line with a band of two standard deviations either side
    lower = pred - 2*pred_std
    upper = pred + 2*pred_std
    plt.fill_between(t_future, lower, upper, alpha=0.25,
                     facecolor="b",
                     interpolate=True,
                     label="2 std",)
    plt.plot(t_future, pred, label='pred', c='b')

    # Observed values, before and after the present
    plt.scatter(sample.t_past, sample.y_past, c='k', s=6)
    plt.scatter(t_future, truth, label='true', c='k', s=6)

    # Red line marking the present
    plt.vlines(x=t_now, ymin=0, ymax=1, label='now', color='r')

    stamp = pd.Timestamp(t_now.values)
    plt.title(f'Prediction NLL={sample.nll.mean().item():2.2g}')
    plt.xlabel(f'{stamp.date()}')
    plt.ylabel('energy(kWh/hh)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()
# plot_prediction(ds_preds, 0)
# plot_prediction(ds_preds, 12) # 6 hours later
plot_prediction(ds_preds, 24) # 12 hours later
plot_prediction(ds_preds, 48) # 24 hours later
# -
# ## Error vs time ahead
# +
# Average over the source times, which leaves one value per step ahead
d = ds_preds.mean('t_source') # Mean over all predictions
# Plot with xarray, it has a pandas like interface
d.plot.scatter('t_ahead_hours', 'nll')
# Tidy the graph
n = len(ds_preds.t_source)
plt.ylabel('Negative Log Likelihood (lower is better)')
plt.xlabel('Hours ahead')
plt.title(f'NLL vs time (no. samples={n})')
# -
d = ds_preds.mean('t_source') # Mean over all predictions
# exp(-mean NLL) is the geometric mean of the individual likelihoods
d['likelihood'] = np.exp(-d.nll) # get likelihood, after taking mean in log domain
d.plot.scatter('t_ahead_hours', 'likelihood')
# Make a plot of the NLL over time. Does this solution get worse with time?
# this is hard because we need to take the mean over t_ahead
# then group by t_source
d = ds_preds.mean('t_ahead').groupby('t_source').mean()
# And even then it's clearer with smoothing
d.plot.scatter('t_source', 'nll')
plt.xticks(rotation=45)
plt.title('NLL over time (lower is better)')
1
# A scatter plot is easy with xarray
ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
+221
View File
@@ -0,0 +1,221 @@
name: seq2seq-time
channels:
- pytorch
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_gnu
- absl-py=0.10.0=py37hc8dfbb8_1
- aiohttp=3.6.3=py37h7b6447c_0
- appdirs=1.4.4=py_0
- argon2-cffi=20.1.0=py37h8f50634_2
- async-timeout=3.0.1=py_1000
- async_generator=1.10=py_0
- attrs=20.2.0=pyh9f0ad1d_0
- awscli=1.18.159=py37hc8dfbb8_0
- backcall=0.2.0=pyh9f0ad1d_0
- backports=1.0=py_2
- backports.functools_lru_cache=1.6.1=py_0
- black=20.8b1=py_1
- blas=1.0=mkl
- bleach=3.2.1=pyh9f0ad1d_0
- blinker=1.4=py_1
- botocore=1.18.18=pyh9f0ad1d_0
- brotlipy=0.7.0=py37hb5d75c8_1001
- c-ares=1.16.1=h516909a_3
- ca-certificates=2020.10.14=0
- cachetools=4.1.1=py_0
- certifi=2020.6.20=py37he5f6b98_2
- cffi=1.14.3=py37h00ebd2e_1
- chardet=3.0.4=py37he5f6b98_1008
- click=7.1.2=pyh9f0ad1d_0
- colorama=0.4.3=py_0
- cryptography=3.1.1=py37hff6837a_1
- cudatoolkit=10.2.89=hfd86e86_1
- cycler=0.10.0=py_2
- dataclasses=0.7=py37_0
- dbus=1.13.18=hb2f20db_0
- decorator=4.4.2=py_0
- defusedxml=0.6.0=py_0
- docutils=0.15.2=py37_0
- entrypoints=0.3=py37hc8dfbb8_1002
- expat=2.2.10=he6710b0_2
- fontconfig=2.13.1=h1056068_1002
- freetype=2.10.3=he06d7ca_0
- fsspec=0.8.4=py_0
- future=0.18.2=py37hc8dfbb8_2
- gettext=0.19.8.1=hf34092f_1003
- glib=2.66.1=he1b5a44_1
- google-auth=1.22.1=py_0
- google-auth-oauthlib=0.4.1=py_2
- grpcio=1.31.0=py37hb0870dc_0
- gst-plugins-base=1.14.5=h0935bb2_2
- gstreamer=1.14.5=h36ae1b5_2
- icu=67.1=he1b5a44_0
- idna=2.10=pyh9f0ad1d_0
- importlib-metadata=2.0.0=py37hc8dfbb8_0
- importlib_metadata=2.0.0=1
- iniconfig=1.1.1=py_0
- intel-openmp=2020.2=254
- ipykernel=5.3.4=py37hc6149b9_1
- ipython=7.18.1=py37hc6149b9_1
- ipython_genutils=0.2.0=py_1
- ipywidgets=7.5.1=pyh9f0ad1d_1
- jedi=0.17.2=py37hc8dfbb8_1
- jinja2=2.11.2=pyh9f0ad1d_0
- jmespath=0.10.0=pyh9f0ad1d_0
- joblib=0.17.0=py_0
- jpeg=9d=h516909a_0
- jsonschema=3.2.0=py37hc8dfbb8_1
- jupyter_client=6.1.7=py_0
- jupyter_core=4.6.3=py37hc8dfbb8_2
- jupyterlab_pygments=0.1.2=pyh9f0ad1d_0
- kiwisolver=1.2.0=py37h99015e2_1
- krb5=1.17.1=hfafb76e_3
- lcms2=2.11=hbd6801e_0
- ld_impl_linux-64=2.35=h769bd43_9
- libblas=3.8.0=17_openblas
- libcblas=3.8.0=17_openblas
- libclang=10.0.1=default_hde54327_1
- libedit=3.1.20191231=he28a2e2_2
- libevent=2.1.10=hcdb4288_3
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.3.0=h5dbcf3e_17
- libgfortran-ng=7.5.0=hae1eefd_17
- libgfortran4=7.5.0=hae1eefd_17
- libglib=2.66.1=h0dae87d_1
- libgomp=9.3.0=h5dbcf3e_17
- libiconv=1.16=h516909a_0
- liblapack=3.8.0=17_openblas
- libllvm10=10.0.1=he513fc3_3
- libopenblas=0.3.10=pthreads_hb3c22a3_5
- libpng=1.6.37=hed695b0_2
- libpq=12.3=h1281834_2
- libprotobuf=3.13.0.1=h8b12597_0
- libsodium=1.0.18=h516909a_1
- libstdcxx-ng=9.3.0=h2ae2ef3_17
- libtiff=4.1.0=hc7e4089_6
- libuuid=2.32.1=h14c3975_1000
- libwebp-base=1.1.0=h516909a_3
- libxcb=1.14=h7b6447c_0
- libxkbcommon=0.10.0=he1b5a44_0
- libxml2=2.9.10=h68273f3_2
- lz4-c=1.9.2=he1b5a44_3
- markdown=3.3.1=pyh9f0ad1d_0
- markupsafe=1.1.1=py37hb5d75c8_2
- matplotlib=3.3.2=py37hc8dfbb8_1
- matplotlib-base=3.3.2=py37hc9afd2a_1
- mccabe=0.6.1=py_1
- mistune=0.8.4=py37h8f50634_1002
- mkl=2020.2=256
- more-itertools=8.5.0=py_0
- multidict=4.7.6=py37h7b6447c_1
- mypy=0.790=py_0
- mypy_extensions=0.4.3=py37hc8dfbb8_1
- mysql-common=8.0.21=2
- mysql-libs=8.0.21=hf3661c5_2
- nbclient=0.5.1=py_0
- nbconvert=6.0.7=py37hc8dfbb8_1
- nbformat=5.0.8=py_0
- ncurses=6.2=he1b5a44_2
- nest-asyncio=1.4.1=py_0
- ninja=1.10.1=hfc4b9b4_2
- notebook=6.1.4=py37hc8dfbb8_1
- nspr=4.29=he1b5a44_1
- nss=3.58=h27285de_1
- numpy=1.19.2=py37h7ea13bd_1
- oauthlib=3.1.0=py_0
- olefile=0.46=pyh9f0ad1d_1
- openssl=1.1.1h=h516909a_0
- packaging=20.4=pyh9f0ad1d_0
- pandas=1.1.3=py37h9fdb41a_2
- pandoc=2.11.0.2=hd18ef5c_0
- pandocfilters=1.4.2=py_1
- parso=0.7.1=pyh9f0ad1d_0
- pathspec=0.8.0=pyh9f0ad1d_0
- pcre=8.44=he1b5a44_0
- pexpect=4.8.0=py37hc8dfbb8_1
- pickleshare=0.7.5=py37hc8dfbb8_1002
- pillow=8.0.0=py37h718be6c_0
- pip=20.2.4=py_0
- pluggy=0.13.1=py37hc8dfbb8_3
- prometheus_client=0.8.0=pyh9f0ad1d_0
- prompt-toolkit=3.0.8=py_0
- protobuf=3.13.0.1=py37h3340039_1
- psutil=5.7.2=py37hb5d75c8_1
- ptyprocess=0.6.0=py37_1000
- py=1.9.0=pyh9f0ad1d_0
- pyasn1=0.4.8=py_0
- pyasn1-modules=0.2.8=py_0
- pycodestyle=2.6.0=pyh9f0ad1d_0
- pycparser=2.20=pyh9f0ad1d_2
- pydocstyle=5.1.1=py_0
- pyflakes=2.2.0=pyh9f0ad1d_0
- pygments=2.7.1=py_0
- pyjwt=1.7.1=py_0
- pylama=7.7.1=py_0
- pyopenssl=19.1.0=py37_0
- pyparsing=2.4.7=pyh9f0ad1d_0
- pyqt=5.12.3=py37h8685d9f_4
- pyrsistent=0.17.3=py37h8f50634_1
- pysocks=1.7.1=py37he5f6b98_2
- pytest=6.1.1=py37hc8dfbb8_1
- python=3.7.8=h425cb1d_1_cpython
- python-dateutil=2.8.1=py_0
- python_abi=3.7=1_cp37m
- pytorch=1.6.0=py3.7_cuda10.2.89_cudnn7.6.5_0
- pytorch-lightning=1.0.2=py_0
- pytz=2020.1=pyh9f0ad1d_0
- pyyaml=5.3.1=py37hb5d75c8_1
- pyzmq=19.0.2=py37hac76be4_2
- qt=5.12.9=h1f2b2cb_0
- readline=8.0=he28a2e2_2
- regex=2020.10.15=py37h8f50634_0
- requests=2.24.0=pyh9f0ad1d_0
- requests-oauthlib=1.3.0=pyh9f0ad1d_0
- rsa=4.4.1=pyh9f0ad1d_0
- s3transfer=0.3.3=py37hc8dfbb8_2
- scikit-learn=0.23.2=py37h6785257_0
- scipy=1.5.2=py37hb14ef9d_2
- send2trash=1.5.0=py_0
- setuptools=49.6.0=py37he5f6b98_2
- six=1.15.0=pyh9f0ad1d_0
- snowballstemmer=2.0.0=py_0
- sqlite=3.33.0=h4cf870e_1
- tensorboard=2.3.0=py_0
- tensorboard-plugin-wit=1.6.0=pyh9f0ad1d_0
- terminado=0.9.1=py37hc8dfbb8_1
- testpath=0.4.4=py_0
- threadpoolctl=2.1.0=pyh5ca1d4c_0
- tk=8.6.10=hed695b0_1
- toml=0.10.1=pyh9f0ad1d_0
- torchvision=0.7.0=py37_cu102
- tornado=6.0.4=py37h8f50634_2
- tqdm=4.50.2=pyh9f0ad1d_0
- traitlets=5.0.5=py_0
- typed-ast=1.4.1=py37h516909a_0
- typing-extensions=3.7.4.3=0
- typing_extensions=3.7.4.3=py_0
- urllib3=1.25.10=py_0
- wcwidth=0.2.5=pyh9f0ad1d_2
- webencodings=0.5.1=py_1
- werkzeug=1.0.1=pyh9f0ad1d_0
- wheel=0.35.1=pyh9f0ad1d_0
- widgetsnbextension=3.5.1=py37hc8dfbb8_2
- xarray=0.16.1=py_0
- xz=5.2.5=h516909a_1
- yaml=0.2.5=h516909a_0
- yapf=0.30.0=pyh9f0ad1d_0
- yarl=1.6.2=py37h8f50634_0
- zeromq=4.3.3=he1b5a44_2
- zipp=3.3.1=py_0
- zlib=1.2.11=h516909a_1010
- zstd=1.4.5=h6597ccf_2
- pip:
- pyqt5-sip==4.19.18
- pyqtchart==5.12
- pyqtwebengine==5.12.1
- sklearn-pandas==2.0.2
- torchsummaryx==1.3.0
prefix: /home/wassname/anaconda/envs/seq2seq-time
+26
View File
@@ -0,0 +1,26 @@
name: seq2seq-time
channels:
- conda-forge
- defaults
dependencies:
- python==3.7
- pip
- awscli
- ipykernel
- tqdm
- xarray
- pandas
- pytorch
- torchvision
- cudatoolkit==10.2
- black
- pylama
- mypy
- pytest
- numpy
- matplotlib
- scikit-learn
- pytorch-lightning
- yapf
- ipywidgets
prefix: /home/wassname/anaconda/envs/seq2seq-time
+1 -1
View File
@@ -11,5 +11,5 @@ dependencies:
- awscli
- pip:
# local package
- -e .
# - -e .
+10
View File
@@ -4,3 +4,13 @@ This project has multiple ways of documenting requirements
- environment.min.yaml - This is the minimum requirements, use it to install a new test or dev environment
- environment.max.yaml - This pins all conda packages, use for production or finding vulnerabilities
- requirements.txt - For people or bots not using conda
```
# Install requirements
conda create --name seq2seq-time python=3.7 -f ./requirements/environment.yaml
conda activate seq2seq-time
# Install this package in editable mode
python -m pip install -e .
# Install kernel
python -m ipykernel install --user --name seq2seq-time --display-name seq2seq-time
```
View File
+100
View File
@@ -0,0 +1,100 @@
import pandas as pd
import torch.utils.data
import numpy as np
def assert_normalized(df):
    """
    Check that the columns summarised by `df.describe()` have mean ~0 and std ~1.

    Raises:
        AssertionError: when a mean is more than about 0.1 away from 0, or a
            standard deviation is more than about 0.1 away from 1.
    """
    stats = df.describe().T
    # The message has to go in as err_msg. Written after a comma it only builds a
    # tuple that is thrown away, and the message never reaches the error.
    np.testing.assert_allclose(stats['mean'].values, 0, atol=0.1,
                               err_msg='means should be normalized to ~0')
    np.testing.assert_allclose(stats['std'].values, 1, atol=0.1,
                               err_msg='standard deviations should be normalized to ~1')
def assert_no_objects(df):
    """
    Check that no column of `df` has the `object` dtype.

    Raises:
        AssertionError: naming the first column found with dtype `object`.
    """
    # Series.items() replaces iteritems(), which is deprecated and gone in pandas 2
    for name, dtype in df.dtypes.items():
        assert dtype.name!='object', f'all objects should be pd.categories. {name} is not'
class Seq2SeqDataSet(torch.utils.data.Dataset):
    """
    Takes in dataframe and returns sequences through time.

    Every item is a window of `window_past + window_future` consecutive rows, split
    at the "present" and returned as x_past, y_past, x_future, y_future.
    """

    def __init__(self, df: pd.DataFrame, window_past=40, window_future=10, columns_target=None, columns_blank=None,):
        """
        Args:
        - df: DataFrame with time index, already scaled
        - window_past: number of rows before the present in each window
        - window_future: number of rows from the present onwards in each window
        - columns_target: The columns to predict, defaults to ['energy(kWh/hh)']
        - columns_blank: The columns we will blank, in the future. Defaults to none
        """
        super().__init__()
        # None stands in for the defaults so that no list object is shared between instances
        columns_target = ['energy(kWh/hh)'] if columns_target is None else list(columns_target)
        columns_blank = [] if columns_blank is None else list(columns_blank)

        # TODO auto categorical columns
        # TODO specify blank future columns
        assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
        assert df.index.freq is not None, 'should have freq'
        # assert_normalized(df)
        assert_no_objects(df)

        # Use numpy instead of pandas, for speed
        df_x = df.drop(columns=columns_target)
        self.x = df_x.copy().values
        self.y = df[columns_target].copy().values
        self.t = df.index.copy()
        self.columns = list(df.columns)
        # Positions, within the input columns, of the columns to blank in the future
        x_names = df_x.columns.tolist()
        self.icol_blank = [x_names.index(n) for n in columns_blank]
        self.window_past = window_past
        self.window_future = window_future
        self.columns_target = columns_target

    def get_components(self, i):
        """
        Get past and future rows of the window starting at row `i`, as numpy arrays.

        Two features are appended to the inputs: the time since the present in
        days (negative in the past) and an is_past flag.
        """
        window = slice(i, i + self.window_past + self.window_future)
        x = self.x[window].copy()
        y = self.y[window].copy()

        # Timestamps as integer nanoseconds. Going through numpy avoids
        # DatetimeIndex.astype(int), which is deprecated.
        t_ns = self.t[window].values.astype('datetime64[ns]').astype('int64')
        now_ns = t_ns[self.window_past]

        # Add features: relative days since present time, is past
        tstp = ((t_ns - now_ns) * 1e-9 / 60 / 60 / 24)[:, None]
        is_past = tstp < 0
        x = np.concatenate([x, tstp, is_past], -1)

        # Split into future and past
        x_past = x[:self.window_past]
        y_past = y[:self.window_past]
        x_future = x[self.window_past:]
        y_future = y[self.window_past:]

        # Stop it cheating by using future weather measurements
        x_future[:, self.icol_blank] = 0
        return x_past, y_past, x_future, y_future

    def __getitem__(self, i):
        """
        This is how python implements square brackets.

        Returns the window as a list of float32 arrays.

        Raises:
            IndexError: when `i` is outside [-len(self), len(self)).
        """
        n = len(self)
        if i < 0:
            # Handle negative integers
            i = n + i
        if not 0 <= i < n:
            # Without this, out of range indices silently return wrong or truncated windows
            raise IndexError(f'index out of range for {type(self).__name__} of length {n}')
        data = self.get_components(i)
        # From numpy to float32, which is what torch expects
        return [d.astype(np.float32) for d in data]

    def get_rows(self, i):
        """
        Output pandas dataframes for display purposes.
        """
        # Input columns are every column that is not a target, wherever the targets sit in the frame
        x_cols = [c for c in self.columns if c not in self.columns_target] + ['tsp_days', 'is_past']
        x_past, y_past, x_future, y_future = self.get_components(i)
        t_past = self.t[i:i+self.window_past]
        t_future = self.t[i+self.window_past:i+self.window_past + self.window_future]
        x_past = pd.DataFrame(x_past, columns=x_cols, index=t_past)
        x_future = pd.DataFrame(x_future, columns=x_cols, index=t_future)
        y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
        y_future = pd.DataFrame(y_future, columns=self.columns_target, index=t_future)
        return x_past, y_past, x_future, y_future

    def __len__(self):
        # Never negative, len() raises on a negative value when the data is shorter than one window.
        # NOTE(review): a window starting at row len(self) would still fit in the data;
        # the count is left as it was so dataset sizes do not change.
        return max(0, len(self.x) - (self.window_past + self.window_future))

    def __repr__(self):
        return f'<{type(self).__name__}(shape={self.x.shape}, times={self.t[0]} to {self.t[-1]} at {self.t.freq.freqstr})>'
+72
View File
@@ -0,0 +1,72 @@
import xarray as xr
import torch
from tqdm.auto import tqdm
import pandas as pd
from .util import to_numpy
def predict(model, ds_test, batch_size, device='cpu', scaler=None):
    """
    Gather all predictions into xarray.

    When we generate prediction in a sequence to sequence model we start at a time then predict
    N steps into the future. So we have 2 dimensions: source time, target time.

    But we also care about how far we were predicting into the future, so we have 3 dimensions: source time, target time, time ahead.

    It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.

    Args:
        model: called as model(x_past, y_past, x_future, y_future), returns a distribution
            with `loc`, `scale` and `log_prob`
        ds_test: dataset with attributes `t` (DatetimeIndex with a freq), `window_past`
            and `window_future`, read in order
        batch_size: batch size of the DataLoader
        device: device the batches are moved to
        scaler: optional fitted scaler of the target (uses `scale_` and `inverse_transform`);
            when given, y values and the predicted std are returned in the original units

    Returns:
        xarray.Dataset with variables y_past, nll, y_pred, y_pred_std, y_true and
        dimensions t_source, t_ahead, t_behind. The last axis of the targets is
        squeezed, so a single target column is assumed.
    """
    load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
    freq = ds_test.t.freq

    # The offsets relative to the source time are the same for every batch
    t_ahead = pd.timedelta_range(0, periods=ds_test.window_future, freq=freq).values
    t_behind = pd.timedelta_range(end=-pd.Timedelta(freq), periods=ds_test.window_past, freq=freq)

    model.eval()
    xrs = []
    n_seen = 0  # number of samples already processed, i.e. dataset index of the batch's first sample
    for batch in tqdm(load_test, desc='predict'):
        with torch.no_grad():
            x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
            y_dist = model(x_past, y_past, x_future, y_future)
            nll = -y_dist.log_prob(y_future)

            # Convert to numpy
            mean = to_numpy(y_dist.loc.squeeze(-1))
            std = to_numpy(y_dist.scale.squeeze(-1))
            nll = to_numpy(nll.squeeze(-1))
            y_future = to_numpy(y_future.squeeze(-1))
            y_past = to_numpy(y_past.squeeze(-1))

        # Make an xarray.Dataset for the data
        bs = y_future.shape[0]
        # Sample j covers rows j..j+window_past+window_future, so its "now" (the first
        # future row) is row j + window_past. Index by sample offset, not by batch number.
        first_now = n_seen + ds_test.window_past
        t_source = ds_test.t[first_now:first_now + bs].values
        n_seen += bs

        xr_out = xr.Dataset(
            {
                # Format> name: ([dimensions,...], array),
                "y_past": (["t_source", "t_behind",], y_past),
                "nll": (["t_source", "t_ahead",], nll),
                "y_pred": (["t_source", "t_ahead",], mean),
                "y_pred_std": (["t_source", "t_ahead",], std),
                "y_true": (["t_source", "t_ahead",], y_future),
            },
            coords={"t_source": t_source, "t_ahead": t_ahead, "t_behind": t_behind},
        )
        xrs.append(xr_out)

    # Join all batches
    ds_preds = xr.concat(xrs, dim="t_source")

    # undo scaling on y
    if scaler is not None:
        def _unscale(name):
            # The scaler was fitted on one column, so feed it a single column and restore the shape after
            values = ds_preds[name].values
            return scaler.inverse_transform(values.reshape(-1, 1)).reshape(values.shape)

        # A std only needs the scale factor, not the offset
        ds_preds['y_pred_std'].values = ds_preds.y_pred_std.values * scaler.scale_
        ds_preds['y_past'].values = _unscale('y_past')
        ds_preds['y_pred'].values = _unscale('y_pred')
        ds_preds['y_true'].values = _unscale('y_true')

    # Add some derived coordinates, they will be the ones not in bold
    # The target time, is a function of the source time, and how far we predict ahead
    ds_preds = ds_preds.assign_coords(t_target=ds_preds.t_source+ds_preds.t_ahead)
    ds_preds = ds_preds.assign_coords(t_past=ds_preds.t_source+ds_preds.t_behind)

    # Some plots don't like timedeltas, so lets make a coordinate for time ahead in hours.
    # Convert through total_seconds(): multiplying or dividing a timedelta64 by plain numbers
    # keeps it a timedelta64 of whole nanoseconds, which rounds half hours away.
    t_ahead_hours = pd.to_timedelta(ds_preds.t_ahead.values).total_seconds() / 3600
    ds_preds = ds_preds.assign_coords(t_ahead_hours=("t_ahead", t_ahead_hours.to_numpy()))
    return ds_preds
+10
View File
@@ -0,0 +1,10 @@
from pathlib import Path
import torch
project_dir = Path(__file__).parent.parent
def to_numpy(x):
    """Return a torch tensor as a numpy array on the cpu; anything else is returned unchanged."""
    if not isinstance(x, torch.Tensor):
        return x
    # Detach from the autograd graph and move to host memory before converting
    return x.detach().cpu().numpy()