dataloading

2026-06-27 19:16:40 +08:00 · 2020-10-18 13:12:09 +08:00
parent 7ab5c56bf2
commit 279ef54d86
11 changed files with 2497 additions and 2481 deletions
@@ -51,7 +51,7 @@ lint:
 ## Set up python interpreter environment
 create_environment:
 	@echo ">>> Detected conda, creating conda environment."
-	conda create --name $(PROJECT_NAME) python=3
+	conda create --name $(PROJECT_NAME) python=3.7
 	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"

 ## Test python environment is setup correctly
@@ -0,0 +1,519 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:light
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.6.0
+#   kernelspec:
+#     display_name: seq2seq-time
+#     language: python
+#     name: seq2seq-time
+# ---
+
+# # Sequence to Sequence Models for Timeseries Regression
+#
+#
+# In this notebook we are going to tackle a harder problem: 
+# - predicting the future on a timeseries
+# - using an LSTM
+# - with rough uncertainty (uncalibrated)
+# - outputting a sequence of predictions
+#
+# <img src="../reports/figures/Seq2Seq for regression.png" />
+#
+#
+
+#
+# - [ ] TODO mike autocorrelation baseline
+# - [ ] TODO mike acorn data
+
+# OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
+# %load_ext autoreload
+# %autoreload 2
+# %aimport -pandas
+# %aimport -torch
+# %aimport -numpy
+# %aimport -matplotlib
+# %aimport -dask
+# %aimport -tqdm
+# %matplotlib inline
+
+# +
+# Imports
+import torch
+from torch import nn, optim
+from torch.nn import functional as F
+from torch.autograd import Variable
+import torch
+import torch.utils.data
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+from pathlib import Path
+from tqdm.auto import tqdm
+
+import pytorch_lightning as pl
+# -
+
+from seq2seq_time.data.dataset import Seq2SeqDataSet
+from seq2seq_time.predict import predict
+
+import logging, sys
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# ## Parameters
+
+# +
+# Run on the GPU when one is available, otherwise fall back to the CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f'using {device}')
+
+# Column(s) the model is trained to predict
+columns_target=['energy(kWh/hh)']
+# Window lengths in rows; at the 30 minute `freq` below, 48 rows is one day, so 48*4 is four days
+window_past = 48*4
+window_future = 48*4
+batch_size = 64
+# Number of DataLoader worker processes
+num_workers = 0
+# Sampling interval used when resampling (pandas offset alias for 30 minutes)
+freq = '30T'
+# Written as a float; it is cast with int() where it is used to take the last rows
+max_rows = 1e5
+
+
+# -
+
+# ## Load data
+
+# +
+
+def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london')):
+    """
+    Data loading and cleanding is always messy, so understand this code is optional.
+    """
+    
+    # Load csv files
+    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:1]
+    
+#     import pdb; pdb.set_trace() # you can use debugging in jupyter to interact with variables inside a function
+    
+    # concatendate them
+    df = pd.concat([pd.read_csv(f, parse_dates=[1], na_values=['Null']) for f in csv_files])
+    
+    # Add ACORN categories
+    df_households = pd.read_csv(indir/'informations_households.csv')
+    df_households = df_households[['LCLid', 'stdorToU', 'Acorn_grouped']]
+    df = pd.merge(df, df_households, on='LCLid')
+
+    # Take the mean over all houses
+    name, df = next(iter(df.groupby('LCLid')))
+    df = df.set_index('tstp')
+    print(df)
+
+    # Load weather data
+    df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
+    use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
+           'pressure', 'apparentTemperature', 'windSpeed', 
+           'humidity']
+    df_weather = df_weather[use_cols].set_index('time')
+    df_weather = df_weather.resample(freq).first().ffill()  # Resample to match energy data    
+
+    # Join weather and energy data
+    df = pd.concat([df, df_weather], 1).dropna()    
+    
+    # Also find bank holidays
+    df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
+    holidays = set(df_hols['Bank holidays'].dt.round('D'))  
+
+    time = df.index.to_series()
+    def is_holiday(dt):
+        return dt.floor('D') in holidays
+    df['holiday'] = time.apply(is_holiday).astype(int)
+    
+    # TODO pd.read_csv('../data/raw/smart-meters-in-london/acorn_details.csv', engine='python')
+
+
+    # Add time features    
+    df["month"] = time.dt.month
+    df['day'] = time.dt.day
+    df['week'] = time.dt.week
+    df['hour'] = time.dt.hour
+    df['minute'] = time.dt.minute
+    df['dayofweek'] = time.dt.dayofweek
+
+    # Drop nan and 0's
+    df = df[df['energy(kWh/hh)']!=0]
+    df = df.dropna()
+
+    # sort by time
+    df = df.sort_index()
+    
+    return df
+# -
+# Our dataset is the London smart meter data, recorded at half-hour intervals
+
+# +
+df = get_smartmeter_df()
+
+# df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean
+
+df = df.tail(int(max_rows)).copy() # Just use last X rows
+df
+# -
+
+df.describe()
+
+# +
+import sklearn
+from sklearn.preprocessing import StandardScaler, OrdinalEncoder
+from sklearn_pandas import DataFrameMapper
+
+columns_input_numeric = list(df.drop(columns=columns_target)._get_numeric_data().columns)
+columns_categorical = list(set(df.columns)-set(columns_input_numeric)-set(columns_target))
+
+output_scalers = [([n], StandardScaler()) for n in columns_target]
+transformers=output_scalers + \
+[([n], StandardScaler()) for n in columns_input_numeric] + \
+[([n], OrdinalEncoder()) for n in columns_categorical]
+scaler = DataFrameMapper(transformers, df_out=True)
+df_norm = scaler.fit_transform(df)
+df_norm
+# -
+
+output_scaler = next(filter(lambda r:r[0][0] in columns_target, mapper4.features))[-1]
+output_scaler
+
+# # Resample
+df_norm = df_norm.resample(freq).first().fillna(0)
+
+# +
+# split data, with the test in the future
+n_split = -int(len(df)*0.2)
+df_train = df_norm[:n_split]
+df_test = df_norm[n_split:]
+
+# Show split
+df_train['energy(kWh/hh)'].plot(label='train')
+df_test['energy(kWh/hh)'].plot(label='test')
+plt.ylabel('energy(kWh/hh)')
+plt.legend()
+# -
+df_norm
+
+
+columns_blank=['visibility',
+       'windBearing', 'temperature', 'dewPoint', 'pressure',
+       'apparentTemperature', 'windSpeed', 'humidity']
+
+ds_train = Seq2SeqDataSet(df_train,
+                          window_past=window_past,
+                          window_future=window_future,
+                          columns_blank=columns_blank)
+ds_test = Seq2SeqDataSet(df_test,
+                         window_past=window_past,
+                         window_future=window_future,
+                         columns_blank=columns_blank)
+print(ds_train)
+print(ds_test)
+
+# %%timeit
+for i in range(100):
+    ds_train[i]
+
+# we can treat it like an array
+ds_train[0]
+len(ds_train)
+ds_train[0][2][-2]
+
+# +
+# We can get rows
+x_past, y_past, x_future, y_future = ds_train.get_rows(10)
+
+# Plot one instance, this is what the model sees
+y_past['energy(kWh/hh)'].plot(label='past')
+y_future['energy(kWh/hh)'].plot(ax=plt.gca(), label='future')
+plt.legend()
+plt.ylabel('energy(kWh/hh)')
+
+# Notice we've added on two new columns tsp (time since present) and is_past
+x_past.tail()
+# -
+
+# Notice we've hidden some future columns to prevent cheating
+x_future.tail()
+
+
+# ## Model
+
+# +
+
+class Seq2SeqNet(nn.Module):
+    def __init__(self, input_size, input_size_decoder, output_size, hidden_size=32, lstm_layers=2, lstm_dropout=0, _min_std = 0.05):
+        super().__init__()
+        self._min_std = _min_std
+
+        self.encoder = nn.LSTM(
+            input_size=input_size + output_size,
+            hidden_size=hidden_size,
+            batch_first=True,
+            num_layers=lstm_layers,
+            dropout=lstm_dropout,
+        )
+        self.decoder = nn.LSTM(
+            input_size=input_size_decoder,
+            hidden_size=hidden_size,
+            batch_first=True,
+            num_layers=lstm_layers,
+            dropout=lstm_dropout,
+        )
+        self.mean = nn.Linear(hidden_size, output_size)
+        self.std = nn.Linear(hidden_size, output_size)
+
+    def forward(self, context_x, context_y, target_x, target_y=None):
+        x = torch.cat([context_x, context_y], -1)
+        _, (h_out, cell) = self.encoder(x)
+        
+        ## Shape
+        # hidden = [batch size, n layers * n directions, hid dim]
+        # cell = [batch size, n layers * n directions, hid dim]
+        # output = [batch size, seq len, hid dim * n directions]
+        outputs, (_, _) = self.decoder(target_x, (h_out, cell))
+        
+        
+        # outputs: [B, T, num_direction * H]
+        mean = self.mean(outputs)
+        log_sigma = self.std(outputs)
+        log_sigma = torch.clamp(log_sigma, np.log(self._min_std), -np.log(self._min_std))
+
+        sigma = torch.exp(log_sigma)
+        y_dist = torch.distributions.Normal(mean, sigma)
+        return y_dist
+
+
+# -
+
+
+
+# +
+input_size = x_past.shape[-1]
+output_size = y_future.shape[-1]
+
+model = Seq2SeqNet(input_size, input_size, output_size,
+                   hidden_size=32, 
+                   lstm_layers=2, 
+                   lstm_dropout=0).to(device)
+model
+# -
+# Init the optimiser
+optimizer = optim.Adam(model.parameters(), lr=1e-3)
+
+# +
+
+past_x = torch.rand((batch_size, window_past, input_size)).to(device)
+future_x = torch.rand((batch_size, window_future, input_size)).to(device)
+past_y = torch.rand((batch_size, window_past, output_size)).to(device)
+future_y = torch.rand((batch_size, window_future, output_size)).to(device)
+output = model(past_x, past_y, future_x, future_y)  
+print(output)
+
+from torchsummaryX import summary
+summary(model, past_x, past_y, future_x, future_y )
+1
+# -
+
+# ## Training
+
+
+
+
+
+# +
+def train_epoch(ds, model, bs=128):
+    model.train()
+
+    training_loss = []
+
+    # Put data into a torch loader
+    load_train = torch.utils.data.dataloader.DataLoader(
+        ds,
+        batch_size=bs,
+        pin_memory=False,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+
+    for batch in tqdm(load_train, leave=False, desc='train'):
+        # Send data to gpu
+        x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
+
+        # Discard previous gradients
+        optimizer.zero_grad()
+        
+        # Run model
+        y_dist = model(x_past, y_past, x_future, y_future)
+        
+        # Get loss, it's Negative Log Likelihood
+        loss = -y_dist.log_prob(y_future).mean()
+
+        # Backprop
+        loss.backward()
+        optimizer.step()
+
+        # Record stats
+        training_loss.append(loss.item())
+
+    return np.mean(training_loss)
+
+
+def test_epoch(ds, model, bs=512):
+    model.eval()
+
+    test_loss = []
+    load_test = torch.utils.data.dataloader.DataLoader(ds,
+                                                       batch_size=bs,
+                                                       pin_memory=False,
+                                                       num_workers=num_workers)
+    for batch in tqdm(load_test, leave=False, desc='test'):
+        # Send data to gpu
+        x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
+        with torch.no_grad():
+            # Run model
+            y_dist = model(x_past, y_past, x_future, y_future)
+            # Get loss, it's Negative Log Likelihood
+            loss = -y_dist.log_prob(y_future).mean()
+
+        test_loss.append(loss.item())
+
+    return np.mean(test_loss)
+
+
+def training_loop(ds_train, ds_test, model, epochs=1, bs=128):
+    all_losses = []
+    try:
+        test_loss = test_epoch(ds_test, model)
+        print(f"Start: Test Loss = {test_loss:.2f}")
+        for epoch in tqdm(range(epochs), desc='epochs'):
+            loss = train_epoch(ds_train, model, bs=bs)
+            print(f"Epoch {epoch+1}/{epochs}: Training Loss = {loss:.2f}")
+
+            test_loss = test_epoch(ds_test, model)
+            print(f"Epoch {epoch+1}/{epochs}: Test Loss = {test_loss:.2f}")
+            print("-" * 50)
+
+            all_losses.append([loss, test_loss])
+
+    except KeyboardInterrupt:
+        # This lets you stop manually. and still get the results
+        pass
+
+    # Visualising the results
+    all_losses = np.array(all_losses)
+    plt.plot(all_losses[:, 0], label="Training")
+    plt.plot(all_losses[:, 1], label="Test")
+    plt.title("Loss")
+    plt.legend()
+
+    return all_losses
+
+
+# -
+
+# this might take 1 minute per epoch on a gpu
+training_loop(ds_train, ds_test, model, epochs=8, bs=batch_size)
+1
+
+# ## Predict
+#
+
+# TODO get working
+output_scaler = scaler.transformers[-4][1]
+ds_preds = predict(model, ds_test, batch_size*6, device=device, scaler=output_scaler)
+
+
+
+# +
+# TODO Metrics... smape etc
+
+# +
+def plot_prediction(ds_preds, i):
+    """
+    Plot a prediction into the future, at a single point in time.
+
+    Args:
+        ds_preds: Dataset of predictions; this function reads its `t_target`, `t_past`,
+            `t_source`, `y_pred`, `y_pred_std`, `y_true`, `y_past` and `nll` entries.
+        i: Integer position along `t_source` of the prediction to plot.
+
+    Draws the predicted mean with a band of two standard deviations, the past and
+    true future values as points, and a red line at the source time, then shows the figure.
+    """
+    # Select the single prediction made at source position i
+    d = ds_preds.isel(t_source=i)
+
+    # Get arrays
+    xf = d.t_target
+    yp = d.y_pred
+    s = d.y_pred_std
+    yt = d.y_true
+    now = d.t_source.squeeze()
+
+    # plot prediction
+    plt.fill_between(xf, yp-2*s, yp+2*s, alpha=0.25,
+            facecolor="b",
+            interpolate=True,
+            label="2 std",)
+    plt.plot(xf, yp, label='pred', c='b')
+
+    # plot true
+    # Past values get no label, so only the future points appear in the legend as 'true'
+    plt.scatter(
+        d.t_past,
+        d.y_past,
+        c='k',
+        s=6
+    )
+    plt.scatter(xf, yt, label='true', c='k', s=6)
+    
+    # plot a red line for now
+    # NOTE(review): the line is drawn from y=0 to y=1 - confirm that suits the plotted energy range
+    plt.vlines(x=now, ymin=0, ymax=1, label='now', color='r')
+
+    # Convert to a pandas Timestamp so the date can be printed on the x label
+    now=pd.Timestamp(now.values)
+    plt.title(f'Prediction NLL={d.nll.mean().item():2.2g}')
+    plt.xlabel(f'{now.date()}')
+    plt.ylabel('energy(kWh/hh)')
+    plt.legend()
+    plt.xticks(rotation=45)
+    plt.show()
+    
+# plot_prediction(ds_preds, 0) 
+# plot_prediction(ds_preds, 12) # 6 hours later
+plot_prediction(ds_preds, 24) # 12 hours later
+plot_prediction(ds_preds, 48) # 24 hours later
+# -
+
+# ## Error vs time ahead
+
+
+
+# +
+d = ds_preds.mean('t_source') # Mean over all predictions
+
+# Plot with xarray, it has a pandas like interface
+d.plot.scatter('t_ahead_hours', 'nll')
+
+# Tidy the graph
+n = len(ds_preds.t_source)
+plt.ylabel('Negative Log Likelihood (lower is better)')
+plt.xlabel('Hours ahead')
+plt.title(f'NLL vs time (no. samples={n})')
+# -
+
+d = ds_preds.mean('t_source') # Mean over all predictions
+d['likelihood'] = np.exp(-d.nll) # get likelihood, after taking mean in log domain
+d.plot.scatter('t_ahead_hours', 'likelihood')
+
+
+
+# Make a plot of the NLL over time. Does this solution get worse with time?
+# this is hard because we need to take the mean over t_ahead
+# then group by t_source
+d = ds_preds.mean('t_ahead').groupby('t_source').mean()
+# And even then it's clearer with smoothing
+d.plot.scatter('t_source', 'nll')
+plt.xticks(rotation=45)
+plt.title('NLL over time (lower is better)')
+1
+
+# A scatter plot is easy with xarray
+ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
+
+
@@ -0,0 +1,221 @@
+name: seq2seq-time
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=1_gnu
+  - absl-py=0.10.0=py37hc8dfbb8_1
+  - aiohttp=3.6.3=py37h7b6447c_0
+  - appdirs=1.4.4=py_0
+  - argon2-cffi=20.1.0=py37h8f50634_2
+  - async-timeout=3.0.1=py_1000
+  - async_generator=1.10=py_0
+  - attrs=20.2.0=pyh9f0ad1d_0
+  - awscli=1.18.159=py37hc8dfbb8_0
+  - backcall=0.2.0=pyh9f0ad1d_0
+  - backports=1.0=py_2
+  - backports.functools_lru_cache=1.6.1=py_0
+  - black=20.8b1=py_1
+  - blas=1.0=mkl
+  - bleach=3.2.1=pyh9f0ad1d_0
+  - blinker=1.4=py_1
+  - botocore=1.18.18=pyh9f0ad1d_0
+  - brotlipy=0.7.0=py37hb5d75c8_1001
+  - c-ares=1.16.1=h516909a_3
+  - ca-certificates=2020.10.14=0
+  - cachetools=4.1.1=py_0
+  - certifi=2020.6.20=py37he5f6b98_2
+  - cffi=1.14.3=py37h00ebd2e_1
+  - chardet=3.0.4=py37he5f6b98_1008
+  - click=7.1.2=pyh9f0ad1d_0
+  - colorama=0.4.3=py_0
+  - cryptography=3.1.1=py37hff6837a_1
+  - cudatoolkit=10.2.89=hfd86e86_1
+  - cycler=0.10.0=py_2
+  - dataclasses=0.7=py37_0
+  - dbus=1.13.18=hb2f20db_0
+  - decorator=4.4.2=py_0
+  - defusedxml=0.6.0=py_0
+  - docutils=0.15.2=py37_0
+  - entrypoints=0.3=py37hc8dfbb8_1002
+  - expat=2.2.10=he6710b0_2
+  - fontconfig=2.13.1=h1056068_1002
+  - freetype=2.10.3=he06d7ca_0
+  - fsspec=0.8.4=py_0
+  - future=0.18.2=py37hc8dfbb8_2
+  - gettext=0.19.8.1=hf34092f_1003
+  - glib=2.66.1=he1b5a44_1
+  - google-auth=1.22.1=py_0
+  - google-auth-oauthlib=0.4.1=py_2
+  - grpcio=1.31.0=py37hb0870dc_0
+  - gst-plugins-base=1.14.5=h0935bb2_2
+  - gstreamer=1.14.5=h36ae1b5_2
+  - icu=67.1=he1b5a44_0
+  - idna=2.10=pyh9f0ad1d_0
+  - importlib-metadata=2.0.0=py37hc8dfbb8_0
+  - importlib_metadata=2.0.0=1
+  - iniconfig=1.1.1=py_0
+  - intel-openmp=2020.2=254
+  - ipykernel=5.3.4=py37hc6149b9_1
+  - ipython=7.18.1=py37hc6149b9_1
+  - ipython_genutils=0.2.0=py_1
+  - ipywidgets=7.5.1=pyh9f0ad1d_1
+  - jedi=0.17.2=py37hc8dfbb8_1
+  - jinja2=2.11.2=pyh9f0ad1d_0
+  - jmespath=0.10.0=pyh9f0ad1d_0
+  - joblib=0.17.0=py_0
+  - jpeg=9d=h516909a_0
+  - jsonschema=3.2.0=py37hc8dfbb8_1
+  - jupyter_client=6.1.7=py_0
+  - jupyter_core=4.6.3=py37hc8dfbb8_2
+  - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0
+  - kiwisolver=1.2.0=py37h99015e2_1
+  - krb5=1.17.1=hfafb76e_3
+  - lcms2=2.11=hbd6801e_0
+  - ld_impl_linux-64=2.35=h769bd43_9
+  - libblas=3.8.0=17_openblas
+  - libcblas=3.8.0=17_openblas
+  - libclang=10.0.1=default_hde54327_1
+  - libedit=3.1.20191231=he28a2e2_2
+  - libevent=2.1.10=hcdb4288_3
+  - libffi=3.2.1=he1b5a44_1007
+  - libgcc-ng=9.3.0=h5dbcf3e_17
+  - libgfortran-ng=7.5.0=hae1eefd_17
+  - libgfortran4=7.5.0=hae1eefd_17
+  - libglib=2.66.1=h0dae87d_1
+  - libgomp=9.3.0=h5dbcf3e_17
+  - libiconv=1.16=h516909a_0
+  - liblapack=3.8.0=17_openblas
+  - libllvm10=10.0.1=he513fc3_3
+  - libopenblas=0.3.10=pthreads_hb3c22a3_5
+  - libpng=1.6.37=hed695b0_2
+  - libpq=12.3=h1281834_2
+  - libprotobuf=3.13.0.1=h8b12597_0
+  - libsodium=1.0.18=h516909a_1
+  - libstdcxx-ng=9.3.0=h2ae2ef3_17
+  - libtiff=4.1.0=hc7e4089_6
+  - libuuid=2.32.1=h14c3975_1000
+  - libwebp-base=1.1.0=h516909a_3
+  - libxcb=1.14=h7b6447c_0
+  - libxkbcommon=0.10.0=he1b5a44_0
+  - libxml2=2.9.10=h68273f3_2
+  - lz4-c=1.9.2=he1b5a44_3
+  - markdown=3.3.1=pyh9f0ad1d_0
+  - markupsafe=1.1.1=py37hb5d75c8_2
+  - matplotlib=3.3.2=py37hc8dfbb8_1
+  - matplotlib-base=3.3.2=py37hc9afd2a_1
+  - mccabe=0.6.1=py_1
+  - mistune=0.8.4=py37h8f50634_1002
+  - mkl=2020.2=256
+  - more-itertools=8.5.0=py_0
+  - multidict=4.7.6=py37h7b6447c_1
+  - mypy=0.790=py_0
+  - mypy_extensions=0.4.3=py37hc8dfbb8_1
+  - mysql-common=8.0.21=2
+  - mysql-libs=8.0.21=hf3661c5_2
+  - nbclient=0.5.1=py_0
+  - nbconvert=6.0.7=py37hc8dfbb8_1
+  - nbformat=5.0.8=py_0
+  - ncurses=6.2=he1b5a44_2
+  - nest-asyncio=1.4.1=py_0
+  - ninja=1.10.1=hfc4b9b4_2
+  - notebook=6.1.4=py37hc8dfbb8_1
+  - nspr=4.29=he1b5a44_1
+  - nss=3.58=h27285de_1
+  - numpy=1.19.2=py37h7ea13bd_1
+  - oauthlib=3.1.0=py_0
+  - olefile=0.46=pyh9f0ad1d_1
+  - openssl=1.1.1h=h516909a_0
+  - packaging=20.4=pyh9f0ad1d_0
+  - pandas=1.1.3=py37h9fdb41a_2
+  - pandoc=2.11.0.2=hd18ef5c_0
+  - pandocfilters=1.4.2=py_1
+  - parso=0.7.1=pyh9f0ad1d_0
+  - pathspec=0.8.0=pyh9f0ad1d_0
+  - pcre=8.44=he1b5a44_0
+  - pexpect=4.8.0=py37hc8dfbb8_1
+  - pickleshare=0.7.5=py37hc8dfbb8_1002
+  - pillow=8.0.0=py37h718be6c_0
+  - pip=20.2.4=py_0
+  - pluggy=0.13.1=py37hc8dfbb8_3
+  - prometheus_client=0.8.0=pyh9f0ad1d_0
+  - prompt-toolkit=3.0.8=py_0
+  - protobuf=3.13.0.1=py37h3340039_1
+  - psutil=5.7.2=py37hb5d75c8_1
+  - ptyprocess=0.6.0=py37_1000
+  - py=1.9.0=pyh9f0ad1d_0
+  - pyasn1=0.4.8=py_0
+  - pyasn1-modules=0.2.8=py_0
+  - pycodestyle=2.6.0=pyh9f0ad1d_0
+  - pycparser=2.20=pyh9f0ad1d_2
+  - pydocstyle=5.1.1=py_0
+  - pyflakes=2.2.0=pyh9f0ad1d_0
+  - pygments=2.7.1=py_0
+  - pyjwt=1.7.1=py_0
+  - pylama=7.7.1=py_0
+  - pyopenssl=19.1.0=py37_0
+  - pyparsing=2.4.7=pyh9f0ad1d_0
+  - pyqt=5.12.3=py37h8685d9f_4
+  - pyrsistent=0.17.3=py37h8f50634_1
+  - pysocks=1.7.1=py37he5f6b98_2
+  - pytest=6.1.1=py37hc8dfbb8_1
+  - python=3.7.8=h425cb1d_1_cpython
+  - python-dateutil=2.8.1=py_0
+  - python_abi=3.7=1_cp37m
+  - pytorch=1.6.0=py3.7_cuda10.2.89_cudnn7.6.5_0
+  - pytorch-lightning=1.0.2=py_0
+  - pytz=2020.1=pyh9f0ad1d_0
+  - pyyaml=5.3.1=py37hb5d75c8_1
+  - pyzmq=19.0.2=py37hac76be4_2
+  - qt=5.12.9=h1f2b2cb_0
+  - readline=8.0=he28a2e2_2
+  - regex=2020.10.15=py37h8f50634_0
+  - requests=2.24.0=pyh9f0ad1d_0
+  - requests-oauthlib=1.3.0=pyh9f0ad1d_0
+  - rsa=4.4.1=pyh9f0ad1d_0
+  - s3transfer=0.3.3=py37hc8dfbb8_2
+  - scikit-learn=0.23.2=py37h6785257_0
+  - scipy=1.5.2=py37hb14ef9d_2
+  - send2trash=1.5.0=py_0
+  - setuptools=49.6.0=py37he5f6b98_2
+  - six=1.15.0=pyh9f0ad1d_0
+  - snowballstemmer=2.0.0=py_0
+  - sqlite=3.33.0=h4cf870e_1
+  - tensorboard=2.3.0=py_0
+  - tensorboard-plugin-wit=1.6.0=pyh9f0ad1d_0
+  - terminado=0.9.1=py37hc8dfbb8_1
+  - testpath=0.4.4=py_0
+  - threadpoolctl=2.1.0=pyh5ca1d4c_0
+  - tk=8.6.10=hed695b0_1
+  - toml=0.10.1=pyh9f0ad1d_0
+  - torchvision=0.7.0=py37_cu102
+  - tornado=6.0.4=py37h8f50634_2
+  - tqdm=4.50.2=pyh9f0ad1d_0
+  - traitlets=5.0.5=py_0
+  - typed-ast=1.4.1=py37h516909a_0
+  - typing-extensions=3.7.4.3=0
+  - typing_extensions=3.7.4.3=py_0
+  - urllib3=1.25.10=py_0
+  - wcwidth=0.2.5=pyh9f0ad1d_2
+  - webencodings=0.5.1=py_1
+  - werkzeug=1.0.1=pyh9f0ad1d_0
+  - wheel=0.35.1=pyh9f0ad1d_0
+  - widgetsnbextension=3.5.1=py37hc8dfbb8_2
+  - xarray=0.16.1=py_0
+  - xz=5.2.5=h516909a_1
+  - yaml=0.2.5=h516909a_0
+  - yapf=0.30.0=pyh9f0ad1d_0
+  - yarl=1.6.2=py37h8f50634_0
+  - zeromq=4.3.3=he1b5a44_2
+  - zipp=3.3.1=py_0
+  - zlib=1.2.11=h516909a_1010
+  - zstd=1.4.5=h6597ccf_2
+  - pip:
+    - pyqt5-sip==4.19.18
+    - pyqtchart==5.12
+    - pyqtwebengine==5.12.1
+    - sklearn-pandas==2.0.2
+    - torchsummaryx==1.3.0
+prefix: /home/wassname/anaconda/envs/seq2seq-time
@@ -0,0 +1,26 @@
+# Minimal, unpinned requirements for a new dev or test environment.
+# No `prefix:` entry: it is an absolute path on one machine and is not portable.
+name: seq2seq-time
+channels:
+  # pytorch and torchvision builds matching the cudatoolkit below come from this channel
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  # A single "=" matches any 3.7.x; "==" would demand exactly that version
+  - python=3.7
+  - pip
+  - awscli
+  - ipykernel
+  - tqdm
+  - xarray
+  - pandas
+  - pytorch
+  - torchvision
+  # A single "=" so that patch releases such as 10.2.89 satisfy the spec
+  - cudatoolkit=10.2
+  - black
+  - pylama
+  - mypy
+  - pytest
+  - numpy
+  - matplotlib
+  - scikit-learn
+  - pytorch-lightning
+  - yapf
+  - ipywidgets
+  - pip:
+    # Imported by the notebook; these are the pip packages listed in the pinned environment
+    - sklearn-pandas
+    - torchsummaryx
@@ -11,5 +11,5 @@ dependencies:
  - awscli
  - pip:
    # local package
-    - -e .
+    # - -e .

@@ -4,3 +4,13 @@ This project has multiple ways of documenting requirements
 - environment.min.yaml - This is the minimum requirements, use it to install a new test or dev environment
 - environment.max.yaml - This pins all conda packages, use for production or finding vulnerabilities
 - requirements.txt - For people or bots not using conda
+
+```
+# Install requirements
+conda env create --name seq2seq-time --file ./requirements/environment.yaml
+conda activate seq2seq-time 
+# Install this package in editable mode
+python -m pip install -e .
+# Install kernel
+python -m ipykernel install --user --name seq2seq-time --display-name seq2seq-time 
+```
@@ -0,0 +1,100 @@
+import pandas as pd
+import torch.utils.data
+import numpy as np
+
+def assert_normalized(df):
+    stats = df.describe().T
+    np.testing.assert_allclose(stats['mean'].values, 0, atol=0.1), 'means should be normalized to ~0'
+    np.testing.assert_allclose(stats['std'].values, 1, atol=0.1), 'standard deviations should be normalized to ~0'
+
+def assert_no_objects(df):
+    for name, dtype in df.dtypes.iteritems():
+        assert dtype.name!='object', f'all objects should be pd.categories. {name} is not'
+
+
+class Seq2SeqDataSet(torch.utils.data.Dataset):
+    """
+    Takes in dataframe and returns sequences through time.
+    
+    Returns x_past, y_past, x_future, etc.
+    """
+    
+    def __init__(self, df: pd.DataFrame, window_past=40, window_future=10, columns_target=['energy(kWh/hh)'], columns_blank=[],):
+        """
+        Args:
+        - df: DataFrame with time index, already scaled
+        - columns_blank: The columns we will blank, in the future
+        """
+        super().__init__()
+        # TODO auto categorical columns
+        # TODO specify blank future columns
+        assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
+        assert df.index.freq is not None, 'should have freq'
+        # assert_normalized(df)
+        assert_no_objects(df)
+
+        # Use numpy instead of pandas, for speed
+        self.x = df.drop(columns=columns_target).copy().values
+        self.y = df[columns_target].copy().values
+        self.t = df.index.copy()
+        self.columns = list(df.columns)
+        self.icol_blank = [df.drop(columns=columns_target).columns.tolist().index(n) for n in columns_blank]
+
+        self.window_past = window_past
+        self.window_future = window_future
+        self.columns_target = columns_target
+
+    def get_components(self, i):
+        """Get past and future rows."""
+        x = self.x[i : i + (self.window_past + self.window_future)].copy()
+        y = self.y[i:i + (self.window_past + self.window_future)].copy()
+        t = self.t[i:i + (self.window_past + self.window_future)].copy()
+        t = t.astype(int) * 1e-9 / 60 / 60 / 24  # days
+        t = t.values
+        now = t[self.window_past]
+        
+        # Add a features: relative hours since present time, is future
+        tstp = (t - now)[:, None]
+        is_past = tstp < 0
+        x = np.concatenate([x, tstp, is_past], -1)
+        
+        # Split into future and past
+        x_past = x[:self.window_past]
+        y_past = y[:self.window_past]
+        x_future = x[self.window_past:]
+        y_future = y[self.window_past:]
+
+        # Stop it cheating by using future weather measurements
+        x_future[:, self.icol_blank] = 0
+        return x_past, y_past, x_future, y_future
+
+
+    def __getitem__(self, i):
+        """This is how python implements square brackets"""
+        if i<0:
+            # Handle negative integers
+            i = len(self)+i
+        data = self.get_components(i)
+        # From dataframe to torch
+        return [d.astype(np.float32) for d in data]
+    
+    
+    def get_rows(self, i):
+        """
+        Output pandas dataframes for display purposes.
+        """
+        x_cols = list(self.columns)[1:] + ['tsp_days', 'is_past']
+        x_past, y_past, x_future, y_future = self.get_components(i)
+        t_past = self.t[i:i+self.window_past]
+        t_future = self.t[i+self.window_past:i+self.window_past + self.window_future]
+        x_past = pd.DataFrame(x_past, columns=x_cols, index=t_past)
+        x_future = pd.DataFrame(x_future, columns=x_cols, index=t_future)
+        y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
+        y_future = pd.DataFrame(y_future, columns=self.columns_target, index=t_future)
+        return x_past, y_past, x_future, y_future
+        
+    def __len__(self):
+        return len(self.x) - (self.window_past + self.window_future)
+    
+    def __repr__(self):
+        return f'<{type(self).__name__}(shape={self.x.shape}, times={self.t[0]} to {self.t[1]} at {self.t.freq.freqstr})>'
@@ -0,0 +1,72 @@
+import xarray as xr 
+import torch
+from tqdm.auto import tqdm
+import pandas as pd
+
+from .util import to_numpy
+
+def predict(model, ds_test, batch_size, device='cpu', scaler=None):
+    """
+    Gather all predictions into xarray.
+    
+    When we generate prediction in a sequence to sequence model we start at a time then predict
+    N steps into the future. So we have 2 dimensions: source time, target time.
+
+    But we also care about how far we were predicting into the future, so we have 3 dimensions: source time, target time, time ahead.
+
+    It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.
+    """
+    load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
+    freq = ds_test.t.freq
+    xrs = []
+    for i, batch in enumerate(tqdm(load_test, desc='predict')):
+        model.eval()
+        with torch.no_grad():
+            x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
+            y_dist = model(x_past, y_past, x_future, y_future)
+            nll = -y_dist.log_prob(y_future)
+
+            # Convert to numpy
+            mean = to_numpy(y_dist.loc.squeeze(-1))
+            std = to_numpy(y_dist.scale.squeeze(-1))
+            nll = to_numpy(nll.squeeze(-1))
+            y_future = to_numpy(y_future.squeeze(-1))
+            y_past = to_numpy(y_past.squeeze(-1))    
+
+        # Make an xarray.Dataset for the data
+        bs = y_future.shape[0]
+        t_source = ds_test.t[i:i+bs].values
+        t_ahead = pd.timedelta_range(0, periods=ds_test.window_future, freq=freq).values
+        t_behind = pd.timedelta_range(end=-pd.Timedelta(freq), periods=ds_test.window_past, freq=freq)
+        xr_out = xr.Dataset(
+            {
+                # Format> name: ([dimensions,...], array),
+                "y_past": (["t_source", "t_behind",], y_past),
+                "nll": (["t_source", "t_ahead",], nll),
+                "y_pred": (["t_source", "t_ahead",], mean),
+                "y_pred_std": (["t_source", "t_ahead",], std),
+                "y_true": (["t_source", "t_ahead",], y_future),
+            },
+            coords={"t_source": t_source, "t_ahead": t_ahead, "t_behind": t_behind},
+        )
+        xrs.append(xr_out)
+
+    # Join all batches
+    ds_preds = xr.concat(xrs, dim="t_source")
+    
+    # undo scaling on y
+    if scaler:
+        ds_preds['y_pred_std'].values = ds_preds.y_pred_std * scaler.scale_
+        ds_preds['y_past'].values =  scaler.inverse_transform(ds_preds.y_past)
+        ds_preds['y_pred'].values =  scaler.inverse_transform(ds_preds.y_pred)
+        ds_preds['y_true'].values =  scaler.inverse_transform(ds_preds.y_true)
+
+    # Add some derived coordinates, they will be the ones not in bold
+    # The target time, is a function of the source time, and how far we predict ahead
+    ds_preds = ds_preds.assign_coords(t_target=ds_preds.t_source+ds_preds.t_ahead)
+
+    ds_preds = ds_preds.assign_coords(t_past=ds_preds.t_source+ds_preds.t_behind)
+
+    # Some plots don't like timedeltas, so lets make a coordinate for time ahead in hours
+    ds_preds = ds_preds.assign_coords(t_ahead_hours=(ds_preds.t_ahead*1.0e-9/60/60).astype(float))
+    return ds_preds
@@ -0,0 +1,10 @@
+from pathlib import Path
+import torch
+
+project_dir = Path(__file__).parent.parent
+
+def to_numpy(x):
+    """Helper function to avoid repeating code"""
+    if isinstance(x, torch.Tensor):
+        x = x.cpu().detach().numpy()
+    return x