Files
attentive-neural-processes/neural_processes/data/smart_meter.py
T
2020-04-13 12:30:52 +08:00

205 lines
7.0 KiB
Python

from pathlib import Path
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
from diskcache import Cache
# Module-level diskcache `Cache` created with the path ".cache"; it is used
# further down through the `@cache.memoize()` decorator.
cache = Cache(".cache")
def npsample_batch(x, y, size=None, sort=True):
    """Pick the same random positions along axis 1 from both ``x`` and ``y``.

    Positions are drawn without replacement through ``np.random.choice``.
    When ``sort`` is truthy the drawn positions are put in ascending order,
    so the selected points keep their original relative order.

    Returns the tuple ``(x[:, picked], y[:, picked])``.
    """
    n_points = x.shape[1]
    picked = np.random.choice(range(n_points), size=size, replace=False)
    if sort:
        picked.sort()
    x_subset = x[:, picked]
    y_subset = y[:, picked]
    return x_subset, y_subset
def collate_fns(max_num_context, max_num_extra_target, sample, sort=True, context_in_target=True):
    """Build a ``collate_fn`` that splits each window into context and target points.

    Args:
        max_num_context: Number of leading steps of every window that form the
            context pool. The step at this index is the first target step and
            is the time origin for the last feature.
        max_num_extra_target: Exclusive upper bound on the number of target
            steps drawn when sampling.
        sample: Default value for the ``sample`` argument of the returned
            function. When truthy, random subsets of the context and target
            steps are drawn with ``npsample_batch``.
        sort: Forwarded to ``npsample_batch``.
        context_in_target: When truthy, the returned targets are the context
            points followed by the extra target points; otherwise only the
            extra target points.

    Returns:
        A function ``collate_fn(batch, sample=sample)`` that takes an iterable
        of ``(x, y)`` numpy array pairs (2-D each: steps x features) and
        returns float tensors ``(x_context, y_context, x_target, y_target)``.

    Raises (from the returned function):
        AssertionError: If the first feature is not non-decreasing along the
            steps, if the last feature is not zero at index
            ``max_num_context``, or if the window does not have steps strictly
            before and strictly after that index in time.
        ValueError: From ``np.random.randint`` when sampling with a maximum
            that is not greater than 4.
    """

    def collate_fn(batch, sample=sample):
        # Collate the per-item arrays into batch-first float tensors.
        x = torch.from_numpy(np.stack([item_x for item_x, _ in batch], 0)).float()
        y = torch.from_numpy(np.stack([item_y for _, item_y in batch], 0)).float()

        time = x[:, :, 0]
        # Ordering is checked directly on the tensor instead of routing a
        # torch tensor through np.diff.
        assert (time[:, 1:] >= time[:, :-1]).all(), 'first features should be ordered e.g. seconds'
        assert (x[:, max_num_context, -1] == 0.).all(), 'last features should be empty'

        # Last feature shows how far in time a point is from the first target
        # step: negative for past (context) data, positive for future data.
        t0 = time[:, max_num_context][:, None]
        x[:, :, -1] = time - t0
        assert (x[:, -1, -1] > 0).all()
        assert (x[:, 0, -1] < 0).all()

        x_context = x[:, :max_num_context]
        y_context = y[:, :max_num_context]
        x_target_extra = x[:, max_num_context:]
        y_target_extra = y[:, max_num_context:]

        if sample:
            # Sizes are only drawn when they are used, so the non-sampling
            # path neither consumes random numbers nor fails for small maxima.
            num_context = np.random.randint(4, max_num_context)
            num_extra_target = np.random.randint(4, max_num_extra_target)

            # Slightly different from the usual setup: target points are
            # always in the future relative to the context, to mimic
            # deployment.
            x_context, y_context = npsample_batch(
                x_context, y_context, size=num_context, sort=sort
            )
            x_target_extra, y_target_extra = npsample_batch(
                x_target_extra, y_target_extra, size=num_extra_target, sort=sort
            )

        # Compute the loss over context + extra targets, or only over the
        # extra targets? Plain truthiness so 1 / np.True_ behave like True.
        if context_in_target:
            x_target = torch.cat([x_context, x_target_extra], 1)
            y_target = torch.cat([y_context, y_target_extra], 1)
        else:
            x_target = x_target_extra
            y_target = y_target_extra

        return x_context, y_context, x_target, y_target

    return collate_fn
class SmartMeterDataSet(torch.utils.data.Dataset):
    """Sliding-window dataset over a dataframe with a datetime ``tstp`` column.

    Item ``i`` is the window of ``num_context + num_extra_target`` consecutive
    rows starting at positional index ``i``, returned as ``(x, y)`` numpy
    arrays: ``y`` holds the ``label_names`` columns and ``x`` all remaining
    columns except ``block``.
    """

    def __init__(self, df, num_context=40, num_extra_target=10, label_names=['energy(kWh/hh)']):
        """Store the frame and window sizes.

        Args:
            df: Dataframe to window; must contain a datetime ``tstp`` column
                and the label columns.
            num_context: Number of context rows per window.
            num_extra_target: Number of target rows following the context.
            label_names: Column name(s) used as the prediction target.
        """
        self.df = df
        self.num_context = num_context
        self.num_extra_target = num_extra_target
        # Copy list-like input so instances never share (and cannot mutate)
        # the default list object; a plain string is kept as given.
        if isinstance(label_names, str):
            self.label_names = label_names
        else:
            self.label_names = list(label_names)

    def get_rows(self, i):
        """Return the window starting at row ``i`` as ``(x, y)`` dataframes.

        ``tstp`` is converted to days elapsed since the first row of the
        window and is always the first column of ``x``. A zero-filled
        ``future`` column is always the last one. The remaining feature
        columns keep the order they have in ``self.df``.
        """
        window = self.num_context + self.num_extra_target
        rows = self.df.iloc[i : i + window].copy()
        rows['tstp'] = (rows['tstp'] - rows['tstp'].iloc[0]).dt.total_seconds() / 86400.0
        rows = rows.sort_values('tstp')

        # Placeholder that is overwritten at collate time to let the model
        # know which points are in the future.
        rows['future'] = 0.

        # Make sure tstp, which is our x axis, is the first value. The other
        # columns follow dataframe order; going through a set would make the
        # feature order depend on string hashing and differ between runs.
        excluded = {'tstp', 'block', 'future'}
        features = [col for col in rows.columns if col not in excluded]
        rows = rows[['tstp'] + features + ['future']]

        x = rows.drop(columns=self.label_names)
        y = rows[self.label_names].copy()
        return x, y

    def __getitem__(self, i):
        """Return the window starting at row ``i`` as ``(x, y)`` numpy arrays."""
        x, y = self.get_rows(i)
        return x.values, y.values

    def __len__(self):
        """Return the number of window start positions, never below zero."""
        window = self.num_context + self.num_extra_target
        # A frame shorter than one window yields an empty dataset instead of
        # a negative length, which would make len() raise.
        return max(0, len(self.df) - window)
def load_weather_csv(infile):
    """Load hourly weather data, resample it to 30 minutes and normalise it.

    Args:
        infile: Path or file-like object accepted by ``pd.read_csv``. It must
            contain a ``time`` column plus the weather columns listed in
            ``use_cols`` below.

    Returns:
        Dataframe indexed by ``time`` at 30-minute frequency (forward filled),
        with every weather column shifted and scaled by the fixed mean and
        standard deviation constants defined in this function.
    """
    use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
                'pressure', 'apparentTemperature', 'windSpeed',
                'humidity']

    # Parse the timestamp column by name rather than by position, so the
    # result does not depend on the column order of the file.
    df_weather = pd.read_csv(infile, parse_dates=['time'])
    df_weather = df_weather[use_cols].set_index('time')

    # Resample to match the half-hourly energy data. '30min' is the
    # non-deprecated spelling of the former '30T' offset alias.
    df_weather = df_weather.resample('30min').ffill()

    # Normalise with fixed constants.
    weather_norms = dict(
        mean={'visibility': 11.2,
              'windBearing': 195.7,
              'temperature': 10.5,
              'dewPoint': 6.5,
              'pressure': 1014.1,
              'apparentTemperature': 9.2,
              'windSpeed': 3.9,
              'humidity': 0.8},
        std={'visibility': 3.1,
             'windBearing': 90.6,
             'temperature': 5.8,
             'dewPoint': 5.0,
             'pressure': 11.4,
             'apparentTemperature': 6.9,
             'windSpeed': 2.0,
             'humidity': 0.1},
    )
    mean = weather_norms['mean']
    std = weather_norms['std']
    for col in df_weather.columns:
        df_weather[col] = (df_weather[col] - mean[col]) / std[col]
    return df_weather
def f2i(f: Path) -> int:
    """Return the integer that follows the last underscore of the file stem.

    Example: ``block_2.csv`` -> ``2``.
    """
    _, _, number = f.stem.rpartition('_')
    return int(number)
def is_test(f):
    """Tell whether file ``f`` is a test file: its ``f2i`` number is 1 modulo 8."""
    block_number = f2i(f)
    return block_number % 8 == 1
def is_val(f):
    """Tell whether file ``f`` is a validation file: its ``f2i`` number is 1 modulo 7."""
    block_number = f2i(f)
    return block_number % 7 == 1
@cache.memoize()
def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), max_files=10, use_logy=False):
    """Load smart-meter block files and split them into train/val/test frames.

    Each block csv under ``indir / 'halfhourly_dataset'`` is averaged per
    timestamp into one series, joined with the normalised weather data and a
    bank-holiday flag, and extended with scaled calendar features. Files are
    assigned to a split by ``is_test`` / ``is_val`` (test wins when both
    match).

    Args:
        indir: Directory containing ``weather_hourly_darksky.csv``,
            ``uk_bank_holidays.csv`` and the ``halfhourly_dataset`` folder.
        max_files: Only the first ``max_files`` block files, ordered by block
            number, are considered.
        use_logy: When truthy, replace the energy column by
            ``log(energy + 1e-4)``.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of dataframes.

    Raises:
        AssertionError: If the three file lists are not pairwise disjoint.
        ValueError: From ``pd.concat`` when a split ends up with no files.

    NOTE(review): the result goes through ``cache.memoize()``, so changes to
    the input files are presumably not picked up until the cache is cleared.
    """
    df_weather = load_weather_csv(indir / 'weather_hourly_darksky.csv')

    # Also find bank holidays
    df_hols = pd.read_csv(indir / 'uk_bank_holidays.csv', parse_dates=[0])
    holidays = set(df_hols['Bank holidays'].dt.round('D'))

    def load_csv(f):
        """Load one block file as a single feature-augmented series."""
        df = pd.read_csv(f, parse_dates=[1], na_values=['Null'])

        # Do a whole block as one series. numeric_only=True keeps any
        # non-numeric column out of the mean instead of raising on it.
        df = df.groupby('tstp').mean(numeric_only=True)
        df = df.sort_index()
        df['block'] = f2i(f)

        # Drop nan and 0's
        df = df[df['energy(kWh/hh)'] != 0]
        df = df.dropna()

        df['tstp'] = df.index

        # Join weather and holidays. `axis` is passed by keyword because
        # pd.concat no longer accepts it positionally.
        df = pd.concat([df, df_weather], axis=1).dropna()
        df['holiday'] = df.tstp.apply(lambda dt: dt.floor('D') in holidays).astype(int)

        # Add time features
        time = df.tstp
        df["month"] = time.dt.month / 12.0
        # NOTE(review): 310.0 looks like a typo for 31.0; kept as is because
        # changing the scale would alter the features fed to existing models.
        df['day'] = time.dt.day / 310.0
        # Series.dt.week was removed from pandas; isocalendar().week is its
        # replacement. Cast to float to get a plain float column.
        df['week'] = time.dt.isocalendar().week.astype(float) / 52.0
        df['hour'] = time.dt.hour / 24.0
        # NOTE(review): dividing minutes by 24.0 looks like a copy-paste of
        # the hour scale (60.0 expected); kept as is for the same reason.
        df['minute'] = time.dt.minute / 24.0
        df['dayofweek'] = time.dt.dayofweek / 7.0

        if use_logy:
            df['energy(kWh/hh)'] = np.log(df['energy(kWh/hh)'] + 1e-4)
        return df

    csv_files = list((indir / 'halfhourly_dataset').glob('*.csv'))
    csv_files.sort(key=f2i)
    csv_files = csv_files[:max_files]

    test_files = [f for f in csv_files if is_test(f)]
    val_files = [f for f in csv_files if is_val(f) and (not is_test(f))]
    train_files = [f for f in csv_files if (not is_val(f)) and (not is_test(f))]
    print(len(train_files), len(val_files), len(test_files))
    print(train_files, val_files, test_files)

    # Train must share no file with either held-out split.
    # set.intersection(a, b) would only catch files present in all three.
    held_out = set(test_files) | set(val_files)
    assert not set(train_files) & held_out
    assert not set(test_files) & set(val_files)

    df_test = pd.concat([load_csv(f) for f in tqdm(test_files, desc='test csv')], axis=0)
    df_val = pd.concat([load_csv(f) for f in tqdm(val_files, desc='val csv')], axis=0)
    df_train = pd.concat([load_csv(f) for f in tqdm(train_files, desc='train csv')], axis=0)
    return df_train, df_val, df_test