multiple blocks

This commit is contained in:
wassname
2020-04-13 12:30:52 +08:00
parent 411efbc8d4
commit b2e42d0552
8 changed files with 608 additions and 1248136 deletions
+2
View File
@@ -0,0 +1,2 @@
*.csv filter=lfs diff=lfs merge=lfs -text
*.tsv filter=lfs diff=lfs merge=lfs -text
+1
View File
@@ -5,6 +5,7 @@ events.out.*
/optuna_result/
/runs/
/logs/
.cache/
# Created by https://www.gitignore.io/api/code,linux,macos,python,windows,jupyternotebook,jupyternotebooks
# Edit at https://www.gitignore.io/?templates=code,linux,macos,python,windows,jupyternotebook,jupyternotebooks
File diff suppressed because it is too large Load Diff
Binary file not shown.
1 Bank holidays version https://git-lfs.github.com/spec/v1 Type
3 size 786
File diff suppressed because it is too large Load Diff
+83 -50
View File
@@ -2,6 +2,11 @@ from pathlib import Path
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
from diskcache import Cache
cache = Cache(".cache")
def npsample_batch(x, y, size=None, sort=True):
"""Sample from numpy arrays along 2nd dim."""
@@ -76,7 +81,7 @@ class SmartMeterDataSet(torch.utils.data.Dataset):
rows = rows.sort_values('tstp')
# make sure tstp, which is our x axis, is the first value
columns = ['tstp'] + list(set(rows.columns) - set(['tstp'])) + ['future']
columns = ['tstp'] + list(set(rows.columns) - set(['tstp', 'block'])) + ['future']
rows['future'] = 0.
rows = rows[columns]
@@ -94,21 +99,14 @@ class SmartMeterDataSet(torch.utils.data.Dataset):
def __len__(self):
return len(self.df) - (self.num_context + self.num_extra_target)
def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), use_logy=False):
csv_files = sorted(Path('data/smart-meters-in-london/halfhourly_dataset').glob('*.csv'))[:1]
df = pd.concat([pd.read_csv(f, parse_dates=[1], na_values=['Null']) for f in csv_files])
# print(df.info())
df = df.groupby('tstp').mean()
df['tstp'] = df.index
df.index.name = ''
def load_weather_csv(infile):
# Load weather data
df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
df_weather = pd.read_csv(infile, parse_dates=[3])
use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
'pressure', 'apparentTemperature', 'windSpeed',
'humidity']
'pressure', 'apparentTemperature', 'windSpeed',
'humidity']
df_weather = df_weather[use_cols].set_index('time')
# Resample to match energy data
@@ -116,56 +114,91 @@ def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), use_logy=Fals
# Normalise
weather_norms=dict(mean={'visibility': 11.2,
'windBearing': 195.7,
'temperature': 10.5,
'dewPoint': 6.5,
'pressure': 1014.1,
'apparentTemperature': 9.2,
'windSpeed': 3.9,
'humidity': 0.8},
'windBearing': 195.7,
'temperature': 10.5,
'dewPoint': 6.5,
'pressure': 1014.1,
'apparentTemperature': 9.2,
'windSpeed': 3.9,
'humidity': 0.8},
std={'visibility': 3.1,
'windBearing': 90.6,
'temperature': 5.8,
'dewPoint': 5.0,
'pressure': 11.4,
'apparentTemperature': 6.9,
'windSpeed': 2.0,
'humidity': 0.1})
'windBearing': 90.6,
'temperature': 5.8,
'dewPoint': 5.0,
'pressure': 11.4,
'apparentTemperature': 6.9,
'windSpeed': 2.0,
'humidity': 0.1})
for col in df_weather.columns:
df_weather[col] -= weather_norms['mean'][col]
df_weather[col] /= weather_norms['std'][col]
return df_weather
df = pd.concat([df, df_weather], 1).dropna()
def f2i(f: Path) -> int:
    """Return the integer that follows the last underscore in the file stem.

    Example: ``block_2.csv`` -> ``2``. A stem without any underscore is
    converted as a whole.
    """
    # rpartition yields the text after the final '_' (or the full stem
    # when there is no '_'), which is the block number.
    _, _, block_number = f.stem.rpartition('_')
    return int(block_number)
def is_test(f):
    """Return True when the file's block number leaves remainder 1 modulo 8.

    Files selected by this predicate make up the test split.
    """
    block_number = f2i(f)
    return block_number % 8 == 1
def is_val(f):
    """Return True when the file's block number leaves remainder 1 modulo 7.

    This predicate alone does not exclude files that also satisfy
    ``is_test``; the caller filters those out when building the
    validation split.
    """
    block_number = f2i(f)
    return block_number % 7 == 1
@cache.memoize()
def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), max_files=10, use_logy=False):
df_weather = load_weather_csv(indir/'weather_hourly_darksky.csv')
# Also find bank holidays
df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
holidays = set(df_hols['Bank holidays'].dt.round('D'))
df['holiday'] = df.tstp.apply(lambda dt:dt.floor('D') in holidays).astype(int)
def load_csv(f):
df = pd.read_csv(f, parse_dates=[1], na_values=['Null'])
# Add time features
time = df.tstp
df["month"] = time.dt.month / 12.0
df['day'] = time.dt.day / 310.0
df['week'] = time.dt.week / 52.0
df['hour'] = time.dt.hour / 24.0
df['minute'] = time.dt.minute / 24.0
df['dayofweek'] = time.dt.dayofweek / 7.0
# Do a whole block as one series
df = df.groupby('tstp').mean()
df = df.sort_values('tstp')
# Drop nan and 0's
df = df[df['energy(kWh/hh)'] != 0]
df = df.dropna()
df['block'] = f2i(f)
if use_logy:
df['energy(kWh/hh)'] = np.log(df['energy(kWh/hh)']+1e-4)
df = df.sort_values('tstp')
# Drop nan and 0's
df = df[df['energy(kWh/hh)'] != 0]
df = df.dropna()
# df.index.name = 'tstp'
df['tstp'] = df.index
# join weather and holidays
df = pd.concat([df, df_weather], 1).dropna()
df['holiday'] = df.tstp.apply(lambda dt: dt.floor('D') in holidays).astype(int)
# Add time features
time = df.tstp
df["month"] = time.dt.month / 12.0
df['day'] = time.dt.day / 310.0
df['week'] = time.dt.week / 52.0
df['hour'] = time.dt.hour / 24.0
df['minute'] = time.dt.minute / 24.0
df['dayofweek'] = time.dt.dayofweek / 7.0
if use_logy:
df['energy(kWh/hh)'] = np.log(df['energy(kWh/hh)']+1e-4)
return df
# split data
test_split= -int(len(df) * 0.1)
val_split= int(len(df) * 0.15)
df_test = df[:val_split]
df_train = df[val_split:test_split]
df_val = df[test_split:]
csv_files = list((indir / 'halfhourly_dataset').glob('*.csv'))
csv_files.sort(key=f2i)
csv_files = csv_files[:max_files]
test_files = [f for f in csv_files if is_test(f)]
val_files = [f for f in csv_files if is_val(f) and (not is_test(f))]
train_files = [f for f in csv_files if (not is_val(f)) and (not is_test(f))]
print(len(train_files), len(val_files), len(test_files))
print(train_files, val_files, test_files)
assert not set(train_files).intersection(set(test_files), set(val_files))
assert not set(test_files).intersection(set(val_files))
df_test = pd.concat([load_csv(f) for f in tqdm(test_files, desc='test csv')], 0)
df_val = pd.concat([load_csv(f) for f in tqdm(val_files, desc='val csv')], 0)
df_train = pd.concat([load_csv(f) for f in tqdm(train_files, desc='train csv')], 0)
return df_train, df_val, df_test
+4 -1
View File
@@ -114,7 +114,8 @@ def run_trial(
model, trainer = main(
trial, PL_MODEL_CLS, name=name, MODEL_DIR=MODEL_DIR, train=False, prune=False
)
if number is None:
checkpoints = sorted(Path(trainer.checkpoint_callback.dirpath).glob("*.ckpt"))
if len(checkpoints)==0 or number is None:
try:
trainer.fit(model)
except KeyboardInterrupt:
@@ -147,6 +148,8 @@ def run_trial(
plt.show()
plot_from_loader(model.test_dataloader(), model, i=670, title='test 670')
plt.show()
else:
print('no checkpoints')
try:
trainer.test(model)
+509 -4222
View File
File diff suppressed because one or more lines are too long