This commit is contained in:
wassname
2020-10-18 13:33:24 +08:00
parent 279ef54d86
commit 975c27d5c3
4 changed files with 898 additions and 183 deletions
File diff suppressed because one or more lines are too long
+9 -13
View File
@@ -202,6 +202,8 @@ plt.legend()
df_norm
# These are the columns that we won't know in the future
# We need to blank them out in x_future
columns_blank=['visibility',
'windBearing', 'temperature', 'dewPoint', 'pressure',
'apparentTemperature', 'windSpeed', 'humidity']
@@ -217,10 +219,6 @@ ds_test = Seq2SeqDataSet(df_test,
print(ds_train)
print(ds_test)
# %%timeit
for i in range(100):
ds_train[i]
# we can treat it like an array
ds_train[0]
len(ds_train)
@@ -426,10 +424,8 @@ training_loop(ds_train, ds_test, model, epochs=8, bs=batch_size)
# ## Predict
#
# TODO get working
output_scaler = scaler.transformers[-4][1]
ds_preds = predict(model, ds_test, batch_size*6, device=device, scaler=output_scaler)
ds_preds = predict(model, ds_test, batch_size, device=device, scaler=output_scaler)
ds_preds
# +
@@ -504,11 +500,7 @@ d.plot.scatter('t_ahead_hours', 'likelihood')
# Make a plot of the NLL over time. Does this solution get worse with time?
# this is hard because we need to take the mean over t_ahead
# then group by t_source
d = ds_preds.mean('t_ahead').groupby('t_source').mean()
# And even then it's clearer with smoothing
d.plot.scatter('t_source', 'nll')
d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
plt.xticks(rotation=45)
plt.title('NLL over time (lower is better)')
1
@@ -517,3 +509,7 @@ plt.title('NLL over time (lower is better)')
ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
+23 -24
View File
@@ -12,6 +12,7 @@ def assert_no_objects(df):
assert dtype.name!='object', f'all objects should be pd.categories. {name} is not'
class Seq2SeqDataSet(torch.utils.data.Dataset):
"""
Takes in dataframe and returns sequences through time.
@@ -26,37 +27,34 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
- columns_blank: The columns we will blank out (set to zero) in the future window
"""
super().__init__()
# TODO auto categorical columns
# TODO specify blank future columns
assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
assert df.index.freq is not None, 'should have freq'
# assert_normalized(df)
assert_no_objects(df)
# Use numpy instead of pandas, for speed
self.x = df.drop(columns=columns_target).copy().values
self.y = df[columns_target].copy().values
self.t = df.index.copy()
self.columns = list(df.columns)
self.icol_blank = [df.drop(columns=columns_target).columns.tolist().index(n) for n in columns_blank]
self.df = df
self.window_past = window_past
self.window_future = window_future
self.columns_target = columns_target
# For speed
self._icol_blank = [df.drop(columns = columns_target).columns.tolist().index(n) for n in columns_blank]
self._x = self.df.drop(columns = self.columns_target).values
self._y = self.df[columns_target].values
def get_components(self, i):
"""Get past and future rows."""
x = self.x[i : i + (self.window_past + self.window_future)].copy()
y = self.y[i:i + (self.window_past + self.window_future)].copy()
t = self.t[i:i + (self.window_past + self.window_future)].copy()
t = t.astype(int) * 1e-9 / 60 / 60 / 24 # days
t = t.values
now = t[self.window_past]
x = self._x[i : i + (self.window_past + self.window_future)].copy()
y = self._y[i:i + (self.window_past + self.window_future)].copy()
time = self.df.index.values[i:i + (self.window_past + self.window_future)].copy()
days = time.astype(int) * 1e-9 / 60 / 60 / 24 # days
now = days[self.window_past]
# Add features: days relative to the present time, and an is_past flag
tstp = (t - now)[:, None]
is_past = tstp < 0
x = np.concatenate([x, tstp, is_past], -1)
days_since_present = (days - now)[:, None]
is_past = days_since_present < 0
x = np.concatenate([x, days_since_present, is_past], -1)
# Split into future and past
x_past = x[:self.window_past]
@@ -65,7 +63,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
y_future = y[self.window_past:]
# Stop it cheating by using future weather measurements
x_future[:, self.icol_blank] = 0
x_future[:, self._icol_blank] = 0
return x_past, y_past, x_future, y_future
@@ -83,10 +81,10 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
"""
Output pandas dataframes for display purposes.
"""
x_cols = list(self.columns)[1:] + ['tsp_days', 'is_past']
x_cols = list(self.df.drop(columns=self.columns_target).columns) + ['tsp_days', 'is_past']
x_past, y_past, x_future, y_future = self.get_components(i)
t_past = self.t[i:i+self.window_past]
t_future = self.t[i+self.window_past:i+self.window_past + self.window_future]
t_past = self.df.index[i:i+self.window_past]
t_future = self.df.index[i+self.window_past:i+self.window_past + self.window_future]
x_past = pd.DataFrame(x_past, columns=x_cols, index=t_past)
x_future = pd.DataFrame(x_future, columns=x_cols, index=t_future)
y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
@@ -94,7 +92,8 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
return x_past, y_past, x_future, y_future
def __len__(self):
return len(self.x) - (self.window_past + self.window_future)
return len(self._x) - (self.window_past + self.window_future)
def __repr__(self):
return f'<{type(self).__name__}(shape={self.x.shape}, times={self.t[0]} to {self.t[1]} at {self.t.freq.freqstr})>'
t = self.df.index
return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]} at {t.freq.freqstr})>'
+2 -2
View File
@@ -17,7 +17,7 @@ def predict(model, ds_test, batch_size, device='cpu', scaler=None):
It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.
"""
load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
freq = ds_test.t.freq
freq = ds_test.df.index.freq
xrs = []
for i, batch in enumerate(tqdm(load_test, desc='predict')):
model.eval()
@@ -35,7 +35,7 @@ def predict(model, ds_test, batch_size, device='cpu', scaler=None):
# Make an xarray.Dataset for the data
bs = y_future.shape[0]
t_source = ds_test.t[i:i+bs].values
t_source = ds_test.df.index[i:i+bs].values
t_ahead = pd.timedelta_range(0, periods=ds_test.window_future, freq=freq).values
t_behind = pd.timedelta_range(end=-pd.Timedelta(freq), periods=ds_test.window_past, freq=freq)
xr_out = xr.Dataset(