mirror of
https://github.com/wassname/seq2seq-time.git
synced 2026-06-27 17:50:09 +08:00
working
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -202,6 +202,8 @@ plt.legend()
|
||||
df_norm
|
||||
|
||||
|
||||
# These are the columns that we won't know in the future
|
||||
# We need to blank them out in x_future
|
||||
columns_blank=['visibility',
|
||||
'windBearing', 'temperature', 'dewPoint', 'pressure',
|
||||
'apparentTemperature', 'windSpeed', 'humidity']
|
||||
@@ -217,10 +219,6 @@ ds_test = Seq2SeqDataSet(df_test,
|
||||
print(ds_train)
|
||||
print(ds_test)
|
||||
|
||||
# %%timeit
|
||||
for i in range(100):
|
||||
ds_train[i]
|
||||
|
||||
# we can treat it like an array
|
||||
ds_train[0]
|
||||
len(ds_train)
|
||||
@@ -426,10 +424,8 @@ training_loop(ds_train, ds_test, model, epochs=8, bs=batch_size)
|
||||
# ## Predict
|
||||
#
|
||||
|
||||
# TODO get working
|
||||
output_scaler = scaler.transformers[-4][1]
|
||||
ds_preds = predict(model, ds_test, batch_size*6, device=device, scaler=output_scaler)
|
||||
|
||||
ds_preds = predict(model, ds_test, batch_size, device=device, scaler=output_scaler)
|
||||
ds_preds
|
||||
|
||||
|
||||
# +
|
||||
@@ -504,11 +500,7 @@ d.plot.scatter('t_ahead_hours', 'likelihood')
|
||||
|
||||
|
||||
# Make a plot of the NLL over time. Does this solution get worse with time?
|
||||
# this is hard because we need to take the mean over t_ahead
|
||||
# then group by t_source
|
||||
d = ds_preds.mean('t_ahead').groupby('t_source').mean()
|
||||
# And even then it's clearer with smoothing
|
||||
d.plot.scatter('t_source', 'nll')
|
||||
d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
|
||||
plt.xticks(rotation=45)
|
||||
plt.title('NLL over time (lower is better)')
|
||||
1
|
||||
@@ -517,3 +509,7 @@ plt.title('NLL over time (lower is better)')
|
||||
ds_preds.plot.scatter('y_true', 'y_pred', s=.01)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ def assert_no_objects(df):
|
||||
assert dtype.name!='object', f'all objects should be pd.categories. {name} is not'
|
||||
|
||||
|
||||
|
||||
class Seq2SeqDataSet(torch.utils.data.Dataset):
|
||||
"""
|
||||
Takes in dataframe and returns sequences through time.
|
||||
@@ -26,37 +27,34 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
|
||||
- columns_blank: The columns we will blank out (set to zero) in the future window
|
||||
"""
|
||||
super().__init__()
|
||||
# TODO auto categorical columns
|
||||
# TODO specify blank future columns
|
||||
assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
|
||||
assert df.index.freq is not None, 'should have freq'
|
||||
# assert_normalized(df)
|
||||
assert_no_objects(df)
|
||||
|
||||
# Use numpy instead of pandas, for speed
|
||||
self.x = df.drop(columns=columns_target).copy().values
|
||||
self.y = df[columns_target].copy().values
|
||||
self.t = df.index.copy()
|
||||
self.columns = list(df.columns)
|
||||
self.icol_blank = [df.drop(columns=columns_target).columns.tolist().index(n) for n in columns_blank]
|
||||
self.df = df
|
||||
|
||||
self.window_past = window_past
|
||||
self.window_future = window_future
|
||||
self.columns_target = columns_target
|
||||
|
||||
# For speed
|
||||
self._icol_blank = [df.drop(columns = columns_target).columns.tolist().index(n) for n in columns_blank]
|
||||
self._x = self.df.drop(columns = self.columns_target).values
|
||||
self._y = self.df[columns_target].values
|
||||
|
||||
def get_components(self, i):
|
||||
"""Get past and future rows."""
|
||||
x = self.x[i : i + (self.window_past + self.window_future)].copy()
|
||||
y = self.y[i:i + (self.window_past + self.window_future)].copy()
|
||||
t = self.t[i:i + (self.window_past + self.window_future)].copy()
|
||||
t = t.astype(int) * 1e-9 / 60 / 60 / 24 # days
|
||||
t = t.values
|
||||
now = t[self.window_past]
|
||||
x = self._x[i : i + (self.window_past + self.window_future)].copy()
|
||||
y = self._y[i:i + (self.window_past + self.window_future)].copy()
|
||||
time = self.df.index.values[i:i + (self.window_past + self.window_future)].copy()
|
||||
|
||||
days = time.astype(int) * 1e-9 / 60 / 60 / 24 # days
|
||||
now = days[self.window_past]
|
||||
|
||||
# Add features: relative days since present time, and an is-past flag
|
||||
tstp = (t - now)[:, None]
|
||||
is_past = tstp < 0
|
||||
x = np.concatenate([x, tstp, is_past], -1)
|
||||
days_since_present = (days - now)[:, None]
|
||||
is_past = days_since_present < 0
|
||||
x = np.concatenate([x, days_since_present, is_past], -1)
|
||||
|
||||
# Split into future and past
|
||||
x_past = x[:self.window_past]
|
||||
@@ -65,7 +63,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
|
||||
y_future = y[self.window_past:]
|
||||
|
||||
# Stop it cheating by using future weather measurements
|
||||
x_future[:, self.icol_blank] = 0
|
||||
x_future[:, self._icol_blank] = 0
|
||||
return x_past, y_past, x_future, y_future
|
||||
|
||||
|
||||
@@ -83,10 +81,10 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
|
||||
"""
|
||||
Output pandas dataframes for display purposes.
|
||||
"""
|
||||
x_cols = list(self.columns)[1:] + ['tsp_days', 'is_past']
|
||||
x_cols = list(self.df.drop(columns=self.columns_target).columns) + ['tsp_days', 'is_past']
|
||||
x_past, y_past, x_future, y_future = self.get_components(i)
|
||||
t_past = self.t[i:i+self.window_past]
|
||||
t_future = self.t[i+self.window_past:i+self.window_past + self.window_future]
|
||||
t_past = self.df.index[i:i+self.window_past]
|
||||
t_future = self.df.index[i+self.window_past:i+self.window_past + self.window_future]
|
||||
x_past = pd.DataFrame(x_past, columns=x_cols, index=t_past)
|
||||
x_future = pd.DataFrame(x_future, columns=x_cols, index=t_future)
|
||||
y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
|
||||
@@ -94,7 +92,8 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
|
||||
return x_past, y_past, x_future, y_future
|
||||
|
||||
def __len__(self):
|
||||
return len(self.x) - (self.window_past + self.window_future)
|
||||
return len(self._x) - (self.window_past + self.window_future)
|
||||
|
||||
def __repr__(self):
|
||||
return f'<{type(self).__name__}(shape={self.x.shape}, times={self.t[0]} to {self.t[1]} at {self.t.freq.freqstr})>'
|
||||
t = self.df.index
|
||||
return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]} at {t.freq.freqstr})>'
|
||||
|
||||
@@ -17,7 +17,7 @@ def predict(model, ds_test, batch_size, device='cpu', scaler=None):
|
||||
It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.
|
||||
"""
|
||||
load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
|
||||
freq = ds_test.t.freq
|
||||
freq = ds_test.df.index.freq
|
||||
xrs = []
|
||||
for i, batch in enumerate(tqdm(load_test, desc='predict')):
|
||||
model.eval()
|
||||
@@ -35,7 +35,7 @@ def predict(model, ds_test, batch_size, device='cpu', scaler=None):
|
||||
|
||||
# Make an xarray.Dataset for the data
|
||||
bs = y_future.shape[0]
|
||||
t_source = ds_test.t[i:i+bs].values
|
||||
t_source = ds_test.df.index[i:i+bs].values
|
||||
t_ahead = pd.timedelta_range(0, periods=ds_test.window_future, freq=freq).values
|
||||
t_behind = pd.timedelta_range(end=-pd.Timedelta(freq), periods=ds_test.window_past, freq=freq)
|
||||
xr_out = xr.Dataset(
|
||||
|
||||
Reference in New Issue
Block a user