working

2026-06-27 17:50:09 +08:00 · 2020-10-18 13:33:24 +08:00
parent 279ef54d86
commit 975c27d5c3
4 changed files with 898 additions and 183 deletions
@@ -202,6 +202,8 @@ plt.legend()
 df_norm


+# These are the columns that we won't know in the future
+# We need to blank them out in x_future
 columns_blank=['visibility',
       'windBearing', 'temperature', 'dewPoint', 'pressure',
       'apparentTemperature', 'windSpeed', 'humidity']
@@ -217,10 +219,6 @@ ds_test = Seq2SeqDataSet(df_test,
 print(ds_train)
 print(ds_test)

-# %%timeit
-for i in range(100):
-    ds_train[i]
-
 # we can treat it like an array
 ds_train[0]
 len(ds_train)
@@ -426,10 +424,8 @@ training_loop(ds_train, ds_test, model, epochs=8, bs=batch_size)
 # ## Predict
 #

-# TODO get working
-output_scaler = scaler.transformers[-4][1]
-ds_preds = predict(model, ds_test, batch_size*6, device=device, scaler=output_scaler)
-
+ds_preds = predict(model, ds_test, batch_size, device=device, scaler=output_scaler)
+ds_preds


 # +
@@ -504,11 +500,7 @@ d.plot.scatter('t_ahead_hours', 'likelihood')


 # Make a plot of the NLL over time. Does this solution get worse with time?
-# this is hard because we need to take the mean over t_ahead
-# then group by t_source
-d = ds_preds.mean('t_ahead').groupby('t_source').mean()
-# And even then it's clearer with smoothing
-d.plot.scatter('t_source', 'nll')
+d = ds_preds.mean('t_ahead').groupby('t_source').mean().plot.scatter('t_source', 'nll')
 plt.xticks(rotation=45)
 plt.title('NLL over time (lower is better)')
 1
@@ -517,3 +509,7 @@ plt.title('NLL over time (lower is better)')
 ds_preds.plot.scatter('y_true', 'y_pred', s=.01)


+
+
+
+
@@ -12,6 +12,7 @@ def assert_no_objects(df):
        assert dtype.name!='object', f'all objects should be pd.categories. {name} is not'


+
 class Seq2SeqDataSet(torch.utils.data.Dataset):
    """
    Takes in dataframe and returns sequences through time.
@@ -26,37 +27,34 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        - columns_blank: The columns we will blank, in the future
        """
        super().__init__()
-        # TODO auto categorical columns
-        # TODO specify blank future columns
        assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
        assert df.index.freq is not None, 'should have freq'
-        # assert_normalized(df)
        assert_no_objects(df)

-        # Use numpy instead of pandas, for speed
-        self.x = df.drop(columns=columns_target).copy().values
-        self.y = df[columns_target].copy().values
-        self.t = df.index.copy()
-        self.columns = list(df.columns)
-        self.icol_blank = [df.drop(columns=columns_target).columns.tolist().index(n) for n in columns_blank]
+        self.df = df

        self.window_past = window_past
        self.window_future = window_future
        self.columns_target = columns_target

+        # For speed
+        self._icol_blank = [df.drop(columns = columns_target).columns.tolist().index(n) for n in columns_blank]
+        self._x = self.df.drop(columns = self.columns_target).values
+        self._y = self.df[columns_target].values
+
    def get_components(self, i):
        """Get past and future rows."""
-        x = self.x[i : i + (self.window_past + self.window_future)].copy()
-        y = self.y[i:i + (self.window_past + self.window_future)].copy()
-        t = self.t[i:i + (self.window_past + self.window_future)].copy()
-        t = t.astype(int) * 1e-9 / 60 / 60 / 24  # days
-        t = t.values
-        now = t[self.window_past]
+        x = self._x[i : i + (self.window_past + self.window_future)].copy()
+        y = self._y[i:i + (self.window_past + self.window_future)].copy()        
+        time = self.df.index.values[i:i + (self.window_past + self.window_future)].copy()
+
+        days = time.astype(int) * 1e-9 / 60 / 60 / 24  # days
+        now = days[self.window_past]
        
        # Add features: days relative to the present time, and an is_past flag
-        tstp = (t - now)[:, None]
-        is_past = tstp < 0
-        x = np.concatenate([x, tstp, is_past], -1)
+        days_since_present = (days - now)[:, None]
+        is_past = days_since_present < 0
+        x = np.concatenate([x, days_since_present, is_past], -1)
        
        # Split into future and past
        x_past = x[:self.window_past]
@@ -65,7 +63,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        y_future = y[self.window_past:]

        # Stop it cheating by using future weather measurements
-        x_future[:, self.icol_blank] = 0
+        x_future[:, self._icol_blank] = 0
        return x_past, y_past, x_future, y_future


@@ -83,10 +81,10 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        """
        Output pandas dataframes for display purposes.
        """
-        x_cols = list(self.columns)[1:] + ['tsp_days', 'is_past']
+        x_cols = list(self.df.drop(columns=self.columns_target).columns) + ['tsp_days', 'is_past']
        x_past, y_past, x_future, y_future = self.get_components(i)
-        t_past = self.t[i:i+self.window_past]
-        t_future = self.t[i+self.window_past:i+self.window_past + self.window_future]
+        t_past = self.df.index[i:i+self.window_past]
+        t_future = self.df.index[i+self.window_past:i+self.window_past + self.window_future]
        x_past = pd.DataFrame(x_past, columns=x_cols, index=t_past)
        x_future = pd.DataFrame(x_future, columns=x_cols, index=t_future)
        y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
@@ -94,7 +92,8 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        return x_past, y_past, x_future, y_future
        
    def __len__(self):
-        return len(self.x) - (self.window_past + self.window_future)
+        return len(self._x) - (self.window_past + self.window_future)
    
    def __repr__(self):
-        return f'<{type(self).__name__}(shape={self.x.shape}, times={self.t[0]} to {self.t[1]} at {self.t.freq.freqstr})>'
+        t = self.df.index
+        return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]} at {t.freq.freqstr})>'
@@ -17,7 +17,7 @@ def predict(model, ds_test, batch_size, device='cpu', scaler=None):
    It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.
    """
    load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
-    freq = ds_test.t.freq
+    freq = ds_test.df.index.freq
    xrs = []
    for i, batch in enumerate(tqdm(load_test, desc='predict')):
        model.eval()
@@ -35,7 +35,7 @@ def predict(model, ds_test, batch_size, device='cpu', scaler=None):

        # Make an xarray.Dataset for the data
        bs = y_future.shape[0]
-        t_source = ds_test.t[i:i+bs].values
+        t_source = ds_test.df.index[i:i+bs].values
        t_ahead = pd.timedelta_range(0, periods=ds_test.window_future, freq=freq).values
        t_behind = pd.timedelta_range(end=-pd.Timedelta(freq), periods=ds_test.window_past, freq=freq)
        xr_out = xr.Dataset(