multiple datasets

2026-06-27 16:31:46 +08:00 · 2020-10-18 14:12:53 +08:00
parent 975c27d5c3
commit 17fb62e766
3 changed files with 861 additions and 1621 deletions
@@ -29,7 +29,8 @@

 #
 # - [ ] TODO mike autocorrelation baseline
-# - [ ] TODO mike acorn data
+# - [x] TODO mike acorn data
+# - [ ] TODO mike handle multiple houses. Multiindex

 # OPTIONAL: Load the "autoreload" extension so that code can change. But blacklist large modules
 # %load_ext autoreload
@@ -88,15 +89,13 @@ max_rows = 1e5

 # +

-def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london')):
+def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london'), max_files=1):
    """
    Data loading and cleaning is always messy, so understanding this code is optional.
    """
    
    # Load csv files
-    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:1]
-    
-#     import pdb; pdb.set_trace() # you can use debugging in jupyter to interact with variables inside a function
+    csv_files = sorted((indir/'halfhourly_dataset').glob('*.csv'))[:max_files]
    
    # concatenate them
    df = pd.concat([pd.read_csv(f, parse_dates=[1], na_values=['Null']) for f in csv_files])
@@ -105,56 +104,56 @@ def get_smartmeter_df(indir=Path('../data/raw/smart-meters-in-london')):
    df_households = pd.read_csv(indir/'informations_households.csv')
    df_households = df_households[['LCLid', 'stdorToU', 'Acorn_grouped']]
    df = pd.merge(df, df_households, on='LCLid')
-
-    # Take the mean over all houses
-    name, df = next(iter(df.groupby('LCLid')))
+    
    df = df.set_index('tstp')
-    print(df)
-
-    # Load weather data
-    df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
-    use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
-           'pressure', 'apparentTemperature', 'windSpeed', 
-           'humidity']
-    df_weather = df_weather[use_cols].set_index('time')
-    df_weather = df_weather.resample(freq).first().ffill()  # Resample to match energy data    
-
-    # Join weather and energy data
-    df = pd.concat([df, df_weather], 1).dropna()    
    
-    # Also find bank holidays
-    df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
-    holidays = set(df_hols['Bank holidays'].dt.round('D'))  
-
+    # Drop nan and 0's
+    df = df[df['energy(kWh/hh)']!=0]
+    df = df.dropna()
+    
+    # Add time features 
    time = df.index.to_series()
-    def is_holiday(dt):
-        return dt.floor('D') in holidays
-    df['holiday'] = time.apply(is_holiday).astype(int)
-    
-    # TODO pd.read_csv('../data/raw/smart-meters-in-london/acorn_details.csv', engine='python')
-
-
-    # Add time features    
    df["month"] = time.dt.month
    df['day'] = time.dt.day
    df['week'] = time.dt.week
    df['hour'] = time.dt.hour
    df['minute'] = time.dt.minute
    df['dayofweek'] = time.dt.dayofweek
-
-    # Drop nan and 0's
-    df = df[df['energy(kWh/hh)']!=0]
-    df = df.dropna()
-
-    # sort by time
-    df = df.sort_index()
    
-    return df
+    # Load weather data
+    df_weather = pd.read_csv(indir/'weather_hourly_darksky.csv', parse_dates=[3])
+    use_cols = ['visibility', 'windBearing', 'temperature', 'time', 'dewPoint',
+           'pressure', 'apparentTemperature', 'windSpeed', 
+           'humidity']
+    df_weather = df_weather[use_cols].set_index('time')
+    df_weather = df_weather.resample(freq).first().ffill()  # Resample to match energy data   
+    
+    # Join weather and energy data
+    df = pd.merge(df, df_weather, how='inner', left_index=True, right_index=True, sort=True)
+    
+    # Holidays
+    df_hols = pd.read_csv(indir/'uk_bank_holidays.csv', parse_dates=[0])
+    holidays = set(df_hols['Bank holidays'].dt.round('D'))  
+    def is_holiday(dt):
+        return dt in holidays
+    days = df.index.floor('D')
+    holiday_mapping = days.unique().to_series().apply(is_holiday).astype(int).to_dict()
+    df['holiday'] = days.to_series().map(holiday_mapping).values
+
+    # Loop over houses
+    for name, df_h in df.groupby('LCLid'):
+
+        yield df_h
+
+
 # -
 # Our dataset is the london smartmeter data. But at half hour intervals

 # +
-df = get_smartmeter_df()
+dfs = get_smartmeter_df()
+
+# Just get the first one for now
+df = next(iter(dfs))

 # df = df.resample(freq).first().dropna() # Where empty we will backfill, this will respect causality, and mostly maintain the mean

@@ -181,7 +180,7 @@ df_norm = scaler.fit_transform(df)
 df_norm
 # -

-output_scaler = next(filter(lambda r:r[0][0] in columns_target, mapper4.features))[-1]
+output_scaler = next(filter(lambda r:r[0][0] in columns_target, scaler.features))[-1]
 output_scaler

 # # Resample
@@ -202,6 +201,8 @@ plt.legend()
 df_norm


+# ### Dataset
+
 # These are the columns that we won't know in the future
 # We need to blank them out in x_future
 columns_blank=['visibility',
@@ -318,14 +319,12 @@ print(output)
 from torchsummaryX import summary
 summary(model, past_x, past_y, future_x, future_y )
 1
+
+
 # -

 # ## Training

-
-
-
-
 # +
 def train_epoch(ds, model, bs=128):
    model.train()
@@ -481,10 +480,7 @@ plot_prediction(ds_preds, 48) # 12 hours later


 # +
-d = ds_preds.mean('t_source') # Mean over all predictions
-
-# Plot with xarray, it has a pandas like interface
-d.plot.scatter('t_ahead_hours', 'nll')
+ds_preds.mean('t_source').plot.scatter('t_ahead_hours', 'nll') # Mean over all predictions

 # Tidy the graph
 n = len(ds_preds.t_source)
@@ -493,10 +489,6 @@ plt.xlabel('Hours ahead')
 plt.title(f'NLL vs time (no. samples={n})')
 # -

-d = ds_preds.mean('t_source') # Mean over all predictions
-d['likelihood'] = np.exp(-d.nll) # get likelihood, after taking mean in log domain
-d.plot.scatter('t_ahead_hours', 'likelihood')
-


 # Make a plot of the NLL over time. Does this solution get worse with time?
@@ -513,3 +505,5 @@ ds_preds.plot.scatter('y_true', 'y_pred', s=.01)



+
+
@@ -1,6 +1,7 @@
 import pandas as pd
 import torch.utils.data
 import numpy as np
+import typing

 def assert_normalized(df):
    stats = df.describe().T
@@ -12,7 +13,6 @@ def assert_no_objects(df):
        assert dtype.name!='object', f'all objects should be pd.categories. {name} is not'


-
 class Seq2SeqDataSet(torch.utils.data.Dataset):
    """
    Takes in dataframe and returns sequences through time.
@@ -90,6 +90,8 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        y_past = pd.DataFrame(y_past, columns=self.columns_target, index=t_past)
        y_future = pd.DataFrame(y_future, columns=self.columns_target, index=t_future)
        return x_past, y_past, x_future, y_future
+
+
        
    def __len__(self):
        return len(self._x) - (self.window_past + self.window_future)
@@ -97,3 +99,44 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
    def __repr__(self):
        t = self.df.index
        return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]} at {t.freq.freqstr})>'
+
+
+class Seq2SeqDataSets(torch.utils.data.Dataset):
+    """
+    Several Seq2SeqDataSet's (one per dataframe) exposed as a single flat dataset.
+
+    Items are ordered dataset by dataset: a global index i is translated into a
+    (dataset, local index) pair. kwargs are forwarded to every Seq2SeqDataSet.
+
+    See Seq2SeqDataSet
+    """
+    def __init__(self, dfs: typing.List[pd.DataFrame], **kwargs):
+        self.datasets = [Seq2SeqDataSet(df, **kwargs) for df in dfs]
+
+    def _locate(self, i):
+        """Return (dataset, local index) for global index i, or raise IndexError."""
+        if i < 0:
+            i += len(self)  # negative indices count from the end, like a list
+        if i >= 0:
+            for d in self.datasets:
+                n = len(d)
+                if i < n:
+                    return d, i
+                i -= n  # make the index relative to the next dataset
+        raise IndexError('index out of range')
+
+    def __getitem__(self, i):
+        d, j = self._locate(i)
+        return d[j]  # index the sub-dataset with its local index, not the global one
+
+    def get_rows(self, i):
+        """Output pandas dataframes for display purposes."""
+        # NOTE(review): delegates to Seq2SeqDataSet.get_rows (assumed to exist - confirm); this wrapper has no self.df
+        d, j = self._locate(i)
+        return d.get_rows(j)
+
+    def __len__(self):
+        return sum(len(d) for d in self.datasets)
+
+    def __repr__(self):
+        return f'<{type(self).__name__}({self.datasets})>'