misc

2026-06-27 18:44:28 +08:00 · 2020-10-27 06:43:50 +08:00
parent 6eda47b76f
commit 052fd6596c
9 changed files with 2920 additions and 21 deletions
@@ -7,9 +7,11 @@ from sklearn_pandas import DataFrameMapper
 import xarray as xr
 import pandas as pd
 import numpy as np
+import zipfile

 from .dataset import Seq2SeqDataSet
 from .util import normalize_encode_dataframe, timeseries_split
+from ..util import dset_to_nc
 from .tidal import generate_tidal_periods


@@ -77,20 +79,18 @@ class GasSensor(RegressionForecastData):
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00487/gas-sensor-array-temperature-modulation.zip'
        
        # download if needed
-        extract_path = self.datasets_root/'GasSensor'
-        files = sorted(extract_path.glob('*.csv'))
-        if len(files)<13:
-            print('download_and_extract_archive')
-            download_and_extract_archive(url, self.datasets_root, extract_path)
+        # extract_path = self.datasets_root/'gas-sensor-array-temperature-modulation.zip'
+        download_url(url, self.datasets_root)
        
-        # Load csv's
-        files = sorted(extract_path.glob('*.csv'))
-        dfs = []
-        for f in files:
-            now = pd.to_datetime(f.stem, format='%Y%m%d_%H%M%S')
-            df = pd.read_csv(f)
-            df.index = pd.to_timedelta(df['Time (s)'], unit='s') + now
-            dfs.append(df)
+        # Load csv's from inside zip
+        # NOTE(review): `Path` must be importable at module level
+        # (`from pathlib import Path`); the import is not visible in this hunk — confirm.
+        zf = zipfile.ZipFile(self.datasets_root / 'gas-sensor-array-temperature-modulation.zip')
+        dfs=[]
+        for f in zf.namelist():
+            if f.endswith('.csv'):
+                # The member's file stem is parsed as the start timestamp of
+                # that csv; the 'Time (s)' column is then offset from it.
+                now = pd.to_datetime(Path(f).stem, format='%Y%m%d_%H%M%S')
+                df = pd.read_csv(zf.open(f))
+                df.index = pd.to_timedelta(df['Time (s)'], unit='s') + now
+                dfs.append(df)
        self.df = pd.concat(dfs).dropna(subset=self.columns_target)

        df = df[[ 'CO (ppm)', 'Humidity (%r.h.)', 'Temperature (C)',
@@ -272,11 +272,7 @@ def get_current_timeseries(
        # Add tidal freqs
        xd = xd.merge(df_eta)

-        # Cache to nc
-        xd.to_netcdf(outfile)
-        print(
-            f'wrote "{outfile}" with size {outfile.stat().st_size*1e-6:2.2f} MB'
-        )
+        dset_to_nc(xd, outfile)
    return outfile


@@ -31,7 +31,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
        assert df.index.freq is not None, 'should have freq'
        assert_no_objects(df)

-        self.df = df.dropna(subset=columns_target)
+        self.df = df.dropna(subset=columns_target).ffill()

        self.window_past = window_past
        self.window_future = window_future
@@ -100,7 +100,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
    
    def __repr__(self):
        t = self.df.index
-        return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[1]} at {t.freq.freqstr})>'
+        # Report the full covered span: first timestamp to LAST timestamp
+        # (`t[-1]`); `t[1]` would only be the second entry of the index.
+        return f'<{type(self).__name__}(shape={self.df.shape}, times={t[0]} to {t[-1]})>'


 class Seq2SeqDataSets(torch.utils.data.Dataset):
@@ -16,4 +16,4 @@ def normalize_encode_dataframe(df, encoder=OrdinalEncoder):
 def timeseries_split(df, test_fraction=0.2):
    """Split timeseries data with test in the future"""
    i = int(len(df)*test_fraction)
-    return df.iloc[:i], df.iloc[i:]
+    # Slice from the front rather than with a negative index: `-0` is `0`,
+    # so `iloc[:-i]` / `iloc[-i:]` would return an empty train set and put
+    # every row in the test set whenever `i` rounds down to 0.
+    split = len(df) - i
+    return df.iloc[:split], df.iloc[split:]
@@ -0,0 +1,23 @@
+import numpy as np
+
+# Tiny constant added to the sMAPE denominator so that it is never exactly zero.
+EPSILON = 1e-10
+
+def _error(actual: np.ndarray, predicted: np.ndarray):
+    """Element-wise residual: ``actual`` minus ``predicted``."""
+    residual = actual - predicted
+    return residual
+
+def mse(actual: np.ndarray, predicted: np.ndarray):
+    """Mean of the squared element-wise differences between the two arrays."""
+    squared_residuals = np.square(actual - predicted)
+    return np.mean(squared_residuals)
+
+def rmse(actual: np.ndarray, predicted: np.ndarray):
+    """Square root of the mean squared element-wise difference."""
+    mean_squared = np.mean(np.square(actual - predicted))
+    return np.sqrt(mean_squared)
+
+
+def smape(actual: np.ndarray, predicted: np.ndarray):
+    """
+    Symmetric Mean Absolute Percentage Error.
+
+    The value is returned as a fraction: it is NOT multiplied by 100.
+    """
+    abs_diff = np.abs(actual - predicted)
+    # EPSILON keeps the division defined when both inputs are zero.
+    scale = (np.abs(actual) + np.abs(predicted)) + EPSILON
+    return np.mean(2.0 * abs_diff / scale)
@@ -0,0 +1,146 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+
+
+class Chomp1d(nn.Module):
+    """Drop the last ``chomp_size`` entries along the third axis of the input.
+
+    Args:
+        chomp_size: number of trailing steps to remove; ``0`` removes nothing.
+    """
+
+    def __init__(self, chomp_size):
+        super(Chomp1d, self).__init__()
+        self.chomp_size = chomp_size
+
+    def forward(self, x):
+        """Return ``x`` without its last ``chomp_size`` steps, as a contiguous tensor."""
+        # `x[:, :, :-0]` is `x[:, :, :0]`, i.e. an EMPTY tensor, so a zero
+        # chomp has to be handled explicitly as "remove nothing".
+        if self.chomp_size == 0:
+            return x.contiguous()
+        return x[:, :, : -self.chomp_size].contiguous()
+
+
+class Conv(nn.Module):
+    """1-D convolution whose trailing padded steps can be trimmed off.
+
+    The input is convolved with ``nn.Conv1d`` using the given ``padding``;
+    when ``causal`` is true the last ``padding`` output steps are then
+    removed with ``Chomp1d``.
+    """
+
+    def __init__(
+        self,
+        n_inputs,
+        n_outputs,
+        kernel_size,
+        stride,
+        dilation,
+        padding,
+        causal=True,
+    ):
+        super().__init__()
+        conv_options = dict(stride=stride, padding=padding, dilation=dilation)
+        self.conv = nn.Conv1d(n_inputs, n_outputs, kernel_size, **conv_options)
+        self.chomp = Chomp1d(padding)
+        self.causal = causal
+
+    def forward(self, x):
+        """Convolve ``x`` and, in causal mode, trim the trailing steps."""
+        convolved = self.conv(x)
+        if not self.causal:
+            return convolved
+        return self.chomp(convolved)
+
+
+class TemporalBlock(nn.Module):
+    """Two ``Conv -> ReLU -> Dropout`` stages plus a residual connection.
+
+    When ``n_inputs`` differs from ``n_outputs`` the residual branch goes
+    through a kernel-size-1, non-causal ``Conv`` so that both branches have
+    the same number of channels before they are added.
+    """
+
+    def __init__(
+        self,
+        n_inputs,
+        n_outputs,
+        kernel_size,
+        stride,
+        dilation,
+        padding,
+        dropout=0.2,
+    ):
+        super(TemporalBlock, self).__init__()
+        # Both convolution stages share the same geometry.
+        geometry = dict(stride=stride, padding=padding, dilation=dilation)
+
+        self.conv1 = Conv(n_inputs, n_outputs, kernel_size, **geometry)
+        self.relu1 = nn.ReLU()
+        self.dropout1 = nn.Dropout(dropout)
+
+        self.conv2 = Conv(n_outputs, n_outputs, kernel_size, **geometry)
+        self.relu2 = nn.ReLU()
+        self.dropout2 = nn.Dropout(dropout)
+
+        stages = [
+            self.conv1,
+            self.relu1,
+            self.dropout1,
+            self.conv2,
+            self.relu2,
+            self.dropout2,
+        ]
+        self.net = nn.Sequential(*stages)
+
+        if n_inputs != n_outputs:
+            # 1x1 projection of the residual branch to `n_outputs` channels.
+            self.downsample = Conv(
+                n_inputs,
+                n_outputs,
+                1,
+                stride=1,
+                padding=0,
+                dilation=1,
+                causal=False,
+            )
+        else:
+            self.downsample = None
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        """Return ``relu(net(x) + residual(x))``."""
+        out = x
+        for layer in self.net:
+            out = layer(out)
+        if self.downsample is None:
+            res = x
+        else:
+            res = self.downsample(x)
+        return self.relu(out + res)
+
+
+class TemporalConvNet(nn.Module):
+    """
+    Stack of ``TemporalBlock`` layers with dilation doubling at every level.
+
+    One block is created per entry of ``num_channels``; level ``i`` uses
+    dilation ``2 ** i`` and padding ``(kernel_size - 1) * 2 ** i``.
+    ``num_embeddings`` and ``embedding_dim`` are accepted but not used by
+    this class.
+
+    See:
+    - https://arxiv.org/pdf/1803.01271.pdf
+    - https://github.com/locuslab/TCN
+    """
+    def __init__(
+        self,
+        num_inputs,
+        num_channels,
+        num_embeddings=0,
+        kernel_size=2,
+        dropout=0.2,
+        embedding_dim=2,
+    ):
+        super(TemporalConvNet, self).__init__()
+        blocks = []
+        # The first block consumes the raw inputs; every later block
+        # consumes the output channels of the block before it.
+        in_channels = num_inputs
+        for level, out_channels in enumerate(num_channels):
+            dilation_size = 2 ** level
+            blocks.append(
+                TemporalBlock(
+                    in_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation_size,
+                    padding=(kernel_size - 1) * dilation_size,
+                    dropout=dropout,
+                )
+            )
+            in_channels = out_channels
+        self.network = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        """Pass ``x`` through every block in order and return the result."""
+        out = x
+        for block in self.network:
+            out = block(out)
+        return out
@@ -1,6 +1,9 @@
 from pathlib import Path
 import torch
+import xarray as xr
+import logging

+# NOTE(review): the logger is keyed by this file's path (`__file__`), not by
+# the dotted module name (`__name__`) — confirm this is intended.
+logger = logging.getLogger(__file__)
+# Directory one level above the directory that contains this file.
 project_dir = Path(__file__).parent.parent

 def to_numpy(x):
@@ -12,3 +15,11 @@ def to_numpy(x):
 def mask_upper_triangular(N, device):
    """Causal attention."""
+    # Builds an (N, N) boolean matrix that is True strictly above the main
+    # diagonal (column index > row index) and False elsewhere, then moves it
+    # to `device`.
+    # NOTE(review): presumably True marks the future positions to be masked
+    # out of attention — confirm against the caller.
    return torch.triu(torch.ones(N, N), diagonal=1).to(device).bool()
+
+def dset_to_nc(dset, f, engine="netcdf4", compression={"zlib": True}):
+    """Write an xarray object to a netCDF file and log the resulting size.
+
+    Args:
+        dset: an ``xr.Dataset``, or an ``xr.DataArray`` (which is first
+            wrapped in a dataset under the variable name ``"data"``).
+        f: output path, either ``str`` or ``pathlib.Path``.
+        engine: forwarded to ``to_netcdf``.
+        compression: encoding options applied to every data variable.
+    """
+    if isinstance(dset, xr.DataArray):
+        dset = dset.to_dataset(name="data")
+    # Honour the caller's `compression` instead of a hard-coded
+    # {"zlib": True}; each variable gets its own copy so the shared default
+    # dict is never handed out to xarray.
+    encoding = {k: dict(compression) for k in dset.data_vars}
+    logger.info(f"saving to {f}")
+    dset.to_netcdf(f, engine=engine, encoding=encoding)
+    # Plain string paths have no `.stem` / `.stat()`, so normalise to a Path
+    # before reporting the size.
+    out = Path(f)
+    logger.info(f"Wrote {out.stem}.nc size={out.stat().st_size/1e6} M")