tidy data

This commit is contained in:
wassname
2020-10-26 14:41:07 +08:00
parent 49fdfdc3ab
commit 3ee1c5bbb8
11 changed files with 2405 additions and 4369 deletions
+2 -2
View File
@@ -8,7 +8,7 @@ PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = seq2seq-time
PYTHON_INTERPRETER = seq2seq-time
PYTHON_INTERPRETER = python
#################################################################################
# COMMANDS #
@@ -70,7 +70,7 @@ test:
doc_reqs:
conda env export --no-builds --from-history --name $(PROJECT_NAME) > requirements/environment.min.yaml
conda env export --name $(PROJECT_NAME) > requirements/environment.max.yaml
$(PYTHON_INTERPRETER) -m pip freeze > requirements/requirements.txt --name
$(PYTHON_INTERPRETER) -m pip freeze > requirements/requirements.txt
#################################################################################
# Self Documenting Commands #
File diff suppressed because one or more lines are too long
-725
View File
@@ -1,725 +0,0 @@
{
"cells": [
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Here I download and preprocess ocean current (water velocity) data from the IMOS ANMN WATR20 mooring.\n",
  "\n",
  "\n",
  "See:\n",
  "- Catalogue record: https://catalogue-imos.aodn.org.au/geonetwork/srv/api/records/ae86e2f5-eaaf-459e-a405-e654d85adb9c\n",
  "- Data files: http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/WA/WATR20/Velocity/catalog.html"
 ]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:39:41.682896Z",
"start_time": "2020-10-26T03:39:40.104951Z"
}
},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import xarray as xr\n",
"import pandas as pd\n",
"import numpy as np\n",
"from urllib import request\n",
"import os, shutil\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-25T07:01:29.004548Z",
"start_time": "2020-10-25T07:01:29.001734Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:39:42.553735Z",
"start_time": "2020-10-26T03:39:41.685439Z"
}
},
"outputs": [],
"source": [
"from torchvision.datasets.utils import download_url, extract_archive, download_and_extract_archive"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:39:42.560919Z",
"start_time": "2020-10-26T03:39:42.556898Z"
}
},
"outputs": [],
"source": []
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import uptide\n",
  "\n",
  "# https://en.wikipedia.org/wiki/Theory_of_tides#Harmonic_analysis\n",
  "default_tidal_constituents = [\n",
  "    'M2', 'S2', 'N2', 'K2', # Semi-diurnal\n",
  "    'K1', 'O1', 'P1', 'Q1', # Diurnal\n",
  "    'M4', 'M6', 'S4', 'MK3', # Short period\n",
  "    'MM', 'SSA', 'SA' # Long period\n",
  "    ]\n",
  "\n",
  "def generate_tidal_periods(t:pd.Series, constituents:list=default_tidal_constituents):\n",
  "    \"\"\"Evaluate unit-amplitude, zero-phase tidal harmonics at the given times.\n",
  "\n",
  "    Args:\n",
  "        t: Series of timestamps. Its first element (by position) is used as the\n",
  "            reference time handed to `uptide`.\n",
  "        constituents: constituent names passed to `uptide.Tides`.\n",
  "\n",
  "    Returns:\n",
  "        DataFrame indexed by the values of `t`, with one column per constituent.\n",
  "    \"\"\"\n",
  "    tide = uptide.Tides(constituents)\n",
  "    # Positional access: `t` is usually datetime-indexed, so `t[0]` would rely on\n",
  "    # pandas' deprecated integer-key fallback.\n",
  "    t0 = t.iloc[0]\n",
  "    td = (t - t0).dt.total_seconds().to_numpy().astype(int)\n",
  "    tide.set_initial_time(t0)\n",
  "\n",
  "    # calc tides\n",
  "    # Scalars rather than arrays sized by len(t): zipping per-timestamp arrays with\n",
  "    # the constituents silently dropped constituents for short series.\n",
  "    amplitude = 1\n",
  "    phase = 0\n",
  "    eta = {}\n",
  "    for name, f, omega, phi, u in zip(tide.constituents, tide.f, tide.omega, tide.phi, tide.u):\n",
  "        eta[name] = f*amplitude*np.cos(omega*td-phase+phi+u)\n",
  "    return pd.DataFrame(eta, index=t)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 30,
 "metadata": {
  "ExecuteTime": {
   "end_time": "2020-10-26T03:59:16.671092Z",
   "start_time": "2020-10-26T03:59:16.655680Z"
  },
  "scrolled": true
 },
 "outputs": [],
 "source": [
  "# 'ANMN Two Rocks, WA, 204m mooring, Jul2009 - Dec2009. Preprocessed with DepthPP.'\n",
  "\n",
  "def get_current_timeseries(\n",
  "    cache_folder=Path(\"../data/raw/IMOS_ANMN/\"),\n",
  "    outfile=Path('../data/processed/currents/MOS_ANMN-WA_AETVZ_WATR20_FV01_WATR20-1909-Continental-194_currents.nc')\n",
  "    ):\n",
  "    \"\"\"Download the WATR20 velocity files, merge them and cache the result as netCDF.\n",
  "\n",
  "    Nothing is downloaded or recomputed when `outfile` already exists.\n",
  "\n",
  "    Args:\n",
  "        cache_folder: directory the raw .nc files are downloaded into.\n",
  "        outfile: path of the merged netCDF file to write.\n",
  "\n",
  "    Returns:\n",
  "        `outfile`.\n",
  "    \"\"\"\n",
  "    if not outfile.exists():\n",
  "\n",
  "        files = [\n",
  "            \"IMOS_ANMN-WA_AETVZ_20090715T080000Z_WATR20_FV01_WATR20-0907-Continental-194_END-20090716T181317Z_C-20191122T052830Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20100409T080000Z_WATR20_FV01_WATR20-1004-Continental-194_END-20100430T084500Z_C-20191122T053845Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20101222T080000Z_WATR20_FV01_WATR20-1012-Continental-194_END-20110518T051500Z_C-20200916T020035Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20110608T080000Z_WATR20_FV01_WATR20-1106-Continental-194_END-20111122T035000Z_C-20200916T025619Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20111221T060300Z_WATR20_FV01_WATR20-1112-Continental-194_END-20120704T050500Z_C-20200916T043212Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20120726T044000Z_WATR20_FV01_WATR20-1207-Continental-194_END-20130204T044000Z_C-20200916T032027Z.nc\",\n",
  "\n",
  "            \"IMOS_ANMN-WA_AETVZ_20130221T080000Z_WATR20_FV01_WATR20-1302-Continental-194_END-20131003T035000Z_C-20180529T020609Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20131111T080000Z_WATR20_FV01_WATR20-1311-Continental-194_END-20140519T035000Z_C-20200114T033335Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20140710T080000Z_WATR20_FV01_WATR20-1407-Continental-194_END-20150121T021500Z_C-20180529T055902Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20150213T080000Z_WATR20_FV01_WATR20-1502-Continental-194_END-20150424T134002Z_C-20200114T035347Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20150914T080000Z_WATR20_FV01_WATR20-1509-Continental-194_END-20160331T043000Z_C-20180601T013623Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20160427T080000Z_WATR20_FV01_WATR20-1604-Continental-194_END-20160531T021800Z_C-20180531T071709Z.nc\",\n",
  "            # \"IMOS_ANMN-WA_AETVZ_20170512T080000Z_WATR20_FV01_WATR20-1705-Continental-194_END-20170717T014558Z_C-20190805T004647Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20171204T080000Z_WATR20_FV01_WATR20-1712-Continental-194_END-20180618T030000Z_C-20180620T233149Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20180802T080000Z_WATR20_FV01_WATR20-1807-Continental-194_END-20190225T054500Z_C-20190227T001343Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20190307T080000Z_WATR20_FV01_WATR20-1903-Continental-194_END-20190911T003144Z_C-20200114T045053Z.nc\",\n",
  "            \"IMOS_ANMN-WA_AETVZ_20190926T080000Z_WATR20_FV01_WATR20-1909-Continental-194_END-20200326T030000Z_C-20200420T064334Z.nc\",\n",
  "        ]\n",
  "        base=\"http://thredds.aodn.org.au/thredds/fileServer/IMOS/ANMN/WA/WATR20/Velocity/\"\n",
  "\n",
  "        # Download files (plain loop: the calls are made only for their side effect)\n",
  "        for f in files:\n",
  "            download_url(base+f, cache_folder)\n",
  "\n",
  "        # load and merge\n",
  "        xds=[xr.open_dataset(cache_folder/f) for f in files]\n",
  "        try:\n",
  "            keep_vars=['VCUR', 'UCUR', 'WCUR', 'TEMP', 'PRES_REL', 'DEPTH', 'ROLL', 'PITCH']\n",
  "            # keep a single HEIGHT_ABOVE_SENSOR level (index 18) from every file\n",
  "            xds2= [x[keep_vars].isel(HEIGHT_ABOVE_SENSOR=18) for x in xds]\n",
  "            xd = xr.concat(xds2, dim='TIME')\n",
  "            xd = xd.where(xd.DEPTH>150) # remove outliers\n",
  "\n",
  "            xd['TIME'] = xd['TIME'].dt.round('10T')\n",
  "            xd = xd.dropna(dim='TIME', subset=['VCUR', 'UCUR', 'WCUR'])\n",
  "            # xd = xd.resample(TIME='30T').first()\n",
  "            # Add tides, these are features that can be forecast\n",
  "\n",
  "            # Generate tidal freqs\n",
  "            t = xd.TIME.to_series()\n",
  "            df_eta = generate_tidal_periods(t)\n",
  "\n",
  "            # Add tidal freqs\n",
  "            xd = xd.merge(df_eta)\n",
  "\n",
  "            # Cache to nc; create the target directory first so a fresh checkout works\n",
  "            outfile.parent.mkdir(parents=True, exist_ok=True)\n",
  "            xd.to_netcdf(outfile)\n",
  "        finally:\n",
  "            # release the file handles of the raw datasets\n",
  "            for x in xds:\n",
  "                x.close()\n",
  "        print(f'wrote \"{outfile}\" with size {outfile.stat().st_size*1e-6:2.2f} MB')\n",
  "    return outfile"
 ]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T04:04:08.230047Z",
"start_time": "2020-10-26T04:04:08.099310Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>VCUR</th>\n",
" <th>UCUR</th>\n",
" <th>WCUR</th>\n",
" <th>TEMP</th>\n",
" <th>PRES_REL</th>\n",
" <th>DEPTH</th>\n",
" <th>ROLL</th>\n",
" <th>PITCH</th>\n",
" <th>LATITUDE</th>\n",
" <th>LONGITUDE</th>\n",
" <th>...</th>\n",
" <th>O1</th>\n",
" <th>P1</th>\n",
" <th>Q1</th>\n",
" <th>M4</th>\n",
" <th>M6</th>\n",
" <th>S4</th>\n",
" <th>MK3</th>\n",
" <th>MM</th>\n",
" <th>SSA</th>\n",
" <th>SA</th>\n",
" </tr>\n",
" <tr>\n",
" <th>TIME</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2009-07-15 08:00:00</th>\n",
" <td>-0.396391</td>\n",
" <td>0.089687</td>\n",
" <td>-0.009671</td>\n",
" <td>18.549999</td>\n",
" <td>205.076004</td>\n",
" <td>203.550812</td>\n",
" <td>4.6</td>\n",
" <td>-3.4</td>\n",
" <td>-31.728650</td>\n",
" <td>115.037217</td>\n",
" <td>...</td>\n",
" <td>0.286288</td>\n",
" <td>0.116457</td>\n",
" <td>-1.014973</td>\n",
" <td>-0.146817</td>\n",
" <td>-0.801534</td>\n",
" <td>-0.500000</td>\n",
" <td>0.370082</td>\n",
" <td>0.132683</td>\n",
" <td>-0.686775</td>\n",
" <td>-0.395743</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-07-15 08:10:00</th>\n",
" <td>-0.407620</td>\n",
" <td>0.085398</td>\n",
" <td>-0.019875</td>\n",
" <td>18.650000</td>\n",
" <td>205.078003</td>\n",
" <td>203.552795</td>\n",
" <td>4.6</td>\n",
" <td>-2.4</td>\n",
" <td>-31.728650</td>\n",
" <td>115.037217</td>\n",
" <td>...</td>\n",
" <td>0.242810</td>\n",
" <td>0.159551</td>\n",
" <td>-1.031149</td>\n",
" <td>-0.304345</td>\n",
" <td>-0.900573</td>\n",
" <td>-0.642788</td>\n",
" <td>0.494417</td>\n",
" <td>0.134147</td>\n",
" <td>-0.686601</td>\n",
" <td>-0.395853</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-07-15 08:20:00</th>\n",
" <td>-0.365314</td>\n",
" <td>0.104038</td>\n",
" <td>0.000991</td>\n",
" <td>18.730000</td>\n",
" <td>205.076996</td>\n",
" <td>203.551788</td>\n",
" <td>4.8</td>\n",
" <td>-2.7</td>\n",
" <td>-31.728650</td>\n",
" <td>115.037217</td>\n",
" <td>...</td>\n",
" <td>0.198932</td>\n",
" <td>0.202343</td>\n",
" <td>-1.045759</td>\n",
" <td>-0.453239</td>\n",
" <td>-0.942304</td>\n",
" <td>-0.766044</td>\n",
" <td>0.610654</td>\n",
" <td>0.135610</td>\n",
" <td>-0.686427</td>\n",
" <td>-0.395963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-07-15 08:30:00</th>\n",
" <td>-0.406632</td>\n",
" <td>0.119376</td>\n",
" <td>-0.003729</td>\n",
" <td>18.799999</td>\n",
" <td>205.067001</td>\n",
" <td>203.541901</td>\n",
" <td>4.7</td>\n",
" <td>-2.4</td>\n",
" <td>-31.728650</td>\n",
" <td>115.037217</td>\n",
" <td>...</td>\n",
" <td>0.154727</td>\n",
" <td>0.244751</td>\n",
" <td>-1.058780</td>\n",
" <td>-0.589276</td>\n",
" <td>-0.924071</td>\n",
" <td>-0.866025</td>\n",
" <td>0.716890</td>\n",
" <td>0.137073</td>\n",
" <td>-0.686253</td>\n",
" <td>-0.396072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-07-15 08:40:00</th>\n",
" <td>-0.383744</td>\n",
" <td>0.090066</td>\n",
" <td>-0.008921</td>\n",
" <td>18.860001</td>\n",
" <td>205.065994</td>\n",
" <td>203.540894</td>\n",
" <td>4.9</td>\n",
" <td>-2.9</td>\n",
" <td>-31.728650</td>\n",
" <td>115.037217</td>\n",
" <td>...</td>\n",
" <td>0.110268</td>\n",
" <td>0.286697</td>\n",
" <td>-1.070194</td>\n",
" <td>-0.708598</td>\n",
" <td>-0.847034</td>\n",
" <td>-0.939693</td>\n",
" <td>0.811384</td>\n",
" <td>0.138535</td>\n",
" <td>-0.686080</td>\n",
" <td>-0.396182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-26 01:00:00</th>\n",
" <td>-0.436635</td>\n",
" <td>-0.784922</td>\n",
" <td>-0.012147</td>\n",
" <td>16.610001</td>\n",
" <td>197.384003</td>\n",
" <td>195.919662</td>\n",
" <td>-2.9</td>\n",
" <td>3.0</td>\n",
" <td>-31.728717</td>\n",
" <td>115.042133</td>\n",
" <td>...</td>\n",
" <td>-0.734741</td>\n",
" <td>0.190139</td>\n",
" <td>0.964792</td>\n",
" <td>0.882484</td>\n",
" <td>0.770444</td>\n",
" <td>0.505439</td>\n",
" <td>1.028587</td>\n",
" <td>-0.881951</td>\n",
" <td>0.990514</td>\n",
" <td>0.997626</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-26 01:30:00</th>\n",
" <td>-0.355067</td>\n",
" <td>-0.845100</td>\n",
" <td>-0.005201</td>\n",
" <td>16.629999</td>\n",
" <td>197.408005</td>\n",
" <td>195.943497</td>\n",
" <td>-2.7</td>\n",
" <td>3.0</td>\n",
" <td>-31.728717</td>\n",
" <td>115.042133</td>\n",
" <td>...</td>\n",
" <td>-0.629257</td>\n",
" <td>0.316317</td>\n",
" <td>0.895545</td>\n",
" <td>0.957914</td>\n",
" <td>0.933774</td>\n",
" <td>0.006292</td>\n",
" <td>0.851981</td>\n",
" <td>-0.880483</td>\n",
" <td>0.990416</td>\n",
" <td>0.997601</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-26 02:00:00</th>\n",
" <td>-0.568277</td>\n",
" <td>-0.816935</td>\n",
" <td>-0.024944</td>\n",
" <td>16.660000</td>\n",
" <td>197.412994</td>\n",
" <td>195.948425</td>\n",
" <td>-2.6</td>\n",
" <td>2.9</td>\n",
" <td>-31.728717</td>\n",
" <td>115.042133</td>\n",
" <td>...</td>\n",
" <td>-0.514470</td>\n",
" <td>0.437113</td>\n",
" <td>0.814067</td>\n",
" <td>0.793395</td>\n",
" <td>0.584762</td>\n",
" <td>-0.494541</td>\n",
" <td>0.551159</td>\n",
" <td>-0.878996</td>\n",
" <td>0.990316</td>\n",
" <td>0.997576</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-26 02:30:00</th>\n",
" <td>-0.306141</td>\n",
" <td>-0.773147</td>\n",
" <td>-0.028096</td>\n",
" <td>16.719999</td>\n",
" <td>197.419006</td>\n",
" <td>195.954407</td>\n",
" <td>-2.6</td>\n",
" <td>2.7</td>\n",
" <td>-31.728717</td>\n",
" <td>115.042133</td>\n",
" <td>...</td>\n",
" <td>-0.392074</td>\n",
" <td>0.550470</td>\n",
" <td>0.721473</td>\n",
" <td>0.430136</td>\n",
" <td>-0.085096</td>\n",
" <td>-0.862862</td>\n",
" <td>0.169980</td>\n",
" <td>-0.877489</td>\n",
" <td>0.990217</td>\n",
" <td>0.997551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-26 03:00:00</th>\n",
" <td>-0.218563</td>\n",
" <td>-0.757217</td>\n",
" <td>0.013233</td>\n",
" <td>16.790001</td>\n",
" <td>197.429001</td>\n",
" <td>195.964340</td>\n",
" <td>-2.8</td>\n",
" <td>2.8</td>\n",
" <td>-31.728717</td>\n",
" <td>115.042133</td>\n",
" <td>...</td>\n",
" <td>-0.263881</td>\n",
" <td>0.654460</td>\n",
" <td>0.619026</td>\n",
" <td>-0.040868</td>\n",
" <td>-0.708264</td>\n",
" <td>-0.999980</td>\n",
" <td>-0.235982</td>\n",
" <td>-0.875962</td>\n",
" <td>0.990116</td>\n",
" <td>0.997526</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>239075 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" VCUR UCUR WCUR TEMP PRES_REL \\\n",
"TIME \n",
"2009-07-15 08:00:00 -0.396391 0.089687 -0.009671 18.549999 205.076004 \n",
"2009-07-15 08:10:00 -0.407620 0.085398 -0.019875 18.650000 205.078003 \n",
"2009-07-15 08:20:00 -0.365314 0.104038 0.000991 18.730000 205.076996 \n",
"2009-07-15 08:30:00 -0.406632 0.119376 -0.003729 18.799999 205.067001 \n",
"2009-07-15 08:40:00 -0.383744 0.090066 -0.008921 18.860001 205.065994 \n",
"... ... ... ... ... ... \n",
"2020-03-26 01:00:00 -0.436635 -0.784922 -0.012147 16.610001 197.384003 \n",
"2020-03-26 01:30:00 -0.355067 -0.845100 -0.005201 16.629999 197.408005 \n",
"2020-03-26 02:00:00 -0.568277 -0.816935 -0.024944 16.660000 197.412994 \n",
"2020-03-26 02:30:00 -0.306141 -0.773147 -0.028096 16.719999 197.419006 \n",
"2020-03-26 03:00:00 -0.218563 -0.757217 0.013233 16.790001 197.429001 \n",
"\n",
" DEPTH ROLL PITCH LATITUDE LONGITUDE ... \\\n",
"TIME ... \n",
"2009-07-15 08:00:00 203.550812 4.6 -3.4 -31.728650 115.037217 ... \n",
"2009-07-15 08:10:00 203.552795 4.6 -2.4 -31.728650 115.037217 ... \n",
"2009-07-15 08:20:00 203.551788 4.8 -2.7 -31.728650 115.037217 ... \n",
"2009-07-15 08:30:00 203.541901 4.7 -2.4 -31.728650 115.037217 ... \n",
"2009-07-15 08:40:00 203.540894 4.9 -2.9 -31.728650 115.037217 ... \n",
"... ... ... ... ... ... ... \n",
"2020-03-26 01:00:00 195.919662 -2.9 3.0 -31.728717 115.042133 ... \n",
"2020-03-26 01:30:00 195.943497 -2.7 3.0 -31.728717 115.042133 ... \n",
"2020-03-26 02:00:00 195.948425 -2.6 2.9 -31.728717 115.042133 ... \n",
"2020-03-26 02:30:00 195.954407 -2.6 2.7 -31.728717 115.042133 ... \n",
"2020-03-26 03:00:00 195.964340 -2.8 2.8 -31.728717 115.042133 ... \n",
"\n",
" O1 P1 Q1 M4 M6 \\\n",
"TIME \n",
"2009-07-15 08:00:00 0.286288 0.116457 -1.014973 -0.146817 -0.801534 \n",
"2009-07-15 08:10:00 0.242810 0.159551 -1.031149 -0.304345 -0.900573 \n",
"2009-07-15 08:20:00 0.198932 0.202343 -1.045759 -0.453239 -0.942304 \n",
"2009-07-15 08:30:00 0.154727 0.244751 -1.058780 -0.589276 -0.924071 \n",
"2009-07-15 08:40:00 0.110268 0.286697 -1.070194 -0.708598 -0.847034 \n",
"... ... ... ... ... ... \n",
"2020-03-26 01:00:00 -0.734741 0.190139 0.964792 0.882484 0.770444 \n",
"2020-03-26 01:30:00 -0.629257 0.316317 0.895545 0.957914 0.933774 \n",
"2020-03-26 02:00:00 -0.514470 0.437113 0.814067 0.793395 0.584762 \n",
"2020-03-26 02:30:00 -0.392074 0.550470 0.721473 0.430136 -0.085096 \n",
"2020-03-26 03:00:00 -0.263881 0.654460 0.619026 -0.040868 -0.708264 \n",
"\n",
" S4 MK3 MM SSA SA \n",
"TIME \n",
"2009-07-15 08:00:00 -0.500000 0.370082 0.132683 -0.686775 -0.395743 \n",
"2009-07-15 08:10:00 -0.642788 0.494417 0.134147 -0.686601 -0.395853 \n",
"2009-07-15 08:20:00 -0.766044 0.610654 0.135610 -0.686427 -0.395963 \n",
"2009-07-15 08:30:00 -0.866025 0.716890 0.137073 -0.686253 -0.396072 \n",
"2009-07-15 08:40:00 -0.939693 0.811384 0.138535 -0.686080 -0.396182 \n",
"... ... ... ... ... ... \n",
"2020-03-26 01:00:00 0.505439 1.028587 -0.881951 0.990514 0.997626 \n",
"2020-03-26 01:30:00 0.006292 0.851981 -0.880483 0.990416 0.997601 \n",
"2020-03-26 02:00:00 -0.494541 0.551159 -0.878996 0.990316 0.997576 \n",
"2020-03-26 02:30:00 -0.862862 0.169980 -0.877489 0.990217 0.997551 \n",
"2020-03-26 03:00:00 -0.999980 -0.235982 -0.875962 0.990116 0.997526 \n",
"\n",
"[239075 rows x 25 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `xd` is local to get_current_timeseries(); load the cached file so this cell runs on a fresh kernel\n",
"currents = xr.open_dataset(get_current_timeseries())\n",
"currents.to_dataframe().drop(columns=['HEIGHT_ABOVE_SENSOR', 'NOMINAL_DEPTH'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:44:41.020269Z",
"start_time": "2020-10-26T03:44:41.017322Z"
}
},
"outputs": [],
"source": [
"# for x in xds:\n",
"# x.DEPTH.plot()\n",
"# plt.ylim(190, 210)\n",
"\n",
"# plt.show()\n",
"# for x in xds:\n",
"# x.plot.scatter('LONGITUDE', 'LONGITUDE')\n",
"# plt.show()\n",
"\n",
"# xd['VCUR'].plot(alpha=0.5)\n",
"# xd['UCUR'].plot(alpha=0.5)\n",
"# xd['WCUR'].plot(alpha=0.5)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:51:16.821117Z",
"start_time": "2020-10-26T03:51:16.606212Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:51:17.614829Z",
"start_time": "2020-10-26T03:51:17.204376Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('../data/processed/currents/MOS_ANMN-WA_AETVZ_WATR20_FV01_WATR20-1909-Continental-194_currents.nc')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-26T03:51:18.335001Z",
"start_time": "2020-10-26T03:51:18.328504Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"43.107293"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "seq2seq-time",
"language": "python",
"name": "seq2seq-time"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "219.011px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 2
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+190
View File
@@ -0,0 +1,190 @@
absl-py @ file:///home/conda/feedstock_root/build_artifacts/absl-py_1602289403781/work
aiohttp @ file:///tmp/build/80754af9/aiohttp_1602530305083/work
appdirs==1.4.4
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1602546578258/work
async-generator==1.10
async-timeout==3.0.1
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1599308529326/work
awscli @ file:///home/conda/feedstock_root/build_artifacts/awscli_1602890549104/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache==1.6.1
black @ file:///home/conda/feedstock_root/build_artifacts/black-recipe_1599478779128/work
bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1600454382015/work
blinker==1.4
bokeh @ file:///home/conda/feedstock_root/build_artifacts/bokeh_1602690186583/work
botocore @ file:///home/conda/feedstock_root/build_artifacts/botocore_1602884371056/work
bpe==1.0
brotlipy==0.7.0
cachetools @ file:///home/conda/feedstock_root/build_artifacts/cachetools_1593420445823/work
certifi==2020.6.20
cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1602537219008/work
cftime @ file:///home/conda/feedstock_root/build_artifacts/cftime_1602504440833/work
chardet @ file:///home/conda/feedstock_root/build_artifacts/chardet_1602255309768/work
click==7.1.2
cloudpickle @ file:///home/conda/feedstock_root/build_artifacts/cloudpickle_1598400192773/work
colorama==0.4.3
colorcet==2.0.2
confuse @ file:///home/conda/feedstock_root/build_artifacts/confuse_1593279073800/work
cryptography @ file:///home/conda/feedstock_root/build_artifacts/cryptography_1602614063317/work
cycler==0.10.0
cytoolz==0.11.0
dask @ file:///home/conda/feedstock_root/build_artifacts/dask-core_1602029610262/work
datashader @ file:///home/conda/feedstock_root/build_artifacts/datashader_1597664023361/work
datashape==0.5.4
decorator==4.4.2
defusedxml==0.6.0
distributed @ file:///home/conda/feedstock_root/build_artifacts/distributed_1602493186453/work
docutils==0.15.2
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1602701733603/work/dist/entrypoints-0.3-py2.py3-none-any.whl
fastparquet @ file:///home/conda/feedstock_root/build_artifacts/fastparquet_1594909864671/work
fsspec @ file:///home/conda/feedstock_root/build_artifacts/fsspec_1602700749102/work
future @ file:///home/conda/feedstock_root/build_artifacts/future_1602538316704/work
google-auth @ file:///tmp/build/80754af9/google-auth_1601995530934/work
google-auth-oauthlib==0.4.1
grpcio @ file:///home/conda/feedstock_root/build_artifacts/grpcio_1596715635580/work
HeapDict==1.0.1
holoviews @ file:///home/conda/feedstock_root/build_artifacts/holoviews_1600439907620/work
htmlmin==0.1.12
hypothesis==4.32.3
idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1593328102638/work
ImageHash @ file:///home/conda/feedstock_root/build_artifacts/imagehash_1588182723834/work
importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1600910428305/work
iniconfig @ file:///tmp/build/80754af9/iniconfig_1602780191262/work
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1602682802500/work/dist/ipykernel-5.3.4-py3-none-any.whl
ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1602640393953/work
ipython-genutils==0.2.0
ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1599554010055/work
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1602395235501/work
Jinja2==2.11.2
jmespath @ file:///home/conda/feedstock_root/build_artifacts/jmespath_1589369830981/work
joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1601671685479/work
jsonschema==3.2.0
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1598486169312/work
jupyter-core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1602537277085/work
jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work
kiwisolver @ file:///home/conda/feedstock_root/build_artifacts/kiwisolver_1602517221725/work
llvmlite==0.34.0
locket==0.2.0
Markdown @ file:///home/conda/feedstock_root/build_artifacts/markdown_1602544730470/work
MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1602267316845/work
matplotlib @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-suite_1602600750896/work
mccabe==0.6.1
missingno==0.4.2
mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1602381812692/work
more-itertools @ file:///home/conda/feedstock_root/build_artifacts/more-itertools_1598643641143/work
msgpack @ file:///home/conda/feedstock_root/build_artifacts/msgpack-python_1602380760823/work
multidict @ file:///tmp/build/80754af9/multidict_1600456400975/work
multipledispatch==0.6.0
mypy @ file:///home/conda/feedstock_root/build_artifacts/mypy_1602270162469/work
mypy-extensions==0.4.3
nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1602859080374/work
nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert_1602715396354/work
nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1602732862338/work
nc-time-axis==1.2.0
nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1601342677072/work
netCDF4 @ file:///home/conda/feedstock_root/build_artifacts/netcdf4_1602508544050/work
networkx @ file:///home/conda/feedstock_root/build_artifacts/networkx_1598210780226/work
notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1602720128568/work
numba @ file:///home/conda/feedstock_root/build_artifacts/numba_1599084798687/work
numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1602429044575/work
oauthlib==3.1.0
olefile @ file:///home/conda/feedstock_root/build_artifacts/olefile_1602866521163/work
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1589925210001/work
pandas @ file:///home/conda/feedstock_root/build_artifacts/pandas_1602502751364/work
pandas-profiling @ file:///home/conda/feedstock_root/build_artifacts/pandas-profiling_1599137999474/work
pandocfilters==1.4.2
panel @ file:///home/conda/feedstock_root/build_artifacts/panel_1592920888719/work
param==1.9.3
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1595548966091/work
partd==1.1.0
pathspec==0.8.0
patsy==0.5.1
pexpect==4.8.0
phik @ file:///home/conda/feedstock_root/build_artifacts/phik_1590331950347/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602535628301/work
Pillow @ file:///home/conda/feedstock_root/build_artifacts/pillow_1602708615436/work
pluggy @ file:///home/conda/feedstock_root/build_artifacts/pluggy_1602337415071/work
prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1590412252446/work
prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1602524994744/work
protobuf==3.13.0
psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1602264040045/work
ptyprocess==0.6.0
py @ file:///home/conda/feedstock_root/build_artifacts/py_1593088446458/work
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycodestyle @ file:///home/conda/feedstock_root/build_artifacts/pycodestyle_1589305246696/work
pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work
pyct @ file:///tmp/build/80754af9/pyct_1600458283986/work
pydocstyle @ file:///home/conda/feedstock_root/build_artifacts/pydocstyle_1598747747227/work
pyflakes==2.2.0
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1600347314331/work
PyJWT==1.7.1
pylama==7.7.1
pyOpenSSL==19.1.0
pyparsing==2.4.7
PyQt5==5.12.3
PyQt5-sip==4.19.18
PyQtChart==5.12
PyQtWebEngine==5.12.1
pyrsistent @ file:///home/conda/feedstock_root/build_artifacts/pyrsistent_1602259985647/work
PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1602326924965/work
pytest==6.1.1
python-dateutil==2.8.1
pytorch-fast-transformers==0.3.0
pytorch-lightning @ file:///home/conda/feedstock_root/build_artifacts/pytorch-lightning_1602786328955/work
pytorch-lightning-bolts==0.2.5
pytz==2020.1
pyviz-comms @ file:///home/conda/feedstock_root/build_artifacts/pyviz_comms_1594121601757/work
PyWavelets @ file:///home/conda/feedstock_root/build_artifacts/pywavelets_1602504439149/work
PyYAML==5.3.1
pyzmq==19.0.2
regex @ file:///home/conda/feedstock_root/build_artifacts/regex_1602771401882/work
requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1592425495151/work
requests-oauthlib @ file:///home/conda/feedstock_root/build_artifacts/requests-oauthlib_1595492159598/work
rsa @ file:///home/conda/feedstock_root/build_artifacts/rsa_1591990902901/work
s3transfer @ file:///home/conda/feedstock_root/build_artifacts/s3transfer_1602631002642/work
scikit-learn @ file:///home/conda/feedstock_root/build_artifacts/scikit-learn_1596546074663/work
scipy @ file:///home/conda/feedstock_root/build_artifacts/scipy_1602862657152/work
seaborn @ file:///home/conda/feedstock_root/build_artifacts/seaborn-base_1599592695803/work
Send2Trash==1.5.0
# Editable Git install with no remote (seq2seq-time==0.1.0)
-e /media/wassname/Storage5/projects2/3ST/seq2seq-time
six @ file:///home/conda/feedstock_root/build_artifacts/six_1590081179328/work
sklearn==0.0
sklearn-pandas==2.0.2
snowballstemmer==2.0.0
sortedcontainers @ file:///home/conda/feedstock_root/build_artifacts/sortedcontainers_1591999956871/work
statsmodels @ file:///home/conda/feedstock_root/build_artifacts/statsmodels_1602599914091/work
tangled-up-in-unicode @ file:///home/conda/feedstock_root/build_artifacts/tangled-up-in-unicode_1589363771888/work
tblib @ file:///tmp/build/80754af9/tblib_1597928476713/work
tensorboard @ file:///home/conda/feedstock_root/build_artifacts/tensorboard_1595378845776/work/tensorboard-2.3.0-py3-none-any.whl
tensorboard-plugin-wit @ file:///home/conda/feedstock_root/build_artifacts/tensorboard-plugin-wit_1592816951245/work/tensorboard_plugin_wit-1.6.0.post3-py3-none-any.whl
terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1602679584439/work
testpath==0.4.4
threadpoolctl @ file:///tmp/tmp79xdzxkt/threadpoolctl-2.1.0-py3-none-any.whl
thrift==0.11.0
toml @ file:///home/conda/feedstock_root/build_artifacts/toml_1589469402899/work
toolz @ file:///home/conda/feedstock_root/build_artifacts/toolz_1600973991856/work
torch==1.6.0
torchsummaryX==1.3.0
torchvision==0.7.0
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1602488893411/work
tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1602171507552/work
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1602771532708/work
typed-ast==1.4.1
typing-extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1602702424206/work
ucimlr @ git+https://github.com/isacarnekvist/ucimlr@329ed0586effeb2d57f179f3abb0da9862feed01
unlzw==0.1.1
uptide==1.0
urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1595434816409/work
visions @ file:///home/conda/feedstock_root/build_artifacts/visions_1597645571032/work
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work
webencodings==0.5.1
Werkzeug==1.0.1
widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1602769155190/work
xarray @ file:///home/conda/feedstock_root/build_artifacts/xarray_1600638299066/work
xlrd==1.2.0
yapf @ file:///home/conda/feedstock_root/build_artifacts/yapf_1595950469082/work
yarl @ file:///home/conda/feedstock_root/build_artifacts/yarl_1602671471836/work
zict==2.0.0
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1602852756910/work
+309
View File
@@ -0,0 +1,309 @@
from typing import List, Tuple
from torchvision.datasets.utils import download_url, extract_archive, download_and_extract_archive
import os
from tqdm.auto import tqdm
from pathlib import Path
from sklearn_pandas import DataFrameMapper
import xarray as xr
import pandas as pd
import numpy as np
from .dataset import Seq2SeqDataSet
from .util import normalize_encode_dataframe, timeseries_split
from .tidal import generate_tidal_periods
class RegressionForecastData:
    """
    Base class for a regression/forecasting dataset.

    On construction it downloads the raw dataframe, normalizes it, splits it
    into train/test and then validates the result. Subclasses must set
    `columns_target` and `columns_forecast` and implement `download`.
    """

    columns_forecast = None  # The input columns which can be included in future (e.g. week or weather forecast)
    columns_target = None  # Target columns

    def __init__(self, datasets_root):
        """
        Args:
        - datasets_root: folder where raw data is cached (subclasses join it
          with `/`, so a `pathlib.Path` is expected)
        """
        self.datasets_root = datasets_root

        # Process data
        self.df = self.download()
        self.df_norm, self.scaler = self.normalize(self.df)
        # Last element of the first mapper feature whose column is a target
        # (presumably the transformer used for the target - TODO confirm
        # against the installed sklearn_pandas version).
        self.output_scaler = next(filter(lambda r: r[0][0] in self.columns_target, self.scaler.features))[-1]
        self.df_train, self.df_test = self.split(self.df_norm)

        # Check processing
        self.check()

    def download(self) -> pd.DataFrame:
        """Implement this method to download data and return raw df"""
        raise NotImplementedError()

    def normalize(self, df) -> "Tuple[pd.DataFrame, DataFrameMapper]":
        """Return the normalized/encoded dataframe and the fitted mapper."""
        df_norm, scaler = normalize_encode_dataframe(df)
        return df_norm, scaler

    def split(self, df_norm: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split the normalized dataframe into (train, test)."""
        df_train, df_test = timeseries_split(df_norm)
        return df_train, df_test

    def check(self) -> None:
        """Check the resulting dataframe"""
        assert isinstance(self.df.index, pd.DatetimeIndex), 'index must be datetime'
        assert self.df.index.freq is not None, 'df must have freq'
        assert self.columns_forecast is not None
        assert self.columns_target is not None
        # Note: `~bool` is always truthy (-1 or -2), so a plain set test is used.
        assert set(self.columns_target).isdisjoint(self.columns_forecast), 'target columns should not be in forecast'
        assert set(self.columns_forecast).issubset(set(self.df.columns)), 'columns_forecast must be in df'
        assert set(self.columns_target).issubset(set(self.df.columns)), 'columns_target must be in df'

    def _get_columns_past(self) -> List[str]:
        """
        Columns that are only known in the past (blanked in the future window).

        Uses an explicit `columns_past` attribute when a subclass defines one,
        otherwise every column that is neither a target nor a forecast column.
        """
        explicit = getattr(self, 'columns_past', None)
        if explicit is not None:
            return list(explicit)
        known = set(self.columns_target) | set(self.columns_forecast)
        return [c for c in self.df_train.columns if c not in known]

    def to_datasets(self, window_past: int, window_future: int, valid: bool = False) -> "Tuple[Seq2SeqDataSet, Seq2SeqDataSet]":
        """
        Convert to torch datasets

        Args:
        - window_past: passed through to Seq2SeqDataSet
        - window_future: passed through to Seq2SeqDataSet
        - valid: currently unused, kept for interface compatibility

        Returns: (ds_train, ds_test)
        """
        columns_past = self._get_columns_past()
        ds_train = Seq2SeqDataSet(self.df_train, window_past=window_past, window_future=window_future, columns_target=self.columns_target, columns_past=columns_past)
        ds_test = Seq2SeqDataSet(self.df_test, window_past=window_past, window_future=window_future, columns_target=self.columns_target, columns_past=columns_past)
        return ds_train, ds_test

    def __repr__(self):
        # `df` may be missing if `download` failed part way through __init__
        df = getattr(self, 'df', None)
        return f'<{type(self).__name__} {df.shape if (df is not None) else None}>'
class GasSensor(RegressionForecastData):
    """
    Gas sensor array temperature modulation dataset, with 'R1 (MOhm)' as target.

    See: http://archive.ics.uci.edu/ml/datasets/Gas+sensor+array+temperature+modulation
    """

    columns_target = ['R1 (MOhm)']
    columns_forecast = ['Flow rate (mL/min)', 'Heater voltage (V)']

    def download(self):
        """
        Download/extract the archive if needed and return one dataframe built
        from every csv file, resampled to a regular 0.3 s grid.
        """
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00487/gas-sensor-array-temperature-modulation.zip'

        # download if needed
        # (13 is presumably the number of csv files in the archive - TODO confirm)
        extract_path = self.datasets_root/'GasSensor'
        files = sorted(extract_path.glob('*.csv'))
        if len(files) < 13:
            print('download_and_extract_archive')
            download_and_extract_archive(url, self.datasets_root, extract_path)

        # Load csv's
        files = sorted(extract_path.glob('*.csv'))
        dfs = []
        for f in files:
            # The file stem is parsed as the start timestamp of the recording
            now = pd.to_datetime(f.stem, format='%Y%m%d_%H%M%S')
            df_file = pd.read_csv(f)
            df_file.index = pd.to_timedelta(df_file['Time (s)'], unit='s') + now
            dfs.append(df_file)

        # Use the concatenation of *all* files (not just the last one loaded)
        df = pd.concat(dfs).dropna(subset=self.columns_target)
        df = df[['CO (ppm)', 'Humidity (%r.h.)', 'Temperature (C)',
                 'Flow rate (mL/min)', 'Heater voltage (V)', 'R1 (MOhm)']]
        df = df.resample('0.3S').first()
        return df
class MetroInterstateTraffic(RegressionForecastData):
    """
    Hourly Metro Interstate traffic volume, with 'traffic_volume' as target.

    See: https://archive.ics.uci.edu/ml/datasets/Metro+Interstate+Traffic+Volume
    """

    columns_target = ['traffic_volume']
    columns_forecast = ['holiday', 'month', 'day', 'week', 'hour',
                        'minute', 'dayofweek']

    def download(self):
        """Download the csv if needed and return an hourly dataframe with time features."""
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz'

        # download if needed
        filename = '00492_Metro_Interstate_Traffic_Volume.csv.gz'
        local_path = self.datasets_root/filename
        if not local_path.exists():
            download_url(url, self.datasets_root, filename)

        df = (pd.read_csv(local_path, index_col='date_time', parse_dates=['date_time'])
              .dropna(subset=self.columns_target)
              .resample('1H').first()
              )

        # Make holiday a bool.
        # Whether the literal string 'None' is parsed as NA depends on the
        # pandas version, so treat both NA and 'None' as "not a holiday";
        # otherwise every row would be flagged as a holiday.
        holiday = df['holiday']
        df['holiday'] = holiday.notna() & (holiday != 'None')
        df['weather_main'] = df['weather_main'].fillna('none')
        df['weather_description'] = df['weather_description'].fillna('none')

        # Add time features
        time = df.index.to_series()
        df["month"] = time.dt.month
        df['day'] = time.dt.day
        df['week'] = time.dt.isocalendar().week
        df['hour'] = time.dt.hour
        df['minute'] = time.dt.minute
        df['dayofweek'] = time.dt.dayofweek
        return df
class AppliancesEnergyPrediction(RegressionForecastData):
    """
    Appliances energy use on a 10 minute grid; the target is the log of 'Appliances'.

    See: https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction
    """

    columns_target = ['log_Appliances']
    columns_forecast = ['month', 'day', 'week', 'hour',
                        'minute', 'dayofweek']

    def download(self):
        """Fetch the csv when it is not cached yet and return the prepared dataframe."""
        filename = '00374_AppliancesEnergyPrediction.csv'
        local_path = self.datasets_root/filename
        if not local_path.exists():
            download_url(
                'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv',
                self.datasets_root,
                filename,
            )

        raw = pd.read_csv(local_path, index_col='date', parse_dates=['date'])

        # Swap the raw target for its log (small offset keeps log finite at 0)
        raw['log_Appliances'] = np.log(raw['Appliances'] + 1e-5)
        frame = (
            raw.drop(columns=['Appliances'])
            .dropna(subset=self.columns_target)
            .resample('10T')
            .first()
        )

        # Calendar features derived from the index
        stamps = frame.index.to_series()
        calendar = {
            'month': stamps.dt.month,
            'day': stamps.dt.day,
            'week': stamps.dt.isocalendar().week,
            'hour': stamps.dt.hour,
            'minute': stamps.dt.minute,
            'dayofweek': stamps.dt.dayofweek,
        }
        for column, values in calendar.items():
            frame[column] = values
        return frame
class BejingPM25(RegressionForecastData):
    """
    Hourly Beijing PM2.5 readings; the target is the log of 'pm2.5'.

    See: http://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data
    """

    columns_target = ['log_pm2.5']
    columns_forecast = ['month', 'day', 'week', 'hour',
                        'minute', 'dayofweek']

    def download(self):
        """Fetch the csv when it is not cached yet and return the prepared dataframe."""
        filename = '00381_BejingPM25.csv'
        local_path = self.datasets_root/filename
        if not local_path.exists():
            download_url(
                'http://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv',
                self.datasets_root,
                filename,
            )

        raw = pd.read_csv(local_path)

        # Build a timezone-aware index from the separate date-part columns
        date_parts = raw[['year', 'month', 'day', 'hour']]
        raw.index = pd.to_datetime(date_parts).dt.tz_localize('Asia/Shanghai')
        raw = raw.drop(columns=['year', 'month', 'day', 'hour', 'No'])

        # Swap the raw target for its log (small offset keeps log finite at 0)
        raw['log_pm2.5'] = np.log(raw['pm2.5'] + 1e-5)
        frame = (
            raw.drop(columns=['pm2.5'])
            .dropna(subset=self.columns_target)
            .resample('1H')
            .first()
        )
        frame['cbwd'] = frame['cbwd'].fillna('none')

        # Calendar features derived from the index
        stamps = frame.index.to_series()
        calendar = {
            'month': stamps.dt.month,
            'day': stamps.dt.day,
            'week': stamps.dt.isocalendar().week,
            'hour': stamps.dt.hour,
            'minute': stamps.dt.minute,
            'dayofweek': stamps.dt.dayofweek,
        }
        for column, values in calendar.items():
            frame[column] = values
        return frame
def get_current_timeseries(
        cache_folder=Path("../data/raw/IMOS_ANMN/"),
        outfile=Path(
            '../data/processed/currents/MOS_ANMN-WA_AETVZ_WATR20_FV01_WATR20-1909-Continental-194_currents.nc'
        )):
    """
    Download Current data from the IMOS and pre-process.

    Nothing is done when `outfile` already exists. Otherwise the raw netcdf
    files are downloaded into `cache_folder`, merged along TIME, filtered,
    joined with generated tidal signals and written to `outfile`.

    Args:
    - cache_folder: folder for the raw downloaded files
    - outfile: path of the processed netcdf file

    Returns: the path of the processed file.
    """
    cache_folder = Path(cache_folder)
    outfile = Path(outfile)
    if not outfile.exists():
        files = [
            "IMOS_ANMN-WA_AETVZ_20090715T080000Z_WATR20_FV01_WATR20-0907-Continental-194_END-20090716T181317Z_C-20191122T052830Z.nc",
            "IMOS_ANMN-WA_AETVZ_20100409T080000Z_WATR20_FV01_WATR20-1004-Continental-194_END-20100430T084500Z_C-20191122T053845Z.nc",
            "IMOS_ANMN-WA_AETVZ_20101222T080000Z_WATR20_FV01_WATR20-1012-Continental-194_END-20110518T051500Z_C-20200916T020035Z.nc",
            "IMOS_ANMN-WA_AETVZ_20110608T080000Z_WATR20_FV01_WATR20-1106-Continental-194_END-20111122T035000Z_C-20200916T025619Z.nc",
            "IMOS_ANMN-WA_AETVZ_20111221T060300Z_WATR20_FV01_WATR20-1112-Continental-194_END-20120704T050500Z_C-20200916T043212Z.nc",
            "IMOS_ANMN-WA_AETVZ_20120726T044000Z_WATR20_FV01_WATR20-1207-Continental-194_END-20130204T044000Z_C-20200916T032027Z.nc",
            "IMOS_ANMN-WA_AETVZ_20130221T080000Z_WATR20_FV01_WATR20-1302-Continental-194_END-20131003T035000Z_C-20180529T020609Z.nc",
            "IMOS_ANMN-WA_AETVZ_20131111T080000Z_WATR20_FV01_WATR20-1311-Continental-194_END-20140519T035000Z_C-20200114T033335Z.nc",
            "IMOS_ANMN-WA_AETVZ_20140710T080000Z_WATR20_FV01_WATR20-1407-Continental-194_END-20150121T021500Z_C-20180529T055902Z.nc",
            "IMOS_ANMN-WA_AETVZ_20150213T080000Z_WATR20_FV01_WATR20-1502-Continental-194_END-20150424T134002Z_C-20200114T035347Z.nc",
            "IMOS_ANMN-WA_AETVZ_20150914T080000Z_WATR20_FV01_WATR20-1509-Continental-194_END-20160331T043000Z_C-20180601T013623Z.nc",
            "IMOS_ANMN-WA_AETVZ_20160427T080000Z_WATR20_FV01_WATR20-1604-Continental-194_END-20160531T021800Z_C-20180531T071709Z.nc",
            # Deliberately left out (reason not recorded here):
            # "IMOS_ANMN-WA_AETVZ_20170512T080000Z_WATR20_FV01_WATR20-1705-Continental-194_END-20170717T014558Z_C-20190805T004647Z.nc",
            "IMOS_ANMN-WA_AETVZ_20171204T080000Z_WATR20_FV01_WATR20-1712-Continental-194_END-20180618T030000Z_C-20180620T233149Z.nc",
            "IMOS_ANMN-WA_AETVZ_20180802T080000Z_WATR20_FV01_WATR20-1807-Continental-194_END-20190225T054500Z_C-20190227T001343Z.nc",
            "IMOS_ANMN-WA_AETVZ_20190307T080000Z_WATR20_FV01_WATR20-1903-Continental-194_END-20190911T003144Z_C-20200114T045053Z.nc",
            "IMOS_ANMN-WA_AETVZ_20190926T080000Z_WATR20_FV01_WATR20-1909-Continental-194_END-20200326T030000Z_C-20200420T064334Z.nc",
        ]
        base = "http://thredds.aodn.org.au/thredds/fileServer/IMOS/ANMN/WA/WATR20/Velocity/"

        # Download files
        for f in files:
            download_url(base + f, cache_folder)

        variables = [
            'VCUR', 'UCUR', 'WCUR', 'TEMP', 'PRES_REL', 'DEPTH', 'ROLL',
            'PITCH'
        ]
        xds = []
        try:
            # load and merge
            for f in files:
                xds.append(xr.open_dataset(cache_folder / f))
            # Keep a single bin (index 18) along HEIGHT_ABOVE_SENSOR
            xds2 = [x[variables].isel(HEIGHT_ABOVE_SENSOR=18) for x in xds]
            xd = xr.concat(xds2, dim='TIME')
            xd = xd.where(xd.DEPTH > 150)  # remove outliers
            xd['TIME'] = xd['TIME'].dt.round('10T')
            xd = xd.dropna(dim='TIME', subset=['VCUR', 'UCUR', 'WCUR'])

            # Generate tidal freqs
            t = xd.TIME.to_series()
            df_eta = generate_tidal_periods(t)

            # Add tidal freqs
            xd = xd.merge(df_eta)

            # Cache to nc (make sure the target folder exists first)
            outfile.parent.mkdir(parents=True, exist_ok=True)
            xd.to_netcdf(outfile)
        finally:
            # Release the file handles of the raw datasets
            for x in xds:
                x.close()
        print(
            f'wrote "{outfile}" with size {outfile.stat().st_size*1e-6:2.2f} MB'
        )
    return outfile
class IMOSCurrentsVel(RegressionForecastData):
    """
    Current Speed at ANMN Two Rocks, WA, 204m mooring

    see:
    - http://thredds.aodn.org.au/thredds/fileServer/IMOS/ANMN/WA/WATR20/Velocity/
    from https://catalogue-imos.aodn.org.au/geonetwork/srv/api/records/ae86e2f5-eaaf-459e-a405-e654d85adb9c
    and http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/WA/WATR20/Velocity/catalog.html

    And https://en.wikipedia.org/wiki/Theory_of_tides
    """

    columns_target = ['SPD']
    columns_forecast = [
        'M2', 'S2', 'N2', 'K2', 'K1', 'O1', 'P1', 'Q1', 'M4', 'M6', 'S4',
        'MK3', 'MM', 'SSA', 'SA'
    ]

    def download(self):
        """Build (or reuse) the processed netcdf file and return it as a 30 minute dataframe."""
        nc_path = self.datasets_root / 'MOS_ANMN-WA_AETVZ_WATR20_FV01_WATR20-1909-Continental-194_currents.nc'
        # Writes the file only when it does not exist yet
        get_current_timeseries(outfile=nc_path)

        currents = xr.load_dataset(nc_path)
        frame = currents.to_dataframe()
        frame = frame.drop(columns=['HEIGHT_ABOVE_SENSOR', 'NOMINAL_DEPTH'])

        # Speed is the magnitude of the (VCUR, UCUR) vector
        frame['SPD'] = np.sqrt(frame.VCUR**2 + frame.UCUR**2)
        return (
            frame.dropna(subset=self.columns_target)
            .resample('30T')
            .first()
        )
+5 -3
View File
@@ -20,11 +20,11 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
Returns x_past, y_past, x_future, etc.
"""
def __init__(self, df: pd.DataFrame, window_past=40, window_future=10, columns_target=['energy(kWh/hh)'], columns_blank=[],):
def __init__(self, df: pd.DataFrame, window_past=40, window_future=10, columns_target=['energy(kWh/hh)'], columns_past=[],):
"""
Args:
- df: DataFrame with time index, already scaled
- columns_blank: The columns we will blank, in the future
- columns_past: The columns we will blank, in the future
"""
super().__init__()
assert isinstance(df.index, pd.DatetimeIndex), 'should have a datetime index'
@@ -38,7 +38,7 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
self.columns_target = columns_target
# For speed
self._icol_blank = [df.drop(columns = columns_target).columns.tolist().index(n) for n in columns_blank]
self._icol_blank = [df.drop(columns = columns_target).columns.tolist().index(n) for n in columns_past]
self._x = self.df.drop(columns = self.columns_target).values
self._y = self.df[columns_target].values
@@ -64,6 +64,8 @@ class Seq2SeqDataSet(torch.utils.data.Dataset):
# Stop it cheating by using future weather measurements. Fill in with last value
x_future[:, self._icol_blank] = x_past[0, self._icol_blank]
# x_future[:, self._icol_blank] = 0
return x_past, y_past, x_future, y_future
-30
View File
@@ -1,30 +0,0 @@
# -*- coding: utf-8 -*-
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    # Stub: only announces the step, neither path argument is used yet.
    logging.getLogger(__name__).info('making final data set from raw data')
if __name__ == '__main__':
    # Timestamp - logger name - level - message, at INFO and above
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )

    # Unused by this stub; kept because it is handy for locating project files
    project_dir = Path(__file__).resolve().parents[2]

    # Locate a .env file by walking up the directory tree and export its
    # entries as environment variables
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)

    main()
+43
View File
@@ -0,0 +1,43 @@
import numpy as np
import pandas as pd
import uptide
# Default tidal constituent names, grouped by period.
# https://en.wikipedia.org/wiki/Theory_of_tides#Harmonic_analysis
default_tidal_constituents = [
    # Semi-diurnal
    'M2', 'S2', 'N2', 'K2',
    # Diurnal
    'K1', 'O1', 'P1', 'Q1',
    # Short period
    'M4', 'M6', 'S4', 'MK3',
    # Long period
    'MM', 'SSA', 'SA',
]
def generate_tidal_periods(t: pd.Series,
                           constituents: list = None):
    """
    Generate one unit-amplitude, zero-phase cosine signal per tidal constituent.

    Args:
    - t: series of timestamps; the first value is used as the initial time
    - constituents: constituent names passed to `uptide.Tides`; defaults to
      `default_tidal_constituents`

    Returns: DataFrame indexed by `t` with one column per constituent.
    """
    if constituents is None:
        constituents = default_tidal_constituents
    tide = uptide.Tides(constituents)

    # Positional access: `t` is typically indexed by timestamps, not integers
    t0 = t.iloc[0]
    td = t - t0
    # Whole seconds elapsed since the first timestamp
    td = td.dt.total_seconds().to_numpy().astype(int)
    tide.set_initial_time(t0)

    # calc tides
    # Amplitude and phase are scalars shared by every constituent. Zipping
    # per-timestep arrays here would silently drop constituents whenever
    # there are fewer timesteps than constituents.
    amplitude = 1
    phase = 0
    eta = {}
    for name, f, omega, phi, u in zip(
            tide.constituents, tide.f, tide.omega, tide.phi, tide.u):
        eta[name] = f * amplitude * np.cos(omega * td - phase + phi + u)
    df_eta = pd.DataFrame(eta, index=t)
    return df_eta
+19
View File
@@ -0,0 +1,19 @@
import sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
def normalize_encode_dataframe(df, encoder=OrdinalEncoder):
    """
    Normalise numeric data, encode categorical data.

    Args:
    - df: input dataframe
    - encoder: class instantiated once per non-numeric column

    Returns: (df_norm, scaler) where `scaler` is the fitted DataFrameMapper.
    Output columns are the numeric columns followed by the remaining
    columns, each group in the order it appears in `df`.
    """
    columns_input_numeric = list(df._get_numeric_data().columns)
    # Keep dataframe order; a set difference would make the column order
    # (and so the feature layout) vary from run to run.
    numeric = set(columns_input_numeric)
    columns_categorical = [c for c in df.columns if c not in numeric]

    transformers = [([n], StandardScaler()) for n in columns_input_numeric] + \
                   [([n], encoder()) for n in columns_categorical]
    scaler = DataFrameMapper(transformers, df_out=True)
    df_norm = scaler.fit_transform(df)
    return df_norm, scaler
def timeseries_split(df, test_fraction=0.2):
    """
    Split timeseries data with test in the future

    Args:
    - df: dataframe ordered in time
    - test_fraction: fraction of rows (from the end) used for the test set

    Returns: (df_train, df_test), where df_test holds the last
    `int(len(df) * test_fraction)` rows and df_train everything before them.

    Raises: ValueError if `test_fraction` is outside [0, 1].
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError('test_fraction must be between 0 and 1')
    n_test = int(len(df) * test_fraction)
    # Split point counted from the end so the test rows are the latest ones
    i = len(df) - n_test
    return df.iloc[:i], df.iloc[i:]