Files
2020-11-01 08:22:16 +08:00

30 lines
1.1 KiB
Python

import sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
def normalize_encode_dataframe(df, encoder=OrdinalEncoder):
    """Normalise numeric data, encode categorical data.

    Every column reported as numeric by pandas gets its own
    ``StandardScaler``; every remaining column gets its own instance of
    ``encoder``. The transformers are combined in a ``DataFrameMapper``
    created with ``df_out=True``.

    Args:
        df: Input DataFrame.
        encoder: Zero-argument callable (typically an encoder class) that is
            called once per non-numeric column to build that column's
            transformer.

    Returns:
        A ``(df_norm, scaler)`` tuple: the result of
        ``DataFrameMapper.fit_transform(df)`` and the mapper itself, so the
        same transformation can be re-applied later.
    """
    # NOTE(review): _get_numeric_data() is a private pandas API; kept so the
    # numeric/non-numeric classification stays exactly as before.
    numeric_columns = list(df._get_numeric_data().columns)
    numeric_lookup = set(numeric_columns)

    # Walk the columns in DataFrame order (de-duplicated) rather than via a
    # set difference: set iteration order is not stable between interpreter
    # runs, which made the output column order non-deterministic.
    categorical_columns = list(
        dict.fromkeys(c for c in df.columns if c not in numeric_lookup)
    )

    transformers = [([name], StandardScaler()) for name in numeric_columns]
    transformers += [([name], encoder()) for name in categorical_columns]

    scaler = DataFrameMapper(transformers, df_out=True)
    df_norm = scaler.fit_transform(df)
    return df_norm, scaler
def timeseries_split(df, test_fraction=0.2, dropna=None):
    """Split timeseries data with the test portion in the future.

    The split point is chosen by position within a reference index (the
    DataFrame's index, optionally restricted to rows without NaNs), and the
    full DataFrame is then divided by comparing its index to that label.
    Rows strictly before the split label form the training set; all other
    rows form the test set, so the two parts never overlap.

    Assumes ``df.index`` is sorted in ascending order.

    Args:
        df: DataFrame indexed by time (or any orderable label).
        test_fraction: Fraction, between 0 and 1 inclusive, of the reference
            index to place in the test portion.
        dropna: Controls which rows count when locating the split point.
            A list of column names ignores rows with NaN in those columns,
            ``True`` ignores rows with NaN in any column, anything else uses
            every row. Rows are only ignored for locating the split; they
            are still present in the returned frames.

    Returns:
        A ``(train, test)`` tuple of DataFrames.

    Raises:
        ValueError: If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError(
            f"test_fraction must be between 0 and 1, got {test_fraction!r}"
        )

    # If there are lots of NaNs we can ignore them when sizing the portions.
    if isinstance(dropna, list):
        index = df.dropna(subset=dropna).index
    elif dropna is True:
        index = df.dropna().index
    else:
        index = df.index

    i = int(len(index) * (1 - test_fraction))
    if i >= len(index):
        # Nothing left for the test portion (test_fraction == 0 or no usable
        # rows): everything is training data and the test frame is empty.
        return df.iloc[:], df.iloc[:0]

    # Use index[i] rather than index.values[i] so timezone-aware labels keep
    # their timezone and remain comparable with the index.
    dt = index[i]

    # Label slicing with .loc is inclusive on both ends, which would put the
    # boundary row in both halves; a strict comparison keeps them disjoint.
    is_train = df.index < dt
    return df.loc[is_train], df.loc[~is_train]