initial gluonts dependency

This commit is contained in:
Dr. Kashif Rasul
2020-12-17 17:04:56 +01:00
parent ecc31f6082
commit b072ab227b
88 changed files with 498 additions and 11571 deletions
+46 -152
View File
File diff suppressed because one or more lines are too long
+1 -2
View File
@@ -2,7 +2,6 @@ from pkgutil import extend_path
from pkg_resources import get_distribution, DistributionNotFound
from .exception import assert_pts
from .trainer import Trainer
__path__ = extend_path(__path__, __name__) # type: ignore
@@ -10,4 +9,4 @@ __path__ = extend_path(__path__, __name__) # type: ignore
# Expose the version of the installed distribution named after this package;
# fall back to a placeholder when no distribution metadata is found.
try:
    __version__ = get_distribution(__name__).version
except DistributionNotFound:
    __version__ = "0.0.0-unknown"
    __version__ = "0.0.0-unknown"
-9
View File
@@ -1,9 +0,0 @@
# Relative imports
from ._base import fqname_for
# Names re-exported by this package.
__all__ = ["fqname_for"]
# fix Sphinx issues, see https://bit.ly/2K2eptM
# NOTE(review): `item` is a name *string* taken from `__all__`, not the
# exported object, so `hasattr`/`setattr` are evaluated against the string
# itself. Confirm whether the intent was to patch `__module__` on the object
# each name refers to.
for item in __all__:
    if hasattr(item, "__module__"):
        setattr(item, "__module__", __name__)
-29
View File
@@ -1,29 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
def fqname_for(cls: type) -> str:
    """
    Build the dotted, fully qualified name of a class.

    Parameters
    ----------
    cls
        The class whose name is requested.

    Returns
    -------
    str
        ``cls.__module__`` and ``cls.__qualname__`` joined by a dot.
    """
    module_name = cls.__module__
    qualified_name = cls.__qualname__
    return "{}.{}".format(module_name, qualified_name)
-171
View File
@@ -1,171 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import functools
import inspect
from collections import OrderedDict
from typing import Any
import torch
from pydantic import BaseConfig, BaseModel, create_model
from pts.core.serde import dump_code
class BaseValidatedInitializerModel(BaseModel):
    """
    Base Pydantic model for components with :func:`validated` initializers.

    :func:`validated` passes the nested :class:`Config` of this class to
    ``create_model`` whenever no custom ``base_model`` is supplied.

    See Also
    --------
    validated
        Decorates an initializer method with argument validation logic.
    """

    class Config(BaseConfig):
        """
        `Config <https://pydantic-docs.helpmanual.io/#model-config>`_ for the
        Pydantic model inherited by all :func:`validated` initializers.

        Allows the use of arbitrary type annotations in initializer parameters.
        """

        # Accept parameter annotations that Pydantic has no built-in
        # validator for.
        arbitrary_types_allowed = True
def validated(base_model=None):
    """
    Build a decorator that adds validation and auto-conversion logic to an
    ``__init__`` method with typed parameters.

    >>> class ComplexNumber:
    ...     @validated()
    ...     def __init__(self, x: float = 0.0, y: float = 0.0) -> None:
    ...         self.x = x
    ...         self.y = y

    Classes with decorated initializers can be instantiated using arguments
    of another type (e.g. a ``y`` argument of type ``str``). The decorator
    takes care of the type conversion.

    >>> c = ComplexNumber(y='42')
    >>> (c.x, c.y)
    (0.0, 42.0)

    If a bound argument cannot be converted, the decorator throws an error.

    >>> c = ComplexNumber(y=None)
    Traceback (most recent call last):
        ...
    pydantic.error_wrappers.ValidationError: 1 validation error for ComplexNumberModel
    y
      none is not an allowed value (type=type_error.none.not_allowed)

    Internally, all validation and conversion is delegated to
    `a Pydantic model <https://pydantic-docs.helpmanual.io/>`_, which can be
    accessed through the ``Model`` attribute of the decorated initializer.

    >>> ComplexNumber.__init__.Model
    <class 'ComplexNumberModel'>

    The Pydantic model is synthesized automatically from the parameter names
    and types of the decorated initializer. In the ``ComplexNumber`` example,
    the synthesized model corresponds to the following definition.

    >>> class ComplexNumberModel(BaseValidatedInitializerModel):
    ...     x: float = 0.0
    ...     y: float = 0.0

    Besides validating, the wrapper records the merged arguments on the
    instance as ``__init_args__`` (unless that attribute is already set and
    non-empty) and installs ``__getnewargs_ex__`` and ``__repr__`` on the
    instance's class.

    Parameters
    ----------
    base_model
        Optional base class for the synthesized Pydantic model. When omitted,
        the model is created with the config of
        :class:`BaseValidatedInitializerModel`.

    See Also
    --------
    BaseValidatedInitializerModel
        Default base class for all synthesized Pydantic models.
    """

    def validator(init):
        qualname = dict(inspect.getmembers(init))["__qualname__"]
        owner_name = qualname.partition(".")[0]
        signature_params = inspect.signature(init).parameters

        # One Pydantic field per positional-or-keyword parameter except
        # `self`: missing annotations become `Any`, missing defaults become
        # `...` (i.e. the field is required).
        no_value = inspect.Parameter.empty
        fields = {}
        for param in signature_params.values():
            if param.name == "self":
                continue
            if param.kind != inspect.Parameter.POSITIONAL_OR_KEYWORD:
                continue
            annotation = param.annotation if param.annotation != no_value else Any
            default = param.default if param.default != no_value else ...
            fields[param.name] = (annotation, default)

        if base_model is None:
            model_options = {"__config__": BaseValidatedInitializerModel.Config}
        else:
            model_options = {"__base__": base_model}
        PydanticModel = create_model(
            f"{owner_name}Model", **model_options, **fields
        )

        def validated_repr(self) -> str:
            return dump_code(self)

        def validated_getnewargs_ex(self):
            return (), self.__init_args__

        @functools.wraps(init)
        def init_wrapper(*args, **kwargs):
            self, *positional = args

            # map positional arguments to their parameter names
            named_positional = {
                name: value
                for name, value in zip(signature_params, [self, *positional])
                if name != "self"
            }
            model = PydanticModel(**{**named_positional, **kwargs})

            # positional args, keyword args and validated model fields,
            # with later sources taking precedence
            merged = {**named_positional, **kwargs, **model.__dict__}

            # Only record the arguments when `__init_args__` is not set yet,
            # so that a value stored by a subclass initializer is not
            # overridden by a `super().__init__` call.
            if not getattr(self, "__init_args__", {}):
                stored = OrderedDict()
                for name, value in sorted(merged.items()):
                    if type(value) != torch.nn.ParameterDict:
                        stored[name] = value
                self.__init_args__ = stored
                self.__class__.__getnewargs_ex__ = validated_getnewargs_ex
                self.__class__.__repr__ = validated_repr

            return init(self, **merged)

        # expose the synthesized model on the wrapped initializer
        init_wrapper.Model = PydanticModel

        return init_wrapper

    return validator
-374
View File
@@ -1,374 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import itertools
import json
import math
import textwrap
from functools import singledispatch
from pydoc import locate
from typing import Any, Optional, cast, NamedTuple
import numpy as np
from pts.core import fqname_for
# Template of the error raised by `encode` for values it cannot encode; the
# `{}` placeholder is filled with the fully qualified name of the value's type.
bad_type_msg = textwrap.dedent(
"""
Cannot serialize type {}. See the documentation of the `encode` and
`validate` functions at
http://gluon-ts.mxnet.io/api/gluonts/gluonts.html
and the Python documentation of the `__getnewargs_ex__` magic method at
https://docs.python.org/3/library/pickle.html#object.__getnewargs_ex__
for more information how to make this type serializable.
"""
).lstrip()
def dump_code(o: Any) -> str:
    """
    Serializes an object to a Python code string.

    The object is first passed through :func:`encode`; the resulting
    intermediate representation is then rendered recursively.

    Parameters
    ----------
    o
        The object to serialize.

    Returns
    -------
    str
        A string representing the object as Python code.

    Raises
    ------
    RuntimeError
        If the object cannot be encoded, or if the encoded representation
        contains an element of an unexpected type.

    See Also
    --------
    load_code
        Inverse function.
    """

    def _dump_code(x: Any) -> str:
        # r = { 'class': ..., 'args': ... }
        # r = { 'class': ..., 'kwargs': ... }
        if type(x) == dict and x.get("__kind__") == kind_inst:
            args = x.get("args", [])
            kwargs = x.get("kwargs", {})
            fqname = x["class"]
            bindings = ", ".join(
                itertools.chain(
                    map(_dump_code, args),
                    [f"{k}={_dump_code(v)}" for k, v in kwargs.items()],
                )
            )
            return f"{fqname}({bindings})"
        if type(x) == dict and x.get("__kind__") == kind_type:
            return x["class"]
        if isinstance(x, dict):
            inner = ", ".join(
                f"{_dump_code(k)}: {_dump_code(v)}" for k, v in x.items()
            )
            return f"{{{inner}}}"
        # Elements of an already-encoded container are rendered directly with
        # `_dump_code`; routing them through `dump_code` would re-encode the
        # whole subtree at every nesting level.
        if isinstance(x, list):
            inner = ", ".join(map(_dump_code, x))
            return f"[{inner}]"
        if isinstance(x, tuple):
            inner = ", ".join(map(_dump_code, x))
            # account for the extra `,` in `(x,)`
            if len(x) == 1:
                inner += ","
            return f"({inner})"
        if isinstance(x, str):
            # json.dumps escapes the string
            return json.dumps(x)
        if isinstance(x, float) or np.issubdtype(type(x), np.inexact):
            if math.isfinite(x):
                return str(x)
            # e.g. `nan` needs to be encoded as `float("nan")`; the value
            # must be interpolated into the generated expression
            return f'float("{x}")'
        if isinstance(x, int) or np.issubdtype(type(x), np.integer):
            return str(x)
        if x is None:
            return str(x)
        raise RuntimeError(
            f"Unexpected element type {fqname_for(x.__class__)}"
        )

    return _dump_code(encode(o))
# JSON Serialization/Deserialization
# ----------------------------------
# The canonical way to do this is to define the `default` and `object_hook`
# parameters to the json.dumps and json.loads methods. Unfortunately, due
# to https://bugs.python.org/issue12657 this is not possible at the moment,
# as support for custom NamedTuple serialization is broken.
#
# To circumvent the issue, we pass the input value through custom encode
# and decode functions that map nested object terms to JSON-serializable
# data structures with explicit recursion.
def dump_json(o: Any, indent: Optional[int] = None) -> str:
    """
    Render an object as a JSON string.

    The object is converted with :func:`encode` and the result is dumped
    with sorted keys.

    Parameters
    ----------
    o
        The object to serialize.
    indent
        An optional number of spaces to use as an indent.

    Returns
    -------
    str
        A string representing the object in JSON format.

    See Also
    --------
    load_json
        Inverse function.
    """
    intermediate = encode(o)
    return json.dumps(intermediate, sort_keys=True, indent=indent)
def load_json(s: str) -> Any:
    """
    Rebuild an object from a JSON string.

    The string is parsed with ``json.loads`` and the parsed structure is
    handed to :func:`decode`.

    Parameters
    ----------
    s
        A string representing the object in JSON format.

    Returns
    -------
    Any
        The deserialized object.

    See Also
    --------
    dump_json
        Inverse function.
    """
    intermediate = json.loads(s)
    return decode(intermediate)
# Structural encoding/decoding
# ----------------------------
# Values of the "__kind__" entry in encoded dictionaries: `kind_type` tags an
# encoded class, `kind_inst` tags an encoded instance of a class.
kind_type = "type"
kind_inst = "instance"
@singledispatch
def encode(v: Any) -> Any:
    """
    Convert a value into an intermediate representation that can be
    serialized either as Python code or as a JSON string.

    This function is decorated with :func:`~functools.singledispatch`, so
    clients can register specializations for types the fallback below does
    not handle. The fallback applies the first matching rule:

    * ``None``, ``float``, ``int`` and ``str`` values are returned as-is.
    * NumPy inexact / integer scalars become ``float`` / ``int``.
    * Named tuples become
      ``{'__kind__': 'instance', 'class': <fqname>, 'kwargs': {...}}``.
    * Lists, sets and plain tuples become lists of encoded elements.
    * Dictionaries keep their keys and get their values encoded.
    * Classes become ``{'__kind__': 'type', 'class': <fqname>}``.
    * Objects with a ``__getnewargs_ex__`` method (for instance those whose
      initializer is decorated with ``validated``) become
      ``{'__kind__': 'instance', 'class': <fqname>, 'args': [...],
      'kwargs': {...}}``.

    Examples
    --------
    >>> encode([1, 2.0, '3'])
    [1, 2.0, '3']
    >>> encode((1, 2.0, '3'))
    [1, 2.0, '3']
    >>> encode({'a': 1, 'b': 2.0, 'c': '3'})
    {'a': 1, 'b': 2.0, 'c': '3'}

    Parameters
    ----------
    v
        The value to be encoded.

    Returns
    -------
    Any
        An encoding of ``v`` that can be serialized to Python code or
        JSON string.

    Raises
    ------
    RuntimeError
        If none of the rules above applies to ``v``.

    See Also
    --------
    decode
        Inverse function.
    dump_json
        Serializes an object to a JSON string.
    dump_code
        Serializes an object to a Python code string.
    """
    if v is None:
        return None

    if isinstance(v, (float, int, str)):
        return v

    value_type = type(v)
    if np.issubdtype(value_type, np.inexact):
        return float(v)
    if np.issubdtype(value_type, np.integer):
        return int(v)

    # named tuples must be recognized before plain tuples, which would
    # otherwise be flattened into lists
    if isinstance(v, tuple) and hasattr(v, "_asdict"):
        named = cast(NamedTuple, v)
        return {
            "__kind__": kind_inst,
            "class": fqname_for(named.__class__),
            "kwargs": encode(named._asdict()),
        }

    if isinstance(v, (list, set, tuple)):
        return [encode(element) for element in v]

    if isinstance(v, dict):
        return {key: encode(value) for key, value in v.items()}

    if isinstance(v, type):
        return {"__kind__": kind_type, "class": fqname_for(v)}

    if hasattr(v, "__getnewargs_ex__"):
        new_args, new_kwargs = v.__getnewargs_ex__()  # mypy: ignore
        return {
            "__kind__": kind_inst,
            "class": fqname_for(v.__class__),
            "args": encode(new_args),
            "kwargs": encode(new_kwargs),
        }

    raise RuntimeError(bad_type_msg.format(fqname_for(v.__class__)))
def decode(r: Any) -> Any:
    """
    Rebuild a value from its intermediate representation ``r``.

    Dictionaries tagged as instances are turned back into objects by
    locating the named class and calling it with the decoded ``args`` and
    ``kwargs``; dictionaries tagged as types are replaced by the located
    class. Other dictionaries, tuples, lists and sets are decoded element
    by element, and any other value is returned unchanged.

    Parameters
    ----------
    r
        An intermediate representation to be decoded.

    Returns
    -------
    Any
        A Python data structure corresponding to the decoded version of ``r``.

    See Also
    --------
    encode
        Inverse function.
    """
    shape = type(r)

    if shape == dict:
        kind = r.get("__kind__")

        # r = { '__kind__': 'instance', 'class': ..., 'args': ..., 'kwargs': ... }
        if kind == kind_inst:
            # NOTE: this locates and calls whatever class the payload names,
            # so only representations from trusted sources should be decoded.
            cls = cast(Any, locate(r["class"]))
            args = decode(r["args"]) if "args" in r else []
            kwargs = decode(r["kwargs"]) if "kwargs" in r else {}
            return cls(*args, **kwargs)

        # r = { '__kind__': 'type', 'class': ... }
        if kind == kind_type:
            return locate(r["class"])

        # r = { k1: v1, ..., kn: vn }
        return {key: decode(value) for key, value in r.items()}

    # r = ( y1, ..., yn )
    if shape == tuple:
        return tuple(decode(item) for item in r)

    # r = [ y1, ..., yn ]
    if shape == list:
        return [decode(item) for item in r]

    # r = { y1, ..., yn }
    if shape == set:
        return {decode(item) for item in r}

    # r = a
    return r
-32
View File
@@ -1,32 +0,0 @@
from .artificial import (
ArtificialDataset,
ConstantDataset,
ComplexSeasonalTimeSeries,
RecipeDataset,
constant_dataset,
default_synthetic,
generate_sf2,
)
from .common import (
DataEntry,
FieldName,
Dataset,
MetaData,
TrainDatasets,
DateConstants,
)
from .file_dataset import FileDataset
from .list_dataset import ListDataset
from .loader import TrainDataLoader, InferenceDataLoader
from .multivariate_grouper import MultivariateGrouper
from .process import ProcessStartField, ProcessDataEntry
from .stat import DatasetStatistics, ScaleHistogram, calculate_dataset_statistics
from .transformed_iterable_dataset import TransformedIterableDataset
from .utils import (
to_pandas,
load_datasets,
save_datasets,
serialize_data_entry,
frequency_add,
forecast_start,
)
-834
View File
@@ -1,834 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import math
import os
import random
from typing import Callable, List, NamedTuple, Optional, Tuple, Union
import numpy as np
import pandas as pd
import rapidjson as json
from .common import (
MetaData,
CategoricalFeatureInfo,
BasicFeatureInfo,
FieldName,
Dataset,
TrainDatasets,
DataEntry,
)
from .list_dataset import ListDataset
from .recipe import (
BinaryHolidays,
BinaryMarkovChain,
Constant,
ForEachCat,
Lag,
LinearTrend,
RandomCat,
RandomGaussian,
Stack,
generate,
take_as_list,
)
from .stat import DatasetStatistics, calculate_dataset_statistics
class DatasetInfo(NamedTuple):
    """
    Information stored on a dataset. When downloading from the repository, the
    dataset repository checks that the obtained version matches the one
    declared in dataset_info/dataset_name.json.
    """

    # presumably the dataset_name used in dataset_info/dataset_name.json; confirm
    name: str
    # metadata of the dataset (see MetaData)
    metadata: MetaData
    # presumably the forecast horizon the dataset was built for; confirm
    prediction_length: int
    # statistics of the training split
    train_statistics: DatasetStatistics
    # statistics of the test split
    test_statistics: DatasetStatistics
class ArtificialDataset:
    """
    Base class for datasets that are produced by code.

    The ``metadata``, ``train`` and ``test`` properties defined here return
    ``None``; subclasses provide the actual content.
    """

    def __init__(self, freq) -> None:
        self.freq = freq

    @property
    def metadata(self) -> MetaData:
        """Metadata of the dataset; ``None`` in this base class."""

    @property
    def train(self) -> List[DataEntry]:
        """Entries of the training split; ``None`` in this base class."""

    @property
    def test(self) -> List[DataEntry]:
        """Entries of the test split; ``None`` in this base class."""

    # todo return the same type as dataset repo for better usability
    def generate(self) -> TrainDatasets:
        """
        Bundle the metadata with the train and test splits, each wrapped in
        a ``ListDataset`` that uses this dataset's frequency.
        """
        metadata = self.metadata
        train_split = ListDataset(self.train, self.freq)
        test_split = ListDataset(self.test, self.freq)
        return TrainDatasets(
            metadata=metadata, train=train_split, test=test_split
        )
class ConstantDataset(ArtificialDataset):
    """
    Artificial dataset whose time series are constant by default, with
    optional variations switched on through the constructor flags.

    Parameters
    ----------
    num_timeseries
        Number of time series to generate.
    num_steps
        Number of steps of a test series. The first ``num_steps // 10 * 8``
        steps form the training part; the rest is the prediction length.
    freq
        Frequency string of the series.
    start
        Start timestamp shared by all series.
    is_nan
        Generates constant dataset of 0s with explicit NaN missing values.
    is_random_constant
        Inserts random constant value for each time series.
    is_different_scales
        Generates constants on various scales.
    is_piecewise
        Determines whether the time series in the test and train set should
        have different constant values.
    is_noise
        Determines whether to add Gaussian noise to the constant dataset.
    is_long
        Determines whether some time series will have very long lengths.
    is_short
        Determines whether some time series will have very short lengths.
    is_trend
        Determines whether to add linear trends.
    num_missing_middle
        Number of missing values in the middle of the time series.
    is_promotions
        Determines whether to add promotions to the target time series and
        to store in metadata.
    holidays
        Determines whether to add holidays to the target time series and to
        store in metadata.
    """

    def __init__(
        self,
        num_timeseries: int = 10,
        num_steps: int = 30,
        freq: str = "1H",
        start: str = "2000-01-01 00:00:00",
        is_nan: bool = False,
        is_random_constant: bool = False,
        is_different_scales: bool = False,
        is_piecewise: bool = False,
        is_noise: bool = False,
        is_long: bool = False,
        is_short: bool = False,
        is_trend: bool = False,
        num_missing_middle: int = 0,
        is_promotions: bool = False,
        holidays: Optional[List[pd.Timestamp]] = None,
    ) -> None:
        super().__init__(freq)
        self.num_timeseries = num_timeseries
        self.num_steps = num_steps
        # 80% of the steps (in whole tenths) are used for training
        self.num_training_steps = self.num_steps // 10 * 8
        self.prediction_length = self.num_steps - self.num_training_steps
        self.start = start
        self.is_nan = is_nan
        self.is_random_constant = is_random_constant
        self.is_different_scales = is_different_scales
        self.is_piecewise = is_piecewise
        self.is_noise = is_noise
        self.is_long = is_long
        self.is_short = is_short
        self.is_trend = is_trend
        self.num_missing_middle = num_missing_middle
        self.is_promotions = is_promotions
        self.holidays = holidays

    @property
    def metadata(self) -> MetaData:
        """
        Metadata of the dataset. A dynamic real feature is declared only
        when promotions or holidays are enabled.
        """
        fields = dict(
            freq=self.freq,
            feat_static_cat=[
                {
                    "name": "feat_static_cat_000",
                    "cardinality": str(self.num_timeseries),
                }
            ],
            feat_static_real=[{"name": "feat_static_real_000"}],
            prediction_length=self.prediction_length,
        )
        if self.is_promotions or self.holidays:
            fields["feat_dynamic_real"] = [
                BasicFeatureInfo(name=FieldName.FEAT_DYNAMIC_REAL)
            ]
        return MetaData(**fields)

    def determine_constant(
        self, index: int, constant: Optional[float] = None, seed: int = 1
    ) -> Optional[float]:
        """
        Return the constant value for the series at position ``index``.

        * ``is_random_constant``: ``(index + 1)`` times a random number drawn
          from a generator seeded with ``seed``.
        * ``is_different_scales``: ``1e-8`` for the first series, otherwise
          100 times the incoming ``constant`` (left untouched when ``None``).
        * otherwise: ``float(index)``.
        """
        if self.is_random_constant:
            my_random = random.Random(seed)
            constant = (index + 1) * my_random.random()
        elif self.is_different_scales:
            if index == 0:
                constant = 1e-8
            elif constant is not None:
                constant *= 100
        else:
            constant = float(index)
        return constant

    def compute_data_from_recipe(
        self,
        num_steps: int,
        constant: Optional[float] = None,
        one_to_zero: float = 0.1,
        zero_to_one: float = 0.1,
        scale_features: float = 200,
    ) -> TrainDatasets:
        """
        Generate a single series from a recipe: a constant plus the optional
        noise, trend, promotion and holiday components that are enabled.

        Parameters
        ----------
        num_steps
            Total number of steps; the training length is ``num_steps`` minus
            the prediction length.
        constant
            Base value of the target.
        one_to_zero, zero_to_one
            Arguments forwarded to ``BinaryMarkovChain`` for the promotions.
        scale_features
            Factor applied to the promotion / holiday indicator before it is
            added to the target.
        """
        recipe = []
        recipe_type = Constant(constant)
        if self.is_noise:
            recipe_type += RandomGaussian()  # Use default stddev = 1.0
        if self.is_trend:
            recipe_type += LinearTrend()
        if self.is_promotions:
            recipe.append(
                ("binary_causal", BinaryMarkovChain(one_to_zero, zero_to_one))
            )
            recipe.append(
                (FieldName.FEAT_DYNAMIC_REAL, Stack(["binary_causal"]))
            )
            recipe_type += scale_features * Lag("binary_causal", lag=0)
        if self.holidays:
            timestamp = self.init_date()
            # Step by the timestamp's own frequency offset: integer
            # arithmetic on a Timestamp (`timestamp += 1`) raises a
            # TypeError on pandas >= 1.0.
            step = timestamp.freq
            # Compute dates array
            dates = []
            for _ in range(num_steps):
                dates.append(timestamp)
                timestamp = timestamp + step
            recipe.append(
                ("binary_holidays", BinaryHolidays(dates, self.holidays))
            )
            recipe.append(
                (FieldName.FEAT_DYNAMIC_REAL, Stack(["binary_holidays"]))
            )
            recipe_type += scale_features * Lag("binary_holidays", lag=0)
        recipe.append((FieldName.TARGET, recipe_type))
        max_train_length = num_steps - self.prediction_length
        data = RecipeDataset(
            recipe=recipe,
            metadata=self.metadata,
            max_train_length=max_train_length,
            prediction_length=self.prediction_length,
            # Add 1 time series at a time in the loop for different constant
            # values per time series
            num_timeseries=1,
        )
        return data.generate()

    def piecewise_constant(self, index: int, num_steps: int) -> List:
        """
        Return ``num_steps`` values that use one constant during the
        training steps and another one (seed 2) afterwards.
        """
        # both constants depend only on `index` and the seed, so they are
        # computed once instead of once per step
        train_value = self.determine_constant(index=index)
        test_value = self.determine_constant(index=index, seed=2)
        return [
            train_value if j < self.num_training_steps else test_value
            for j in range(num_steps)
        ]

    def get_num_steps(
        self,
        index: int,
        num_steps_max: int = 10000,
        long_freq: int = 4,
        num_steps_min: int = 2,
        short_freq: int = 4,
    ) -> int:
        """
        Return the number of steps for the series at ``index``: every
        ``long_freq``-th series is long (``num_steps_max``) when ``is_long``
        is set, every ``short_freq``-th one is short (``num_steps_min``) when
        ``is_short`` is set, all others use ``self.num_steps``.
        """
        num_steps = self.num_steps
        if self.is_long and index % long_freq == 0:
            num_steps = num_steps_max
        elif self.is_short and index % short_freq == 0:
            num_steps = num_steps_min
        return num_steps

    def init_date(self) -> pd.Timestamp:
        """
        Return the start timestamp carrying the dataset frequency; a plain
        weekly frequency ``"W"`` is anchored on the weekday of the start date.
        """
        weekday_names = ("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
        timestamp = pd.Timestamp(self.start)
        freq_week_start = self.freq
        if freq_week_start == "W":
            freq_week_start = f"W-{weekday_names[timestamp.weekday()]}"
        # NOTE(review): the `freq` argument of `pd.Timestamp` is not accepted
        # by every pandas release; confirm the supported pandas range.
        return pd.Timestamp(self.start, freq=freq_week_start)

    @staticmethod
    def insert_nans_and_zeros(ts_len: int) -> List:
        """
        Return ``ts_len`` values that are NaN at even indices (except index
        0, so that no NaN precedes the start date) and 0.0 elsewhere.
        """
        return [
            np.nan if j != 0 and j % 2 == 0 else 0.0 for j in range(ts_len)
        ]

    def insert_missing_vals_middle(
        self, ts_len: int, constant: Optional[float]
    ) -> List:
        """
        Return ``ts_len`` values equal to ``constant`` with a block of
        ``num_missing_middle`` NaNs centred in the training range, plus
        additional NaNs at every ``2 * floor(0.1 * num_missing_middle)``-th
        index outside that block.

        When that stride is zero (``num_missing_middle < 10``) only the
        central block is made missing.
        """
        lower_bound = (self.num_training_steps - self.num_missing_middle) // 2
        upper_bound = (self.num_training_steps + self.num_missing_middle) // 2
        stride = 2 * math.floor(0.1 * self.num_missing_middle)

        def is_missing(j: int) -> bool:
            if lower_bound <= j < upper_bound:
                return True
            # a zero stride means "no missing endpoints"; it must not be
            # used as a modulus
            if stride == 0:
                return False
            return (0 < j < lower_bound or j >= upper_bound) and j % stride == 0

        return [np.nan if is_missing(j) else constant for j in range(ts_len)]

    def generate_ts(
        self, num_ts_steps: int, is_train: bool = False
    ) -> List[DataEntry]:
        """
        Generate one data entry per time series.

        Parameters
        ----------
        num_ts_steps
            Number of steps of each generated target (not used for targets
            that are generated from a recipe).
        is_train
            Whether recipe-generated series are taken from the training
            split instead of the test split.
        """
        res = []
        constant = None
        for i in range(self.num_timeseries):
            if self.is_nan:
                target = self.insert_nans_and_zeros(num_ts_steps)
            elif self.is_piecewise:
                target = self.piecewise_constant(i, num_ts_steps)
            else:
                constant = self.determine_constant(i, constant)
                if self.num_missing_middle > 0:
                    target = self.insert_missing_vals_middle(
                        num_ts_steps, constant
                    )
                elif (
                    self.is_noise
                    or self.is_trend
                    or self.is_promotions
                    or self.holidays
                ):
                    num_steps = self.get_num_steps(i)
                    generated = self.compute_data_from_recipe(
                        num_steps, constant
                    )
                    if is_train:
                        time_series = generated.train
                    else:
                        assert generated.test is not None
                        time_series = generated.test
                    # returns np array convert to list for consistency
                    target = list(time_series)[0][FieldName.TARGET].tolist()
                else:
                    target = [constant] * num_ts_steps
            ts_data = dict(
                start=self.start,
                target=target,
                item_id=str(i),
                feat_static_cat=[i],
                feat_static_real=[i],
            )
            if self.is_promotions or self.holidays:
                # NOTE(review): `time_series` is only assigned in the recipe
                # branch above; combining promotions/holidays with `is_nan`,
                # `is_piecewise` or `num_missing_middle > 0` reaches this
                # line without it. Confirm the intended behaviour.
                ts_data[FieldName.FEAT_DYNAMIC_REAL] = list(time_series)[0][
                    FieldName.FEAT_DYNAMIC_REAL
                ].tolist()
            res.append(ts_data)
        return res

    @property
    def train(self) -> List[DataEntry]:
        """Training entries, covering ``num_training_steps`` steps."""
        return self.generate_ts(
            num_ts_steps=self.num_training_steps, is_train=True
        )

    @property
    def test(self) -> List[DataEntry]:
        """Test entries, covering ``num_steps`` steps."""
        return self.generate_ts(num_ts_steps=self.num_steps)
class ComplexSeasonalTimeSeries(ArtificialDataset):
"""
Generate sinus time series that ramp up and reach a certain amplitude, and
level and have additional spikes on each sunday.
TODO: This could be converted to a RecipeDataset to avoid code duplication.
"""
def __init__(
    self,
    num_series: int = 100,
    prediction_length: int = 20,
    freq_str: str = "D",
    length_low: int = 30,
    length_high: int = 200,
    min_val: float = -10000,
    max_val: float = 10000,
    is_integer: bool = False,
    proportion_missing_values: float = 0,
    is_noise: bool = True,
    is_scale: bool = True,
    percentage_unique_timestamps: float = 0.07,
    is_out_of_bounds_date: bool = False,
    seasonality: Optional[int] = None,
    clip_values: bool = False,
) -> None:
    """
    Store the generation settings; no data is generated here.

    :param num_series: number of time series generated in the train and
        test set
    :param prediction_length: number of trailing values removed from each
        series to form its training target
    :param freq_str: frequency string of the series; also selects the
        default seasonality
    :param length_low: minimum length of a time-series, must be larger than
        prediction_length
    :param length_high: maximum length of a time-series
    :param min_val: min value of a time-series
    :param max_val: max value of a time-series
    :param is_integer: whether the dataset has integers or not
    :param proportion_missing_values: presumably the share of values to
        mark as missing (TODO confirm)
    :param is_noise: whether to add noise
    :param is_scale: whether to add scale
    :param percentage_unique_timestamps: percentage of random start dates bounded between 0 and 1
    :param is_out_of_bounds_date: determines whether to use very old start dates and start dates far in the future
    :param seasonality: Seasonality of the generated data. If not given uses default seasonality for frequency
    :param clip_values: if True the values will be clipped to [min_val, max_val], otherwise linearly scales them
    """
    # every series must be longer than the part cut off for prediction
    assert length_low > prediction_length
    super(ComplexSeasonalTimeSeries, self).__init__(freq_str)
    self.num_series = num_series
    self.prediction_length = prediction_length
    self.length_low = length_low
    self.length_high = length_high
    self.freq_str = freq_str
    self.min_val = min_val
    self.max_val = max_val
    self.is_integer = is_integer
    self.proportion_missing_values = proportion_missing_values
    self.is_noise = is_noise
    self.is_scale = is_scale
    self.percentage_unique_timestamps = percentage_unique_timestamps
    self.is_out_of_bounds_date = is_out_of_bounds_date
    self.seasonality = seasonality
    self.clip_values = clip_values
@property
def metadata(self) -> MetaData:
    """Metadata holding the frequency and the prediction length."""
    info = MetaData(freq=self.freq, prediction_length=self.prediction_length)
    return info
def _get_period(self) -> int:
if self.seasonality is not None:
return self.seasonality
if self.freq_str == "M":
return 24
elif self.freq_str == "W":
return 52
elif self.freq_str == "D":
return 14
elif self.freq_str == "H":
return 24
elif self.freq_str == "min":
return 60
else:
raise RuntimeError()
def _get_start(self, index: int, my_random: random.Random) -> str:
if (
self.is_out_of_bounds_date and index == 0
): # Add edge case of dates out of normal bounds past date
start_y, start_m, start_d = (
1690,
2,
7,
) # Pandas doesn't allot before 1650
start_h, start_min = 18, 36
elif (
self.is_out_of_bounds_date and index == self.num_series - 1
): # Add edge case of dates out of normal bounds future date
start_y, start_m, start_d = (
2030,
6,
3,
) # Pandas doesn't allot before 1650
start_h, start_min = 18, 36
# assume that only 100 * percentage_unique_timestamps of timestamps are unique
elif my_random.random() < self.percentage_unique_timestamps:
start_y = my_random.randint(2000, 2018)
start_m = my_random.randint(1, 12)
start_d = my_random.randint(1, 28)
start_h = my_random.randint(0, 23)
start_min = my_random.randint(0, 59)
else:
start_y, start_m, start_d = 2013, 11, 28
start_h, start_min = 18, 36
if self.freq_str == "M":
return "%04.d-%02.d" % (start_y, start_m)
elif self.freq_str in ["W", "D"]:
return "%04.d-%02.d-%02.d" % (start_y, start_m, start_d)
elif self.freq_str == "H":
return "%04.d-%02.d-%02.d %02.d:00:00" % (
start_y,
start_m,
start_d,
start_h,
)
else:
return "%04.d-%02.d-%02.d %02.d:%02.d:00" % (
start_y,
start_m,
start_d,
start_h,
start_min,
)
def _special_time_point_indicator(self, index) -> bool:
if self.freq_str == "M":
return index.month == 1
elif self.freq_str == "W":
return index.month % 2 == 0
elif self.freq_str == "D":
return index.dayofweek == 0
elif self.freq_str == "H":
return index.hour == 0
elif self.freq_str == "min":
return index.minute % 30 == 0
else:
raise RuntimeError(f'Bad freq_str value "{index}"')
@property
def train(self) -> List[DataEntry]:
    """
    Training entries: the generated series with the last
    ``prediction_length`` target values removed.
    """
    entries = []
    for ts in self.make_timeseries():
        entry = dict(
            start=ts[FieldName.START],
            target=ts[FieldName.TARGET][: -self.prediction_length],
            item_id=ts[FieldName.ITEM_ID],
        )
        entries.append(entry)
    return entries
@property
def test(self) -> List[DataEntry]:
    """Test entries: the generated series at full length."""
    full_series = self.make_timeseries()
    return full_series
def make_timeseries(self, seed: int = 1) -> List[DataEntry]:
    """
    Generate ``self.num_series`` synthetic seasonal time series.

    Each series is a sine wave with a random phase plus a random bump on
    the "special" time points, optionally scaled and perturbed by Gaussian
    noise, multiplied by a sigmoid ramp-up envelope and finally mapped into
    ``[self.min_val, self.max_val]``.

    Parameters
    ----------
    seed
        Seed for both random generators. Two calls with the same seed
        return the same data, which keeps the training set equal to the
        test set minus its last ``prediction_length`` points.

    Returns
    -------
    List[DataEntry]
        One dict per series with the keys ``start``, ``target`` and
        ``item_id``.
    """

    def squash(x: np.ndarray) -> np.ndarray:
        # Logistic function, used for the ramp-up envelope.
        return 1.0 / (1.0 + np.exp(-x))

    # Both generators are seeded so that start dates and values are
    # reproducible across the independent train/test calls.
    start_rng = random.Random(seed)
    value_rng = np.random.RandomState(seed)

    entries = []
    for series_idx in range(self.num_series):
        span = self.max_val - self.min_val
        n_points = value_rng.randint(low=self.length_low, high=self.length_high)
        start = self._get_start(series_idx, start_rng)
        ramp = squash((np.arange(n_points) - 20.0) / 10.0)
        offset = 0.3 * span * (value_rng.random_sample() - 0.5)
        phase = 2 * np.pi * value_rng.random_sample()
        period = self._get_period()
        omega = 2 * np.pi / period
        steps = np.arange(n_points)
        timestamps = pd.date_range(
            start=start, freq=self.freq_str, periods=n_points
        )
        is_special = self._special_time_point_indicator(timestamps)
        special_effect = value_rng.random_sample() * is_special
        signal = np.sin(omega * steps + phase) + special_effect

        if self.is_scale:
            signal *= 0.1 * span * value_rng.random_sample()
            signal += offset
        if self.is_noise:
            noise_amplitude = 0.02 * span * value_rng.random_sample()
            signal += noise_amplitude * value_rng.normal(size=n_points)
        signal = ramp * signal

        if self.clip_values:
            np.clip(signal, a_min=self.min_val, a_max=self.max_val, out=signal)
        else:
            # Rather than mapping [v_min, v_max] to [self.min_val, self.max_val],
            # which would give every series the same min and max, keep the
            # interval length (v_max - v_min): shift [v_min, v_max] into
            # [self.min_val, self.max_val] and clip it if needed.
            lowest, highest = signal.min(), signal.max()
            kept_low = max(self.min_val, lowest)
            kept_high = min(self.max_val, highest)
            new_low = np.clip(
                kept_low + (kept_high - highest),
                a_min=self.min_val,
                a_max=self.max_val,
            )
            new_high = np.clip(
                kept_high + (kept_low - lowest),
                a_min=self.min_val,
                a_max=self.max_val,
            )
            signal = new_low + (new_high - new_low) * (signal - lowest) / (
                highest - lowest
            )

        if self.is_integer:
            np.clip(
                signal,
                a_min=np.ceil(self.min_val),
                a_max=np.floor(self.max_val),
                out=signal,
            )
            signal = np.round(signal).astype(int)

        values = list(signal.tolist())
        if self.proportion_missing_values > 0:
            assert (
                self.proportion_missing_values < 1.0
            ), "Please chose a number 0 < x < 1.0"
            positions = np.arange(len(values))
            value_rng.shuffle(positions)
            # Add one in case the product rounds down to zero.
            n_missing = int(len(values) * self.proportion_missing_values) + 1
            for pos in positions[:n_missing]:
                # Convention: the first observation is never missing.
                if pos != 0:
                    values[pos] = None if value_rng.rand() < 0.5 else "NaN"

        entries.append(
            dict(
                start=pd.Timestamp(start, freq=self.freq_str),
                target=np.array(values),
                item_id=series_idx,
            )
        )
    return entries
class RecipeDataset(ArtificialDataset):
    """Synthetic data set generated from a recipe.

    A recipe is either a (non-deterministic) function

        f(length: int, global_state: dict) -> dict

    or a list of (field, function) tuples of the form

        (field: str, f(data: dict, length: int, global_state: dict) -> dict)

    The list form is processed sequentially: ``data`` starts as ``{}`` and
    each entry stores the output of its function call in ``data[field]``.
    """

    def __init__(
        self,
        recipe: Union[Callable, List[Tuple[str, Callable]]],
        metadata: MetaData,
        max_train_length: int,
        prediction_length: int,
        num_timeseries: int,
        trim_length_fun=lambda x, **kwargs: 0,
        data_start=pd.Timestamp("2014-01-01"),
    ) -> None:
        """
        :param recipe: The recipe to generate from (see class docstring)
        :param metadata: The metadata to be included in the dataset
        :param max_train_length: The maximum length of a training time series.
        :param prediction_length: The length of the prediction range
        :param num_timeseries: Number of time series to generate
        :param trim_length_fun: Callable invoked as
            ``f(entry, train_length=max_train_length)``; its result is the
            number of leading time points dropped from that entry
        :param data_start: Start date for the data set
        """
        super().__init__(freq=metadata.freq)
        self.recipe = recipe
        self._metadata = metadata
        self.max_train_length = max_train_length
        self.prediction_length = prediction_length
        self.trim_length_fun = trim_length_fun
        self.num_timeseries = num_timeseries
        self.data_start = pd.Timestamp(data_start, freq=self._metadata.freq)

    @property
    def metadata(self) -> MetaData:
        """The metadata object supplied at construction time."""
        return self._metadata

    def dataset_info(self, train_ds: Dataset, test_ds: Dataset) -> DatasetInfo:
        """Bundle name, metadata, prediction length and the statistics of both splits."""
        train_stats = calculate_dataset_statistics(train_ds)
        test_stats = calculate_dataset_statistics(test_ds)
        return DatasetInfo(
            name=f"RecipeDataset({repr(self.recipe)})",
            metadata=self.metadata,
            prediction_length=self.prediction_length,
            train_statistics=train_stats,
            test_statistics=test_stats,
        )

    @staticmethod
    def trim_ts_item_end(x: DataEntry, length: int) -> DataEntry:
        """Return a copy of ``x`` without its last ``length`` time points.

        The target is cut along its first axis and the dynamic features,
        when present, along their second axis."""
        trimmed = dict(
            item_id=x[FieldName.ITEM_ID],
            start=x[FieldName.START],
            target=x[FieldName.TARGET][:-length],
        )
        for field in (FieldName.FEAT_DYNAMIC_CAT, FieldName.FEAT_DYNAMIC_REAL):
            if field in x:
                trimmed[field] = x[field][:, :-length]
        return trimmed

    @staticmethod
    def trim_ts_item_front(x: DataEntry, length: int) -> DataEntry:
        """Return a copy of ``x`` without its first ``length`` time points.

        The start timestamp is moved forward by ``length`` periods so the
        remaining values keep their dates. ``length`` must not exceed the
        target length."""
        assert length <= len(x[FieldName.TARGET])
        trimmed = dict(
            item_id=x[FieldName.ITEM_ID],
            start=x[FieldName.START] + length * x[FieldName.START].freq,
            target=x[FieldName.TARGET][length:],
        )
        for field in (FieldName.FEAT_DYNAMIC_CAT, FieldName.FEAT_DYNAMIC_REAL):
            if field in x:
                trimmed[field] = x[field][:, length:]
        return trimmed

    def generate(self) -> TrainDatasets:
        """Draw ``num_timeseries`` series from the recipe and split them.

        Test entries are the full-length series with their front trimmed by
        ``trim_length_fun``; training entries are the test entries without
        their last ``prediction_length`` points."""
        metadata = self.metadata
        total_length = self.max_train_length + self.prediction_length
        data_it = generate(
            length=total_length, recipe=self.recipe, start=self.data_start,
        )
        full_length_data = take_as_list(data_it, self.num_timeseries)

        test_data = []
        for entry in full_length_data:
            n_dropped = self.trim_length_fun(
                entry, train_length=self.max_train_length
            )
            test_data.append(RecipeDataset.trim_ts_item_front(entry, n_dropped))

        train_data = [
            RecipeDataset.trim_ts_item_end(entry, self.prediction_length)
            for entry in test_data
        ]
        return TrainDatasets(
            metadata=metadata,
            train=ListDataset(train_data, metadata.freq),
            test=ListDataset(test_data, metadata.freq),
        )
def default_synthetic() -> Tuple[DatasetInfo, Dataset, Dataset]:
    """
    Build a small daily synthetic dataset from a fixed recipe.

    The target is a linear trend plus Gaussian noise; each series also gets
    one static categorical feature and a static real feature derived from it.
    Ten series are generated with a maximum training length of 20 and a
    prediction length of 10.

    Returns
    -------
    Tuple[DatasetInfo, Dataset, Dataset]
        The dataset info, the training dataset and the test dataset.
    """
    target_fn = LinearTrend() + RandomGaussian()
    static_cat_fn = RandomCat([10])
    static_real_fn = ForEachCat(
        RandomGaussian(1, (10,)), FieldName.FEAT_STATIC_CAT
    ) + RandomGaussian(0.1, (10,))
    recipe = [
        (FieldName.TARGET, target_fn),
        (FieldName.FEAT_STATIC_CAT, static_cat_fn),
        (FieldName.FEAT_STATIC_REAL, static_real_fn),
    ]

    metadata = MetaData(
        freq="D",
        feat_static_real=[BasicFeatureInfo(name=FieldName.FEAT_STATIC_REAL)],
        feat_static_cat=[
            CategoricalFeatureInfo(name=FieldName.FEAT_STATIC_CAT, cardinality=10)
        ],
        feat_dynamic_real=[BasicFeatureInfo(name=FieldName.FEAT_DYNAMIC_REAL)],
    )

    def random_trim_length(x, **kwargs):
        # Geometric draw with mean train_length / 2, capped at train_length.
        train_length = kwargs["train_length"]
        return np.minimum(
            int(np.random.geometric(1 / (kwargs["train_length"] / 2))),
            train_length,
        )

    data = RecipeDataset(
        recipe=recipe,
        metadata=metadata,
        max_train_length=20,
        prediction_length=10,
        num_timeseries=10,
        trim_length_fun=random_trim_length,
    )

    generated = data.generate()
    assert generated.test is not None
    info = data.dataset_info(generated.train, generated.test)
    return info, generated.train, generated.test
def constant_dataset() -> Tuple[DatasetInfo, Dataset, Dataset]:
    """
    Build a tiny hourly dataset of ten constant time series.

    Series ``i`` has every target value equal to ``float(i)`` and carries
    ``i`` as its static categorical and static real feature. Training
    series have 24 points, test series 30, and the reported prediction
    length is 6.

    Returns
    -------
    Tuple[DatasetInfo, Dataset, Dataset]
        The dataset info, the training dataset and the test dataset.
    """
    metadata = MetaData(
        freq="1H",
        feat_static_cat=[
            CategoricalFeatureInfo(name="feat_static_cat_000", cardinality="10")
        ],
        feat_static_real=[BasicFeatureInfo(name="feat_static_real_000")],
    )

    start_date = "2000-01-01 00:00:00"

    def constant_entries(length: int) -> list:
        # One entry per series id 0..9, each a flat line of the given length.
        entries = []
        for i in range(10):
            entries.append(
                {
                    FieldName.ITEM_ID: str(i),
                    FieldName.START: start_date,
                    FieldName.TARGET: [float(i)] * length,
                    FieldName.FEAT_STATIC_CAT: [i],
                    FieldName.FEAT_STATIC_REAL: [float(i)],
                }
            )
        return entries

    train_ds = ListDataset(data_iter=constant_entries(24), freq=metadata.freq)
    test_ds = ListDataset(data_iter=constant_entries(30), freq=metadata.freq)

    train_stats = calculate_dataset_statistics(train_ds)
    test_stats = calculate_dataset_statistics(test_ds)
    info = DatasetInfo(
        name="constant_dataset",
        metadata=metadata,
        prediction_length=6,
        train_statistics=train_stats,
        test_statistics=test_stats,
    )
    return info, train_ds, test_ds
def generate_sf2(
    filename: str, time_series: List, is_missing: bool, num_missing: int
) -> None:
    """
    Write ``time_series`` to ``filename`` as JSON Lines (one object per line).

    This produces the train/test json files that are later converted to csv.
    Note that the entries of ``time_series`` are modified in place: the
    static feature fields are removed and, depending on the arguments, the
    target and the dynamic real features are rewritten.

    Parameters
    ----------
    filename
        Output path. Missing parent directories are created. When the path
        contains the substring ``"train"``, dynamic real features are
        truncated to the length of the target.
    time_series
        The entries (dicts) to serialize.
    is_missing
        Whether to blank out part of the target values.
    num_missing
        With ``is_missing`` set, every position ``j`` with ``j != 0`` and
        ``j % num_missing == 0`` is written as ``None``.
    """
    target_dir = os.path.dirname(filename)
    # dirname is "" for a bare file name and os.makedirs("") raises, so only
    # create a directory when there is one; exist_ok avoids the check/create race.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(filename, "w") as json_file:
        for ts in time_series:
            if is_missing:
                # Keep the first value; blank out every num_missing-th one.
                ts[FieldName.TARGET] = [
                    None if j != 0 and j % num_missing == 0 else val
                    for j, val in enumerate(ts[FieldName.TARGET])
                ]

            # For Forecast don't output feat_static_cat and feat_static_real
            ts.pop(FieldName.FEAT_STATIC_CAT, None)
            ts.pop(FieldName.FEAT_STATIC_REAL, None)

            # Chop features in training set
            if FieldName.FEAT_DYNAMIC_REAL in ts and "train" in filename:
                # TODO: Fix for missing values
                target_length = len(ts[FieldName.TARGET])
                dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL]
                for i, feature in enumerate(dynamic_real):
                    dynamic_real[i] = feature[:target_length]

            json.dump(ts, json_file)
            json_file.write("\n")
-95
View File
@@ -1,95 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import Any, Dict, Iterable, NamedTuple, List, Optional
import pandas as pd
from pydantic import BaseModel
# A single record flowing through the transformations: a plain dictionary
# keyed by field name (``FieldName`` below bundles the default names).
DataEntry = Dict[str, Any]
# A Dataset is any iterable of DataEntry records.
Dataset = Iterable[DataEntry]
class SourceContext(NamedTuple):
    """Origin of a data entry: the source it was read from and its row there."""

    # Identifier of the source (the datasets store a file path or a fixed label).
    source: str
    # Row / line number of the entry within that source.
    row: int
class FieldName:
    """
    Default names of the fields found in a data entry, for clients to use
    when instantiating transformer instances.
    """

    # Identification and time axis
    ITEM_ID = "item_id"
    START = "start"
    TARGET = "target"

    # Static and dynamic features, categorical and real-valued
    FEAT_STATIC_CAT = "feat_static_cat"
    FEAT_STATIC_REAL = "feat_static_real"
    FEAT_DYNAMIC_CAT = "feat_dynamic_cat"
    FEAT_DYNAMIC_REAL = "feat_dynamic_real"

    # Additional feature fields
    FEAT_TIME = "time_feat"
    FEAT_CONST = "feat_dynamic_const"
    FEAT_AGE = "feat_dynamic_age"

    # Bookkeeping fields
    OBSERVED_VALUES = "observed_values"
    IS_PAD = "is_pad"
    FORECAST_START = "forecast_start"
class CategoricalFeatureInfo(BaseModel):
    """Description of a categorical feature: its name and its cardinality."""

    # Name of the feature field.
    name: str
    # Number of categories; declared (and therefore stored) as a string.
    cardinality: str
class BasicFeatureInfo(BaseModel):
    """Description of a feature that is identified by its name only."""

    # Name of the feature field.
    name: str
class MetaData(BaseModel):
    """
    Metadata describing a dataset: its frequency, the features it provides
    and, optionally, the prediction length.

    Every field has a default, so ``MetaData()`` is a valid empty description.
    """

    # Frequency string of the time series. The default is None, so the
    # annotation must admit None as well.
    freq: Optional[str] = None
    # Description of the target field, if any.
    target: Optional[BasicFeatureInfo] = None

    # Feature descriptions, grouped by static/dynamic and categorical/real.
    feat_static_cat: List[CategoricalFeatureInfo] = []
    feat_static_real: List[BasicFeatureInfo] = []
    feat_dynamic_real: List[BasicFeatureInfo] = []
    feat_dynamic_cat: List[CategoricalFeatureInfo] = []

    # Length of the prediction range, if known.
    prediction_length: Optional[int] = None
class TrainDatasets(NamedTuple):
    """
    Metadata together with a training subset and an optional test subset
    of the same dataset.
    """

    # Description of the dataset.
    metadata: MetaData
    # Subset used for training.
    train: Dataset
    # Subset used for testing; may be absent.
    test: Optional[Dataset] = None
class DateConstants:
    """
    Default constants for specific dates.
    """

    # 1800-01-01 12:00, the oldest timestamp considered supported.
    OLDEST_SUPPORTED_TIMESTAMP = pd.Timestamp(year=1800, month=1, day=1, hour=12)
    # 2200-01-01 12:00, the latest timestamp considered supported.
    LATEST_SUPPORTED_TIMESTAMP = pd.Timestamp(year=2200, month=1, day=1, hour=12)
-133
View File
@@ -1,133 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import functools
import glob
import random
from pathlib import Path
from typing import Iterator, List
from typing import NamedTuple
import rapidjson as json
from .common import Dataset, DataEntry, SourceContext
from .process import ProcessDataEntry
def load(file_obj):
    """Lazily decode every line yielded by ``file_obj`` as a JSON document."""
    yield from map(json.loads, file_obj)
class Span(NamedTuple):
    """Location of a line: the file it belongs to and its line number."""

    # File the line was read from.
    path: Path
    # Line number within that file.
    line: int
class Line(NamedTuple):
    """A decoded line together with the location it was read from."""

    # The decoded content of the line.
    content: object
    # Where the line came from.
    span: Span
class JsonLinesFile:
    """
    An iterable type that draws from a JSON Lines file.

    Iterating yields one ``Line`` per line of the file, holding the decoded
    content and a ``Span`` with the path and the line number in the file.

    Parameters
    ----------
    path
        Path of the file to load data from. This should be a valid
        JSON Lines file.
    shuffle
        Whether to emit the lines in random order (uses the global
        ``random`` state).
    """

    def __init__(self, path: Path, shuffle: bool = True) -> None:
        self.path = path
        self.shuffle = shuffle

    def __iter__(self):
        with open(self.path) as jsonl_file:
            raw_lines = jsonl_file.read().splitlines()

        # Number the lines BEFORE shuffling so that spans and error messages
        # refer to the real position in the file, not the shuffled position.
        numbered_lines = list(enumerate(raw_lines, start=1))
        if self.shuffle:
            random.shuffle(numbered_lines)

        for line_number, raw in numbered_lines:
            span = Span(path=self.path, line=line_number)
            try:
                content = json.loads(raw)
            except ValueError as err:
                raise Exception(
                    f"Could not read json line {line_number}, {raw}"
                ) from err
            yield Line(content, span=span)

    def __len__(self):
        # Count lines rather than newline characters so that a last line
        # without a trailing "\n" is counted too. File iteration is buffered,
        # so the file is never held in memory as a whole.
        with open(self.path) as file_obj:
            return sum(1 for _ in file_obj)
class FileDataset(Dataset):
    """
    Dataset that loads the JSON Lines files matching a path pattern.

    Parameters
    ----------
    path
        Glob pattern; every matching file is part of the dataset and should
        be a valid JSON Lines file. A valid line can be for instance:
        {"start": "2014-09-07", "target": [0.1, 0.2]}.
    freq
        Frequency of the observation in the time series.
        Must be a valid Pandas frequency.
    one_dim_target
        Whether to accept only univariate target time series.
    shuffle
        Whether to shuffle both the order of the files and the order of the
        lines within each file.

    Raises
    ------
    OSError
        If no file matches ``path``.
    """

    def __init__(
        self, path: Path, freq: str, one_dim_target: bool = True, shuffle: bool = False
    ) -> None:
        self.shuffle = shuffle
        self.path = path
        self.process = ProcessDataEntry(freq, one_dim_target=one_dim_target)
        if not self.files():
            raise OSError(f"no valid file found via {path}")

    def __iter__(self) -> Iterator[DataEntry]:
        for file_path in self.files():
            for record in JsonLinesFile(file_path, self.shuffle):
                entry = self.process(record.content)
                # Remember where the entry came from.
                origin = record.span
                entry["source"] = SourceContext(source=origin.path, row=origin.line)
                yield entry

    def __len__(self):
        total = 0
        for file_path in self.files():
            total += len(JsonLinesFile(file_path))
        return total

    def files(self) -> List[Path]:
        """
        List the files that compose the dataset.

        Returns
        -------
        List[Path]
            Paths of all files matching ``self.path``, in random order when
            shuffling is enabled.
        """
        matches = glob.glob(str(self.path))
        if self.shuffle:
            random.shuffle(matches)
        return matches
-42
View File
@@ -1,42 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import random
from typing import Iterable
from .common import DataEntry, Dataset, SourceContext
from .process import ProcessDataEntry
class ListDataset(Dataset):
    """
    In-memory dataset built from an iterable of data entries.

    Every entry is passed through ``ProcessDataEntry`` once, at construction
    time, and the results are kept in ``self.list_data``.

    Parameters
    ----------
    data_iter
        The entries to store.
    freq
        Frequency passed on to ``ProcessDataEntry``.
    one_dim_target
        Passed on to ``ProcessDataEntry``.
    shuffle
        Whether to shuffle the stored entries once after processing.
    """

    def __init__(
        self,
        data_iter: Iterable[DataEntry],
        freq: str,
        one_dim_target: bool = True,
        shuffle: bool = False,
    ) -> None:
        process = ProcessDataEntry(freq, one_dim_target)
        self.list_data = list(map(process, data_iter))
        if shuffle:
            random.shuffle(self.list_data)

    def __iter__(self):
        source_name = "list_data"
        row_number = 0
        for entry in self.list_data:
            row_number += 1
            # Rows are numbered from 1; the stored entry is updated in place.
            entry["source"] = SourceContext(source=source_name, row=row_number)
            yield entry

    def __len__(self):
        return len(self.list_data)
-224
View File
@@ -1,224 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import itertools
from collections import defaultdict
from typing import Any, Dict, Iterable, Iterator, List, Optional # noqa: F401
import numpy as np
# Third-party imports
import torch
from pts.transform.transform import Transformation
# First-party imports
from .common import DataEntry, Dataset
# A batch as emitted by the loaders below: maps each field name to the
# stacked values of that field (see ``BatchBuffer.next_batch``).
DataBatch = Dict[str, Any]
class BatchBuffer:
    """
    Accumulates data entries field by field and hands them out as batches.

    Parameters
    ----------
    batch_size
        Maximum number of entries per emitted batch.
    device
        Device that stacked numpy data is moved to.
    dtype
        Floating point type that floating point numpy data is cast to.
    """

    def __init__(
        self, batch_size: int, device: torch.device, dtype: np.dtype = np.float32
    ) -> None:
        self._buffers: Dict[Any, List[Any]] = defaultdict(list)
        self.batch_size = batch_size
        self._size = 0
        self.device = device
        self.dtype = dtype

    def add(self, d: Dict[str, List[np.ndarray]]):
        """Append one entry; it must have the same keys as earlier entries."""
        if self._buffers:
            assert self._buffers.keys() == d.keys()
        for k, v in d.items():
            self._buffers[k].append(v)
        self._size += 1

    def __len__(self):
        return self._size

    def next_batch(self) -> Dict[str, Any]:
        """
        Remove the oldest ``min(len(self), batch_size)`` entries and return
        them as a batch (a ``DataBatch``): one stacked value per field.
        """
        assert self._size > 0
        n = min(self._size, self.batch_size)
        batch = {k: self.stack(v[:n]) for k, v in self._buffers.items()}
        for key in self._buffers.keys():
            self._buffers[key] = self._buffers[key][n:]
        self._size -= n
        return batch

    def stack(self, xs):
        """
        Combine the per-entry values ``xs`` of one field into a batch value.

        Numpy arrays are stacked, cast to ``self.dtype`` when floating point
        and moved to ``self.device``; tensors are stacked along a new leading
        axis; anything else is returned unchanged as a list.
        """
        if isinstance(xs[0], np.ndarray):
            data = np.asarray(xs)
            if data.dtype.kind == "f":
                data = data.astype(self.dtype)
            return torch.from_numpy(data).to(device=self.device, non_blocking=True)
        elif isinstance(xs[0], torch.Tensor):
            # torch.stack expects the sequence itself; unpacking it would
            # pass the second tensor as the ``dim`` argument.
            return torch.stack(xs)
        else:
            return xs  # stack all other types as list

    def shuffle(self):
        """Apply one random permutation to the buffered entries of all fields."""
        perm = np.random.permutation(self._size)
        for key in self._buffers.keys():
            li = self._buffers[key]
            self._buffers[key] = [li[i] for i in perm]
class DataLoader(Iterable[DataEntry]):
    """
    Base class of the data loaders: stores the dataset, the transformation
    and the batching configuration. Iteration is provided by the subclasses.

    Parameters
    ----------
    dataset
        The dataset from which to load data.
    transform
        A transformation to apply to each entry in the dataset.
    batch_size
        The size of the batches to emit.
    device
        device to use to store data on.
    dtype
        Floating point type to use.
    """

    def __init__(
        self,
        dataset: Dataset,
        transform: Transformation,
        batch_size: int,
        device: torch.device,
        dtype: np.dtype = np.float32,
    ) -> None:
        # What to load and how to transform it
        self.dataset = dataset
        self.transform = transform
        # How to batch it and where to put it
        self.batch_size = batch_size
        self.device = device
        self.dtype = dtype
class TrainDataLoader(DataLoader):
    """
    An Iterable type for iterating and transforming a dataset, in batches of a
    prescribed size, until a given number of batches is reached.

    The transformations are applied in training mode, i.e. with the flag
    `is_train = True`. The dataset is cycled through endlessly, and the
    position in it is kept between successive iterations over this object.

    Parameters
    ----------
    dataset
        The dataset from which to load data.
    transform
        A transformation to apply to each entry in the dataset.
    batch_size
        The size of the batches to emit.
    device
        device to use to store data on.
    num_batches_per_epoch
        Number of batches to return in one complete iteration over this object.
    dtype
        Floating point type to use.
    shuffle_for_training
        Whether to shuffle the buffered entries before emitting batches.
    num_batches_for_shuffling
        Number of batches worth of entries to buffer before shuffling and
        emitting; a single batch is buffered when shuffling is disabled.
    """

    def __init__(
        self,
        dataset: Dataset,
        transform: Transformation,
        batch_size: int,
        device: torch.device,
        num_batches_per_epoch: int,
        dtype: np.dtype = np.float32,
        shuffle_for_training: bool = True,
        num_batches_for_shuffling: int = 10,
    ) -> None:
        super().__init__(dataset, transform, batch_size, device, dtype)
        self.num_batches_per_epoch = num_batches_per_epoch
        self.shuffle_for_training = shuffle_for_training
        self._num_buffered_batches = (
            num_batches_for_shuffling if shuffle_for_training else 1
        )
        self._cur_iter: Optional[Iterator] = None
        self._buffer = BatchBuffer(self.batch_size, device, dtype)

    def _emit_batches_while_buffer_larger_than(self, thresh) -> Iterator[DataBatch]:
        """Shuffle the buffer if requested, then drain it down to ``thresh`` entries."""
        if self.shuffle_for_training:
            self._buffer.shuffle()
        while len(self._buffer) > thresh:
            yield self._buffer.next_batch()

    def _iterate_forever(self, collection: Iterable[DataEntry]) -> Iterator[DataEntry]:
        """
        Cycle over ``collection`` endlessly, one full pass after another.

        Raises an ``Exception`` as soon as a pass turns out to be empty.
        """
        while True:
            # Use ONE iterator per pass: peeking with a separate iterator and
            # then re-iterating the collection would yield the first element
            # twice for collections that can be iterated repeatedly.
            iterator = iter(collection)
            try:
                first = next(iterator)
            except StopIteration:
                raise Exception("empty dataset")
            yield first
            yield from iterator

    def __len__(self) -> int:
        return self.num_batches_per_epoch

    def __iter__(self) -> Iterator[DataBatch]:
        batch_count = 0
        # The entry stream is created once and then resumed by every epoch.
        if self._cur_iter is None:
            self._cur_iter = self.transform(
                self._iterate_forever(self.dataset), is_train=True
            )
        assert self._cur_iter is not None
        while True:
            data_entry = next(self._cur_iter)
            self._buffer.add(data_entry)
            if len(self._buffer) >= self._num_buffered_batches * self.batch_size:
                # Only full batches are emitted; the remainder stays buffered.
                for batch in self._emit_batches_while_buffer_larger_than(
                    self.batch_size - 1
                ):
                    yield batch
                    batch_count += 1
                    if batch_count >= self.num_batches_per_epoch:
                        return
class InferenceDataLoader(DataLoader):
    """
    An Iterable type for iterating and transforming a dataset just once, in
    batches of a prescribed size.

    The transformations are applied in inference mode, i.e. with the flag
    `is_train = False`. The last batch may hold fewer than ``batch_size``
    entries.

    Parameters
    ----------
    dataset
        The dataset from which to load data.
    transform
        A transformation to apply to each entry in the dataset.
    batch_size
        The size of the batches to emit.
    device
        device to use to store data on.
    dtype
        Floating point type to use.
    """

    def __iter__(self) -> Iterator[DataBatch]:
        pending = BatchBuffer(self.batch_size, self.device, self.dtype)
        transformed = self.transform(iter(self.dataset), is_train=False)
        for entry in transformed:
            pending.add(entry)
            if len(pending) < self.batch_size:
                continue
            yield pending.next_batch()
        # Flush whatever is left as a final, possibly smaller, batch.
        if len(pending):
            yield pending.next_batch()
-211
View File
@@ -1,211 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
import logging
from typing import Callable, Optional
import numpy as np
import pandas as pd
# First-party imports
from .common import DataEntry, Dataset, FieldName, DateConstants
from .list_dataset import ListDataset
class MultivariateGrouper:
    """
    Groups the series of a univariate dataset into multivariate time series.

    The grouper works in one of two modes, selected by ``num_test_dates``:

    Training (``num_test_dates`` is None): every series is aligned to the
    range from the earliest start to the latest end found in the dataset,
    padding on the left and on the right, which gives a single entry whose
    target has shape (dim, num_time_steps).

    Test (``num_test_dates`` is set): the test dataset might have multiple
    start dates (usually because it mimics a rolling evaluation scenario).
    The series are only left padded and then split into ``num_test_dates``
    multivariate entries. Note that the padded value will influence the
    prediction if the context length is longer than the time series.

    The padding rules of both modes can be specified by the user.

    Parameters
    ----------
    max_target_dim
        Set maximum dimensionality (for faster testing or when hitting
        constraints of multivariate model). Takes the last max_target_dim
        time series and groups them to multivariate time series.
    num_test_dates
        Number of test dates in the test set. This can be more than one if
        the test set contains more than one forecast start date (often the
        case in a rolling evaluation scenario). Must be set to convert test
        data.
    train_fill_rule
        Called with a series; returns the value used to fill the positions
        created by aligning the training series.
    test_fill_rule
        Called with a series; returns the value used to fill the positions
        created by left padding the test series.
    """

    def __init__(
        self,
        max_target_dim: Optional[int] = None,
        num_test_dates: Optional[int] = None,
        train_fill_rule: Callable = np.mean,
        test_fill_rule: Callable = lambda x: 0.0,
    ) -> None:
        self.num_test_dates = num_test_dates
        self.max_target_dimension = max_target_dim
        self.train_fill_function = train_fill_rule
        self.test_fill_rule = test_fill_rule

        # Start from the extreme bounds so that any real timestamp replaces them.
        self.first_timestamp = DateConstants.LATEST_SUPPORTED_TIMESTAMP
        self.last_timestamp = DateConstants.OLDEST_SUPPORTED_TIMESTAMP
        self.frequency = ""

    def __call__(self, dataset: Dataset) -> Dataset:
        self._preprocess(dataset)
        return self._group_all(dataset)

    def _preprocess(self, dataset: Dataset) -> None:
        """
        Scan the dataset once to collect what the alignment needs:

        1) the earliest start and the latest end timestamp
        2) the frequency of the dataset
        """
        for entry in dataset:
            start = entry[FieldName.START]
            end = start + (len(entry[FieldName.TARGET]) - 1) * start.freq
            self.first_timestamp = min(self.first_timestamp, start)
            self.last_timestamp = max(self.last_timestamp, end)
            self.frequency = start.freq
        logging.info(
            f"first/last timestamp found: "
            f"{self.first_timestamp}/{self.last_timestamp}"
        )

    def _group_all(self, dataset: Dataset) -> Dataset:
        """Dispatch to training or test grouping depending on ``num_test_dates``."""
        if self.num_test_dates is None:
            return self._prepare_train_data(dataset)
        return self._prepare_test_data(dataset)

    def _prepare_train_data(self, dataset: Dataset) -> ListDataset:
        """Align all series and return them as one multivariate entry."""
        logging.info("group training time-series to datasets")

        entry = self._transform_target(self._align_data_entry, dataset)
        entry = self._restrict_max_dimensionality(entry)
        entry[FieldName.START] = self.first_timestamp
        entry[FieldName.FEAT_STATIC_CAT] = [0]

        return ListDataset([entry], freq=self.frequency, one_dim_target=False)

    def _prepare_test_data(self, dataset: Dataset) -> ListDataset:
        """Left pad all series and return one multivariate entry per test date."""
        logging.info("group test time-series to datasets")

        padded = self._transform_target(self._left_pad_data, dataset)
        # splits test dataset with rolling date into N R^d time series where
        # N is the number of rolling evaluation dates
        chunks = np.split(padded[FieldName.TARGET], self.num_test_dates)

        all_entries = []
        for chunk in chunks:
            entry = {FieldName.TARGET: np.array(list(chunk), dtype=np.float32)}
            entry = self._restrict_max_dimensionality(entry)
            entry[FieldName.START] = self.first_timestamp
            entry[FieldName.FEAT_STATIC_CAT] = [0]
            all_entries.append(entry)

        return ListDataset(
            all_entries, freq=self.frequency, one_dim_target=False
        )

    def _align_data_entry(self, data: DataEntry) -> np.array:
        """Reindex a series onto the full [first, last] timestamp range."""
        ts = self.to_ts(data)
        full_range = pd.date_range(
            start=self.first_timestamp,
            end=self.last_timestamp,
            freq=data[FieldName.START].freq,
        )
        fill = self.train_fill_function(ts)
        return ts.reindex(full_range, fill_value=fill).values

    def _left_pad_data(self, data: DataEntry) -> np.array:
        """Reindex a series so that it starts at the first timestamp."""
        ts = self.to_ts(data)
        padded_range = pd.date_range(
            start=self.first_timestamp,
            end=ts.index[-1],
            freq=data[FieldName.START].freq,
        )
        fill = self.test_fill_rule(ts)
        return ts.reindex(padded_range, fill_value=fill).values

    @staticmethod
    def _transform_target(funcs, dataset: Dataset) -> DataEntry:
        """Apply ``funcs`` to every entry and stack the results as the target."""
        rows = [funcs(entry) for entry in dataset]
        return {FieldName.TARGET: np.array(rows)}

    def _restrict_max_dimensionality(self, data: DataEntry) -> DataEntry:
        """
        Keep only the last ``max_target_dimension`` dimensions of a
        multivariate data entry; a no-op when no maximum is configured.

        Parameters
        ----------
        data
            multivariate data entry with (dim, num_timesteps) target field

        Returns
        -------
        DataEntry
            The same entry, whose target field now has shape
            (max_target_dimension, num_timesteps)
        """
        if self.max_target_dimension is None:
            return data
        # restrict maximum dimensionality (for faster testing)
        target = data[FieldName.TARGET]
        data[FieldName.TARGET] = target[-self.max_target_dimension :, :]
        return data

    @staticmethod
    def to_ts(data: DataEntry) -> pd.Series:
        """Turn a data entry into a series indexed by its timestamps."""
        start = data[FieldName.START]
        target = data[FieldName.TARGET]
        timestamps = pd.date_range(
            start=start, periods=len(target), freq=start.freq,
        )
        return pd.Series(target, index=timestamps)
-116
View File
@@ -1,116 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from functools import lru_cache
from typing import Callable, List, cast
import numpy as np
import pandas as pd
from pandas.tseries.offsets import Tick
from .common import DataEntry
class ProcessStartField:
    """
    Converts the field ``name`` of a data entry into a ``pd.Timestamp`` with
    frequency ``freq``, aligned to that frequency.
    """

    def __init__(self, name: str, freq: str) -> None:
        self.name = name
        self.freq = freq

    def __call__(self, data: DataEntry) -> DataEntry:
        try:
            parsed = ProcessStartField.process(data[self.name], self.freq)
        except (TypeError, ValueError) as e:
            raise Exception(f'Error "{e}" occurred when reading field "{self.name}"')
        else:
            data[self.name] = parsed
        return data

    @staticmethod
    @lru_cache(maxsize=10000)
    def process(string: str, freq: str) -> pd.Timestamp:
        """Parse ``string`` and align the result to ``freq`` (results are cached)."""
        timestamp = pd.Timestamp(string, freq=freq)

        if not isinstance(timestamp.freq, Tick):
            # Only the date part matters here, so drop the time information
            # and roll forward onto the frequency's next valid date.
            midnight = timestamp.replace(
                hour=0, minute=0, second=0, microsecond=0, nanosecond=0
            )
            return midnight.freq.rollforward(midnight)

        # Tick frequencies operate on time information (days, hours,
        # minutes, seconds): floor to the frequency.
        return pd.Timestamp(timestamp.floor(timestamp.freq), timestamp.freq)
class ProcessTimeSeriesField:
    """
    Converts the field ``name`` of a data entry into a numpy array with the
    expected number of dimensions and dtype.

    Parameters
    ----------
    name
        Name of the field to convert.
    is_required
        Whether a missing (or ``None``) field is an error.
    is_static
        Static fields must be 1-dimensional, all others 2-dimensional.
    is_cat
        Categorical fields become ``np.int64``, all others ``np.float32``.
    """

    def __init__(self, name, is_required: bool, is_static: bool, is_cat: bool) -> None:
        self.name = name
        self.is_required = is_required
        self.req_ndim = 1 if is_static else 2
        self.dtype = np.int64 if is_cat else np.float32

    def __call__(self, data: DataEntry) -> DataEntry:
        value = data.get(self.name, None)
        if value is None:
            if self.is_required:
                raise Exception(
                    f"JSON object is missing a required field `{self.name}`"
                )
            return data

        value = np.asarray(value, dtype=self.dtype)
        dim_diff = self.req_ndim - value.ndim
        if dim_diff == 1:
            # One dimension short: add a leading axis.
            value = np.expand_dims(a=value, axis=0)
        elif dim_diff != 0:
            # Report the number of dimensions actually received, not the
            # difference to the required number.
            raise Exception(
                f"JSON array has bad shape - expected {self.req_ndim} dimensions got {value.ndim}"
            )

        data[self.name] = value
        return data
class ProcessDataEntry:
    """
    Chain of field processors applied to every data entry: the start field
    first, then the target and the four feature fields.

    Parameters
    ----------
    freq
        Frequency used to parse the ``start`` field.
    one_dim_target
        Whether the target is processed as a 1-dimensional field.
    """

    def __init__(self, freq: str, one_dim_target: bool = True) -> None:
        # (field name, is_required, is_cat, is_static), in processing order
        series_specs = [
            ("target", True, False, one_dim_target),
            ("feat_dynamic_cat", False, True, False),
            ("feat_dynamic_real", False, False, False),
            ("feat_static_cat", False, True, True),
            ("feat_static_real", False, False, True),
        ]

        steps = [ProcessStartField("start", freq=freq)]
        steps.extend(
            ProcessTimeSeriesField(
                name, is_required=required, is_cat=categorical, is_static=static
            )
            for name, required, categorical, static in series_specs
        )
        self.trans = cast(List[Callable[[DataEntry], DataEntry]], steps)

    def __call__(self, data: DataEntry) -> DataEntry:
        processed = data
        for step in self.trans:
            processed = step(processed)
        return processed
-604
View File
@@ -1,604 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
import functools
import itertools
import operator
from typing import (
Any,
Callable,
Dict,
Iterator,
List,
Optional,
Sequence,
Tuple,
Union,
)
# Third-party imports
import numpy as np
import pandas as pd
# First-party imports
from .common import DataEntry
# A recipe argument: a literal value, the name of an already computed field,
# or a callable that is evaluated against the current environment.
ValueOrCallable = Union[Any, Callable]
# Ordered list of (field name, callable) pairs describing how to build an entry.
Recipe = List[Tuple[str, Callable]]
# Mapping from field name to the value computed for it so far.
Env = Dict[str, Any]
def resolve(val_or_callable: ValueOrCallable, context: Env, *args, **kwargs):
    """Turn a recipe argument into a concrete value.

    * a callable is invoked as ``val_or_callable(context, *args, **kwargs)``;
    * a string is looked up as a key in ``context``;
    * anything else is returned unchanged.
    """
    if callable(val_or_callable):
        return val_or_callable(context, *args, **kwargs)
    if isinstance(val_or_callable, str):
        return context[val_or_callable]
    return val_or_callable
def generate(
    length: int,
    recipe: Union[Callable, Recipe],
    start: pd.Timestamp,
    global_state: Optional[dict] = None,
    seed: int = 0,
    item_id_prefix: str = "",
) -> Iterator[DataEntry]:
    """Yield an endless stream of data entries built from ``recipe``.

    Parameters
    ----------
    length
        Length forwarded to every recipe component.
    recipe
        Either a list of ``(field name, value-or-callable)`` pairs evaluated
        in order, or a single callable accepting ``length`` and
        ``global_state`` keyword arguments and returning a dict.
    start
        Value stored under ``"start"`` in every entry.
    global_state
        Dict shared by all generated entries; a fresh one is created if
        omitted.
    seed
        Seed passed to ``np.random.seed`` when iteration begins.
    item_id_prefix
        Prefix of the running ``"item_id"`` (``prefix + "0"``, ``"1"``, ...).
    """
    np.random.seed(seed)
    state = {} if global_state is None else global_state

    if isinstance(recipe, list):

        def make_entry():
            # Each field may refer to the fields computed before it, so the
            # partially filled entry doubles as the resolution context.
            entry: DataEntry = {}
            for key, spec in recipe:
                entry[key] = resolve(
                    spec, entry, length=length, field_name=key, global_state=state,
                )
            return entry

    else:
        assert callable(recipe)

        def make_entry():
            return recipe(length=length, global_state=state)

    for index in itertools.count():
        entry = make_entry()
        yield dict(**entry, item_id=item_id_prefix + str(index), start=start)
def evaluate(
    funcs: Recipe, length: int, *args, global_state: Optional[dict] = None, **kwargs
) -> Env:
    """Evaluate a recipe once and return the resulting environment.

    Parameters
    ----------
    funcs
        ``(field name, value-or-callable)`` pairs, evaluated in order; each
        one sees the fields computed before it.
    length
        Length forwarded to every component.
    global_state
        Dict shared by the components; a fresh one is created if omitted.
    *args, **kwargs
        Forwarded to every component. A ``field_name`` keyword is dropped
        because each component receives its own field name.

    Raises
    ------
    ValueError
        If a component raises ``ValueError``; the new error names the
        offending key and is chained to the original one.
    """
    if global_state is None:
        global_state = {}
    # `length` and `global_state` are named parameters and therefore can never
    # show up in **kwargs; only a caller-supplied `field_name` can collide
    # with the keywords passed below.
    kwargs.pop("field_name", None)

    data: DataEntry = {}
    for k, f in funcs:
        try:
            data[k] = resolve(
                f,
                data,
                *args,
                length=length,
                field_name=k,
                global_state=global_state,
                **kwargs
            )
        except ValueError as e:
            raise ValueError('Error while evaluating key "{}"'.format(k), e) from e
    return data
def make_func(
    length: int, funcs: Recipe, global_state=None
) -> Callable[[int, Env], DataEntry]:
    """Wrap a recipe into a callable that evaluates it on demand.

    The returned function takes ``length`` and ``global_state`` (defaulting
    to the values given here) plus arbitrary extra arguments, which are
    forwarded to every recipe component, and returns the computed entry.
    """
    if global_state is None:
        global_state = {}

    def f(length=length, global_state=global_state, *args, **kwargs):
        entry = {}
        for key, spec in funcs:
            # The partially built entry is the context for later fields.
            entry[key] = resolve(
                spec,
                entry,
                *args,
                length=length,
                field_name=key,
                global_state=global_state,
                **kwargs
            )
        return entry

    return f
def take_as_list(iterator, num):
    """Return the first ``num`` items of ``iterator`` as a list.

    Fewer items are returned when the iterator is exhausted earlier.
    """
    head = itertools.islice(iterator, num)
    return list(head)
class Debug:
    """Recipe component that prints the current environment.

    With ``print_global=True`` the global state is printed as well. The
    component always evaluates to ``0``.
    """

    def __init__(self, print_global=False) -> None:
        self.print_global = print_global

    def __call__(self, x: Env, global_state, **kwargs):
        to_print = (x, global_state) if self.print_global else (x,)
        for item in to_print:
            print(item)
        return 0
class Lifted:
    """Base class for recipe components that support arithmetic operators.

    ``+``, ``-``, ``*`` and ``/`` with a ``Lifted`` instance on either side
    do not compute anything immediately; they build a deferred binary
    operation that is evaluated when the recipe is resolved.
    """

    def __add__(self, other):
        return LiftedAdd(self, other)

    def __radd__(self, other):
        return LiftedAdd(other, self)

    def __sub__(self, other):
        return LiftedSub(self, other)

    def __rsub__(self, other):
        return LiftedSub(other, self)

    # LiftedMul / LiftedTruediv supply their operator themselves and only
    # accept the two operands; passing the operator as a third argument
    # raised a TypeError.
    def __mul__(self, other):
        return LiftedMul(self, other)

    def __rmul__(self, other):
        return LiftedMul(other, self)

    def __truediv__(self, other):
        return LiftedTruediv(self, other)

    def __rtruediv__(self, other):
        return LiftedTruediv(other, self)

    def __call__(
        self, x: Env, length: int, field_name: str, global_state: Dict, *args, **kwargs
    ):
        """Evaluate the component; the base implementation returns ``None``."""
        pass


class LiftedBinaryOp(Lifted):
    """Deferred binary operation ``op(left, right)`` on two recipe arguments."""

    def __init__(self, left, right, op) -> None:
        self.left = left
        self.right = right
        self.op = op

    def __call__(self, *args, **kwargs):
        # Both operands are resolved with the same arguments, left first.
        left = resolve(self.left, *args, **kwargs)
        right = resolve(self.right, *args, **kwargs)
        return self.op(left, right)


class LiftedAdd(LiftedBinaryOp):
    """Deferred ``left + right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.add)


class LiftedSub(LiftedBinaryOp):
    """Deferred ``left - right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.sub)


class LiftedMul(LiftedBinaryOp):
    """Deferred ``left * right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.mul)


class LiftedTruediv(LiftedBinaryOp):
    """Deferred ``left / right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.truediv)
class RandomGaussian(Lifted):
    """Gaussian noise scaled by ``stddev``.

    Every entry of ``shape`` that equals 0 is replaced by the requested
    length when the component is evaluated.
    """

    def __init__(
        self, stddev: ValueOrCallable = 1.0, shape: Sequence[int] = (0,)
    ) -> None:
        self.stddev = stddev
        self.shape = shape

    def __call__(self, x: Env, length: int, *args, **kwargs):
        scale = resolve(self.stddev, x, length, *args, **kwargs)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        noise = np.random.randn(*dims)
        return scale * noise
# Binary recipe that returns 1 if date is in holidays list and 0 otherwise
class BinaryHolidays(Lifted):
    """Indicator series: 1.0 where a date is a holiday, 0.0 elsewhere.

    The output has one entry per element of ``dates``; the evaluation
    arguments are ignored.
    """

    # TODO: holidays is type List[datetime.date]
    def __init__(self, dates: List[pd.Timestamp], holidays: List[Any]) -> None:
        self.dates = dates
        self.holidays = holidays

    def __call__(self, *args, **kwargs):
        # Membership is tested on the calendar date of each timestamp.
        flags = [
            1.0 if stamp.date() in self.holidays else 0.0 for stamp in self.dates
        ]
        return np.array(flags, dtype=float)
class RandomBinary(Lifted):
    """Random 0.0/1.0 series; each entry is 1.0 when a uniform draw is below ``prob``."""

    def __init__(self, prob: ValueOrCallable = 0.1) -> None:
        self.prob = prob

    def __call__(self, x: Env, length: int, *args, **kwargs):
        threshold = resolve(self.prob, x, length, *args, **kwargs)
        draws = np.random.rand(length)
        # Multiplying by 1.0 turns the boolean mask into floats.
        return 1.0 * (draws < threshold)
class RandomSymmetricDirichlet(Lifted):
    """Sample from a Dirichlet distribution whose parameters all equal ``alpha``.

    Entries of ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self, alpha: ValueOrCallable = 1.0, shape: Sequence[int] = (0,)
    ) -> None:
        self.alpha = alpha
        self.shape = shape

    def __call__(self, x, length, *args, **kwargs):
        concentration = resolve(self.alpha, x, length, *args, **kwargs)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        parameters = concentration * np.ones(dims)
        return np.random.dirichlet(parameters)
class BinaryMarkovChain(Lifted):
    """Sample a two-state (0/1) Markov chain that starts in state 1.

    Parameters
    ----------
    one_to_zero
        Probability of switching from state 1 to state 0.
    zero_to_one
        Probability of switching from state 0 to state 1.
    """

    def __init__(
        self, one_to_zero: ValueOrCallable, zero_to_one: ValueOrCallable
    ) -> None:
        self.one_to_zero = one_to_zero
        self.zero_to_one = zero_to_one

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # probs[s] is the probability of leaving state s.
        probs = np.zeros(2)
        probs[0] = resolve(self.zero_to_one, x, length, *args, **kwargs)
        probs[1] = resolve(self.one_to_zero, x, length, *args, **kwargs)
        # The `np.int` alias was removed from NumPy; the builtin `int` is the
        # type it stood for.
        out = np.ones(length, dtype=int)  # initial state is 1
        uu = np.random.rand(length)
        for i in range(1, length):
            if uu[i] < probs[out[i - 1]]:
                out[i] = 1 - out[i - 1]
            else:
                out[i] = out[i - 1]
        return out
class Constant(Lifted):
    """Component that always evaluates to the stored value, ignoring all arguments."""

    def __init__(self, constant) -> None:
        self.constant = constant

    def __call__(self, *args, **kwargs):
        return self.constant
class ConstantVec(Lifted):
    """Vector of the requested length filled with the resolved ``constant``."""

    def __init__(self, constant: ValueOrCallable) -> None:
        self.constant = constant

    def __call__(self, x: Env, length: int, *args, **kwargs):
        fill = resolve(self.constant, x, length, *args, **kwargs)
        ones = np.ones(length)
        return fill * ones
class NormalizeMax(Lifted):
    """Divide the resolved input by its maximum value."""

    def __init__(self, input) -> None:
        self.input = input

    def __call__(self, x: Env, *args, **kwargs):
        # Keyword arguments must be forwarded as keywords; passing the dict
        # positionally handed it to the input as an extra positional argument.
        inp = resolve(self.input, x, *args, **kwargs)
        return inp / np.max(inp)
class OnesLike(Lifted):
    """Array of ones with the same shape and type as the resolved ``other``."""

    def __init__(self, other) -> None:
        self.other = other

    def __call__(self, x, length, *args, **kwargs):
        # Extra positional arguments are not forwarded to the input.
        template = resolve(self.other, x, length, **kwargs)
        return np.ones_like(template)
class LinearTrend(Lifted):
    """Linear ramp ``slope * t / length`` for ``t = 0 .. length - 1``."""

    def __init__(self, slope: ValueOrCallable = 1.0) -> None:
        self.slope = slope

    def __call__(self, x, length, *args, **kwargs):
        gradient = resolve(self.slope, x, length, *args, **kwargs)
        steps = np.arange(length)
        return gradient * steps / length
class RandomCat:
    """Draw one random category per cardinality.

    The category probabilities are produced by ``prob_fun`` the first time
    the component is evaluated for a field and are cached in
    ``global_state`` under the field name, so that all later evaluations
    sample from the same distributions.
    """

    def __init__(
        self,
        cardinalities: List[int],
        prob_fun: Callable = RandomSymmetricDirichlet(alpha=1.0, shape=(0,)),
    ) -> None:
        self.cardinalities = cardinalities
        self.prob_fun = prob_fun

    def __call__(self, x, field_name, global_state, **kwargs):
        if field_name not in global_state:
            global_state[field_name] = [
                self.prob_fun(x, length=cardinality)
                for cardinality in self.cardinalities
            ]
        distributions = global_state[field_name]
        draws = [
            np.random.choice(np.arange(len(weights)), p=weights)
            for weights in distributions
        ]
        return np.array(draws)
class Lag(Lifted):
    """Shift the resolved input by ``lag`` steps, padding with ``pad_const``.

    A positive lag shifts the values towards later positions (padding at the
    front), a negative lag towards earlier positions (padding at the end);
    otherwise the input is returned as is.
    """

    def __init__(
        self, input: ValueOrCallable, lag: ValueOrCallable = 0, pad_const: int = 0,
    ) -> None:
        self.input = input
        self.lag = lag
        self.pad_const = pad_const

    def __call__(self, x, *args, **kwargs):
        series = resolve(self.input, x, *args, **kwargs)
        shift = resolve(self.lag, x, *args, **kwargs)

        if shift > 0:
            padding = self.pad_const * np.ones(shift)
            return np.concatenate((padding, series[:-shift]))
        if shift < 0:
            padding = self.pad_const * np.ones(-shift)
            return np.concatenate((series[-shift:], padding))
        return series
class ForEachCat(Lifted):
    """Evaluate ``fun`` once per category and reuse the result afterwards.

    The category of the current entry is read from
    ``x[cat_field][cat_idx]``; the per-category results are cached in
    ``global_state`` under the field name being computed.
    """

    def __init__(self, fun, cat_field="cat", cat_idx=0) -> None:
        self.fun = fun
        self.cat_field = cat_field
        self.cat_idx = cat_idx

    def __call__(
        self, x: Env, length: int, field_name: str, global_state: Dict, *args, **kwargs
    ):
        c = x[self.cat_field][self.cat_idx]
        if field_name not in global_state:
            # One cache slot per category; an object array from np.empty
            # starts out filled with None. The slot count is taken from the
            # length of global_state[cat_field][cat_idx] (presumably the
            # probability vector stored by the categorical component -
            # confirm against the recipe in use).
            # The `np.object` alias was removed from NumPy; the builtin
            # `object` is the type it stood for.
            global_state[field_name] = np.empty(
                len(global_state[self.cat_field][self.cat_idx]), dtype=object,
            )
        if global_state[field_name][c] is None:
            global_state[field_name][c] = self.fun(
                x, length=length, field_name=field_name, *args, **kwargs
            )
        return global_state[field_name][c]
class Eval(Lifted):
    """Evaluate a Python expression string.

    The expression sees the module globals plus ``x`` (the environment),
    ``length`` and every keyword argument passed to the component.
    """

    def __init__(self, expr: str) -> None:
        self.expr = expr

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # NOTE: eval executes arbitrary code - only use trusted expressions.
        local_names = dict(x=x, length=length, **kwargs)
        return eval(self.expr, globals(), local_names)
class SmoothSeasonality(Lifted):
    """Sine-shaped seasonal pattern with values in ``[0, 1]``.

    ``period`` is the length of one cycle in steps and ``phase`` an offset
    added to the step index.
    """

    def __init__(self, period: ValueOrCallable, phase: ValueOrCallable) -> None:
        self.period = period
        self.phase = phase

    def __call__(self, x: Env, length: int, *args, **kwargs):
        period = resolve(self.period, x, length, *args, **kwargs)
        phase = resolve(self.phase, x, length, *args, **kwargs)
        angle = 2.0 / period * np.pi * (np.arange(length) + phase)
        # Shift and rescale the sine from [-1, 1] to [0, 1].
        return (np.sin(angle) + 1) / 2.0
class Add(Lifted):
    """Sum of all resolved inputs."""

    def __init__(self, inputs: List[ValueOrCallable]) -> None:
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        terms = [resolve(term, x, length, *args, **kwargs) for term in self.inputs]
        return sum(terms)
class Mul(Lifted):
    """Product of all resolved inputs."""

    def __init__(self, inputs) -> None:
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        factors = [
            resolve(factor, x, length, *args, **kwargs) for factor in self.inputs
        ]
        return functools.reduce(operator.mul, factors)
class NanWhere(Lifted):
    """Copy of ``source`` with NaN wherever ``nan_indicator`` equals 1."""

    def __init__(self, source: ValueOrCallable, nan_indicator: ValueOrCallable) -> None:
        self.source = source
        self.nan_indicator = nan_indicator

    def __call__(self, x: Env, length: int, *args, **kwargs):
        values = resolve(self.source, x, length, *args, **kwargs)
        indicator = resolve(self.nan_indicator, x, length, *args, **kwargs)
        # Work on a copy so that the resolved source is left untouched.
        masked = values.copy()
        masked[indicator == 1] = np.nan
        return masked
class OneMinus(Lifted):
    """Evaluate to ``1 - source``."""

    def __init__(self, source: ValueOrCallable) -> None:
        self.source = source

    def __call__(self, x: Env, length: int, *args, **kwargs):
        resolved = resolve(self.source, x, length, *args, **kwargs)
        return 1 - resolved
class Concatenate(Lifted):
    """Concatenate the resolved inputs along ``axis``."""

    def __init__(self, inputs: List[ValueOrCallable], axis: int = 0) -> None:
        self.inputs = inputs
        self.axis = axis

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Extra positional arguments are not forwarded to the inputs.
        parts = [resolve(part, x, length, **kwargs) for part in self.inputs]
        return np.concatenate(parts, axis=self.axis)
class Stack(Lifted):
    """Stack the resolved inputs along a new leading axis."""

    def __init__(self, inputs: List[ValueOrCallable]) -> None:
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Extra positional arguments are not forwarded to the inputs.
        layers = [resolve(layer, x, length, **kwargs) for layer in self.inputs]
        return np.stack(layers, axis=0)
class StackPrefix(Lifted):
    """Stack all environment fields whose name starts with ``prefix``.

    The fields are stacked along a new leading axis in the iteration order
    of the environment.
    """

    def __init__(self, prefix: str) -> None:
        self.prefix = prefix

    def __call__(self, x: Env, length: int, *args, **kwargs):
        matching = [
            value for name, value in x.items() if name.startswith(self.prefix)
        ]
        return np.stack(matching, axis=0)
class Ref(Lifted):
    """Reference to an already computed field of the environment."""

    def __init__(self, field_name: str) -> None:
        self.field_name = field_name

    def __call__(self, x: Env, length: int, *args, **kwargs):
        referenced = x[self.field_name]
        return referenced
class RandomUniform(Lifted):
    """Uniform random samples between ``low`` and ``high``.

    Entries of ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self, low: ValueOrCallable = 0.0, high: ValueOrCallable = 1.0, shape=(0,),
    ) -> None:
        self.low = low
        self.high = high
        self.shape = shape

    def __call__(self, x: Env, length: int, *args, **kwargs):
        lower = resolve(self.low, x, length, *args, **kwargs)
        upper = resolve(self.high, x, length, *args, **kwargs)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        return np.random.uniform(lower, upper, dims)
class RandomInteger(Lifted):
    """Random integers drawn with ``np.random.randint(low, high, ...)``.

    With ``shape=None`` a single draw is made; otherwise entries of
    ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self,
        low: ValueOrCallable,
        high: ValueOrCallable,
        shape: Optional[Sequence[int]] = (0,),
    ) -> None:
        self.low = low
        self.high = high
        self.shape = shape

    def __call__(self, x: Env, length: int, *args, **kwargs):
        lower = resolve(self.low, x, length, *args, **kwargs)
        upper = resolve(self.high, x, length, *args, **kwargs)

        if self.shape is None:
            return np.random.randint(lower, upper)

        dims = np.array(self.shape)
        dims[dims == 0] = length
        return np.random.randint(lower, upper, dims)
class RandomChangepoints(Lifted):
    """Piecewise-constant integer series with random change points.

    A random number of change points between 0 and ``max_num_changepoints``
    (inclusive) is drawn; the series is 0 before the first change point and
    takes the value ``i + 1`` from the ``i``-th change point onwards.
    """

    def __init__(self, max_num_changepoints: ValueOrCallable) -> None:
        self.max_num_changepoints = max_num_changepoints

    def __call__(self, x: Env, length: int, *args, **kwargs):
        max_num_changepoints = resolve(
            self.max_num_changepoints, x, length, *args, **kwargs
        )
        num_changepoints = np.random.randint(0, max_num_changepoints + 1)
        change_idx = np.sort(
            np.random.randint(low=1, high=length - 1, size=(num_changepoints,))
        )
        # Append the series end so that every segment has an upper bound.
        change_ranges = np.concatenate([change_idx, [length]])
        # The `np.int` alias was removed from NumPy; the builtin `int` is the
        # type it stood for.
        out = np.zeros(length, dtype=int)
        for i in range(0, num_changepoints):
            out[change_ranges[i] : change_ranges[i + 1]] = i + 1
        return out
class Repeated(Lifted):
    """Repeat the resolved ``pattern`` until the requested length is filled."""

    def __init__(self, pattern: ValueOrCallable) -> None:
        self.pattern = pattern

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Extra positional arguments are not forwarded to the pattern.
        motif = resolve(self.pattern, x, length, **kwargs)
        # One repetition more than strictly needed, then cut to size.
        copies = length // len(motif) + 1
        tiled = np.tile(motif, (copies,))
        return tiled[:length]
class Convolve(Lifted):
    """Convolve the resolved input with the resolved filter (``mode="same"``)."""

    def __init__(self, input: ValueOrCallable, filter: ValueOrCallable) -> None:
        self.filter = filter
        self.input = input

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # The filter is resolved before the input; extra positional
        # arguments are not forwarded.
        kernel = resolve(self.filter, x, length, **kwargs)
        signal = resolve(self.input, x, length, **kwargs)
        return np.convolve(signal, kernel, mode="same")
class Dilated(Lifted):
    """Stretch ``source`` by repeating each of its values ``dilation`` times."""

    def __init__(self, source: Callable, dilation: int) -> None:
        self.source = source
        self.dilation = dilation

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Ask the source for enough values to cover `length` after stretching.
        short_length = length // self.dilation + 1
        coarse = self.source(x, short_length, **kwargs)
        stretched = np.repeat(coarse, self.dilation)
        return stretched[:length]
class Choose(Lifted):
    """Pick, for every position, the value of the option named by ``selector``.

    The selection is done by multiplying one-hot rows (taken from an identity
    matrix indexed with the selector) with the transposed options and summing
    over the option axis.
    """

    def __init__(self, options: ValueOrCallable, selector: ValueOrCallable) -> None:
        self.options = options
        self.selector = selector

    def __call__(self, x, length, **kwargs):
        options = resolve(self.options, x, length, **kwargs)
        selector = resolve(self.selector, x, length, **kwargs)
        one_hot = np.eye(options.shape[0])[selector]
        return np.sum(one_hot * options.T, axis=1)
class EvalRecipe(Lifted):
    """Evaluate a nested recipe and resolve ``op`` against its result.

    The outer environment ``x`` is not visible to the nested recipe; ``op``
    is resolved with the environment produced by the nested recipe.
    """

    def __init__(self, recipe: Recipe, op: ValueOrCallable) -> None:
        self.recipe = recipe
        self.op = op

    def __call__(self, x: Env, *args, **kwargs):
        inner_env = evaluate(self.recipe, *args, **kwargs)
        return resolve(self.op, inner_env, *args, **kwargs)
+1 -14
View File
@@ -1,14 +1 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from .datasets import get_dataset, dataset_recipes
from .datasets import dataset_recipes
-48
View File
@@ -1,48 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
import json
from pathlib import Path
# First-party imports
from pts.dataset import ArtificialDataset, generate_sf2, serialize_data_entry
def generate_artificial_dataset(dataset_path: Path, dataset: ArtificialDataset) -> None:
    """Generate ``dataset`` and write it below ``dataset_path``.

    Creates ``metadata.json``, ``train/train.json`` and ``test/test.json``.
    The ``train`` and ``test`` directories must not exist yet.
    """
    train_dir = dataset_path / "train"
    test_dir = dataset_path / "test"

    dataset_path.mkdir(exist_ok=True)
    for split_dir in (train_dir, test_dir):
        # exist_ok=False: fail instead of writing into an existing split.
        split_dir.mkdir(exist_ok=False)

    ds = dataset.generate()
    assert ds.test is not None

    with (dataset_path / "metadata.json").open("w") as fp:
        json.dump(ds.metadata.dict(), fp, indent=2, sort_keys=True)

    outputs = (
        (train_dir / "train.json", ds.train),
        (test_dir / "test.json", ds.test),
    )
    for target_file, entries in outputs:
        generate_sf2(
            filename=str(target_file),
            time_series=list(map(serialize_data_entry, entries)),
            is_missing=False,
            num_missing=0,
        )
-160
View File
@@ -1,160 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
Loads the datasets used in Salinas et al. 2019 (https://tinyurl.com/woyhhqy).
This wrapper downloads and unpacks them so they don't have to be attached as
large files in GluonTS master.
"""
import json
import os
import shutil
import tarfile
from pathlib import Path
from typing import NamedTuple, Optional
from urllib import request
from pts.dataset import FileDataset, FieldName
from ._util import save_to_file, to_dict, metadata
class GPCopulaDataset(NamedTuple):
    """Description of one downloadable dataset archive."""

    # Used to build the local archive file name (``<name>.tar.gz``).
    name: str
    # Location the archive is downloaded from.
    url: str
    # Number of series; written as the cardinality in the metadata and used
    # to derive the static category of each entry.
    num_series: int
    # Forecast horizon written to the metadata.
    prediction_length: int
    # Frequency string used when reading the data and in the metadata.
    freq: str
    rolling_evaluations: int
    max_target_dim: Optional[int] = None
# Base URL of the dataset archives.
root = (
    "https://raw.githubusercontent.com/mbohlkeschneider/gluon-ts/mv_release/datasets/"
)
# Registry of the available datasets, keyed by the dataset name that is passed
# to `generate_gp_copula_dataset`.
datasets_info = {
    "exchange_rate_nips": GPCopulaDataset(
        name="exchange_rate_nips",
        url=root + "exchange_rate_nips.tar.gz",
        num_series=8,
        prediction_length=30,
        freq="B",
        rolling_evaluations=5,
        max_target_dim=None,
    ),
    "electricity_nips": GPCopulaDataset(
        name="electricity_nips",
        url=root + "electricity_nips.tar.gz",
        # original dataset can be found at https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
        num_series=370,
        prediction_length=24,
        freq="H",
        rolling_evaluations=7,
        max_target_dim=None,
    ),
    "traffic_nips": GPCopulaDataset(
        name="traffic_nips",
        url=root + "traffic_nips.tar.gz",
        # note there are 963 in the original dataset from https://archive.ics.uci.edu/ml/datasets/PEMS-SF
        num_series=963,
        prediction_length=24,
        freq="H",
        rolling_evaluations=7,
        max_target_dim=None,
    ),
    # NOTE(review): unlike the other entries, `name` differs from the key and
    # from the archive name in the URL - confirm this is intended.
    "solar_nips": GPCopulaDataset(
        name="solar-energy",
        url=root + "solar_nips.tar.gz",
        num_series=137,
        prediction_length=24,
        freq="H",
        rolling_evaluations=7,
        max_target_dim=None,
    ),
    "wiki-rolling_nips": GPCopulaDataset(
        name="wiki-rolling_nips",
        # That file lives on GitHub Large file storage (lfs). We need to use
        # the exact link, otherwise it will only open the lfs pointer file.
        url="https://github.com/mbohlkeschneider/gluon-ts/raw/650ad5ffe92d20e89d491966b6d8b4459e219be8/datasets/wiki-rolling_nips.tar.gz",
        num_series=9535,
        prediction_length=30,
        freq="D",
        rolling_evaluations=5,
        max_target_dim=2000,
    ),
    "taxi_30min": GPCopulaDataset(
        name="taxi_30min",
        url=root + "taxi_30min.tar.gz",
        num_series=1214,
        prediction_length=24,
        freq="30min",
        rolling_evaluations=56,
        max_target_dim=None,
    ),
}
def generate_gp_copula_dataset(dataset_path: Path, dataset_name: str):
    """Download the dataset ``dataset_name`` and convert it below ``dataset_path``.

    The archive is downloaded and unpacked into the parent directory of
    ``dataset_path``; afterwards the metadata and both splits are rewritten
    and the temporary files are removed.
    """
    ds_info = datasets_info[dataset_name]
    dataset_path.mkdir(parents=True, exist_ok=True)

    download_dataset(dataset_path.parent, ds_info)
    save_metadata(dataset_path, ds_info)
    for split in ("train", "test"):
        save_dataset(dataset_path / split, ds_info)
    clean_up_dataset(dataset_path, ds_info)
def download_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    """Download the archive of ``ds_info`` into ``dataset_path`` and unpack it there."""
    archive = dataset_path / f"{ds_info.name}.tar.gz"
    request.urlretrieve(ds_info.url, archive)
    # NOTE(review): extractall trusts the member paths stored in the
    # downloaded archive.
    with tarfile.open(archive) as tar:
        tar.extractall(path=dataset_path)
def save_metadata(dataset_path: Path, ds_info: GPCopulaDataset):
    """Write ``metadata.json`` for ``ds_info`` into ``dataset_path``."""
    with open(dataset_path / "metadata.json", "w") as f:
        description = metadata(
            cardinality=ds_info.num_series,
            freq=ds_info.freq,
            prediction_length=ds_info.prediction_length,
        )
        json.dump(description, f)
def save_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    """Rewrite the JSON files in ``dataset_path`` as a single ``data.json``.

    All entries are read first, the directory is removed, and the entries
    are written back with a static category and an item id.
    """
    entries = list(FileDataset(dataset_path / "*.json", freq=ds_info.freq))
    shutil.rmtree(dataset_path)

    records = []
    for index, entry in enumerate(entries):
        records.append(
            to_dict(
                target_values=entry[FieldName.TARGET],
                start=entry[FieldName.START],
                # Handles adding categorical features of rolling
                # evaluation dates: the category wraps around after
                # num_series entries.
                cat=[index % ds_info.num_series],
                item_id=index,
            )
        )
    save_to_file(dataset_path / "data.json", records)
def clean_up_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    """Delete the downloaded archive and the unpacked ``metadata`` directory."""
    archive = dataset_path.parent / f"{ds_info.name}.tar.gz"
    archive.unlink()
    shutil.rmtree(dataset_path / "metadata")
-197
View File
@@ -1,197 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
Here we reuse the datasets used by LSTNet as the processed url of the datasets
are available on GitHub.
"""
import json
import os
from pathlib import Path
from typing import List, NamedTuple, Optional
import pandas as pd
from pts.dataset import frequency_add
from ._util import save_to_file, to_dict, metadata
def load_from_pandas(
    df: pd.DataFrame, time_index: pd.DatetimeIndex, agg_freq: Optional[str] = None,
) -> List[pd.Series]:
    """Split a data frame into one time series per column.

    Parameters
    ----------
    df
        Data frame with one row per time step and one column per series.
    time_index
        Index assigned to the rows of ``df``.
    agg_freq
        If given, every series is resampled to this frequency by summation.

    Returns
    -------
    List[pd.Series]
        One series per column, trimmed to the span between its first and
        last non-null value.

    Raises
    ------
    IndexError
        If a series contains no non-null value.
    """
    df = df.set_index(time_index)
    pivot_df = df.transpose()

    timeseries = []
    for _, column in pivot_df.iterrows():
        ts = pd.Series(column.values, index=time_index)
        if agg_freq is not None:
            ts = ts.resample(agg_freq).sum()
        # Timestamps of the non-null values, computed once.
        valid_index = ts[ts.notnull()].index
        first_valid = valid_index[0]
        last_valid = valid_index[-1]
        ts = ts[first_valid:last_valid]
        timeseries.append(ts)

    return timeseries
class LstnetDataset(NamedTuple):
    """Description of one downloadable dataset file."""

    name: str
    # Location the data file is read from.
    url: str
    # Expected number of columns of the raw file; also written as the
    # cardinality in the metadata.
    num_series: int
    # Expected number of rows of the raw file.
    num_time_steps: int
    # Forecast horizon; also the distance between rolling evaluation dates.
    prediction_length: int
    # Number of rolling evaluation windows in the test split.
    rolling_evaluations: int
    # Frequency of the raw data.
    freq: str
    # Timestamp of the first row of the raw data.
    start_date: str
    # Optional frequency the series are aggregated to (by summation).
    agg_freq: Optional[str] = None
# Base URL of the raw data files.
root = (
    "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/"
)
# Registry of the available datasets, keyed by the dataset name that is passed
# to `generate_lstnet_dataset`.
datasets_info = {
    "exchange_rate": LstnetDataset(
        name="exchange_rate",
        url=root + "exchange_rate/exchange_rate.txt.gz",
        num_series=8,
        num_time_steps=7588,
        prediction_length=30,
        rolling_evaluations=5,
        start_date="1990-01-01",
        freq="1B",
        agg_freq=None,
    ),
    "electricity": LstnetDataset(
        name="electricity",
        url=root + "electricity/electricity.txt.gz",
        # original dataset can be found at https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
        # the aggregated ones that is used from LSTNet filters out from the initial 370 series the one with no data
        # in 2011
        num_series=321,
        num_time_steps=26304,
        prediction_length=24,
        rolling_evaluations=7,
        start_date="2012-01-01",
        freq="1H",
        agg_freq=None,
    ),
    "traffic": LstnetDataset(
        name="traffic",
        url=root + "traffic/traffic.txt.gz",
        # note there are 963 in the original dataset from https://archive.ics.uci.edu/ml/datasets/PEMS-SF
        # but only 862 in LSTNet
        num_series=862,
        num_time_steps=17544,
        prediction_length=24,
        rolling_evaluations=7,
        start_date="2015-01-01",
        freq="H",
        agg_freq=None,
    ),
    "solar-energy": LstnetDataset(
        name="solar-energy",
        url=root + "solar-energy/solar_AL.txt.gz",
        num_series=137,
        num_time_steps=52560,
        prediction_length=24,
        rolling_evaluations=7,
        start_date="2006-01-01",
        freq="10min",
        agg_freq="1H",
    ),
}
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    """Download the dataset ``dataset_name`` and write it below ``dataset_path``.

    Writes ``metadata.json``, ``train/data.json`` and ``test/data.json``.
    The training split ends at the time step located 80% into the first
    series; the test split contains, for every rolling evaluation date, each
    series cut ``prediction_length`` steps after that date.
    """
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)
    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(
            metadata(
                cardinality=ds_info.num_series,
                freq=ds_info.freq,
                prediction_length=ds_info.prediction_length,
            ),
            f,
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date, freq=ds_info.freq, periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)
    expected_shape = (ds_info.num_time_steps, ds_info.num_series)
    assert (
        df.shape == expected_shape
    ), f"expected num_time_steps/num_series {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_slices = (
        (cat, ts[:training_end]) for cat, ts in enumerate(timeseries)
    )
    train_ts = [
        to_dict(
            target_values=window.values,
            start=window.index[0],
            cat=[cat],
            item_id=cat,
        )
        for cat, window in train_slices
        if len(window) > 0
    ]
    assert len(train_ts) == ds_info.num_series
    save_to_file(train_file, train_ts)

    # time of the first prediction
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    def test_entry(cat, ts, prediction_start_date):
        # Keep everything up to the end of the forecast window.
        prediction_end_date = frequency_add(
            prediction_start_date, ds_info.prediction_length
        )
        window = ts[:prediction_end_date]
        return to_dict(
            target_values=window.values, start=window.index[0], cat=[cat],
        )

    test_ts = [
        test_entry(cat, ts, prediction_start_date)
        for prediction_start_date in prediction_dates
        for cat, ts in enumerate(timeseries)
    ]
    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations
    save_to_file(test_file, test_ts)
-85
View File
@@ -1,85 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import json
import os
from pathlib import Path
import numpy as np
import pandas as pd
from ._util import save_to_file, to_dict, metadata
def generate_m4_dataset(
    dataset_path: Path, m4_freq: str, pandas_freq: str, prediction_length: int
):
    """Download one M4 frequency group and write it below ``dataset_path``.

    Writes ``metadata.json``, ``train/data.json`` and ``test/data.json``.
    Each test series is the training series followed by its test values.
    """
    m4_dataset_url = "https://github.com/M4Competition/M4-methods/raw/master/Dataset"
    train_df = pd.read_csv(f"{m4_dataset_url}/Train/{m4_freq}-train.csv", index_col=0)
    test_df = pd.read_csv(f"{m4_dataset_url}/Test/{m4_freq}-test.csv", index_col=0)

    os.makedirs(dataset_path, exist_ok=True)
    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(
            metadata(
                cardinality=len(train_df),
                freq=pandas_freq,
                prediction_length=prediction_length,
            ),
            f,
        )

    # Rows are padded with NaN up to the longest series; drop the padding.
    train_target_values = [row[~np.isnan(row)] for row in train_df.values]
    test_target_values = [
        np.hstack([train_part, test_part])
        for train_part, test_part in zip(train_target_values, test_df.values)
    ]

    if m4_freq == "Yearly":
        # some time series have more than 300 years which can not be represented in pandas,
        # this is probably due to a misclassification of those time series as Yearly
        # we simply use only the last 300 years for training
        # note this does not affect test time as prediction length is less than 300 years
        train_target_values = [series[-300:] for series in train_target_values]
        test_target_values = [series[-300:] for series in test_target_values]

    # the original dataset did not include time stamps, so we use a mock start date for each time series
    # we use the earliest point available in pandas
    mock_start_dataset = "1750-01-01 00:00:00"

    splits = (
        (dataset_path / "train" / "data.json", train_target_values),
        (dataset_path / "test" / "data.json", test_target_values),
    )
    for split_file, targets in splits:
        save_to_file(
            split_file,
            [
                to_dict(
                    target_values=target,
                    start=mock_start_dataset,
                    cat=[cat],
                    item_id=cat,
                )
                for cat, target in enumerate(targets)
            ],
        )
+38 -25
View File
@@ -6,12 +6,13 @@ from functools import lru_cache
import numpy as np
import pandas as pd
from pts.dataset import FieldName
from pts.feature import CustomDateFeatureSet, squared_exponential_kernel
from ._util import metadata, save_to_file
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.repository._util import metadata, save_to_file
from gluonts.time_feature.holiday import squared_exponential_kernel
from pts.feature import CustomDateFeatureSet
def generate_m5_dataset(
def generate_pts_m5_dataset(
dataset_path: Path,
pandas_freq: str,
prediction_length: int = 28,
@@ -46,7 +47,7 @@ def generate_m5_dataset(
)
sales_train_evaluation.sort_index(inplace=True)
sell_prices = pd.read_csv(sell_prices_path, index_col=['item_id', 'store_id'])
sell_prices = pd.read_csv(sell_prices_path, index_col=["item_id", "store_id"])
sell_prices.sort_index(inplace=True)
@lru_cache(maxsize=None)
@@ -161,16 +162,22 @@ def generate_m5_dataset(
"WI": snap_WI_feature,
}[state_id]
time_series["target"] = item.iloc[start_index:1913].values.astype(np.float32).tolist()
time_series["feat_dynamic_real"] = np.concatenate(
(
np.expand_dims(sell_price.iloc[start_index:1913].values, 0),
event_1_feature[:, start_index:1913],
event_2_feature[:, start_index:1913],
snap_feature[:, start_index:1913],
),
0,
).astype(np.float32).tolist()
time_series["target"] = (
item.iloc[start_index:1913].values.astype(np.float32).tolist()
)
time_series["feat_dynamic_real"] = (
np.concatenate(
(
np.expand_dims(sell_price.iloc[start_index:1913].values, 0),
event_1_feature[:, start_index:1913],
event_2_feature[:, start_index:1913],
snap_feature[:, start_index:1913],
),
0,
)
.astype(np.float32)
.tolist()
)
train_ds.append(time_series.copy())
@@ -222,16 +229,22 @@ def generate_m5_dataset(
"WI": snap_WI_feature,
}[state_id]
time_series["target"] = item.iloc[start_index:1941].values.astype(np.float32).tolist()
time_series["feat_dynamic_real"] = np.concatenate(
(
np.expand_dims(sell_price.iloc[start_index:1941].values, 0),
event_1_feature[:, start_index:1941],
event_2_feature[:, start_index:1941],
snap_feature[:, start_index:1941],
),
0,
).astype(np.float32).tolist()
time_series["target"] = (
item.iloc[start_index:1941].values.astype(np.float32).tolist()
)
time_series["feat_dynamic_real"] = (
np.concatenate(
(
np.expand_dims(sell_price.iloc[start_index:1941].values, 0),
event_1_feature[:, start_index:1941],
event_2_feature[:, start_index:1941],
snap_feature[:, start_index:1941],
),
0,
)
.astype(np.float32)
.tolist()
)
test_ds.append(time_series.copy())
-78
View File
@@ -1,78 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Any
import numpy as np
def to_dict(
    target_values: np.ndarray,
    start: str,
    cat: Optional[List[int]] = None,
    item_id: Optional[Any] = None,
):
    """
    Build a JSON-serializable dictionary describing one time series.

    Parameters
    ----------
    target_values
        Values of the series. NaN entries are emitted as the string
        ``"NaN"``; every other entry is rounded to 6 decimal places.
    start
        Start of the series, stored as ``str(start)``.
    cat
        Static categorical features; stored under ``"feat_static_cat"``
        only when given.
    item_id
        Identifier of the series; stored under ``"item_id"`` only when
        given.

    Returns
    -------
    dict
        Dictionary with the keys ``"start"`` and ``"target"`` plus the
        optional keys described above.
    """
    encoded_target = [
        "NaN" if np.isnan(value) else float(f"{float(value):.6f}")
        for value in target_values
    ]
    entry = {"start": str(start), "target": encoded_target}

    # Optional fields are only written when the caller supplied them.
    for key, value in (("feat_static_cat", cat), ("item_id", item_id)):
        if value is not None:
            entry[key] = value
    return entry
def save_to_file(path: Path, data: List[Dict]):
    """
    Write ``data`` to ``path`` in JSON Lines format (one object per line).

    Parameters
    ----------
    path
        Destination file. Missing parent directories are created.
    data
        JSON-serializable dictionaries, written in order as UTF-8.
    """
    print(f"saving time-series into {path}")
    # ``Path.parent`` is "." for a bare file name, so this also works when the
    # file goes into the current directory (``os.makedirs("")`` would raise).
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as fp:
        for d in data:
            fp.write((json.dumps(d) + "\n").encode("utf-8"))
def get_download_path() -> Path:
    """
    Return the default directory used for downloaded datasets.

    Returns
    -------
    Path
        ``<home>/.pytorch/pytorch-ts``, where ``<home>`` is ``Path.home()``
        (e.g. ``/home/username/.pytorch/pytorch-ts``).
    """
    home_dir = Path.home()
    return home_dir.joinpath(".pytorch", "pytorch-ts")
def metadata(cardinality: int, freq: str, prediction_length: int):
    """
    Assemble the metadata dictionary of a dataset.

    Parameters
    ----------
    cardinality
        Cardinality of the single static categorical feature; it is stored
        as a string.
    freq
        Frequency string of the time series.
    prediction_length
        Number of time steps to forecast.

    Returns
    -------
    dict
        Dictionary with the keys ``"freq"``, ``"prediction_length"`` and
        ``"feat_static_cat"``.
    """
    static_cat_info = {
        "name": "feat_static_cat",
        "cardinality": str(cardinality),
    }
    return dict(
        freq=freq,
        prediction_length=prediction_length,
        feat_static_cat=[static_cat_info],
    )
+4 -178
View File
@@ -1,183 +1,9 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import logging
from collections import OrderedDict
from functools import partial
from pathlib import Path
from pts.dataset import ConstantDataset, TrainDatasets, load_datasets
from ._artificial import generate_artificial_dataset
from ._gp_copula_2019 import generate_gp_copula_dataset
from ._lstnet import generate_lstnet_dataset
from ._m4 import generate_m4_dataset
from ._m5 import generate_m5_dataset
from ._util import get_download_path
from gluonts.dataset.repository.datasets import dataset_recipes
m4_freq = "Hourly"
pandas_freq = "H"
dataset_path = Path(f"m4-{m4_freq}")
prediction_length = 48
from ._m5 import generate_pts_m5_dataset
dataset_recipes = OrderedDict(
{
# each recipe generates a dataset given a path
"constant": partial(generate_artificial_dataset, dataset=ConstantDataset()),
"exchange_rate": partial(generate_lstnet_dataset, dataset_name="exchange_rate"),
"solar-energy": partial(generate_lstnet_dataset, dataset_name="solar-energy"),
"electricity": partial(generate_lstnet_dataset, dataset_name="electricity"),
"traffic": partial(generate_lstnet_dataset, dataset_name="traffic"),
"exchange_rate_nips": partial(
generate_gp_copula_dataset, dataset_name="exchange_rate_nips"
),
"electricity_nips": partial(
generate_gp_copula_dataset, dataset_name="electricity_nips"
),
"traffic_nips": partial(
generate_gp_copula_dataset, dataset_name="traffic_nips"
),
"solar_nips": partial(generate_gp_copula_dataset, dataset_name="solar_nips"),
"wiki-rolling_nips": partial(
generate_gp_copula_dataset, dataset_name="wiki-rolling_nips"
),
"taxi_30min": partial(generate_gp_copula_dataset, dataset_name="taxi_30min"),
"m4_hourly": partial(
generate_m4_dataset,
m4_freq="Hourly",
pandas_freq="H",
prediction_length=48,
),
"m4_daily": partial(
generate_m4_dataset, m4_freq="Daily", pandas_freq="D", prediction_length=14,
),
"m4_weekly": partial(
generate_m4_dataset,
m4_freq="Weekly",
pandas_freq="W",
prediction_length=13,
),
"m4_monthly": partial(
generate_m4_dataset,
m4_freq="Monthly",
pandas_freq="M",
prediction_length=18,
),
"m4_quarterly": partial(
generate_m4_dataset,
m4_freq="Quarterly",
pandas_freq="3M",
prediction_length=8,
),
"m4_yearly": partial(
generate_m4_dataset,
m4_freq="Yearly",
pandas_freq="12M",
prediction_length=6,
),
"m5": partial(
generate_m5_dataset, pandas_freq="D", prediction_length=28, alpha=0.5
),
}
dataset_recipes["pts_m5"] = partial(
generate_pts_m5_dataset, pandas_freq="D", prediction_length=28
)
dataset_names = list(dataset_recipes.keys())
default_dataset_path = get_download_path() / "datasets"
def materialize_dataset(
    dataset_name: str, path: Path = default_dataset_path, regenerate: bool = False,
) -> Path:
    """
    Make sure the dataset exists on disk under ``path / dataset_name``.

    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"; it has to be a key of
        ``dataset_recipes``
    path
        directory under which the dataset folder is created
    regenerate
        when True the recipe is run even if the dataset folder already
        exists; when False an existing folder is reused as is

    Returns
    -------
    the path where the dataset is materialized
    """
    known_names = dataset_recipes.keys()
    assert dataset_name in known_names, (
        f"{dataset_name} is not present, please choose one from "
        f"{known_names}."
    )

    path.mkdir(parents=True, exist_ok=True)
    target_dir = path / dataset_name
    recipe = dataset_recipes[dataset_name]

    already_present = target_dir.exists()
    if already_present and not regenerate:
        logging.info(f"using dataset already processed in path {target_dir}.")
        return target_dir

    logging.info(f"downloading and processing {dataset_name}")
    recipe(dataset_path=target_dir)
    return target_dir
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
    shuffle: bool = True,
) -> TrainDatasets:
    """
    Get a repository dataset, generating it first when necessary.

    The datasets that can be obtained through this function have been used
    with different processing over time by several papers (e.g., [SFG17]_,
    [LCY+18]_, and [YRD15]_).

    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    path
        where the dataset should be saved
    regenerate
        whether to regenerate the dataset even if a local copy is present;
        when False an existing local copy is reused
    shuffle
        whether to shuffle the training time series

    Returns
    -------
    dataset loaded from ``metadata.json`` and the ``train`` / ``test``
    JSON files inside the materialized dataset folder.
    """
    root = materialize_dataset(dataset_name, path, regenerate)
    locations = {
        "metadata": root / "metadata.json",
        "train": root / "train" / "*.json",
        "test": root / "test" / "*.json",
    }
    return load_datasets(shuffle=shuffle, **locations)
if __name__ == "__main__":
    # Regenerate every registered dataset and print its metadata together
    # with the number of training series.
    for dataset in dataset_names:
        print(f"generate {dataset}")
        ds = get_dataset(dataset, regenerate=True)
        print(ds.metadata)
        # Count while iterating; copying the whole training set into a list
        # first only costs memory.
        print(sum(1 for _ in ds.train))
-357
View File
@@ -1,357 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import math
from collections import defaultdict
from typing import Any, List, NamedTuple, Optional, Set
import numpy as np
from tqdm import tqdm
from pts.exception import assert_pts
from .common import FieldName
class ScaleHistogram:
    """
    Scale histogram of a timeseries dataset

    This counts the number of timeseries whose mean of absolute values is in
    the `[base ** i - 1, base ** (i+1) - 1)` range for all possible `i`
    (the bucket index is `int(log(scale + 1, base))`).
    The number of entries with empty target is counted separately.

    Parameters
    ----------
    base
        Log-width of the histogram's buckets.
    bin_counts
        Optional initial mapping from bucket index to number of series.
    empty_target_count
        Initial number of series with an empty target.
    """

    def __init__(
        self,
        base: float = 2.0,
        bin_counts: Optional[dict] = None,
        empty_target_count: int = 0,
    ) -> None:
        self._base = base
        self.bin_counts = defaultdict(int, {} if bin_counts is None else bin_counts)
        self.empty_target_count = empty_target_count
        self.__init_args__ = dict(
            base=self._base,
            bin_counts=self.bin_counts,
            empty_target_count=empty_target_count,
        )

    def bucket_index(self, target_values):
        """Return the bucket index of a non-empty series."""
        assert len(target_values) > 0
        scale = np.mean(np.abs(target_values))
        # The +1 shift keeps the logarithm defined for an all-zero series.
        return int(math.log(scale + 1.0, self._base))

    def add(self, target_values):
        """Register one series in the histogram."""
        if len(target_values) > 0:
            self.bin_counts[self.bucket_index(target_values)] += 1
        else:
            self.empty_target_count += 1

    def count(self, target):
        """
        Return how many registered series share the bucket of ``target``
        (or how many empty series were registered when ``target`` is empty).
        """
        if len(target) > 0:
            # ``get`` instead of indexing: indexing the defaultdict would
            # insert a zero-count bucket and thereby change ``__eq__`` and
            # ``__str__`` as a side effect of a read-only query.
            return self.bin_counts.get(self.bucket_index(target), 0)
        return self.empty_target_count

    def __len__(self):
        return self.empty_target_count + sum(self.bin_counts.values())

    def __eq__(self, other):
        return (
            isinstance(other, ScaleHistogram)
            and self.bin_counts == other.bin_counts
            and self.empty_target_count == other.empty_target_count
            and self._base == other._base
        )

    def __str__(self):
        string_repr = [
            "count of scales in {min}-{max}:{count}".format(
                min=self._base ** base_index - 1,
                max=self._base ** (base_index + 1) - 1,
                count=count,
            )
            for base_index, count in sorted(self.bin_counts.items(), key=lambda x: x[0])
        ]
        return "\n".join(string_repr)
class DatasetStatistics(NamedTuple):
    """
    A NamedTuple to store the statistics of a Dataset.

    Equality is tolerant for float fields: two float values are considered
    equal when they differ by at most 0.01% of the left-hand value. All
    other fields are compared with ``!=``.
    """

    integer_dataset: bool
    max_target: float
    mean_abs_target: float
    mean_target: float
    mean_target_length: float
    min_target: float
    feat_static_real: List[Set[float]]
    feat_static_cat: List[Set[int]]
    num_feat_dynamic_real: Optional[int]
    num_feat_dynamic_cat: Optional[int]
    num_missing_values: int
    num_time_observations: int
    num_time_series: int
    scale_histogram: ScaleHistogram

    # DO NOT override the __str__ method, since we rely that we can load
    # DatasetStatistics again; i.e. stats == eval(str(stats))

    def __eq__(self, other):
        # Comparing against an unrelated object (e.g. None) used to raise
        # AttributeError on ``other._asdict()``; defer to Python's default
        # handling instead.
        if not isinstance(other, DatasetStatistics):
            return NotImplemented
        # NOTE(review): ``__ne__`` is not overridden here, so ``!=``
        # presumably still uses the exact tuple comparison - confirm.
        for x, y in zip(self._asdict().values(), other._asdict().values()):
            if isinstance(x, float):
                # relative tolerance, measured against the left operand
                if abs(x - y) > abs(0.0001 * x):
                    return False
            elif x != y:
                return False
        return True
# TODO: reorganize modules to avoid circular dependency
# TODO: and substitute Any with Dataset
def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
    """
    Computes the statistics of a given Dataset.

    The dataset is traversed once. For every entry the target and the static
    and dynamic features are validated with ``assert_pts`` and folded into
    running aggregates.

    Parameters
    ----------
    ts_dataset
        Dataset of which to compute the statistics. It has to support
        ``len()`` (used for the progress bar) and iteration over entries
        that can be indexed by the ``FieldName`` keys.

    Returns
    -------
    DatasetStatistics
        NamedTuple containing the statistics.
    """
    num_time_observations = 0
    num_time_series = 0
    # sentinels, replaced by the first observed value
    min_target = 1e20
    max_target = -1e20
    sum_target = 0.0
    sum_abs_target = 0.0
    integer_dataset = True
    # the ``None`` values below mean "no entry inspected yet"
    observed_feat_static_cat: Optional[List[Set[int]]] = None
    observed_feat_static_real: Optional[List[Set[float]]] = None
    num_feat_static_real: Optional[int] = None
    num_feat_static_cat: Optional[int] = None
    num_feat_dynamic_real: Optional[int] = None
    num_feat_dynamic_cat: Optional[int] = None
    num_missing_values = 0

    scale_histogram = ScaleHistogram()

    with tqdm(enumerate(ts_dataset, start=1), total=len(ts_dataset)) as it:
        # enumerate starts at 1, so after the loop num_time_series holds the
        # number of entries seen (it stays 0 for an empty dataset)
        for num_time_series, ts in it:

            # TARGET
            target = ts[FieldName.TARGET]
            # drop missing values (NaN) before computing any statistic
            observed_target = target[~np.isnan(target)]
            num_observations = len(observed_target)

            if num_observations > 0:
                # 'nan' is handled in observed_target definition
                assert_pts(
                    np.all(np.isfinite(observed_target)),
                    "Target values have to be finite (e.g., not inf, -inf, "
                    "or None) and cannot exceed single precision floating "
                    "point range.",
                )

                num_time_observations += num_observations
                min_target = float(min(min_target, observed_target.min()))
                max_target = float(max(max_target, observed_target.max()))
                # NOTE(review): missing values appear to be counted only for
                # series with at least one observed value - confirm that
                # all-NaN series are meant to be excluded.
                num_missing_values += int(np.isnan(target).sum())
                sum_target += float(observed_target.sum())
                sum_abs_target += float(np.abs(observed_target).sum())
                integer_dataset = integer_dataset and bool(
                    np.all(np.mod(observed_target, 1) == 0)
                )

            # added for every entry (empty targets included) so that the
            # histogram length matches num_time_series, see the assert below
            scale_histogram.add(observed_target)  # after checks for inf and None

            # FEAT_STATIC_CAT
            feat_static_cat = (
                ts[FieldName.FEAT_STATIC_CAT] if FieldName.FEAT_STATIC_CAT in ts else []
            )

            if num_feat_static_cat is None:
                # the first entry fixes the expected number of features
                num_feat_static_cat = len(feat_static_cat)
                observed_feat_static_cat = [set() for _ in range(num_feat_static_cat)]

            # needed to type check
            assert num_feat_static_cat is not None
            assert observed_feat_static_cat is not None

            assert_pts(
                num_feat_static_cat == len(feat_static_cat),
                "Not all feat_static_cat vectors have the same length {} != {}.",
                num_feat_static_cat,
                len(feat_static_cat),
            )
            # collect the distinct values seen for each feature position
            for i, c in enumerate(feat_static_cat):
                observed_feat_static_cat[i].add(c)

            # FEAT_STATIC_REAL
            feat_static_real = (
                ts[FieldName.FEAT_STATIC_REAL]
                if FieldName.FEAT_STATIC_REAL in ts
                else []
            )

            if num_feat_static_real is None:
                # the first entry fixes the expected number of features
                num_feat_static_real = len(feat_static_real)
                observed_feat_static_real = [set() for _ in range(num_feat_static_real)]

            # needed to type check
            assert num_feat_static_real is not None
            assert observed_feat_static_real is not None

            assert_pts(
                num_feat_static_real == len(feat_static_real),
                "Not all feat_static_real vectors have the same length {} != {}.",
                num_feat_static_real,
                len(feat_static_real),
            )
            # collect the distinct values seen for each feature position
            for i, c in enumerate(feat_static_real):
                observed_feat_static_real[i].add(c)

            # FEAT_DYNAMIC_CAT
            feat_dynamic_cat = (
                ts[FieldName.FEAT_DYNAMIC_CAT]
                if FieldName.FEAT_DYNAMIC_CAT in ts
                else None
            )

            if feat_dynamic_cat is None:
                # feat_dynamic_cat not found: this is only valid if this is
                # the first entry or if no earlier entry carried
                # feat_dynamic_cat either
                assert_pts(
                    num_feat_dynamic_cat is None or num_feat_dynamic_cat == 0,
                    "feat_dynamic_cat was found for some instances but not others.",
                )
                num_feat_dynamic_cat = 0
            else:
                if num_feat_dynamic_cat is None:
                    # first num_feat_dynamic_cat found
                    num_feat_dynamic_cat = feat_dynamic_cat.shape[0]
                else:
                    assert_pts(
                        num_feat_dynamic_cat == feat_dynamic_cat.shape[0],
                        "Found instances with different number of features in "
                        "feat_dynamic_cat, found one with {} and another with {}.",
                        num_feat_dynamic_cat,
                        feat_dynamic_cat.shape[0],
                    )

                assert_pts(
                    np.all(np.isfinite(feat_dynamic_cat)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )
                # second axis is compared against the target length
                num_feat_dynamic_cat_time_steps = feat_dynamic_cat.shape[1]
                assert_pts(
                    num_feat_dynamic_cat_time_steps == len(target),
                    "Each feature in feat_dynamic_cat has to have the same length as "
                    "the target. Found an instance with feat_dynamic_cat of length {} "
                    "and a target of length {}.",
                    num_feat_dynamic_cat_time_steps,
                    len(target),
                )

            # FEAT_DYNAMIC_REAL
            feat_dynamic_real = (
                ts[FieldName.FEAT_DYNAMIC_REAL]
                if FieldName.FEAT_DYNAMIC_REAL in ts
                else None
            )

            if feat_dynamic_real is None:
                # feat_dynamic_real not found: this is only valid if this is
                # the first entry or if no earlier entry carried
                # feat_dynamic_real either
                assert_pts(
                    num_feat_dynamic_real is None or num_feat_dynamic_real == 0,
                    "feat_dynamic_real was found for some instances but not others.",
                )
                num_feat_dynamic_real = 0
            else:
                if num_feat_dynamic_real is None:
                    # first num_feat_dynamic_real found
                    num_feat_dynamic_real = feat_dynamic_real.shape[0]
                else:
                    assert_pts(
                        num_feat_dynamic_real == feat_dynamic_real.shape[0],
                        "Found instances with different number of features in "
                        "feat_dynamic_real, found one with {} and another with {}.",
                        num_feat_dynamic_real,
                        feat_dynamic_real.shape[0],
                    )

                assert_pts(
                    np.all(np.isfinite(feat_dynamic_real)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )
                # second axis is compared against the target length
                num_feat_dynamic_real_time_steps = feat_dynamic_real.shape[1]
                assert_pts(
                    num_feat_dynamic_real_time_steps == len(target),
                    "Each feature in feat_dynamic_real has to have the same length as "
                    "the target. Found an instance with feat_dynamic_real of length {} "
                    "and a target of length {}.",
                    num_feat_dynamic_real_time_steps,
                    len(target),
                )

    assert_pts(num_time_series > 0, "Time series dataset is empty!")
    assert_pts(
        num_time_observations > 0, "Only empty time series found in the dataset!",
    )

    # note this requires the num_time_series check above to avoid a division
    # by zero runtime error
    mean_target_length = num_time_observations / num_time_series

    # note this requires the num_time_observations check above to avoid a
    # division by zero runtime error
    mean_target = sum_target / num_time_observations
    mean_abs_target = sum_abs_target / num_time_observations

    # the dataset is only flagged as integer when every observed value is a
    # whole number and none is negative
    integer_dataset = integer_dataset and min_target >= 0.0

    # every entry was added to the histogram exactly once
    assert len(scale_histogram) == num_time_series

    return DatasetStatistics(
        integer_dataset=integer_dataset,
        max_target=max_target,
        mean_abs_target=mean_abs_target,
        mean_target=mean_target,
        mean_target_length=mean_target_length,
        min_target=min_target,
        num_missing_values=num_missing_values,
        feat_static_real=observed_feat_static_real if observed_feat_static_real else [],
        feat_static_cat=observed_feat_static_cat if observed_feat_static_cat else [],
        num_feat_dynamic_real=num_feat_dynamic_real,
        num_feat_dynamic_cat=num_feat_dynamic_cat,
        num_time_observations=num_time_observations,
        num_time_series=num_time_series,
        scale_histogram=scale_histogram,
    )
@@ -1,47 +0,0 @@
import itertools
from typing import Dict, Iterable, Iterator, Optional
import numpy as np
import torch
from pts.transform.transform import Transformation
from .common import DataEntry, Dataset
class TransformedIterableDataset(torch.utils.data.IterableDataset):
    """
    Iterable dataset that endlessly cycles over ``dataset`` and applies
    ``transform`` to the resulting stream.

    Parameters
    ----------
    dataset
        Collection of data entries; it must not be empty.
    is_train
        Forwarded to ``transform`` as the ``is_train`` keyword argument.
    transform
        Callable applied to the (infinite) iterator of data entries.
    """

    def __init__(
        self, dataset: Dataset, is_train: bool, transform: Transformation
    ) -> None:
        super().__init__()
        self.dataset = dataset
        self.transform = transform
        self.is_train = is_train
        # created lazily on the first ``__iter__`` call and reused afterwards
        self._cur_iter: Optional[Iterator] = None

    def _iterate_forever(self, collection: Iterable[DataEntry]) -> Iterator[DataEntry]:
        # iterate forever over the collection, the collection must be non empty
        while True:
            # A single iterator per pass: taking the first element from one
            # iterator and then re-iterating ``collection`` would yield the
            # first entry twice for re-iterable collections.
            entries = iter(collection)
            try:
                first = next(entries)
            except StopIteration:
                raise Exception("empty dataset")
            yield first
            yield from entries

    def __iter__(self) -> Iterator[Dict[str, np.ndarray]]:
        if self._cur_iter is None:
            self._cur_iter = self.transform(
                self._iterate_forever(self.dataset), is_train=self.is_train
            )
        assert self._cur_iter is not None
        while True:
            data_entry = next(self._cur_iter)
            # keep only array fields; float arrays are cast to float32
            yield {
                k: (v.astype(np.float32) if v.dtype.kind == "f" else v)
                for k, v in data_entry.items()
                if isinstance(v, np.ndarray)
            }

    # def __len__(self) -> int:
    #     return len(self.dataset)
-148
View File
@@ -1,148 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import shutil
from pathlib import Path
import numpy as np
import pandas as pd
import rapidjson as json
from .common import TrainDatasets, MetaData
from .file_dataset import FileDataset
def frequency_add(ts: pd.Timestamp, amount: int) -> pd.Timestamp:
    """
    Shift ``ts`` by ``amount`` steps of its own frequency.

    ``ts`` has to expose a ``freq`` attribute, which is multiplied by
    ``amount`` and added to ``ts``.
    """
    offset = ts.freq * amount
    return ts + offset
def forecast_start(entry):
    """
    Return the first time stamp after the end of ``entry["target"]``, i.e.
    ``entry["start"]`` shifted by ``len(entry["target"])`` frequency steps.
    """
    series_start = entry["start"]
    num_steps = len(entry["target"])
    return frequency_add(series_start, num_steps)
def to_pandas(instance: dict, freq: str = None) -> pd.Series:
    """
    Convert a data entry into a pandas.Series, based on its "start" and
    "target" fields.

    Parameters
    ----------
    instance
        Dictionary containing the time series data.
    freq
        Frequency to use in the pandas.Series index. When empty or None,
        the ``freqstr`` attribute of ``instance["start"]`` is used.

    Returns
    -------
    pandas.Series
        Series holding the target values, indexed by a date range that
        begins at ``instance["start"]``.
    """
    values = instance["target"]
    first_stamp = instance["start"]
    effective_freq = freq if freq else first_stamp.freqstr
    time_index = pd.date_range(
        start=first_stamp, periods=len(values), freq=effective_freq
    )
    return pd.Series(values, index=time_index)
def load_datasets(metadata, train, test, shuffle: bool = False) -> TrainDatasets:
    """
    Load a dataset from its metadata file and its train/test file paths.

    Parameters
    ----------
    metadata
        Path to the metadata file.
    train
        Path to the training dataset files.
    test
        Path to the test dataset files. When falsy, no test dataset is
        loaded and the ``test`` field of the result is None.
    shuffle
        Passed to the training ``FileDataset`` only.

    Returns
    -------
    TrainDatasets
        An object collecting metadata, training data, test data.
    """
    parsed_meta = MetaData.parse_file(metadata)
    series_freq = parsed_meta.freq

    training_data = FileDataset(train, series_freq, shuffle=shuffle)
    if test:
        testing_data = FileDataset(test, series_freq)
    else:
        testing_data = None

    return TrainDatasets(
        metadata=parsed_meta, train=training_data, test=testing_data
    )
def save_datasets(dataset: TrainDatasets, path_str: str, overwrite=True) -> None:
    """
    Save a TrainDatasets object as JSON Lines files.

    The files written are ``metadata/metadata.json``, ``train/data.json``
    and, when a test dataset is present, ``test/data.json``.

    Parameters
    ----------
    dataset
        The training datasets.
    path_str
        Folder in which the files are created.
    overwrite
        Whether to delete a previous version of this folder first.
    """
    root = Path(path_str)

    if overwrite:
        shutil.rmtree(root, ignore_errors=True)

    def write_json_lines(folder, file_name, records):
        # one JSON document per line, UTF-8 encoded
        (root / folder).mkdir(parents=True)
        with open(root / folder / file_name, "wb") as handle:
            for record in records:
                handle.write(json.dumps(record).encode("utf-8"))
                handle.write("\n".encode("utf-8"))

    write_json_lines(
        "metadata",
        "metadata.json",
        (meta.dict() for meta in (dataset.metadata,)),
    )
    write_json_lines(
        "train",
        "data.json",
        (serialize_data_entry(entry) for entry in dataset.train),
    )
    if dataset.test is not None:
        write_json_lines(
            "test",
            "data.json",
            (serialize_data_entry(entry) for entry in dataset.test),
        )
def serialize_data_entry(data):
    """
    Make a DataEntry dictionary JSON serializable.

    Parameters
    ----------
    data
        The dictionary to be transformed.

    Returns
    -------
    Dict
        A new dictionary without the entries whose value is None. Numpy
        arrays are converted to (nested) lists in which NaN is replaced by
        the string "NaN"; every other value is converted with ``str``.
    """
    serialized = {}
    for key, value in data.items():
        if value is None:
            continue
        if isinstance(value, np.ndarray):
            # circumvent https://github.com/micropython/micropython/issues/3511
            missing = np.isnan(value)
            as_objects = value.astype(np.object_)
            as_objects[missing] = "NaN"
            serialized[key] = as_objects.tolist()
        else:
            serialized[key] = str(value)
    return serialized
+10 -7
View File
@@ -2,10 +2,15 @@ import torch
from torch.distributions import Distribution, TransformedDistribution, AffineTransform
class ImplicitQuantile(Distribution):
def __init__(self, implicit_quantile_function, taus, nn_output, predicted_quantiles, validate_args=None):
def __init__(
self,
implicit_quantile_function,
taus,
nn_output,
predicted_quantiles,
validate_args=None,
):
self.predicted_quantiles = predicted_quantiles[0]
self.taus = taus
self.quantile_function = implicit_quantile_function
@@ -46,9 +51,8 @@ class ImplicitQuantile(Distribution):
@staticmethod
def quantile_loss(quantile_forecast, target, tau):
return torch.abs(
(quantile_forecast - target)
* ((target <= quantile_forecast).float() - tau)
)
(quantile_forecast - target) * ((target <= quantile_forecast).float() - tau)
)
class TransformedImplicitQuantile(TransformedDistribution):
@@ -63,4 +67,3 @@ class TransformedImplicitQuantile(TransformedDistribution):
scale *= transform.scale
p = self.base_dist.log_prob(x)
return p * scale
+4 -1
View File
@@ -118,7 +118,10 @@ class ZeroInflatedNegativeBinomial(ZeroInflatedDistribution):
def __init__(self, gate, total_count, probs=None, logits=None, validate_args=None):
base_dist = NegativeBinomial(
total_count=total_count, probs=probs, logits=logits, validate_args=False,
total_count=total_count,
probs=probs,
logits=logits,
validate_args=False,
)
base_dist._validate_args = validate_args
-2
View File
@@ -1,2 +0,0 @@
from .backtest import make_evaluation_predictions, backtest_metrics
from .evaluator import Evaluator, MultivariateEvaluator
-221
View File
@@ -1,221 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
import logging
from typing import Dict, Iterator, NamedTuple, Optional, Tuple, Union
# Third-party imports
import pandas as pd
from pts.dataset import (
DataEntry,
Dataset,
DatasetStatistics,
calculate_dataset_statistics,
)
from pts.model import Estimator, Predictor, Forecast
# First-party imports
from pts.transform import AdhocTransform, TransformedDataset
from .evaluator import Evaluator
def make_evaluation_predictions(
    dataset: Dataset, predictor: Predictor, num_samples: int
) -> Tuple[Iterator[Forecast], Iterator[pd.Series]]:
    """
    Produce forecasts for the last ``predictor.prediction_length`` time
    steps of every target in ``dataset``, together with the untruncated
    series, so that accuracy can be evaluated on that last portion.

    Parameters
    ----------
    dataset
        Dataset where the evaluation will happen. The predictor only sees
        each target with its last ``prediction_length`` steps removed.
    predictor
        Model used to draw predictions.
    num_samples
        Number of samples to draw on the model when evaluating.

    Returns
    -------
    Tuple
        The result of ``predictor.predict`` on the truncated dataset and an
        iterator over one ``pd.DataFrame`` of the full target per entry.
    """
    horizon = predictor.prediction_length
    freq = predictor.freq

    def ground_truth_frames(entries: Dataset) -> Iterator[pd.DataFrame]:
        for raw_entry in entries:
            entry = raw_entry.copy()
            time_index = pd.date_range(
                start=entry["start"], freq=freq, periods=entry["target"].shape[-1],
            )
            yield pd.DataFrame(index=time_index, data=entry["target"].transpose())

    def drop_forecast_window(data):
        truncated = data.copy()
        full_target = truncated["target"]
        # the time axis is the last one, which also covers the multivariate
        # case (target_dim, history_length)
        assert full_target.shape[-1] >= horizon
        truncated["target"] = full_target[..., :-horizon]
        return truncated

    # TODO filter out time series with target shorter than prediction length
    # TODO or fix the evaluator so it supports missing values instead (all
    # TODO the test set may be gone otherwise with such a filtering)

    truncated_dataset = TransformedDataset(
        dataset, transformations=[AdhocTransform(drop_forecast_window)]
    )
    forecasts = predictor.predict(truncated_dataset, num_samples=num_samples)
    return forecasts, ground_truth_frames(dataset)
train_dataset_stats_key = "train_dataset_stats"
test_dataset_stats_key = "test_dataset_stats"
estimator_key = "estimator"
agg_metrics_key = "agg_metrics"
def serialize_message(logger, message: str, variable):
    """Log ``variable`` at INFO level, tagged as ``pts[<message>]``."""
    line = "pts[{}]: {}".format(message, variable)
    logger.info(line)
def backtest_metrics(
    train_dataset: Optional[Dataset],
    test_dataset: Dataset,
    forecaster: Union[Estimator, Predictor],
    evaluator=Evaluator(quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)),
    num_samples: int = 100,
    logging_file: Optional[str] = None,
):
    """
    Train (if needed) and evaluate a forecaster, logging dataset statistics
    and aggregate metrics along the way.

    Parameters
    ----------
    train_dataset
        Dataset to use for training. Required when `forecaster` is an
        Estimator.
    test_dataset
        Dataset to use for testing.
    forecaster
        An estimator or a predictor to use for generating predictions.
    evaluator
        Evaluator to use.
    num_samples
        Number of samples to use when generating sample-based forecasts.
    logging_file
        If specified, information of the backtest is additionally written
        to this file.

    Returns
    -------
    tuple
        A tuple of aggregate metrics and per-time-series metrics obtained by
        training `forecaster` on `train_dataset` (when it is an Estimator)
        and applying `evaluator` to its forecasts on `test_dataset`.
    """
    logger = logging.getLogger(__name__)
    handler: Optional[logging.FileHandler] = None
    if logging_file is not None:
        log_formatter = logging.Formatter(
            "[%(asctime)s %(levelname)s %(thread)d] %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
        )
        handler = logging.FileHandler(logging_file)
        handler.setFormatter(log_formatter)
        logger.addHandler(handler)

    try:
        if train_dataset is not None:
            train_statistics = calculate_dataset_statistics(train_dataset)
            serialize_message(logger, train_dataset_stats_key, train_statistics)

        test_statistics = calculate_dataset_statistics(test_dataset)
        serialize_message(logger, test_dataset_stats_key, test_statistics)

        if isinstance(forecaster, Estimator):
            serialize_message(logger, estimator_key, forecaster)
            assert train_dataset is not None
            predictor = forecaster.train(train_dataset)
        else:
            predictor = forecaster

        forecast_it, ts_it = make_evaluation_predictions(
            test_dataset, predictor=predictor, num_samples=num_samples
        )

        agg_metrics, item_metrics = evaluator(
            ts_it, forecast_it, num_series=len(test_dataset)
        )

        # we only log aggregate metrics for now as item metrics may be very large
        for name, value in agg_metrics.items():
            serialize_message(logger, f"metric-{name}", value)

        return agg_metrics, item_metrics
    finally:
        if handler is not None:
            # Detach AND close the file handler, also when training or
            # evaluation failed; removing it from the logger alone leaves
            # the log file open.
            # https://stackoverflow.com/questions/24816456/python-logging-wont-shutdown
            logger.removeHandler(handler)
            handler.close()
class BacktestInformation(NamedTuple):
    """
    Record of a backtest run: statistics of the train and test datasets, the
    estimator used, and the aggregate metrics by name.
    """

    train_dataset_stats: DatasetStatistics
    test_dataset_stats: DatasetStatistics
    estimator: Estimator
    agg_metrics: Dict[str, float]

    # NOTE(review): the parser below is disabled. It rebuilds the record from
    # "pts[<key>]: <value>" log lines and calls eval() on log contents, so it
    # should not be re-enabled for log files that are not fully trusted.

    # @staticmethod
    # def make_from_log(log_file):
    #     with open(log_file, "r") as f:
    #         return BacktestInformation.make_from_log_contents(
    #             "\n".join(f.readlines())
    #         )

    # @staticmethod
    # def make_from_log_contents(log_contents):
    #     messages = dict(re.findall(r"pts\[(.*)\]: (.*)", log_contents))

    #     # avoid to fail if a key is missing for instance in the case a run did
    #     # not finish so that we can still get partial information
    #     try:
    #         return BacktestInformation(
    #             train_dataset_stats=eval(
    #                 messages[train_dataset_stats_key]
    #             ),  # TODO: use load
    #             test_dataset_stats=eval(
    #                 messages[test_dataset_stats_key]
    #             ),  # TODO: use load
    #             estimator=load_code(messages[estimator_key]),
    #             agg_metrics={
    #                 k: load_code(v)
    #                 for k, v in messages.items()
    #                 if k.startswith("metric-") and v != "nan"
    #             },
    #         )
    #     except Exception as error:
    #         logging.error(error)
    #         return None
-730
View File
@@ -1,730 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
import multiprocessing
import sys
from itertools import chain, tee
from typing import (
Any,
Dict,
Iterable,
Iterator,
List,
Optional,
Tuple,
Union,
Callable,
)
# Third-party imports
import numpy as np
import pandas as pd
from tqdm import tqdm
# First-party imports
from pts.feature import get_seasonality
from pts.model import Quantile, Forecast
class Evaluator:
    """
    Compute accuracy metrics by comparing observations to forecasts.

    Parameters
    ----------
    quantiles
        Quantile levels to evaluate, given as strings of the form 'p10'
        or as floats in [0, 1].
    seasonality
        Seasonality to use for the seasonal error. If nothing is passed,
        the default seasonality for the frequency of the series, as
        returned by `get_seasonality`, is used.
    alpha
        Parameter of the MSIS metric from the M4 competition that defines
        the confidence interval. For alpha=0.05 (default) the 95%
        confidence interval is considered in the metric, see
        https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
        for more detail on MSIS.
    calculate_owa
        Whether the OWA metric should also be calculated. It is
        computationally expensive and slows down the evaluation
        considerably. By default False.
    num_workers
        Number of multiprocessing workers used to process the data in
        parallel. Default is multiprocessing.cpu_count(). Setting it to 0
        means no multiprocessing.
    chunk_size
        Approximate chunk size each worker handles at a time.
        Default is 32.
    """

    default_quantiles = 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9

    def __init__(
        self,
        quantiles: Iterable[Union[float, str]] = default_quantiles,
        seasonality: Optional[int] = None,
        alpha: float = 0.05,
        calculate_owa: bool = False,
        num_workers: Optional[int] = None,
        chunk_size: Optional[int] = None,
    ) -> None:
        self.quantiles = tuple(Quantile.parse(level) for level in quantiles)
        self.seasonality = seasonality
        self.alpha = alpha
        self.calculate_owa = calculate_owa

        # Resolve the "not given" defaults explicitly.
        if num_workers is None:
            num_workers = multiprocessing.cpu_count()
        self.num_workers = num_workers

        if chunk_size is None:
            chunk_size = 32
        self.chunk_size = chunk_size
def __call__(
    self,
    ts_iterator: Iterable[Union[pd.DataFrame, pd.Series]],
    fcst_iterator: Iterable[Forecast],
    num_series: Optional[int] = None,
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Compute accuracy metrics by comparing actual data to the forecasts.

    Parameters
    ----------
    ts_iterator
        iterator containing true target on the predicted range
    fcst_iterator
        iterator of forecasts on the predicted range
    num_series
        number of series of the iterator
        (optional, used for the progress bar and checked against the
        number of evaluated series)

    Returns
    -------
    dict
        Dictionary of aggregated metrics
    pd.DataFrame
        DataFrame containing per-time-series metrics
    """
    ts_iterator = iter(ts_iterator)
    fcst_iterator = iter(fcst_iterator)

    rows = []

    with tqdm(
        zip(ts_iterator, fcst_iterator),
        total=num_series,
        desc="Running evaluation",
    ) as it, np.errstate(invalid="ignore"):
        if self.num_workers > 0 and not sys.platform == "win32":
            # Hand the initializer to the pool as a callable plus its
            # arguments so that it runs inside every worker. Calling it
            # here would only set the global in the parent process, which
            # works with the "fork" start method but not with "spawn".
            mp_pool = multiprocessing.Pool(
                initializer=_worker_init,
                initargs=(self,),
                processes=self.num_workers,
            )
            try:
                rows = mp_pool.map(
                    func=_worker_fun,
                    iterable=iter(it),
                    chunksize=self.chunk_size,
                )
            finally:
                # Always release the worker processes, even if a worker
                # raised while computing metrics.
                mp_pool.close()
                mp_pool.join()
        else:
            for ts, forecast in it:
                rows.append(self.get_metrics_per_ts(ts, forecast))

    # zip() stops at the shorter input, so leftovers reveal a mismatch.
    assert not any(
        True for _ in ts_iterator
    ), "ts_iterator has more elements than fcst_iterator"

    assert not any(
        True for _ in fcst_iterator
    ), "fcst_iterator has more elements than ts_iterator"

    if num_series is not None:
        assert (
            len(rows) == num_series
        ), f"num_series={num_series} did not match number of elements={len(rows)}"

    # If all entries of a target array are NaNs, the resulting metric will
    # have value "masked". Pandas does not handle masked values correctly.
    # Thus we set dtype=np.float64 to convert masked values back to NaNs,
    # which are handled correctly by pandas DataFrames during aggregation.
    metrics_per_ts = pd.DataFrame(rows, dtype=np.float64)
    return self.get_aggregate_metrics(metrics_per_ts)
@staticmethod
def extract_pred_target(
    time_series: Union[pd.Series, pd.DataFrame], forecast: Forecast
) -> np.ndarray:
    """
    Cut the time series down to the dates of the forecast.

    Parameters
    ----------
    time_series
        observed values, indexed by time
    forecast
        forecast whose ``index`` selects the dates to keep

    Returns
    -------
    np.ndarray
        time series cut in the Forecast object dates
    """
    fcst_index = forecast.index
    is_covered = fcst_index.intersection(time_series.index).equals(fcst_index)
    assert is_covered, (
        "Cannot extract prediction target since the index of forecast is outside the index of target\n"
        f"Index of forecast: {forecast.index}\n Index of target: {time_series.index}"
    )

    # cut the time series using the dates of the forecast object
    selected = time_series.loc[fcst_index].transpose()
    return np.atleast_1d(np.squeeze(selected))
# This method is needed for the owa calculation.
# It extracts the training sequence from the Series or DataFrame to a numpy array.
@staticmethod
def extract_past_data(
    time_series: Union[pd.Series, pd.DataFrame], forecast: Forecast
) -> np.ndarray:
    """
    Return the part of the time series that precedes the forecast.

    Parameters
    ----------
    time_series
        observed values, indexed by time
    forecast
        forecast whose ``index`` marks the prediction range

    Returns
    -------
    np.ndarray
        time series without the forecast dates
    """
    assert forecast.index.intersection(time_series.index).equals(forecast.index), (
        "Index of forecast is outside the index of target\n"
        f"Index of forecast: {forecast.index}\n Index of target: {time_series.index}"
    )

    # Remove the prediction range.
    # If the prediction range is not at the end of the time series,
    # everything after the prediction range is truncated.
    # The step is read from the index itself: ``Timestamp.freq`` was
    # deprecated in pandas 1.4 and removed in 2.0, while the index carries
    # the same frequency on old and new versions.
    date_before_forecast = forecast.index[0] - forecast.index.freq
    return np.atleast_1d(
        np.squeeze(time_series.loc[:date_before_forecast].transpose())
    )
def seasonal_error(self, past_data: np.ndarray, forecast: Forecast) -> float:
    r"""
    Mean absolute seasonal difference of the past data.

    .. math::

        seasonal_error = mean(|Y[t] - Y[t-m]|)

    where m is the seasonal frequency

    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
    """
    seasonality = self.seasonality or get_seasonality(forecast.freq)

    # Edge case: if the seasonal frequency is not smaller than the length
    # of the series, revert to a lag of 1.
    lag = seasonality if seasonality < len(past_data) else 1

    differences = past_data[:-lag] - past_data[lag:]
    seasonal_mae = np.mean(abs(differences))

    # A fully masked input yields the masked constant; report it as NaN.
    if seasonal_mae is np.ma.masked:
        return np.nan
    return seasonal_mae
def get_metrics_per_ts(
    self, time_series: Union[pd.Series, pd.DataFrame], forecast: Forecast
) -> Dict[str, Union[float, str, None]]:
    """
    Compute all per-series metrics for one (time series, forecast) pair.

    Invalid target values (NaN/inf) are masked before the metrics are
    computed. ``MSE`` is ``None`` when the forecast cannot provide a mean,
    and ``OWA`` is NaN unless ``calculate_owa`` is set.
    """
    pred_target = np.array(self.extract_pred_target(time_series, forecast))
    pred_target = np.ma.masked_invalid(pred_target)

    # required for seasonal_error and owa calculation
    past_data = np.array(self.extract_past_data(time_series, forecast))
    past_data = np.ma.masked_invalid(past_data)

    try:
        mean_fcst = forecast.mean
    except Exception:
        # The forecast cannot provide a mean; skip the MSE. A bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        mean_fcst = None
    median_fcst = forecast.quantile(0.5)
    seasonal_error = self.seasonal_error(past_data, forecast)

    metrics = {
        "item_id": forecast.item_id,
        "MSE": self.mse(pred_target, mean_fcst) if mean_fcst is not None else None,
        "abs_error": self.abs_error(pred_target, median_fcst),
        "abs_target_sum": self.abs_target_sum(pred_target),
        "abs_target_mean": self.abs_target_mean(pred_target),
        "seasonal_error": seasonal_error,
        "MASE": self.mase(pred_target, median_fcst, seasonal_error),
        "MAPE": self.mape(pred_target, median_fcst),
        "sMAPE": self.smape(pred_target, median_fcst),
        "OWA": np.nan,  # by default not calculated
        "MSIS": self.msis(
            pred_target,
            forecast.quantile(self.alpha / 2),
            forecast.quantile(1.0 - self.alpha / 2),
            seasonal_error,
            self.alpha,
        ),
    }

    if self.calculate_owa:
        metrics["OWA"] = self.owa(
            pred_target,
            median_fcst,
            past_data,
            seasonal_error,
            forecast.start_date,
        )

    for quantile in self.quantiles:
        forecast_quantile = forecast.quantile(quantile.value)

        metrics[quantile.loss_name] = self.quantile_loss(
            pred_target, forecast_quantile, quantile.value
        )
        metrics[quantile.coverage_name] = self.coverage(
            pred_target, forecast_quantile
        )

    return metrics
def get_aggregate_metrics(
    self, metric_per_ts: pd.DataFrame
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Aggregate per-series metrics into dataset-level metrics.

    Each metric column is reduced with its own aggregation (sum or mean);
    derived metrics (RMSE, NRMSE, ND, weighted quantile losses, their mean
    and the coverage error) are then computed from those totals.

    Returns
    -------
    dict
        Dictionary of aggregated metrics
    pd.DataFrame
        The per-series metrics that were passed in
    """
    agg_funs = {
        "MSE": "mean",
        "abs_error": "sum",
        "abs_target_sum": "sum",
        "abs_target_mean": "mean",
        "seasonal_error": "mean",
        "MASE": "mean",
        "MAPE": "mean",
        "sMAPE": "mean",
        "OWA": "mean",
        "MSIS": "mean",
    }
    for q in self.quantiles:
        agg_funs.update({q.loss_name: "sum", q.coverage_name: "mean"})

    has_all_columns = set(metric_per_ts.columns) >= agg_funs.keys()
    assert has_all_columns, "The some of the requested item metrics are missing."

    totals = {}
    for name, how in agg_funs.items():
        totals[name] = metric_per_ts[name].agg(how)

    # derived metrics based on previous aggregate metrics
    totals["RMSE"] = np.sqrt(totals["MSE"])

    # The flags turn a 0/0 into 0/1 so that the ratio is 0 instead of NaN.
    mean_is_zero = totals["abs_target_mean"] == 0
    totals["NRMSE"] = np.divide(
        totals["RMSE"] * (1 - mean_is_zero),
        totals["abs_target_mean"] + mean_is_zero,
    )

    sum_is_zero = totals["abs_target_sum"] == 0
    totals["ND"] = np.divide(
        totals["abs_error"] * (1 - sum_is_zero),
        totals["abs_target_sum"] + sum_is_zero,
    )

    weighted_names = []
    for q in self.quantiles:
        totals[q.weighted_loss_name] = np.divide(
            totals[q.loss_name], totals["abs_target_sum"]
        )
        weighted_names.append(q.weighted_loss_name)

    weighted_losses = np.array([totals[name] for name in weighted_names])
    totals["mean_wQuantileLoss"] = weighted_losses.mean()

    coverage_errors = [
        np.abs(totals[q.coverage_name] - np.array([q.value]))
        for q in self.quantiles
    ]
    totals["MAE_Coverage"] = np.mean(coverage_errors)
    return totals, metric_per_ts
@staticmethod
def mse(target, forecast):
return np.mean(np.square(target - forecast))
@staticmethod
def abs_error(target, forecast):
return np.sum(np.abs(target - forecast))
@staticmethod
def quantile_loss(target, quantile_forecast, q):
return 2.0 * np.sum(
np.abs((quantile_forecast - target) * ((target <= quantile_forecast) - q))
)
@staticmethod
def coverage(target, quantile_forecast):
return np.mean((target < quantile_forecast))
@staticmethod
def mase(target, forecast, seasonal_error):
r"""
.. math::
mase = mean(|Y - Y_hat|) / seasonal_error
https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
"""
flag = seasonal_error == 0
return (np.mean(np.abs(target - forecast)) * (1 - flag)) / (
seasonal_error + flag
)
@staticmethod
def mape(target, forecast):
r"""
.. math::
mape = mean(|Y - Y_hat| / |Y|))
"""
denominator = np.abs(target)
flag = denominator == 0
mape = np.mean((np.abs(target - forecast) * (1 - flag)) / (denominator + flag))
return mape
@staticmethod
def smape(target, forecast):
r"""
.. math::
smape = mean(2 * |Y - Y_hat| / (|Y| + |Y_hat|))
https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
"""
denominator = np.abs(target) + np.abs(forecast)
flag = denominator == 0
smape = 2 * np.mean(
(np.abs(target - forecast) * (1 - flag)) / (denominator + flag)
)
return smape
@staticmethod
def owa(
    target: np.ndarray,
    forecast: np.ndarray,
    past_data: np.ndarray,
    seasonal_error: float,
    start_date: pd.Timestamp,
) -> float:
    r"""
    Overall weighted average of sMAPE and MASE relative to Naive2.

    .. math::

        owa = 0.5*(smape/smape_naive + mase/mase_naive)

    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
    """
    # avoid import error due to circular dependency
    from gluonts.model.naive_2 import naive_2

    # calculate the forecast of the seasonal naive predictor
    naive_median_fcst = naive_2(past_data, len(target), freq=start_date.freqstr)

    relative_smape = Evaluator.smape(target, forecast) / Evaluator.smape(
        target, naive_median_fcst
    )
    relative_mase = Evaluator.mase(
        target, forecast, seasonal_error
    ) / Evaluator.mase(target, naive_median_fcst, seasonal_error)

    return 0.5 * (relative_smape + relative_mase)
@staticmethod
def msis(target, lower_quantile, upper_quantile, seasonal_error, alpha):
r"""
:math:
msis = mean(U - L + 2/alpha * (L-Y) * I[Y<L] + 2/alpha * (Y-U) * I[Y>U]) /seasonal_error
https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
"""
numerator = np.mean(
upper_quantile
- lower_quantile
+ 2.0 / alpha * (lower_quantile - target) * (target < lower_quantile)
+ 2.0 / alpha * (target - upper_quantile) * (target > upper_quantile)
)
flag = seasonal_error == 0
return (numerator * (1 - flag)) / (seasonal_error + flag)
@staticmethod
def abs_target_sum(target):
return np.sum(np.abs(target))
@staticmethod
def abs_target_mean(target):
return np.mean(np.abs(target))
class MultivariateEvaluator(Evaluator):
    """
    The MultivariateEvaluator class owns functionality for evaluating
    multidimensional target arrays of shape
    (target_dimensionality, prediction_length).

    Evaluations of individual dimensions will be stored with the
    corresponding dimension prefix and contain the metrics calculated by
    only this dimension. Metrics with the plain metric name correspond to
    metrics calculated over all dimensions.
    Additionally, the user can provide additional aggregation functions
    that first aggregate the target and forecast over dimensions and then
    calculate the metric. These metrics will be prefixed with
    m_<aggregation_fun_name>_

    The evaluation dimensions can be set by the user.

    Example:
        {'0_MSE': 0.004307240342677687, # MSE of dimension 0
        '0_abs_error': 1.6246897801756859,
        '1_MSE': 0.003949341769475723, # MSE of dimension 1
        '1_abs_error': 1.5052175521850586,
        'MSE': 0.004128291056076705, # MSE of all dimensions
        'abs_error': 3.1299073323607445,
        'm_sum_MSE': 0.02 # MSE of aggregated target and aggregated forecast
        (if target_agg_funcs is set).
        'm_sum_abs_error': 4.2}
    """

    def __init__(
        self,
        quantiles: Iterable[Union[float, str]] = np.linspace(0.1, 0.9, 9),
        seasonality: Optional[int] = None,
        alpha: float = 0.05,
        eval_dims: Optional[List[int]] = None,
        target_agg_funcs: Optional[Dict[str, Callable]] = None,
    ) -> None:
        """
        Parameters
        ----------
        quantiles
            list of strings of the form 'p10' or floats in [0, 1] with the
            quantile levels
        seasonality
            seasonality to use for seasonal_error, if nothing is passed uses
            the default seasonality for the given series frequency as
            returned by `get_seasonality`
        alpha
            parameter of the MSIS metric that defines the CI,
            e.g., for alpha=0.05 the 95% CI is considered in the metric.
        eval_dims
            dimensions of the target that will be evaluated. If None, all
            dimensions are evaluated.
        target_agg_funcs
            pass key-value pairs that define aggregation functions over the
            dimension axis. Useful to compute metrics over aggregated target
            and forecast (typically sum or mean). If None, no aggregated
            metrics are computed.
        """
        super().__init__(quantiles=quantiles, seasonality=seasonality, alpha=alpha)
        self._eval_dims = eval_dims
        # A ``{}`` default would be one dict shared by every instance, so
        # the default is None and a fresh dict is created per instance.
        self.target_agg_funcs = {} if target_agg_funcs is None else target_agg_funcs
@staticmethod
def extract_target_by_dim(
it_iterator: Iterator[pd.DataFrame], dim: int
) -> Iterator[pd.DataFrame]:
for i in it_iterator:
yield (i[dim])
@staticmethod
def extract_forecast_by_dim(
    forecast_iterator: Iterator[Forecast], dim: int
) -> Iterator[Forecast]:
    """Lazily yield ``forecast.copy_dim(dim)`` for every forecast."""
    for forecast in forecast_iterator:
        single_dim_forecast = forecast.copy_dim(dim)
        yield single_dim_forecast
@staticmethod
def extract_aggregate_target(
it_iterator: Iterator[pd.DataFrame], agg_fun: Callable
) -> Iterator[pd.DataFrame]:
for i in it_iterator:
yield i.agg(agg_fun, axis=1)
@staticmethod
def extract_aggregate_forecast(
    forecast_iterator: Iterator[Forecast], agg_fun: Callable
) -> Iterator[Forecast]:
    """Lazily yield ``forecast.copy_aggregate(agg_fun)`` for every forecast."""
    for forecast in forecast_iterator:
        aggregated_forecast = forecast.copy_aggregate(agg_fun)
        yield aggregated_forecast
@staticmethod
def peek(iterator: Iterator[Any]) -> Tuple[Any, Iterator[Any]]:
peeked_object = iterator.__next__()
iterator = chain([peeked_object], iterator)
return peeked_object, iterator
@staticmethod
def get_target_dimensionality(forecast: Forecast) -> int:
    """Return ``forecast.dim()``, asserting that it is larger than 1."""
    target_dim = forecast.dim()
    is_multivariate = target_dim > 1
    assert is_multivariate, (
        f"the dimensionality of the forecast should be larger than 1, "
        f"but got {target_dim}. "
        f"Please use the Evaluator to evaluate 1D forecasts."
    )
    return target_dim
def get_eval_dims(self, target_dimensionality: int) -> List[int]:
    """
    Return the dimensions to evaluate.

    Uses the dimensions given at construction time, or all dimensions
    ``0 .. target_dimensionality - 1`` if none were given.
    """
    if self._eval_dims is None:
        eval_dims = list(range(0, target_dimensionality))
    else:
        eval_dims = self._eval_dims

    largest_dim = max(eval_dims)
    assert largest_dim < target_dimensionality, (
        f"eval dims should range from 0 to target_dimensionality - 1, "
        f"but got max eval_dim {largest_dim}"
    )
    return eval_dims
def calculate_aggregate_multivariate_metrics(
    self,
    ts_iterator: Iterator[pd.DataFrame],
    forecast_iterator: Iterator[Forecast],
    agg_fun: Callable,
) -> Dict[str, float]:
    """
    Evaluate targets and forecasts after aggregating them over dimensions.

    Parameters
    ----------
    ts_iterator
        Iterator over time series
    forecast_iterator
        Iterator over forecasts
    agg_fun
        aggregation function

    Returns
    -------
    Dict[str, float]
        dictionary with aggregate datasets metrics
    """
    aggregated_targets = self.extract_aggregate_target(ts_iterator, agg_fun)
    aggregated_forecasts = self.extract_aggregate_forecast(
        forecast_iterator, agg_fun
    )
    # Only the aggregate metrics are returned; per-series ones are dropped.
    evaluation = super(MultivariateEvaluator, self).__call__(
        aggregated_targets, aggregated_forecasts
    )
    agg_metrics, _ = evaluation
    return agg_metrics
def calculate_aggregate_vector_metrics(
    self, all_agg_metrics: Dict[str, float], all_metrics_per_ts: pd.DataFrame,
) -> Dict[str, float]:
    """
    Add the metrics aggregated over all evaluated dimensions.

    Parameters
    ----------
    all_agg_metrics
        dictionary with aggregate metrics of individual dimensions;
        it is updated in place
    all_metrics_per_ts
        DataFrame containing metrics for all time series of all evaluated
        dimensions

    Returns
    -------
    Dict[str, float]
        dictionary with aggregate metrics (of individual (evaluated)
        dimensions and the entire vector)
    """
    vector_aggregate_metrics, _ = self.get_aggregate_metrics(all_metrics_per_ts)
    all_agg_metrics.update(vector_aggregate_metrics)
    return all_agg_metrics
def __call__(
    self,
    ts_iterator: Iterable[pd.DataFrame],
    fcst_iterator: Iterable[Forecast],
    num_series=None,
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Evaluate multivariate forecasts dimension by dimension.

    Returns the aggregate metrics (per dimension with a ``<dim>_`` prefix,
    over all evaluated dimensions with the plain name, and per target
    aggregation with an ``m_<name>_`` prefix) together with the
    concatenated per-series metrics of all evaluated dimensions.
    """
    ts_iterator = iter(ts_iterator)
    fcst_iterator = iter(fcst_iterator)

    all_agg_metrics = {}
    per_dim_frames = []

    peeked_forecast, fcst_iterator = self.peek(fcst_iterator)
    target_dimensionality = self.get_target_dimensionality(peeked_forecast)
    eval_dims = self.get_eval_dims(target_dimensionality)

    # One independent copy of each iterator per target dimension, plus
    # one per aggregation function (those are taken from the end).
    num_copies = target_dimensionality + len(self.target_agg_funcs)
    ts_iterator_set = tee(ts_iterator, num_copies)
    fcst_iterator_set = tee(fcst_iterator, num_copies)

    for dim in eval_dims:
        agg_metrics, metrics_per_ts = super(MultivariateEvaluator, self).__call__(
            self.extract_target_by_dim(ts_iterator_set[dim], dim),
            self.extract_forecast_by_dim(fcst_iterator_set[dim], dim),
        )
        per_dim_frames.append(metrics_per_ts)
        all_agg_metrics.update(
            {f"{dim}_{metric}": value for metric, value in agg_metrics.items()}
        )

    all_metrics_per_ts = pd.concat(per_dim_frames)
    all_agg_metrics = self.calculate_aggregate_vector_metrics(
        all_agg_metrics, all_metrics_per_ts
    )

    if self.target_agg_funcs:
        multivariate_metrics = {}
        for index, (agg_fun_name, agg_fun) in enumerate(
            self.target_agg_funcs.items()
        ):
            multivariate_metrics[
                agg_fun_name
            ] = self.calculate_aggregate_multivariate_metrics(
                ts_iterator_set[-(index + 1)],
                fcst_iterator_set[-(index + 1)],
                agg_fun,
            )

        for key, metric_dict in multivariate_metrics.items():
            prefix = f"m_{key}_"
            all_agg_metrics.update(
                {prefix + metric: value for metric, value in metric_dict.items()}
            )

    return all_agg_metrics, all_metrics_per_ts
# This is required for the multiprocessing to work.
# The evaluator is kept in a module-level global so that ``_worker_fun``
# can reach it from inside a worker process.
_worker_evaluator: Optional[Evaluator] = None


def _worker_init(evaluator: Evaluator):
    """Store ``evaluator`` in the module-level global read by ``_worker_fun``."""
    global _worker_evaluator
    _worker_evaluator = evaluator
def _worker_fun(inp: tuple):
    """
    Compute the per-series metrics for one ``(ts, forecast)`` pair.

    Uses the evaluator stored in the module-level ``_worker_evaluator``.
    """
    ts, forecast = inp
    evaluator = _worker_evaluator
    assert isinstance(
        evaluator, Evaluator
    ), "Something went wrong with the worker initialization."
    return evaluator.get_metrics_per_ts(ts, forecast)
-3
View File
@@ -1,3 +0,0 @@
def assert_pts(condition: bool, message: str, *args, **kwargs) -> None:
    """
    Raise ``Exception`` unless ``condition`` holds.

    The exception text is ``message`` formatted with ``args`` and
    ``kwargs`` via ``str.format``.
    """
    if condition:
        return
    raise Exception(message.format(*args, **kwargs))
-19
View File
@@ -1,23 +1,4 @@
from .holiday import (
SPECIAL_DATE_FEATURES,
SpecialDateFeatureSet,
CustomDateFeatureSet,
CustomHolidayFeatureSet,
squared_exponential_kernel,
exponential_kernel,
)
from .lag import get_lags_for_frequency, get_fourier_lags_for_frequency
from .time_feature import (
DayOfMonth,
DayOfWeek,
DayOfYear,
HourOfDay,
MinuteOfHour,
MonthOfYear,
TimeFeature,
WeekOfYear,
FourierDateFeatures,
time_features_from_frequency_str,
fourier_time_features_from_frequency_str,
)
from .utils import get_granularity, get_seasonality
+13 -232
View File
@@ -1,221 +1,9 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import List, Callable
from typing import Callable, List
import numpy as np
import pandas as pd
from pandas.tseries.holiday import (
TH,
SU,
EasterMonday,
GoodFriday,
Holiday,
USColumbusDay,
USLaborDay,
USMartinLutherKingJr,
USMemorialDay,
USPresidentsDay,
USThanksgivingDay,
)
from pandas.tseries.offsets import DateOffset, Day, Easter
from pandas.tseries.holiday import Holiday
# This is 183 to cover half a year (in both directions), also for leap years
# plus a week and a half to cover holidays offset by a week e.g. easter etc
MAX_WINDOW = 192


def distance_to_holiday(holiday):
    """
    Build a function that measures the distance in days to ``holiday``.

    The returned function takes a timestamp and returns the signed number
    of days between it and the first occurrence of the holiday found
    within ``MAX_WINDOW`` days on either side.
    """

    def distance_to_day(index):
        half_window = pd.Timedelta(days=MAX_WINDOW)
        occurrences = holiday.dates(index - half_window, index + half_window)
        assert (
            len(occurrences) != 0
        ), f"No closest holiday for the date index {index} found."
        # It sometimes returns two dates if it is exactly half a year after
        # the holiday. In this case, the smaller distance (182 days) is
        # returned.
        first_occurrence = occurrences[0]
        return (index - first_occurrence).days

    return distance_to_day
# Holiday rules that are not among the ones imported from
# pandas.tseries.holiday at the top of this module.
EasterSunday = Holiday("Easter Sunday", month=1, day=1, offset=[Easter(), Day(0)])
NewYearsDay = Holiday("New Years Day", month=1, day=1)
SuperBowl = Holiday("Superbowl", month=2, day=1, offset=DateOffset(weekday=SU(1)))
MothersDay = Holiday("Mothers Day", month=5, day=1, offset=DateOffset(weekday=SU(2)))
IndependenceDay = Holiday("Independence Day", month=7, day=4)
# NOTE: ChristmasEve and ChristmasDay share the display name "Christmas";
# they differ only in the day of the month.
ChristmasEve = Holiday("Christmas", month=12, day=24)
ChristmasDay = Holiday("Christmas", month=12, day=25)
NewYearsEve = Holiday("New Years Eve", month=12, day=31)
# Black Friday and Cyber Monday are anchored on the 4th Thursday of
# November (TH(4)) and shifted by 1 and 4 days respectively.
BlackFriday = Holiday(
"Black Friday", month=11, day=1, offset=[pd.DateOffset(weekday=TH(4)), Day(1)]
)
CyberMonday = Holiday(
"Cyber Monday", month=11, day=1, offset=[pd.DateOffset(weekday=TH(4)), Day(4)],
)
# String keys under which the features are registered in
# SPECIAL_DATE_FEATURES below.
NEW_YEARS_DAY = "new_years_day"
MARTIN_LUTHER_KING_DAY = "martin_luther_king_day"
SUPERBOWL = "superbowl"
PRESIDENTS_DAY = "presidents_day"
GOOD_FRIDAY = "good_friday"
EASTER_SUNDAY = "easter_sunday"
EASTER_MONDAY = "easter_monday"
MOTHERS_DAY = "mothers_day"
INDEPENDENCE_DAY = "independence_day"
LABOR_DAY = "labor_day"
MEMORIAL_DAY = "memorial_day"
COLUMBUS_DAY = "columbus_day"
THANKSGIVING = "thanksgiving"
CHRISTMAS_EVE = "christmas_eve"
CHRISTMAS_DAY = "christmas_day"
NEW_YEARS_EVE = "new_years_eve"
BLACK_FRIDAY = "black_friday"
CYBER_MONDAY = "cyber_monday"
# Maps every feature key to the function returned by distance_to_holiday
# for the corresponding holiday rule (timestamp -> distance in days).
SPECIAL_DATE_FEATURES = {
NEW_YEARS_DAY: distance_to_holiday(NewYearsDay),
MARTIN_LUTHER_KING_DAY: distance_to_holiday(USMartinLutherKingJr),
SUPERBOWL: distance_to_holiday(SuperBowl),
PRESIDENTS_DAY: distance_to_holiday(USPresidentsDay),
GOOD_FRIDAY: distance_to_holiday(GoodFriday),
EASTER_SUNDAY: distance_to_holiday(EasterSunday),
EASTER_MONDAY: distance_to_holiday(EasterMonday),
MOTHERS_DAY: distance_to_holiday(MothersDay),
INDEPENDENCE_DAY: distance_to_holiday(IndependenceDay),
LABOR_DAY: distance_to_holiday(USLaborDay),
MEMORIAL_DAY: distance_to_holiday(USMemorialDay),
COLUMBUS_DAY: distance_to_holiday(USColumbusDay),
THANKSGIVING: distance_to_holiday(USThanksgivingDay),
CHRISTMAS_EVE: distance_to_holiday(ChristmasEve),
CHRISTMAS_DAY: distance_to_holiday(ChristmasDay),
NEW_YEARS_EVE: distance_to_holiday(NewYearsEve),
BLACK_FRIDAY: distance_to_holiday(BlackFriday),
CYBER_MONDAY: distance_to_holiday(CyberMonday),
}
# Kernel functions
def indicator(distance):
    """Return 1.0 if ``distance`` equals 0 and 0.0 otherwise."""
    is_zero = distance == 0
    return float(is_zero)
def exponential_kernel(alpha=1.0, tol=1e-9):
    """
    Build a kernel computing ``exp(-alpha * |distance|)``.

    Kernel values that do not exceed ``tol`` are returned as 0.0.
    """

    def kernel(distance):
        kernel_value = np.exp(-alpha * np.abs(distance))
        return kernel_value if kernel_value > tol else 0.0

    return kernel
def squared_exponential_kernel(alpha=1.0, tol=1e-9):
    """
    Build a kernel computing ``exp(-alpha * |distance| ** 2)``.

    Kernel values that do not exceed ``tol`` are returned as 0.0.
    """

    def kernel(distance):
        kernel_value = np.exp(-alpha * np.abs(distance) ** 2)
        return kernel_value if kernel_value > tol else 0.0

    return kernel
class SpecialDateFeatureSet:
    """
    Implements calculation of holiday features. The SpecialDateFeatureSet is
    applied on a pandas Series with Datetimeindex and returns a 2D array of
    the shape (num_features, len(dates)), where num_features are the number
    of holidays.

    Note that for lower than daily granularity the distance to the holiday is
    still computed on a per-day basis.

    Example use:

        >>> from pts.feature.holiday import (
        ...    squared_exponential_kernel,
        ...    SpecialDateFeatureSet,
        ...    CHRISTMAS_DAY,
        ...    CHRISTMAS_EVE
        ... )
        >>> import pandas as pd
        >>> sfs = SpecialDateFeatureSet([CHRISTMAS_EVE, CHRISTMAS_DAY])
        >>> date_indices = pd.date_range(
        ...     start="2016-12-24",
        ...     end="2016-12-31",
        ...     freq='D'
        ... )
        >>> sfs(date_indices)
        array([[1., 0., 0., 0., 0., 0., 0., 0.],
               [0., 1., 0., 0., 0., 0., 0., 0.]])

    Example use for using a squared exponential kernel:

        >>> kernel = squared_exponential_kernel(alpha=1.0)
        >>> sfs = SpecialDateFeatureSet([CHRISTMAS_EVE, CHRISTMAS_DAY], kernel)
        >>> sfs(date_indices)
        array([[1.00000000e+00, 3.67879441e-01, 1.83156389e-02, 1.23409804e-04,
                1.12535175e-07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
               [3.67879441e-01, 1.00000000e+00, 3.67879441e-01, 1.83156389e-02,
                1.23409804e-04, 1.12535175e-07, 0.00000000e+00, 0.00000000e+00]])

    """

    def __init__(
        self,
        feature_names: List[str],
        kernel_function: Callable[[int], float] = indicator,
    ):
        """
        Parameters
        ----------
        feature_names
            list of strings with holiday names for which features should be
            created.
        kernel_function
            kernel function to pass the feature value based
            on distance in days. Can be indicator function (default),
            exponential_kernel, squared_exponential_kernel or user defined.
        """
        self.feature_names = feature_names
        self.num_features = len(feature_names)
        self.kernel_function = kernel_function

    def __call__(self, dates):
        """
        Transform a pandas series with timestamps to holiday features.

        Parameters
        ----------
        dates
            Pandas series with Datetimeindex timestamps.

        Returns
        -------
        np.ndarray
            One row per feature name and one column per date.
        """
        return np.vstack(
            [
                np.hstack(
                    [
                        self.kernel_function(SPECIAL_DATE_FEATURES[feat_name](index))
                        for index in dates
                    ]
                )
                for feat_name in self.feature_names
            ]
        )
from gluonts.time_feature.holiday import indicator, distance_to_holiday
class CustomDateFeatureSet:
@@ -230,7 +18,7 @@ class CustomDateFeatureSet:
Example use:
>>> import pandas as pd
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
... pd.to_datetime('20200101', format='%Y%m%d')])
>>> date_indices = pd.date_range(
... start="2019-11-24",
@@ -245,7 +33,7 @@ class CustomDateFeatureSet:
Example use for using a squared exponential kernel:
>>> kernel = squared_exponential_kernel(alpha=0.5)
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
... pd.to_datetime('20200101', format='%Y%m%d')], kernel)
>>> cfs(date_indices)
array([[3.72665317e-06, 3.35462628e-04, 1.11089965e-02, 1.35335283e-01,
@@ -287,20 +75,14 @@ class CustomDateFeatureSet:
dates
Pandas series with Datetimeindex timestamps.
"""
return (
np.vstack(
[
np.hstack(
[
self.kernel_function((index - ref_date).days)
for index in dates
]
)
for ref_date in self.reference_dates
]
)
.sum(0, keepdims=True)
)
return np.vstack(
[
np.hstack(
[self.kernel_function((index - ref_date).days) for index in dates]
)
for ref_date in self.reference_dates
]
).sum(0, keepdims=True)
class CustomHolidayFeatureSet:
@@ -383,4 +165,3 @@ class CustomHolidayFeatureSet:
for custom_holiday in self.custom_holidays
]
)
-139
View File
@@ -1,139 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
from typing import List, Optional
# Third-party imports
import numpy as np
from pandas.tseries.frequencies import to_offset
from .utils import get_granularity
def _make_lags(middle: int, delta: int) -> np.ndarray:
"""
Create a set of lags around a middle point including +/- delta
"""
return np.arange(middle - delta, middle + delta + 1).tolist()
def get_lags_for_frequency(
    freq_str: str, lag_ub: int = 1200, num_lags: Optional[int] = None
) -> List[int]:
    """
    Generates a list of lags that are appropriate for the given frequency
    string.

    By default all frequencies have the following lags: [1, 2, 3, 4, 5, 6, 7].
    Remaining lags correspond to the same `season` (+/- `delta`) in previous
    `k` cycles. Here `delta` and `k` are chosen according to the existing
    code.

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H",
        "5min", "1D" etc.
    lag_ub
        The maximum value for a lag.
    num_lags
        Maximum number of lags; by default all generated lags are returned

    Raises
    ------
    Exception
        If the frequency is not monthly, weekly, daily, business-daily,
        hourly or minutely.
    """
    # NOTE(review): the parsed result is not used below; the call is kept in
    # case get_granularity validates ``freq_str`` -- confirm and drop if not.
    get_granularity(freq_str)

    # Lags are target values at the same `season` (+/- delta) but in the
    # previous cycle.
    def _make_lags_for_minute(multiple, num_cycles=3):
        # We use previous ``num_cycles`` hours to generate lags
        return [_make_lags(k * 60 // multiple, 2) for k in range(1, num_cycles + 1)]

    def _make_lags_for_hour(multiple, num_cycles=7):
        # We use previous ``num_cycles`` days to generate lags
        return [_make_lags(k * 24 // multiple, 1) for k in range(1, num_cycles + 1)]

    def _make_lags_for_day(multiple, num_cycles=4):
        # We use previous ``num_cycles`` weeks to generate lags
        # We use the last month (in addition to 4 weeks) to generate lag.
        return [_make_lags(k * 7 // multiple, 1) for k in range(1, num_cycles + 1)] + [
            _make_lags(30 // multiple, 1)
        ]

    def _make_lags_for_week(multiple, num_cycles=3):
        # We use previous ``num_cycles`` years to generate lags
        # Additionally, we use previous 4, 8, 12 weeks
        return [_make_lags(k * 52 // multiple, 1) for k in range(1, num_cycles + 1)] + [
            [4 // multiple, 8 // multiple, 12 // multiple]
        ]

    def _make_lags_for_month(multiple, num_cycles=3):
        # We use previous ``num_cycles`` years to generate lags
        return [_make_lags(k * 12 // multiple, 1) for k in range(1, num_cycles + 1)]

    offset = to_offset(freq_str)
    name = offset.name

    # Both the legacy pandas offset names ("M", "H", "T") and the names
    # used by newer pandas releases ("ME", "h", "min") are accepted.
    if name in ("M", "ME"):
        lags = _make_lags_for_month(offset.n)
    elif name == "W" or name.startswith("W-"):
        # Any weekly anchor (W-SUN, W-MON, W-TUE, ...) has the same lags.
        lags = _make_lags_for_week(offset.n)
    elif name == "D":
        lags = _make_lags_for_day(offset.n) + _make_lags_for_week(offset.n / 7.0)
    elif name == "B":
        # todo find good lags for business day
        lags = []
    elif name in ("H", "h"):
        lags = (
            _make_lags_for_hour(offset.n)
            + _make_lags_for_day(offset.n / 24.0)
            + _make_lags_for_week(offset.n / (24.0 * 7))
        )
    # minutes
    elif name in ("T", "min"):
        lags = (
            _make_lags_for_minute(offset.n)
            + _make_lags_for_hour(offset.n / 60.0)
            + _make_lags_for_day(offset.n / (60.0 * 24))
            + _make_lags_for_week(offset.n / (60.0 * 24 * 7))
        )
    else:
        raise Exception("invalid frequency")

    # flatten lags list and filter
    lags = [int(lag) for sub_list in lags for lag in sub_list if 7 < lag <= lag_ub]
    lags = [1, 2, 3, 4, 5, 6, 7] + sorted(set(lags))

    return lags[:num_lags]
def get_fourier_lags_for_frequency(
    freq_str: str, num_lags: Optional[int] = None
) -> List[int]:
    """
    Return a short, sorted list of lags for the given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string such as "5min", "H" or "1D".
    num_lags
        Maximum number of lags; by default all lags are returned.

    Returns
    -------
    List[int]
        The lags for the granularity of ``freq_str``; ``[1]`` for any
        granularity without a dedicated entry.
    """
    granularity = to_offset(freq_str).name

    # The offset name for the same granularity differs between pandas
    # versions (e.g. minutes are "T" or "min"), so both spellings are
    # accepted. Previously only "min" was checked for minutes.
    if granularity in ("M", "ME"):
        lags = [1, 12]
    elif granularity == "D":
        lags = [1, 7, 14]
    elif granularity == "B":
        lags = [1, 2]
    elif granularity in ("H", "h"):
        lags = [1, 24, 168]
    elif granularity in ("T", "min"):
        lags = [1, 4, 12, 24, 48]
    else:
        lags = [1]

    # use less lags
    output_lags = sorted({int(lag) for lag in lags})
    return output_lags[:num_lags]
-206
View File
@@ -1,206 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from abc import ABC, abstractmethod
from typing import List
import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from pts.core.component import validated
from .utils import get_granularity
class TimeFeature(ABC):
    """
    Abstract base class of features computed from a ``pd.DatetimeIndex``.

    Subclasses implement ``__call__``; ``normalized`` tells them whether
    to return the normalized or the raw value.
    """

    @validated()
    def __init__(self, normalized: bool = True):
        self.normalized = normalized

    @abstractmethod
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        """Compute the feature values for ``index``."""
class MinuteOfHour(TimeFeature):
    """
    Minute of hour encoded as value between [-0.5, 0.5]
    (or as the raw minute as float if ``normalized`` is False).
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        minute = index.minute
        if not self.normalized:
            return minute.map(float)
        return minute / 59.0 - 0.5
class HourOfDay(TimeFeature):
    """
    Hour of day encoded as value between [-0.5, 0.5]
    (or as the raw hour as float if ``normalized`` is False).
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        hour = index.hour
        if not self.normalized:
            return hour.map(float)
        return hour / 23.0 - 0.5
class DayOfWeek(TimeFeature):
    """
    Day of the week; scaled to [-0.5, 0.5] when ``normalized`` is set,
    otherwise returned as plain floats.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        weekdays = index.dayofweek
        if not self.normalized:
            return weekdays.map(float)
        # weekdays run from 0 to 6
        return weekdays / 6.0 - 0.5
class DayOfMonth(TimeFeature):
    """
    Day of month encoded as value between [-0.5, 0.5]

    When ``normalized`` is false the raw day number is returned as floats.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # ``day`` is 1-based (1..31): shift to 0..30 before scaling so the
            # result really spans [-0.5, 0.5] instead of overshooting 0.5.
            return (index.day - 1) / 30.0 - 0.5
        else:
            return index.day.map(float)
class DayOfYear(TimeFeature):
    """
    Day of year encoded as value between [-0.5, 0.5]

    When ``normalized`` is false the raw day number is returned as floats.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # ``dayofyear`` is 1-based and reaches 366 in leap years: shift to
            # 0..365 before scaling so the result stays within [-0.5, 0.5].
            return (index.dayofyear - 1) / 365.0 - 0.5
        else:
            return index.dayofyear.map(float)
class MonthOfYear(TimeFeature):
    """
    Month of year encoded as value between [-0.5, 0.5]

    When ``normalized`` is false the raw month number is returned as floats.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # ``month`` is 1-based (1..12): shift to 0..11 before scaling so
            # the result really spans [-0.5, 0.5].
            return (index.month - 1) / 11.0 - 0.5
        else:
            return index.month.map(float)
class WeekOfYear(TimeFeature):
    """
    Week of year encoded as value between [-0.5, 0.5]

    The ISO calendar week of each timestamp is used. When ``normalized`` is
    false the raw week number is returned as floats.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Build a plain int64 index through ``pd.Index``; ``pd.Int64Index`` is
        # not available in newer pandas releases.
        weeks = pd.Index(index.isocalendar().week, dtype="int64")
        if self.normalized:
            # ISO weeks are 1-based and can reach 53: shift to 0..52 before
            # scaling so the result stays within [-0.5, 0.5].
            return (weeks - 1) / 52.0 - 0.5
        else:
            return weeks.map(float)
class FourierDateFeatures(TimeFeature):
    """
    Cosine/sine encoding of one date attribute of a ``pd.DatetimeIndex``.

    Parameters
    ----------
    freq
        Name of the index attribute to encode; must be one of the names
        accepted in ``__init__`` (e.g. "hour", "dayofweek").
    """

    @validated()
    def __init__(self, freq: str) -> None:
        super().__init__()
        # reoccurring date attributes that can be encoded
        supported = (
            "month",
            "day",
            "hour",
            "minute",
            "weekofyear",
            "weekday",
            "dayofweek",
            "dayofyear",
            "daysinmonth",
        )
        assert freq in supported
        self.freq = freq

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        """Return a 2-row array: cosines on the first row, sines on the second."""
        values = getattr(index, self.freq)
        # The period is taken from the data itself: largest observed value + 1.
        period = max(values) + 1
        angles = [value * 2.0 * np.pi / period for value in values]
        cos_row = np.cos(angles)
        sin_row = np.sin(angles)
        return np.vstack([cos_row, sin_row])
def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Build the default time features for a frequency string.

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    Returns
    -------
    List[TimeFeature]
        Freshly constructed feature objects for the granularity of ``freq_str``.

    Raises
    ------
    RuntimeError
        If the granularity has no feature set defined here.
    """
    _, granularity = get_granularity(freq_str)

    daily = [DayOfWeek, DayOfMonth, DayOfYear]
    minutely = [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]
    classes_by_granularity = {
        "M": [MonthOfYear],
        "W": [DayOfMonth, WeekOfYear],
        "D": daily,
        "B": daily,
        "H": [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear],
        "min": minutely,
        "T": minutely,
    }

    if granularity not in classes_by_granularity:
        supported_freq_msg = f"""
Unsupported frequency {freq_str}
The following frequencies are supported:
M - monthly
W - week
D - daily
H - hourly
min - minutely
"""
        raise RuntimeError(supported_freq_msg)

    return [feature_cls() for feature_cls in classes_by_granularity[granularity]]
def fourier_time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Build the default Fourier date features for a frequency string.

    ``freq_str`` is parsed with pandas' ``to_offset``; the offset name selects
    which index attributes are encoded, one ``FourierDateFeatures`` object per
    attribute. An ``AssertionError`` is raised for offset names without an
    entry in the table below.
    """
    granularity = to_offset(freq_str).name

    weekly = ["daysinmonth", "weekofyear"]
    minutely = ["minute", "hour", "dayofweek"]
    attributes_by_granularity = {
        "M": ["weekofyear"],
        "W-SUN": weekly,
        "W-MON": weekly,
        "D": ["dayofweek"],
        "B": ["dayofweek", "dayofyear"],
        "H": ["hour", "dayofweek"],
        "min": minutely,
        "T": minutely,
    }

    assert granularity in attributes_by_granularity, f"freq {granularity} not supported"

    return [
        FourierDateFeatures(freq=attribute)
        for attribute in attributes_by_granularity[granularity]
    ]
-65
View File
@@ -1,65 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import re
from functools import lru_cache
from typing import Tuple
def get_granularity(freq_str: str) -> Tuple[int, str]:
    """
    Split a frequency string into its integer multiple and base granularity,
    e.g. "7D" -> (7, "D").

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    Returns
    -------
    Tuple[int, str]
        The multiple (1 when ``freq_str`` carries no leading number) and the
        granularity part.
    """
    parsed = re.match(r"\s*((\d+)?)\s*([^\d]\w*)", freq_str)
    assert parsed is not None, "Cannot parse frequency string: %s" % freq_str

    # Group 2 holds the optional digits, group 3 the granularity.
    digits = parsed.group(2)
    granularity = parsed.group(3)
    multiple = 1 if digits is None else int(digits)
    return multiple, granularity
@lru_cache()
def get_seasonality(freq: str) -> int:
    """
    Return the default seasonality for a frequency string, e.g. "2H" -> 12.

    The base seasonality of the frequency (24 for "H", 12 for "M", 5 for "B",
    1 otherwise) is divided by the leading multiple of ``freq``. When the
    multiple does not divide the base seasonality evenly, 1 is returned.
    """
    parsed = re.match(r"(\d*)(\w+)", freq)
    assert parsed, "Cannot match freq regex"

    mult, base_freq = parsed.groups()
    multiple = int(mult) if mult else 1

    base_seasonality = {"H": 24, "D": 1, "W": 1, "M": 12, "B": 5}.get(base_freq, 1)

    quotient, remainder = divmod(base_seasonality, multiple)
    if remainder != 0:
        # The multiple does not divide the base seasonality: fall back to 1
        # without emitting a warning.
        return 1
    return quotient
+2 -5
View File
@@ -1,5 +1,2 @@
from .estimator import Estimator, PTSEstimator
from .forecast import Forecast, SampleForecast, QuantileForecast, DistributionForecast
from .predictor import Predictor, PTSPredictor
from .quantile import Quantile
from .utils import get_module_forward_input_names, copy_parameters, weighted_average
from .utils import get_module_forward_input_names, weighted_average
from .estimator import PyTorchEstimator
+23 -11
View File
@@ -1,19 +1,17 @@
from pts.model.utils import get_module_forward_input_names
from typing import List, Optional
import numpy as np
import torch
import torch.nn as nn
from pts import Trainer
from pts.dataset import FieldName
from pts.feature import (
from gluonts.dataset.field_names import FieldName
from gluonts.time_feature import (
TimeFeature,
get_lags_for_frequency,
time_features_from_frequency_str,
)
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
from pts.modules import DistributionOutput, StudentTOutput
from pts.transform import (
from gluonts.transform import (
Transformation,
Chain,
RemoveFields,
@@ -26,10 +24,19 @@ from pts.transform import (
InstanceSplitter,
ExpectedNumInstanceSampler,
)
from gluonts.torch.support.util import copy_parameters
from gluonts.torch.model.predictor import PyTorchPredictor
from gluonts.torch.modules.distribution_output import DistributionOutput
from gluonts.model.predictor import Predictor
from pts import Trainer
from pts.model import PyTorchEstimator
from pts.modules import StudentTOutput
from .deepar_network import DeepARTrainingNetwork, DeepARPredictionNetwork
class DeepAREstimator(PTSEstimator):
class DeepAREstimator(PyTorchEstimator):
def __init__(
self,
freq: str,
@@ -115,10 +122,14 @@ class DeepAREstimator(PTSEstimator):
)
+ [
AsNumpyArray(
field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long,
field=FieldName.FEAT_STATIC_CAT,
expected_ndim=1,
dtype=np.long,
),
AsNumpyArray(
field=FieldName.FEAT_STATIC_REAL, expected_ndim=1, dtype=self.dtype,
field=FieldName.FEAT_STATIC_REAL,
expected_ndim=1,
dtype=self.dtype,
),
AsNumpyArray(
field=FieldName.TARGET,
@@ -218,13 +229,14 @@ class DeepAREstimator(PTSEstimator):
).to(device)
copy_parameters(trained_network, prediction_network)
input_names = get_module_forward_input_names(prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
input_names=input_names,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
freq=self.freq,
prediction_length=self.prediction_length,
device=device,
dtype=self.dtype,
)
+5 -5
View File
@@ -5,9 +5,10 @@ import torch
import torch.nn as nn
from torch.distributions import Distribution
from pts.core.component import validated
from gluonts.core.component import validated
from gluonts.torch.modules.distribution_output import DistributionOutput
from pts.model import weighted_average
from pts.modules import DistributionOutput, MeanScaler, NOPScaler, FeatureEmbedder
from pts.modules import MeanScaler, NOPScaler, FeatureEmbedder
def prod(xs):
@@ -18,7 +19,6 @@ def prod(xs):
class DeepARNetwork(nn.Module):
@validated()
def __init__(
self,
@@ -144,7 +144,7 @@ class DeepARNetwork(nn.Module):
past_time_feat[:, self.history_length - self.context_length :, ...],
future_time_feat,
),
dim=1
dim=1,
)
sequence = torch.cat((past_target, future_target), dim=1)
sequence_length = self.history_length + self.prediction_length
@@ -154,7 +154,7 @@ class DeepARNetwork(nn.Module):
sequence=sequence,
sequence_length=sequence_length,
indices=self.lags_seq,
subsequences_length=subsequences_length
subsequences_length=subsequences_length,
)
# scale is computed on the context length last units of the past target
+7 -5
View File
@@ -10,7 +10,7 @@ from pts.feature import (
fourier_time_features_from_frequency_str,
get_fourier_lags_for_frequency,
)
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
from pts.model import PyTorchEstimator, PyTorchPredictor, copy_parameters
from pts.modules import DistributionOutput, LowRankMultivariateNormalOutput
from pts.transform import (
Transformation,
@@ -34,7 +34,7 @@ from pts.transform import (
from .deepvar_network import DeepVARTrainingNetwork, DeepVARPredictionNetwork
class DeepVAREstimator(PTSEstimator):
class DeepVAREstimator(PyTorchEstimator):
def __init__(
self,
input_size: int,
@@ -199,7 +199,9 @@ class DeepVAREstimator(PTSEstimator):
field_name="target_dimension_indicator",
target_field=FieldName.TARGET,
),
AsNumpyArray(field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long),
AsNumpyArray(
field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long
),
AsNumpyArray(field=FieldName.FEAT_STATIC_REAL, expected_ndim=1),
InstanceSplitter(
target_field=FieldName.TARGET,
@@ -242,7 +244,7 @@ class DeepVAREstimator(PTSEstimator):
transformation: Transformation,
trained_network: DeepVARTrainingNetwork,
device: torch.device,
) -> PTSPredictor:
) -> PyTorchPredictor:
prediction_network = DeepVARPredictionNetwork(
input_size=self.input_size,
target_dim=self.target_dim,
@@ -263,7 +265,7 @@ class DeepVAREstimator(PTSEstimator):
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
+18 -6
View File
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
from pts.core.component import validated
from gluonts.core.component import validated
from pts.model import weighted_average
from pts.modules import DistributionOutput, MeanScaler, NOPScaler, FeatureEmbedder
@@ -250,7 +250,10 @@ class DeepVARTrainingNetwork(nn.Module):
subsequences_length = self.context_length
else:
time_feat = torch.cat(
(past_time_feat[:, -self.context_length :, ...], future_time_feat,),
(
past_time_feat[:, -self.context_length :, ...],
future_time_feat,
),
dim=1,
)
sequence = torch.cat((past_target_cdf, future_target_cdf), dim=1)
@@ -285,7 +288,9 @@ class DeepVARTrainingNetwork(nn.Module):
return outputs, states, scale, lags_scaled, inputs
def distr(
self, rnn_outputs: torch.Tensor, scale: torch.Tensor,
self,
rnn_outputs: torch.Tensor,
scale: torch.Tensor,
):
"""
Returns the distribution of DeepVAR with respect to the RNN outputs.
@@ -382,7 +387,8 @@ class DeepVARTrainingNetwork(nn.Module):
# put together target sequence
# (batch_size, seq_len, target_dim)
target = torch.cat(
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf), dim=1,
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf),
dim=1,
)
# assert_shape(target, (-1, seq_len, self.target_dim))
@@ -507,7 +513,8 @@ class DeepVARPredictionNetwork(DeepVARTrainingNetwork):
)
distr, distr_args = self.distr(
rnn_outputs=rnn_outputs, scale=repeated_scale,
rnn_outputs=rnn_outputs,
scale=repeated_scale,
)
# (batch_size, 1, target_dim)
@@ -524,7 +531,12 @@ class DeepVARPredictionNetwork(DeepVARTrainingNetwork):
# (batch_size, num_samples, prediction_length, target_dim)
return samples.reshape(
(-1, self.num_parallel_samples, self.prediction_length, self.target_dim,)
(
-1,
self.num_parallel_samples,
self.prediction_length,
self.target_dim,
)
)
def forward(
+82 -80
View File
@@ -1,73 +1,38 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from abc import ABC, abstractmethod
from typing import NamedTuple
from typing import NamedTuple, Optional
from functools import partial
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from gluonts.core.component import validated
from gluonts.dataset.common import Dataset
from gluonts.dataset.loader import TrainDataLoader, ValidationDataLoader
from gluonts.model.estimator import Estimator
from gluonts.torch.model.predictor import PyTorchPredictor
from gluonts.torch.batchify import batchify
from gluonts.transform import SelectFields, Transformation
from pts import Trainer
from pts.dataset import Dataset, TransformedIterableDataset
from pts.transform import Transformation
from .predictor import Predictor
from .utils import get_module_forward_input_names
class Estimator(ABC):
prediction_length: int
freq: str
@abstractmethod
def train(self, training_data: Dataset) -> Predictor:
pass
class DummyEstimator(Estimator):
"""
An `Estimator` that, upon training, simply returns a pre-constructed
`Predictor`.
Parameters
----------
predictor_cls
`Predictor` class to instantiate.
**kwargs
Keyword arguments to pass to the predictor constructor.
"""
def __init__(self, predictor_cls: type, **kwargs) -> None:
self.predictor = predictor_cls(**kwargs)
def train(self, training_data: Dataset) -> Predictor:
return self.predictor
from pts.model import get_module_forward_input_names
class TrainOutput(NamedTuple):
transformation: Transformation
trained_net: nn.Module
predictor: Predictor
predictor: PyTorchPredictor
class PTSEstimator(Estimator):
def __init__(self, trainer: Trainer, dtype: np.dtype = np.float32) -> None:
class PyTorchEstimator(Estimator):
@validated()
def __init__(
self, trainer: Trainer, lead_time: int = 0, dtype: np.dtype = np.float32
) -> None:
super().__init__(lead_time=lead_time)
self.trainer = trainer
self.dtype = dtype
@abstractmethod
def create_transformation(self) -> Transformation:
"""
Create and return the transformation needed for training and inference.
@@ -78,9 +43,8 @@ class PTSEstimator(Estimator):
The transformation that will be applied entry-wise to datasets,
at training and inference time.
"""
pass
raise NotImplementedError
@abstractmethod
def create_training_network(self, device: torch.device) -> nn.Module:
"""
Create and return the network used for training (i.e., computing the
@@ -91,15 +55,14 @@ class PTSEstimator(Estimator):
nn.Module
The network that computes the loss given input data.
"""
pass
raise NotImplementedError
@abstractmethod
def create_predictor(
self,
transformation: Transformation,
trained_network: nn.Module,
device: torch.device,
) -> Predictor:
) -> PyTorchPredictor:
"""
Create and return a predictor object.
@@ -108,32 +71,56 @@ class PTSEstimator(Estimator):
Predictor
A predictor wrapping a `nn.Module` used for inference.
"""
pass
raise NotImplementedError
def train_model(self, training_data: Dataset) -> TrainOutput:
def train_model(
self,
training_data: Dataset,
validation_data: Optional[Dataset] = None,
num_workers: Optional[int] = None,
num_prefetch: Optional[int] = None,
shuffle_buffer_length: Optional[int] = None,
**kwargs,
) -> TrainOutput:
transformation = self.create_transformation()
transformation.estimate(iter(training_data))
training_iter_dataset = TransformedIterableDataset(
dataset=training_data,
is_train=True,
transform=transformation
)
training_data_loader = DataLoader(
training_iter_dataset,
batch_size=self.trainer.batch_size,
num_workers=self.trainer.num_workers,
pin_memory=self.trainer.pin_memory
)
# ensure that the training network is created on the same device
trained_net = self.create_training_network(self.trainer.device)
input_names = get_module_forward_input_names(trained_net)
training_data_loader = TrainDataLoader(
dataset=training_data,
transform=transformation + SelectFields(input_names),
batch_size=self.trainer.batch_size,
stack_fn=partial(
batchify,
device=self.trainer.device,
),
num_workers=num_workers,
num_prefetch=num_prefetch,
shuffle_buffer_length=shuffle_buffer_length,
**kwargs,
)
validation_data_loader = None
if validation_data is not None:
validation_data_loader = ValidationDataLoader(
dataset=validation_data,
transform=transformation + SelectFields(input_names),
batch_size=self.trainer.batch_size,
stack_fn=partial(
batchify,
device=self.trainer.device,
),
num_workers=num_workers,
num_prefetch=num_prefetch,
**kwargs,
)
self.trainer(
net=trained_net,
input_names=get_module_forward_input_names(trained_net),
data_loader=training_data_loader,
train_iter=training_data_loader,
validation_iter=validation_data_loader,
)
return TrainOutput(
@@ -144,5 +131,20 @@ class PTSEstimator(Estimator):
),
)
def train(self, training_data: Dataset) -> Predictor:
return self.train_model(training_data).predictor
def train(
self,
training_data: Dataset,
validation_data: Optional[Dataset] = None,
num_workers: Optional[int] = None,
num_prefetch: Optional[int] = None,
shuffle_buffer_length: Optional[int] = None,
**kwargs,
) -> PyTorchPredictor:
return self.train_model(
training_data,
validation_data,
num_workers,
num_prefetch,
shuffle_buffer_length,
**kwargs,
).predictor
-552
View File
@@ -1,552 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from abc import ABC, abstractmethod
from enum import Enum
from typing import Dict, List, Optional, Set, Union, Callable
import numpy as np
import pandas as pd
import torch
from pydantic import BaseModel, Field
from torch.distributions import Distribution
from .quantile import Quantile
class OutputType(str, Enum):
    """
    Sections that can be included when a forecast is serialized.

    ``Forecast.as_json_dict`` checks these members against
    ``Config.output_types`` to decide which keys to emit.
    """

    mean = "mean"
    samples = "samples"
    quantiles = "quantiles"
class Config(BaseModel):
    """
    Serialization options consumed by ``Forecast.as_json_dict``.
    """

    # Defaults to 100; "num_eval_samples" is registered as the field's alias.
    num_samples: int = Field(100, alias="num_eval_samples")
    # Sections (see ``OutputType``) to include in the serialized forecast.
    output_types: Set[OutputType] = {"quantiles", "mean"}
    # Quantile levels to serialize, given as strings and parsed with
    # ``Quantile.parse`` when the forecast is serialized.
    # FIXME: validate list elements
    quantiles: List[str] = ["0.1", "0.5", "0.9"]

    class Config:
        # NOTE(review): presumably allows ``num_samples`` to be set by its
        # field name as well as by its alias — confirm against pydantic docs.
        allow_population_by_field_name = True
        # store additional fields
        extra = "allow"
class Forecast(ABC):
    """
    Abstract base class of forecast objects.

    Subclasses supply ``start_date``, ``freq``, ``prediction_length`` and
    ``mean`` and implement :meth:`quantile`. The time index, plotting and
    JSON serialization defined here are built on top of those.
    """

    start_date: pd.Timestamp
    freq: str
    item_id: Optional[str]
    info: Optional[Dict]
    prediction_length: int
    mean: np.ndarray
    # Cache for the ``index`` property; built on first access.
    _index = None

    @abstractmethod
    def quantile(self, q: Union[float, str]) -> np.ndarray:
        """
        Computes a quantile from the predicted distribution.

        Parameters
        ----------
        q
            Quantile to compute.

        Returns
        -------
        numpy.ndarray
            Value of the quantile across the prediction range.
        """
        pass

    def quantile_ts(self, q: Union[float, str]) -> pd.Series:
        """Return the quantile ``q`` as a ``pd.Series`` over the forecast index."""
        return pd.Series(data=self.quantile(q), index=self.index)

    @property
    def median(self) -> np.ndarray:
        """The 0.5 quantile of the forecast."""
        return self.quantile(0.5)

    def plot(
        self,
        prediction_intervals=(50.0, 90.0),
        show_mean=False,
        color="b",
        label=None,
        output_file=None,
        *args,
        **kwargs,
    ):
        """
        Plots the median of the forecast as well as confidence bounds.
        (requires matplotlib and pandas).

        Parameters
        ----------
        prediction_intervals : float or list of floats in [0, 100]
            Confidence interval size(s). If a list, it will stack the error
            plots for each confidence interval. Only relevant for error styles
            with "ci" in the name.
        show_mean : boolean
            Whether to also show the mean of the forecast.
        color : matplotlib color name or dictionary
            The color used for plotting the forecast.
        label : string
            A label (prefix) that is used for the forecast
        output_file : str or None, default None
            Output path for the plot file. If None, plot is not saved to file.
        args :
            Other arguments are passed to main plot() call
        kwargs :
            Other keyword arguments are passed to main plot() call
        """
        # matplotlib is imported locally so that it is only required when
        # plotting is actually used.
        import matplotlib.pyplot as plt

        # A single number is documented as valid input: treat it as a
        # one-element collection instead of failing when iterating over it.
        if isinstance(prediction_intervals, (int, float)):
            prediction_intervals = (prediction_intervals,)

        label_prefix = "" if label is None else label + "-"

        for c in prediction_intervals:
            assert 0.0 <= c <= 100.0

        # Percentiles to evaluate: the median plus both ends of each interval.
        ps = [50.0] + [
            50.0 + f * c / 2.0 for c in prediction_intervals for f in [-1.0, +1.0]
        ]
        percentiles_sorted = sorted(set(ps))

        def alpha_for_percentile(p):
            return (p / 100.0) ** 0.3

        ps_data = [self.quantile(p / 100.0) for p in percentiles_sorted]
        i_p50 = len(percentiles_sorted) // 2

        p50_data = ps_data[i_p50]
        p50_series = pd.Series(data=p50_data, index=self.index)
        p50_series.plot(color=color, ls="-", label=f"{label_prefix}median")

        if show_mean:
            # Use the ``mean`` attribute that every forecast provides; the
            # sorted-samples array only exists on sample-based forecasts.
            mean_data = self.mean
            pd.Series(data=mean_data, index=self.index).plot(
                color=color,
                ls=":",
                label=f"{label_prefix}mean",
                *args,
                **kwargs,
            )

        # Fill between matching lower/upper percentiles, outermost pair first.
        for i in range(len(percentiles_sorted) // 2):
            ptile = percentiles_sorted[i]
            alpha = alpha_for_percentile(ptile)
            plt.fill_between(
                self.index,
                ps_data[i],
                ps_data[-i - 1],
                facecolor=color,
                alpha=alpha,
                interpolate=True,
                *args,
                **kwargs,
            )
            # Hack to create labels for the error intervals.
            # Doesn't actually plot anything, because we only pass a single data point
            pd.Series(data=p50_data[:1], index=self.index[:1]).plot(
                color=color,
                alpha=alpha,
                linewidth=10,
                label=f"{label_prefix}{100 - ptile * 2}%",
                *args,
                **kwargs,
            )

        if output_file:
            plt.savefig(output_file)

    @property
    def index(self) -> pd.DatetimeIndex:
        """Time index of the forecast, built once from start date, length and frequency."""
        if self._index is None:
            self._index = pd.date_range(
                self.start_date, periods=self.prediction_length, freq=self.freq
            )
        return self._index

    def as_json_dict(self, config: "Config") -> dict:
        """
        Serialize the forecast into a dict of plain lists.

        Only the sections listed in ``config.output_types`` are emitted. The
        "samples" entry is an empty list here; subclasses that hold samples
        fill it in.
        """
        result = {}

        if OutputType.mean in config.output_types:
            result["mean"] = self.mean.tolist()

        if OutputType.quantiles in config.output_types:
            quantiles = map(Quantile.parse, config.quantiles)
            result["quantiles"] = {
                quantile.name: self.quantile(quantile.value).tolist()
                for quantile in quantiles
            }

        if OutputType.samples in config.output_types:
            result["samples"] = []

        return result
class SampleForecast(Forecast):
    """
    A `Forecast` whose predicted distribution is stored as a set of samples.

    Parameters
    ----------
    samples
        Array of size (num_samples, prediction_length); a third trailing
        axis is accepted for multivariate targets.
    start_date
        start of the forecast
    freq
        forecast frequency
    item_id
        optional identifier stored on the forecast
    info
        additional information that the forecaster may provide e.g. estimated
        parameters, number of iterations ran etc.
    """

    def __init__(
        self,
        samples: Union[torch.Tensor, np.ndarray],
        start_date: pd.Timestamp,
        freq: str,
        item_id: Optional[str] = None,
        info: Optional[Dict] = None,
    ) -> None:
        assert isinstance(
            samples, (np.ndarray, torch.Tensor)
        ), "samples should be either a numpy array or an torch tensor"

        num_axes = len(np.shape(samples))
        assert num_axes in (
            2,
            3,
        ), "samples should be a 2-dimensional or 3-dimensional array. Dimensions found: {}".format(
            num_axes
        )

        # Samples are always kept as a numpy array; tensors are moved to the
        # CPU and converted.
        if isinstance(samples, np.ndarray):
            self.samples = samples
        else:
            self.samples = samples.cpu().numpy()

        self._sorted_samples_value = None
        self._mean = None
        self._dim = None
        self.item_id = item_id
        self.info = info

        assert isinstance(
            start_date, pd.Timestamp
        ), "start_date should be a pandas Timestamp object"
        self.start_date = start_date

        assert isinstance(freq, str), "freq should be a string"
        self.freq = freq

    @property
    def _sorted_samples(self):
        # Sorted along the sample axis once, then reused by ``quantile``.
        if self._sorted_samples_value is None:
            self._sorted_samples_value = np.sort(self.samples, axis=0)
        return self._sorted_samples_value

    @property
    def num_samples(self):
        """
        Number of samples held by this forecast (size of the first axis).
        """
        return self.samples.shape[0]

    @property
    def prediction_length(self):
        """
        Number of forecast time steps (size of the second axis).
        """
        return self.samples.shape[1]

    @property
    def mean(self) -> np.ndarray:
        """
        Mean over the sample axis.
        """
        if self._mean is None:
            return np.mean(self.samples, axis=0)
        return self._mean

    @property
    def mean_ts(self) -> pd.Series:
        """
        The forecast mean wrapped in a pandas.Series over the forecast index.
        """
        return pd.Series(data=self.mean, index=self.index)

    def quantile(self, q: Union[float, str]) -> np.ndarray:
        """Return the sorted sample whose rank is closest to quantile level ``q``."""
        level = Quantile.parse(q).value
        sample_idx = int(np.round((self.num_samples - 1) * level))
        return self._sorted_samples[sample_idx, :]

    def copy_dim(self, dim: int) -> "SampleForecast":
        """
        Return a new forecast restricted to one target dimension.

        Parameters
        ----------
        dim
            Index of the target dimension to keep. Samples without a target
            axis are passed through unchanged.
        """
        if len(self.samples.shape) == 2:
            selected = self.samples
        else:
            target_dim = self.samples.shape[2]
            assert dim < target_dim, (
                f"must set 0 <= dim < target_dim, but got dim={dim},"
                f" target_dim={target_dim}"
            )
            selected = self.samples[:, :, dim]

        return SampleForecast(
            samples=selected,
            start_date=self.start_date,
            freq=self.freq,
            item_id=self.item_id,
            info=self.info,
        )

    def copy_aggregate(self, agg_fun: Callable) -> "SampleForecast":
        """
        Return a new forecast with the target axis reduced by ``agg_fun``.

        Parameters
        ----------
        agg_fun
            Aggregation function called as ``agg_fun(samples, axis=2)``
            (typically mean or sum). Samples without a target axis are passed
            through unchanged.
        """
        is_univariate = len(self.samples.shape) == 2
        # Aggregate over the target dimension axis only when there is one.
        aggregated = self.samples if is_univariate else agg_fun(self.samples, axis=2)

        return SampleForecast(
            samples=aggregated,
            start_date=self.start_date,
            freq=self.freq,
            item_id=self.item_id,
            info=self.info,
        )

    def dim(self) -> int:
        """
        Return the number of target dimensions of the forecast.
        """
        if self._dim is not None:
            return self._dim
        if len(self.samples.shape) == 2:
            # univariate target, shape: (num_samples, prediction_length)
            return 1
        # multivariate target, shape: (num_samples, prediction_length, target_dim)
        return self.samples.shape[2]

    def as_json_dict(self, config: "Config") -> dict:
        """Extend the base serialization with the raw samples when requested."""
        result = super().as_json_dict(config)

        if OutputType.samples in config.output_types:
            result["samples"] = self.samples.tolist()

        return result

    def __repr__(self):
        pieces = (
            f"SampleForecast({self.samples!r})",
            f"{self.start_date!r}",
            f"{self.freq!r}",
            f"item_id={self.item_id!r}",
            f"info={self.info!r})",
        )
        return ", ".join(pieces)
class QuantileForecast(Forecast):
    """
    A Forecast that stores one array (i.e. time series) per quantile and,
    optionally, one for the mean.

    Parameters
    ----------
    forecast_arrays
        An array of forecasts; its first axis is aligned with ``forecast_keys``.
    start_date
        start of the forecast
    freq
        forecast frequency
    forecast_keys
        A list of quantiles of the form '0.1', '0.9', etc.,
        and potentially 'mean'. Each entry corresponds to one array in
        forecast_arrays.
    item_id
        optional identifier stored on the forecast
    info
        additional information that the forecaster may provide e.g. estimated
        parameters, number of iterations ran etc.
    """

    def __init__(
        self,
        forecast_arrays: np.ndarray,
        start_date: pd.Timestamp,
        freq: str,
        forecast_keys: List[str],
        item_id: Optional[str] = None,
        info: Optional[Dict] = None,
    ) -> None:
        self.forecast_array = forecast_arrays
        # NOTE(review): passing ``freq`` to ``pd.Timestamp`` is not accepted by
        # newer pandas releases — confirm the supported pandas versions.
        self.start_date = pd.Timestamp(start_date, freq=freq)
        self.freq = freq

        # Normalize quantile keys to their canonical name; "mean" is kept as is.
        normalized_keys = []
        for key in forecast_keys:
            normalized_keys.append(key if key == "mean" else Quantile.from_str(key).name)
        self.forecast_keys = normalized_keys

        self.item_id = item_id
        self.info = info
        self._dim = None

        shape = self.forecast_array.shape
        assert shape[0] == len(self.forecast_keys), (
            f"The forecast_array (shape={shape} should have the same "
            f"length as the forecast_keys (len={len(self.forecast_keys)})."
        )
        self.prediction_length = shape[-1]

        # Map every key to its slice along the first axis.
        self._forecast_dict = {}
        for position, key in enumerate(self.forecast_keys):
            self._forecast_dict[key] = self.forecast_array[position]

        # Returned for keys that were not provided.
        self._nan_out = np.full(self.prediction_length, np.nan)

    def quantile(self, q: Union[float, str]) -> np.ndarray:
        """Return the stored array for quantile ``q``, or NaNs if it is missing."""
        q_str = Quantile.parse(q).name
        # We return nan here such that evaluation runs through
        return self._forecast_dict.get(q_str, self._nan_out)

    @property
    def mean(self) -> np.ndarray:
        """
        The stored "mean" array, or NaNs if no mean was provided.
        """
        return self._forecast_dict.get("mean", self._nan_out)

    def dim(self) -> int:
        """
        Return the number of target dimensions of the forecast.
        """
        if self._dim is not None:
            return self._dim
        array_shape = self.forecast_array.shape
        if len(array_shape) == 2:
            # 1D target. shape: (num_samples, prediction_length)
            return 1
        # 2D target. shape: (num_samples, target_dim, prediction_length)
        return array_shape[1]

    def __repr__(self):
        pieces = (
            f"QuantileForecast({self.forecast_array!r})",
            f"start_date={self.start_date!r}",
            f"freq={self.freq!r}",
            f"forecast_keys={self.forecast_keys!r}",
            f"item_id={self.item_id!r}",
            f"info={self.info!r})",
        )
        return ", ".join(pieces)
class DistributionForecast(Forecast):
    """
    A `Forecast` object that wraps a distribution directly.

    This can for instance be used to represent marginal probability
    distributions for each time point -- although joint distributions are
    also possible, e.g. when using MultiVariateGaussian).

    Parameters
    ----------
    distribution
        Distribution object. This should represent the entire prediction
        length, i.e., if we draw `num_samples` samples from the distribution,
        the sample shape should be

        samples = trans_dist.sample(num_samples)
        samples.shape -> (num_samples, prediction_length)
    start_date
        start of the forecast
    freq
        forecast frequency
    item_id
        optional identifier stored on the forecast
    info
        additional information that the forecaster may provide e.g. estimated
        parameters, number of iterations ran etc.
    """

    def __init__(
        self,
        distribution: Distribution,
        start_date: pd.Timestamp,
        freq: str,
        item_id: Optional[str] = None,
        info: Optional[Dict] = None,
    ) -> None:
        self.distribution = distribution
        # Batch shape followed by event shape; its first entry is taken as
        # the prediction length.
        batch_shape = self.distribution.batch_shape
        event_shape = self.distribution.event_shape
        self.shape = batch_shape + event_shape
        self.prediction_length = self.shape[0]
        self.item_id = item_id
        self.info = info

        assert isinstance(
            start_date, pd.Timestamp
        ), "start_date should be a pandas Timestamp object"
        self.start_date = start_date

        assert isinstance(freq, str), "freq should be a string"
        self.freq = freq
        self._mean = None

    @property
    def mean(self) -> np.ndarray:
        """
        Mean of the distribution as a numpy array; computed once and cached.
        """
        if self._mean is None:
            self._mean = self.distribution.mean.cpu().numpy()
        return self._mean

    @property
    def mean_ts(self) -> pd.Series:
        """
        The forecast mean wrapped in a pandas.Series over the forecast index.
        """
        return pd.Series(data=self.mean, index=self.index)

    def quantile(self, level: Union[float, str]) -> np.ndarray:
        """Evaluate the distribution's inverse CDF at quantile ``level``."""
        parsed_level = Quantile.parse(level).value
        level_tensor = torch.tensor([parsed_level])
        return self.distribution.icdf(level_tensor).cpu().numpy()

    def to_sample_forecast(self, num_samples: int = 200) -> SampleForecast:
        """Draw ``num_samples`` samples and wrap them in a `SampleForecast`."""
        drawn = self.distribution.sample((num_samples,))
        return SampleForecast(
            samples=drawn,
            start_date=self.start_date,
            freq=self.freq,
            item_id=self.item_id,
            info=self.info,
        )
-195
View File
@@ -1,195 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from abc import ABC, abstractmethod
from typing import Any, Callable, Iterator, List, Optional
import numpy as np
import torch
import torch.nn as nn
from pts.core.component import validated
from pts.dataset import InferenceDataLoader, DataEntry, FieldName
from pts.modules import DistributionOutput
from .forecast import Forecast, DistributionForecast, QuantileForecast, SampleForecast
OutputTransform = Callable[[DataEntry, np.ndarray], np.ndarray]
def _extract_instances(x: Any) -> Iterator[Any]:
    """
    Lazily split a batched network output into per-instance pieces.

    For an array or tensor `a`
        _extract_instances(a) -> a[0], a[1], ...

    For (nested) tuples or lists of tensors `(a, (b, c))`
        _extract_instances((a, (b, c))) -> (a[0], (b[0], c[0])), (a[1], (b[1], c[1])), ...

    The container type (tuple or list) of every nesting level is preserved.
    ``None`` produces an endless stream of ``None`` so that it can sit next
    to real tensors inside a tuple or list; the enclosing ``zip`` stops when
    the shortest sibling is exhausted.

    Parameters
    ----------
    x
        A ``np.ndarray``, a ``torch.Tensor``, ``None``, or a tuple/list of
        these (arbitrarily nested).

    Returns
    -------
    Iterator
        One item per index along the first axis of the contained arrays.

    Raises
    ------
    AssertionError
        If ``x`` (or a nested element) is of an unsupported type.
    """
    if isinstance(x, (np.ndarray, torch.Tensor)):
        for i in range(x.shape[0]):
            yield x[i]
    elif isinstance(x, tuple):
        # zip() already produces tuples, no copy needed
        yield from zip(*(_extract_instances(y) for y in x))
    elif isinstance(x, list):
        for members in zip(*(_extract_instances(y) for y in x)):
            yield list(members)
    elif x is None:
        while True:
            yield None
    else:
        # Raised explicitly (rather than `assert False`) so that unsupported
        # input is still reported when Python runs with -O.
        raise AssertionError(
            f"cannot extract instances from object of type {type(x).__name__}"
        )
class ForecastGenerator(ABC):
    """
    Abstract callable that turns the output of a prediction network into a
    stream of ``Forecast`` objects.
    """

    @abstractmethod
    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[Forecast]:
        """
        Produce forecasts for the batches of ``inference_data_loader``.

        Parameters
        ----------
        inference_data_loader
            Source of input batches.
        prediction_net
            Network that is called to obtain the raw outputs.
        input_names
            Names of the batch entries handed to ``prediction_net``.
        freq
            Frequency string attached to the forecasts.
        output_transform
            Optional callable applied to ``(batch, outputs)``.
        num_samples
            Optional number of samples requested per forecast.
        """
class DistributionForecastGenerator(ForecastGenerator):
    """
    Forecast generator that interprets the network output as distribution
    arguments and yields one ``DistributionForecast`` per batch instance.
    """

    def __init__(self, distr_output: DistributionOutput) -> None:
        self.distr_output = distr_output

    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[DistributionForecast]:
        """
        Run ``prediction_net`` on every batch and yield the resulting
        distribution forecasts. ``num_samples`` is accepted for interface
        compatibility but not used here.
        """
        for batch in inference_data_loader:
            net_args = [batch[name] for name in input_names]
            outputs = prediction_net(*net_args)
            if output_transform is not None:
                outputs = output_transform(batch, outputs)

            # every distribution of the batch is built before the first yield
            distributions = [
                self.distr_output.distribution(*args)
                for args in _extract_instances(outputs)
            ]

            emitted = 0
            for idx, distr in enumerate(distributions):
                emitted = idx + 1
                start = batch["forecast_start"][idx]
                if FieldName.ITEM_ID in batch:
                    item_id = batch[FieldName.ITEM_ID][idx]
                else:
                    item_id = None
                if "info" in batch:
                    info = batch["info"][idx]
                else:
                    info = None
                yield DistributionForecast(
                    distr,
                    start_date=start,
                    freq=freq,
                    item_id=item_id,
                    info=info,
                )
            # exactly one forecast per entry of the batch
            assert emitted == len(batch["forecast_start"])
class QuantileForecastGenerator(ForecastGenerator):
    """
    Forecast generator that wraps each element of the network output in a
    ``QuantileForecast`` keyed by a fixed list of quantile names.
    """

    def __init__(self, quantiles: List[str]) -> None:
        self.quantiles = quantiles

    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[Forecast]:
        """
        Run ``prediction_net`` on every batch, convert the result to numpy
        and yield one ``QuantileForecast`` per element. ``num_samples`` is
        accepted for interface compatibility but not used here.
        """
        for batch in inference_data_loader:
            net_args = [batch[name] for name in input_names]
            outputs = prediction_net(*net_args).cpu().numpy()
            if output_transform is not None:
                outputs = output_transform(batch, outputs)

            emitted = 0
            for idx, output in enumerate(outputs):
                emitted = idx + 1
                start = batch["forecast_start"][idx]
                if FieldName.ITEM_ID in batch:
                    item_id = batch[FieldName.ITEM_ID][idx]
                else:
                    item_id = None
                if "info" in batch:
                    info = batch["info"][idx]
                else:
                    info = None
                yield QuantileForecast(
                    output,
                    start_date=start,
                    freq=freq,
                    item_id=item_id,
                    info=info,
                    forecast_keys=self.quantiles,
                )
            # exactly one forecast per entry of the batch
            assert emitted == len(batch["forecast_start"])
class SampleForecastGenerator(ForecastGenerator):
    """
    Forecast generator that treats the network output as sample paths and
    yields one ``SampleForecast`` per batch instance.
    """

    @validated()
    def __init__(self):
        pass

    @staticmethod
    def _run_network(prediction_net, net_args, batch, output_transform):
        # One forward pass converted to numpy, followed by the optional transform.
        raw = prediction_net(*net_args).cpu().numpy()
        if output_transform is not None:
            raw = output_transform(batch, raw)
        return raw

    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[Forecast]:
        """
        Run ``prediction_net`` on every batch and yield sample forecasts.

        When ``num_samples`` is truthy, the network is re-run on the same
        batch until the first instance has accumulated at least that many
        entries along its leading axis; the per-instance results are then
        concatenated and truncated to exactly ``num_samples``.
        """
        for batch in inference_data_loader:
            net_args = [batch[name] for name in input_names]
            outputs = self._run_network(
                prediction_net, net_args, batch, output_transform
            )

            if num_samples:
                gathered = [outputs]
                total = outputs[0].shape[0]
                while total < num_samples:
                    outputs = self._run_network(
                        prediction_net, net_args, batch, output_transform
                    )
                    gathered.append(outputs)
                    total += outputs[0].shape[0]
                # zip regroups the passes per instance before concatenating
                outputs = [
                    np.concatenate(parts)[:num_samples]
                    for parts in zip(*gathered)
                ]
                assert len(outputs[0]) == num_samples

            emitted = 0
            for idx, output in enumerate(outputs):
                emitted = idx + 1
                start = batch["forecast_start"][idx]
                if FieldName.ITEM_ID in batch:
                    item_id = batch[FieldName.ITEM_ID][idx]
                else:
                    item_id = None
                if "info" in batch:
                    info = batch["info"][idx]
                else:
                    info = None
                yield SampleForecast(
                    output,
                    start_date=start,
                    freq=freq,
                    item_id=item_id,
                    info=info,
                )
            # exactly one forecast per entry of the batch
            assert emitted == len(batch["forecast_start"])
+4 -4
View File
@@ -6,7 +6,7 @@ import torch.nn as nn
from pts import Trainer
from pts.dataset import FieldName
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
from pts.model import PyTorchEstimator, Predictor, PyTorchPredictor, copy_parameters
from pts.transform import (
InstanceSplitter,
Transformation,
@@ -19,7 +19,7 @@ from pts.transform import (
from .lstnet_network import LSTNetTrain, LSTNetPredict
class LSTNetEstimator(PTSEstimator):
class LSTNetEstimator(PyTorchEstimator):
def __init__(
self,
freq: str,
@@ -110,7 +110,7 @@ class LSTNetEstimator(PTSEstimator):
transformation: Transformation,
trained_network: LSTNetTrain,
device: torch.device,
) -> PTSPredictor:
) -> PyTorchPredictor:
prediction_network = LSTNetPredict(
num_series=self.num_series,
channels=self.channels,
@@ -131,7 +131,7 @@ class LSTNetEstimator(PTSEstimator):
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
+4 -4
View File
@@ -110,7 +110,7 @@ class LSTNetBase(nn.Module):
) -> torch.Tensor:
scaled_past_target, scale = self.scaler(
past_target[..., -self.context_length :], # [B, C, T]
past_observed_values[..., -self.context_length :] # [B, C, T]
past_observed_values[..., -self.context_length :], # [B, C, T]
)
# CNN
@@ -121,7 +121,7 @@ class LSTNetBase(nn.Module):
# RNN
r = c.permute(2, 0, 1) # [F (T), B, C]
_, r = self.rnn(r) # [1, B, H]
r = self.dropout(r.squeeze(0)) # [B, H]
r = self.dropout(r.squeeze(0)) # [B, H]
# Skip-RNN
skip_c = c[..., -self.conv_skip * self.skip_size :]
@@ -174,7 +174,7 @@ class LSTNetTrain(LSTNetBase):
if self.horizon:
future_target = future_target[..., -1:]
loss = self.loss_fn(ret*scale, future_target)
loss = self.loss_fn(ret * scale, future_target)
return loss
@@ -183,6 +183,6 @@ class LSTNetPredict(LSTNetBase):
self, past_target: torch.Tensor, past_observed_values: torch.Tensor
) -> torch.Tensor:
ret, scale = super().forward(past_target, past_observed_values)
ret = (ret*scale).permute(0, 2, 1)
ret = (ret * scale).permute(0, 2, 1)
return ret.unsqueeze(1)
+1
View File
@@ -164,6 +164,7 @@ class NBEATSEnsembleEstimator(Estimator):
**kwargs
Arguments passed down to the individual estimators.
"""
def __init__(
self,
freq: str,
+17 -14
View File
@@ -5,7 +5,7 @@ import torch.nn as nn
from pts import Trainer
from pts.dataset import FieldName
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
from pts.model import PyTorchEstimator, Predictor, PyTorchPredictor, copy_parameters
from pts.transform import (
InstanceSplitter,
Transformation,
@@ -20,7 +20,7 @@ from .n_beats_network import (
)
class NBEATSEstimator(PTSEstimator):
class NBEATSEstimator(PyTorchEstimator):
def __init__(
self,
freq: str,
@@ -124,10 +124,14 @@ class NBEATSEstimator(PTSEstimator):
# conditioning part and a to-predict part, for each training example.
def create_transformation(self) -> Transformation:
return Chain(
[ RemoveFields(
field_names=[FieldName.FEAT_STATIC_REAL,
FieldName.FEAT_DYNAMIC_REAL,
FieldName.FEAT_DYNAMIC_CAT]),
[
RemoveFields(
field_names=[
FieldName.FEAT_STATIC_REAL,
FieldName.FEAT_DYNAMIC_REAL,
FieldName.FEAT_DYNAMIC_CAT,
]
),
InstanceSplitter(
target_field=FieldName.TARGET,
is_pad_field=FieldName.IS_PAD,
@@ -137,11 +141,11 @@ class NBEATSEstimator(PTSEstimator):
past_length=self.context_length,
future_length=self.prediction_length,
time_series_fields=[],
)
),
]
)
def create_training_network(self, device: torch.device) -> NBEATSTrainingNetwork:
def create_training_network(self, device: torch.device) -> NBEATSTrainingNetwork:
return NBEATSTrainingNetwork(
prediction_length=self.prediction_length,
context_length=self.context_length,
@@ -156,10 +160,9 @@ class NBEATSEstimator(PTSEstimator):
freq=self.freq,
).to(device)
def create_predictor(
self,
transformation: Transformation,
self,
transformation: Transformation,
trained_network: nn.Module,
device: torch.device,
) -> Predictor:
@@ -172,12 +175,12 @@ class NBEATSEstimator(PTSEstimator):
num_block_layers=self.num_block_layers,
expansion_coefficient_lengths=self.expansion_coefficient_lengths,
sharing=self.sharing,
stack_types=self.stack_types
stack_types=self.stack_types,
).to(device)
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
+8 -6
View File
@@ -258,7 +258,8 @@ class NBEATSNetwork(nn.Module):
flag = denominator == 0
return (200 / self.prediction_length) * torch.mean(
(torch.abs(future_target - forecast) * torch.logical_not(flag)) / (denominator + flag),
(torch.abs(future_target - forecast) * torch.logical_not(flag))
/ (denominator + flag),
dim=1,
)
@@ -269,7 +270,8 @@ class NBEATSNetwork(nn.Module):
flag = denominator == 0
return (100 / self.prediction_length) * torch.mean(
(torch.abs(future_target - forecast) * torch.logical_not(flag)) / (denominator + flag),
(torch.abs(future_target - forecast) * torch.logical_not(flag))
/ (denominator + flag),
dim=1,
)
@@ -292,9 +294,10 @@ class NBEATSNetwork(nn.Module):
)
flag = seasonal_error == 0
return (torch.mean(torch.abs(future_target - forecast), dim=1) * torch.logical_not(flag)) / (
seasonal_error + flag
)
return (
torch.mean(torch.abs(future_target - forecast), dim=1)
* torch.logical_not(flag)
) / (seasonal_error + flag)
class NBEATSTrainingNetwork(NBEATSNetwork):
@@ -342,4 +345,3 @@ class NBEATSPredictionNetwork(NBEATSNetwork):
forecasts = super().forward(past_target=past_target)
return forecasts.unsqueeze(1)
-190
View File
@@ -1,190 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import json
from abc import ABC, abstractmethod
from pathlib import Path
from pydoc import locate
from typing import Iterator, Callable, Optional
import numpy as np
import torch
import torch.nn as nn
import pts
from pts.core.serde import dump_json, fqname_for, load_json
from pts.dataset import Dataset, DataEntry, InferenceDataLoader
from pts.transform import Transformation
from .forecast import Forecast
from .forecast_generator import ForecastGenerator, SampleForecastGenerator
from .utils import get_module_forward_input_names
OutputTransform = Callable[[DataEntry, np.ndarray], np.ndarray]
class Predictor(ABC):
    """
    Abstract base class of all predictors.

    Holds the prediction length and frequency, and implements the
    type-dispatching part of (de)serialization: ``serialize`` records the
    fully qualified class name in ``type.txt`` and ``deserialize`` reads it
    back and delegates to that class.
    """

    __version__: str = pts.__version__

    def __init__(self, prediction_length: int, freq: str) -> None:
        self.prediction_length = prediction_length
        self.freq = freq

    @abstractmethod
    def predict(self, dataset: Dataset, **kwargs) -> Iterator[Forecast]:
        pass

    def serialize(self, path: Path) -> None:
        """
        Write ``type.txt`` (fully qualified class name) and ``version.json``
        (model and ``pts`` versions) into the directory ``path``.
        """
        # serialize Predictor type
        with (path / "type.txt").open("w") as fp:
            fp.write(fqname_for(self.__class__))
        with (path / "version.json").open("w") as fp:
            json.dump(
                {"model": self.__version__, "pts": pts.__version__}, fp
            )

    @classmethod
    def deserialize(
        cls, path: Path, device: Optional[torch.device] = None
    ) -> "Predictor":
        """
        Load a serialized predictor from the given path

        Parameters
        ----------
        path
            Path to the serialized files predictor.
        device
            Optional pytorch device to be used with the predictor. It is
            forwarded unchanged to the ``deserialize`` of the concrete
            predictor class, which decides how ``None`` is handled.

        Raises
        ------
        IOError
            If ``type.txt`` does not name a loadable ``Predictor`` subclass
            that provides its own ``deserialize``.
        """
        # deserialize Predictor type; strip so that a trailing newline
        # (e.g. from a hand-edited file) does not break the lookup
        with (path / "type.txt").open("r") as fp:
            type_name = fp.readline().strip()
        tpe = locate(type_name)

        # locate() returns None when nothing is found, and may return a
        # non-class object; issubclass() would raise TypeError on both
        if not isinstance(tpe, type):
            raise IOError(
                f"'{type_name}' (read from {path / 'type.txt'}) "
                f"does not name a loadable class"
            )

        # ensure that predictor_cls is a subtype of Predictor
        if not issubclass(tpe, Predictor):
            raise IOError(
                f"Class {fqname_for(tpe)} is not "
                f"a subclass of {fqname_for(Predictor)}"
            )

        # a class that merely inherits this method would call it again
        # with the same arguments and recurse without bound
        inherited = getattr(tpe.deserialize, "__func__", None)
        if inherited is Predictor.deserialize.__func__:
            raise IOError(
                f"Class {fqname_for(tpe)} does not implement deserialize()"
            )

        # call deserialize() for the concrete Predictor type
        return tpe.deserialize(path, device)
class PTSPredictor(Predictor):
    """
    Predictor backed by a ``torch.nn.Module``.

    Parameters
    ----------
    prediction_net
        Network called at prediction time; the names of its ``forward``
        arguments determine which batch entries are fed to it.
    batch_size
        Batch size handed to the inference data loader.
    prediction_length
        Stored on the base class.
    freq
        Frequency string attached to the forecasts.
    device
        Device handed to the inference data loader.
    input_transform
        Transformation handed to the inference data loader.
    forecast_generator
        Callable that converts network outputs into forecasts.
    output_transform
        Optional callable forwarded to the forecast generator.
    dtype
        Data type handed to the inference data loader.
    """

    def __init__(
        self,
        prediction_net: nn.Module,
        batch_size: int,
        prediction_length: int,
        freq: str,
        device: torch.device,
        input_transform: Transformation,
        # NOTE: the default instance is created once and shared by all
        # predictors; SampleForecastGenerator keeps no state of its own.
        forecast_generator: ForecastGenerator = SampleForecastGenerator(),
        output_transform: Optional[OutputTransform] = None,
        dtype: np.dtype = np.float32,
    ) -> None:
        super().__init__(prediction_length, freq)

        self.input_names = get_module_forward_input_names(prediction_net)
        self.prediction_net = prediction_net
        self.batch_size = batch_size
        self.input_transform = input_transform
        self.forecast_generator = forecast_generator
        self.output_transform = output_transform
        self.device = device
        self.dtype = dtype

    def predict(
        self, dataset: Dataset, num_samples: Optional[int] = None
    ) -> Iterator[Forecast]:
        """
        Yield forecasts for ``dataset``. The network is switched to eval
        mode and run with gradients disabled.
        """
        inference_data_loader = InferenceDataLoader(
            dataset,
            self.input_transform,
            self.batch_size,
            device=self.device,
            dtype=self.dtype,
        )

        self.prediction_net.eval()

        with torch.no_grad():
            yield from self.forecast_generator(
                inference_data_loader=inference_data_loader,
                prediction_net=self.prediction_net,
                input_names=self.input_names,
                freq=self.freq,
                output_transform=self.output_transform,
                num_samples=num_samples,
            )

    def serialize(self, path: Path) -> None:
        """
        Write the predictor into the directory ``path``: type and version
        files (base class), network description and weights, both
        transformations, and the remaining constructor parameters.
        """
        super().serialize(path)

        # serialize network
        model_name = 'prediction_net'
        with (path / f"{model_name}-network.json").open("w") as fp:
            print(dump_json(self.prediction_net), file=fp)
        torch.save(self.prediction_net.state_dict(), path / "prediction_net")

        # serialize input transformation chain
        with (path / "input_transform.json").open("w") as fp:
            print(dump_json(self.input_transform), file=fp)

        # serialize output transformation chain
        with (path / "output_transform.json").open("w") as fp:
            print(dump_json(self.output_transform), file=fp)

        # serialize all remaining constructor parameters
        with (path / "parameters.json").open("w") as fp:
            parameters = dict(
                batch_size=self.batch_size,
                prediction_length=self.prediction_length,
                freq=self.freq,
                dtype=self.dtype,
                forecast_generator=self.forecast_generator,
                input_names=self.input_names,
            )
            print(dump_json(parameters), file=fp)

    @classmethod
    def deserialize(
        cls, path: Path, device: Optional[torch.device] = None
    ) -> "PTSPredictor":
        """
        Rebuild a predictor from the directory written by ``serialize``.

        Parameters
        ----------
        path
            Directory containing the serialized predictor.
        device
            Device to load the network onto. If ``None``, the GPU is used
            when available and the CPU otherwise.
        """
        if device is None:
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )

        # deserialize constructor parameters
        with (path / "parameters.json").open("r") as fp:
            parameters = load_json(fp.read())

        # deserialize transformation chain
        with (path / "input_transform.json").open("r") as fp:
            transformation = load_json(fp.read())

        # NOTE(review): output_transform.json is written by serialize() but
        # is not read back here, so the output transform is not restored.

        # deserialize prediction network
        model_name = 'prediction_net'
        with (path / f"{model_name}-network.json").open("r") as fp:
            prediction_net = load_json(fp.read())
        # map_location lets weights saved from a GPU be loaded on a host
        # without one
        prediction_net.load_state_dict(
            torch.load(path / "prediction_net", map_location=device)
        )
        # load_state_dict copies into the existing parameters, so the module
        # itself still has to be moved to the target device
        prediction_net.to(device)

        # input_names is derived from the prediction_net
        parameters.pop("input_names", None)

        parameters["device"] = device

        return PTSPredictor(
            input_transform=transformation,
            prediction_net=prediction_net,
            **parameters
        )
-98
View File
@@ -1,98 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import re
from typing import NamedTuple, Union
class Quantile(NamedTuple):
    """
    A quantile level held both as a float (``value``) and as a string
    (``name``), with helpers to derive metric names and to parse user input.
    """

    value: float
    name: str

    @property
    def loss_name(self):
        return f"QuantileLoss[{self.name}]"

    @property
    def weighted_loss_name(self):
        return f"wQuantileLoss[{self.name}]"

    @property
    def coverage_name(self):
        return f"Coverage[{self.name}]"

    @classmethod
    def checked(cls, value: float, name: str) -> "Quantile":
        """Build a ``Quantile`` after verifying that ``value`` is in [0, 1]."""
        if not 0 <= value <= 1:
            raise Exception(f"quantile value should be in [0, 1] but found {value}")

        return Quantile(value, name)

    @classmethod
    def from_float(cls, quantile: float) -> "Quantile":
        """Build a ``Quantile`` from a float; the name is ``str(quantile)``."""
        assert isinstance(quantile, float)
        return cls.checked(value=quantile, name=str(quantile))

    @classmethod
    def from_str(cls, quantile: str) -> "Quantile":
        """
        Build a ``Quantile`` from a string that is either a float literal
        (e.g. ``'0.1'``, kept verbatim as the name) or of the form ``'pNN'``
        with exactly two digits (e.g. ``'p10'``).
        """
        assert isinstance(quantile, str)
        try:
            return cls.checked(value=float(quantile), name=quantile)
        except ValueError:
            m = re.match(r"^p(\d{2})$", quantile)

            if m is None:
                # the ValueError from float() adds nothing for the caller
                raise Exception(
                    "Quantile string should be of the form "
                    f'"p10", "p50", ... or "0.1", "0.5", ... but found {quantile}'
                ) from None
            else:
                quantile_float: float = int(m.group(1)) / 100
                return cls(value=quantile_float, name=str(quantile_float))

    @classmethod
    def parse(cls, quantile: Union["Quantile", float, str]) -> "Quantile":
        """Produces equivalent float and string representation of a given
        quantile level.

        >>> Quantile.parse(0.1)
        Quantile(value=0.1, name='0.1')

        >>> Quantile.parse('0.2')
        Quantile(value=0.2, name='0.2')

        >>> Quantile.parse('0.20')
        Quantile(value=0.2, name='0.20')

        >>> Quantile.parse('p99')
        Quantile(value=0.99, name='0.99')

        >>> Quantile.parse(1)
        Quantile(value=1.0, name='1.0')

        Parameters
        ----------
        quantile
            Quantile, can be a float (or int), a str representing a float
            e.g. '0.1', or a quantile string of the form 'p10'.

        Returns
        -------
        Quantile
            A tuple containing both a float and a string representation of the
            input quantile level.
        """
        if isinstance(quantile, Quantile):
            return quantile
        if isinstance(quantile, float):
            return cls.from_float(quantile)
        # plain ints such as 0 or 1 are valid where a float is expected;
        # bool is deliberately not treated as a number here
        if isinstance(quantile, int) and not isinstance(quantile, bool):
            return cls.from_float(float(quantile))
        return cls.from_str(quantile)
-1
View File
@@ -3,4 +3,3 @@ from .simple_feedforward_network import (
SimpleFeedForwardTrainingNetwork,
SimpleFeedForwardPredictionNetwork,
)
@@ -3,29 +3,39 @@ from typing import List, Optional
import torch
import torch.nn as nn
from pts import Trainer
from pts.dataset import FieldName
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
from pts.modules import DistributionOutput, StudentTOutput
from pts.transform import (
from gluonts.torch.support.util import copy_parameters
from gluonts.torch.model.predictor import PyTorchPredictor
from gluonts.torch.modules.distribution_output import DistributionOutput
from gluonts.model.predictor import Predictor
from gluonts.dataset.field_names import FieldName
from gluonts.time_feature import (
TimeFeature,
get_lags_for_frequency,
time_features_from_frequency_str,
)
from gluonts.transform import (
Transformation,
Chain,
InstanceSplitter,
ExpectedNumInstanceSampler,
)
from pts import Trainer
from pts.model import PyTorchEstimator
from pts.modules import StudentTOutput
from .simple_feedforward_network import (
SimpleFeedForwardTrainingNetwork,
SimpleFeedForwardPredictionNetwork,
)
class SimpleFeedForwardEstimator(PTSEstimator):
class SimpleFeedForwardEstimator(PyTorchEstimator):
"""
SimpleFeedForwardEstimator shows how to build a simple MLP model predicting
the next target time-steps given the previous ones.
Given that we want to define a pytorch model trainable by SGD, we inherit the
parent class `PTSEstimator` that handles most of the logic for fitting a
parent class `PyTorchEstimator` that handles most of the logic for fitting a
neural-network.
We thus only have to define:
@@ -148,7 +158,7 @@ class SimpleFeedForwardEstimator(PTSEstimator):
transformation: Transformation,
trained_network: nn.Module,
device: torch.device,
) -> PTSPredictor:
) -> PyTorchPredictor:
prediction_network = SimpleFeedForwardPredictionNetwork(
num_hidden_dimensions=self.num_hidden_dimensions,
prediction_length=self.prediction_length,
@@ -161,7 +171,7 @@ class SimpleFeedForwardEstimator(PTSEstimator):
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
@@ -4,8 +4,10 @@ import torch
import torch.nn as nn
from torch.distributions import Distribution
from pts.core.component import validated
from pts.modules import MeanScaler, NOPScaler, DistributionOutput, LambdaLayer
from gluonts.core.component import validated
from gluonts.torch.modules.distribution_output import DistributionOutput
from gluonts.torch.modules.lambda_layer import LambdaLayer
from pts.modules import MeanScaler, NOPScaler
class SimpleFeedForwardNetworkBase(nn.Module):
@@ -35,6 +37,7 @@ class SimpleFeedForwardNetworkBase(nn.Module):
Distribution to fit.
kwargs
"""
@validated()
def __init__(
self,
@@ -60,7 +63,7 @@ class SimpleFeedForwardNetworkBase(nn.Module):
if i == 0:
input_size = context_length
else:
input_size = dims[i-1]
input_size = dims[i - 1]
modules += [nn.Linear(input_size, units), nn.ReLU()]
if self.batch_normalization:
modules.append(nn.BatchNorm1d(units))
@@ -83,7 +86,7 @@ class SimpleFeedForwardNetworkBase(nn.Module):
past_target,
torch.ones_like(past_target), # TODO: pass the actual observed here
)
mlp_outputs = self.mlp(scaled_target)
distr_args = self.distr_args_proj(mlp_outputs)
return self.distr_output.distribution(
+12 -7
View File
@@ -9,7 +9,7 @@ from pts.feature import (
fourier_time_features_from_frequency_str,
get_fourier_lags_for_frequency,
)
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
from pts.model import PyTorchEstimator, PyTorchPredictor, copy_parameters
from pts.transform import (
Transformation,
Chain,
@@ -27,7 +27,7 @@ from pts.transform import (
from .tempflow_network import TempFlowTrainingNetwork, TempFlowPredictionNetwork
class TempFlowEstimator(PTSEstimator):
class TempFlowEstimator(PyTorchEstimator):
def __init__(
self,
input_size: int,
@@ -49,7 +49,6 @@ class TempFlowEstimator(PTSEstimator):
n_hidden=2,
conditioning_length: int = 200,
dequantize: bool = False,
scaling: bool = True,
pick_incomplete: bool = False,
lags_seq: Optional[List[int]] = None,
@@ -100,10 +99,16 @@ class TempFlowEstimator(PTSEstimator):
def create_transformation(self) -> Transformation:
return Chain(
[
AsNumpyArray(field=FieldName.TARGET, expected_ndim=2,),
AsNumpyArray(
field=FieldName.TARGET,
expected_ndim=2,
),
# maps the target to (1, T)
# if the target data is uni dimensional
ExpandDimArray(field=FieldName.TARGET, axis=None,),
ExpandDimArray(
field=FieldName.TARGET,
axis=None,
),
AddObservedValuesIndicator(
target_field=FieldName.TARGET,
output_field=FieldName.OBSERVED_VALUES,
@@ -176,7 +181,7 @@ class TempFlowEstimator(PTSEstimator):
transformation: Transformation,
trained_network: TempFlowTrainingNetwork,
device: torch.device,
) -> PTSPredictor:
) -> PyTorchPredictor:
prediction_network = TempFlowPredictionNetwork(
input_size=self.input_size,
target_dim=self.target_dim,
@@ -202,7 +207,7 @@ class TempFlowEstimator(PTSEstimator):
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
+13 -5
View File
@@ -3,13 +3,12 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
from pts.core.component import validated
from gluonts.core.component import validated
from pts.model import weighted_average
from pts.modules import RealNVP, MAF, FlowOutput, MeanScaler, NOPScaler
class TempFlowTrainingNetwork(nn.Module):
@validated()
def __init__(
self,
@@ -55,7 +54,10 @@ class TempFlowTrainingNetwork(nn.Module):
batch_first=True,
)
flow_cls = {"RealNVP": RealNVP, "MAF": MAF,}[flow_type]
flow_cls = {
"RealNVP": RealNVP,
"MAF": MAF,
}[flow_type]
self.flow = flow_cls(
input_size=target_dim,
n_blocks=n_blocks,
@@ -377,7 +379,8 @@ class TempFlowTrainingNetwork(nn.Module):
# put together target sequence
# (batch_size, seq_len, target_dim)
target = torch.cat(
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf), dim=1,
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf),
dim=1,
)
# assert_shape(target, (-1, seq_len, self.target_dim))
@@ -519,7 +522,12 @@ class TempFlowPredictionNetwork(TempFlowTrainingNetwork):
# (batch_size, num_samples, prediction_length, target_dim)
return samples.reshape(
(-1, self.num_parallel_samples, self.prediction_length, self.target_dim,)
(
-1,
self.num_parallel_samples,
self.prediction_length,
self.target_dim,
)
)
def forward(
+1 -1
View File
@@ -1 +1 @@
from .transformer_estimator import TransformerEstimator
from .transformer_estimator import TransformerEstimator
@@ -11,7 +11,7 @@ from pts.feature import (
fourier_time_features_from_frequency_str,
get_fourier_lags_for_frequency,
)
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
from pts.model import PyTorchEstimator, Predictor, PyTorchPredictor, copy_parameters
from pts.modules import DistributionOutput, StudentTOutput
from pts.transform import (
Transformation,
@@ -32,7 +32,7 @@ from .transformer_network import (
)
class TransformerEstimator(PTSEstimator):
class TransformerEstimator(PyTorchEstimator):
def __init__(
self,
input_size: int,
@@ -75,7 +75,9 @@ class TransformerEstimator(PTSEstimator):
self.embedding_dimension = embedding_dimension
self.num_parallel_samples = num_parallel_samples
self.lags_seq = (
lags_seq if lags_seq is not None else get_fourier_lags_for_frequency(freq_str=freq)
lags_seq
if lags_seq is not None
else get_fourier_lags_for_frequency(freq_str=freq)
)
self.time_features = (
time_features
@@ -117,7 +119,9 @@ class TransformerEstimator(PTSEstimator):
field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long
),
AsNumpyArray(
field=FieldName.FEAT_STATIC_REAL, expected_ndim=1, dtype=self.dtype,
field=FieldName.FEAT_STATIC_REAL,
expected_ndim=1,
dtype=self.dtype,
),
AsNumpyArray(
field=FieldName.TARGET,
@@ -220,7 +224,7 @@ class TransformerEstimator(PTSEstimator):
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
+7 -8
View File
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
import torch
import torch.nn as nn
from pts.core.component import validated
from gluonts.core.component import validated
from pts.modules import DistributionOutput, MeanScaler, NOPScaler, FeatureEmbedder
@@ -15,7 +15,6 @@ def prod(xs):
class TransformerNetwork(nn.Module):
@validated()
def __init__(
self,
@@ -72,7 +71,8 @@ class TransformerNetwork(nn.Module):
self.proj_dist_args = distr_output.get_args_proj(d_model)
self.embedder = FeatureEmbedder(
cardinalities=cardinality, embedding_dims=embedding_dimension,
cardinalities=cardinality,
embedding_dims=embedding_dimension,
)
if scaling:
@@ -82,7 +82,8 @@ class TransformerNetwork(nn.Module):
# mask
self.register_buffer(
"tgt_mask", self.transformer.generate_square_subsequent_mask(prediction_length)
"tgt_mask",
self.transformer.generate_square_subsequent_mask(prediction_length),
)
@staticmethod
@@ -154,9 +155,7 @@ class TransformerNetwork(nn.Module):
else:
time_feat = torch.cat(
(
past_time_feat[
:, self.history_length - self.context_length :, ...
],
past_time_feat[:, self.history_length - self.context_length :, ...],
future_time_feat,
),
dim=1,
@@ -177,7 +176,7 @@ class TransformerNetwork(nn.Module):
# scale shape is (batch_size, 1, *target_shape)
_, scale = self.scaler(
past_target[:, -self.context_length :, ...],
past_observed_values[:, -self.context_length :, ...]
past_observed_values[:, -self.context_length :, ...],
)
embedded_cat = self.embedder(feat_static_cat)
@@ -9,7 +9,7 @@ from pts.feature import (
fourier_time_features_from_frequency_str,
get_fourier_lags_for_frequency,
)
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
from pts.model import PyTorchEstimator, PyTorchPredictor, copy_parameters
from pts.transform import (
Transformation,
Chain,
@@ -24,10 +24,13 @@ from pts.transform import (
SetFieldIfNotPresent,
TargetDimIndicator,
)
from .transformer_tempflow_network import TransformerTempFlowTrainingNetwork, TransformerTempFlowPredictionNetwork
from .transformer_tempflow_network import (
TransformerTempFlowTrainingNetwork,
TransformerTempFlowPredictionNetwork,
)
class TransformerTempFlowEstimator(PTSEstimator):
class TransformerTempFlowEstimator(PyTorchEstimator):
def __init__(
self,
input_size: int,
@@ -52,7 +55,6 @@ class TransformerTempFlowEstimator(PTSEstimator):
n_hidden=2,
conditioning_length: int = 200,
dequantize: bool = False,
scaling: bool = True,
pick_incomplete: bool = False,
lags_seq: Optional[List[int]] = None,
@@ -108,10 +110,16 @@ class TransformerTempFlowEstimator(PTSEstimator):
def create_transformation(self) -> Transformation:
return Chain(
[
AsNumpyArray(field=FieldName.TARGET, expected_ndim=2,),
AsNumpyArray(
field=FieldName.TARGET,
expected_ndim=2,
),
# maps the target to (1, T)
# if the target data is uni dimensional
ExpandDimArray(field=FieldName.TARGET, axis=None,),
ExpandDimArray(
field=FieldName.TARGET,
axis=None,
),
AddObservedValuesIndicator(
target_field=FieldName.TARGET,
output_field=FieldName.OBSERVED_VALUES,
@@ -156,7 +164,9 @@ class TransformerTempFlowEstimator(PTSEstimator):
]
)
def create_training_network(self, device: torch.device) -> TransformerTempFlowTrainingNetwork:
def create_training_network(
self, device: torch.device
) -> TransformerTempFlowTrainingNetwork:
return TransformerTempFlowTrainingNetwork(
input_size=self.input_size,
target_dim=self.target_dim,
@@ -187,7 +197,7 @@ class TransformerTempFlowEstimator(PTSEstimator):
transformation: Transformation,
trained_network: TransformerTempFlowTrainingNetwork,
device: torch.device,
) -> PTSPredictor:
) -> PyTorchPredictor:
prediction_network = TransformerTempFlowPredictionNetwork(
input_size=self.input_size,
target_dim=self.target_dim,
@@ -216,7 +226,7 @@ class TransformerTempFlowEstimator(PTSEstimator):
copy_parameters(trained_network, prediction_network)
return PTSPredictor(
return PyTorchPredictor(
input_transform=transformation,
prediction_net=prediction_network,
batch_size=self.trainer.batch_size,
@@ -3,12 +3,11 @@ from typing import List, Optional, Tuple
import torch
import torch.nn as nn
from pts.core.component import validated
from gluonts.core.component import validated
from pts.modules import RealNVP, MAF, FlowOutput, MeanScaler, NOPScaler
class TransformerTempFlowTrainingNetwork(nn.Module):
@validated()
def __init__(
self,
@@ -61,7 +60,10 @@ class TransformerTempFlowTrainingNetwork(nn.Module):
activation=act_type,
)
flow_cls = {"RealNVP": RealNVP, "MAF": MAF,}[flow_type]
flow_cls = {
"RealNVP": RealNVP,
"MAF": MAF,
}[flow_type]
self.flow = flow_cls(
input_size=target_dim,
n_blocks=n_blocks,
@@ -146,9 +148,7 @@ class TransformerTempFlowTrainingNetwork(nn.Module):
future_time_feat: Optional[torch.Tensor],
future_target_cdf: Optional[torch.Tensor],
target_dimension_indicator: torch.Tensor,
) -> Tuple[
torch.Tensor, torch.Tensor, torch.Tensor,
]:
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]:
"""
Unrolls the RNN encoder over past and, if present, future data.
Returns outputs and state of the encoder, plus the scale of
@@ -204,7 +204,10 @@ class TransformerTempFlowTrainingNetwork(nn.Module):
subsequences_length = self.context_length
else:
time_feat = torch.cat(
(past_time_feat[:, -self.context_length :, ...], future_time_feat,),
(
past_time_feat[:, -self.context_length :, ...],
future_time_feat,
),
dim=1,
)
sequence = torch.cat((past_target_cdf, future_target_cdf), dim=1)
@@ -516,7 +519,12 @@ class TransformerTempFlowPredictionNetwork(TransformerTempFlowTrainingNetwork):
# (batch_size, num_samples, prediction_length, target_dim)
return samples.reshape(
(-1, self.num_parallel_samples, self.prediction_length, self.target_dim,)
(
-1,
self.num_parallel_samples,
self.prediction_length,
self.target_dim,
)
)
def forward(
+27 -22
View File
@@ -7,30 +7,35 @@ import torch.nn as nn
def get_module_forward_input_names(module: nn.Module):
params = inspect.signature(module.forward).parameters
return list(params)
def copy_parameters(net_source: nn.Module, net_dest: nn.Module) -> None:
net_dest.load_state_dict(net_source.state_dict())
param_names = [k for k, v in params.items() if not str(v).startswith("*")]
return param_names
def weighted_average(
    x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None
) -> torch.Tensor:
    """
    Computes the weighted average of a given tensor across a given dim, masking
    values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Parameters
    ----------
    x
        Input tensor, of which the average must be computed.
    weights
        Weights tensor, of the same shape as `x`. When ``None`` the plain
        (unweighted) mean is returned.
    dim
        The dim along which to average `x`. ``None`` averages over all
        elements; ``0`` is a valid dim and is honoured.

    Returns
    -------
    Tensor:
        The tensor with values averaged along the specified `dim`.
    """
    if weights is None:
        # Spell out the "all elements" case instead of forwarding dim=None.
        return x.mean() if dim is None else x.mean(dim=dim)

    # Entries with zero weight are replaced by 0 so that a NaN/inf in `x`
    # cannot leak into the sum through `nan * 0`.
    weighted = torch.where(weights != 0, x * weights, torch.zeros_like(x))

    # Compare against None explicitly: `if dim` would treat dim=0 as "no dim"
    # and silently reduce over every axis.
    if dim is None:
        total = weighted.sum()
        weight_total = weights.sum()
    else:
        total = weighted.sum(dim=dim)
        weight_total = weights.sum(dim=dim)

    # The denominator is floored at 1 so all-zero weights give 0 rather than NaN.
    return total / torch.clamp(weight_total, min=1.0)
-4
View File
@@ -1,7 +1,4 @@
from .distribution_output import (
ArgProj,
Output,
DistributionOutput,
NormalOutput,
StudentTOutput,
BetaOutput,
@@ -20,5 +17,4 @@ from .distribution_output import (
)
from .feature import FeatureEmbedder, FeatureAssembler
from .flows import RealNVP, MAF
from .lambda_layer import LambdaLayer
from .scaler import MeanScaler, NOPScaler
+39 -103
View File
@@ -19,7 +19,8 @@ from torch.distributions import (
MultivariateNormal,
TransformedDistribution,
AffineTransform,
Poisson)
Poisson,
)
from pts.distributions import (
ZeroInflatedPoisson,
@@ -29,79 +30,13 @@ from pts.distributions import (
ImplicitQuantile,
TransformedImplicitQuantile,
)
from pts.core.component import validated
from gluonts.core.component import validated
from gluonts.torch.modules.distribution_output import (
DistributionOutput,
LambdaLayer,
PtArgProj,
)
from pts.modules.iqn_modules import ImplicitQuantileModule
from .lambda_layer import LambdaLayer
class ArgProj(nn.Module):
    """
    Projects features onto one linear head per distribution argument and
    passes the heads' outputs through ``domain_map``.

    Parameters
    ----------
    in_features
        Size of the last dimension of the input given to each linear head.
    args_dim
        Mapping whose values give the output size of each head; one
        ``nn.Linear`` is created per value, in iteration order.
    domain_map
        Callable applied to the unpacked head outputs.
    dtype
        Stored on the instance; not otherwise used by this class.
    prefix
        Accepted but not used by this class.
    **kwargs
        Forwarded to ``nn.Module.__init__``.
    """

    def __init__(
        self,
        in_features: int,
        args_dim: Dict[str, int],
        domain_map: Callable[..., Tuple[torch.Tensor]],
        dtype: np.dtype = np.float32,
        prefix: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.args_dim = args_dim
        self.dtype = dtype
        heads = (nn.Linear(in_features, out_dim) for out_dim in args_dim.values())
        self.proj = nn.ModuleList(heads)
        self.domain_map = domain_map

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
        """Apply every head to ``x`` and hand the results to ``domain_map``."""
        unbounded = []
        for head in self.proj:
            unbounded.append(head(x))
        return self.domain_map(*unbounded)
class Output(ABC):
    """
    Abstract description of a network output.

    ``get_args_proj`` reads ``self.args_dim``, which this class only
    annotates, and wraps the subclass-provided ``domain_map`` in a
    ``LambdaLayer``.
    """

    # Annotated only; no value is assigned in this class.
    in_features: int
    args_dim: Dict[str, int]
    # Backing field for the ``dtype`` property.
    _dtype: np.dtype = np.float32

    @property
    def dtype(self):
        """The dtype currently stored on this output."""
        return self._dtype

    @dtype.setter
    def dtype(self, dtype: np.dtype):
        self._dtype = dtype

    def get_args_proj(self, in_features: int, prefix: Optional[str] = None) -> ArgProj:
        """Build the ``ArgProj`` module for inputs with ``in_features`` features."""
        mapper = LambdaLayer(self.domain_map)
        projection = ArgProj(
            in_features=in_features,
            args_dim=self.args_dim,
            domain_map=mapper,
            prefix=prefix,
            dtype=self.dtype,
        )
        return projection

    @abstractclassmethod
    def domain_map(cls, *args: torch.Tensor):
        """Map raw projected tensors to arguments; implemented by subclasses."""
        pass
class DistributionOutput(Output, ABC):
    """
    Output that builds a distribution object from projected arguments.
    """

    # Class used to instantiate the distribution; annotated only here.
    distr_cls: type

    @validated()
    def __init__(self) -> None:
        pass

    def distribution(
        self, distr_args, scale: Optional[torch.Tensor] = None
    ) -> Distribution:
        """
        Instantiate ``distr_cls`` with ``distr_args``.

        When ``scale`` is given, the result is wrapped in a
        ``TransformedDistribution`` with an ``AffineTransform`` of zero
        location and the given scale.
        """
        base = self.distr_cls(*distr_args)
        if scale is not None:
            rescale = AffineTransform(loc=0, scale=scale)
            return TransformedDistribution(base, [rescale])
        return base
class IndependentDistributionOutput(DistributionOutput):
@@ -364,7 +299,9 @@ class PiecewiseLinearOutput(DistributionOutput):
return gamma.squeeze(axis=-1), slopes_proj, knot_spacings_proj
def distribution(
self, distr_args, scale: Optional[torch.Tensor] = None,
self,
distr_args,
scale: Optional[torch.Tensor] = None,
) -> PiecewiseLinear:
if scale is None:
return self.distr_cls(*distr_args)
@@ -415,7 +352,11 @@ class NormalMixtureOutput(DistributionOutput):
class LowRankMultivariateNormalOutput(DistributionOutput):
@validated()
def __init__(
self, dim: int, rank: int, sigma_init: float = 1.0, sigma_minimum: float = 1e-3,
self,
dim: int,
rank: int,
sigma_init: float = 1.0,
sigma_minimum: float = 1e-3,
) -> None:
self.distr_cls = LowRankMultivariateNormal
self.dim = dim
@@ -508,25 +449,16 @@ class FlowOutput(DistributionOutput):
return (self.dim,)
class QuantileArgProj(ArgProj):
class QuantilePtArgProj(PtArgProj):
def __init__(
self,
in_features: int,
output_domain_cls: nn.Module,
args_dim: Dict[str, int],
domain_map: Callable[..., Tuple[torch.Tensor]],
dtype: np.dtype = np.float32,
prefix: Optional[str] = None,
**kwargs,
self,
in_features: int,
output_domain_cls: nn.Module,
args_dim: Dict[str, int],
domain_map: Callable[..., Tuple[torch.Tensor]],
**kwargs,
):
super().__init__(
in_features,
args_dim,
domain_map,
dtype,
prefix,
**kwargs
)
super().__init__(in_features, args_dim, domain_map, **kwargs)
self.output_domain_cls = output_domain_cls
self.proj = ImplicitQuantileModule(in_features, output_domain_cls)
@@ -535,8 +467,8 @@ class QuantileArgProj(ArgProj):
forecast_length = x.shape[1]
device = x.device
taus = torch.rand(size=(batch_size, forecast_length), device=device)
self.register_buffer('taus', taus)
self.register_buffer('nn_ouput', x.clone().detach())
self.register_buffer("taus", taus)
self.register_buffer("nn_ouput", x.clone().detach())
predicted_quantiles = self.proj(x, taus)
return self.domain_map(predicted_quantiles)
@@ -548,6 +480,7 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
output_domain_cls: type = nn.Module
quantile_arg_proj: type = nn.Module
@validated()
def __init__(self, output_domain: str) -> None:
super().__init__()
self.set_output_domain_map(output_domain)
@@ -559,14 +492,17 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
"Positive": nn.Softplus,
"Real": nn.Identity,
}
assert output_domain in available_domain_map_cls.keys(), \
"Only the following output domains are allowed: {}".format(available_domain_map_cls.keys())
assert (
output_domain in available_domain_map_cls.keys()
), "Only the following output domains are allowed: {}".format(
available_domain_map_cls.keys()
)
output_domain_cls = available_domain_map_cls[output_domain]
cls.output_domain_cls = output_domain_cls
@classmethod
def set_args_proj(cls):
cls.quantile_arg_proj = QuantileArgProj(
cls.quantile_arg_proj = QuantilePtArgProj(
in_features=cls.in_features,
output_domain_cls=cls.output_domain_cls,
args_dim=cls.args_dim,
@@ -584,11 +520,13 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
cls.set_args_proj()
return cls.quantile_arg_proj
def get_args_proj(self, in_features: int, prefix: Optional[str] = None) :
def get_args_proj(self, in_features: int, prefix: Optional[str] = None):
return self.args_proj(in_features)
def distribution(
self, distr_args, scale: Optional[torch.Tensor] = None,
self,
distr_args,
scale: Optional[torch.Tensor] = None,
) -> ImplicitQuantile:
args_proj = self.get_args_proj(self.in_features)
@@ -597,7 +535,8 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
implicit_quantile_function=implicit_quantile_function,
taus=list(args_proj.buffers())[0],
nn_output=list(args_proj.buffers())[1],
predicted_quantiles=distr_args)
predicted_quantiles=distr_args,
)
if scale is None:
return distr
else:
@@ -608,6 +547,3 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
@property
def event_shape(self) -> Tuple:
return ()
+5 -1
View File
@@ -5,7 +5,11 @@ import torch.nn as nn
class FeatureEmbedder(nn.Module):
def __init__(self, cardinalities: List[int], embedding_dims: List[int],) -> None:
def __init__(
self,
cardinalities: List[int],
embedding_dims: List[int],
) -> None:
super().__init__()
self.__num_features = len(cardinalities)
+1 -1
View File
@@ -52,6 +52,6 @@ class QuantileLayer(nn.Module):
integers = torch.repeat_interleave(
torch.arange(0, self.n_cos_embedding).unsqueeze(dim=0),
repeats=tau.shape[-1],
dim=0
dim=0,
).to(tau.device)
return torch.cos(pi * tau.unsqueeze(dim=-1) * integers)
-10
View File
@@ -1,10 +0,0 @@
import torch.nn as nn
class LambdaLayer(nn.Module):
    """Adapter that exposes a plain callable as an ``nn.Module``."""

    def __init__(self, function):
        super().__init__()
        self._func = function

    def forward(self, x, *args):
        """Call the wrapped function with ``x`` and any extra positional arguments."""
        wrapped = self._func
        return wrapped(x, *args)
+1 -1
View File
@@ -37,7 +37,7 @@ class Scaler(ABC, nn.Module):
Tensor
Tensor containing the "scaled" data, shape: (N, T, C) or (N, C, T).
Tensor
Tensor containing the scale, of shape (N, C) if ``keepdim == False``,
Tensor containing the scale, of shape (N, C) if ``keepdim == False``,
and shape (N, 1, C) or (N, C, 1) if ``keepdim == True``.
"""
+23 -18
View File
@@ -1,14 +1,17 @@
import time
from typing import List, Optional
from typing import List, Optional, Union
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from gluonts.core.component import validated
from gluonts.dataset.loader import TrainDataLoader, ValidationDataLoader
class Trainer:
@validated()
def __init__(
self,
epochs: int = 100,
@@ -18,7 +21,7 @@ class Trainer:
pin_memory: bool = False,
learning_rate: float = 1e-3,
weight_decay: float = 1e-6,
device: Optional[torch.device] = None,
device: Optional[Union[torch.device, str]] = None,
) -> None:
self.epochs = epochs
self.batch_size = batch_size
@@ -30,26 +33,26 @@ class Trainer:
self.pin_memory = pin_memory
def __call__(
self, net: nn.Module, input_names: List[str], data_loader: DataLoader
self,
net: nn.Module,
train_iter: TrainDataLoader,
validation_iter: Optional[ValidationDataLoader] = None,
) -> None:
optimizer = torch.optim.Adam(
net.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
)
writer = SummaryWriter()
#writer.add_graph(net)
for epoch_no in range(self.epochs):
# mark epoch start time
tic = time.time()
avg_epoch_loss = 0.0
with tqdm(data_loader) as it:
with tqdm(train_iter) as it:
for batch_no, data_entry in enumerate(it, start=1):
optimizer.zero_grad()
inputs = [data_entry[k].to(self.device) for k in input_names]
#inputs = [data_entry[k].to(self.device) for k in input_names]
output = net(*inputs)
output = net(*data_entry.values())
if isinstance(output, (list, tuple)):
loss = output[0]
else:
@@ -63,18 +66,20 @@ class Trainer:
},
refresh=False,
)
n_iter = epoch_no*self.num_batches_per_epoch + batch_no
writer.add_scalar('Loss/train', loss.item(), n_iter)
n_iter = epoch_no * self.num_batches_per_epoch + batch_no
#.add_scalar("Loss/train", loss.item(), n_iter)
loss.backward()
optimizer.step()
if self.num_batches_per_epoch == batch_no:
for name, param in net.named_parameters():
writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
# for name, param in net.named_parameters():
# writer.add_histogram(
# name, param.clone().cpu().data.numpy(), n_iter
# )
break
# mark epoch end time and log time cost of current epoch
toc = time.time()
writer.close()
#writer.close()
-52
View File
@@ -1,52 +0,0 @@
from .convert import (
AsNumpyArray,
ExpandDimArray,
VstackFeatures,
ConcatFeatures,
SwapAxes,
ListFeatures,
TargetDimIndicator,
SampleTargetDim,
CDFtoGaussianTransform,
cdf_to_gaussian_forward_transform,
)
from .dataset import TransformedDataset
from .feature import (
target_transformation_length,
AddObservedValuesIndicator,
AddConstFeature,
AddTimeFeatures,
AddAgeFeature,
)
from .field import (
RemoveFields,
RenameFields,
SetField,
SetFieldIfNotPresent,
SelectFields,
)
from .sampler import (
InstanceSampler,
UniformSplitSampler,
TestSplitSampler,
ExpectedNumInstanceSampler,
BucketInstanceSampler,
ContinuousTimePointSampler,
ContinuousTimeUniformSampler,
)
from .split import (
shift_timestamp,
InstanceSplitter,
CanonicalInstanceSplitter,
ContinuousTimeInstanceSplitter,
)
from .transform import (
Transformation,
Chain,
Identity,
MapTransformation,
SimpleTransformation,
AdhocTransform,
FlatMapTransformation,
FilterTransformation,
)
-713
View File
@@ -1,713 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import Iterator, List, Tuple, Optional
import numpy as np
import torch
from scipy.special import erf, erfinv
from pts.core.component import validated
from pts.dataset import DataEntry
from pts.exception import assert_pts
from .transform import (
SimpleTransformation,
MapTransformation,
FlatMapTransformation,
)
class AsNumpyArray(SimpleTransformation):
    """
    Converts the value of a field into a numpy array.

    Parameters
    ----------
    field
        Name of the field whose value is converted in place.
    expected_ndim
        Minimum number of dimensions the converted array must have. The
        condition ``value.ndim >= expected_ndim`` is handed to ``assert_pts``
        (presumably raising when it is false -- the helper is defined
        elsewhere).
    dtype
        numpy dtype to use.
    """

    @validated()
    def __init__(
        self, field: str, expected_ndim: int, dtype: np.dtype = np.float32
    ) -> None:
        self.field = field
        self.expected_ndim = expected_ndim
        self.dtype = dtype

    def transform(self, data: DataEntry) -> DataEntry:
        """Replace ``data[self.field]`` by its numpy-array form and return ``data``."""
        value = data[self.field]
        if not isinstance(value, float):
            # this lines produces "ValueError: setting an array element with a
            # sequence" on our test
            # value = np.asarray(value, dtype=np.float32)
            # see https://stackoverflow.com/questions/43863748/
            value = np.asarray(list(value), dtype=self.dtype)
        else:
            # ugly: required as list conversion will fail in the case of a
            # float
            value = np.asarray(value, dtype=self.dtype)
        # The message pieces are adjacent string literals; the trailing space
        # after "required" keeps the words from running together.
        assert_pts(
            value.ndim >= self.expected_ndim,
            'Input for field "{self.field}" does not have the required '
            "dimension (field: {self.field}, ndim observed: {value.ndim}, "
            "expected ndim: {self.expected_ndim})",
            value=value,
            self=self,
        )
        data[self.field] = value
        return data
class ExpandDimArray(SimpleTransformation):
    """
    Insert a new axis into a field via ``np.expand_dims``.

    When ``axis`` is ``None`` the data entry is returned untouched.

    Parameters
    ----------
    field
        Field in dictionary to use
    axis
        Axis to expand (see np.expand_dims for details)
    """

    @validated()
    def __init__(self, field: str, axis: Optional[int] = None) -> None:
        self.field = field
        self.axis = axis

    def transform(self, data: DataEntry) -> DataEntry:
        """Expand ``data[self.field]`` along ``self.axis`` when one is configured."""
        if self.axis is None:
            return data
        expanded = np.expand_dims(data[self.field], axis=self.axis)
        data[self.field] = expanded
        return data
class VstackFeatures(SimpleTransformation):
    """
    Combine several fields into one using ``np.vstack``.

    Input fields whose value is ``None`` are left out of the stack.

    Parameters
    ----------
    output_field
        Field name to use for the output
    input_fields
        Fields to stack together
    drop_inputs
        If set to true the input fields are deleted afterwards, except for
        one that doubles as the output field.
    """

    @validated()
    def __init__(
        self,
        output_field: str,
        input_fields: List[str],
        drop_inputs: bool = True,
    ) -> None:
        self.output_field = output_field
        self.input_fields = input_fields
        if drop_inputs:
            self.cols_to_drop = [
                name for name in self.input_fields if name != output_field
            ]
        else:
            self.cols_to_drop = []

    def transform(self, data: DataEntry) -> DataEntry:
        """Write the stacked inputs to the output field and drop the configured inputs."""
        present = []
        for name in self.input_fields:
            if data[name] is not None:
                present.append(data[name])
        data[self.output_field] = np.vstack(present)
        for name in self.cols_to_drop:
            del data[name]
        return data
class ConcatFeatures(SimpleTransformation):
    """
    Combine several fields into one using ``np.concatenate``.

    Input fields whose value is ``None`` are left out.

    Parameters
    ----------
    output_field
        Field name to use for the output
    input_fields
        Fields to concatenate
    drop_inputs
        If set to true the input fields are deleted afterwards, except for
        one that doubles as the output field.
    """

    @validated()
    def __init__(
        self,
        output_field: str,
        input_fields: List[str],
        drop_inputs: bool = True,
    ) -> None:
        self.output_field = output_field
        self.input_fields = input_fields
        if drop_inputs:
            self.cols_to_drop = [
                name for name in self.input_fields if name != output_field
            ]
        else:
            self.cols_to_drop = []

    def transform(self, data: DataEntry) -> DataEntry:
        """Write the concatenated inputs to the output field and drop the configured inputs."""
        present = []
        for name in self.input_fields:
            if data[name] is not None:
                present.append(data[name])
        data[self.output_field] = np.concatenate(present)
        for name in self.cols_to_drop:
            del data[name]
        return data
class SwapAxes(SimpleTransformation):
    """
    Apply `np.swapaxes` to fields.

    Field values may be arrays or (nested) lists of arrays; any other type
    raises ``ValueError``.

    Parameters
    ----------
    input_fields
        Field to apply to
    axes
        Axes to use
    """

    @validated()
    def __init__(self, input_fields: List[str], axes: Tuple[int, int]) -> None:
        self.input_fields = input_fields
        first_axis, second_axis = axes
        self.axis1 = first_axis
        self.axis2 = second_axis

    def transform(self, data: DataEntry) -> DataEntry:
        """Swap the configured axes of every listed field, in place in ``data``."""
        for name in self.input_fields:
            swapped = self.swap(data[name])
            data[name] = swapped
        return data

    def swap(self, v):
        """Swap axes of an array, or recursively of each element of a list."""
        if isinstance(v, np.ndarray):
            return np.swapaxes(v, self.axis1, self.axis2)
        if not isinstance(v, list):
            raise ValueError(
                f"Unexpected field type {type(v).__name__}, expected "
                f"np.ndarray or list[np.ndarray]"
            )
        return [self.swap(item) for item in v]
class ListFeatures(SimpleTransformation):
    """
    Gather the values of several fields into a list stored under a new field.

    Parameters
    ----------
    output_field
        Field name for output
    input_fields
        Fields to combine into list
    drop_inputs
        If true the input fields are deleted afterwards, except for one that
        doubles as the output field.
    """

    @validated()
    def __init__(
        self,
        output_field: str,
        input_fields: List[str],
        drop_inputs: bool = True,
    ) -> None:
        self.output_field = output_field
        self.input_fields = input_fields
        if drop_inputs:
            self.cols_to_drop = [
                name for name in self.input_fields if name != output_field
            ]
        else:
            self.cols_to_drop = []

    def transform(self, data: DataEntry) -> DataEntry:
        """Store the list of input values and drop the configured inputs."""
        collected = []
        for name in self.input_fields:
            collected.append(data[name])
        data[self.output_field] = collected
        for name in self.cols_to_drop:
            del data[name]
        return data
class TargetDimIndicator(SimpleTransformation):
    """
    Label-encoding of the target dimensions.

    Stores ``0 .. n - 1`` under ``field_name``, where ``n`` is the size of
    the first axis of the target field.
    """

    @validated()
    def __init__(self, field_name: str, target_field: str) -> None:
        self.field_name = field_name
        self.target_field = target_field

    def transform(self, data: DataEntry) -> DataEntry:
        """Add the index array for the target's first axis to ``data``."""
        num_dims = data[self.target_field].shape[0]
        data[self.field_name] = np.arange(0, num_dims)
        return data
class SampleTargetDim(FlatMapTransformation):
    """
    Samples random dimensions from the target at training time.

    Outside of training the data entry is passed through unchanged.
    """

    @validated()
    def __init__(
        self,
        field_name: str,
        target_field: str,
        observed_values_field: str,
        num_samples: int,
        shuffle: bool = True,
    ) -> None:
        self.field_name = field_name
        self.target_field = target_field
        self.observed_values_field = observed_values_field
        self.num_samples = num_samples
        self.shuffle = shuffle

    def flatmap_transform(
        self, data: DataEntry, is_train: bool, slice_future_target: bool = True
    ) -> Iterator[DataEntry]:
        """
        Yield ``data`` once, restricted to the sampled dimensions when training.

        ``slice_future_target`` is accepted but not used here.
        """
        if is_train:
            # (target_dim,)
            chosen = data[self.field_name]
            if self.shuffle:
                # Shuffles the stored index array in place.
                np.random.shuffle(chosen)
            chosen = chosen[: self.num_samples]
            data[self.field_name] = chosen

            # (seq_len, target_dim) -> (seq_len, num_samples)
            for base in (self.target_field, self.observed_values_field):
                for prefix in ("past", "future"):
                    key = f"{prefix}_{base}"
                    data[key] = data[key][:, chosen]

        yield data
class CDFtoGaussianTransform(MapTransformation):
    """
    Marginal transformation that transforms the target via an empirical CDF
    to a standard gaussian as described here: https://arxiv.org/abs/1910.03002

    To be used in conjunction with a multivariate gaussian to form a copula.
    Note that this transformation is currently intended for multivariate
    targets only.
    """

    @validated()
    def __init__(
        self,
        target_dim: int,
        target_field: str,
        observed_values_field: str,
        cdf_suffix="_cdf",
        max_context_length: Optional[int] = None,
    ) -> None:
        """
        Constructor for CDFtoGaussianTransform.

        Parameters
        ----------
        target_dim
            Dimensionality of the target.
        target_field
            Field that will be transformed.
        observed_values_field
            Field that indicates observed values.
        cdf_suffix
            Suffix to mark the field with the transformed target.
        max_context_length
            Sets the maximum context length for the empirical CDF.
        """
        self.target_field = target_field
        self.past_target_field = "past_" + self.target_field
        self.future_target_field = "future_" + self.target_field
        self.past_observed_field = f"past_{observed_values_field}"
        self.sort_target_field = f"past_{target_field}_sorted"
        self.slopes_field = "slopes"
        self.intercepts_field = "intercepts"
        self.cdf_suffix = cdf_suffix
        self.max_context_length = max_context_length
        self.target_dim = target_dim

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        """
        Add the sorted past target, the piece-wise linear parameters and the
        gaussianised past/future targets (stored under the field name plus
        ``cdf_suffix``) to ``data``.
        """
        self._preprocess_data(data, is_train=is_train)
        self._calc_pw_linear_params(data)

        for target_field in [self.past_target_field, self.future_target_field]:
            data[target_field + self.cdf_suffix] = self.standard_gaussian_ppf(
                self._empirical_cdf_forward_transform(
                    data[self.sort_target_field],
                    data[target_field],
                    data[self.slopes_field],
                    data[self.intercepts_field],
                )
            )
        return data

    def _preprocess_data(self, data: DataEntry, is_train: bool):
        """
        Performs several preprocess operations for computing the empirical CDF.
        1) Reshaping the data.
        2) Normalizing the target length.
        3) Adding noise to avoid zero slopes (training only)
        4) Sorting the target to compute the empirical CDF

        Parameters
        ----------
        data
            DataEntry with input data.
        is_train
            if is_train is True, this function adds noise to the target to
            avoid zero slopes in the piece-wise linear function.

        Returns
        -------
        None
            The sorted target is written to ``data[self.sort_target_field]``.
        """
        # (target_length, target_dim)
        past_target_vec = data[self.past_target_field].copy()

        # pick only observed values
        target_length, target_dim = past_target_vec.shape

        # (target_length, target_dim)
        past_observed = (data[self.past_observed_field] > 0) * (
            data["past_is_pad"].reshape((-1, 1)) == 0
        )
        assert past_observed.ndim == 2
        assert target_dim == self.target_dim

        # Keep only the time steps at which every dimension is observed and
        # which are not padding.
        past_target_vec = past_target_vec[past_observed.min(axis=1)]

        assert past_target_vec.ndim == 2
        assert past_target_vec.shape[1] == self.target_dim

        expected_length = (
            target_length
            if self.max_context_length is None
            else self.max_context_length
        )

        # Always normalise the length: the row filter above may have removed
        # time steps even when target_length == expected_length, and the shape
        # assertion below requires exactly expected_length rows. ``_fill``
        # returns its input unchanged when the length already matches.
        past_target_vec = CDFtoGaussianTransform._fill(
            past_target_vec, expected_length
        )

        # sorts along the time dimension to compute empirical CDF of each
        # dimension
        if is_train:
            past_target_vec = self._add_noise(past_target_vec)

        past_target_vec.sort(axis=0)

        assert past_target_vec.shape == (expected_length, self.target_dim)

        data[self.sort_target_field] = past_target_vec

    def _calc_pw_linear_params(self, data: DataEntry):
        """
        Calculates the piece-wise linear parameters to interpolate between
        the observed values in the empirical CDF.

        One current limitation is that we use a zero slope line as the last
        piece. Thus, we cannot forecast anything higher than the highest
        observed value.

        Parameters
        ----------
        data
            Input data entry containing a sorted target field.

        Returns
        -------
        None
            Slopes and intercepts are written to ``data``.
        """
        sorted_target = data[self.sort_target_field]
        sorted_target_length, target_dim = sorted_target.shape

        quantiles = np.stack(
            [np.arange(sorted_target_length) for _ in range(target_dim)],
            axis=1,
        ) / float(sorted_target_length)

        x_diff = np.diff(sorted_target, axis=0)
        y_diff = np.diff(quantiles, axis=0)

        # Calculate slopes of the pw-linear pieces.
        slopes = np.where(x_diff == 0.0, np.zeros_like(x_diff), y_diff / x_diff)

        zeroes = np.zeros_like(np.expand_dims(slopes[0, :], axis=0))
        slopes = np.append(slopes, zeroes, axis=0)

        # Calculate intercepts of the pw-linear pieces.
        intercepts = quantiles - slopes * sorted_target

        # Populate new fields with the piece-wise linear parameters.
        data[self.slopes_field] = slopes
        data[self.intercepts_field] = intercepts

    def _empirical_cdf_forward_transform(
        self,
        sorted_values: np.ndarray,
        values: np.ndarray,
        slopes: np.ndarray,
        intercepts: np.ndarray,
    ) -> np.ndarray:
        """
        Applies the empirical CDF forward transformation.

        Parameters
        ----------
        sorted_values
            Sorted target vector.
        values
            Values (real valued) that will be transformed to empirical CDF
            values.
        slopes
            Slopes of the piece-wise linear function.
        intercepts
            Intercepts of the piece-wise linear function.

        Returns
        -------
        quantiles
            Empirical CDF quantiles in [0, 1] interval with winsorized cutoff.
        """
        m = sorted_values.shape[0]
        quantiles = self._forward_transform(sorted_values, values, slopes, intercepts)

        # The cutoff depends only on m, so compute it once for both bounds.
        cutoff = self.winsorized_cutoff(m)
        quantiles = np.clip(quantiles, cutoff, 1 - cutoff)
        return quantiles

    @staticmethod
    def _add_noise(x: np.array) -> np.array:
        """
        Return ``x`` plus zero-mean gaussian noise.

        The noise scale is 0.2 times the standard deviation of ``x`` computed
        along axis 1 (with ``keepdims=True``).
        """
        scale_noise = 0.2
        std = np.sqrt(
            (np.square(x - x.mean(axis=1, keepdims=True))).mean(axis=1, keepdims=True)
        )
        noise = np.random.normal(
            loc=np.zeros_like(x), scale=np.ones_like(x) * std * scale_noise
        )
        x = x + noise
        return x

    @staticmethod
    def _search_sorted(sorted_vec: np.array, to_insert_vec: np.array) -> np.array:
        """
        Finds the indices of the active piece-wise linear function.

        Parameters
        ----------
        sorted_vec
            Sorted target vector.
        to_insert_vec
            Vector for which the indices of the active linear functions
            will be computed

        Returns
        -------
        indices
            Indices mapping to the active linear function.
        """
        indices_left = np.searchsorted(sorted_vec, to_insert_vec, side="left")
        indices_right = np.searchsorted(sorted_vec, to_insert_vec, side="right")

        # Midpoint of the left/right insertion positions, shifted down by one
        # and clamped to the valid index range.
        indices = indices_left + (indices_right - indices_left) // 2
        indices = indices - 1
        indices = np.minimum(indices, len(sorted_vec) - 1)
        indices[indices < 0] = 0
        return indices

    def _forward_transform(
        self,
        sorted_vec: np.array,
        target: np.array,
        slopes: np.array,
        intercepts: np.array,
    ) -> np.array:
        """
        Applies the forward transformation to the marginals of the multivariate
        target. Target (real valued) -> empirical cdf [0, 1]

        Parameters
        ----------
        sorted_vec
            Sorted (past) target vector.
        target
            Target that will be transformed.
        slopes
            Slopes of the piece-wise linear function.
        intercepts
            Intercepts of the piece-wise linear function

        Returns
        -------
        transformed_target
            Transformed target vector.
        """
        transformed = list()
        # Iterate over the marginals (one column of each input per step).
        # The loop variable is named sorted_column to avoid shadowing the
        # builtin ``sorted``.
        for sorted_column, t, slope, intercept in zip(
            sorted_vec.transpose(),
            target.transpose(),
            slopes.transpose(),
            intercepts.transpose(),
        ):
            indices = self._search_sorted(sorted_column, t)
            transformed_value = slope[indices] * t + intercept[indices]
            transformed.append(transformed_value)
        return np.array(transformed).transpose()

    @staticmethod
    def standard_gaussian_cdf(x: np.array) -> np.array:
        """Standard gaussian CDF evaluated at ``x``, via ``erf``."""
        u = x / (np.sqrt(2.0))
        return (erf(u) + 1.0) / 2.0

    @staticmethod
    def standard_gaussian_ppf(y: np.array) -> np.array:
        """
        Standard gaussian quantile function via ``erfinv``; ``y`` is clipped
        to [1e-6, 1 - 1e-6] first.
        """
        y_clipped = np.clip(y, a_min=1.0e-6, a_max=1.0 - 1.0e-6)
        return np.sqrt(2.0) * erfinv(2.0 * y_clipped - 1.0)

    @staticmethod
    def winsorized_cutoff(m: np.array) -> np.array:
        """
        Truncation threshold applied to the empirical CDF estimator to reduce
        variance as described here: https://arxiv.org/abs/0903.0649

        Parameters
        ----------
        m
            Number of samples behind the empirical CDF (the caller in this
            class passes the length of the sorted target).

        Returns
        -------
        res
            Cutoff ``1 / (4 * m**0.25 * sqrt(pi * log(m)))``; asserted to lie
            strictly between 0 and 1.
        """
        # np.pi is used instead of the 3.14 approximation.
        res = 1 / (4 * m ** 0.25 * np.sqrt(np.pi * np.log(m)))
        assert 0 < res < 1
        return res

    @staticmethod
    def _fill(target: np.ndarray, expected_length: int) -> np.ndarray:
        """
        Makes sure target has exactly expected_length time-units by repeating
        it, truncating it to the most recent entries, or using zeros.

        Parameters
        ----------
        target : shape (seq_len, dim)
        expected_length

        Returns
        -------
        array of shape (expected_length, dim)
        """
        current_length, target_dim = target.shape
        if current_length == 0:
            # todo handle the case with no observation better,
            # we could use dataset statistics but for now we use zeros
            filled_target = np.zeros((expected_length, target_dim))
        elif current_length < expected_length:
            filled_target = np.vstack(
                [target for _ in range(expected_length // current_length + 1)]
            )
            filled_target = filled_target[:expected_length]
        elif current_length > expected_length:
            filled_target = target[-expected_length:]
        else:
            filled_target = target

        assert filled_target.shape == (expected_length, target_dim)

        return filled_target
def cdf_to_gaussian_forward_transform(
    input_batch: DataEntry, outputs: torch.Tensor
) -> np.ndarray:
    """
    Forward transformation of the CDFtoGaussianTransform.

    Maps predictor outputs back through the standard gaussian CDF and the
    inverse of the empirical CDF. ``outputs`` is modified in place, one
    sample slice at a time, and the same object is returned.

    Parameters
    ----------
    input_batch
        Input data to the predictor. The keys ``"past_target_sorted"``,
        ``"slopes"`` and ``"intercepts"`` are read.
    outputs
        Predictor outputs, unpacked as a 4-dimensional array whose second
        axis indexes samples.

    Returns
    -------
    outputs
        Forward transformed outputs.
    """

    def _empirical_cdf_inverse_transform(
        batch_target_sorted: torch.Tensor,
        batch_predictions: torch.Tensor,
        slopes: torch.Tensor,
        intercepts: torch.Tensor,
    ) -> np.ndarray:
        """
        Apply the inverse transformation of the empirical CDF.

        Parameters
        ----------
        batch_target_sorted
            Sorted targets of the input batch.
        batch_predictions
            Predictions of the underlying probability distribution
        slopes
            Slopes of the piece-wise linear function.
        intercepts
            Intercepts of the piece-wise linear function.

        Returns
        -------
        outputs
            Inverse transformed outputs.
        """
        slopes = slopes.cpu().numpy()
        intercepts = intercepts.cpu().numpy()
        batch_target_sorted = batch_target_sorted.cpu().numpy()

        _, num_timesteps, _ = batch_target_sorted.shape
        indices = np.floor(batch_predictions * num_timesteps)
        # indices = indices - 1
        # for now project into [0, 1]
        indices = np.clip(indices, 0, num_timesteps - 1)
        # ``np.int`` was an alias of the builtin ``int`` and has been removed
        # from NumPy; the builtin gives the same dtype.
        indices = indices.astype(int)

        # Slopes of the active pieces; used for both the test and the division.
        active_slopes = np.take_along_axis(slopes, indices, axis=1)

        # Where the active slope is zero the piece cannot be inverted, so the
        # sorted target value at that index is used instead.
        transformed = np.where(
            active_slopes != 0.0,
            (batch_predictions - np.take_along_axis(intercepts, indices, axis=1))
            / active_slopes,
            np.take_along_axis(batch_target_sorted, indices, axis=1),
        )
        return transformed

    # applies inverse cdf to all outputs
    _, samples, _, _ = outputs.shape
    for sample_index in range(0, samples):
        outputs[:, sample_index, :, :] = _empirical_cdf_inverse_transform(
            input_batch["past_target_sorted"],
            CDFtoGaussianTransform.standard_gaussian_cdf(
                outputs[:, sample_index, :, :]
            ),
            input_batch["slopes"],
            input_batch["intercepts"],
        )
    return outputs
-47
View File
@@ -1,47 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import Iterator, List
from pts.dataset import DataEntry, Dataset
from .transform import Chain, Transformation
class TransformedDataset(Dataset):
    """
    Dataset view obtained by running a chain of transformations over every
    element of ``base_dataset``.

    This only supports SimpleTransformations, which do the same thing at
    prediction and training time.

    Parameters
    ----------
    base_dataset
        Dataset to transform
    transformations
        List of transformations to apply
    """

    def __init__(
        self, base_dataset: Dataset, transformations: List[Transformation]
    ) -> None:
        self.base_dataset = base_dataset
        self.transformations = Chain(transformations)

    def __iter__(self) -> Iterator[DataEntry]:
        # The chain is always applied with is_train=True.
        for entry in self.transformations(self.base_dataset, is_train=True):
            yield entry

    def __len__(self):
        # Counts by iterating, so every call runs the transformations over
        # the whole base dataset.
        count = 0
        for _ in self:
            count += 1
        return count
-257
View File
@@ -1,257 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import List
import numpy as np
import pandas as pd
from pts.core.component import validated
from pts.dataset import DataEntry
from pts.feature import TimeFeature
from .split import shift_timestamp
from .transform import SimpleTransformation, MapTransformation
def target_transformation_length(
    target: np.ndarray, pred_length: int, is_train: bool
) -> int:
    """
    Return the length of the time axis a feature must cover.

    Parameters
    ----------
    target
        Target array; its last axis is taken as the time axis.
    pred_length
        Number of additional steps required when not training.
    is_train
        If True only the target length is returned, otherwise
        ``pred_length`` is added to it.

    Returns
    -------
    int
        ``target.shape[-1]`` in training mode, ``target.shape[-1] +
        pred_length`` otherwise.
    """
    extra_steps = 0 if is_train else pred_length
    return target.shape[-1] + extra_steps
class AddObservedValuesIndicator(SimpleTransformation):
    """
    Replaces missing values in a numpy array (NaNs) with a dummy value and adds
    an "observed"-indicator that is ``1`` when values are observed and ``0``
    when values are missing.

    Parameters
    ----------
    target_field
        Field for which missing values will be replaced
    output_field
        Field name to use for the indicator
    dummy_value
        Value to use for replacing missing values.
    convert_nans
        If set to true (default) missing values will be replaced. Otherwise
        they will not be replaced. In any case the indicator is included in the
        result.
    dtype
        Numpy dtype the indicator array is cast to.
    """

    @validated()
    def __init__(
        self,
        target_field: str,
        output_field: str,
        dummy_value: int = 0,
        convert_nans: bool = True,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.dummy_value = dummy_value
        self.target_field = target_field
        self.output_field = output_field
        self.convert_nans = convert_nans
        self.dtype = dtype

    def transform(self, data: DataEntry) -> DataEntry:
        value = data[self.target_field]
        # The NaN mask is computed once, before any replacement, and is used
        # both for the replacement and for the indicator.
        nan_entries = np.isnan(value)

        if self.convert_nans:
            # In-place write: the array stored in the entry is modified.
            value[nan_entries] = self.dummy_value

        data[self.target_field] = value
        # Invert bool array so that missing values are zeros and store as float
        data[self.output_field] = np.invert(nan_entries).astype(self.dtype)
        return data
class AddConstFeature(MapTransformation):
    """
    Adds a dynamic feature that holds the same `const` value at every time
    step. The feature has shape (1, T), where T depends on the length of the
    array in `target_field`.

    If `is_train=True` the feature matrix has the same length as the `target` field.
    If `is_train=False` the feature matrix has length len(target) + pred_length

    Parameters
    ----------
    output_field
        Field name for output.
    target_field
        Field containing the target array. The length of this array will be used.
    pred_length
        Prediction length (this is necessary since
        features have to be available in the future)
    const
        Constant value to use.
    dtype
        Numpy dtype to use for resulting array.
    """

    @validated()
    def __init__(
        self,
        output_field: str,
        target_field: str,
        pred_length: int,
        const: float = 1.0,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.output_field = output_field
        self.target_field = target_field
        self.pred_length = pred_length
        self.const = const
        self.dtype = dtype

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        num_steps = target_transformation_length(
            data[self.target_field], self.pred_length, is_train=is_train
        )
        # Start from a (1, T) block of ones and scale it by the constant.
        ones_block = np.ones(shape=(1, num_steps), dtype=self.dtype)
        data[self.output_field] = self.const * ones_block
        return data
class AddTimeFeatures(MapTransformation):
    """
    Adds a set of time features.

    If `is_train=True` the feature matrix has the same length as the `target` field.
    If `is_train=False` the feature matrix has length len(target) + pred_length

    The features are computed once over a cached date range and sliced per
    entry; the cache is extended whenever an entry falls outside of it.

    Parameters
    ----------
    start_field
        Field with the start time stamp of the time series
    target_field
        Field with the array containing the time series values
    output_field
        Field name for result.
    time_features
        list of time features to use.
    pred_length
        Prediction length
    """

    @validated()
    def __init__(
        self,
        start_field: str,
        target_field: str,
        output_field: str,
        time_features: List[TimeFeature],
        pred_length: int,
    ) -> None:
        self.date_features = time_features
        self.pred_length = pred_length
        self.start_field = start_field
        self.target_field = target_field
        self.output_field = output_field
        # Cache state; everything below is filled in by `_update_cache`.
        self._min_time_point: pd.Timestamp = None
        self._max_time_point: pd.Timestamp = None
        self._full_range_date_features: np.ndarray = None
        # Declared here so the attribute exists before the first cache fill.
        self.full_date_range: pd.DatetimeIndex = None
        # Series indexed by timestamp whose values are positions into
        # `full_date_range` (it is a Series, not a DatetimeIndex).
        self._date_index: pd.Series = None

    def _update_cache(self, start: pd.Timestamp, length: int) -> None:
        """
        Make sure the cached date range covers ``start`` and ``start``
        shifted by ``length`` periods, rebuilding the cache if it does not.
        """
        end = shift_timestamp(start, length)
        if self._min_time_point is not None:
            # Cache hit: the requested window is already covered.
            if self._min_time_point <= start and end <= self._max_time_point:
                return
        if self._min_time_point is None:
            self._min_time_point = start
            self._max_time_point = end
        # Extend the range by 50 periods on each side so that nearby windows
        # do not trigger another rebuild.
        self._min_time_point = min(shift_timestamp(start, -50), self._min_time_point)
        self._max_time_point = max(shift_timestamp(end, 50), self._max_time_point)
        self.full_date_range = pd.date_range(
            self._min_time_point, self._max_time_point, freq=start.freq
        )
        # One row per time feature, evaluated over the whole cached range.
        self._full_range_date_features = (
            np.vstack([feat(self.full_date_range) for feat in self.date_features])
            if self.date_features
            else None
        )
        self._date_index = pd.Series(
            index=self.full_date_range, data=np.arange(len(self.full_date_range)),
        )

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        start = data[self.start_field]
        length = target_transformation_length(
            data[self.target_field], self.pred_length, is_train=is_train
        )
        self._update_cache(start, length)
        # Position of the entry's start inside the cached range.
        i0 = self._date_index[start]
        # Without any time features the output field is set to None.
        features = (
            self._full_range_date_features[..., i0 : i0 + length]
            if self.date_features
            else None
        )
        data[self.output_field] = features
        return data
class AddAgeFeature(MapTransformation):
    """
    Adds an 'age' feature to the data_entry.

    The age feature is small at the first time step of the series and
    increases with every further step.

    If `is_train=True` the age feature has the same length as the `target`
    field.
    If `is_train=False` the age feature has length len(target) + pred_length

    Parameters
    ----------
    target_field
        Field with target values (array) of time series
    output_field
        Field name to use for the output.
    pred_length
        Prediction length
    log_scale
        If set to true the age feature grows logarithmically otherwise linearly
        over time.
    dtype
        Numpy dtype used for the underlying step counter.
    """

    @validated()
    def __init__(
        self,
        target_field: str,
        output_field: str,
        pred_length: int,
        log_scale: bool = True,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.target_field = target_field
        self.feature_name = output_field
        self.pred_length = pred_length
        self.log_scale = log_scale
        self.dtype = dtype
        self._age_feature = np.zeros(0)

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        num_steps = target_transformation_length(
            data[self.target_field], self.pred_length, is_train=is_train
        )
        # 0, 1, ..., num_steps - 1 in the configured dtype.
        steps = np.arange(num_steps, dtype=self.dtype)
        # The log variant is offset by 2 before taking log10.
        age = np.log10(2.0 + steps) if self.log_scale else steps
        data[self.feature_name] = age.reshape((1, num_steps))
        return data
-118
View File
@@ -1,118 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from collections import Counter
from typing import Any, Dict, List
from pts.core.component import validated
from pts.dataset import DataEntry
from .transform import SimpleTransformation, MapTransformation
class RenameFields(SimpleTransformation):
    """
    Renames the fields of a data entry according to a mapping.

    Keys of the mapping that are missing from an entry are ignored.

    Parameters
    ----------
    mapping
        Name mapping `input_name -> output_name`
    """

    @validated()
    def __init__(self, mapping: Dict[str, str]) -> None:
        self.mapping = mapping
        # Two source fields must never be mapped onto the same target name.
        for new_key, count in Counter(mapping.values()).items():
            assert count == 1, f"Mapped key {new_key} occurs multiple time"

    def transform(self, data: DataEntry):
        for key, new_key in self.mapping.items():
            if key in data:
                # Never overwrite a field that already exists.
                assert new_key not in data
                data[new_key] = data.pop(key)
        return data
class RemoveFields(SimpleTransformation):
    """
    Removes the listed fields from a data entry.

    Fields that are not present in the entry are silently ignored.

    Parameters
    ----------
    field_names
        Names of the fields to remove
    """

    @validated()
    def __init__(self, field_names: List[str]) -> None:
        self.field_names = field_names

    def transform(self, data: DataEntry) -> DataEntry:
        for name in self.field_names:
            # pop with a default is a no-op for missing fields
            data.pop(name, None)
        return data
class SetField(SimpleTransformation):
    """
    Writes a fixed value into a field of the data entry, replacing any
    value stored there before.

    Parameters
    ----------
    output_field
        Name of the field that will be set
    value
        Value to be set
    """

    @validated()
    def __init__(self, output_field: str, value: Any) -> None:
        self.value = value
        self.output_field = output_field

    def transform(self, data: DataEntry) -> DataEntry:
        data.update({self.output_field: self.value})
        return data
class SetFieldIfNotPresent(SimpleTransformation):
    """
    Writes a fixed value into a field of the data entry, but only when the
    entry does not contain that field yet.

    Parameters
    ----------
    field
        Name of the field that will be set
    value
        Value to be set
    """

    @validated()
    def __init__(self, field: str, value: Any) -> None:
        self.value = value
        self.output_field = field

    def transform(self, data: DataEntry) -> DataEntry:
        # setdefault leaves an existing entry untouched
        data.setdefault(self.output_field, self.value)
        return data
class SelectFields(MapTransformation):
    """
    Builds a new data entry that contains only the listed fields.

    Every listed field has to exist in the incoming entry.

    Parameters
    ----------
    input_fields
        List of fields to keep.
    """

    @validated()
    def __init__(self, input_fields: List[str]) -> None:
        self.input_fields = input_fields

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        selected = {}
        for name in self.input_fields:
            selected[name] = data[name]
        return selected
-176
View File
@@ -1,176 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from abc import ABC, abstractmethod
import numpy as np
from pts.core.component import validated
from pts.dataset.stat import ScaleHistogram
class InstanceSampler(ABC):
    """
    Interface for objects that select the time indices at which training
    instances are generated.

    An InstanceSampler is called with the time series and the valid index
    bounds a, b and should return a set of indices a <= i <= b.
    """

    @abstractmethod
    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        """
        Select the indices to sample.

        Parameters
        ----------
        ts
            target that should be sampled with shape (dim, seq_len)
        a
            first index of the target that can be sampled
        b
            last index of the target that can be sampled

        Returns
        -------
        np.ndarray
            Selected points to sample
        """
class UniformSplitSampler(InstanceSampler):
    """
    Selects every index in [a, b] independently with the same probability.

    Parameters
    ----------
    p
        Probability of selecting a time point
    """

    def __init__(self, p: float) -> None:
        self.p = p

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        assert a <= b, "First index must be less than or equal to the last index."

        # One uniform draw per candidate index in [a, b].
        draws = np.random.random_sample(b - a + 1)
        selected = np.where(draws < self.p)[0]
        # Positions are relative to the window; shift them back to indices.
        return selected + a
class TestSplitSampler(InstanceSampler):
    """
    Sampler used for prediction. It ignores the series content and always
    returns the upper bound ``b`` as the single split point, i.e. the
    forecast point for the time series.
    """

    def __init__(self) -> None:
        """The sampler has no configuration."""

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        last_point = [b]
        return np.array(last_point)
class ExpectedNumInstanceSampler(InstanceSampler):
    """
    Tracks the running average of the window sizes it has been called with
    and sets the per-point probability so that on average `num_instances`
    training examples are generated per time series.

    Parameters
    ----------
    num_instances
        number of training examples generated per time series on average
    """

    @validated()
    def __init__(self, num_instances: float) -> None:
        self.num_instances = num_instances
        # Running statistics over all calls made so far.
        self.total_length = 0
        self.n = 0

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        # Update the running statistics with the current window first.
        self.n += 1
        self.total_length += b - a + 1

        mean_window = self.total_length / self.n
        uniform = UniformSplitSampler(self.num_instances / mean_window)
        return uniform(ts, a, b)
class BucketInstanceSampler(InstanceSampler):
    """
    This sample can be used when working with a set of time series that have a
    skewed distributions. For instance, if the dataset contains many time series
    with small values and few with large values.

    The probability of sampling from bucket i is the inverse of its number of elements.

    Parameters
    ----------
    scale_histogram
        The histogram of scale for the time series. Here scale is the mean abs
        value of the time series.
    """

    def __init__(self, scale_histogram: ScaleHistogram) -> None:
        # probability of sampling a bucket i is the inverse of its number of
        # elements
        self.scale_histogram = scale_histogram
        # Index lookup table, doubled on demand in __call__.
        self.lookup = np.arange(2 ** 13)

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        """
        Sample indices from the window of ``b - a + 1`` points starting at
        ``a``.

        Each point is kept with probability
        ``1 / scale_histogram.count(ts)``.

        Returns
        -------
        np.ndarray
            The selected indices.
        """
        # Double the lookup table until it is longer than the series.
        while ts.shape[-1] >= len(self.lookup):
            self.lookup = np.arange(2 * len(self.lookup))
        p = 1.0 / self.scale_histogram.count(ts)
        mask = np.random.uniform(low=0.0, high=1.0, size=b - a + 1) < p
        # Map the selected window positions to indices starting at ``a``.
        indices = self.lookup[a : a + len(mask)][mask]
        return indices
class ContinuousTimePointSampler(ABC):
    """
    Base class of the "continuous time" samplers: given a lower and an upper
    bound, such a sampler draws "points" (events) in continuous time from
    the specified interval.
    """

    def __init__(self, num_instances: int) -> None:
        self.num_instances = num_instances

    @abstractmethod
    def __call__(self, a: float, b: float) -> np.ndarray:
        """
        Returns random points in the real interval between :code:`a` and
        :code:`b`.

        Parameters
        ----------
        a
            The lower bound (minimum time value that a sampled point can take)
        b
            Upper bound. Must be greater than a.
        """
        return None
class ContinuousTimeUniformSampler(ContinuousTimePointSampler):
    """
    Draws ``num_instances`` points uniformly at random from the continuous
    interval between :code:`a` and :code:`b`.
    """

    def __call__(self, a: float, b: float) -> np.ndarray:
        assert a <= b, "Interval start time must be before interval end time."

        # Uniform draws on [0, 1), stretched to the interval and shifted.
        unit_draws = np.random.rand(self.num_instances)
        scaled = unit_draws * (b - a)
        return scaled + a
-529
View File
@@ -1,529 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from functools import lru_cache
from typing import Iterator, List, Optional
import numpy as np
import pandas as pd
from pts.core.component import validated
from pts.dataset import DataEntry, FieldName
from .sampler import InstanceSampler, ContinuousTimePointSampler
from .transform import FlatMapTransformation
def shift_timestamp(ts: pd.Timestamp, offset: int) -> pd.Timestamp:
    """
    Shifts ``ts`` by ``offset`` periods of its own frequency.

    Thin wrapper around pandas ``ts + offset`` that delegates to a cached
    helper, which also takes care of the exception handling.
    """
    # The frequency is handed over explicitly so that it takes part in the
    # cache key of the helper.
    freq = ts.freq
    return _shift_timestamp_helper(ts, freq, offset)
@lru_cache(maxsize=10000)
def _shift_timestamp_helper(ts: pd.Timestamp, freq: str, offset: int) -> pd.Timestamp:
"""
We are using this helper function which explicitly uses the frequency as a
parameter, because the frequency is not included in the hash of a time
stamp.
I.e.
pd.Timestamp(x, freq='1D') and pd.Timestamp(x, freq='1min')
hash to the same value.
"""
try:
# this line looks innocent, but can create a date which is out of
# bounds values over year 9999 raise a ValueError
# values over 2262-04-11 raise a pandas OutOfBoundsDatetime
return ts + offset * ts.freq
except (ValueError, pd._libs.OutOfBoundsDatetime) as ex:
raise Exception(ex)
class InstanceSplitter(FlatMapTransformation):
    """
    Cuts training instances out of a data entry by slicing the target and
    the other time series like arrays at the indices chosen by the sampler
    in training mode, or right after the last time point in prediction mode.
    Assumption is that all time like arrays start at the same time point.

    The target and each time_series_field is removed and instead two
    corresponding fields with prefix `past_` and `future_` are included. E.g.

    If the target array is one-dimensional, the resulting instance has shape
    (len_target). In the multi-dimensional case, the instance has shape (dim,
    len_target).

    target -> past_target and future_target

    The transformation also adds a field 'past_is_pad' that indicates whether
    values where padded or not.

    Convention: time axis is always the last axis.

    Parameters
    ----------
    target_field
        field containing the target
    is_pad_field
        output field indicating whether padding happened
    start_field
        field containing the start date of the time series
    forecast_start_field
        output field that will contain the time point where the forecast starts
    train_sampler
        instance sampler that provides sampling indices given a time-series
    past_length
        length of the target seen before making prediction
    future_length
        length of the target that must be predicted
    time_first
        whether to have time series output in (time, dimension) or in
        (dimension, time) layout
    time_series_fields
        fields that contains time-series, they are split in the same interval
        as the target
    pick_incomplete
        whether training examples can be sampled with only a part of
        past_length time-units
        present for the time series. This is useful to train models for
        cold-start. In such case, is_pad_out contains an indicator whether
        data is padded or not.
    """

    @validated()
    def __init__(
        self,
        target_field: str,
        is_pad_field: str,
        start_field: str,
        forecast_start_field: str,
        train_sampler: InstanceSampler,
        past_length: int,
        future_length: int,
        time_first: bool = True,
        time_series_fields: Optional[List[str]] = None,
        pick_incomplete: bool = True,
    ) -> None:
        assert future_length > 0

        self.target_field = target_field
        self.is_pad_field = is_pad_field
        self.start_field = start_field
        self.forecast_start_field = forecast_start_field
        self.train_sampler = train_sampler
        self.past_length = past_length
        self.future_length = future_length
        self.time_first = time_first
        self.ts_fields = time_series_fields if time_series_fields is not None else []
        self.pick_incomplete = pick_incomplete

    def _past(self, col_name):
        return f"past_{col_name}"

    def _future(self, col_name):
        return f"future_{col_name}"

    def flatmap_transform(self, data: DataEntry, is_train: bool) -> Iterator[DataEntry]:
        horizon = self.future_length
        fields_to_slice = self.ts_fields + [self.target_field]
        target = data[self.target_field]
        len_target = target.shape[-1]

        # Without incomplete windows a full past window is required, too.
        first_index = 0 if self.pick_incomplete else self.past_length
        minimum_length = first_index + self.future_length

        if not is_train:
            assert self.pick_incomplete or len_target >= self.past_length
            # Prediction: split right after the last observed point.
            sampled_indices = np.array([len_target], dtype=int)
        elif len_target < minimum_length:
            # We currently cannot handle time series that are
            # too short during training, so we just skip these.
            # If we want to include them we would need to pad and to
            # mask the loss.
            sampled_indices = np.array([], dtype=int)
        else:
            sampled_indices = self.train_sampler(
                target, first_index, len_target - self.future_length
            )

        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            if not self.pick_incomplete:
                assert pad_length == 0, f"pad_length should be zero, got {pad_length}"

            d = data.copy()
            for ts_field in fields_to_slice:
                series = d[ts_field]
                if pad_length > 0:
                    # Fewer than past_length points before i: left-pad with
                    # zeros of the series dtype.
                    pad_block = np.zeros(
                        series.shape[:-1] + (pad_length,), dtype=series.dtype,
                    )
                    past_piece = np.concatenate(
                        [pad_block, series[..., :i]], axis=-1
                    )
                else:
                    # Enough history: take exactly the past_length points
                    # in front of i.
                    past_piece = series[..., i - self.past_length : i]
                future_piece = series[..., i : i + horizon]

                if self.time_first:
                    past_piece = past_piece.transpose()
                    future_piece = future_piece.transpose()

                d[self._past(ts_field)] = past_piece
                d[self._future(ts_field)] = future_piece
                del d[ts_field]

            # 1 marks padded positions; a pad_length of 0 marks nothing.
            pad_indicator = np.zeros(self.past_length)
            pad_indicator[:pad_length] = 1

            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(d[self.start_field], i)
            yield d
class CanonicalInstanceSplitter(FlatMapTransformation):
    """
    Selects instances, by slicing the target and other time series
    like arrays at random points in training mode or at the last time point in
    prediction mode. Assumption is that all time like arrays start at the same
    time point.

    In training mode, the returned instances contain past_`target_field`
    as well as past_`time_series_fields`.

    In prediction mode, one can set `use_prediction_features` to get
    future_`time_series_fields`.

    If the target array is one-dimensional, the `target_field` in the resulting instance has shape
    (`instance_length`). In the multi-dimensional case, the instance has shape (`dim`, `instance_length`),
    where `dim` can also take a value of 1.

    The transformation also adds the field named by `is_pad_field`, which
    indicates for every position of the instance whether it was padded.
    If the series has fewer values than the instance needs, the missing
    values are filled with `pad_value` (default 0).
    In training mode this is done only if `allow_target_padding` is `True`,
    and the length of `target` is smaller than `instance_length`;
    otherwise such series produce no instance.

    Parameters
    ----------
    target_field
        fields that contains time-series
    is_pad_field
        output field indicating whether padding happened
    start_field
        field containing the start date of the time series
    forecast_start_field
        field containing the forecast start date
    instance_sampler
        instance sampler that provides sampling indices given a time-series
    instance_length
        length of the target seen before making prediction
    time_first
        whether to have time series output in (time, dimension) or in
        (dimension, time) layout
    time_series_fields
        fields that contains time-series, they are split in the same interval
        as the target. ``None`` (default) means no additional fields.
    allow_target_padding
        flag to allow padding
    pad_value
        value to be used for padding
    use_prediction_features
        flag to indicate if prediction range features should be returned
    prediction_length
        length of the prediction range, must be set if
        use_prediction_features is True
    """

    def __init__(
        self,
        target_field: str,
        is_pad_field: str,
        start_field: str,
        forecast_start_field: str,
        instance_sampler: InstanceSampler,
        instance_length: int,
        time_first: bool = True,
        time_series_fields: Optional[List[str]] = None,
        allow_target_padding: bool = False,
        pad_value: float = 0.0,
        use_prediction_features: bool = False,
        prediction_length: Optional[int] = None,
    ) -> None:
        self.instance_sampler = instance_sampler
        self.instance_length = instance_length
        self.time_first = time_first
        # A ``None`` default replaces the former ``[]`` default so that no
        # list object is shared between instances.
        self.dynamic_feature_fields = (
            time_series_fields if time_series_fields is not None else []
        )
        self.target_field = target_field
        self.allow_target_padding = allow_target_padding
        self.pad_value = pad_value
        self.is_pad_field = is_pad_field
        self.start_field = start_field
        self.forecast_start_field = forecast_start_field

        assert (
            not use_prediction_features or prediction_length is not None
        ), "You must specify `prediction_length` if `use_prediction_features`"

        self.use_prediction_features = use_prediction_features
        self.prediction_length = prediction_length

    def _past(self, col_name):
        return f"past_{col_name}"

    def _future(self, col_name):
        return f"future_{col_name}"

    def flatmap_transform(self, data: DataEntry, is_train: bool) -> Iterator[DataEntry]:
        ts_fields = self.dynamic_feature_fields + [self.target_field]
        ts_target = data[self.target_field]

        len_target = ts_target.shape[-1]

        if is_train:
            if len_target < self.instance_length:
                sampling_indices = (
                    # Returning [] for all time series will cause this to be in loop forever!
                    [len_target]
                    if self.allow_target_padding
                    else []
                )
            else:
                sampling_indices = self.instance_sampler(
                    ts_target, self.instance_length, len_target
                )
        else:
            # Prediction: a single instance ending at the last time point.
            sampling_indices = [len_target]

        for i in sampling_indices:
            d = data.copy()

            # Number of leading positions that have to be filled in.
            pad_length = max(self.instance_length - i, 0)

            # update start field
            d[self.start_field] = shift_timestamp(
                data[self.start_field], i - self.instance_length
            )

            # set is_pad field
            is_pad = np.zeros(self.instance_length)
            if pad_length > 0:
                is_pad[:pad_length] = 1
            d[self.is_pad_field] = is_pad

            # update time series fields
            for ts_field in ts_fields:
                full_ts = data[ts_field]
                if pad_length > 0:
                    pad_pre = self.pad_value * np.ones(
                        shape=full_ts.shape[:-1] + (pad_length,)
                    )
                    past_ts = np.concatenate([pad_pre, full_ts[..., :i]], axis=-1)
                else:
                    past_ts = full_ts[..., (i - self.instance_length) : i]

                past_ts = past_ts.transpose() if self.time_first else past_ts
                d[self._past(ts_field)] = past_ts

                # Future values are only emitted for the feature fields,
                # never for the target itself.
                if self.use_prediction_features and not is_train:
                    if not ts_field == self.target_field:
                        future_ts = full_ts[..., i : i + self.prediction_length]
                        future_ts = (
                            future_ts.transpose() if self.time_first else future_ts
                        )
                        d[self._future(ts_field)] = future_ts

                del d[ts_field]

            # The start field was moved to the beginning of the instance
            # above, so the forecast starts instance_length steps later.
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], self.instance_length
            )

            yield d
class ContinuousTimeInstanceSplitter(FlatMapTransformation):
    """
    Selects training instances by slicing "intervals" from a continuous-time
    process instantiation. Concretely, the input data is expected to describe an
    instantiation from a point (or jump) process, with the "target"
    identifying inter-arrival times and other features (marks), as described
    in detail below.

    The splitter will then take random points in continuous time from each
    given observation, and return a (variable-length) array of points in
    the past (context) and the future (prediction) intervals.

    The transformation is analogous to its discrete counterpart
    `InstanceSplitter` except that

    - It does not allow "incomplete" records. That is, the past and future
      intervals sampled are always complete
    - Outputs a (T, C) layout.
    - Does not accept `time_series_fields` (i.e., only accepts target fields) as these
      would typically not be available in TPP data.

    The target arrays are expected to have (2, T) layout where the first axis
    corresponds to the (i) interarrival times between consecutive points, in
    order and (ii) integer identifiers of marks (from {0, 1, ..., :code:`num_marks`}).
    The returned arrays will have (T, 2) layout.

    For example, the array below corresponds to a target array where points with timestamps
    0.5, 1.1, and 1.5 were observed belonging to categories (marks) 3, 1 and 0
    respectively: :code:`[[0.5, 0.6, 0.4], [3, 1, 0]]`.

    Parameters
    ----------
    past_interval_length
        length of the interval seen before making prediction
    future_interval_length
        length of the interval that must be predicted
    train_sampler
        instance sampler that provides sampling indices given a time-series
    target_field
        field containing the target
    start_field
        field containing the start date of the point process observation
    end_field
        field containing the end date of the point process observation
    forecast_start_field
        output field that will contain the time point where the forecast starts
    """

    def __init__(
        self,
        past_interval_length: float,
        future_interval_length: float,
        train_sampler: ContinuousTimePointSampler,
        target_field: str = FieldName.TARGET,
        start_field: str = FieldName.START,
        end_field: str = "end",
        forecast_start_field: str = FieldName.FORECAST_START,
    ) -> None:
        assert (
            future_interval_length > 0
        ), "Prediction interval must have length greater than 0."

        self.past_interval_length = past_interval_length
        self.future_interval_length = future_interval_length
        self.train_sampler = train_sampler
        self.target_field = target_field
        self.start_field = start_field
        self.end_field = end_field
        self.forecast_start_field = forecast_start_field

    # noinspection PyMethodMayBeStatic
    def _mask_sorted(self, a: np.ndarray, lb: float, ub: float):
        # Positions in the sorted array ``a`` between the insertion points
        # of ``lb`` and ``ub``.
        first = np.searchsorted(a, lb)
        last = np.searchsorted(a, ub)
        return np.arange(first, last)

    def flatmap_transform(self, data: DataEntry, is_train: bool) -> Iterator[DataEntry]:
        obs_start = data[self.start_field]
        obs_end = data[self.end_field]
        assert obs_start.freq == obs_end.freq

        # Length of the observation, measured in units of the frequency.
        total_interval_length = (obs_end - obs_start) / obs_start.freq.delta

        # sample forecast start times in continuous time
        if not is_train:
            sampling_times = np.array([total_interval_length])
        elif total_interval_length < (
            self.future_interval_length + self.past_interval_length
        ):
            # Too short for a complete past and future interval.
            sampling_times: np.ndarray = np.array([])
        else:
            sampling_times = self.train_sampler(
                self.past_interval_length,
                total_interval_length - self.future_interval_length,
            )

        target = data[self.target_field]
        ia_times = target[0, :]
        marks = target[1:, :]

        # Arrival times relative to the start of the observation.
        ts = np.cumsum(ia_times)
        assert ts[-1] < total_interval_length, (
            "Target interarrival times provided are inconsistent with "
            "start and end timestamps."
        )

        # select field names that will be included in outputs
        dropped = [self.target_field, self.start_field, self.end_field]
        keep_cols = {}
        for k, v in data.items():
            if k not in dropped:
                keep_cols[k] = v

        for future_start in sampling_times:
            r: DataEntry = {}

            past_start = future_start - self.past_interval_length
            future_end = future_start + self.future_interval_length

            assert past_start >= 0

            # Points of the past interval, with inter-arrival times
            # recomputed relative to the start of that interval.
            past_mask = self._mask_sorted(ts, past_start, future_start)
            past_arrivals = ts[past_mask] - past_start
            past_ia_times = np.diff(np.r_[0, past_arrivals])[np.newaxis]

            r[f"past_{self.target_field}"] = np.concatenate(
                [past_ia_times, marks[:, past_mask]], axis=0
            ).transpose()
            r["past_valid_length"] = np.array([len(past_mask)])
            r[self.forecast_start_field] = (
                obs_start + obs_start.freq.delta * future_start
            )

            if is_train:  # include the future only if is_train
                assert future_end <= total_interval_length

                future_mask = self._mask_sorted(ts, future_start, future_end)
                future_arrivals = ts[future_mask] - future_start
                future_ia_times = np.diff(np.r_[0, future_arrivals])[np.newaxis]

                r[f"future_{self.target_field}"] = np.concatenate(
                    [future_ia_times, marks[:, future_mask]], axis=0
                ).transpose()
                r["future_valid_length"] = np.array([len(future_mask)])

            # include other fields
            r.update(keep_cols)

            yield r
+2 -1
View File
@@ -16,7 +16,7 @@ from abc import ABC, abstractmethod
from functools import reduce
from typing import Callable, Iterator, Iterable, List
from pts.core.component import validated
from gluonts.core.component import validated
from pts.dataset import DataEntry
MAX_IDLE_TRANSFORMS = 100
@@ -43,6 +43,7 @@ class Chain(Transformation):
"""
Chain multiple transformations together.
"""
@validated()
def __init__(self, trans: List[Transformation]) -> None:
self.transformations = []
+2 -2
View File
@@ -16,7 +16,8 @@ setup(
zip_safe=True,
python_requires=">=3.6",
install_requires = [
'torch>=1.5.0',
'torch>=1.7.0',
'gluonts>=0.6.4',
'holidays',
'numpy',
'pandas>=1.1',
@@ -24,7 +25,6 @@ setup(
'tqdm',
'pydantic',
'matplotlib',
'python-rapidjson',
'tensorboard',
],
-30
View File
@@ -1,30 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# First-party imports
from pts.dataset import FieldName
def test_dataset_fields():
    """Check the string value of the four feature-related ``FieldName`` constants."""
    # Each row: (expected literal, FieldName constant, failure message).
    checks = [
        (
            "feat_static_cat",
            FieldName.FEAT_STATIC_CAT,
            "Error in the FieldName 'feat_static_cat'.",
        ),
        (
            "feat_static_real",
            FieldName.FEAT_STATIC_REAL,
            "Error in the FieldName 'feat_static_real'.",
        ),
        (
            "feat_dynamic_cat",
            FieldName.FEAT_DYNAMIC_CAT,
            "Error in the FieldName 'feat_dynamic_cat'.",
        ),
        (
            "feat_dynamic_real",
            FieldName.FEAT_DYNAMIC_REAL,
            "Error in the FieldName 'feat_dynamic_real'.",
        ),
    ]
    for expected, actual, message in checks:
        assert expected == actual, message
-129
View File
@@ -1,129 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import numpy as np
# Standard library imports
import pytest
# First-party imports
from pts.dataset import ListDataset, MultivariateGrouper
UNIVARIATE_TS = [
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-07", "target": [5, 6, 7, 8]},
],
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-08", "target": [5, 6, 7, 8]},
],
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-07", "target": [0]},
],
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-01", "target": [0]},
],
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-08", "target": [5, 6, 7, 8]},
],
]
MULTIVARIATE_TS = [
[{"start": "2014-09-07", "target": [[1, 2, 3, 4], [5, 6, 7, 8]]}],
[{"start": "2014-09-07", "target": [[1, 2, 3, 4, 2.5], [6.5, 5, 6, 7, 8]],}],
[{"start": "2014-09-07", "target": [[1, 2, 3, 4], [0, 0, 0, 0]]}],
[
{
"start": "2014-09-01",
"target": [
[2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 1, 2, 3, 4],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
],
}
],
[{"start": "2014-09-07", "target": [[1, 2, 3, 4, 0], [0, 5, 6, 7, 8]]}],
]
TRAIN_FILL_RULE = [np.mean, np.mean, np.mean, np.mean, lambda x: 0.0]
@pytest.mark.parametrize(
    "univariate_ts, multivariate_ts, train_fill_rule",
    zip(UNIVARIATE_TS, MULTIVARIATE_TS, TRAIN_FILL_RULE),
)
def test_multivariate_grouper_train(
    univariate_ts, multivariate_ts, train_fill_rule
) -> None:
    """
    Compare the first entry produced by ``MultivariateGrouper`` on a
    univariate dataset with the first entry of the expected multivariate
    dataset.

    Parameters
    ----------
    univariate_ts
        List of univariate entries (``start`` / ``target``) fed to the grouper.
    multivariate_ts
        List holding the expected multivariate entry.
    train_fill_rule
        Callable passed to the grouper as ``train_fill_rule``.
    """
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts, freq="1D", one_dim_target=False)

    grouper = MultivariateGrouper(train_fill_rule=train_fill_rule)

    # Run the grouper and materialise the reference dataset only once; both
    # assertions below look at the same first entry.
    grouped_entry = list(grouper(univariate_ds))[0]
    expected_entry = list(multivariate_ds)[0]

    assert (grouped_entry["target"] == expected_entry["target"]).all()
    assert grouped_entry["start"] == expected_entry["start"]
UNIVARIATE_TS_TEST = [
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-07", "target": [5, 6, 7, 8]},
{"start": "2014-09-08", "target": [0, 1, 2, 3]},
{"start": "2014-09-08", "target": [4, 5, 6, 7]},
],
[
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
{"start": "2014-09-07", "target": [5, 6, 7, 8]},
{"start": "2014-09-08", "target": [0, 1, 2, 3]},
{"start": "2014-09-08", "target": [4, 5, 6, 7]},
],
]
MULTIVARIATE_TS_TEST = [
[
{"start": "2014-09-07", "target": [[1, 2, 3, 4], [5, 6, 7, 8]]},
{"start": "2014-09-07", "target": [[0, 0, 1, 2, 3], [0, 4, 5, 6, 7]]},
],
[
{"start": "2014-09-07", "target": [[5, 6, 7, 8]]},
{"start": "2014-09-07", "target": [[0, 4, 5, 6, 7]]},
],
]
TEST_FILL_RULE = [lambda x: 0.0, lambda x: 0.0]
MAX_TARGET_DIM = [2, 1]
@pytest.mark.parametrize(
    "univariate_ts, multivariate_ts, test_fill_rule, max_target_dim",
    zip(UNIVARIATE_TS_TEST, MULTIVARIATE_TS_TEST, TEST_FILL_RULE, MAX_TARGET_DIM,),
)
def test_multivariate_grouper_test(
    univariate_ts, multivariate_ts, test_fill_rule, max_target_dim
) -> None:
    """
    Compare every entry produced by ``MultivariateGrouper`` (configured with
    ``num_test_dates=2``) with the corresponding expected multivariate entry.

    Parameters
    ----------
    univariate_ts
        List of univariate entries (``start`` / ``target``) fed to the grouper.
    multivariate_ts
        List of expected multivariate entries.
    test_fill_rule
        Callable passed to the grouper as ``test_fill_rule``.
    max_target_dim
        Value passed to the grouper as ``max_target_dim``.
    """
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts, freq="1D", one_dim_target=False)

    grouper = MultivariateGrouper(
        test_fill_rule=test_fill_rule, num_test_dates=2, max_target_dim=max_target_dim,
    )

    grouped_entries = list(grouper(univariate_ds))
    expected_entries = list(multivariate_ds)

    # zip() stops at the shorter input, so without this check the loop below
    # would pass vacuously if the grouper yielded too few (or no) entries.
    assert len(grouped_entries) == len(expected_entries)

    for grouped_data, multivariate_data in zip(grouped_entries, expected_entries):
        assert (grouped_data["target"] == multivariate_data["target"]).all()
        assert grouped_data["start"] == multivariate_data["start"]
-21
View File
@@ -1,21 +0,0 @@
import pandas as pd
import pytest
from pts.dataset import ProcessStartField
@pytest.mark.parametrize(
    "freq, expected",
    [
        ("B", "2019-11-01"),
        ("W", "2019-11-03"),
        ("M", "2019-11-30"),
        ("12M", "2019-11-30"),
        ("A-DEC", "2019-12-31"),
    ],
)
def test_process_start_field(freq, expected):
    """
    Check that ``ProcessStartField.process`` maps a fixed raw timestamp
    string to the expected ``pd.Timestamp`` for the given frequency.
    """
    raw_start = "2019-11-01 12:34:56"
    processed = ProcessStartField.process(raw_start, freq)
    assert processed == pd.Timestamp(expected, freq)
-340
View File
@@ -1,340 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
import unittest
from typing import cast
# Third-party imports
import numpy as np
import pandas as pd
# First-party imports
from pts.dataset import DataEntry, Dataset
from pts.dataset.stat import (
DatasetStatistics,
ScaleHistogram,
calculate_dataset_statistics,
)
def make_dummy_dynamic_feat(target, num_features) -> np.ndarray:
    """
    Build a dummy dynamic feature array from ``target``.

    Row ``k`` (0-based) of the result is ``target * (k + 1)``; the rows are
    stacked with ``np.vstack``.
    """
    scaled_rows = []
    for multiplier in range(1, num_features + 1):
        scaled_rows.append(target * multiplier)
    return np.vstack(scaled_rows)
# default values for TimeSeries field
start = pd.Timestamp("1985-01-02", freq="1D")
target = np.random.randint(0, 10, 20)
fsc = [0, 1]
fsr = [0.1, 0.2]
def make_time_series(
    start=start,
    target=target,
    feat_static_cat=fsc,
    feat_static_real=fsr,
    num_feat_dynamic_cat=1,
    num_feat_dynamic_real=1,
) -> DataEntry:
    """
    Build a data entry dict with all six fields populated.

    The dynamic features are derived from ``target`` through
    ``make_dummy_dynamic_feat`` (cast to ``int64`` for the categorical ones
    and to ``float`` for the real ones). A feature count that is not
    positive yields ``None`` for the corresponding field.
    """
    if num_feat_dynamic_cat > 0:
        dynamic_cat = make_dummy_dynamic_feat(target, num_feat_dynamic_cat).astype(
            "int64"
        )
    else:
        dynamic_cat = None

    if num_feat_dynamic_real > 0:
        dynamic_real = make_dummy_dynamic_feat(target, num_feat_dynamic_real).astype(
            "float"
        )
    else:
        dynamic_real = None

    return {
        "start": start,
        "target": target,
        "feat_static_cat": feat_static_cat,
        "feat_static_real": feat_static_real,
        "feat_dynamic_cat": dynamic_cat,
        "feat_dynamic_real": dynamic_real,
    }
def ts(
    start,
    target,
    feat_static_cat=None,
    feat_static_real=None,
    feat_dynamic_cat=None,
    feat_dynamic_real=None,
) -> DataEntry:
    """
    Build a data entry dict holding ``start`` and ``target`` plus only those
    feature fields whose argument is not ``None``.
    """
    optional_fields = (
        ("feat_static_cat", feat_static_cat),
        ("feat_static_real", feat_static_real),
        ("feat_dynamic_cat", feat_dynamic_cat),
        ("feat_dynamic_real", feat_dynamic_real),
    )
    entry = {"start": start, "target": target}
    # Identity check on purpose: empty lists/arrays must still be included.
    entry.update(
        (name, value) for name, value in optional_fields if value is not None
    )
    return entry
class DatasetStatisticsTest(unittest.TestCase):
    """Checks of ``calculate_dataset_statistics`` on well-formed datasets."""

    def test_dataset_statistics(self) -> None:
        """Compare the computed statistics with a hand-built ``DatasetStatistics``."""
        num_series = 2
        series_length = 10

        # use integers to avoid float conversion that can fail comparison
        np.random.seed(0)
        targets = np.random.randint(0, 10, (num_series, series_length))

        # The dataset below holds the two random targets plus one empty
        # series, so the expected histogram receives the same three targets.
        scale_histogram = ScaleHistogram()
        for row in targets:
            scale_histogram.add(row)
        scale_histogram.add([])

        expected = DatasetStatistics(
            integer_dataset=True,
            num_time_series=num_series + 1,
            num_time_observations=targets.size,
            mean_target_length=series_length * 2 / 3,
            min_target=targets.min(),
            mean_target=targets.mean(),
            mean_abs_target=targets.mean(),
            max_target=targets.max(),
            feat_static_real=[{0.1}, {0.2, 0.3}],
            feat_static_cat=[{1}, {2, 3}],
            num_feat_dynamic_real=2,
            num_feat_dynamic_cat=2,
            num_missing_values=0,
            scale_histogram=scale_histogram,
        )

        # (target, feat_static_cat, feat_static_real) for each series.
        series_specs = [
            (targets[0, :], [1, 2], [0.1, 0.2]),
            (targets[1, :], [1, 3], [0.1, 0.3]),
            (np.array([]), [1, 3], [0.1, 0.3]),
        ]

        # FIXME: the cast below is a hack to make mypy happy
        timeseries = cast(
            Dataset,
            [
                make_time_series(
                    target=series_target,
                    feat_static_cat=static_cat,
                    feat_static_real=static_real,
                    num_feat_dynamic_cat=2,
                    num_feat_dynamic_real=2,
                )
                for series_target, static_cat, static_real in series_specs
            ],
        )

        found = calculate_dataset_statistics(timeseries)

        assert expected == found

    def test_dataset_histogram(self) -> None:
        """Check the scale histogram bin counts for constant-valued series."""
        # generates 2 ** N - 1 timeseries with constant increasing values
        N = 6
        num_series = 2 ** N - 1
        series_length = 5
        targets = np.ones((num_series, series_length))
        for index in range(num_series):
            # series ``index`` is constant with value ``index``
            targets[index, :] *= index

        # FIXME: the cast below is a hack to make mypy happy
        timeseries = cast(
            Dataset,
            [make_time_series(target=targets[index, :]) for index in range(num_series)],
        )

        found = calculate_dataset_statistics(timeseries)

        hist = found.scale_histogram.bin_counts
        for exponent in range(N):
            assert exponent in hist
            assert hist[exponent] == 2 ** exponent
class DatasetStatisticsExceptions(unittest.TestCase):
    """Checks of the errors raised by ``calculate_dataset_statistics``."""

    def test_dataset_statistics_exceptions(self) -> None:
        """
        Run ``calculate_dataset_statistics`` on a series of malformed
        datasets and check the raised message against the expected regex,
        then confirm that two consistent datasets are accepted.
        """
        # infinite feat_dynamic_{cat,real}
        inf_dynamic_feat = np.full((2, len(target)), np.inf)

        # Each case: (regex the error message must match, offending dataset).
        failing_cases = [
            ("Time series dataset is empty!", []),
            (
                "Only empty time series found in the dataset!",
                [make_time_series(target=np.random.randint(0, 10, 0))],
            ),
            # infinite target (disabled in the original test)
            # (
            #     "Target values have to be finite (e.g., not inf, -inf, "
            #     "or None) and cannot exceed single precision floating "
            #     "point range.",
            #     [make_time_series(target=np.full(20, np.inf))],
            # ),
            # different number of feat_dynamic_{cat, real}
            (
                "Found instances with different number of features in "
                "feat_dynamic_cat, found one with 2 and another with 1.",
                [
                    make_time_series(num_feat_dynamic_cat=2),
                    make_time_series(num_feat_dynamic_cat=1),
                ],
            ),
            (
                "Found instances with different number of features in "
                "feat_dynamic_cat, found one with 0 and another with 1.",
                [
                    make_time_series(num_feat_dynamic_cat=0),
                    make_time_series(num_feat_dynamic_cat=1),
                ],
            ),
            (
                "feat_dynamic_cat was found for some instances but not others.",
                [
                    make_time_series(num_feat_dynamic_cat=1),
                    make_time_series(num_feat_dynamic_cat=0),
                ],
            ),
            (
                "Found instances with different number of features in "
                "feat_dynamic_real, found one with 2 and another with 1.",
                [
                    make_time_series(num_feat_dynamic_real=2),
                    make_time_series(num_feat_dynamic_real=1),
                ],
            ),
            (
                "Found instances with different number of features in "
                "feat_dynamic_real, found one with 0 and another with 1.",
                [
                    make_time_series(num_feat_dynamic_real=0),
                    make_time_series(num_feat_dynamic_real=1),
                ],
            ),
            (
                "feat_dynamic_real was found for some instances but not others.",
                [
                    make_time_series(num_feat_dynamic_real=1),
                    make_time_series(num_feat_dynamic_real=0),
                ],
            ),
            # infinite feat_dynamic_{cat,real}
            (
                "Features values have to be finite and cannot exceed single "
                "precision floating point range.",
                [
                    ts(
                        start,
                        target,
                        feat_dynamic_cat=inf_dynamic_feat,
                        feat_static_cat=[0, 1],
                    )
                ],
            ),
            (
                "Features values have to be finite and cannot exceed single "
                "precision floating point range.",
                [
                    ts(
                        start,
                        target,
                        feat_dynamic_real=inf_dynamic_feat,
                        feat_static_cat=[0, 1],
                    )
                ],
            ),
            # feat_dynamic_{cat, real} different length from target
            (
                "Each feature in feat_dynamic_cat has to have the same length as the "
                "target. Found an instance with feat_dynamic_cat of length 1 and a "
                "target of length 20.",
                [
                    ts(
                        start=start,
                        target=target,
                        feat_static_cat=[0, 1],
                        feat_dynamic_cat=np.ones((1, 1)),
                    )
                ],
            ),
            (
                "Each feature in feat_dynamic_real has to have the same length as the "
                "target. Found an instance with feat_dynamic_real of length 1 and a "
                "target of length 20.",
                [
                    ts(
                        start=start,
                        target=target,
                        feat_static_cat=[0, 1],
                        feat_dynamic_real=np.ones((1, 1)),
                    )
                ],
            ),
            # feat_static_{cat, real} different length
            (
                "Not all feat_static_cat vectors have the same length 2 != 1.",
                [
                    ts(start=start, target=target, feat_static_cat=[0, 1]),
                    ts(start=start, target=target, feat_static_cat=[1]),
                ],
            ),
            (
                "Not all feat_static_real vectors have the same length 2 != 1.",
                [
                    ts(start=start, target=target, feat_static_real=[0, 1]),
                    ts(start=start, target=target, feat_static_real=[1]),
                ],
            ),
        ]

        for expected_regex, dataset in failing_cases:
            with self.assertRaisesRegex(Exception, expected_regex):
                calculate_dataset_statistics(dataset)

        # Datasets whose entries agree on the number of dynamic categorical
        # features must be processed without raising.
        for num_feat_dynamic_cat in (2, 0):
            calculate_dataset_statistics(
                # FIXME: the cast below is a hack to make mypy happy
                cast(
                    Dataset,
                    [
                        make_time_series(num_feat_dynamic_cat=num_feat_dynamic_cat),
                        make_time_series(num_feat_dynamic_cat=num_feat_dynamic_cat),
                    ],
                )
            )
-649
View File
@@ -1,649 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Third-party imports
import numpy as np
import pandas as pd
import pytest
# First-party imports
from pts.evaluation import (
Evaluator,
MultivariateEvaluator,
)
from pts.feature import get_seasonality
from pts.model.forecast import QuantileForecast, SampleForecast
QUANTILES = [str(q / 10.0) for q in range(1, 10)]
def data_iterator(ts):
    """
    Yield the elements of ``ts`` one at a time, in order.

    :param ts: list (or any other iterable) of pd.Series or pd.DataFrame
    :return: generator over the elements of ``ts``
    """
    # Direct iteration replaces the index-based ``range(len(ts))`` loop; it
    # yields the same items for a list and also accepts plain iterators.
    yield from ts
def fcst_iterator(fcst, start_dates, freq):
    """
    Yield one ``SampleForecast`` per entry of ``fcst``.

    :param fcst: list of numpy arrays with the sample paths
    :param start_dates: sequence indexed in step with ``fcst``; entry ``i`` is
        passed as ``start_date`` of the i-th forecast
    :param freq: value passed as ``freq`` to every forecast
    :return: generator of ``SampleForecast`` objects
    """
    # enumerate() replaces the ``range(len(fcst))`` indexing; ``start_dates``
    # is still indexed explicitly so a too-short sequence raises IndexError.
    for position, sample_paths in enumerate(fcst):
        yield SampleForecast(
            samples=sample_paths, start_date=start_dates[position], freq=freq
        )
def iterator(it):
    """
    Wrap ``it`` with ``iter`` so that datasets and forecasts are consumed as
    iterators (counterpart of ``iterable``).

    :param it: any iterable
    :return: it (as iterator)
    """
    as_iterator = iter(it)
    return as_iterator
def iterable(it):
    """
    Materialise ``it`` into a list so that datasets and forecasts are
    consumed as iterables (counterpart of ``iterator``).

    :param it: any iterable
    :return: it (as iterable)
    """
    materialised = [*it]
    return materialised
def naive_forecaster(ts, prediction_length, num_samples=100, target_dim=0):
    """
    Repeat a single observed value as every sample of the forecast.

    :param ts: pandas.Series
    :param prediction_length: number of forecast steps
    :param num_samples: number of sample paths
    :param target_dim: number of axes of target (0: scalar, 1: array, ...)
    :return: np.array with dimension (num_samples, prediction_length)
        followed by the target axes
    """
    # naive prediction: the value just before the last ``prediction_length``
    # observations
    last_observed = ts.values[-(prediction_length + 1)]
    assert len(last_observed.shape) == target_dim
    trailing_axes = (1,) * target_dim
    return np.tile(last_observed, (num_samples, prediction_length) + trailing_axes)
def naive_multivariate_forecaster(ts, prediction_length, num_samples=100):
    """Call ``naive_forecaster`` with ``target_dim=1`` (one target axis)."""
    return naive_forecaster(
        ts=ts,
        prediction_length=prediction_length,
        num_samples=num_samples,
        target_dim=1,
    )
def calculate_metrics(
    timeseries,
    evaluator,
    ts_datastructure,
    has_nans=False,
    forecaster=naive_forecaster,
    input_type=iterator,
):
    """
    Build pandas time series and forecasts from ``timeseries`` and evaluate
    them with ``evaluator``.

    :param timeseries: numpy array whose first axis indexes the series and
        whose second axis indexes the time steps
    :param evaluator: callable invoked as ``evaluator(data, forecasts)``
        returning the pair ``(agg_df, item_df)``
    :param ts_datastructure: constructor called as
        ``ts_datastructure(values, index=index)`` (e.g. ``pd.Series``)
    :param has_nans: if True, positions ``[0, 1]`` and ``[0, 7]`` are set to
        NaN before evaluation
    :param forecaster: callable invoked as
        ``forecaster(series, prediction_length, num_samples)``
    :param input_type: wrapper applied to the data and forecast generators
        (``iterator`` or ``iterable``)
    :return: the pair ``(agg_df, item_df)`` returned by ``evaluator``
    """
    num_timeseries = timeseries.shape[0]
    num_timestamps = timeseries.shape[1]

    if has_nans:
        # Inject the NaNs into a copy: the arrays passed in are module-level
        # fixtures shared between parametrized tests and must not be
        # modified in place.
        timeseries = timeseries.copy()
        timeseries[0, 1] = np.nan
        timeseries[0, 7] = np.nan

    num_samples = 100
    prediction_length = 3
    freq = "1D"

    pd_timeseries = []  # list of pandas.DataFrame
    samples = []  # list of forecast samples
    start_dates = []  # start date of the prediction range
    for i in range(num_timeseries):
        # starting date of each time series - can be different in general
        ts_start_date = pd.Timestamp(year=2018, month=1, day=1, hour=1)
        index = pd.date_range(ts_start_date, periods=num_timestamps, freq=freq)

        series = ts_datastructure(timeseries[i], index=index)
        pd_timeseries.append(series)
        samples.append(forecaster(series, prediction_length, num_samples))
        # The forecast covers the last ``prediction_length`` timestamps;
        # reuse the index built above instead of rebuilding the date range.
        start_dates.append(index[-prediction_length])

    # data iterator
    data_iter = input_type(data_iterator(pd_timeseries))
    fcst_iter = input_type(fcst_iterator(samples, start_dates, freq))

    # evaluate
    agg_df, item_df = evaluator(data_iter, fcst_iter)
    return agg_df, item_df
TIMESERIES_M4 = [
np.array(
[
[
2.943_013,
2.822_251,
4.196_222,
1.328_664,
4.947_390,
3.333_131,
1.479_800,
2.265_094,
3.413_493,
3.497_607,
],
[
-0.126_781_2,
3.057_412_2,
1.901_594_4,
2.772_549_5,
3.312_853_1,
4.411_818_0,
3.709_025_2,
4.322_028,
2.565_359,
3.074_308,
],
[
2.542_998,
2.336_757,
1.417_916,
1.335_139,
2.523_035,
3.645_589,
3.382_819,
2.075_960,
2.643_869,
2.772_456,
],
[
0.315_685_6,
1.892_312_1,
2.476_861_2,
3.511_628_6,
4.384_346_5,
2.960_685_6,
4.897_572_5,
3.280_125,
4.768_556,
4.958_616,
],
[
2.205_877_3,
0.782_759_4,
2.401_420_8,
2.385_643_4,
4.845_818_2,
3.102_322_9,
3.567_723_7,
4.878_143,
3.735_245,
2.218_113,
],
]
),
np.array(
[
[
13.11301,
13.16225,
14.70622,
12.00866,
15.79739,
14.35313,
12.66980,
13.62509,
14.94349,
15.19761,
],
[
10.04322,
13.39741,
12.41159,
13.45255,
14.16285,
15.43182,
14.89903,
15.68203,
14.09536,
14.77431,
],
[
12.71300,
12.67676,
11.92792,
12.01514,
13.37303,
14.66559,
14.57282,
13.43596,
14.17387,
14.47246,
],
[
10.48569,
12.23231,
12.98686,
14.19163,
15.23435,
13.98069,
16.08757,
14.64012,
16.29856,
16.65862,
],
[
12.37588,
11.12276,
12.91142,
13.06564,
15.69582,
14.12232,
14.75772,
16.23814,
15.26524,
13.91811,
],
]
),
]
RES_M4 = [
{
"MASE": 0.816_837_618,
"MAPE": 0.324_517_430_685_928_1,
"sMAPE": 0.326_973_268_4,
"seasonal_error": np.array(
[1.908_101, 1.258_838, 0.63018, 1.238_201, 1.287_771]
),
},
{
"MASE": 0.723_948_2,
"MAPE": 0.063_634_129_851_747_6,
"sMAPE": 0.065_310_85,
"seasonal_error": np.array(
[1.867_847, 1.315_505, 0.602_587_4, 1.351_535, 1.339_179]
),
},
]
@pytest.mark.parametrize("timeseries, res", zip(TIMESERIES_M4, RES_M4))
def test_MASE_sMAPE_M4(timeseries, res):
    """
    Compare MASE, MAPE and sMAPE (relative tolerance 0.001) and the per-item
    seasonal error (summed absolute tolerance 0.001) against the reference
    values in ``res``.
    """
    ts_datastructure = pd.Series
    evaluator = Evaluator(quantiles=QUANTILES)
    agg_df, item_df = calculate_metrics(timeseries, evaluator, ts_datastructure)

    def relative_error(metric):
        # deviation of the aggregate metric relative to its reference value
        reference = res[metric]
        return abs((agg_df[metric] - reference) / reference)

    assert relative_error("MASE") < 0.001, (
        "Scores for the metric MASE do not match: "
        "\nexpected: {} \nobtained: {}".format(res["MASE"], agg_df["MASE"])
    )

    assert relative_error("MAPE") < 0.001, (
        "Scores for the metric MAPE do not match: \nexpected: {} "
        "\nobtained: {}".format(res["MAPE"], agg_df["MAPE"])
    )

    assert relative_error("sMAPE") < 0.001, (
        "Scores for the metric sMAPE do not match: \nexpected: {} "
        "\nobtained: {}".format(res["sMAPE"], agg_df["sMAPE"])
    )

    seasonal_error = item_df["seasonal_error"].values
    total_deviation = sum(abs(seasonal_error - res["seasonal_error"]))
    assert total_deviation < 0.001, (
        "Scores for the metric seasonal_error do not match: \nexpected: {} "
        "\nobtained: {}".format(res["seasonal_error"], item_df["seasonal_error"].values)
    )
TIMESERIES = [
np.ones((5, 10), dtype=np.float64),
np.ones((5, 10), dtype=np.float64),
np.arange(0, 50, dtype=np.float64).reshape(5, 10),
np.arange(0, 50, dtype=np.float64).reshape(5, 10),
np.array([[np.nan] * 10, [1.0] * 10]),
]
RES = [
{
"MSE": 0.0,
"abs_error": 0.0,
"abs_target_sum": 15.0,
"abs_target_mean": 1.0,
"seasonal_error": 0.0,
"MASE": 0.0,
"MAPE": 0.0,
"sMAPE": 0.0,
"MSIS": 0.0,
"RMSE": 0.0,
"NRMSE": 0.0,
"ND": 0.0,
"MAE_Coverage": 0.5,
},
{
"MSE": 0.0,
"abs_error": 0.0,
"abs_target_sum": 14.0,
"abs_target_mean": 1.0,
"seasonal_error": 0.0,
"MASE": 0.0,
"MAPE": 0.0,
"sMAPE": 0.0,
"MSIS": 0.0,
"RMSE": 0.0,
"NRMSE": 0.0,
"ND": 0.0,
"MAE_Coverage": 0.5,
},
{
"MSE": 4.666_666_666_666,
"abs_error": 30.0,
"abs_target_sum": 420.0,
"abs_target_mean": 28.0,
"seasonal_error": 1.0,
"MASE": 2.0,
"MAPE": 0.103_112_211_532_524_85,
"sMAPE": 0.113_254_049_3,
"MSIS": 80.0,
"RMSE": 2.160_246_899_469_286_9,
"NRMSE": 0.077_151_674_981_045_956,
"ND": 0.071_428_571_428_571_42,
"MAE_Coverage": 0.5,
},
{
"MSE": 5.033_333_333_333_3,
"abs_error": 29.0,
"abs_target_sum": 413.0,
"abs_target_mean": 28.1,
"seasonal_error": 1.0,
"MASE": 2.1,
"MAPE": 0.113_032_846_453_159_77,
"sMAPE": 0.125_854_781_903_299_57,
"MSIS": 84.0,
"RMSE": 2.243_509_156_061_845_6,
"NRMSE": 0.079_840_183_489_745_39,
"ND": 0.070_217_917_675_544_79,
"MAE_Coverage": 0.5,
},
{
"MSE": 0.0,
"abs_error": 0.0,
"abs_target_sum": 3.0,
"abs_target_mean": 1.0,
"seasonal_error": 0.0,
"MASE": 0.0,
"MAPE": 0.0,
"sMAPE": 0.0,
"MSIS": 0.0,
"RMSE": 0.0,
"NRMSE": 0.0,
"ND": 0.0,
"MAE_Coverage": 0.5,
},
]
HAS_NANS = [False, True, False, True, True]
INPUT_TYPE = [iterable, iterable, iterator, iterator, iterable]
@pytest.mark.parametrize(
    "timeseries, res, has_nans, input_type", zip(TIMESERIES, RES, HAS_NANS, INPUT_TYPE),
)
def test_metrics(timeseries, res, has_nans, input_type):
    """
    Evaluate with ``Evaluator(num_workers=0)`` and compare every aggregate
    metric that has a reference value in ``res`` (absolute tolerance 0.001).
    """
    agg_metrics, _item_metrics = calculate_metrics(
        timeseries,
        Evaluator(quantiles=QUANTILES, num_workers=0),
        pd.Series,
        has_nans=has_nans,
        input_type=input_type,
    )

    for metric, score in agg_metrics.items():
        # metrics without a reference value are not checked
        if metric not in res:
            continue
        assert abs(score - res[metric]) < 0.001, (
            "Scores for the metric {} do not match: \nexpected: {} "
            "\nobtained: {}".format(metric, res[metric], score)
        )
@pytest.mark.parametrize(
    "timeseries, res, has_nans, input_type", zip(TIMESERIES, RES, HAS_NANS, INPUT_TYPE),
)
def test_metrics_mp(timeseries, res, has_nans, input_type):
    """
    Same comparison as ``test_metrics`` but with ``Evaluator(num_workers=4)``;
    every aggregate metric that has a reference value in ``res`` is compared
    with absolute tolerance 0.001.
    """
    # Default will be multiprocessing evaluator
    agg_metrics, _item_metrics = calculate_metrics(
        timeseries,
        Evaluator(quantiles=QUANTILES, num_workers=4),
        pd.Series,
        has_nans=has_nans,
        input_type=input_type,
    )

    for metric, score in agg_metrics.items():
        # metrics without a reference value are not checked
        if metric not in res:
            continue
        assert abs(score - res[metric]) < 0.001, (
            "Scores for the metric {} do not match: \nexpected: {} "
            "\nobtained: {}".format(metric, res[metric], score)
        )
TIMESERIES_MULTIVARIATE = [
np.ones((5, 10, 2), dtype=np.float64),
np.ones((5, 10, 2), dtype=np.float64),
np.ones((5, 10, 2), dtype=np.float64),
np.stack(
(
np.arange(0, 50, dtype=np.float64).reshape(5, 10),
np.arange(50, 100, dtype=np.float64).reshape(5, 10),
),
axis=2,
),
np.stack(
(
np.arange(0, 50, dtype=np.float64).reshape(5, 10),
np.arange(50, 100, dtype=np.float64).reshape(5, 10),
),
axis=2,
),
np.stack(
(
np.arange(0, 50, dtype=np.float64).reshape(5, 10),
np.arange(50, 100, dtype=np.float64).reshape(5, 10),
),
axis=2,
),
]
RES_MULTIVARIATE = [
{
"MSE": 0.0,
"0_MSE": 0.0,
"1_MSE": 0.0,
"abs_error": 0.0,
"abs_target_sum": 15.0,
"abs_target_mean": 1.0,
"seasonal_error": 0.0,
"MASE": 0.0,
"sMAPE": 0.0,
"MSIS": 0.0,
"RMSE": 0.0,
"NRMSE": 0.0,
"ND": 0.0,
"MAE_Coverage": 0.5,
"m_sum_MSE": 0.0,
},
{
"MSE": 0.0,
"abs_error": 0.0,
"abs_target_sum": 15.0,
"abs_target_mean": 1.0,
"seasonal_error": 0.0,
"MASE": 0.0,
"sMAPE": 0.0,
"MSIS": 0.0,
"RMSE": 0.0,
"NRMSE": 0.0,
"ND": 0.0,
"MAE_Coverage": 0.5,
"m_sum_MSE": 0.0,
},
{
"MSE": 0.0,
"abs_error": 0.0,
"abs_target_sum": 30.0,
"abs_target_mean": 1.0,
"seasonal_error": 0.0,
"MASE": 0.0,
"sMAPE": 0.0,
"MSIS": 0.0,
"RMSE": 0.0,
"NRMSE": 0.0,
"ND": 0.0,
"MAE_Coverage": 0.5,
"m_sum_MSE": 0.0,
},
{
"MSE": 4.666_666_666_666,
"abs_error": 30.0,
"abs_target_sum": 420.0,
"abs_target_mean": 28.0,
"seasonal_error": 1.0,
"MASE": 2.0,
"sMAPE": 0.113_254_049_3,
"MSIS": 80.0,
"RMSE": 2.160_246_899_469_286_9,
"NRMSE": 0.077_151_674_981_045_956,
"ND": 0.071_428_571_428_571_42,
"MAE_Coverage": 0.5,
"m_sum_MSE": 18.666_666_666_666,
},
{
"MSE": 4.666_666_666_666,
"abs_error": 30.0,
"abs_target_sum": 1170.0,
"abs_target_mean": 78.0,
"seasonal_error": 1.0,
"MASE": 2.0,
"sMAPE": 0.026_842_301_756_499_45,
"MSIS": 80.0,
"RMSE": 2.160_246_899_469_286_9,
"NRMSE": 0.027_695_473_070_119_065,
"ND": 0.025_641_025_641_025_64,
"MAE_Coverage": 0.5,
"m_sum_MSE": 18.666_666_666_666,
},
{
"MSE": 4.666_666_666_666,
"abs_error": 60.0,
"abs_target_sum": 1590.0,
"abs_target_mean": 53.0,
"seasonal_error": 1.0,
"MASE": 2.0,
"sMAPE": 0.070_048_175_528_249_73,
"MSIS": 80.0,
"RMSE": 2.160_246_899_469_286_9,
"NRMSE": 0.040_759_375_461_684_65,
"ND": 0.037_735_849_056_603_77,
"MAE_Coverage": 0.5,
"m_sum_MSE": 18.666_666_666_666,
},
]
HAS_NANS_MULTIVARIATE = [False, False, False, False, False, False]
EVAL_DIMS = [[0], [1], [0, 1], [0], [1], None]
INPUT_TYPE = [iterable, iterable, iterator, iterator, iterable, iterator]
@pytest.mark.parametrize(
    "timeseries, res, has_nans, eval_dims, input_type",
    zip(
        TIMESERIES_MULTIVARIATE,
        RES_MULTIVARIATE,
        HAS_NANS_MULTIVARIATE,
        EVAL_DIMS,
        INPUT_TYPE,
    ),
)
def test_metrics_multivariate(timeseries, res, has_nans, eval_dims, input_type):
    """
    Evaluate multivariate series with ``MultivariateEvaluator`` (restricted
    to ``eval_dims``, with a ``sum`` target aggregation) and compare every
    aggregate metric that has a reference value in ``res`` (absolute
    tolerance 0.001).
    """
    multivariate_evaluator = MultivariateEvaluator(
        quantiles=QUANTILES, eval_dims=eval_dims, target_agg_funcs={"sum": np.sum},
    )

    agg_metrics, _item_metrics = calculate_metrics(
        timeseries,
        multivariate_evaluator,
        pd.DataFrame,
        has_nans=has_nans,
        forecaster=naive_multivariate_forecaster,
        input_type=input_type,
    )

    for metric, score in agg_metrics.items():
        # metrics without a reference value are not checked
        if metric not in res:
            continue
        assert abs(score - res[metric]) < 0.001, (
            "Scores for the metric {} do not match: \nexpected: {} "
            "\nobtained: {}".format(metric, res[metric], score)
        )
def test_evaluation_with_QuantileForecast():
    """
    Evaluate a single ``QuantileForecast`` that only provides the 0.5
    quantile and check that the aggregated ``wQuantileLoss[0.5]`` is finite.
    """
    start = "2012-01-11"
    one_cycle = [2.4, 1.0, 3.0, 4.4, 5.5, 4.9]
    target = one_cycle * 11
    ts = pd.Series(
        index=pd.date_range(start=start, freq="1D", periods=len(target)),
        data=target,
    )

    ev = Evaluator(quantiles=("0.1", "0.2", "0.5"))

    median_path = [2.4, 9.0, 3.0, 2.4, 5.5, 4.9] * 10
    quantile_forecast = QuantileForecast(
        start_date=pd.Timestamp("2012-01-11"),
        freq="D",
        forecast_arrays=np.array([median_path]),
        forecast_keys=["0.5"],
    )
    fcst = [quantile_forecast]

    agg_metric, _ = ev(iter([ts]), iter(fcst))

    assert np.isfinite(agg_metric["wQuantileLoss[0.5]"])
@pytest.mark.parametrize(
    "freq, expected_seasonality",
    [
        ("1H", 24),
        ("H", 24),
        ("2H", 12),
        ("3H", 8),
        ("4H", 6),
        ("15H", 1),
        ("5B", 1),
        ("1B", 5),
        ("2W", 1),
        ("3M", 4),
        ("1D", 1),
        ("7D", 1),
        ("8D", 1),
    ],
)
def test_get_seasonality(freq, expected_seasonality):
    """Check ``get_seasonality`` against the expected value for each frequency string."""
    seasonality = get_seasonality(freq)
    assert seasonality == expected_seasonality
-311
View File
@@ -1,311 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from pts.feature import get_lags_for_frequency
# These are the expected lags for common frequencies and corner cases.
# By default all frequencies have the following lags: [1, 2, 3, 4, 5, 6, 7].
# Remaining lags correspond to the same `season` (+/- `delta`) in previous `k` cycles.
expected_lags = {
# (apart from the default lags) centered around each of the last 3 hours (delta = 2)
"min": [
1,
2,
3,
4,
5,
6,
7,
58,
59,
60,
61,
62,
118,
119,
120,
121,
122,
178,
179,
180,
181,
182,
],
# centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1)
"15min": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+ [
95,
96,
97,
191,
192,
193,
287,
288,
289,
383,
384,
385,
479,
480,
481,
575,
576,
577,
671,
672,
673,
],
# centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1) + 3 weeks (delta = 1)
"30min": [1, 2, 3, 4, 5, 6, 7, 8]
+ [
47,
48,
49,
95,
96,
97,
143,
144,
145,
191,
192,
193,
239,
240,
241,
287,
288,
289,
335,
336,
337,
]
+ [671, 672, 673, 1007, 1008, 1009],
# centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1) + last 6 weeks (delta = 1)
"59min": [1, 2, 3, 4, 5, 6, 7]
+ [
23,
24,
25,
47,
48,
49,
72,
73,
74,
96,
97,
98,
121,
122,
123,
145,
146,
147,
169,
170,
171,
]
+ [340, 341, 342, 511, 512, 513, 682, 683, 684, 731, 732, 733],
# centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1) + last 6 weeks (delta = 1)
"61min": [1, 2, 3, 4, 5, 6, 7]
+ [
22,
23,
24,
46,
47,
48,
69,
70,
71,
93,
94,
95,
117,
118,
119,
140,
141,
142,
164,
165,
166,
]
+ [329, 330, 331, 494, 495, 496, 659, 660, 661, 707, 708, 709],
# centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1) + last 6 weeks (delta = 1)
"H": [1, 2, 3, 4, 5, 6, 7]
+ [
23,
24,
25,
47,
48,
49,
71,
72,
73,
95,
96,
97,
119,
120,
121,
143,
144,
145,
167,
168,
169,
]
+ [335, 336, 337, 503, 504, 505, 671, 672, 673, 719, 720, 721],
# centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1) + last 1 month (delta = 1) +
# last 8th and 12th weeks (delta = 0)
"6H": [
1,
2,
3,
4,
5,
6,
7,
8,
9,
11,
12,
13,
15,
16,
17,
19,
20,
21,
23,
24,
25,
27,
28,
29,
]
+ [55, 56, 57, 83, 84, 85, 111, 112, 113]
+ [119, 120, 121]
+ [224, 336],
# centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1) + last 1 month (delta = 1) +
# last 8th and 12th weeks (delta = 0) + last year (delta = 1)
"12H": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+ [27, 28, 29, 41, 42, 43, 55, 56, 57]
+ [59, 60, 61]
+ [112, 168]
+ [727, 728, 729],
# centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1) + last 1 month (delta = 1) +
# last 8th and 12th weeks (delta = 0) + last 3 years (delta = 1)
"23H": [1, 2, 3, 4, 5, 6, 7, 8]
+ [13, 14, 15, 20, 21, 22, 28, 29]
+ [30, 31, 32]
+ [58, 87]
+ [378, 379, 380, 758, 759, 760, 1138, 1139, 1140],
# centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1) + last 1 month (delta = 1) +
# last 8th and 12th weeks (delta = 0) + last 3 years (delta = 1)
"25H": [1, 2, 3, 4, 5, 6, 7]
+ [12, 13, 14, 19, 20, 21, 25, 26, 27]
+ [28, 29]
+ [53, 80]
+ [348, 349, 350, 697, 698, 699, 1047, 1048, 1049],
# centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1) + last 1 month (delta = 1) +
# last 8th and 12th weeks (delta = 0) + last 3 years (delta = 1)
"D": [1, 2, 3, 4, 5, 6, 7, 8]
+ [13, 14, 15, 20, 21, 22, 27, 28, 29]
+ [30, 31]
+ [56, 84]
+ [363, 364, 365, 727, 728, 729, 1091, 1092, 1093],
# centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1) + last 1 month (delta = 1) +
# last 8th and 12th weeks (delta = 0) + last 3 years (delta = 1)
"2D": [1, 2, 3, 4, 5]
+ [6, 7, 8, 9, 10, 11, 13, 14, 15]
+ [16]
+ [28, 42]
+ [181, 182, 183, 363, 364, 365, 545, 546, 547],
# centered around each of the last 3 months (delta = 0) + last 3 years (delta = 1) (assuming 52 weeks per year)
"6D": [1, 2, 3, 4, 5, 6, 7, 9, 14] + [59, 60, 61, 120, 121, 122, 181, 182, 183],
# centered around each of the last 3 months (delta = 0) + last 3 years (delta = 1) (assuming 52 weeks per year)
"W": [1, 2, 3, 4, 5, 6, 7, 8, 12] + [51, 52, 53, 103, 104, 105, 155, 156, 157],
# centered around each of the last 3 months (delta = 0) + last 3 years (delta = 1) (assuming 52 weeks per year)
"8D": [1, 2, 3, 4, 5, 6, 7, 10] + [44, 45, 46, 90, 91, 92, 135, 136, 137],
# centered around each of the last 3 years (delta = 1)
"4W": [1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 25, 26, 27, 38, 39, 40],
# centered around each of the last 3 years (delta = 1)
"3W": [1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 33, 34, 35, 51, 52, 53],
# centered around each of the last 3 years (delta = 1)
"5W": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 19, 20, 21, 30, 31, 32],
# centered around each of the last 3 years (delta = 1)
"M": [1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 23, 24, 25, 35, 36, 37],
# default
"6M": [1, 2, 3, 4, 5, 6, 7],
# default
"12M": [1, 2, 3, 4, 5, 6, 7],
}
# For the default multiple (1)
for freq in ["min", "H", "D", "W", "M"]:
expected_lags["1" + freq] = expected_lags[freq]
# For frequencies that do not have unique form
expected_lags["60min"] = expected_lags["1H"]
expected_lags["24H"] = expected_lags["1D"]
expected_lags["7D"] = expected_lags["1W"]
def test_lags():
    """Compare ``get_lags_for_frequency`` with ``expected_lags`` for every
    frequency string listed below."""
    minute_freqs = ["min", "1min", "15min", "30min", "59min", "60min", "61min"]
    hour_freqs = ["H", "1H", "6H", "12H", "23H", "24H", "25H"]
    day_freqs = ["D", "1D", "2D", "6D", "7D", "8D"]
    week_freqs = ["W", "1W", "3W", "4W", "5W"]
    month_freqs = ["M", "6M", "12M"]

    freq_strs = minute_freqs + hour_freqs + day_freqs + week_freqs + month_freqs
    for freq_str in freq_strs:
        lags = get_lags_for_frequency(freq_str)
        expected = expected_lags[freq_str]
        assert lags == expected, (
            f"lags do not match for the frequency '{freq_str}':\n"
            f"expected: {expected},\nprovided: {lags}"
        )
+1 -1
View File
@@ -17,8 +17,8 @@ from torch.nn.utils import clip_grad_norm_
from torch.optim import SGD
from torch.utils.data import TensorDataset, DataLoader
from gluonts.torch.modules.distribution_output import DistributionOutput
from pts.modules import (
DistributionOutput,
StudentTOutput,
BetaOutput,
NegativeBinomialOutput,
@@ -11,13 +11,14 @@ from torch.nn.utils import clip_grad_norm_
from torch.optim import SGD
from torch.utils.data import TensorDataset, DataLoader
from gluonts.dataset.repository.datasets import get_dataset
from gluonts.evaluation import Evaluator
from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.torch.modules.distribution_output import DistributionOutput
from pts import Trainer
from pts.dataset.repository import get_dataset
from pts.evaluation import make_evaluation_predictions, Evaluator
from pts.model.deepar import DeepAREstimator
from pts.model.simple_feedforward import SimpleFeedForwardEstimator
from pts.modules import (
DistributionOutput,
ImplicitQuantileOutput
)
@@ -172,7 +173,7 @@ def test_training_with_implicit_quantile_output():
)
forecasts = list(forecast_it)
tss = list(ts_it)
evaluator = Evaluator()
evaluator = Evaluator(num_workers=0)
agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(dataset.test))
assert agg_metrics["MSE"] > 0
@@ -220,7 +221,7 @@ def test_instanciation_of_args_proj():
)
forecasts = list(forecast_it)
tss = list(ts_it)
evaluator = Evaluator()
evaluator = Evaluator(num_workers=0)
agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(dataset.test))
assert distr_output.method_calls == 2
-808
View File
@@ -1,808 +0,0 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Standard library imports
from typing import Tuple
# Third-party imports
import numpy as np
import pandas as pd
import pytest
import torch
from pts import transform
# First-party imports
from pts.dataset import (
ProcessStartField,
FieldName,
ListDataset,
DataEntry,
calculate_dataset_statistics,
ScaleHistogram,
)
from pts.feature import time_feature
# Daily frequency string.
FREQ = "1D"
# Value grid consumed by the ``pytest.mark.parametrize`` decorators in this
# module; each key lists the values one test argument is swept over.
TEST_VALUES = {
    "is_train": [True, False],
    # Targets of length 0, 13 and 100. The random ones are drawn once at import
    # time and no seed is set here.
    "target": [np.zeros(0), np.random.rand(13), np.random.rand(100)],
    # Start values as returned by ProcessStartField.process for two frequencies.
    "start": [
        ProcessStartField.process("2012-01-02", freq="1D"),
        ProcessStartField.process("1994-02-19 20:01:02", freq="3D"),
    ],
    "use_prediction_features": [True, False],
    "allow_target_padding": [True, False],
}
def test_align_timestamp():
    """Check the string form of ``ProcessStartField.process`` for a set of
    (timestamp, frequency) pairs."""
    # (input timestamp, frequency, expected string of the processed start)
    cases = [
        ("2012-03-05 09:13:12", "min", "2012-03-05 09:13:00"),
        ("2012-03-05 09:13:12", "2min", "2012-03-05 09:12:00"),
        ("2012-03-05 09:13:12", "H", "2012-03-05 09:00:00"),
        ("2012-03-05 09:13:12", "D", "2012-03-05 00:00:00"),
        ("2012-03-05 09:13:12", "W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:13:12", "4W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:13:12", "M", "2012-03-31 00:00:00"),
        ("2012-03-05 09:13:12", "3M", "2012-03-31 00:00:00"),
        ("2012-03-05 09:13:12", "Y", "2012-12-31 00:00:00"),
        ("2012-03-05 09:14:11", "min", "2012-03-05 09:14:00"),
        ("2012-03-05 09:14:11", "2min", "2012-03-05 09:14:00"),
        ("2012-03-05 09:14:11", "H", "2012-03-05 09:00:00"),
        ("2012-03-05 09:14:11", "D", "2012-03-05 00:00:00"),
        ("2012-03-05 09:14:11", "W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:14:11", "4W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:14:11", "M", "2012-03-31 00:00:00"),
        ("2012-03-05 09:14:11", "3M", "2012-03-31 00:00:00"),
    ]

    # Every case is checked twice in a row; presumably this also covers
    # repeated calls with identical arguments (e.g. a cached path) -- TODO confirm.
    for _ in range(2):
        for date_str, freq, expected in cases:
            aligned = str(ProcessStartField.process(date_str, freq=freq))
            assert aligned == expected
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_AddTimeFeatures(start, target, is_train: bool):
    """Check shape and content of the matrix written by ``AddTimeFeatures``.

    The output field must hold one row per requested time feature and one
    column per time step; the prediction range is only appended when
    ``is_train`` is False.
    """
    pred_length = 13
    t = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="myout",
        pred_length=pred_length,
        time_features=[time_feature.DayOfWeek(), time_feature.DayOfMonth()],
    )

    data = {"start": start, "target": target}
    res = t.map_transform(data, is_train=is_train)
    mat = res["myout"]

    expected_length = len(target) + (0 if is_train else pred_length)
    assert mat.shape == (2, expected_length)

    # Recompute the features on an explicit index to compare row by row.
    tmp_idx = pd.date_range(start=start, freq=start.freq, periods=expected_length)
    # np.all replaces np.alltrue, which is deprecated and no longer exists in
    # NumPy 2.0.
    assert np.all(mat[0] == time_feature.DayOfWeek()(tmp_idx))
    assert np.all(mat[1] == time_feature.DayOfMonth()(tmp_idx))
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_AddTimeFeatures_empty_time_features(start, target, is_train: bool):
    """With an empty ``time_features`` list the output field must be ``None``."""
    add_features = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="myout",
        pred_length=13,
        time_features=[],
    )

    entry = {"start": start, "target": target}
    transformed = add_features.map_transform(entry, is_train=is_train)

    assert transformed["myout"] is None
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_AddAgeFeatures(start, target, is_train: bool):
    """Check that the ``age`` field equals ``log10(2 + step index)`` with
    shape ``(1, expected_length)``."""
    pred_length = 13
    age_transform = transform.AddAgeFeature(
        pred_length=pred_length,
        target_field=FieldName.TARGET,
        output_field="age",
        log_scale=True,
    )

    entry = {"start": start, "target": target}
    out = age_transform.map_transform(entry, is_train=is_train)

    # The prediction range is only counted when not training.
    if is_train:
        expected_length = len(target)
    else:
        expected_length = len(target) + pred_length
    assert out["age"].shape[-1] == expected_length

    expected_age = np.log10(2.0 + np.arange(expected_length))
    assert np.allclose(out["age"], expected_age.reshape((1, expected_length)))
@pytest.mark.parametrize("pick_incomplete", TEST_VALUES["allow_target_padding"])
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_InstanceSplitter(start, target, is_train: bool, pick_incomplete: bool):
    """Check the number of instances produced by ``InstanceSplitter`` and the
    fields and lengths of each of them."""
    train_length = 100
    pred_length = 13
    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.UniformSplitSampler(p=1.0),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["some_time_feature"],
        pick_incomplete=pick_incomplete,
    )

    entry = {
        "start": start,
        "target": target,
        "some_time_feature": np.arange(len(target) + 100),
        "some_other_col": "ABC",
    }

    # Prediction on a too-short series without padding is expected to trip an
    # assertion inside the splitter.
    too_short_for_prediction = (
        not is_train and not pick_incomplete and len(target) < train_length
    )
    if too_short_for_prediction:
        with pytest.raises(AssertionError):
            list(splitter.flatmap_transform(entry, is_train=is_train))
        return

    out = list(splitter.flatmap_transform(entry, is_train=is_train))

    if is_train:
        required_history = 0 if pick_incomplete else train_length
        expected_count = max(0, len(target) - pred_length + 1 - required_history)
    else:
        expected_count = 1
    assert len(out) == expected_count

    # The future target is empty at prediction time.
    future_target_length = pred_length if is_train else 0
    for o in out:
        assert "target" not in o
        assert "some_time_feature" not in o
        assert "some_other_col" in o

        assert len(o["past_some_time_feature"]) == train_length
        assert len(o["past_target"]) == train_length

        assert len(o["future_target"]) == future_target_length
        assert len(o["future_some_time_feature"]) == pred_length
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
@pytest.mark.parametrize(
    "use_prediction_features", TEST_VALUES["use_prediction_features"]
)
@pytest.mark.parametrize("allow_target_padding", TEST_VALUES["allow_target_padding"])
def test_CanonicalInstanceSplitter(
    start,
    target,
    is_train: bool,
    use_prediction_features: bool,
    allow_target_padding: bool,
):
    """Check the number of instances produced by ``CanonicalInstanceSplitter``
    and the fields and lengths of each of them."""
    train_length = 100
    pred_length = 13
    splitter = transform.CanonicalInstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=transform.UniformSplitSampler(p=1.0),
        instance_length=train_length,
        prediction_length=pred_length,
        time_series_fields=["some_time_feature"],
        allow_target_padding=allow_target_padding,
        use_prediction_features=use_prediction_features,
    )

    entry = {
        "start": start,
        "target": target,
        "some_time_feature": np.arange(len(target) + 100),
        "some_other_col": "ABC",
    }
    out = list(splitter.flatmap_transform(entry, is_train=is_train))

    if is_train:
        # With padding allowed at least one instance is expected.
        min_num_instances = 1 if allow_target_padding else 0
        expected_count = max(min_num_instances, len(target) - train_length + 1)
    else:
        expected_count = 1
    assert len(out) == expected_count

    expects_future_features = use_prediction_features and not is_train
    for o in out:
        assert "target" not in o
        assert "future_target" not in o
        assert "some_time_feature" not in o
        assert "some_other_col" in o

        assert len(o["past_some_time_feature"]) == train_length
        assert len(o["past_target"]) == train_length

        if expects_future_features:
            assert len(o["future_some_time_feature"]) == pred_length
def test_Transformation():
    """Smoke test: run a five-step transformation chain over a one-series
    dataset in training mode and print every produced entry.

    There are no assertions; the test only fails if the chain raises.
    """
    train_length = 100
    ds = ListDataset(
        [{"start": "2012-01-01", "target": [0.2] * train_length}], freq="1D"
    )
    pred_length = 10

    add_time = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=FieldName.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=FieldName.TARGET, output_field="observed_values"
    )
    stack_features = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    split = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
    )
    chain = transform.Chain(
        trans=[add_time, add_age, add_observed, stack_features, split]
    )

    for entry in chain(iter(ds), is_train=True):
        print(entry)
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
def test_multi_dim_transformation(is_train):
    """Run the transformation chain on a two-dimensional target containing
    missing values and check shapes and unpadded content of the result."""
    train_length = 10

    first_dim: list = list(np.arange(1, 11, 1))
    first_dim[-1] = "NaN"

    second_dim: list = list(np.arange(11, 21, 1))
    second_dim[0] = "NaN"

    ds = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": [first_dim, second_dim]}],
        freq="1D",
        one_dim_target=False,
    )
    pred_length = 2

    # Looks weird - but this is necessary to assert the nan entries correctly.
    first_dim[-1] = np.nan
    second_dim[0] = np.nan

    add_time = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=FieldName.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=FieldName.TARGET,
        output_field="observed_values",
        convert_nans=False,
    )
    stack_features = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    split = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
        time_first=False,
    )
    chain = transform.Chain(
        trans=[add_time, add_age, add_observed, stack_features, split]
    )

    # Training and prediction only differ in the expected future target shape.
    expected_future_shape = (2, 2) if is_train else (2, 0)
    expected_observed = np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9])
    expected_target = np.array([first_dim, second_dim])

    for u in chain(iter(ds), is_train=is_train):
        assert_shape(u["past_target"], (2, 10))
        assert_shape(u["past_dynamic_feat"], (4, 10))
        assert_shape(u["past_observed_values"], (2, 10))
        assert_shape(u["future_target"], expected_future_shape)

        assert_padded_array(
            u["past_observed_values"], expected_observed, u["past_is_pad"]
        )
        assert_padded_array(u["past_target"], expected_target, u["past_is_pad"])
def test_ExpectedNumInstanceSampler():
    """Histogram the scales of instances sampled with
    ``ExpectedNumInstanceSampler`` and compare with the expected bin counts."""
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        pick_incomplete=True,
    )
    chain = transform.Chain(trans=[splitter])

    scale_hist = ScaleHistogram()
    repetition = 2
    for _ in range(repetition):
        for instance in chain(iter(ds), is_train=True):
            past_target = instance["past_target"]
            # for simplicity, discard values that are zeros to avoid confusion
            # with padding
            scale_hist.add(past_target[past_target > 0])

    expected_values = {bin_idx: 2 ** bin_idx * repetition for bin_idx in range(1, N)}
    assert expected_values == scale_hist.bin_counts
def test_BucketInstanceSampler():
    """Histogram the scales of instances sampled with ``BucketInstanceSampler``
    and check that each bin count is within 30% of ``repetition``."""
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    dataset_stats = calculate_dataset_statistics(ds)

    t = transform.Chain(
        trans=[
            transform.InstanceSplitter(
                target_field=FieldName.TARGET,
                is_pad_field=FieldName.IS_PAD,
                start_field=FieldName.START,
                forecast_start_field=FieldName.FORECAST_START,
                train_sampler=transform.BucketInstanceSampler(
                    dataset_stats.scale_histogram
                ),
                past_length=train_length,
                future_length=pred_length,
                pick_incomplete=True,
            )
        ]
    )

    scale_hist = ScaleHistogram()

    repetition = 200
    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            target_values = data["past_target"]
            # for simplicity, discard values that are zeros to avoid confusion
            # with padding
            target_values = target_values[target_values > 0]
            scale_hist.add(target_values)

    expected_values = {i: repetition for i in range(1, N)}
    found_values = scale_hist.bin_counts

    for i in range(1, N):
        # abs() must wrap the difference only. It previously wrapped the whole
        # comparison, so the check reduced to abs(<bool>) and over-sampled
        # bins were never detected.
        deviation = abs(expected_values[i] - found_values[i])
        assert deviation < expected_values[i] * 0.3
def test_cdf_to_gaussian_transformation():
    """Round-trip check: the output of ``CDFtoGaussianTransform``, mapped back
    with ``cdf_to_gaussian_forward_transform``, should equal the target."""
    values = np.array(
        [0, 0, 0, 0, 10, 10, 20, 20, 30, 30, 40, 50, 59, 60, 60, 70, 80, 90, 100]
    ).tolist()
    np.random.shuffle(values)
    length = len(values)

    multi_dim_target = np.array([values, values]).transpose()
    past_is_pad = np.array([[0] * length]).transpose()
    past_observed_target = np.array([[1] * length, [1] * length]).transpose()

    # Mimic output from InstanceSplitter
    entry = {
        "start": "2012-01-01",
        "target": multi_dim_target,
        "past_target": multi_dim_target,
        "future_target": multi_dim_target,
        "past_is_pad": past_is_pad,
        f"past_{FieldName.OBSERVED_VALUES}": past_observed_target,
    }
    ds = ListDataset(data_iter=[entry], freq="1D", one_dim_target=False)

    cdf_transform = transform.CDFtoGaussianTransform(
        target_field=FieldName.TARGET,
        observed_values_field=FieldName.OBSERVED_VALUES,
        max_context_length=20,
        target_dim=2,
    )
    chain = transform.Chain(trans=[cdf_transform])

    for u in chain(iter(ds), is_train=False):
        # Two leading singleton axes are added to the transformed past target.
        fake_output = np.expand_dims(
            np.expand_dims(u["past_target_cdf"], axis=0), axis=0
        )

        # Fake transformation chain output
        for key in ("past_target_sorted", "slopes", "intercepts"):
            u[key] = torch.tensor(np.expand_dims(u[key], axis=0))

        back_transformed = transform.cdf_to_gaussian_forward_transform(u, fake_output)

        # Get any sample/batch (slopes[i][:, d]they are all the same)
        first_sample = back_transformed[0][0]

        # Original target and back-transformed target should be the same
        assert np.allclose(u["target"], first_sample)
def test_gaussian_cdf():
    """Compare ``CDFtoGaussianTransform.standard_gaussian_cdf`` with
    ``scipy.stats.norm.cdf`` on a grid that includes extreme values."""
    try:
        from scipy.stats import norm
    except ImportError:
        # Only a missing scipy should skip the test; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        pytest.skip("scipy not installed skipping test for erf")

    x = np.array(
        [-1000, -100, -10] + np.linspace(-2, 2, 1001).tolist() + [10, 100, 1000]
    )
    y_gluonts = transform.CDFtoGaussianTransform.standard_gaussian_cdf(x)
    y_scipy = norm.cdf(x)

    assert np.allclose(y_gluonts, y_scipy, atol=1e-7)
def test_gaussian_ppf():
    """Compare ``CDFtoGaussianTransform.standard_gaussian_ppf`` with
    ``scipy.stats.norm.ppf`` on 1001 points between 0.0001 and 0.9999."""
    try:
        from scipy.stats import norm
    except ImportError:
        # Only a missing scipy should skip the test; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        pytest.skip("scipy not installed skipping test for erf")

    x = np.linspace(0.0001, 0.9999, 1001)
    y_gluonts = transform.CDFtoGaussianTransform.standard_gaussian_ppf(x)
    y_scipy = norm.ppf(x)

    assert np.allclose(y_gluonts, y_scipy, atol=1e-7)
def test_target_dim_indicator():
    """For a four-row target, ``TargetDimIndicator`` must write
    ``[0, 1, 2, 3]`` into its output field."""
    row = np.array([0, 2, 3, 10]).tolist()
    multi_dim_target = np.array([row, row, row, row])

    dataset = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": multi_dim_target}],
        freq="1D",
        one_dim_target=False,
    )

    indicator = transform.TargetDimIndicator(
        target_field=FieldName.TARGET, field_name="target_dimensions"
    )
    chain = transform.Chain(trans=[indicator])

    expected_dimensions = np.array([0, 1, 2, 3])
    for data_entry in chain(dataset, is_train=True):
        assert (data_entry["target_dimensions"] == expected_dimensions).all()
@pytest.fixture
def point_process_dataset():
    """Fixture: a one-entry ``ListDataset`` whose two-row target stacks eight
    inter-arrival times on top of their marks."""
    ia_times = np.array([0.2, 0.7, 0.2, 0.5, 0.3, 0.3, 0.2, 0.1])
    marks = np.array([0, 1, 2, 0, 1, 2, 2, 2])

    entry = {
        # Row 0: inter-arrival times, row 1: marks.
        "target": np.c_[ia_times, marks].T,
        "start": pd.Timestamp("2011-01-01 00:00:00", freq="H"),
        "end": pd.Timestamp("2011-01-01 03:00:00", freq="H"),
    }
    return ListDataset([entry], freq="H", one_dim_target=False)
class MockContinuousTimeSampler(transform.ContinuousTimePointSampler):
    """Sampler stub that ignores its call arguments and always returns the
    values it was constructed with."""

    # noinspection PyMissingConstructor,PyUnusedLocal
    def __init__(self, ret_values, *args, **kwargs):
        # The parent constructor is not called; only the canned values are kept.
        self._ret_values = ret_values

    def __call__(self, *args, **kwargs):
        # A new array is built from the stored values on every call.
        return np.array(self._ret_values)
def test_ctsplitter_mask_sorted(point_process_dataset):
    """Check the indices returned by
    ``ContinuousTimeInstanceSplitter._mask_sorted`` for the bounds (1, 2)."""
    d = next(iter(point_process_dataset))
    ia_times = d["target"][0, :]

    # Arrival times are the running sum of the inter-arrival times.
    ts = np.cumsum(ia_times)

    splitter = transform.ContinuousTimeInstanceSplitter(
        2, 1, train_sampler=transform.ContinuousTimeUniformSampler(num_instances=10),
    )

    # The whole result is compared: the former ``all(zip(...))`` form stopped
    # at the shorter sequence, so an empty or too-long result also passed.

    # no boundary conditions
    res = splitter._mask_sorted(ts, 1, 2)
    assert np.array_equal(res, [2, 3, 4])

    # lower bound equal, exclusive of upper bound
    res = splitter._mask_sorted(np.array([1, 2, 3, 4, 5, 6]), 1, 2)
    assert np.array_equal(res, [0])
def test_ctsplitter_no_train_last_point(point_process_dataset):
    """With ``is_train=False`` the splitter must emit only past fields, with
    the expected valid length and inter-arrival times."""
    sampler = transform.ContinuousTimeUniformSampler(num_instances=10)
    splitter = transform.ContinuousTimeInstanceSplitter(2, 1, train_sampler=sampler)

    entries = splitter(point_process_dataset, is_train=False)
    d_out = next(iter(entries))

    for absent_field in ("future_target", "future_valid_length"):
        assert absent_field not in d_out
    for present_field in ("past_target", "past_valid_length"):
        assert present_field in d_out

    assert d_out["past_valid_length"] == 6

    expected_ia_times = [0.1, 0.5, 0.3, 0.3, 0.2, 0.1]
    assert np.allclose(expected_ia_times, d_out["past_target"][..., 0], atol=0.01)
def test_ctsplitter_train_correct(point_process_dataset):
    """With fixed sampling points 1.01, 1.5 and 1.99, check the valid lengths
    of all three training instances and the targets of the first one."""
    sampler = MockContinuousTimeSampler(
        ret_values=[1.01, 1.5, 1.99], num_instances=3
    )
    splitter = transform.ContinuousTimeInstanceSplitter(1, 1, train_sampler=sampler)

    outputs = list(splitter(point_process_dataset, is_train=True))

    first = outputs[0]
    assert first["past_valid_length"] == 2
    assert first["future_valid_length"] == 3

    expected_past = np.array([[0.19, 0.7], [0, 1]]).T
    expected_future = np.array([[0.09, 0.5, 0.3], [2, 0, 1]]).T
    assert np.allclose(first["past_target"], expected_past)
    assert np.allclose(first["future_target"], expected_future)

    assert outputs[1]["past_valid_length"] == 2
    assert outputs[1]["future_valid_length"] == 4

    assert outputs[2]["past_valid_length"] == 3
    assert outputs[2]["future_valid_length"] == 3
def test_ctsplitter_train_correct_out_count(point_process_dataset):
    """Five shuffled copies of the entry with three sampling points each must
    yield fifteen training instances."""

    # produce new TPP data by shuffling existing TS instance
    def shuffle_iterator(num_duplications=5):
        for entry in point_process_dataset:
            for _ in range(num_duplications):
                shuffled = dict.copy(entry)
                # Permute whole columns of the target.
                shuffled["target"] = np.random.permutation(shuffled["target"].T).T
                yield shuffled

    sampler = MockContinuousTimeSampler(
        ret_values=[1.01, 1.5, 1.99], num_instances=3
    )
    splitter = transform.ContinuousTimeInstanceSplitter(1, 1, train_sampler=sampler)

    outputs = list(splitter(shuffle_iterator(), is_train=True))

    assert len(outputs) == 5 * 3
def test_ctsplitter_train_samples_correct_times(point_process_dataset):
    """Every sampled ``forecast_start`` must lie between 01:15 and 01:45 on
    2011-01-01 (both inclusive)."""
    splitter = transform.ContinuousTimeInstanceSplitter(
        1.25, 1.25, train_sampler=transform.ContinuousTimeUniformSampler(20)
    )
    iter_de = splitter(point_process_dataset, is_train=True)

    earliest = pd.Timestamp("2011-01-01 01:15:00")
    latest = pd.Timestamp("2011-01-01 01:45:00")

    # Materialise all checks first so the iterator is fully consumed.
    in_window = [earliest <= d["forecast_start"] <= latest for d in iter_de]
    assert all(in_window)
def test_ctsplitter_train_short_intervals(point_process_dataset):
    """With past and future intervals of 0.01 every training instance must
    have zero valid length and empty targets."""
    sampler = MockContinuousTimeSampler(
        ret_values=[1.01, 1.5, 1.99], num_instances=3
    )
    splitter = transform.ContinuousTimeInstanceSplitter(
        0.01, 0.01, train_sampler=sampler
    )

    for d in splitter(point_process_dataset, is_train=True):
        assert d["future_valid_length"] == d["past_valid_length"] == 0
        for field in ("past_target", "future_target"):
            # An empty array has at least one zero-length axis.
            assert np.prod(np.shape(d[field])) == 0
def make_dataset(N, train_length):
    """Build a daily ``ListDataset`` of ``2 ** N - 1`` constant series.

    Series ``i`` has length ``train_length`` and holds the value ``i`` at
    every step, so the values increase from one series to the next.
    """
    n = 2 ** N - 1
    # Row i is filled with the constant i (float64, via broadcasting).
    targets = np.ones((n, train_length)) * np.arange(n)[:, np.newaxis]
    entries = [{"start": "2012-01-01", "target": row} for row in targets]
    return ListDataset(data_iter=entries, freq="1D")
def assert_shape(array: np.ndarray, reference_shape: Tuple[int, ...]):
    """Assert that ``array.shape`` equals ``reference_shape``.

    Parameters
    ----------
    array
        Array whose shape is checked.
    reference_shape
        Expected shape, of any rank.

    Raises
    ------
    AssertionError
        If the shapes differ; the message names both shapes.
    """
    assert (
        array.shape == reference_shape
    ), f"Shape should be {reference_shape} but found {array.shape}."
def assert_padded_array(
    sampled_array: np.ndarray, reference_array: np.ndarray, padding_array: np.ndarray
):
    """Assert that the unpadded part of ``sampled_array`` matches the reference.

    The number of padded columns is the sum of ``padding_array``. That many
    leading columns are dropped from ``sampled_array``; the reference is
    rolled right by the same amount along axis 1 and trimmed the same way, so
    its leading columns are compared with the remaining sampled columns.
    NaNs at matching positions count as equal. Neither input array is modified.

    Parameters
    ----------
    sampled_array
        Two-dimensional array whose leading columns may be padding.
    reference_array
        Two-dimensional array holding the expected unpadded values.
    padding_array
        Padding indicator; its sum is taken as the number of padded columns.

    Raises
    ------
    AssertionError
        If the compared parts differ.
    """
    num_padded = int(np.sum(padding_array))

    # Copy the slice: it is a view, and the NaN substitution below would
    # otherwise write 9999.0 into the caller's array.
    sampled_no_padding = sampled_array[:, num_padded:].copy()

    # np.roll returns a new array, so the reference can be edited safely.
    reference_array = np.roll(reference_array, num_padded, axis=1)
    reference_no_padding = reference_array[:, num_padded:]

    # Convert nans to dummy value for assertion because
    # np.nan == np.nan -> False.
    reference_no_padding[np.isnan(reference_no_padding)] = 9999.0
    sampled_no_padding[np.isnan(sampled_no_padding)] = 9999.0

    reference_no_padding = np.array(reference_no_padding, dtype=np.float32)

    assert (sampled_no_padding == reference_no_padding).all(), (
        "Sampled and reference arrays do not match. "
        f"Got {sampled_no_padding} but should be {reference_no_padding}."
    )