mirror of
https://github.com/wassname/pytorch-ts.git
synced 2026-06-27 17:49:41 +08:00
initial gluonts dependency
This commit is contained in:
+46
-152
File diff suppressed because one or more lines are too long
+1
-2
@@ -2,7 +2,6 @@ from pkgutil import extend_path
|
||||
|
||||
from pkg_resources import get_distribution, DistributionNotFound
|
||||
|
||||
from .exception import assert_pts
|
||||
from .trainer import Trainer
|
||||
|
||||
__path__ = extend_path(__path__, __name__) # type: ignore
|
||||
@@ -10,4 +9,4 @@ __path__ = extend_path(__path__, __name__) # type: ignore
|
||||
# Look up the version of the installed distribution registered under this
# package's name; use a placeholder when pkg_resources cannot find one.
try:
    __version__ = get_distribution(__name__).version
except DistributionNotFound:
    __version__ = "0.0.0-unknown"
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
# Relative imports
|
||||
from ._base import fqname_for
|
||||
|
||||
# Names re-exported by this package.
__all__ = ["fqname_for"]

# fix Sphinx issues, see https://bit.ly/2K2eptM
# NOTE(review): ``__all__`` holds names (strings), so ``item`` below is a
# ``str`` and not the exported object itself -- confirm whether the intent
# was to patch ``__module__`` on the objects those names refer to.
for item in __all__:
    if hasattr(item, "__module__"):
        setattr(item, "__module__", __name__)
|
||||
@@ -1,29 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
def fqname_for(cls: type) -> str:
    """
    Build the dotted, fully qualified name of a class.

    Parameters
    ----------
    cls
        The class whose name is requested.

    Returns
    -------
    str
        ``cls.__module__`` and ``cls.__qualname__`` joined by a dot.
    """
    module_name = cls.__module__
    qualified_name = cls.__qualname__
    return "{}.{}".format(module_name, qualified_name)
|
||||
@@ -1,171 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import functools
|
||||
import inspect
|
||||
from collections import OrderedDict
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from pydantic import BaseConfig, BaseModel, create_model
|
||||
|
||||
from pts.core.serde import dump_code
|
||||
|
||||
|
||||
class BaseValidatedInitializerModel(BaseModel):
    """
    Pydantic base model used for components whose initializer is decorated
    with :func:`validated`.

    See Also
    --------
    validated
        Decorates an initializer method with argument validation logic.
    """

    class Config(BaseConfig):
        """
        `Config <https://pydantic-docs.helpmanual.io/#model-config>`_ shared
        by the Pydantic models synthesized for :func:`validated`
        initializers.

        Permits arbitrary type annotations on initializer parameters.
        """

        arbitrary_types_allowed = True
|
||||
|
||||
|
||||
def validated(base_model=None):
    """
    Decorates an ``__init__`` method with typed parameters with validation
    and auto-conversion logic.

    >>> class ComplexNumber:
    ...     @validated()
    ...     def __init__(self, x: float = 0.0, y: float = 0.0) -> None:
    ...         self.x = x
    ...         self.y = y

    Classes with decorated initializers can be instantiated using arguments
    of another type (e.g. a ``y`` argument of type ``str``). The decorator
    handles the type conversion logic.

    >>> c = ComplexNumber(y='42')
    >>> (c.x, c.y)
    (0.0, 42.0)

    If the bound argument cannot be converted, the decorator throws an error.

    >>> c = ComplexNumber(y=None)
    Traceback (most recent call last):
        ...
    pydantic.error_wrappers.ValidationError: 1 validation error for ComplexNumberModel
    y
      none is not an allowed value (type=type_error.none.not_allowed)

    Internally, the decorator delegates all validation and conversion logic
    to `a Pydantic model <https://pydantic-docs.helpmanual.io/>`_, which can
    be accessed through the ``Model`` attribute of the decorated initializer.

    >>> ComplexNumber.__init__.Model
    <class 'ComplexNumberModel'>

    The Pydantic model is synthesized automatically from the parameter names
    and types of the decorated initializer. In the ``ComplexNumber`` example,
    the synthesized Pydantic model corresponds to the following definition.

    >>> class ComplexNumberModel(BaseValidatedInitializerModel):
    ...     x: float = 0.0
    ...     y: float = 0.0

    Clients can optionally customize the base class of the synthesized
    Pydantic model using the ``base_model`` decorator parameter. The default
    behavior uses :class:`BaseValidatedInitializerModel` and its
    `model config <https://pydantic-docs.helpmanual.io/#config>`_.

    As a side effect of the first decorated initializer call on an instance,
    the wrapper stores the merged arguments in ``__init_args__`` and installs
    ``__getnewargs_ex__`` and ``__repr__`` on the instance's class.

    See Also
    --------
    BaseValidatedInitializerModel
        Default base class for all synthesized Pydantic models.
    """

    def validator(init):
        # The first component of the qualified name, e.g. "ComplexNumber" for
        # "ComplexNumber.__init__", names the synthesized model.
        owner_name = init.__qualname__.split(".")[0]
        model_name = f"{owner_name}Model"
        signature_params = inspect.signature(init).parameters

        # One (annotation, default) pair per positional-or-keyword parameter;
        # missing annotations become ``Any`` and missing defaults ``...``.
        field_specs = {}
        for param in signature_params.values():
            if param.name == "self":
                continue
            if param.kind != inspect.Parameter.POSITIONAL_OR_KEYWORD:
                continue
            field_specs[param.name] = (
                param.annotation
                if param.annotation != inspect.Parameter.empty
                else Any,
                param.default if param.default != inspect.Parameter.empty else ...,
            )

        if base_model is not None:
            PydanticModel = create_model(
                model_name, __base__=base_model, **field_specs,
            )
        else:
            PydanticModel = create_model(
                model_name,
                __config__=BaseValidatedInitializerModel.Config,
                **field_specs,
            )

        def validated_repr(self) -> str:
            return dump_code(self)

        def validated_getnewargs_ex(self):
            return (), self.__init_args__

        @functools.wraps(init)
        def init_wrapper(*args, **kwargs):
            self, *positional = args

            # Bind positional arguments to parameter names, in signature order.
            nmargs = {}
            for (name, _param), arg in zip(
                list(signature_params.items()), [self] + positional
            ):
                if name != "self":
                    nmargs[name] = arg

            model = PydanticModel(**{**nmargs, **kwargs})

            # merge nmargs, kwargs, and the model fields into a single dict
            all_args = {**nmargs, **kwargs, **model.__dict__}

            # save the merged dictionary for Representable use, but only if
            # __init_args__ is not already set, in order to avoid overriding
            # a value set by a subclass initializer in super().__init__ calls
            if not getattr(self, "__init_args__", {}):
                self.__init_args__ = OrderedDict(
                    (name, arg)
                    for name, arg in sorted(all_args.items())
                    if type(arg) != torch.nn.ParameterDict
                )
                self.__class__.__getnewargs_ex__ = validated_getnewargs_ex
                self.__class__.__repr__ = validated_repr

            return init(self, **all_args)

        # attach the Pydantic model as the attribute of the initializer wrapper
        setattr(init_wrapper, "Model", PydanticModel)

        return init_wrapper

    return validator
|
||||
@@ -1,374 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import math
|
||||
import textwrap
|
||||
from functools import singledispatch
|
||||
from pydoc import locate
|
||||
from typing import Any, Optional, cast, NamedTuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pts.core import fqname_for
|
||||
|
||||
# Template for the error raised when a value cannot be encoded; the single
# ``{}`` placeholder receives the fully qualified name of the offending type.
# NOTE(review): any extra indentation of the two URL lines was lost in this
# view of the file; they are kept flush with the surrounding text here.
_bad_type_template = """
Cannot serialize type {}. See the documentation of the `encode` and
`validate` functions at

http://gluon-ts.mxnet.io/api/gluonts/gluonts.html

and the Python documentation of the `__getnewargs_ex__` magic method at

https://docs.python.org/3/library/pickle.html#object.__getnewargs_ex__

for more information how to make this type serializable.
"""
bad_type_msg = textwrap.dedent(_bad_type_template).lstrip()
|
||||
|
||||
|
||||
def dump_code(o: Any) -> str:
    """
    Serializes an object to a Python code string.

    The object is first passed through :func:`encode`; the resulting
    intermediate representation is then rendered recursively.

    Parameters
    ----------
    o
        The object to serialize.

    Returns
    -------
    str
        A string representing the object as Python code.

    Raises
    ------
    RuntimeError
        If the encoded representation contains a value that is not a dict,
        list, tuple, str, float, int, NumPy scalar or ``None``.

    See Also
    --------
    load_code
        Inverse function.
    """

    def _dump_code(x: Any) -> str:
        # r = { 'class': ..., 'args': ... }
        # r = { 'class': ..., 'kwargs': ... }
        if type(x) == dict and x.get("__kind__") == kind_inst:
            args = x.get("args", [])
            kwargs = x.get("kwargs", {})

            fqname = x["class"]
            bindings = ", ".join(
                itertools.chain(
                    map(_dump_code, args),
                    [f"{k}={_dump_code(v)}" for k, v in kwargs.items()],
                )
            )
            return f"{fqname}({bindings})"

        if type(x) == dict and x.get("__kind__") == kind_type:
            return x["class"]

        if isinstance(x, dict):
            inner = ", ".join(
                f"{_dump_code(k)}: {_dump_code(v)}" for k, v in x.items()
            )
            return f"{{{inner}}}"

        # Elements are already in encoded form, so recurse with the inner
        # renderer (as the dict branch does) instead of re-encoding them.
        if isinstance(x, list):
            inner = ", ".join(map(_dump_code, x))
            return f"[{inner}]"

        if isinstance(x, tuple):
            inner = ", ".join(map(_dump_code, x))
            # account for the extra `,` in `(x,)`
            if len(x) == 1:
                inner += ","
            return f"({inner})"

        if isinstance(x, str):
            # json.dumps escapes the string
            return json.dumps(x)

        if isinstance(x, float) or np.issubdtype(type(x), np.inexact):
            if math.isfinite(x):
                return str(x)
            # e.g. `nan` needs to be encoded as `float("nan")`; the value
            # must be interpolated, hence the f-string.
            return f'float("{x}")'

        if isinstance(x, int) or np.issubdtype(type(x), np.integer):
            return str(x)

        if x is None:
            return str(x)

        raise RuntimeError(
            f"Unexpected element type {fqname_for(x.__class__)}"
        )

    return _dump_code(encode(o))
|
||||
|
||||
# JSON Serialization/Deserialization
|
||||
# ----------------------------------
|
||||
|
||||
# The canonical way to do this is to define the `default` and `object_hook`
|
||||
# parameters to the json.dumps and json.loads methods. Unfortunately, due
|
||||
# to https://bugs.python.org/issue12657 this is not possible at the moment,
|
||||
# as support for custom NamedTuple serialization is broken.
|
||||
#
|
||||
# To circumvent the issue, we pass the input value through custom encode
|
||||
# and decode functions that map nested object terms to JSON-serializable
|
||||
# data structures with explicit recursion.
|
||||
|
||||
|
||||
|
||||
def dump_json(o: Any, indent: Optional[int] = None) -> str:
    """
    Serializes an object to a JSON string.

    Parameters
    ----------
    o
        The object to serialize.
    indent
        An optional number of spaces to use as an indent.

    Returns
    -------
    str
        A string representing the object in JSON format, with keys sorted.

    See Also
    --------
    load_json
        Inverse function.
    """
    # Map the object to its JSON-compatible intermediate form first.
    encoded = encode(o)
    return json.dumps(encoded, indent=indent, sort_keys=True)
|
||||
|
||||
|
||||
def load_json(s: str) -> Any:
    """
    Deserializes an object from a JSON string.

    Parameters
    ----------
    s
        A string representing the object in JSON format.

    Returns
    -------
    Any
        The deserialized object.

    See Also
    --------
    dump_json
        Inverse function.
    """
    # Parse the JSON text, then rebuild objects from the intermediate form.
    parsed = json.loads(s)
    return decode(parsed)
|
||||
|
||||
|
||||
# Structural encoding/decoding
|
||||
# ----------------------------
|
||||
|
||||
# Values stored under the "__kind__" key of an encoded dictionary:
# ``kind_type`` tags an encoded class, ``kind_inst`` an encoded instance.
kind_type, kind_inst = "type", "instance"
|
||||
|
||||
|
||||
@singledispatch
def encode(v: Any) -> Any:
    """
    Transforms a value `v` into a serializable intermediate representation
    (for example, named tuples are encoded as dictionaries). The intermediate
    representation is then recursively traversed and serialized either as
    Python code or as a JSON string.

    This function is decorated with :func:`~functools.singledispatch` and can
    be specialized by clients for families of types that are not supported by
    the basic implementation (explained below).

    Examples
    --------

    The conversion logic of the basic implementation is used as a fallback
    and is best explained by a series of examples.

    Lists (as lists).

    >>> encode([1, 2.0, '3'])
    [1, 2.0, '3']

    Tuples (as lists).

    >>> encode((1, 2.0, '3'))
    [1, 2.0, '3']

    Dictionaries (as dictionaries).

    >>> encode({'a': 1, 'b': 2.0, 'c': '3'})
    {'a': 1, 'b': 2.0, 'c': '3'}

    Named tuples are encoded as dictionaries with a
    ``'__kind__': 'instance'`` member, a ``'class'`` member holding the fully
    qualified class name and a ``'kwargs'`` member holding the encoded
    fields.

    Objects exposing a ``__getnewargs_ex__`` magic method (which includes
    classes whose initializer is decorated with ``validated``) are encoded
    the same way, with an additional ``'args'`` member.

    Types are encoded as dictionaries with a ``'__kind__': 'type'`` member
    and a ``'class'`` member holding the fully qualified class name.

    Parameters
    ----------
    v
        The value to be encoded.

    Returns
    -------
    Any
        An encoding of ``v`` that can be serialized to Python code or
        JSON string.

    Raises
    ------
    RuntimeError
        If ``v`` matches none of the supported shapes.

    See Also
    --------
    decode
        Inverse function.
    dump_json
        Serializes an object to a JSON string.
    dump_code
        Serializes an object to a Python code string.
    """
    if isinstance(v, type(None)):
        return None

    if isinstance(v, (float, int, str)):
        return v

    value_type = type(v)

    if np.issubdtype(value_type, np.inexact):
        return float(v)

    if np.issubdtype(value_type, np.integer):
        return int(v)

    # we have to check for namedtuples first, to encode them not as plain
    # tuples (which would become lists)
    if isinstance(v, tuple) and hasattr(v, "_asdict"):
        named = cast(NamedTuple, v)
        return {
            "__kind__": kind_inst,
            "class": fqname_for(named.__class__),
            "kwargs": encode(named._asdict()),
        }

    if isinstance(v, (list, set, tuple)):
        return [encode(element) for element in v]

    if isinstance(v, dict):
        return {key: encode(value) for key, value in v.items()}

    if isinstance(v, type):
        return {"__kind__": kind_type, "class": fqname_for(v)}

    if hasattr(v, "__getnewargs_ex__"):
        args, kwargs = v.__getnewargs_ex__()  # mypy: ignore
        return {
            "__kind__": kind_inst,
            "class": fqname_for(v.__class__),
            "args": encode(args),
            "kwargs": encode(kwargs),
        }

    raise RuntimeError(bad_type_msg.format(fqname_for(v.__class__)))
|
||||
|
||||
|
||||
def decode(r: Any) -> Any:
    """
    Decodes a value from an intermediate representation `r`.

    Parameters
    ----------
    r
        An intermediate representation to be decoded.

    Returns
    -------
    Any
        A Python data structure corresponding to the decoded version of ``r``.

    See Also
    --------
    encode
        Inverse function.
    """
    # structural recursion over the possible shapes of r
    shape = type(r)

    if shape == dict:
        kind = r.get("__kind__")

        # r = { 'class': ..., 'args': ... }
        # r = { 'class': ..., 'kwargs': ... }
        if kind == kind_inst:
            cls = cast(Any, locate(r["class"]))
            args = decode(r["args"]) if "args" in r else []
            kwargs = decode(r["kwargs"]) if "kwargs" in r else {}
            return cls(*args, **kwargs)

        # r = { 'class': ... } tagged as a type
        if kind == kind_type:
            return locate(r["class"])

        # r = { k1: v1, ..., kn: vn }
        return {key: decode(value) for key, value in r.items()}

    # r = ( y1, ..., yn )
    if shape == tuple:
        return tuple(decode(item) for item in r)

    # r = [ y1, ..., yn ]
    if shape == list:
        return [decode(item) for item in r]

    # r = { y1, ..., yn }
    if shape == set:
        return {decode(item) for item in r}

    # r = a
    return r
|
||||
@@ -1,32 +0,0 @@
|
||||
from .artificial import (
|
||||
ArtificialDataset,
|
||||
ConstantDataset,
|
||||
ComplexSeasonalTimeSeries,
|
||||
RecipeDataset,
|
||||
constant_dataset,
|
||||
default_synthetic,
|
||||
generate_sf2,
|
||||
)
|
||||
from .common import (
|
||||
DataEntry,
|
||||
FieldName,
|
||||
Dataset,
|
||||
MetaData,
|
||||
TrainDatasets,
|
||||
DateConstants,
|
||||
)
|
||||
from .file_dataset import FileDataset
|
||||
from .list_dataset import ListDataset
|
||||
from .loader import TrainDataLoader, InferenceDataLoader
|
||||
from .multivariate_grouper import MultivariateGrouper
|
||||
from .process import ProcessStartField, ProcessDataEntry
|
||||
from .stat import DatasetStatistics, ScaleHistogram, calculate_dataset_statistics
|
||||
from .transformed_iterable_dataset import TransformedIterableDataset
|
||||
from .utils import (
|
||||
to_pandas,
|
||||
load_datasets,
|
||||
save_datasets,
|
||||
serialize_data_entry,
|
||||
frequency_add,
|
||||
forecast_start,
|
||||
)
|
||||
@@ -1,834 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from typing import Callable, List, NamedTuple, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import rapidjson as json
|
||||
|
||||
from .common import (
|
||||
MetaData,
|
||||
CategoricalFeatureInfo,
|
||||
BasicFeatureInfo,
|
||||
FieldName,
|
||||
Dataset,
|
||||
TrainDatasets,
|
||||
DataEntry,
|
||||
)
|
||||
from .list_dataset import ListDataset
|
||||
from .recipe import (
|
||||
BinaryHolidays,
|
||||
BinaryMarkovChain,
|
||||
Constant,
|
||||
ForEachCat,
|
||||
Lag,
|
||||
LinearTrend,
|
||||
RandomCat,
|
||||
RandomGaussian,
|
||||
Stack,
|
||||
generate,
|
||||
take_as_list,
|
||||
)
|
||||
from .stat import DatasetStatistics, calculate_dataset_statistics
|
||||
|
||||
|
||||
class DatasetInfo(NamedTuple):
    """
    Record describing a stored dataset. When a dataset is downloaded from the
    repository, the repository checks that the version obtained matches the
    one declared in dataset_info/dataset_name.json.
    """

    # Name of the dataset.
    name: str
    # Metadata record of the dataset.
    metadata: MetaData
    # Prediction length associated with the dataset.
    prediction_length: int
    # Statistics of the training split.
    train_statistics: DatasetStatistics
    # Statistics of the test split.
    test_statistics: DatasetStatistics
|
||||
|
||||
|
||||
class ArtificialDataset:
    """
    Base class for datasets that are produced by code rather than loaded.

    The ``metadata``, ``train`` and ``test`` properties are placeholders here
    (they return ``None``); ``generate`` packages whatever they return into a
    ``TrainDatasets`` value.
    """

    def __init__(self, freq) -> None:
        """Store the frequency passed to the generated list datasets."""
        self.freq = freq

    @property
    def metadata(self) -> MetaData:
        """Metadata of the dataset; placeholder returning ``None``."""
        return None

    @property
    def train(self) -> List[DataEntry]:
        """Training entries; placeholder returning ``None``."""
        return None

    @property
    def test(self) -> List[DataEntry]:
        """Test entries; placeholder returning ``None``."""
        return None

    # todo return the same type as dataset repo for better usability
    def generate(self) -> TrainDatasets:
        """Bundle metadata, train and test data into a ``TrainDatasets``."""
        meta = self.metadata
        train_dataset = ListDataset(self.train, self.freq)
        test_dataset = ListDataset(self.test, self.freq)
        return TrainDatasets(
            metadata=meta, train=train_dataset, test=test_dataset,
        )
|
||||
|
||||
|
||||
class ConstantDataset(ArtificialDataset):
|
||||
def __init__(
|
||||
self,
|
||||
num_timeseries: int = 10,
|
||||
num_steps: int = 30,
|
||||
freq: str = "1H",
|
||||
start: str = "2000-01-01 00:00:00",
|
||||
is_nan: bool = False, # Generates constant dataset of 0s with explicit NaN missing values
|
||||
is_random_constant: bool = False, # Inserts random constant value for each time series
|
||||
is_different_scales: bool = False, # Generates constants on various scales
|
||||
is_piecewise: bool = False, # Determines whether the time series in the test
|
||||
# and train set should have different constant values
|
||||
is_noise: bool = False, # Determines whether to add Gaussian noise to the constant dataset
|
||||
is_long: bool = False, # Determines whether some time series will have very long lengths
|
||||
is_short: bool = False, # Determines whether some time series will have very short lengths
|
||||
is_trend: bool = False, # Determines whether to add linear trends
|
||||
num_missing_middle: int = 0, # Number of missing values in the middle of the time series
|
||||
is_promotions: bool = False, # Determines whether to add promotions to the target time series
|
||||
# and to store in metadata
|
||||
holidays: Optional[
|
||||
List[pd.Timestamp]
|
||||
] = None, # Determines whether to add holidays to the target time series
|
||||
# and to store in metadata
|
||||
) -> None:
|
||||
super(ConstantDataset, self).__init__(freq)
|
||||
self.num_timeseries = num_timeseries
|
||||
self.num_steps = num_steps
|
||||
self.num_training_steps = self.num_steps // 10 * 8
|
||||
self.prediction_length = self.num_steps - self.num_training_steps
|
||||
self.start = start
|
||||
self.is_nan = is_nan
|
||||
self.is_random_constant = is_random_constant
|
||||
self.is_different_scales = is_different_scales
|
||||
self.is_piecewise = is_piecewise
|
||||
self.is_noise = is_noise
|
||||
self.is_long = is_long
|
||||
self.is_short = is_short
|
||||
self.is_trend = is_trend
|
||||
self.num_missing_middle = num_missing_middle
|
||||
self.is_promotions = is_promotions
|
||||
self.holidays = holidays
|
||||
|
||||
@property
|
||||
def metadata(self) -> MetaData:
|
||||
metadata = MetaData(
|
||||
freq=self.freq,
|
||||
feat_static_cat=[
|
||||
{
|
||||
"name": "feat_static_cat_000",
|
||||
"cardinality": str(self.num_timeseries),
|
||||
}
|
||||
],
|
||||
feat_static_real=[{"name": "feat_static_real_000"}],
|
||||
prediction_length=self.prediction_length,
|
||||
)
|
||||
if self.is_promotions or self.holidays:
|
||||
metadata = MetaData(
|
||||
freq=self.freq,
|
||||
feat_static_cat=[
|
||||
{
|
||||
"name": "feat_static_cat_000",
|
||||
"cardinality": str(self.num_timeseries),
|
||||
}
|
||||
],
|
||||
feat_static_real=[{"name": "feat_static_real_000"}],
|
||||
feat_dynamic_real=[BasicFeatureInfo(name=FieldName.FEAT_DYNAMIC_REAL)],
|
||||
prediction_length=self.prediction_length,
|
||||
)
|
||||
return metadata
|
||||
|
||||
def determine_constant(
|
||||
self, index: int, constant: Optional[float] = None, seed: int = 1
|
||||
) -> Optional[float]:
|
||||
if self.is_random_constant:
|
||||
my_random = random.Random(seed)
|
||||
constant = (index + 1) * my_random.random()
|
||||
elif self.is_different_scales:
|
||||
if index == 0:
|
||||
constant = 1e-8
|
||||
elif constant is not None:
|
||||
constant *= 100
|
||||
else:
|
||||
constant = float(index)
|
||||
return constant
|
||||
|
||||
def compute_data_from_recipe(
|
||||
self,
|
||||
num_steps: int,
|
||||
constant: Optional[float] = None,
|
||||
one_to_zero: float = 0.1,
|
||||
zero_to_one: float = 0.1,
|
||||
scale_features: float = 200,
|
||||
) -> TrainDatasets:
|
||||
recipe = []
|
||||
recipe_type = Constant(constant)
|
||||
if self.is_noise:
|
||||
recipe_type += RandomGaussian() # Use default stddev = 1.0
|
||||
if self.is_trend:
|
||||
recipe_type += LinearTrend()
|
||||
if self.is_promotions:
|
||||
recipe.append(
|
||||
("binary_causal", BinaryMarkovChain(one_to_zero, zero_to_one))
|
||||
)
|
||||
recipe.append((FieldName.FEAT_DYNAMIC_REAL, Stack(["binary_causal"])))
|
||||
recipe_type += scale_features * Lag("binary_causal", lag=0)
|
||||
if self.holidays:
|
||||
timestamp = self.init_date()
|
||||
# Compute dates array
|
||||
dates = []
|
||||
for i in range(num_steps):
|
||||
dates.append(timestamp)
|
||||
timestamp += 1
|
||||
recipe.append(("binary_holidays", BinaryHolidays(dates, self.holidays)))
|
||||
recipe.append((FieldName.FEAT_DYNAMIC_REAL, Stack(["binary_holidays"])))
|
||||
recipe_type += scale_features * Lag("binary_holidays", lag=0)
|
||||
recipe.append((FieldName.TARGET, recipe_type))
|
||||
max_train_length = num_steps - self.prediction_length
|
||||
data = RecipeDataset(
|
||||
recipe=recipe,
|
||||
metadata=self.metadata,
|
||||
max_train_length=max_train_length,
|
||||
prediction_length=self.prediction_length,
|
||||
num_timeseries=1, # Add 1 time series at a time in the loop for different constant valus per time series
|
||||
)
|
||||
generated = data.generate()
|
||||
return generated
|
||||
|
||||
def piecewise_constant(self, index: int, num_steps: int) -> List:
|
||||
target = []
|
||||
for j in range(num_steps):
|
||||
if j < self.num_training_steps:
|
||||
constant = self.determine_constant(index=index)
|
||||
else:
|
||||
constant = self.determine_constant(index=index, seed=2)
|
||||
target.append(constant)
|
||||
return target
|
||||
|
||||
def get_num_steps(
|
||||
self,
|
||||
index: int,
|
||||
num_steps_max: int = 10000,
|
||||
long_freq: int = 4,
|
||||
num_steps_min: int = 2,
|
||||
short_freq: int = 4,
|
||||
) -> int:
|
||||
num_steps = self.num_steps
|
||||
if self.is_long and index % long_freq == 0:
|
||||
num_steps = num_steps_max
|
||||
elif self.is_short and index % short_freq == 0:
|
||||
num_steps = num_steps_min
|
||||
return num_steps
|
||||
|
||||
def init_date(self) -> pd.Timestamp:
|
||||
week_dict = {
|
||||
0: "MON",
|
||||
1: "TUE",
|
||||
2: "WED",
|
||||
3: "THU",
|
||||
4: "FRI",
|
||||
5: "SAT",
|
||||
6: "SUN",
|
||||
}
|
||||
timestamp = pd.Timestamp(self.start)
|
||||
freq_week_start = self.freq
|
||||
if freq_week_start == "W":
|
||||
freq_week_start = f"W-{week_dict[timestamp.weekday()]}"
|
||||
return pd.Timestamp(self.start, freq=freq_week_start)
|
||||
|
||||
@staticmethod
|
||||
def insert_nans_and_zeros(ts_len: int) -> List:
|
||||
target = []
|
||||
for j in range(ts_len):
|
||||
# Place NaNs at even indices. Use convention no NaNs before start date.
|
||||
if j != 0 and j % 2 == 0:
|
||||
target.append(np.nan)
|
||||
# Place zeros at odd indices
|
||||
else:
|
||||
target.append(0.0)
|
||||
return target
|
||||
|
||||
def insert_missing_vals_middle(
|
||||
self, ts_len: int, constant: Optional[float]
|
||||
) -> List:
|
||||
target = []
|
||||
lower_bound = (self.num_training_steps - self.num_missing_middle) // 2
|
||||
upper_bound = (self.num_training_steps + self.num_missing_middle) // 2
|
||||
num_missing_endpts = math.floor(0.1 * self.num_missing_middle)
|
||||
for j in range(ts_len):
|
||||
if (
|
||||
(0 < j < lower_bound and j % (2 * num_missing_endpts) == 0)
|
||||
or (lower_bound <= j < upper_bound)
|
||||
or (j >= upper_bound and j % (2 * num_missing_endpts) == 0)
|
||||
):
|
||||
val = np.nan
|
||||
else:
|
||||
val = constant
|
||||
target.append(val)
|
||||
return target
|
||||
|
||||
def generate_ts(self, num_ts_steps: int, is_train: bool = False) -> List[DataEntry]:
|
||||
res = []
|
||||
constant = None
|
||||
for i in range(self.num_timeseries):
|
||||
if self.is_nan:
|
||||
target = self.insert_nans_and_zeros(num_ts_steps)
|
||||
elif self.is_piecewise:
|
||||
target = self.piecewise_constant(i, num_ts_steps)
|
||||
else:
|
||||
constant = self.determine_constant(i, constant)
|
||||
if self.num_missing_middle > 0:
|
||||
target = self.insert_missing_vals_middle(num_ts_steps, constant)
|
||||
elif (
|
||||
self.is_noise
|
||||
or self.is_trend
|
||||
or self.is_promotions
|
||||
or self.holidays
|
||||
):
|
||||
|
||||
num_steps = self.get_num_steps(i)
|
||||
generated = self.compute_data_from_recipe(num_steps, constant)
|
||||
if is_train:
|
||||
time_series = generated.train
|
||||
else:
|
||||
assert generated.test is not None
|
||||
time_series = generated.test
|
||||
# returns np array convert to list for consistency
|
||||
target = list(time_series)[0][FieldName.TARGET].tolist()
|
||||
else:
|
||||
target = [constant] * num_ts_steps
|
||||
ts_data = dict(
|
||||
start=self.start,
|
||||
target=target,
|
||||
item_id=str(i),
|
||||
feat_static_cat=[i],
|
||||
feat_static_real=[i],
|
||||
)
|
||||
if self.is_promotions or self.holidays:
|
||||
ts_data[FieldName.FEAT_DYNAMIC_REAL] = list(time_series)[0][
|
||||
FieldName.FEAT_DYNAMIC_REAL
|
||||
].tolist()
|
||||
res.append(ts_data)
|
||||
return res
|
||||
|
||||
@property
def train(self) -> List[DataEntry]:
    """Training entries: series generated over ``num_training_steps`` steps."""
    training_steps = self.num_training_steps
    return self.generate_ts(num_ts_steps=training_steps, is_train=True)
|
||||
|
||||
@property
def test(self) -> List[DataEntry]:
    """Test entries: series generated over the full ``num_steps`` steps."""
    total_steps = self.num_steps
    return self.generate_ts(num_ts_steps=total_steps)
|
||||
|
||||
|
||||
class ComplexSeasonalTimeSeries(ArtificialDataset):
    """
    Generate sinusoidal time series that ramp up to a certain amplitude and
    level, with an additional spike on "special" time points.

    Which time points are special depends on the frequency (see
    ``_special_time_point_indicator``). For daily data they are the points
    with ``dayofweek == 0``; pandas numbers Monday as 0, so the spike lands on
    Mondays even though the code calls it ``sunday_effect``.

    TODO: This could be converted to a RecipeDataset to avoid code duplication.
    """

    def __init__(
        self,
        num_series: int = 100,
        prediction_length: int = 20,
        freq_str: str = "D",
        length_low: int = 30,
        length_high: int = 200,
        min_val: float = -10000,
        max_val: float = 10000,
        is_integer: bool = False,
        proportion_missing_values: float = 0,
        is_noise: bool = True,
        is_scale: bool = True,
        percentage_unique_timestamps: float = 0.07,
        is_out_of_bounds_date: bool = False,
        seasonality: Optional[int] = None,
        clip_values: bool = False,
    ) -> None:
        """
        :param num_series: number of time series generated in the train and
            test set
        :param prediction_length: number of trailing points removed from each
            series to form the training set
        :param freq_str: frequency string; the built-in defaults cover
            "M", "W", "D", "H" and "min"
        :param length_low: minimum length of a time-series, must be larger than
            prediction_length
        :param length_high: upper bound (exclusive) of the length of a
            time-series
        :param min_val: min value of a time-series
        :param max_val: max value of a time-series
        :param is_integer: whether the dataset has integers or not
        :param proportion_missing_values: fraction of points replaced by a
            missing-value marker, must be below 1.0
        :param is_noise: whether to add noise
        :param is_scale: whether to add scale
        :param percentage_unique_timestamps: percentage of random start dates
            bounded between 0 and 1
        :param is_out_of_bounds_date: determines whether to use very old start
            dates and start dates far in the future
        :param seasonality: Seasonality of the generated data. If not given
            uses default seasonality for frequency
        :param clip_values: if True the values will be clipped to
            [min_val, max_val], otherwise linearly scales them
        """
        assert length_low > prediction_length
        super().__init__(freq_str)
        self.num_series = num_series
        self.prediction_length = prediction_length
        self.length_low = length_low
        self.length_high = length_high
        self.freq_str = freq_str
        self.min_val = min_val
        self.max_val = max_val
        self.is_integer = is_integer
        self.proportion_missing_values = proportion_missing_values
        self.is_noise = is_noise
        self.is_scale = is_scale
        self.percentage_unique_timestamps = percentage_unique_timestamps
        self.is_out_of_bounds_date = is_out_of_bounds_date
        self.seasonality = seasonality
        self.clip_values = clip_values

    @property
    def metadata(self) -> MetaData:
        return MetaData(freq=self.freq, prediction_length=self.prediction_length)

    def _get_period(self) -> int:
        """
        Return the period (in time steps) of the sine wave.

        An explicit ``seasonality`` wins; otherwise a default per frequency
        is used.

        :raises RuntimeError: if no seasonality was given and the frequency
            has no default period
        """
        if self.seasonality is not None:
            return self.seasonality
        default_periods = {"M": 24, "W": 52, "D": 14, "H": 24, "min": 60}
        if self.freq_str not in default_periods:
            raise RuntimeError(
                f'No default seasonality for freq_str "{self.freq_str}"'
            )
        return default_periods[self.freq_str]

    def _get_start(self, index: int, my_random: random.Random) -> str:
        """
        Return the start date of series ``index`` formatted for ``freq_str``.

        With ``is_out_of_bounds_date`` the first series starts in 1690 and
        the last one in 2030; otherwise a random date is drawn with
        probability ``percentage_unique_timestamps`` and a fixed default date
        is used the rest of the time.
        """
        if self.is_out_of_bounds_date and index == 0:
            # Edge case: a start date far in the past. pandas timestamps
            # cannot go much further back than this (Timestamp.min is in 1677).
            start_y, start_m, start_d = 1690, 2, 7
            start_h, start_min = 18, 36
        elif self.is_out_of_bounds_date and index == self.num_series - 1:
            # Edge case: a start date in the future.
            start_y, start_m, start_d = 2030, 6, 3
            start_h, start_min = 18, 36
        # assume that only 100 * percentage_unique_timestamps of timestamps are unique
        elif my_random.random() < self.percentage_unique_timestamps:
            start_y = my_random.randint(2000, 2018)
            start_m = my_random.randint(1, 12)
            start_d = my_random.randint(1, 28)
            start_h = my_random.randint(0, 23)
            start_min = my_random.randint(0, 59)
        else:
            start_y, start_m, start_d = 2013, 11, 28
            start_h, start_min = 18, 36

        if self.freq_str == "M":
            return "%04.d-%02.d" % (start_y, start_m)
        elif self.freq_str in ["W", "D"]:
            return "%04.d-%02.d-%02.d" % (start_y, start_m, start_d)
        elif self.freq_str == "H":
            return "%04.d-%02.d-%02.d %02.d:00:00" % (
                start_y,
                start_m,
                start_d,
                start_h,
            )
        else:
            return "%04.d-%02.d-%02.d %02.d:%02.d:00" % (
                start_y,
                start_m,
                start_d,
                start_h,
                start_min,
            )

    def _special_time_point_indicator(self, index) -> bool:
        """
        Flag the time points that receive the extra spike.

        ``make_timeseries`` passes the result of ``pd.date_range``, in which
        case the comparison yields one boolean per time point rather than a
        single ``bool``.

        :raises RuntimeError: if ``freq_str`` is not a supported frequency
        """
        if self.freq_str == "M":
            return index.month == 1
        elif self.freq_str == "W":
            return index.month % 2 == 0
        elif self.freq_str == "D":
            # pandas: Monday is 0
            return index.dayofweek == 0
        elif self.freq_str == "H":
            return index.hour == 0
        elif self.freq_str == "min":
            return index.minute % 30 == 0
        else:
            # Report the offending frequency (the message used to format the
            # date index instead).
            raise RuntimeError(f'Bad freq_str value "{self.freq_str}"')

    @property
    def train(self) -> List[DataEntry]:
        """Test series with the last ``prediction_length`` points removed."""
        return [
            dict(
                start=ts[FieldName.START],
                target=ts[FieldName.TARGET][: -self.prediction_length],
                item_id=ts[FieldName.ITEM_ID],
            )
            for ts in self.make_timeseries()
        ]

    @property
    def test(self) -> List[DataEntry]:
        """Full-length series as produced by ``make_timeseries``."""
        return self.make_timeseries()

    def make_timeseries(self, seed: int = 1) -> List[DataEntry]:
        """
        Generate ``num_series`` series deterministically from ``seed``.

        The seed is fixed so that two independent calls (one for ``train``,
        one for ``test``) produce the same series and start dates.

        :param seed: seed for both the ``random`` and the numpy generator
        :return: entries with ``start``, ``target`` (numpy array) and
            ``item_id``
        """
        res = []

        def sigmoid(x: np.ndarray) -> np.ndarray:
            return 1.0 / (1.0 + np.exp(-x))

        # Ensure same start dates in test and training set
        my_random = random.Random(seed)
        state = np.random.RandomState(seed)
        val_range = self.max_val - self.min_val
        for i in range(self.num_series):
            # NOTE: the order of the random draws below determines the
            # generated data; do not reorder them.
            length = state.randint(low=self.length_low, high=self.length_high)
            start = self._get_start(i, my_random)
            t = np.arange(length)
            # Smooth ramp-up from ~0 to 1 over the first few dozen points.
            envelope = sigmoid((t - 20.0) / 10.0)
            level = 0.3 * val_range * (state.random_sample() - 0.5)
            phi = 2 * np.pi * state.random_sample()
            period = self._get_period()
            w = 2 * np.pi / period
            timestamps = pd.date_range(
                start=start, freq=self.freq_str, periods=length
            )
            special_tp_indicator = self._special_time_point_indicator(timestamps)
            sunday_effect = state.random_sample() * special_tp_indicator
            v = np.sin(w * t + phi) + sunday_effect

            if self.is_scale:
                scale = 0.1 * val_range * state.random_sample()
                v *= scale
            v += level
            if self.is_noise:
                noise_range = 0.02 * val_range * state.random_sample()
                noise = noise_range * state.normal(size=length)
                v += noise
            v = envelope * v
            if self.clip_values:
                np.clip(v, a_min=self.min_val, a_max=self.max_val, out=v)
            else:
                # Rather than mapping [v_min, v_max] to
                # [self.min_val, self.max_val], which would lead to all the
                # time series having the same min and max, we want to keep the
                # same interval length (v_max - v_min). We thus shift the
                # interval [v_min, v_max] into [self.min_val, self.max_val]
                # and clip it if needed.
                v_min, v_max = v.min(), v.max()
                p_min, p_max = (
                    max(self.min_val, v_min),
                    min(self.max_val, v_max),
                )
                shifted_min = np.clip(
                    p_min + (p_max - v_max), a_min=self.min_val, a_max=self.max_val,
                )
                shifted_max = np.clip(
                    p_max + (p_min - v_min), a_min=self.min_val, a_max=self.max_val,
                )
                v = shifted_min + (shifted_max - shifted_min) * (v - v_min) / (
                    v_max - v_min
                )

            if self.is_integer:
                np.clip(
                    v, a_min=np.ceil(self.min_val), a_max=np.floor(self.max_val), out=v,
                )
                v = np.round(v).astype(int)
            v = list(v.tolist())
            if self.proportion_missing_values > 0:
                assert (
                    self.proportion_missing_values < 1.0
                ), "Please chose a number 0 < x < 1.0"
                positions = np.arange(len(v))
                state.shuffle(positions)
                num_missing_values = (
                    int(len(v) * self.proportion_missing_values) + 1
                )  # Add one in case this gets zero
                missing_idx = positions[:num_missing_values]
                for j in missing_idx:
                    # Using convention that there are no missing values before the start date.
                    if j != 0:
                        # Two different encodings of "missing" are mixed on purpose.
                        v[j] = None if state.rand() < 0.5 else "NaN"
            res.append(
                dict(
                    start=pd.Timestamp(start, freq=self.freq_str),
                    target=np.array(v),
                    item_id=i,
                )
            )
        return res
|
||||
|
||||
|
||||
class RecipeDataset(ArtificialDataset):
    """Synthetic data set generated by providing a recipe.

    A recipe is either a (non-deterministic) function

        f(length: int, global_state: dict) -> dict

    or list of (field, function) tuples of the form

        (field: str, f(data: dict, length: int, global_state: dict) -> dict)

    which is processed sequentially, with data initially set to {},
    and each entry updating data[field] to the output of the function
    call.
    """

    def __init__(
        self,
        recipe: Union[Callable, List[Tuple[str, Callable]]],
        metadata: MetaData,
        max_train_length: int,
        prediction_length: int,
        num_timeseries: int,
        trim_length_fun=lambda x, **kwargs: 0,
        data_start=pd.Timestamp("2014-01-01"),
    ) -> None:
        """
        :param recipe: The recipe to generate from (see class docstring)
        :param metadata: The metadata to be included in the dataset
        :param max_train_length: The maximum length of a training time series.
        :param prediction_length: The length of the prediction range
        :param num_timeseries: Number of time series to generate
        :param trim_length_fun: Callable invoked as
            ``f(entry, train_length=max_train_length)`` returning how many
            points to drop from the front of ``entry``; the default drops none
        :param data_start: Start date for the data set
        """
        super().__init__(freq=metadata.freq)

        self.recipe = recipe
        self._metadata = metadata
        self.max_train_length = max_train_length
        self.prediction_length = prediction_length
        self.trim_length_fun = trim_length_fun
        self.num_timeseries = num_timeseries
        self.data_start = pd.Timestamp(data_start, freq=self._metadata.freq)

    @property
    def metadata(self) -> MetaData:
        return self._metadata

    def dataset_info(self, train_ds: Dataset, test_ds: Dataset) -> DatasetInfo:
        """Bundle the metadata with statistics computed on both datasets."""
        return DatasetInfo(
            name=f"RecipeDataset({repr(self.recipe)})",
            metadata=self.metadata,
            prediction_length=self.prediction_length,
            train_statistics=calculate_dataset_statistics(train_ds),
            test_statistics=calculate_dataset_statistics(test_ds),
        )

    @staticmethod
    def trim_ts_item_end(x: DataEntry, length: int) -> DataEntry:
        """Return a copy of ``x`` without its last ``length`` time points.

        The target and the dynamic features (if present) are trimmed; only
        ``item_id``, ``start``, ``target`` and the dynamic features are
        carried over. A ``length`` of 0 leaves the series untouched.
        """
        # ``seq[:-0]`` equals ``seq[:0]`` (empty), so a zero-length trim must
        # use an open-ended slice instead.
        stop = -length if length else None
        y = dict(
            item_id=x[FieldName.ITEM_ID],
            start=x[FieldName.START],
            target=x[FieldName.TARGET][:stop],
        )

        # Dynamic features are indexed (feature, time): trim the time axis.
        if FieldName.FEAT_DYNAMIC_CAT in x:
            y[FieldName.FEAT_DYNAMIC_CAT] = x[FieldName.FEAT_DYNAMIC_CAT][:, :stop]
        if FieldName.FEAT_DYNAMIC_REAL in x:
            y[FieldName.FEAT_DYNAMIC_REAL] = x[FieldName.FEAT_DYNAMIC_REAL][:, :stop]
        return y

    @staticmethod
    def trim_ts_item_front(x: DataEntry, length: int) -> DataEntry:
        """Return a copy of ``x`` without its first ``length`` time points.

        The start timestamp is moved forward by ``length`` periods of its own
        frequency, and the target and dynamic features (if present) are
        trimmed accordingly.
        """
        assert length <= len(x[FieldName.TARGET])

        y = dict(
            item_id=x[FieldName.ITEM_ID],
            start=x[FieldName.START] + length * x[FieldName.START].freq,
            target=x[FieldName.TARGET][length:],
        )

        if FieldName.FEAT_DYNAMIC_CAT in x:
            y[FieldName.FEAT_DYNAMIC_CAT] = x[FieldName.FEAT_DYNAMIC_CAT][:, length:]
        if FieldName.FEAT_DYNAMIC_REAL in x:
            y[FieldName.FEAT_DYNAMIC_REAL] = x[FieldName.FEAT_DYNAMIC_REAL][:, length:]
        return y

    def generate(self) -> TrainDatasets:
        """
        Evaluate the recipe and split the result into train and test sets.

        Each series is generated at full length
        (``max_train_length + prediction_length``), optionally shortened at
        the front via ``trim_length_fun`` to form the test set, and then
        shortened by ``prediction_length`` at the end to form the train set.
        """
        metadata = self.metadata
        # ``generate`` here is the module-level recipe generator, not this method.
        data_it = generate(
            length=self.max_train_length + self.prediction_length,
            recipe=self.recipe,
            start=self.data_start,
        )
        full_length_data = take_as_list(data_it, self.num_timeseries)

        test_data = [
            RecipeDataset.trim_ts_item_front(
                x, self.trim_length_fun(x, train_length=self.max_train_length)
            )
            for x in full_length_data
        ]
        train_data = [
            RecipeDataset.trim_ts_item_end(x, self.prediction_length) for x in test_data
        ]
        return TrainDatasets(
            metadata=metadata,
            train=ListDataset(train_data, metadata.freq),
            test=ListDataset(test_data, metadata.freq),
        )
|
||||
|
||||
|
||||
def default_synthetic() -> Tuple[DatasetInfo, Dataset, Dataset]:
    """
    Build a small recipe-based dataset of 10 daily series.

    The target is a linear trend plus Gaussian noise; a static categorical
    feature and a static real feature derived from it are generated as well.
    Returns the dataset info together with the train and test datasets.
    """
    # Recipe components, constructed in the same order as they are applied.
    target_recipe = LinearTrend() + RandomGaussian()
    static_cat_recipe = RandomCat([10])
    static_real_recipe = ForEachCat(
        RandomGaussian(1, (10,)), FieldName.FEAT_STATIC_CAT
    ) + RandomGaussian(0.1, (10,))
    recipe = [
        (FieldName.TARGET, target_recipe),
        (FieldName.FEAT_STATIC_CAT, static_cat_recipe),
        (FieldName.FEAT_STATIC_REAL, static_real_recipe),
    ]

    metadata = MetaData(
        freq="D",
        feat_static_real=[BasicFeatureInfo(name=FieldName.FEAT_STATIC_REAL)],
        feat_static_cat=[
            CategoricalFeatureInfo(name=FieldName.FEAT_STATIC_CAT, cardinality=10)
        ],
        feat_dynamic_real=[BasicFeatureInfo(name=FieldName.FEAT_DYNAMIC_REAL)],
    )

    def random_front_trim(x, **kwargs):
        # Geometrically distributed trim length, capped at the train length.
        return np.minimum(
            int(np.random.geometric(1 / (kwargs["train_length"] / 2))),
            kwargs["train_length"],
        )

    data = RecipeDataset(
        recipe=recipe,
        metadata=metadata,
        max_train_length=20,
        prediction_length=10,
        num_timeseries=10,
        trim_length_fun=random_front_trim,
    )

    generated = data.generate()
    train_ds, test_ds = generated.train, generated.test
    assert test_ds is not None
    info = data.dataset_info(train_ds, test_ds)

    return info, train_ds, test_ds
|
||||
|
||||
|
||||
def constant_dataset() -> Tuple[DatasetInfo, Dataset, Dataset]:
    """
    Build 10 hourly series where series ``i`` is constantly ``float(i)``.

    Training series have 24 points and test series 30, i.e. the test set
    extends each training series by the prediction length of 6.
    """
    metadata = MetaData(
        freq="1H",
        feat_static_cat=[
            CategoricalFeatureInfo(name="feat_static_cat_000", cardinality="10")
        ],
        feat_static_real=[BasicFeatureInfo(name="feat_static_real_000")],
    )

    start_date = "2000-01-01 00:00:00"

    def constant_entries(length):
        # One entry per series; the series index doubles as value and feature.
        entries = []
        for i in range(10):
            entries.append(
                {
                    FieldName.ITEM_ID: str(i),
                    FieldName.START: start_date,
                    FieldName.TARGET: [float(i)] * length,
                    FieldName.FEAT_STATIC_CAT: [i],
                    FieldName.FEAT_STATIC_REAL: [float(i)],
                }
            )
        return entries

    train_ds = ListDataset(data_iter=constant_entries(24), freq=metadata.freq)
    test_ds = ListDataset(data_iter=constant_entries(30), freq=metadata.freq)

    info = DatasetInfo(
        name="constant_dataset",
        metadata=metadata,
        prediction_length=6,
        train_statistics=calculate_dataset_statistics(train_ds),
        test_statistics=calculate_dataset_statistics(test_ds),
    )

    return info, train_ds, test_ds
|
||||
|
||||
|
||||
def generate_sf2(
    filename: str, time_series: List, is_missing: bool, num_missing: int
) -> None:
    """
    Write ``time_series`` to ``filename`` as JSON Lines (one entry per line).

    This function generates the test and train json files which will be
    converted to csv format.

    Note that the entries of ``time_series`` are modified in place.

    :param filename: output path; missing parent directories are created.
        Dynamic features are chopped to the target length when the path
        contains "train".
    :param time_series: list of data entries (dicts) to serialise
    :param is_missing: if True, every ``num_missing``-th target value (except
        the first) is replaced by ``None`` and the static features are dropped
    :param num_missing: spacing of the missing values; must be non-zero when
        ``is_missing`` is set
    """
    # A bare file name has no directory part, and ``os.makedirs("")`` would
    # raise; ``exist_ok`` also avoids the exists-then-create race.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, "w") as json_file:
        for ts in time_series:
            if is_missing:
                # For Forecast don't output feat_static_cat and feat_static_real
                ts[FieldName.TARGET] = [
                    None if j != 0 and j % num_missing == 0 else val
                    for j, val in enumerate(ts[FieldName.TARGET])
                ]
                ts.pop(FieldName.FEAT_STATIC_CAT, None)
                ts.pop(FieldName.FEAT_STATIC_REAL, None)
            # Chop features in training set
            if FieldName.FEAT_DYNAMIC_REAL in ts and "train" in filename:
                # TODO: Fix for missing values
                target_length = len(ts[FieldName.TARGET])
                dynamic_feats = ts[FieldName.FEAT_DYNAMIC_REAL]
                for i, feat_dynamic_real in enumerate(dynamic_feats):
                    dynamic_feats[i] = feat_dynamic_real[:target_length]
            json.dump(ts, json_file)
            json_file.write("\n")
|
||||
@@ -1,95 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from typing import Any, Dict, Iterable, NamedTuple, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
# A single record flowing through the transformations: field name -> value.
DataEntry = Dict[str, Any]

# A Dataset is any iterable of DataEntry records.
Dataset = Iterable[DataEntry]
|
||||
|
||||
|
||||
class SourceContext(NamedTuple):
    """Where a data entry came from: a source identifier and a row number."""

    source: str
    row: int
|
||||
|
||||
|
||||
class FieldName:
    """
    Default names of the fields of a data entry, for clients to use when
    instantiating transformer instances.
    """

    # Identification and the series itself
    ITEM_ID = "item_id"
    START = "start"
    TARGET = "target"

    # Static / dynamic, categorical / real-valued features
    FEAT_STATIC_CAT = "feat_static_cat"
    FEAT_STATIC_REAL = "feat_static_real"
    FEAT_DYNAMIC_CAT = "feat_dynamic_cat"
    FEAT_DYNAMIC_REAL = "feat_dynamic_real"

    # presumably produced by transformations rather than supplied as input;
    # verify against the transform module
    FEAT_TIME = "time_feat"
    FEAT_CONST = "feat_dynamic_const"
    FEAT_AGE = "feat_dynamic_age"

    OBSERVED_VALUES = "observed_values"
    IS_PAD = "is_pad"
    FORECAST_START = "forecast_start"
|
||||
|
||||
|
||||
class CategoricalFeatureInfo(BaseModel):
    """Description of a categorical feature: its name and its cardinality."""

    name: str
    # NOTE: the cardinality is declared as a string, not an integer.
    cardinality: str
|
||||
|
||||
|
||||
class BasicFeatureInfo(BaseModel):
    """Description of a feature that is identified by its name only."""

    name: str
|
||||
|
||||
|
||||
class MetaData(BaseModel):
    """
    Metadata of a dataset: its frequency, the features it provides and,
    optionally, the prediction length.
    """

    # The default is None, so the field has to be declared optional
    # (it was annotated as plain ``str``).
    freq: Optional[str] = None
    target: Optional[BasicFeatureInfo] = None

    feat_static_cat: List[CategoricalFeatureInfo] = []
    feat_static_real: List[BasicFeatureInfo] = []
    feat_dynamic_real: List[BasicFeatureInfo] = []
    feat_dynamic_cat: List[CategoricalFeatureInfo] = []

    prediction_length: Optional[int] = None
|
||||
|
||||
|
||||
class TrainDatasets(NamedTuple):
    """
    Metadata plus two subsets of a dataset: ``train`` for training purposes
    and an optional ``test`` for testing purposes.
    """

    metadata: MetaData
    train: Dataset
    test: Optional[Dataset] = None
|
||||
|
||||
|
||||
class DateConstants:
    """
    Default constants for specific dates.
    """

    # Both bounds are at noon on January 1st of the given year.
    OLDEST_SUPPORTED_TIMESTAMP = pd.Timestamp(year=1800, month=1, day=1, hour=12)
    LATEST_SUPPORTED_TIMESTAMP = pd.Timestamp(year=2200, month=1, day=1, hour=12)
|
||||
@@ -1,133 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import functools
|
||||
import glob
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List
|
||||
from typing import NamedTuple
|
||||
|
||||
import rapidjson as json
|
||||
|
||||
from .common import Dataset, DataEntry, SourceContext
|
||||
from .process import ProcessDataEntry
|
||||
|
||||
|
||||
def load(file_obj):
    """Lazily decode ``file_obj`` line by line, yielding one parsed JSON value per line."""
    yield from map(json.loads, file_obj)
|
||||
|
||||
|
||||
class Span(NamedTuple):
    """Location of a line: the file it belongs to and its line number."""

    path: Path
    line: int
|
||||
|
||||
|
||||
class Line(NamedTuple):
    """A decoded line together with the span it was read from."""

    content: object
    span: Span
|
||||
|
||||
|
||||
class JsonLinesFile:
    """
    An iterable type that draws from a JSON Lines file.

    Parameters
    ----------
    path
        Path of the file to load data from. This should be a valid
        JSON Lines file.
    shuffle
        Whether to yield the lines of the file in random order.
    """

    def __init__(self, path: Path, shuffle: bool = True) -> None:
        self.path = path
        self.shuffle = shuffle

    def __iter__(self):
        """
        Yield one ``Line`` per line of the file.

        Raises
        ------
        Exception
            If a line cannot be decoded as JSON; the decoder error is
            attached as the cause.
        """
        # JSON Lines records are separated by "\n" only. Number the lines
        # before shuffling so that every span reports the position in the
        # file rather than the position in the shuffled order.
        with open(self.path) as jsonl_file:
            numbered = [
                (line_number, raw.rstrip("\n"))
                for line_number, raw in enumerate(jsonl_file, start=1)
            ]
        if self.shuffle:
            random.shuffle(numbered)

        for line_number, raw in numbered:
            span = Span(path=self.path, line=line_number)
            # Only the decoding is guarded; the yield stays outside the try
            # so that errors from the consumer are never relabelled.
            try:
                content = json.loads(raw)
            except ValueError as err:
                raise Exception(
                    f"Could not read json line {line_number}, {raw}"
                ) from err
            yield Line(content, span=span)

    def __len__(self):
        """Number of lines in the file, matching what ``__iter__`` yields."""
        # Counting lines (instead of "\n" characters) also counts a last
        # line that is not terminated by a newline.
        with open(self.path) as file_obj:
            return sum(1 for _ in file_obj)
|
||||
|
||||
|
||||
class FileDataset(Dataset):
    """
    Dataset that loads JSON Lines files contained in a path.

    Parameters
    ----------
    path
        Glob pattern; every file matching it is considered part of the
        dataset and should be valid. A valid line in a file can be for
        instance: {"start": "2014-09-07", "target": [0.1, 0.2]}.
    freq
        Frequency of the observation in the time series.
        Must be a valid Pandas frequency.
    one_dim_target
        Whether to accept only univariate target time series.
    shuffle
        Whether to shuffle the time series when making the batches
    """

    def __init__(
        self, path: Path, freq: str, one_dim_target: bool = True, shuffle: bool = False
    ) -> None:
        self.shuffle = shuffle
        self.path = path
        self.process = ProcessDataEntry(freq, one_dim_target=one_dim_target)
        # Fail early when the pattern matches nothing.
        matched = self.files()
        if not matched:
            raise OSError(f"no valid file found via {path}")

    def __iter__(self) -> Iterator[DataEntry]:
        for file_path in self.files():
            for record in JsonLinesFile(file_path, self.shuffle):
                entry = self.process(record.content)
                # Remember where the entry came from.
                location = record.span
                entry["source"] = SourceContext(
                    source=location.path, row=location.line
                )
                yield entry

    def __len__(self):
        return sum(len(JsonLinesFile(file_path)) for file_path in self.files())

    def files(self) -> List[Path]:
        """
        List the files that compose the dataset.

        Returns
        -------
        List[Path]
            List of the paths of all files composing the dataset, shuffled
            when ``shuffle`` is set. The entries are the strings returned
            by ``glob.glob``.
        """
        matches = glob.glob(str(self.path))
        if self.shuffle:
            random.shuffle(matches)
        return matches
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import random
|
||||
from typing import Iterable
|
||||
|
||||
from .common import DataEntry, Dataset, SourceContext
|
||||
from .process import ProcessDataEntry
|
||||
|
||||
|
||||
class ListDataset(Dataset):
    """
    Dataset backed by an in-memory list of data entries.

    Every entry of ``data_iter`` is passed through ``ProcessDataEntry`` once
    at construction time; the processed entries are optionally shuffled.
    """

    def __init__(
        self,
        data_iter: Iterable[DataEntry],
        freq: str,
        one_dim_target: bool = True,
        shuffle: bool = False,
    ) -> None:
        process = ProcessDataEntry(freq, one_dim_target)
        self.list_data = list(map(process, data_iter))
        if shuffle:
            random.shuffle(self.list_data)

    def __iter__(self):
        # Each stored entry is tagged in place with its 1-based row number.
        row_number = 0
        for data in self.list_data:
            row_number += 1
            data["source"] = SourceContext(source="list_data", row=row_number)
            yield data

    def __len__(self):
        return len(self.list_data)
|
||||
@@ -1,224 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import itertools
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, Iterable, Iterator, List, Optional # noqa: F401
|
||||
|
||||
import numpy as np
|
||||
# Third-party imports
|
||||
import torch
|
||||
|
||||
from pts.transform.transform import Transformation
|
||||
# First-party imports
|
||||
from .common import DataEntry, Dataset
|
||||
|
||||
# A batch maps each field name to the stacked values of that field.
DataBatch = Dict[str, Any]
|
||||
|
||||
|
||||
class BatchBuffer:
    """
    Accumulates data entries field by field and hands them out as batches.

    Parameters
    ----------
    batch_size
        Maximum number of entries per emitted batch.
    device
        Device the stacked arrays/tensors are moved to.
    dtype
        Floating point type that floating point numpy arrays are cast to.
    """

    def __init__(
        self, batch_size: int, device: torch.device, dtype: np.dtype = np.float32
    ) -> None:
        self._buffers: Dict[Any, List[Any]] = defaultdict(list)
        self.batch_size = batch_size
        self._size = 0
        self.device = device
        self.dtype = dtype

    def add(self, d: Dict[str, List[np.ndarray]]):
        """Append one entry; it must have the same keys as the buffered ones."""
        if self._buffers:
            assert self._buffers.keys() == d.keys()
        for k, v in d.items():
            self._buffers[k].append(v)
        self._size += 1

    def __len__(self):
        return self._size

    def next_batch(self) -> "DataBatch":
        """Remove up to ``batch_size`` of the oldest entries and return them stacked."""
        assert self._size > 0
        n = min(self._size, self.batch_size)
        batch = {k: self.stack(v[:n]) for k, v in self._buffers.items()}
        for key in self._buffers.keys():
            self._buffers[key] = self._buffers[key][n:]
        self._size -= n
        return batch

    def stack(self, xs):
        """
        Combine the values of one field into a single batch value.

        Numpy arrays are stacked, cast to ``dtype`` if they are floating
        point, and moved to ``device``; tensors are stacked along a new
        first dimension and moved to ``device``; anything else is returned
        as the list it came in.
        """
        if isinstance(xs[0], np.ndarray):
            data = np.asarray(xs)
            if data.dtype.kind == "f":
                data = data.astype(self.dtype)
            return torch.from_numpy(data).to(device=self.device, non_blocking=True)
        elif isinstance(xs[0], torch.Tensor):
            # torch.stack expects the sequence itself; unpacking it
            # (``torch.stack(*xs)``) passed the tensors as separate arguments.
            return torch.stack(xs).to(device=self.device, non_blocking=True)
        else:
            return xs  # stack all other types as list

    def shuffle(self):
        """Apply one random permutation to the entries of all fields."""
        perm = np.random.permutation(self._size)
        for key in self._buffers.keys():
            li = self._buffers[key]
            self._buffers[key] = [li[i] for i in perm]
|
||||
|
||||
|
||||
class DataLoader(Iterable[DataEntry]):
    """
    Abstract base for iterables that walk over a dataset, transform its
    entries and emit them in batches of a prescribed size. This class only
    stores the configuration.

    Parameters
    ----------
    dataset
        The dataset from which to load data.
    transform
        A transformation to apply to each entry in the dataset.
    batch_size
        The size of the batches to emit.
    device
        device to use to store data on.
    dtype
        Floating point type to use.
    """

    def __init__(
        self,
        dataset: Dataset,
        transform: Transformation,
        batch_size: int,
        device: torch.device,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.dataset, self.transform = dataset, transform
        self.batch_size = batch_size
        self.device, self.dtype = device, dtype
|
||||
|
||||
|
||||
class TrainDataLoader(DataLoader):
    """
    An Iterable type for iterating and transforming a dataset, in batches of a
    prescribed size, until a given number of batches is reached.

    The transformations are applied in training mode, i.e. with the flag
    `is_train = True`.

    Parameters
    ----------
    dataset
        The dataset from which to load data.
    transform
        A transformation to apply to each entry in the dataset.
    batch_size
        The size of the batches to emit.
    device
        device to use to store data on.
    num_batches_per_epoch
        Number of batches to return in one complete iteration over this object.
    dtype
        Floating point type to use.
    shuffle_for_training
        Whether to shuffle the buffered entries before emitting batches.
    num_batches_for_shuffling
        Number of batches worth of entries to buffer before shuffling; only
        used when ``shuffle_for_training`` is set.
    """

    def __init__(
        self,
        dataset: Dataset,
        transform: Transformation,
        batch_size: int,
        device: torch.device,
        num_batches_per_epoch: int,
        dtype: np.dtype = np.float32,
        shuffle_for_training: bool = True,
        num_batches_for_shuffling: int = 10,
    ) -> None:
        super().__init__(dataset, transform, batch_size, device, dtype)
        self.num_batches_per_epoch = num_batches_per_epoch
        self.shuffle_for_training = shuffle_for_training
        self._num_buffered_batches = (
            num_batches_for_shuffling if shuffle_for_training else 1
        )
        # The transformed stream is created lazily and kept across epochs,
        # so a new epoch continues where the previous one stopped.
        self._cur_iter: Optional[Iterator] = None
        self._buffer = BatchBuffer(self.batch_size, device, dtype)

    def _emit_batches_while_buffer_larger_than(self, thresh) -> Iterator[DataBatch]:
        """Shuffle the buffer if requested, then drain it down to ``thresh`` entries."""
        if self.shuffle_for_training:
            self._buffer.shuffle()
        while len(self._buffer) > thresh:
            yield self._buffer.next_batch()

    def _iterate_forever(self, collection: Iterable[DataEntry]) -> Iterator[DataEntry]:
        """
        Cycle over ``collection`` endlessly.

        Raises an ``Exception`` when a pass over the collection yields
        nothing, i.e. the collection must be non empty.
        """
        while True:
            # Use a single iterator per pass. Chaining the first element with
            # a fresh iteration of a re-iterable collection would emit the
            # first entry twice on every pass.
            iterator = iter(collection)
            try:
                first = next(iterator)
            except StopIteration:
                raise Exception("empty dataset")
            yield first
            yield from iterator

    def __len__(self) -> int:
        return self.num_batches_per_epoch

    def __iter__(self) -> Iterator[DataBatch]:
        """Yield exactly ``num_batches_per_epoch`` batches."""
        batch_count = 0
        if self._cur_iter is None:
            self._cur_iter = self.transform(
                self._iterate_forever(self.dataset), is_train=True
            )
        assert self._cur_iter is not None
        while True:
            data_entry = next(self._cur_iter)
            self._buffer.add(data_entry)
            # Only start emitting once enough entries are buffered, and only
            # emit full batches (leftovers stay in the buffer).
            if len(self._buffer) >= self._num_buffered_batches * self.batch_size:
                for batch in self._emit_batches_while_buffer_larger_than(
                    self.batch_size - 1
                ):
                    yield batch
                    batch_count += 1
                    if batch_count >= self.num_batches_per_epoch:
                        return
|
||||
|
||||
|
||||
class InferenceDataLoader(DataLoader):
    """
    An Iterable type for iterating and transforming a dataset just once, in
    batches of a prescribed size.

    The transformations are applied in inference mode, i.e. with the flag
    `is_train = False`.

    Parameters
    ----------
    dataset
        The dataset from which to load data.
    transform
        A transformation to apply to each entry in the dataset.
    batch_size
        The size of the batches to emit.
    device
        device to use to store data on.
    dtype
        Floating point type to use.
    """

    def __iter__(self) -> Iterator[DataBatch]:
        """Yield batches for one pass over the dataset, then any remainder."""
        pending = BatchBuffer(self.batch_size, self.device, self.dtype)
        transformed = self.transform(iter(self.dataset), is_train=False)
        for item in transformed:
            pending.add(item)
            if len(pending) >= self.batch_size:
                yield pending.next_batch()
        # Flush whatever is left over after the last full batch.
        if len(pending) > 0:
            yield pending.next_batch()
|
||||
@@ -1,211 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
import logging
|
||||
from typing import Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# First-party imports
|
||||
from .common import DataEntry, Dataset, FieldName, DateConstants
|
||||
from .list_dataset import ListDataset
|
||||
|
||||
|
||||
class MultivariateGrouper:
    """
    Groups a univariate dataset into multivariate time series, so that a
    univariate dataset can be used as a multivariate one without keeping a
    separate copy of it.

    The grouper operates in one of two modes:

    Training: the univariate series are aligned to the earliest time stamp in
    the dataset and padded on the left and on the right, producing an array of
    shape (dim, num_time_steps).

    Test: the test dataset may contain several start dates (typically because
    it mimics a rolling evaluation). The univariate dataset is then split into
    n multivariate series, n being the number of evaluation dates. The series
    are grouped as well, but only padded on the left. Note that the padded
    value influences the prediction if the context length is longer than the
    time series.

    The padding rules for training and test datasets can be supplied by the
    user.

    Parameters
    ----------
    max_target_dim
        Maximum dimensionality (for faster testing or when hitting constraints
        of a multivariate model). The last max_target_dim time series are kept
        and grouped.
    num_test_dates
        Number of test dates in the test set. Can be more than one if the test
        set contains several forecast start dates (often the case in a rolling
        evaluation). Must be set to convert test data.
    train_fill_rule
        Rule that fills missing data after alignment of the time series for
        the training dataset.
    test_fill_rule
        Rule that fills missing data after alignment of the time series for
        the test dataset.
    """

    def __init__(
        self,
        max_target_dim: Optional[int] = None,
        num_test_dates: Optional[int] = None,
        train_fill_rule: Callable = np.mean,
        test_fill_rule: Callable = lambda x: 0.0,
    ) -> None:
        self.max_target_dimension = max_target_dim
        self.num_test_dates = num_test_dates
        self.test_fill_rule = test_fill_rule
        self.train_fill_function = train_fill_rule

        # Start from the extreme supported timestamps; _preprocess narrows
        # them with min/max over the dataset.
        self.frequency = ""
        self.first_timestamp = DateConstants.LATEST_SUPPORTED_TIMESTAMP
        self.last_timestamp = DateConstants.OLDEST_SUPPORTED_TIMESTAMP

    def __call__(self, dataset: Dataset) -> Dataset:
        self._preprocess(dataset)
        return self._group_all(dataset)

    def _preprocess(self, dataset: Dataset) -> None:
        """
        Scan the dataset once to collect what alignment needs: the first and
        last timestamps found in the dataset, and its frequency.
        """
        for entry in dataset:
            start = entry[FieldName.START]
            self.first_timestamp = min(self.first_timestamp, start)
            end = start + (len(entry[FieldName.TARGET]) - 1) * start.freq
            self.last_timestamp = max(self.last_timestamp, end)
            self.frequency = start.freq
        logging.info(
            f"first/last timestamp found: {self.first_timestamp}/{self.last_timestamp}"
        )

    def _group_all(self, dataset: Dataset) -> Dataset:
        # Test mode is selected by providing num_test_dates.
        if self.num_test_dates is not None:
            return self._prepare_test_data(dataset)
        return self._prepare_train_data(dataset)

    def _prepare_train_data(self, dataset: Dataset) -> ListDataset:
        logging.info("group training time-series to datasets")

        entry = self._restrict_max_dimensionality(
            self._transform_target(self._align_data_entry, dataset)
        )
        entry[FieldName.START] = self.first_timestamp
        entry[FieldName.FEAT_STATIC_CAT] = [0]

        return ListDataset([entry], freq=self.frequency, one_dim_target=False)

    def _prepare_test_data(self, dataset: Dataset) -> ListDataset:
        logging.info("group test time-series to datasets")

        padded = self._transform_target(self._left_pad_data, dataset)
        # Split the rolling-date test dataset into N multivariate series,
        # N being the number of rolling evaluation dates.
        per_date = np.split(padded[FieldName.TARGET], self.num_test_dates)

        entries = []
        for rows in per_date:
            entry = {FieldName.TARGET: np.array(list(rows), dtype=np.float32)}
            entry = self._restrict_max_dimensionality(entry)
            entry[FieldName.START] = self.first_timestamp
            entry[FieldName.FEAT_STATIC_CAT] = [0]
            entries.append(entry)

        return ListDataset(entries, freq=self.frequency, one_dim_target=False)

    def _align_data_entry(self, data: DataEntry) -> np.array:
        """Reindex one series onto the full [first, last] timestamp range."""
        series = self.to_ts(data)
        full_index = pd.date_range(
            start=self.first_timestamp,
            end=self.last_timestamp,
            freq=data[FieldName.START].freq,
        )
        filler = self.train_fill_function(series)
        return series.reindex(full_index, fill_value=filler).values

    def _left_pad_data(self, data: DataEntry) -> np.array:
        """Reindex one series so it starts at the first timestamp and keeps its own end."""
        series = self.to_ts(data)
        padded_index = pd.date_range(
            start=self.first_timestamp,
            end=series.index[-1],
            freq=data[FieldName.START].freq,
        )
        filler = self.test_fill_rule(series)
        return series.reindex(padded_index, fill_value=filler).values

    @staticmethod
    def _transform_target(funcs, dataset: Dataset) -> DataEntry:
        """Apply ``funcs`` to every entry and stack the results as the target."""
        rows = [funcs(entry) for entry in dataset]
        return {FieldName.TARGET: np.array(rows)}

    def _restrict_max_dimensionality(self, data: DataEntry) -> DataEntry:
        """
        Keep only the last max_target_dimension dimensions of a multivariate
        data entry.

        Parameters
        ----------
        data
            multivariate data entry with (dim, num_timesteps) target field

        Returns
        -------
        DataEntry
            the same data entry, whose target field is
            (max_target_dimension, num_timesteps) when a maximum is set
        """
        limit = self.max_target_dimension
        if limit is None:
            return data
        # restrict maximum dimensionality (for faster testing)
        target = data[FieldName.TARGET]
        data[FieldName.TARGET] = target[-limit:, :]
        return data

    @staticmethod
    def to_ts(data: DataEntry) -> pd.Series:
        """Build a pandas Series from the target, indexed from the start date."""
        start = data[FieldName.START]
        index = pd.date_range(
            start=start,
            periods=len(data[FieldName.TARGET]),
            freq=data[FieldName.START].freq,
        )
        return pd.Series(data[FieldName.TARGET], index=index)
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Callable, List, cast
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.tseries.offsets import Tick
|
||||
|
||||
from .common import DataEntry
|
||||
|
||||
|
||||
class ProcessStartField:
    """
    Callable that replaces the field ``name`` of a data entry by a
    ``pd.Timestamp`` built with frequency ``freq``.
    """

    def __init__(self, name: str, freq: str) -> None:
        self.freq = freq
        self.name = name

    def __call__(self, data: DataEntry) -> DataEntry:
        """
        Parse ``data[self.name]`` in place and return ``data``.

        Raises
        ------
        Exception
            If parsing raises a ``TypeError`` or ``ValueError``.
        """
        try:
            parsed = ProcessStartField.process(data[self.name], self.freq)
        except (TypeError, ValueError) as err:
            raise Exception(f'Error "{err}" occurred when reading field "{self.name}"')

        data[self.name] = parsed

        return data

    @staticmethod
    @lru_cache(maxsize=10000)
    def process(string: str, freq: str) -> pd.Timestamp:
        """Convert ``string`` to a timestamp aligned to ``freq`` (results are cached)."""
        stamp = pd.Timestamp(string, freq=freq)
        offset = stamp.freq

        if not isinstance(offset, Tick):
            # Only the date part matters for non-tick frequencies, so the
            # time-of-day is zeroed before rolling forward onto the offset.
            stamp = stamp.replace(
                hour=0, minute=0, second=0, microsecond=0, nanosecond=0
            )
            return stamp.freq.rollforward(stamp)

        # Tick frequencies (days, hours, minutes, seconds): floor to the tick.
        return pd.Timestamp(stamp.floor(offset), offset)
|
||||
|
||||
|
||||
class ProcessTimeSeriesField:
    """
    Callable that converts the field ``name`` of a data entry to a numpy array
    with a required number of dimensions.

    Parameters
    ----------
    name
        Key of the field to process.
    is_required
        Whether a missing (or ``None``) field is an error.
    is_static
        If true the array must have 1 dimension, otherwise 2.
    is_cat
        If true the array is converted to ``np.int64``, otherwise ``np.float32``.
    """

    def __init__(self, name, is_required: bool, is_static: bool, is_cat: bool) -> None:
        self.name = name
        self.is_required = is_required
        self.req_ndim = 1 if is_static else 2
        self.dtype = np.int64 if is_cat else np.float32

    def __call__(self, data: DataEntry) -> DataEntry:
        """
        Convert ``data[self.name]`` in place and return ``data``.

        An array with exactly one dimension less than required gets a leading
        axis of size one.

        Raises
        ------
        Exception
            If the field is required but missing, or if its number of
            dimensions cannot be brought to the required one.
        """
        value = data.get(self.name, None)

        if value is None:
            if self.is_required:
                raise Exception(
                    f"JSON object is missing a required field `{self.name}`"
                )
            return data

        value = np.asarray(value, dtype=self.dtype)
        dim_diff = self.req_ndim - value.ndim
        if dim_diff == 1:
            value = np.expand_dims(a=value, axis=0)
        elif dim_diff != 0:
            # Report the array's real dimension count; the message previously
            # printed the difference ``dim_diff`` as the "got" value.
            raise Exception(
                f"JSON array has bad shape - expected {self.req_ndim} dimensions got {value.ndim}"
            )

        data[self.name] = value
        return data
|
||||
|
||||
|
||||
class ProcessDataEntry:
    """
    Callable that runs the standard field processors over a data entry: the
    ``start`` field, the required ``target`` field, and the optional
    dynamic/static categorical and real feature fields.
    """

    def __init__(self, freq: str, one_dim_target: bool = True) -> None:
        steps = [
            ProcessStartField("start", freq=freq),
            ProcessTimeSeriesField(
                "target", is_required=True, is_cat=False, is_static=one_dim_target
            ),
        ]
        # (field name, is_cat, is_static) for the optional feature fields.
        optional_fields = (
            ("feat_dynamic_cat", True, False),
            ("feat_dynamic_real", False, False),
            ("feat_static_cat", True, True),
            ("feat_static_real", False, True),
        )
        for field, categorical, static in optional_fields:
            steps.append(
                ProcessTimeSeriesField(
                    field, is_required=False, is_cat=categorical, is_static=static
                )
            )
        self.trans = cast(List[Callable[[DataEntry], DataEntry]], steps)

    def __call__(self, data: DataEntry) -> DataEntry:
        """Apply every processor in order and return the resulting entry."""
        entry = data
        for step in self.trans:
            entry = step(entry)
        return entry
|
||||
@@ -1,604 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
import functools
|
||||
import itertools
|
||||
import operator
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# First-party imports
|
||||
from .common import DataEntry
|
||||
|
||||
# A recipe argument: either a plain value or a callable (see `resolve`).
ValueOrCallable = Union[Any, Callable]
|
||||
# A recipe: a list of (field name, callable) pairs.
Recipe = List[Tuple[str, Callable]]
|
||||
# An environment: a mapping from string keys to arbitrary values.
Env = Dict[str, Any]
|
||||
|
||||
|
||||
def resolve(val_or_callable: ValueOrCallable, context: Env, *args, **kwargs):
    """
    Turn a recipe argument into a concrete value.

    A callable is invoked with ``context`` and the extra arguments, a string
    is looked up as a key of ``context``, and anything else is returned as is.
    """
    if callable(val_or_callable):
        return val_or_callable(context, *args, **kwargs)
    if isinstance(val_or_callable, str):
        return context[val_or_callable]
    return val_or_callable
|
||||
|
||||
|
||||
def generate(
    length: int,
    recipe: Union[Callable, Recipe],
    start: pd.Timestamp,
    global_state: Optional[dict] = None,
    seed: int = 0,
    item_id_prefix: str = "",
) -> Iterator[DataEntry]:
    """
    Endlessly yield data entries produced by ``recipe``.

    Parameters
    ----------
    length
        Length forwarded to the recipe.
    recipe
        Either a list of (field name, value-or-callable) pairs that is
        resolved field by field, or a callable taking ``length`` and
        ``global_state`` and returning the fields of one entry.
    start
        Value stored under ``start`` in every entry.
    global_state
        State dictionary shared by all entries; a fresh one if ``None``.
    seed
        Seed given to ``np.random.seed`` when iteration begins.
    item_id_prefix
        Prefix of the running ``item_id`` of each entry.
    """
    np.random.seed(seed)

    state = {} if global_state is None else global_state

    is_field_list = isinstance(recipe, list)
    if not is_field_list:
        assert callable(recipe)

    for index in itertools.count():
        if is_field_list:
            fields: DataEntry = {}
            for name, spec in recipe:
                # Earlier fields are visible to later ones through ``fields``.
                fields[name] = resolve(
                    spec, fields, length=length, field_name=name, global_state=state,
                )
        else:
            fields = recipe(length=length, global_state=state)
        yield dict(**fields, item_id=item_id_prefix + str(index), start=start)
|
||||
|
||||
|
||||
def evaluate(
    funcs: Recipe, length: int, *args, global_state: dict = None, **kwargs
) -> Env:
    """
    Resolve every (field name, value-or-callable) pair of ``funcs`` once and
    return the resulting environment.

    The keys ``length``, ``field_name`` and ``global_state`` are removed from
    ``kwargs`` because they are supplied explicitly to each field.

    Raises
    ------
    ValueError
        If resolving a field raises ``ValueError``; the field name and the
        original error are the arguments of the new exception.
    """
    state = {} if global_state is None else global_state

    for reserved in ("length", "field_name", "global_state"):
        kwargs.pop(reserved, None)

    env: DataEntry = {}
    for name, spec in funcs:
        try:
            env[name] = resolve(
                spec,
                env,
                length=length,
                field_name=name,
                global_state=state,
                *args,
                **kwargs
            )
        except ValueError as err:
            raise ValueError('Error while evaluating key "{}"'.format(name), err)

    return env
|
||||
|
||||
|
||||
def make_func(
    length: int, funcs: Recipe, global_state=None
) -> Callable[[int, Env], DataEntry]:
    """
    Wrap the recipe ``funcs`` in a function that resolves all its fields.

    The returned function uses ``length`` and ``global_state`` as the defaults
    of its own ``length`` and ``global_state`` parameters.
    """
    default_state = {} if global_state is None else global_state

    def f(length=length, global_state=default_state, *args, **kwargs):
        fields = {}
        for name, spec in funcs:
            fields[name] = resolve(
                spec,
                fields,
                length=length,
                field_name=name,
                global_state=global_state,
                *args,
                **kwargs
            )
        return fields

    return f
|
||||
|
||||
|
||||
def take_as_list(iterator, num):
    """Return the first ``num`` items of ``iterator`` as a list."""
    head = itertools.islice(iterator, num)
    return [*head]
|
||||
|
||||
|
||||
class Debug:
    """
    Recipe component that prints the current environment, and the global
    state as well when ``print_global`` is set. It always evaluates to 0.
    """

    def __init__(self, print_global=False) -> None:
        self.print_global = print_global

    def __call__(self, x: Env, global_state, **kwargs):
        to_show = [x]
        if self.print_global:
            to_show.append(global_state)
        for item in to_show:
            print(item)
        return 0
|
||||
|
||||
|
||||
class Lifted:
    """
    Base class of recipe components that can be combined with ``+``, ``-``,
    ``*`` and ``/``. Each operator returns a new lifted binary operation
    instead of computing a value; the value is computed when the result is
    called.
    """

    def __add__(self, other):
        return LiftedAdd(self, other)

    def __radd__(self, other):
        return LiftedAdd(other, self)

    def __sub__(self, other):
        return LiftedSub(self, other)

    def __rsub__(self, other):
        return LiftedSub(other, self)

    # LiftedMul and LiftedTruediv select their operator themselves and accept
    # only (left, right); passing the operator as a third argument raised a
    # TypeError on every multiplication or division.
    def __mul__(self, other):
        return LiftedMul(self, other)

    def __rmul__(self, other):
        return LiftedMul(other, self)

    def __truediv__(self, other):
        return LiftedTruediv(self, other)

    def __rtruediv__(self, other):
        return LiftedTruediv(other, self)

    def __call__(
        self, x: Env, length: int, field_name: str, global_state: Dict, *args, **kwargs
    ):
        # Placeholder; the base implementation returns None.
        pass


class LiftedBinaryOp(Lifted):
    """Deferred binary operation ``op(left, right)`` on two recipe arguments."""

    def __init__(self, left, right, op) -> None:
        self.left = left
        self.right = right
        self.op = op

    def __call__(self, *args, **kwargs):
        # Both operands are resolved with the same arguments, left first.
        left = resolve(self.left, *args, **kwargs)
        right = resolve(self.right, *args, **kwargs)
        return self.op(left, right)


class LiftedAdd(LiftedBinaryOp):
    """Deferred ``left + right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.add)


class LiftedSub(LiftedBinaryOp):
    """Deferred ``left - right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.sub)


class LiftedMul(LiftedBinaryOp):
    """Deferred ``left * right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.mul)


class LiftedTruediv(LiftedBinaryOp):
    """Deferred ``left / right``."""

    def __init__(self, left, right) -> None:
        super().__init__(left, right, operator.truediv)
|
||||
|
||||
|
||||
class RandomGaussian(Lifted):
    """
    Recipe component producing standard normal samples scaled by ``stddev``.

    Entries of ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self, stddev: ValueOrCallable = 1.0, shape: Sequence[int] = (0,)
    ) -> None:
        self.shape = shape
        self.stddev = stddev

    def __call__(self, x: Env, length: int, *args, **kwargs):
        scale = resolve(self.stddev, x, length, *args, **kwargs)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        noise = np.random.randn(*dims)
        return scale * noise
|
||||
|
||||
|
||||
# Binary recipe that returns 1 if date is in holidays list and 0 otherwise
|
||||
class BinaryHolidays(Lifted):
    """
    Recipe component returning one float per entry of ``dates``: 1.0 when the
    calendar date of the entry is contained in ``holidays``, 0.0 otherwise.
    """

    # TODO: holidays is type List[datetime.date]
    def __init__(self, dates: List[pd.Timestamp], holidays: List[Any]) -> None:
        self.holidays = holidays
        self.dates = dates

    def __call__(self, *args, **kwargs):
        # Membership is tested on the ``date()`` of each timestamp.
        flags = [1.0 if stamp.date() in self.holidays else 0.0 for stamp in self.dates]
        indicator = np.ones(len(self.dates))
        indicator[:] = flags
        return indicator
|
||||
|
||||
|
||||
class RandomBinary(Lifted):
    """
    Recipe component returning ``length`` values that are 1.0 where a uniform
    random draw falls below ``prob`` and 0.0 elsewhere.
    """

    def __init__(self, prob: ValueOrCallable = 0.1) -> None:
        self.prob = prob

    def __call__(self, x: Env, length: int, *args, **kwargs):
        threshold = resolve(self.prob, x, length, *args, **kwargs)
        draws = np.random.rand(length)
        return 1.0 * (draws < threshold)
|
||||
|
||||
|
||||
class RandomSymmetricDirichlet(Lifted):
    """
    Recipe component sampling from a Dirichlet distribution whose
    concentration parameters all equal ``alpha``.

    Entries of ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self, alpha: ValueOrCallable = 1.0, shape: Sequence[int] = (0,)
    ) -> None:
        self.shape = shape
        self.alpha = alpha

    def __call__(self, x, length, *args, **kwargs):
        concentration = resolve(self.alpha, x, length, *args, **kwargs)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        return np.random.dirichlet(concentration * np.ones(dims))
|
||||
|
||||
|
||||
class BinaryMarkovChain(Lifted):
    """
    Recipe component simulating a two-state (0/1) Markov chain of the
    requested length. The chain starts in state 1.

    Parameters
    ----------
    one_to_zero
        Probability of switching from state 1 to state 0.
    zero_to_one
        Probability of switching from state 0 to state 1.
    """

    def __init__(
        self, one_to_zero: ValueOrCallable, zero_to_one: ValueOrCallable
    ) -> None:
        self.one_to_zero = one_to_zero
        self.zero_to_one = zero_to_one

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # probs is indexed by the previous state and holds the probability of
        # leaving that state.
        probs = np.zeros(2)
        probs[0] = resolve(self.zero_to_one, x, length, *args, **kwargs)
        probs[1] = resolve(self.one_to_zero, x, length, *args, **kwargs)
        # The builtin ``int`` replaces the ``np.int`` alias, which no longer
        # exists in current NumPy releases.
        out = np.ones(length, dtype=int)  # initial state is 1
        uu = np.random.rand(length)
        for i in range(1, length):
            if uu[i] < probs[out[i - 1]]:
                out[i] = 1 - out[i - 1]
            else:
                out[i] = out[i - 1]
        return out
|
||||
|
||||
|
||||
class Constant(Lifted):
    """Recipe component that evaluates to the stored ``constant``, ignoring all arguments."""

    def __init__(self, constant) -> None:
        self.constant = constant

    def __call__(self, *args, **kwargs):
        value = self.constant
        return value
|
||||
|
||||
|
||||
class ConstantVec(Lifted):
    """Recipe component returning the resolved ``constant`` times a vector of ``length`` ones."""

    def __init__(self, constant: ValueOrCallable) -> None:
        self.constant = constant

    def __call__(self, x: Env, length: int, *args, **kwargs):
        level = resolve(self.constant, x, length, *args, **kwargs)
        ones = np.ones(length)
        return level * ones
|
||||
|
||||
|
||||
class NormalizeMax(Lifted):
    """Recipe component dividing the resolved ``input`` by its maximum."""

    def __init__(self, input) -> None:
        self.input = input

    def __call__(self, x: Env, *args, **kwargs):
        # Keyword arguments must be unpacked: passing ``kwargs`` itself handed
        # the whole dict to the input as one extra positional argument and
        # dropped every keyword (length, field_name, global_state, ...).
        inp = resolve(self.input, x, *args, **kwargs)
        return inp / np.max(inp)
|
||||
|
||||
|
||||
class OnesLike(Lifted):
    """Recipe component returning ``np.ones_like`` of the resolved ``other``."""

    def __init__(self, other) -> None:
        self.other = other

    def __call__(self, x, length, *args, **kwargs):
        # Extra positional arguments are not forwarded to ``other``.
        template = resolve(self.other, x, length, **kwargs)
        return np.ones_like(template)
|
||||
|
||||
|
||||
class LinearTrend(Lifted):
    """
    Recipe component returning ``slope * i / length`` for
    ``i = 0, ..., length - 1``.
    """

    def __init__(self, slope: ValueOrCallable = 1.0) -> None:
        self.slope = slope

    def __call__(self, x, length, *args, **kwargs):
        gradient = resolve(self.slope, x, length, *args, **kwargs)
        steps = np.arange(length)
        return gradient * steps / length
|
||||
|
||||
|
||||
class RandomCat:
    """
    Recipe component drawing one random category per entry of
    ``cardinalities``.

    The category probabilities are produced by ``prob_fun`` the first time a
    field is evaluated and stored in the global state under the field name,
    so later evaluations of the same field reuse them.
    """

    def __init__(
        self,
        cardinalities: List[int],
        prob_fun: Callable = RandomSymmetricDirichlet(alpha=1.0, shape=(0,)),
    ) -> None:
        self.prob_fun = prob_fun
        self.cardinalities = cardinalities

    def __call__(self, x, field_name, global_state, **kwargs):
        if field_name not in global_state:
            global_state[field_name] = [
                self.prob_fun(x, length=card) for card in self.cardinalities
            ]
        distributions = global_state[field_name]
        draws = []
        for idx in range(len(distributions)):
            weights = distributions[idx]
            draws.append(np.random.choice(np.arange(len(weights)), p=weights))
        return np.array(draws)
|
||||
|
||||
|
||||
class Lag(Lifted):
    """
    Recipe component shifting the resolved ``input`` by ``lag`` positions.

    A positive lag drops the last ``lag`` values and prepends ``lag`` copies
    of ``pad_const``; a negative lag drops the first ``-lag`` values and
    appends the padding; otherwise the input is returned unchanged.
    """

    def __init__(
        self, input: ValueOrCallable, lag: ValueOrCallable = 0, pad_const: int = 0,
    ) -> None:
        self.pad_const = pad_const
        self.lag = lag
        self.input = input

    def __call__(self, x, *args, **kwargs):
        series = resolve(self.input, x, *args, **kwargs)
        shift = resolve(self.lag, x, *args, **kwargs)

        if shift > 0:
            padding = self.pad_const * np.ones(shift)
            return np.concatenate((padding, series[:-shift]))
        if shift < 0:
            return np.concatenate((series[-shift:], self.pad_const * np.ones(-shift)))
        return series
|
||||
|
||||
|
||||
class ForEachCat(Lifted):
    """
    Recipe component that evaluates ``fun`` once per category value and
    reuses the result for later entries with the same category.

    Parameters
    ----------
    fun
        Callable evaluated the first time a category value is seen.
    cat_field
        Name of the field holding the categories.
    cat_idx
        Index into that field selecting the category to use.
    """

    def __init__(self, fun, cat_field="cat", cat_idx=0) -> None:
        self.fun = fun
        self.cat_field = cat_field
        self.cat_idx = cat_idx

    def __call__(
        self, x: Env, length: int, field_name: str, global_state: Dict, *args, **kwargs
    ):
        c = x[self.cat_field][self.cat_idx]
        if field_name not in global_state:
            # One slot per entry of the stored category state; an object
            # array starts out filled with None. The builtin ``object``
            # replaces the ``np.object`` alias, which no longer exists in
            # current NumPy releases.
            global_state[field_name] = np.empty(
                len(global_state[self.cat_field][self.cat_idx]), dtype=object,
            )
        if global_state[field_name][c] is None:
            global_state[field_name][c] = self.fun(
                x, length=length, field_name=field_name, *args, **kwargs
            )
        return global_state[field_name][c]
|
||||
|
||||
|
||||
class Eval(Lifted):
    """
    Recipe component evaluating the Python expression ``expr`` with ``x``,
    ``length`` and the keyword arguments available as local names.

    NOTE(review): ``expr`` is passed to ``eval``; it must never come from an
    untrusted source.
    """

    def __init__(self, expr: str) -> None:
        self.expr = expr

    def __call__(self, x: Env, length: int, *args, **kwargs):
        local_names = dict(x=x, length=length, **kwargs)
        return eval(self.expr, globals(), local_names)
|
||||
|
||||
|
||||
class SmoothSeasonality(Lifted):
    """
    Recipe component returning a sine wave with the given ``period`` and
    ``phase``, rescaled to lie between 0 and 1.
    """

    def __init__(self, period: ValueOrCallable, phase: ValueOrCallable) -> None:
        self.phase = phase
        self.period = period

    def __call__(self, x: Env, length: int, *args, **kwargs):
        period = resolve(self.period, x, length, *args, **kwargs)
        phase = resolve(self.phase, x, length, *args, **kwargs)
        angle = 2.0 / period * np.pi * (np.arange(length) + phase)
        return (np.sin(angle) + 1) / 2.0
|
||||
|
||||
|
||||
class Add(Lifted):
    """Recipe component returning the sum of all resolved ``inputs``."""

    def __init__(self, inputs: List[ValueOrCallable]) -> None:
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        terms = []
        for item in self.inputs:
            terms.append(resolve(item, x, length, *args, **kwargs))
        return sum(terms)
|
||||
|
||||
|
||||
class Mul(Lifted):
    """Recipe component returning the product of all resolved ``inputs``."""

    def __init__(self, inputs) -> None:
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        factors = []
        for item in self.inputs:
            factors.append(resolve(item, x, length, *args, **kwargs))
        return functools.reduce(operator.mul, factors)
|
||||
|
||||
|
||||
class NanWhere(Lifted):
    """
    Recipe component returning a copy of ``source`` in which the positions
    where ``nan_indicator`` equals 1 are set to NaN.
    """

    def __init__(self, source: ValueOrCallable, nan_indicator: ValueOrCallable) -> None:
        self.nan_indicator = nan_indicator
        self.source = source

    def __call__(self, x: Env, length: int, *args, **kwargs):
        values = resolve(self.source, x, length, *args, **kwargs)
        mask = resolve(self.nan_indicator, x, length, *args, **kwargs)
        # Work on a copy so the resolved source itself is left untouched.
        result = values.copy()
        result[mask == 1] = np.nan
        return result
|
||||
|
||||
|
||||
class OneMinus(Lifted):
    """Recipe component returning ``1 - source``."""

    def __init__(self, source: ValueOrCallable) -> None:
        self.source = source

    def __call__(self, x: Env, length: int, *args, **kwargs):
        resolved = resolve(self.source, x, length, *args, **kwargs)
        return 1 - resolved
|
||||
|
||||
|
||||
class Concatenate(Lifted):
    """Recipe component concatenating the resolved ``inputs`` along ``axis``."""

    def __init__(self, inputs: List[ValueOrCallable], axis: int = 0) -> None:
        self.axis = axis
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Extra positional arguments are not forwarded to the inputs.
        pieces = []
        for item in self.inputs:
            pieces.append(resolve(item, x, length, **kwargs))
        return np.concatenate(pieces, self.axis)
|
||||
|
||||
|
||||
class Stack(Lifted):
    """Recipe component stacking the resolved ``inputs`` along a new first axis."""

    def __init__(self, inputs: List[ValueOrCallable]) -> None:
        self.inputs = inputs

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Extra positional arguments are not forwarded to the inputs.
        layers = []
        for item in self.inputs:
            layers.append(resolve(item, x, length, **kwargs))
        return np.stack(layers, axis=0)
|
||||
|
||||
|
||||
class StackPrefix(Lifted):
    """
    Recipe component stacking, along a new first axis, every value of the
    environment whose key starts with ``prefix``.
    """

    def __init__(self, prefix: str) -> None:
        self.prefix = prefix

    def __call__(self, x: Env, length: int, *args, **kwargs):
        matching = []
        for key, value in x.items():
            if key.startswith(self.prefix):
                matching.append(value)
        return np.stack(matching, axis=0)
|
||||
|
||||
|
||||
class Ref(Lifted):
    """Recipe component returning the environment value stored under ``field_name``."""

    def __init__(self, field_name: str) -> None:
        self.field_name = field_name

    def __call__(self, x: Env, length: int, *args, **kwargs):
        key = self.field_name
        return x[key]
|
||||
|
||||
|
||||
class RandomUniform(Lifted):
    """
    Recipe component drawing uniform samples between ``low`` and ``high``.

    Entries of ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self, low: ValueOrCallable = 0.0, high: ValueOrCallable = 1.0, shape=(0,),
    ) -> None:
        self.shape = shape
        self.low = low
        self.high = high

    def __call__(self, x: Env, length: int, *args, **kwargs):
        lower = resolve(self.low, x, length, *args, **kwargs)
        upper = resolve(self.high, x, length, *args, **kwargs)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        return np.random.uniform(lower, upper, dims)
|
||||
|
||||
|
||||
class RandomInteger(Lifted):
    """
    Recipe component drawing random integers with ``np.random.randint``.

    With ``shape=None`` a single draw is returned; otherwise entries of
    ``shape`` equal to 0 are replaced by the requested length.
    """

    def __init__(
        self,
        low: ValueOrCallable,
        high: ValueOrCallable,
        shape: Optional[Sequence[int]] = (0,),
    ) -> None:
        self.shape = shape
        self.low = low
        self.high = high

    def __call__(self, x: Env, length: int, *args, **kwargs):
        lower = resolve(self.low, x, length, *args, **kwargs)
        upper = resolve(self.high, x, length, *args, **kwargs)
        if self.shape is None:
            return np.random.randint(lower, upper)
        dims = np.array(self.shape)
        dims[dims == 0] = length
        return np.random.randint(lower, upper, dims)
|
||||
|
||||
|
||||
class RandomChangepoints(Lifted):
    """
    Recipe component returning an integer vector of the requested length that
    labels the segments between randomly placed changepoints.

    The number of changepoints is drawn uniformly between 0 and
    ``max_num_changepoints`` (inclusive). Positions before the first
    changepoint are 0, and the segment starting at the i-th changepoint (in
    sorted order, counting from 0) is filled with ``i + 1``.
    """

    def __init__(self, max_num_changepoints: ValueOrCallable) -> None:
        self.max_num_changepoints = max_num_changepoints

    def __call__(self, x: Env, length: int, *args, **kwargs):
        max_num_changepoints = resolve(
            self.max_num_changepoints, x, length, *args, **kwargs
        )
        num_changepoints = np.random.randint(0, max_num_changepoints + 1)
        change_idx = np.sort(
            np.random.randint(low=1, high=length - 1, size=(num_changepoints,))
        )
        # Append the end of the series so each changepoint has a segment end.
        change_ranges = np.concatenate([change_idx, [length]])
        # The builtin ``int`` replaces the ``np.int`` alias, which no longer
        # exists in current NumPy releases.
        out = np.zeros(length, dtype=int)
        for i in range(0, num_changepoints):
            out[change_ranges[i] : change_ranges[i + 1]] = i + 1
        return out
|
||||
|
||||
|
||||
class Repeated(Lifted):
    """Recipe component tiling the resolved ``pattern`` and cutting it to ``length`` values."""

    def __init__(self, pattern: ValueOrCallable) -> None:
        self.pattern = pattern

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # Extra positional arguments are not forwarded to the pattern.
        motif = resolve(self.pattern, x, length, **kwargs)
        # One repetition more than fits, so the tiling covers the length.
        copies = length // len(motif) + 1
        tiled = np.tile(motif, (copies,))
        return tiled[:length]
|
||||
|
||||
|
||||
class Convolve(Lifted):
    """
    Lifted operator that convolves an input with a filter (``mode="same"``).

    Parameters
    ----------
    input
        Signal to convolve (value or callable, passed through ``resolve``).
    filter
        Convolution kernel (value or callable, passed through ``resolve``).
    """

    def __init__(self, input: ValueOrCallable, filter: ValueOrCallable) -> None:
        self.input = input
        self.filter = filter

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # the filter is resolved before the input, as in the original order
        kernel = resolve(self.filter, x, length, **kwargs)
        signal = resolve(self.input, x, length, **kwargs)
        return np.convolve(signal, kernel, mode="same")
|
||||
|
||||
|
||||
class Dilated(Lifted):
    """
    Lifted operator that stretches a source by repeating each of its values.

    Parameters
    ----------
    source
        Callable invoked as ``source(x, n, **kwargs)`` to produce the
        undilated values.
    dilation
        Number of times each source value is repeated.
    """

    def __init__(self, source: Callable, dilation: int) -> None:
        self.source = source
        self.dilation = dilation

    def __call__(self, x: Env, length: int, *args, **kwargs):
        # ask for one element more than the quotient so the repeated
        # sequence always reaches `length`
        base_length = length // self.dilation + 1
        base = self.source(x, base_length, **kwargs)
        return np.repeat(base, self.dilation)[:length]
|
||||
|
||||
|
||||
class Choose(Lifted):
    """
    Lifted operator that picks, per position, one of several options.

    Parameters
    ----------
    options
        Candidate values (value or callable, passed through ``resolve``).
        NOTE(review): presumably a 2-D array with one row per option --
        confirm against callers.
    selector
        Index of the option to use at each position (value or callable,
        passed through ``resolve``).
    """

    def __init__(self, options: ValueOrCallable, selector: ValueOrCallable) -> None:
        self.options = options
        self.selector = selector

    def __call__(self, x, length, **kwargs):
        candidates = resolve(self.options, x, length, **kwargs)
        picks = resolve(self.selector, x, length, **kwargs)
        # rows of the identity matrix act as one-hot masks over the options
        one_hot = np.eye(candidates.shape[0])[picks]
        return np.sum(one_hot * candidates.T, axis=1)
|
||||
|
||||
|
||||
class EvalRecipe(Lifted):
    """
    Lifted operator that evaluates a nested recipe and resolves ``op`` in it.

    Parameters
    ----------
    recipe
        Recipe passed to ``evaluate``.
    op
        Value or callable resolved against the result of that evaluation.
    """

    def __init__(self, recipe: Recipe, op: ValueOrCallable) -> None:
        self.recipe = recipe
        self.op = op

    def __call__(self, x: Env, *args, **kwargs):
        # the incoming environment `x` is not used; `op` is resolved against
        # the result of evaluating the nested recipe
        inner_env = evaluate(self.recipe, *args, **kwargs)
        return resolve(self.op, inner_env, *args, **kwargs)
|
||||
@@ -1,14 +1 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from .datasets import get_dataset, dataset_recipes
|
||||
from .datasets import dataset_recipes
|
||||
@@ -1,48 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# First-party imports
|
||||
from pts.dataset import ArtificialDataset, generate_sf2, serialize_data_entry
|
||||
|
||||
|
||||
def generate_artificial_dataset(dataset_path: Path, dataset: ArtificialDataset) -> None:
    """
    Generate an artificial dataset and write it below ``dataset_path``.

    Writes ``metadata.json``, ``train/train.json`` and ``test/test.json``.

    Parameters
    ----------
    dataset_path
        Target directory; created if missing. The ``train`` and ``test``
        sub-directories must not exist yet (``mkdir(exist_ok=False)``).
    dataset
        Generator whose ``generate()`` result provides ``metadata``,
        ``train`` and ``test``.
    """
    train_dir = dataset_path / "train"
    test_dir = dataset_path / "test"

    dataset_path.mkdir(exist_ok=True)
    for split_dir in (train_dir, test_dir):
        split_dir.mkdir(exist_ok=False)

    ds = dataset.generate()
    assert ds.test is not None

    with (dataset_path / "metadata.json").open("w") as fp:
        json.dump(ds.metadata.dict(), fp, indent=2, sort_keys=True)

    train_entries = [serialize_data_entry(entry) for entry in ds.train]
    generate_sf2(
        filename=str(train_dir / "train.json"),
        time_series=train_entries,
        is_missing=False,
        num_missing=0,
    )

    test_entries = [serialize_data_entry(entry) for entry in ds.test]
    generate_sf2(
        filename=str(test_dir / "test.json"),
        time_series=test_entries,
        is_missing=False,
        num_missing=0,
    )
|
||||
@@ -1,160 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
"""
|
||||
Loads the datasets used in Salinas et al. 2019 (https://tinyurl.com/woyhhqy).
|
||||
This wrapper downloads and unpacks them so they don't have to be attached as
|
||||
large files in GluonTS master.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple, Optional
|
||||
from urllib import request
|
||||
|
||||
from pts.dataset import FileDataset, FieldName
|
||||
from ._util import save_to_file, to_dict, metadata
|
||||
|
||||
|
||||
class GPCopulaDataset(NamedTuple):
    """Static description of one downloadable GP-copula dataset archive."""

    # name used for the local ``<name>.tar.gz`` download file
    name: str
    # location of the archive
    url: str
    # number of series; used as the metadata cardinality
    num_series: int
    prediction_length: int
    # frequency string passed to the dataset loader and stored in the metadata
    freq: str
    rolling_evaluations: int
    max_target_dim: Optional[int] = None
|
||||
|
||||
|
||||
# Base URL shared by all archives except the LFS-hosted wiki dataset.
root = "https://raw.githubusercontent.com/mbohlkeschneider/gluon-ts/mv_release/datasets/"

# Registry of the available datasets, keyed by recipe name.
# ``max_target_dim`` is left at its default (None) unless stated otherwise.
datasets_info = {
    "exchange_rate_nips": GPCopulaDataset(
        name="exchange_rate_nips",
        url=root + "exchange_rate_nips.tar.gz",
        num_series=8,
        prediction_length=30,
        freq="B",
        rolling_evaluations=5,
    ),
    # original dataset can be found at
    # https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
    "electricity_nips": GPCopulaDataset(
        name="electricity_nips",
        url=root + "electricity_nips.tar.gz",
        num_series=370,
        prediction_length=24,
        freq="H",
        rolling_evaluations=7,
    ),
    # note there are 963 in the original dataset from
    # https://archive.ics.uci.edu/ml/datasets/PEMS-SF
    "traffic_nips": GPCopulaDataset(
        name="traffic_nips",
        url=root + "traffic_nips.tar.gz",
        num_series=963,
        prediction_length=24,
        freq="H",
        rolling_evaluations=7,
    ),
    "solar_nips": GPCopulaDataset(
        name="solar-energy",
        url=root + "solar_nips.tar.gz",
        num_series=137,
        prediction_length=24,
        freq="H",
        rolling_evaluations=7,
    ),
    # That file lives on GitHub Large File Storage (LFS). The exact link is
    # required, otherwise only the LFS pointer file would be opened.
    "wiki-rolling_nips": GPCopulaDataset(
        name="wiki-rolling_nips",
        url="https://github.com/mbohlkeschneider/gluon-ts/raw/650ad5ffe92d20e89d491966b6d8b4459e219be8/datasets/wiki-rolling_nips.tar.gz",
        num_series=9535,
        prediction_length=30,
        freq="D",
        rolling_evaluations=5,
        max_target_dim=2000,
    ),
    "taxi_30min": GPCopulaDataset(
        name="taxi_30min",
        url=root + "taxi_30min.tar.gz",
        num_series=1214,
        prediction_length=24,
        freq="30min",
        rolling_evaluations=56,
    ),
}
|
||||
|
||||
|
||||
def generate_gp_copula_dataset(dataset_path: Path, dataset_name: str):
    """
    Download one of the registered GP-copula datasets and convert it in place.

    Parameters
    ----------
    dataset_path
        Directory that will hold the converted dataset; the archive itself
        is downloaded into (and removed from) its parent directory.
    dataset_name
        Key into ``datasets_info``.
    """
    ds_info = datasets_info[dataset_name]
    os.makedirs(dataset_path, exist_ok=True)

    download_dataset(dataset_path.parent, ds_info)
    save_metadata(dataset_path, ds_info)
    for split in ("train", "test"):
        save_dataset(dataset_path / split, ds_info)
    clean_up_dataset(dataset_path, ds_info)
|
||||
|
||||
|
||||
def download_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    """
    Download the archive described by ``ds_info`` and unpack it.

    Parameters
    ----------
    dataset_path
        Directory receiving both ``<name>.tar.gz`` and the extracted files.
    ds_info
        Dataset description providing ``url`` and ``name``.

    Raises
    ------
    ValueError
        If an archive member's name would place it outside ``dataset_path``.
    """
    archive_path = dataset_path / f"{ds_info.name}.tar.gz"
    request.urlretrieve(ds_info.url, archive_path)

    destination = os.path.realpath(dataset_path)
    with tarfile.open(archive_path) as tar:
        # The archive comes from the network: refuse member names that would
        # escape the destination directory (e.g. "../x" or absolute paths).
        # This is a name-based check only; link targets are not inspected.
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(destination, member.name))
            if os.path.commonpath([destination, target]) != destination:
                raise ValueError(
                    f"unsafe path in archive {archive_path}: {member.name}"
                )
        tar.extractall(path=dataset_path)
|
||||
|
||||
|
||||
def save_metadata(dataset_path: Path, ds_info: GPCopulaDataset):
    """
    Write ``metadata.json`` for the dataset into ``dataset_path``.

    The content is built by ``metadata`` from the number of series,
    frequency and prediction length in ``ds_info``.
    """
    with open(dataset_path / "metadata.json", "w") as f:
        payload = metadata(
            cardinality=ds_info.num_series,
            freq=ds_info.freq,
            prediction_length=ds_info.prediction_length,
        )
        f.write(json.dumps(payload))
|
||||
|
||||
|
||||
def save_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    """
    Re-write the JSON files of one split as a single ``data.json``.

    All entries matching ``dataset_path / "*.json"`` are loaded into memory,
    the directory is removed, and the entries are written back with a static
    categorical feature and an ``item_id``.
    """
    # materialize everything first: the source directory is deleted next
    entries = list(FileDataset(dataset_path / "*.json", freq=ds_info.freq))
    shutil.rmtree(dataset_path)

    records = []
    for index, data_entry in enumerate(entries):
        records.append(
            to_dict(
                target_values=data_entry[FieldName.TARGET],
                start=data_entry[FieldName.START],
                # Rolling-evaluation windows repeat the series, so the
                # category wraps around after `num_series` entries.
                cat=[index % ds_info.num_series],
                item_id=index,
            )
        )

    save_to_file(dataset_path / "data.json", records)
|
||||
|
||||
|
||||
def clean_up_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    """
    Remove the leftovers of the download step.

    Deletes ``<name>.tar.gz`` from the parent of ``dataset_path`` and the
    ``metadata`` directory inside ``dataset_path``.
    """
    archive_path = dataset_path.parent / f"{ds_info.name}.tar.gz"
    os.remove(archive_path)
    shutil.rmtree(dataset_path / "metadata")
|
||||
@@ -1,197 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
"""
|
||||
Here we reuse the datasets used by LSTNet, as processed versions of the datasets
|
||||
are available on GitHub.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, NamedTuple, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pts.dataset import frequency_add
|
||||
from ._util import save_to_file, to_dict, metadata
|
||||
|
||||
|
||||
def load_from_pandas(
    df: pd.DataFrame, time_index: pd.DatetimeIndex, agg_freq: Optional[str] = None,
) -> List[pd.Series]:
    """
    Split a wide data frame into one time series per column.

    Parameters
    ----------
    df
        Data frame with one row per time step and one column per series.
    time_index
        Index assigned to the rows of ``df``.
    agg_freq
        If given, each series is resampled to this frequency and summed.

    Returns
    -------
    List[pd.Series]
        One series per column of ``df``, trimmed to the span between its
        first and last non-null value.

    Raises
    ------
    IndexError
        If a series contains no non-null value.
    """
    df = df.set_index(time_index)

    timeseries = []
    # after transposing, every row holds the values of one series
    for _, row in df.transpose().iterrows():
        ts = pd.Series(row.values, index=time_index)
        if agg_freq is not None:
            ts = ts.resample(agg_freq).sum()

        # compute the non-null positions once and trim leading/trailing gaps
        valid_index = ts[ts.notnull()].index
        ts = ts[valid_index[0] : valid_index[-1]]

        timeseries.append(ts)

    return timeseries
|
||||
|
||||
|
||||
class LstnetDataset(NamedTuple):
    """Static description of one dataset from the LSTNet data repository."""

    name: str
    # location of the gzip-compressed text file
    url: str
    # expected number of columns in the downloaded file
    num_series: int
    # expected number of rows in the downloaded file
    num_time_steps: int
    prediction_length: int
    # number of rolling test windows generated after the training range
    rolling_evaluations: int
    # frequency of the raw data, used to build the time index
    freq: str
    start_date: str
    # optional frequency the raw series are resampled to
    agg_freq: Optional[str] = None
|
||||
|
||||
|
||||
# Base URL of the repository hosting the processed LSTNet data files.
root = "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/"

# Registry of the available datasets, keyed by recipe name.
# ``agg_freq`` is left at its default (None) unless stated otherwise.
datasets_info = {
    "exchange_rate": LstnetDataset(
        name="exchange_rate",
        url=root + "exchange_rate/exchange_rate.txt.gz",
        num_series=8,
        num_time_steps=7588,
        prediction_length=30,
        rolling_evaluations=5,
        start_date="1990-01-01",
        freq="1B",
    ),
    # The original dataset can be found at
    # https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
    # The aggregated version used by LSTNet drops, from the initial 370
    # series, the ones with no data in 2011.
    "electricity": LstnetDataset(
        name="electricity",
        url=root + "electricity/electricity.txt.gz",
        num_series=321,
        num_time_steps=26304,
        prediction_length=24,
        rolling_evaluations=7,
        start_date="2012-01-01",
        freq="1H",
    ),
    # note there are 963 in the original dataset from
    # https://archive.ics.uci.edu/ml/datasets/PEMS-SF but only 862 in LSTNet
    "traffic": LstnetDataset(
        name="traffic",
        url=root + "traffic/traffic.txt.gz",
        num_series=862,
        num_time_steps=17544,
        prediction_length=24,
        rolling_evaluations=7,
        start_date="2015-01-01",
        freq="H",
    ),
    "solar-energy": LstnetDataset(
        name="solar-energy",
        url=root + "solar-energy/solar_AL.txt.gz",
        num_series=137,
        num_time_steps=52560,
        prediction_length=24,
        rolling_evaluations=7,
        start_date="2006-01-01",
        freq="10min",
        agg_freq="1H",
    ),
}
|
||||
|
||||
|
||||
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    """
    Download an LSTNet dataset and write metadata, train and test files.

    The first 80% of the time index is used for training. The test file
    contains ``rolling_evaluations`` windows per series, each extending the
    previous one by ``prediction_length`` steps.

    Parameters
    ----------
    dataset_path
        Directory receiving ``metadata.json``, ``train/data.json`` and
        ``test/data.json``.
    dataset_name
        Key into ``datasets_info``.
    """
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=ds_info.num_series,
                    freq=ds_info.freq,
                    prediction_length=ds_info.prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date, freq=ds_info.freq, periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), f"expected num_time_steps/num_series {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(train_ts) == ds_info.num_series

    save_to_file(train_file, train_ts)

    # time of the first prediction of every rolling window
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        # the window end is the same for every series, so compute it once
        prediction_end_date = frequency_add(
            prediction_start_date, ds_info.prediction_length
        )
        for cat, ts in enumerate(timeseries):
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    # keep test entries identifiable, like the train entries
                    item_id=cat,
                )
            )

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(test_file, test_ts)
|
||||
@@ -1,85 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from ._util import save_to_file, to_dict, metadata
|
||||
|
||||
|
||||
def generate_m4_dataset(
    dataset_path: Path, m4_freq: str, pandas_freq: str, prediction_length: int
):
    """
    Download one frequency of the M4 competition data and convert it.

    Parameters
    ----------
    dataset_path
        Directory receiving ``metadata.json``, ``train/data.json`` and
        ``test/data.json``.
    m4_freq
        M4 frequency name used in the file names, e.g. ``"Hourly"``.
    pandas_freq
        Frequency string stored in the metadata.
    prediction_length
        Prediction length stored in the metadata.
    """
    m4_dataset_url = "https://github.com/M4Competition/M4-methods/raw/master/Dataset"
    train_df = pd.read_csv(f"{m4_dataset_url}/Train/{m4_freq}-train.csv", index_col=0)
    test_df = pd.read_csv(f"{m4_dataset_url}/Test/{m4_freq}-test.csv", index_col=0)

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        meta = metadata(
            cardinality=len(train_df),
            freq=pandas_freq,
            prediction_length=prediction_length,
        )
        f.write(json.dumps(meta))

    # rows are padded with NaN up to the longest series; strip the padding
    train_targets = [row[~np.isnan(row)] for row in train_df.values]

    # a test series is the training series followed by the test values
    test_targets = [
        np.hstack([history, future])
        for history, future in zip(train_targets, test_df.values)
    ]

    if m4_freq == "Yearly":
        # Some time series span more than 300 years, which cannot be
        # represented in pandas (probably series misclassified as Yearly).
        # Only the last 300 values are kept; test time is unaffected since
        # the prediction length is below 300 years.
        train_targets = [values[-300:] for values in train_targets]
        test_targets = [values[-300:] for values in test_targets]

    # The original data has no time stamps, so every series gets the same
    # mock start date: the earliest point available in pandas.
    mock_start_dataset = "1750-01-01 00:00:00"

    splits = (
        (dataset_path / "train" / "data.json", train_targets),
        (dataset_path / "test" / "data.json", test_targets),
    )
    for split_file, targets in splits:
        records = [
            to_dict(
                target_values=values,
                start=mock_start_dataset,
                cat=[index],
                item_id=index,
            )
            for index, values in enumerate(targets)
        ]
        save_to_file(split_file, records)
|
||||
|
||||
@@ -6,12 +6,13 @@ from functools import lru_cache
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pts.dataset import FieldName
|
||||
from pts.feature import CustomDateFeatureSet, squared_exponential_kernel
|
||||
from ._util import metadata, save_to_file
|
||||
from gluonts.dataset.field_names import FieldName
|
||||
from gluonts.dataset.repository._util import metadata, save_to_file
|
||||
from gluonts.time_feature.holiday import squared_exponential_kernel
|
||||
from pts.feature import CustomDateFeatureSet
|
||||
|
||||
|
||||
def generate_m5_dataset(
|
||||
def generate_pts_m5_dataset(
|
||||
dataset_path: Path,
|
||||
pandas_freq: str,
|
||||
prediction_length: int = 28,
|
||||
@@ -46,7 +47,7 @@ def generate_m5_dataset(
|
||||
)
|
||||
sales_train_evaluation.sort_index(inplace=True)
|
||||
|
||||
sell_prices = pd.read_csv(sell_prices_path, index_col=['item_id', 'store_id'])
|
||||
sell_prices = pd.read_csv(sell_prices_path, index_col=["item_id", "store_id"])
|
||||
sell_prices.sort_index(inplace=True)
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
@@ -161,16 +162,22 @@ def generate_m5_dataset(
|
||||
"WI": snap_WI_feature,
|
||||
}[state_id]
|
||||
|
||||
time_series["target"] = item.iloc[start_index:1913].values.astype(np.float32).tolist()
|
||||
time_series["feat_dynamic_real"] = np.concatenate(
|
||||
(
|
||||
np.expand_dims(sell_price.iloc[start_index:1913].values, 0),
|
||||
event_1_feature[:, start_index:1913],
|
||||
event_2_feature[:, start_index:1913],
|
||||
snap_feature[:, start_index:1913],
|
||||
),
|
||||
0,
|
||||
).astype(np.float32).tolist()
|
||||
time_series["target"] = (
|
||||
item.iloc[start_index:1913].values.astype(np.float32).tolist()
|
||||
)
|
||||
time_series["feat_dynamic_real"] = (
|
||||
np.concatenate(
|
||||
(
|
||||
np.expand_dims(sell_price.iloc[start_index:1913].values, 0),
|
||||
event_1_feature[:, start_index:1913],
|
||||
event_2_feature[:, start_index:1913],
|
||||
snap_feature[:, start_index:1913],
|
||||
),
|
||||
0,
|
||||
)
|
||||
.astype(np.float32)
|
||||
.tolist()
|
||||
)
|
||||
|
||||
train_ds.append(time_series.copy())
|
||||
|
||||
@@ -222,16 +229,22 @@ def generate_m5_dataset(
|
||||
"WI": snap_WI_feature,
|
||||
}[state_id]
|
||||
|
||||
time_series["target"] = item.iloc[start_index:1941].values.astype(np.float32).tolist()
|
||||
time_series["feat_dynamic_real"] = np.concatenate(
|
||||
(
|
||||
np.expand_dims(sell_price.iloc[start_index:1941].values, 0),
|
||||
event_1_feature[:, start_index:1941],
|
||||
event_2_feature[:, start_index:1941],
|
||||
snap_feature[:, start_index:1941],
|
||||
),
|
||||
0,
|
||||
).astype(np.float32).tolist()
|
||||
time_series["target"] = (
|
||||
item.iloc[start_index:1941].values.astype(np.float32).tolist()
|
||||
)
|
||||
time_series["feat_dynamic_real"] = (
|
||||
np.concatenate(
|
||||
(
|
||||
np.expand_dims(sell_price.iloc[start_index:1941].values, 0),
|
||||
event_1_feature[:, start_index:1941],
|
||||
event_2_feature[:, start_index:1941],
|
||||
snap_feature[:, start_index:1941],
|
||||
),
|
||||
0,
|
||||
)
|
||||
.astype(np.float32)
|
||||
.tolist()
|
||||
)
|
||||
|
||||
test_ds.append(time_series.copy())
|
||||
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def to_dict(
    target_values: np.ndarray,
    start: str,
    cat: Optional[List[int]] = None,
    item_id: Optional[Any] = None,
):
    """
    Build the JSON-serializable dictionary for one time series.

    Parameters
    ----------
    target_values
        Values of the series; NaN entries are written as the string "NaN",
        all others are rounded to six decimals.
    start
        Start of the series, stored as ``str(start)``.
    cat
        Optional static categorical features (key ``feat_static_cat``).
    item_id
        Optional identifier (key ``item_id``).

    Returns
    -------
    dict
        Dictionary with ``start``, ``target`` and the optional fields.
    """

    def serialize(value):
        if np.isnan(value):
            return "NaN"
        # round-trip through a fixed six-decimal representation
        return float(f"{float(value):.6f}")

    record = {
        "start": str(start),
        "target": list(map(serialize, target_values)),
    }

    optional_fields = (("feat_static_cat", cat), ("item_id", item_id))
    for key, value in optional_fields:
        if value is not None:
            record[key] = value

    return record
|
||||
|
||||
|
||||
def save_to_file(path: Path, data: List[Dict]):
    """
    Write ``data`` to ``path`` as JSON lines (one UTF-8 JSON object per line).

    The parent directory of ``path`` is created if it does not exist.
    """
    print(f"saving time-series into {path}")
    os.makedirs(os.path.dirname(path), exist_ok=True)

    line_end = "\n".encode("utf-8")
    # binary mode keeps the line terminator a plain "\n" on every platform
    with open(path, "wb") as fp:
        for record in data:
            fp.write(json.dumps(record).encode("utf-8"))
            fp.write(line_end)
|
||||
|
||||
|
||||
def get_download_path() -> Path:
    """
    Return the default directory for downloaded datasets.

    Returns
    -------
    Path
        ``~/.pytorch/pytorch-ts``, e.g. /home/username/.pytorch/pytorch-ts/
    """
    return Path.home().joinpath(".pytorch", "pytorch-ts")
|
||||
|
||||
|
||||
def metadata(cardinality: int, freq: str, prediction_length: int):
    """
    Build the metadata dictionary written to ``metadata.json``.

    Parameters
    ----------
    cardinality
        Cardinality of the single static categorical feature; stored as a
        string.
    freq
        Frequency string of the dataset.
    prediction_length
        Prediction length of the dataset.
    """
    static_cat = {"name": "feat_static_cat", "cardinality": str(cardinality)}
    return {
        "freq": freq,
        "prediction_length": prediction_length,
        "feat_static_cat": [static_cat],
    }
|
||||
@@ -1,183 +1,9 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
import logging
|
||||
from collections import OrderedDict
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
from pts.dataset import ConstantDataset, TrainDatasets, load_datasets
|
||||
from ._artificial import generate_artificial_dataset
|
||||
from ._gp_copula_2019 import generate_gp_copula_dataset
|
||||
from ._lstnet import generate_lstnet_dataset
|
||||
from ._m4 import generate_m4_dataset
|
||||
from ._m5 import generate_m5_dataset
|
||||
from ._util import get_download_path
|
||||
from gluonts.dataset.repository.datasets import dataset_recipes
|
||||
|
||||
m4_freq = "Hourly"
|
||||
pandas_freq = "H"
|
||||
dataset_path = Path(f"m4-{m4_freq}")
|
||||
prediction_length = 48
|
||||
from ._m5 import generate_pts_m5_dataset
|
||||
|
||||
dataset_recipes = OrderedDict(
|
||||
{
|
||||
# each recipe generates a dataset given a path
|
||||
"constant": partial(generate_artificial_dataset, dataset=ConstantDataset()),
|
||||
"exchange_rate": partial(generate_lstnet_dataset, dataset_name="exchange_rate"),
|
||||
"solar-energy": partial(generate_lstnet_dataset, dataset_name="solar-energy"),
|
||||
"electricity": partial(generate_lstnet_dataset, dataset_name="electricity"),
|
||||
"traffic": partial(generate_lstnet_dataset, dataset_name="traffic"),
|
||||
"exchange_rate_nips": partial(
|
||||
generate_gp_copula_dataset, dataset_name="exchange_rate_nips"
|
||||
),
|
||||
"electricity_nips": partial(
|
||||
generate_gp_copula_dataset, dataset_name="electricity_nips"
|
||||
),
|
||||
"traffic_nips": partial(
|
||||
generate_gp_copula_dataset, dataset_name="traffic_nips"
|
||||
),
|
||||
"solar_nips": partial(generate_gp_copula_dataset, dataset_name="solar_nips"),
|
||||
"wiki-rolling_nips": partial(
|
||||
generate_gp_copula_dataset, dataset_name="wiki-rolling_nips"
|
||||
),
|
||||
"taxi_30min": partial(generate_gp_copula_dataset, dataset_name="taxi_30min"),
|
||||
"m4_hourly": partial(
|
||||
generate_m4_dataset,
|
||||
m4_freq="Hourly",
|
||||
pandas_freq="H",
|
||||
prediction_length=48,
|
||||
),
|
||||
"m4_daily": partial(
|
||||
generate_m4_dataset, m4_freq="Daily", pandas_freq="D", prediction_length=14,
|
||||
),
|
||||
"m4_weekly": partial(
|
||||
generate_m4_dataset,
|
||||
m4_freq="Weekly",
|
||||
pandas_freq="W",
|
||||
prediction_length=13,
|
||||
),
|
||||
"m4_monthly": partial(
|
||||
generate_m4_dataset,
|
||||
m4_freq="Monthly",
|
||||
pandas_freq="M",
|
||||
prediction_length=18,
|
||||
),
|
||||
"m4_quarterly": partial(
|
||||
generate_m4_dataset,
|
||||
m4_freq="Quarterly",
|
||||
pandas_freq="3M",
|
||||
prediction_length=8,
|
||||
),
|
||||
"m4_yearly": partial(
|
||||
generate_m4_dataset,
|
||||
m4_freq="Yearly",
|
||||
pandas_freq="12M",
|
||||
prediction_length=6,
|
||||
),
|
||||
"m5": partial(
|
||||
generate_m5_dataset, pandas_freq="D", prediction_length=28, alpha=0.5
|
||||
),
|
||||
}
|
||||
dataset_recipes["pts_m5"] = partial(
|
||||
generate_pts_m5_dataset, pandas_freq="D", prediction_length=28
|
||||
)
|
||||
|
||||
dataset_names = list(dataset_recipes.keys())
|
||||
|
||||
default_dataset_path = get_download_path() / "datasets"
|
||||
|
||||
|
||||
def materialize_dataset(
    dataset_name: str, path: Path = default_dataset_path, regenerate: bool = False,
) -> Path:
    """
    Make sure the dataset exists on disk under ``path / dataset_name``.

    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to rebuild the dataset even if it is already present
        locally. When False and the dataset directory exists, nothing is
        downloaded.
    path
        directory below which the dataset is stored

    Returns
    -------
    the path where the dataset is materialized
    """
    assert dataset_name in dataset_recipes.keys(), (
        f"{dataset_name} is not present, please choose one from "
        f"{dataset_recipes.keys()}."
    )

    path.mkdir(parents=True, exist_ok=True)
    dataset_path = path / dataset_name
    dataset_recipe = dataset_recipes[dataset_name]

    if regenerate or not dataset_path.exists():
        logging.info(f"downloading and processing {dataset_name}")
        dataset_recipe(dataset_path=dataset_path)
    else:
        logging.info(f"using dataset already processed in path {dataset_path}.")

    return dataset_path
|
||||
|
||||
|
||||
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
    shuffle: bool = True,
) -> TrainDatasets:
    """
    Load a repository dataset, generating it first if necessary.

    The datasets that can be obtained through this function have been used
    with different processing over time by several papers (e.g., [SFG17]_,
    [LCY+18]_, and [YRD15]_).

    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to rebuild the dataset even if it is already present
        locally. When False and the dataset directory exists, nothing is
        downloaded.
    path
        directory below which the dataset is stored
    shuffle
        whether to shuffle the training time series

    Returns
    -------
    dataset obtained by either downloading or reloading from local file.
    """
    root_dir = materialize_dataset(dataset_name, path, regenerate)

    metadata_file = root_dir / "metadata.json"
    train_glob = root_dir / "train" / "*.json"
    test_glob = root_dir / "test" / "*.json"

    return load_datasets(
        metadata=metadata_file, train=train_glob, test=test_glob, shuffle=shuffle,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script mode: regenerate every registered dataset and print its
    # metadata and number of training series.
    for dataset in dataset_names:
        print(f"generate {dataset}")
        ds = get_dataset(dataset, regenerate=True)
        print(ds.metadata)
        num_train_series = len(list(ds.train))
        print(num_train_series)
|
||||
|
||||
@@ -1,357 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from typing import Any, List, NamedTuple, Optional, Set
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from pts.exception import assert_pts
|
||||
from .common import FieldName
|
||||
|
||||
|
||||
class ScaleHistogram:
    """
    Histogram of time series scales on a logarithmic grid.

    The scale of a series is the mean of its absolute values. A series is
    placed in bucket ``int(log(scale + 1, base))``, i.e. bucket ``i`` covers
    scales from ``base ** i - 1`` up to ``base ** (i + 1) - 1``.
    Series with an empty target are not bucketed; they are tallied
    separately in ``empty_target_count``.

    Parameters
    ----------
    base
        Base of the logarithm that defines the bucket boundaries.
    bin_counts
        Optional initial mapping from bucket index to number of series.
    empty_target_count
        Initial number of series whose target is empty.
    """

    def __init__(
        self,
        base: float = 2.0,
        bin_counts: Optional[dict] = None,
        empty_target_count: int = 0,
    ) -> None:
        initial_counts = bin_counts if bin_counts is not None else {}
        self._base = base
        self.bin_counts = defaultdict(int, initial_counts)
        self.empty_target_count = empty_target_count
        # Snapshot of the constructor arguments; note that ``bin_counts``
        # refers to the live counter, not to a copy.
        self.__init_args__ = {
            "base": self._base,
            "bin_counts": self.bin_counts,
            "empty_target_count": empty_target_count,
        }

    def bucket_index(self, target_values):
        """Return the bucket index of a non-empty array of target values."""
        assert len(target_values) > 0
        mean_abs = np.mean(np.abs(target_values))
        return int(math.log(mean_abs + 1.0, self._base))

    def add(self, target_values):
        """Register one series, given its target values."""
        if len(target_values) == 0:
            self.empty_target_count += 1
            return
        self.bin_counts[self.bucket_index(target_values)] += 1

    def count(self, target):
        """Return how many registered series share the bucket of ``target``."""
        if len(target) == 0:
            return self.empty_target_count
        # Indexing (rather than ``.get``) is deliberate: like the rest of the
        # class it goes through the defaultdict, which records unseen
        # buckets with a count of zero.
        return self.bin_counts[self.bucket_index(target)]

    def __len__(self):
        return sum(self.bin_counts.values()) + self.empty_target_count

    def __eq__(self, other):
        if not isinstance(other, ScaleHistogram):
            return False
        return (
            self.bin_counts == other.bin_counts
            and self.empty_target_count == other.empty_target_count
            and self._base == other._base
        )

    def __str__(self):
        lines = []
        for bucket in sorted(self.bin_counts):
            lower = self._base ** bucket - 1
            upper = self._base ** (bucket + 1) - 1
            lines.append(
                "count of scales in {min}-{max}:{count}".format(
                    min=lower, max=upper, count=self.bin_counts[bucket]
                )
            )
        return "\n".join(lines)
|
||||
|
||||
|
||||
class DatasetStatistics(NamedTuple):
    """
    A NamedTuple to store the statistics of a Dataset.

    Equality is tolerant for float fields: two float values are considered
    equal when they differ by at most 0.01% of the left-hand value. All
    other fields are compared exactly.
    """

    # True when every observed target value is a non-negative whole number.
    integer_dataset: bool
    # Largest / mean absolute / mean / smallest observed target value.
    max_target: float
    mean_abs_target: float
    mean_target: float
    # Average number of observed (non-NaN) target values per time series.
    mean_target_length: float
    min_target: float
    # Per static feature position: the set of values seen across the dataset.
    feat_static_real: List[Set[float]]
    feat_static_cat: List[Set[int]]
    # Number of dynamic features per time series.
    num_feat_dynamic_real: Optional[int]
    num_feat_dynamic_cat: Optional[int]
    # Number of NaN target values / number of observed target values.
    num_missing_values: int
    num_time_observations: int
    num_time_series: int
    scale_histogram: ScaleHistogram

    # DO NOT override the __str__ method, since we rely that we can load
    # DatasetStatistics again; i.e. stats == eval(str(stats))

    def __eq__(self, other):
        """
        Compare two statistics field by field, with a relative tolerance of
        1e-4 on float fields.

        Returns ``NotImplemented`` for objects that are not
        ``DatasetStatistics`` so that comparisons such as ``stats == None``
        evaluate to ``False`` instead of raising ``AttributeError``.
        """
        if not isinstance(other, DatasetStatistics):
            return NotImplemented
        for x, y in zip(self, other):
            if isinstance(x, float):
                if abs(x - y) > abs(0.0001 * x):
                    return False
            elif x != y:
                return False
        return True
|
||||
|
||||
|
||||
# TODO: reorganize modules to avoid circular dependency
|
||||
# TODO: and substitute Any with Dataset
|
||||
def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
    """
    Computes the statistics of a given Dataset.

    The dataset is traversed once. For every entry the target is inspected
    (NaN values count as missing, everything else as observed) and the
    static / dynamic feature fields are checked for consistency across
    entries.

    Parameters
    ----------
    ts_dataset
        Dataset of which to compute the statistics. It must support
        ``len()`` (used for the progress bar) and iteration; each entry is
        indexed with the ``FieldName`` keys.

    Returns
    -------
    DatasetStatistics
        NamedTuple containing the statistics.

    Notes
    -----
    Validation goes through ``assert_pts``; presumably it raises when its
    first argument is falsy and formats the message with the remaining
    arguments -- confirm against ``pts.exception``.
    """
    num_time_observations = 0
    num_time_series = 0
    # Sentinels, overwritten by the first series with an observed value.
    min_target = 1e20
    max_target = -1e20
    sum_target = 0.0
    sum_abs_target = 0.0
    integer_dataset = True
    # These stay None until the first entry fixes the expected sizes.
    observed_feat_static_cat: Optional[List[Set[int]]] = None
    observed_feat_static_real: Optional[List[Set[float]]] = None
    num_feat_static_real: Optional[int] = None
    num_feat_static_cat: Optional[int] = None
    num_feat_dynamic_real: Optional[int] = None
    num_feat_dynamic_cat: Optional[int] = None
    num_missing_values = 0

    scale_histogram = ScaleHistogram()

    with tqdm(enumerate(ts_dataset, start=1), total=len(ts_dataset)) as it:
        # enumerate starts at 1, so after the loop num_time_series holds the
        # number of entries seen (it stays 0 for an empty dataset).
        for num_time_series, ts in it:

            # TARGET
            target = ts[FieldName.TARGET]
            observed_target = target[~np.isnan(target)]
            num_observations = len(observed_target)

            if num_observations > 0:
                # 'nan' is handled in observed_target definition
                assert_pts(
                    np.all(np.isfinite(observed_target)),
                    "Target values have to be finite (e.g., not inf, -inf, "
                    "or None) and cannot exceed single precision floating "
                    "point range.",
                )

                num_time_observations += num_observations
                min_target = float(min(min_target, observed_target.min()))
                max_target = float(max(max_target, observed_target.max()))
                # NOTE: missing values are only counted for series that have
                # at least one observed value.
                num_missing_values += int(np.isnan(target).sum())
                sum_target += float(observed_target.sum())
                sum_abs_target += float(np.abs(observed_target).sum())
                # Whole-number check; the sign is checked after the loop.
                integer_dataset = integer_dataset and bool(
                    np.all(np.mod(observed_target, 1) == 0)
                )

            # Every series is added, including those without observations,
            # so the histogram length can be compared to num_time_series.
            scale_histogram.add(observed_target)  # after checks for inf and None

            # FEAT_STATIC_CAT
            feat_static_cat = (
                ts[FieldName.FEAT_STATIC_CAT] if FieldName.FEAT_STATIC_CAT in ts else []
            )

            if num_feat_static_cat is None:
                # The first entry defines the expected number of features.
                num_feat_static_cat = len(feat_static_cat)
                observed_feat_static_cat = [set() for _ in range(num_feat_static_cat)]

            # needed to type check
            assert num_feat_static_cat is not None
            assert observed_feat_static_cat is not None

            assert_pts(
                num_feat_static_cat == len(feat_static_cat),
                "Not all feat_static_cat vectors have the same length {} != {}.",
                num_feat_static_cat,
                len(feat_static_cat),
            )
            # Collect the distinct values seen at each feature position.
            for i, c in enumerate(feat_static_cat):
                observed_feat_static_cat[i].add(c)

            # FEAT_STATIC_REAL
            feat_static_real = (
                ts[FieldName.FEAT_STATIC_REAL]
                if FieldName.FEAT_STATIC_REAL in ts
                else []
            )

            if num_feat_static_real is None:
                # The first entry defines the expected number of features.
                num_feat_static_real = len(feat_static_real)
                observed_feat_static_real = [set() for _ in range(num_feat_static_real)]

            # needed to type check
            assert num_feat_static_real is not None
            assert observed_feat_static_real is not None

            assert_pts(
                num_feat_static_real == len(feat_static_real),
                "Not all feat_static_real vectors have the same length {} != {}.",
                num_feat_static_real,
                len(feat_static_real),
            )
            # Collect the distinct values seen at each feature position.
            for i, c in enumerate(feat_static_real):
                observed_feat_static_real[i].add(c)

            # FEAT_DYNAMIC_CAT
            feat_dynamic_cat = (
                ts[FieldName.FEAT_DYNAMIC_CAT]
                if FieldName.FEAT_DYNAMIC_CAT in ts
                else None
            )

            if feat_dynamic_cat is None:
                # feat_dynamic_cat not found: this is only consistent if it is
                # the first ts we encounter or if no earlier ts carried
                # feat_dynamic_cat either
                assert_pts(
                    num_feat_dynamic_cat is None or num_feat_dynamic_cat == 0,
                    "feat_dynamic_cat was found for some instances but not others.",
                )
                num_feat_dynamic_cat = 0
            else:
                if num_feat_dynamic_cat is None:
                    # first num_feat_dynamic_cat found
                    num_feat_dynamic_cat = feat_dynamic_cat.shape[0]
                else:
                    assert_pts(
                        num_feat_dynamic_cat == feat_dynamic_cat.shape[0],
                        "Found instances with different number of features in "
                        "feat_dynamic_cat, found one with {} and another with {}.",
                        num_feat_dynamic_cat,
                        feat_dynamic_cat.shape[0],
                    )

                assert_pts(
                    np.all(np.isfinite(feat_dynamic_cat)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )
                # Axis 0 is used as the feature count above, axis 1 as the
                # number of time steps.
                num_feat_dynamic_cat_time_steps = feat_dynamic_cat.shape[1]
                assert_pts(
                    num_feat_dynamic_cat_time_steps == len(target),
                    "Each feature in feat_dynamic_cat has to have the same length as "
                    "the target. Found an instance with feat_dynamic_cat of length {} "
                    "and a target of length {}.",
                    num_feat_dynamic_cat_time_steps,
                    len(target),
                )

            # FEAT_DYNAMIC_REAL
            feat_dynamic_real = (
                ts[FieldName.FEAT_DYNAMIC_REAL]
                if FieldName.FEAT_DYNAMIC_REAL in ts
                else None
            )

            if feat_dynamic_real is None:
                # feat_dynamic_real not found: this is only consistent if it is
                # the first ts we encounter or if no earlier ts carried
                # feat_dynamic_real either
                assert_pts(
                    num_feat_dynamic_real is None or num_feat_dynamic_real == 0,
                    "feat_dynamic_real was found for some instances but not others.",
                )
                num_feat_dynamic_real = 0
            else:
                if num_feat_dynamic_real is None:
                    # first num_feat_dynamic_real found
                    num_feat_dynamic_real = feat_dynamic_real.shape[0]
                else:
                    assert_pts(
                        num_feat_dynamic_real == feat_dynamic_real.shape[0],
                        "Found instances with different number of features in "
                        "feat_dynamic_real, found one with {} and another with {}.",
                        num_feat_dynamic_real,
                        feat_dynamic_real.shape[0],
                    )

                assert_pts(
                    np.all(np.isfinite(feat_dynamic_real)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )
                # Axis 0 is used as the feature count above, axis 1 as the
                # number of time steps.
                num_feat_dynamic_real_time_steps = feat_dynamic_real.shape[1]
                assert_pts(
                    num_feat_dynamic_real_time_steps == len(target),
                    "Each feature in feat_dynamic_real has to have the same length as "
                    "the target. Found an instance with feat_dynamic_real of length {} "
                    "and a target of length {}.",
                    num_feat_dynamic_real_time_steps,
                    len(target),
                )

    assert_pts(num_time_series > 0, "Time series dataset is empty!")
    assert_pts(
        num_time_observations > 0, "Only empty time series found in the dataset!",
    )

    # note this require the above assumption to avoid a division by zero
    # runtime error
    mean_target_length = num_time_observations / num_time_series

    # note this require the above assumption to avoid a division by zero
    # runtime error
    mean_target = sum_target / num_time_observations
    mean_abs_target = sum_abs_target / num_time_observations

    # A dataset only counts as integer when it is also non-negative.
    integer_dataset = integer_dataset and min_target >= 0.0

    # Sanity check: every series was added to the histogram exactly once.
    assert len(scale_histogram) == num_time_series

    return DatasetStatistics(
        integer_dataset=integer_dataset,
        max_target=max_target,
        mean_abs_target=mean_abs_target,
        mean_target=mean_target,
        mean_target_length=mean_target_length,
        min_target=min_target,
        num_missing_values=num_missing_values,
        feat_static_real=observed_feat_static_real if observed_feat_static_real else [],
        feat_static_cat=observed_feat_static_cat if observed_feat_static_cat else [],
        num_feat_dynamic_real=num_feat_dynamic_real,
        num_feat_dynamic_cat=num_feat_dynamic_cat,
        num_time_observations=num_time_observations,
        num_time_series=num_time_series,
        scale_histogram=scale_histogram,
    )
|
||||
@@ -1,47 +0,0 @@
|
||||
import itertools
|
||||
from typing import Dict, Iterable, Iterator, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from pts.transform.transform import Transformation
|
||||
from .common import DataEntry, Dataset
|
||||
|
||||
|
||||
class TransformedIterableDataset(torch.utils.data.IterableDataset):
    """
    Endless stream of transformed entries, usable as a torch
    ``IterableDataset``.

    The wrapped dataset is cycled forever, passed through ``transform`` and
    every resulting entry is reduced to its numpy-array fields, with
    floating point arrays cast to ``float32``.

    Parameters
    ----------
    dataset
        Non-empty collection of data entries to cycle over.
    is_train
        Forwarded to ``transform`` as the ``is_train`` keyword argument.
    transform
        Callable applied to the endless entry iterator; it must return an
        iterator of dict-like entries.
    """

    def __init__(
        self, dataset: Dataset, is_train: bool, transform: Transformation
    ) -> None:
        super().__init__()
        self.dataset = dataset
        self.transform = transform
        self.is_train = is_train
        # Created lazily on first iteration and shared by later iterations,
        # so a new ``iter()`` call resumes the stream instead of restarting.
        self._cur_iter: Optional[Iterator] = None

    def _iterate_forever(self, collection: Iterable[DataEntry]) -> Iterator[DataEntry]:
        """
        Yield the entries of ``collection`` over and over again.

        Raises
        ------
        Exception
            If a pass over ``collection`` yields no entry at all.
        """
        while True:
            # Use ONE iterator per pass. Probing with a throw-away
            # ``iter(collection)`` and then iterating ``collection`` again
            # would emit the first entry twice for re-iterable collections.
            entries = iter(collection)
            try:
                first = next(entries)
            except StopIteration:
                raise Exception("empty dataset")
            yield first
            yield from entries

    def __iter__(self) -> Iterator[Dict[str, np.ndarray]]:
        if self._cur_iter is None:
            self._cur_iter = self.transform(
                self._iterate_forever(self.dataset), is_train=self.is_train
            )
        assert self._cur_iter is not None
        # A ``for`` loop ends cleanly should the transformed stream ever be
        # exhausted; a bare ``next()`` inside a generator would surface as a
        # RuntimeError instead.
        for data_entry in self._cur_iter:
            yield {
                k: (v.astype(np.float32) if v.dtype.kind == "f" else v)
                for k, v in data_entry.items()
                if isinstance(v, np.ndarray)
            }
|
||||
|
||||
# def __len__(self) -> int:
|
||||
# return len(self.dataset)
|
||||
@@ -1,148 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import rapidjson as json
|
||||
|
||||
from .common import TrainDatasets, MetaData
|
||||
from .file_dataset import FileDataset
|
||||
|
||||
|
||||
def frequency_add(ts: pd.Timestamp, amount: int) -> pd.Timestamp:
    """Return ``ts`` shifted by ``amount`` multiples of its own ``freq``."""
    shift = ts.freq * amount
    return ts + shift
|
||||
|
||||
|
||||
def forecast_start(entry):
    """
    Return the timestamp right after the last target value of ``entry``,
    i.e. its "start" advanced by one frequency step per target value.
    """
    series_start = entry["start"]
    num_steps = len(entry["target"])
    return frequency_add(series_start, num_steps)
|
||||
|
||||
|
||||
def to_pandas(instance: dict, freq: str = None) -> pd.Series:
    """
    Build a pandas.Series from the "start" and "target" fields of a
    data entry.

    Parameters
    ----------
    instance
        Dictionary containing the time series data.
    freq
        Frequency of the resulting index. When empty or omitted, the
        ``freqstr`` of the entry's "start" value is used instead.

    Returns
    -------
    pandas.Series
        The target values indexed by a date range beginning at "start".
    """
    values = instance["target"]
    first_timestamp = instance["start"]
    # Only fall back to the timestamp's own frequency when none was given.
    index_freq = freq or first_timestamp.freqstr
    time_index = pd.date_range(
        start=first_timestamp, periods=len(values), freq=index_freq
    )
    return pd.Series(values, index=time_index)
|
||||
|
||||
|
||||
def load_datasets(metadata, train, test, shuffle: bool = False) -> TrainDatasets:
    """
    Assemble a TrainDatasets object from a metadata file and the train /
    test file locations.

    Parameters
    ----------
    metadata
        Path to the metadata file
    train
        Path to the training dataset files.
    test
        Path to the test dataset files. A falsy value means that there is
        no test dataset.
    shuffle
        Passed on to the training dataset only; the test dataset is built
        without it.

    Returns
    -------
    TrainDatasets
        An object collecting metadata, training data, test data.
    """
    parsed_meta = MetaData.parse_file(metadata)
    training_data = FileDataset(train, parsed_meta.freq, shuffle=shuffle)

    if test:
        testing_data = FileDataset(test, parsed_meta.freq)
    else:
        testing_data = None

    return TrainDatasets(
        metadata=parsed_meta, train=training_data, test=testing_data
    )
|
||||
|
||||
|
||||
def save_datasets(dataset: TrainDatasets, path_str: str, overwrite=True) -> None:
    """
    Write a TrainDatasets object to disk as JSON Lines files.

    The metadata goes to ``metadata/metadata.json``, the training entries to
    ``train/data.json`` and, when a test dataset is present, the test
    entries to ``test/data.json``, all below ``path_str``.

    Parameters
    ----------
    dataset
        The training datasets.
    path_str
        Where to save the dataset.
    overwrite
        Whether to delete previous version in this folder.
    """
    root = Path(path_str)

    if overwrite:
        shutil.rmtree(root, ignore_errors=True)

    def dump_line(f, line):
        # One JSON document per line, UTF-8 encoded.
        f.write(json.dumps(line).encode("utf-8"))
        f.write("\n".encode("utf-8"))

    def write_entries(folder, entries):
        # Create ``<root>/<folder>`` and serialize every entry into its
        # ``data.json`` file.
        (root / folder).mkdir(parents=True)
        with open(root / folder / "data.json", "wb") as f:
            for entry in entries:
                dump_line(f, serialize_data_entry(entry))

    metadata_dir = root / "metadata"
    metadata_dir.mkdir(parents=True)
    with open(metadata_dir / "metadata.json", "wb") as f:
        dump_line(f, dataset.metadata.dict())

    write_entries("train", dataset.train)

    if dataset.test is not None:
        write_entries("test", dataset.test)
|
||||
|
||||
|
||||
def serialize_data_entry(data):
    """
    Convert a DataEntry dictionary into a JSON-serializable dictionary.

    Parameters
    ----------
    data
        The dictionary to be transformed.

    Returns
    -------
    Dict
        A new dictionary without the entries whose value is None. Numpy
        arrays are turned into (nested) lists in which NaN is replaced by
        the string "NaN"; every other value is turned into its ``str()``
        representation.
    """

    def serialize_field(field):
        if not isinstance(field, np.ndarray):
            return str(field)
        # circumvent https://github.com/micropython/micropython/issues/3511
        missing = np.isnan(field)
        as_objects = field.astype(np.object_)
        as_objects[missing] = "NaN"
        return as_objects.tolist()

    serialized = {}
    for key, value in data.items():
        if value is not None:
            serialized[key] = serialize_field(value)
    return serialized
|
||||
@@ -2,10 +2,15 @@ import torch
|
||||
from torch.distributions import Distribution, TransformedDistribution, AffineTransform
|
||||
|
||||
|
||||
|
||||
class ImplicitQuantile(Distribution):
|
||||
|
||||
def __init__(self, implicit_quantile_function, taus, nn_output, predicted_quantiles, validate_args=None):
|
||||
def __init__(
|
||||
self,
|
||||
implicit_quantile_function,
|
||||
taus,
|
||||
nn_output,
|
||||
predicted_quantiles,
|
||||
validate_args=None,
|
||||
):
|
||||
self.predicted_quantiles = predicted_quantiles[0]
|
||||
self.taus = taus
|
||||
self.quantile_function = implicit_quantile_function
|
||||
@@ -46,9 +51,8 @@ class ImplicitQuantile(Distribution):
|
||||
@staticmethod
|
||||
def quantile_loss(quantile_forecast, target, tau):
|
||||
return torch.abs(
|
||||
(quantile_forecast - target)
|
||||
* ((target <= quantile_forecast).float() - tau)
|
||||
)
|
||||
(quantile_forecast - target) * ((target <= quantile_forecast).float() - tau)
|
||||
)
|
||||
|
||||
|
||||
class TransformedImplicitQuantile(TransformedDistribution):
|
||||
@@ -63,4 +67,3 @@ class TransformedImplicitQuantile(TransformedDistribution):
|
||||
scale *= transform.scale
|
||||
p = self.base_dist.log_prob(x)
|
||||
return p * scale
|
||||
|
||||
|
||||
@@ -118,7 +118,10 @@ class ZeroInflatedNegativeBinomial(ZeroInflatedDistribution):
|
||||
|
||||
def __init__(self, gate, total_count, probs=None, logits=None, validate_args=None):
|
||||
base_dist = NegativeBinomial(
|
||||
total_count=total_count, probs=probs, logits=logits, validate_args=False,
|
||||
total_count=total_count,
|
||||
probs=probs,
|
||||
logits=logits,
|
||||
validate_args=False,
|
||||
)
|
||||
base_dist._validate_args = validate_args
|
||||
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
from .backtest import make_evaluation_predictions, backtest_metrics
|
||||
from .evaluator import Evaluator, MultivariateEvaluator
|
||||
@@ -1,221 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
# Standard library imports
|
||||
import logging
|
||||
from typing import Dict, Iterator, NamedTuple, Optional, Tuple, Union
|
||||
|
||||
# Third-party imports
|
||||
import pandas as pd
|
||||
|
||||
from pts.dataset import (
|
||||
DataEntry,
|
||||
Dataset,
|
||||
DatasetStatistics,
|
||||
calculate_dataset_statistics,
|
||||
)
|
||||
from pts.model import Estimator, Predictor, Forecast
|
||||
# First-party imports
|
||||
from pts.transform import AdhocTransform, TransformedDataset
|
||||
from .evaluator import Evaluator
|
||||
|
||||
|
||||
def make_evaluation_predictions(
    dataset: Dataset, predictor: Predictor, num_samples: int
) -> Tuple[Iterator[Forecast], Iterator[pd.Series]]:
    """
    Forecast the last ``prediction_length`` time units of every target in
    ``dataset``.

    That trailing portion is cut off before the entries are handed to the
    predictor, so the returned forecasts can be compared against the
    untruncated series returned alongside them.

    Parameters
    ----------
    dataset
        Dataset where the evaluation will happen. Only the portion excluding
        the prediction_length portion is used when making prediction.
    predictor
        Model used to draw predictions.
    num_samples
        Number of samples to draw on the model when evaluating.

    Returns
    -------
    tuple
        The result of ``predictor.predict`` on the truncated dataset, and
        an iterator over the full (untruncated) targets as pandas
        DataFrames indexed by time.
    """

    horizon = predictor.prediction_length
    freq = predictor.freq

    def add_ts_dataframe(data_iterator: Iterator[DataEntry]) -> Iterator[DataEntry]:
        # Attach the full target as a time-indexed DataFrame under "ts".
        for raw_entry in data_iterator:
            enriched = raw_entry.copy()
            time_index = pd.date_range(
                start=enriched["start"],
                freq=freq,
                periods=enriched["target"].shape[-1],
            )
            enriched["ts"] = pd.DataFrame(
                index=time_index, data=enriched["target"].transpose()
            )
            yield enriched

    def ts_iter(dataset: Dataset) -> Iterator[pd.DataFrame]:
        for enriched in add_ts_dataframe(iter(dataset)):
            yield enriched["ts"]

    def truncate_target(data):
        truncated = data.copy()
        full_target = truncated["target"]
        # handles multivariate case (target_dim, history_length)
        assert full_target.shape[-1] >= horizon
        truncated["target"] = full_target[..., :-horizon]
        return truncated

    # TODO filter out time series with target shorter than prediction length
    # TODO or fix the evaluator so it supports missing values instead (all
    # TODO the test set may be gone otherwise with such a filtering)

    dataset_trunc = TransformedDataset(
        dataset, transformations=[AdhocTransform(truncate_target)]
    )

    forecasts = predictor.predict(dataset_trunc, num_samples=num_samples)
    return forecasts, ts_iter(dataset)
|
||||
|
||||
|
||||
train_dataset_stats_key = "train_dataset_stats"
|
||||
test_dataset_stats_key = "test_dataset_stats"
|
||||
estimator_key = "estimator"
|
||||
agg_metrics_key = "agg_metrics"
|
||||
|
||||
|
||||
def serialize_message(logger, message: str, variable):
    """Log ``variable`` at INFO level, tagged as ``pts[<message>]``."""
    line = f"pts[{message}]: {variable}"
    logger.info(line)
|
||||
|
||||
|
||||
def backtest_metrics(
    train_dataset: Optional[Dataset],
    test_dataset: Dataset,
    forecaster: Union[Estimator, Predictor],
    evaluator=Evaluator(quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)),
    num_samples: int = 100,
    logging_file: Optional[str] = None,
):
    """
    Train (if needed) and evaluate a forecaster, logging dataset statistics
    and aggregate metrics along the way.

    Parameters
    ----------
    train_dataset
        Dataset to use for training. Required when `forecaster` is an
        Estimator.
    test_dataset
        Dataset to use for testing.
    forecaster
        An estimator or a predictor to use for generating predictions.
    evaluator
        Evaluator to use.
    num_samples
        Number of samples to use when generating sample-based forecasts.
    logging_file
        If specified, the log messages of the backtest are additionally
        written to this file.

    Returns
    -------
    tuple
        A tuple of aggregate metrics and per-time-series metrics obtained by
        training `forecaster` on `train_dataset` (when it is an Estimator)
        and applying `evaluator` to its predictions on `test_dataset`.
    """

    logger = logging.getLogger(__name__)
    handler = None

    if logging_file is not None:
        log_formatter = logging.Formatter(
            "[%(asctime)s %(levelname)s %(thread)d] %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
        )
        handler = logging.FileHandler(logging_file)
        handler.setFormatter(log_formatter)
        logger.addHandler(handler)

    try:
        if train_dataset is not None:
            train_statistics = calculate_dataset_statistics(train_dataset)
            serialize_message(logger, train_dataset_stats_key, train_statistics)

        test_statistics = calculate_dataset_statistics(test_dataset)
        serialize_message(logger, test_dataset_stats_key, test_statistics)

        if isinstance(forecaster, Estimator):
            serialize_message(logger, estimator_key, forecaster)
            assert train_dataset is not None
            predictor = forecaster.train(train_dataset)
        else:
            predictor = forecaster

        forecast_it, ts_it = make_evaluation_predictions(
            test_dataset, predictor=predictor, num_samples=num_samples
        )

        agg_metrics, item_metrics = evaluator(
            ts_it, forecast_it, num_series=len(test_dataset)
        )

        # we only log aggregate metrics for now as item metrics may be very large
        for name, value in agg_metrics.items():
            serialize_message(logger, f"metric-{name}", value)
    finally:
        if handler is not None:
            # Detach AND close the handler, also when training or evaluation
            # fails: removeHandler alone does not close the underlying file,
            # and a handler left attached would keep receiving the messages
            # of later calls.
            logger.removeHandler(handler)
            handler.close()

    return agg_metrics, item_metrics
|
||||
|
||||
|
||||
class BacktestInformation(NamedTuple):
    """
    Container for the information that a backtest writes to its log.

    The field names match the module-level log keys
    (``train_dataset_stats_key``, ``test_dataset_stats_key``,
    ``estimator_key`` and ``agg_metrics_key``).
    """

    # Statistics of the training dataset.
    train_dataset_stats: DatasetStatistics
    # Statistics of the test dataset.
    test_dataset_stats: DatasetStatistics
    # The estimator that was backtested.
    estimator: Estimator
    # Aggregate metrics, keyed by metric name.
    agg_metrics: Dict[str, float]
|
||||
|
||||
# @staticmethod
|
||||
# def make_from_log(log_file):
|
||||
# with open(log_file, "r") as f:
|
||||
# return BacktestInformation.make_from_log_contents(
|
||||
# "\n".join(f.readlines())
|
||||
# )
|
||||
|
||||
# @staticmethod
|
||||
# def make_from_log_contents(log_contents):
|
||||
# messages = dict(re.findall(r"pts\[(.*)\]: (.*)", log_contents))
|
||||
|
||||
# # avoid to fail if a key is missing for instance in the case a run did
|
||||
# # not finish so that we can still get partial information
|
||||
# try:
|
||||
# return BacktestInformation(
|
||||
# train_dataset_stats=eval(
|
||||
# messages[train_dataset_stats_key]
|
||||
# ), # TODO: use load
|
||||
# test_dataset_stats=eval(
|
||||
# messages[test_dataset_stats_key]
|
||||
# ), # TODO: use load
|
||||
# estimator=load_code(messages[estimator_key]),
|
||||
# agg_metrics={
|
||||
# k: load_code(v)
|
||||
# for k, v in messages.items()
|
||||
# if k.startswith("metric-") and v != "nan"
|
||||
# },
|
||||
# )
|
||||
# except Exception as error:
|
||||
# logging.error(error)
|
||||
# return None
|
||||
@@ -1,730 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
import multiprocessing
|
||||
import sys
|
||||
|
||||
from itertools import chain, tee
|
||||
from typing import (
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
Callable,
|
||||
)
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
# First-party imports
|
||||
from pts.feature import get_seasonality
|
||||
from pts.model import Quantile, Forecast
|
||||
|
||||
|
||||
class Evaluator:
|
||||
"""
|
||||
Evaluator class, to compute accuracy metrics by comparing observations
|
||||
to forecasts.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
quantiles
|
||||
list of strings of the form 'p10' or floats in [0, 1] with
|
||||
the quantile levels
|
||||
seasonality
|
||||
seasonality to use for seasonal_error, if nothing is passed
|
||||
uses the default seasonality
|
||||
for the given series frequency as returned by `get_seasonality`
|
||||
alpha
|
||||
Parameter of the MSIS metric from the M4 competition that
|
||||
defines the confidence interval.
|
||||
For alpha=0.05 (default) the 95% confidence interval is considered in the metric,
|
||||
see https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf
|
||||
for more detail on MSIS
|
||||
calculate_owa
|
||||
Determines whether the OWA metric should also be calculated,
|
||||
which is computationally expensive to evaluate and thus slows
|
||||
down the evaluation process considerably.
|
||||
By default False.
|
||||
num_workers
|
||||
The number of multiprocessing workers that will be used to process
|
||||
the data in parallel.
|
||||
Default is multiprocessing.cpu_count().
|
||||
Setting it to 0 means no multiprocessing.
|
||||
chunk_size
|
||||
Controls the approximate chunk size each workers handles at a time.
|
||||
Default is 32.
|
||||
"""
|
||||
|
||||
default_quantiles = 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
quantiles: Iterable[Union[float, str]] = default_quantiles,
|
||||
seasonality: Optional[int] = None,
|
||||
alpha: float = 0.05,
|
||||
calculate_owa: bool = False,
|
||||
num_workers: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
) -> None:
|
||||
self.quantiles = tuple(map(Quantile.parse, quantiles))
|
||||
self.seasonality = seasonality
|
||||
self.alpha = alpha
|
||||
self.calculate_owa = calculate_owa
|
||||
|
||||
self.num_workers = (
|
||||
num_workers if num_workers is not None else multiprocessing.cpu_count()
|
||||
)
|
||||
self.chunk_size = chunk_size if chunk_size is not None else 32
|
||||
|
||||
def __call__(
    self,
    ts_iterator: Iterable[Union[pd.DataFrame, pd.Series]],
    fcst_iterator: Iterable[Forecast],
    num_series: Optional[int] = None,
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Compute accuracy metrics by comparing actual data to the forecasts.

    Parameters
    ----------
    ts_iterator
        iterator containing true target on the predicted range
    fcst_iterator
        iterator of forecasts on the predicted range
    num_series
        number of series of the iterator
        (optional, used for displaying progress and, when given,
        checked against the number of evaluated series)

    Returns
    -------
    dict
        Dictionary of aggregated metrics
    pd.DataFrame
        DataFrame containing per-time-series metrics

    Raises
    ------
    AssertionError
        If the two iterators do not have the same number of elements, or
        if ``num_series`` is given and does not match the number of rows.
    """
    # Local import: only this method needs it.
    from itertools import zip_longest

    exhausted = object()

    def paired():
        # Plain ``zip`` pulls from ``ts_iterator`` first and silently drops
        # that element when ``fcst_iterator`` is already exhausted, so a
        # mismatch of exactly one series would go unnoticed. ``zip_longest``
        # with a sentinel detects a mismatch in either direction.
        for ts, forecast in zip_longest(
            iter(ts_iterator), iter(fcst_iterator), fillvalue=exhausted
        ):
            assert (
                forecast is not exhausted
            ), "ts_iterator has more elements than fcst_iterator"
            assert (
                ts is not exhausted
            ), "fcst_iterator has more elements than ts_iterator"
            yield ts, forecast

    rows = []

    with tqdm(
        paired(), total=num_series, desc="Running evaluation",
    ) as it, np.errstate(invalid="ignore"):
        if self.num_workers > 0 and not sys.platform == "win32":
            # Pass the initializer and its argument separately so that it
            # runs inside every worker process. The previous
            # ``initializer=_worker_init(self)`` executed it in the parent
            # and handed ``None`` to the pool.
            mp_pool = multiprocessing.Pool(
                initializer=_worker_init,
                initargs=(self,),
                processes=self.num_workers,
            )
            try:
                rows = mp_pool.map(
                    func=_worker_fun,
                    iterable=iter(it),
                    chunksize=self.chunk_size,
                )
            finally:
                # Always release the worker processes, even if map() raised.
                mp_pool.close()
                mp_pool.join()
        else:
            for ts, forecast in it:
                rows.append(self.get_metrics_per_ts(ts, forecast))

    if num_series is not None:
        assert (
            len(rows) == num_series
        ), f"num_series={num_series} did not match number of elements={len(rows)}"

    # If all entries of a target array are NaNs, the resulting metric will
    # have value "masked". Pandas does not handle masked values correctly.
    # Thus we set dtype=np.float64 to convert masked values back to NaNs
    # which are handled correctly by pandas Dataframes during aggregation.
    metrics_per_ts = pd.DataFrame(rows, dtype=np.float64)
    return self.get_aggregate_metrics(metrics_per_ts)
|
||||
|
||||
@staticmethod
def extract_pred_target(
    time_series: Union[pd.Series, pd.DataFrame], forecast: Forecast
) -> np.ndarray:
    """
    Cut the target down to the timestamps covered by the forecast.

    Parameters
    ----------
    time_series
        target indexed by time
    forecast
        forecast whose ``index`` selects the rows to keep

    Returns
    -------
    np.ndarray
        time series cut in the Forecast object dates

    Raises
    ------
    AssertionError
        If the forecast index is not fully contained in the target index.
    """
    fcst_index = forecast.index
    is_covered = fcst_index.intersection(time_series.index).equals(fcst_index)
    assert is_covered, (
        "Cannot extract prediction target since the index of forecast is outside the index of target\n"
        f"Index of forecast: {forecast.index}\n Index of target: {time_series.index}"
    )

    # select the forecast dates, then flatten to an array with at least 1 dim
    window = time_series.loc[fcst_index].transpose()
    return np.atleast_1d(np.squeeze(window))
|
||||
|
||||
# This method is needed for the owa calculation.
# It extracts the training sequence from the Series or DataFrame to a numpy array.
@staticmethod
def extract_past_data(
    time_series: Union[pd.Series, pd.DataFrame], forecast: Forecast
) -> np.ndarray:
    """
    Return the part of the target that precedes the forecast range.

    Parameters
    ----------
    time_series
        target indexed by time
    forecast
        forecast whose ``index`` marks the prediction range

    Returns
    -------
    np.ndarray
        time series without the forecast dates

    Raises
    ------
    AssertionError
        If the forecast index is not fully contained in the target index.
    """
    assert forecast.index.intersection(time_series.index).equals(forecast.index), (
        "Index of forecast is outside the index of target\n"
        f"Index of forecast: {forecast.index}\n Index of target: {time_series.index}"
    )

    # Remove the prediction range.
    # If the prediction range is not in the end of the time series,
    # everything after the prediction range is truncated as well.
    # Selecting by comparison with the first forecast timestamp avoids
    # ``Timestamp.freq``, which no longer exists in pandas >= 2.0.
    forecast_start = forecast.index[0]
    past = time_series.loc[time_series.index < forecast_start]
    return np.atleast_1d(np.squeeze(past.transpose()))
|
||||
|
||||
def seasonal_error(self, past_data: np.ndarray, forecast: Forecast) -> float:
    r"""
    .. math::

        seasonal_error = mean(|Y[t] - Y[t-m]|)

    where m is the seasonal frequency
    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf

    ``self.seasonality`` is used for m when it is truthy, otherwise
    ``get_seasonality(forecast.freq)``. Returns ``np.nan`` when the mean
    comes out as ``np.ma.masked``.
    """
    seasonality = self.seasonality
    if not seasonality:
        seasonality = get_seasonality(forecast.freq)

    # edge case: the seasonal freq is not smaller than the length of ts,
    # so revert to a lag of 1
    lag = seasonality if seasonality < len(past_data) else 1

    lagged_abs_diff = abs(past_data[:-lag] - past_data[lag:])
    seasonal_mae = np.mean(lagged_abs_diff)

    if seasonal_mae is np.ma.masked:
        return np.nan
    return seasonal_mae
|
||||
|
||||
def get_metrics_per_ts(
    self, time_series: Union[pd.Series, pd.DataFrame], forecast: Forecast
) -> Dict[str, Union[float, str, None]]:
    """
    Compute all per-series metrics for one (target, forecast) pair.

    Parameters
    ----------
    time_series
        target indexed by time
    forecast
        forecast for the prediction range of ``time_series``

    Returns
    -------
    dict
        ``item_id`` plus one entry per metric. "MSE" is ``None`` when the
        forecast cannot provide a mean, and "OWA" is ``np.nan`` unless
        ``self.calculate_owa`` is set. For every configured quantile the
        loss and coverage are stored under ``quantile.loss_name`` and
        ``quantile.coverage_name``.
    """
    pred_target = np.array(self.extract_pred_target(time_series, forecast))
    pred_target = np.ma.masked_invalid(pred_target)

    # required for seasonal_error and owa calculation
    past_data = np.array(self.extract_past_data(time_series, forecast))
    past_data = np.ma.masked_invalid(past_data)

    try:
        mean_fcst = forecast.mean
    except Exception:
        # The forecast cannot provide a mean. ``Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        mean_fcst = None
    median_fcst = forecast.quantile(0.5)
    seasonal_error = self.seasonal_error(past_data, forecast)

    # NOTE: insertion order defines the column order of the per-series frame.
    metrics = {
        "item_id": forecast.item_id,
        "MSE": self.mse(pred_target, mean_fcst) if mean_fcst is not None else None,
        "abs_error": self.abs_error(pred_target, median_fcst),
        "abs_target_sum": self.abs_target_sum(pred_target),
        "abs_target_mean": self.abs_target_mean(pred_target),
        "seasonal_error": seasonal_error,
        "MASE": self.mase(pred_target, median_fcst, seasonal_error),
        "MAPE": self.mape(pred_target, median_fcst),
        "sMAPE": self.smape(pred_target, median_fcst),
        "OWA": np.nan,  # by default not calculated
        "MSIS": self.msis(
            pred_target,
            forecast.quantile(self.alpha / 2),
            forecast.quantile(1.0 - self.alpha / 2),
            seasonal_error,
            self.alpha,
        ),
    }

    if self.calculate_owa:
        metrics["OWA"] = self.owa(
            pred_target,
            median_fcst,
            past_data,
            seasonal_error,
            forecast.start_date,
        )

    for quantile in self.quantiles:
        forecast_quantile = forecast.quantile(quantile.value)

        metrics[quantile.loss_name] = self.quantile_loss(
            pred_target, forecast_quantile, quantile.value
        )
        metrics[quantile.coverage_name] = self.coverage(
            pred_target, forecast_quantile
        )

    return metrics
|
||||
|
||||
def get_aggregate_metrics(
    self, metric_per_ts: pd.DataFrame
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Aggregate per-series metrics into dataset-level metrics.

    Parameters
    ----------
    metric_per_ts
        DataFrame with one row per time series and one column per metric

    Returns
    -------
    dict
        Aggregated metrics plus the derived "RMSE", "NRMSE", "ND",
        weighted quantile losses, "mean_wQuantileLoss" and "MAE_Coverage"
    pd.DataFrame
        ``metric_per_ts``, returned unchanged

    Raises
    ------
    AssertionError
        If a column required for the aggregation is missing.
    """
    agg_funs = {
        "MSE": "mean",
        "abs_error": "sum",
        "abs_target_sum": "sum",
        "abs_target_mean": "mean",
        "seasonal_error": "mean",
        "MASE": "mean",
        "MAPE": "mean",
        "sMAPE": "mean",
        "OWA": "mean",
        "MSIS": "mean",
    }
    for quantile in self.quantiles:
        agg_funs[quantile.loss_name] = "sum"
        agg_funs[quantile.coverage_name] = "mean"

    assert (
        set(metric_per_ts.columns) >= agg_funs.keys()
    ), "Some of the requested item metrics are missing."

    totals = {key: metric_per_ts[key].agg(agg) for key, agg in agg_funs.items()}

    # derived metrics based on previous aggregate metrics
    totals["RMSE"] = np.sqrt(totals["MSE"])

    # ``flag`` turns a zero denominator into 1 and zeroes the numerator,
    # so the ratio becomes 0 instead of dividing by zero.
    flag = totals["abs_target_mean"] == 0
    totals["NRMSE"] = np.divide(
        totals["RMSE"] * (1 - flag), totals["abs_target_mean"] + flag
    )

    flag = totals["abs_target_sum"] == 0
    totals["ND"] = np.divide(
        totals["abs_error"] * (1 - flag), totals["abs_target_sum"] + flag
    )

    all_qLoss_names = [quantile.weighted_loss_name for quantile in self.quantiles]
    for quantile in self.quantiles:
        totals[quantile.weighted_loss_name] = np.divide(
            totals[quantile.loss_name], totals["abs_target_sum"]
        )

    totals["mean_wQuantileLoss"] = np.array(
        [totals[ql] for ql in all_qLoss_names]
    ).mean()

    totals["MAE_Coverage"] = np.mean(
        [
            np.abs(totals[q.coverage_name] - np.array([q.value]))
            for q in self.quantiles
        ]
    )
    return totals, metric_per_ts
|
||||
|
||||
@staticmethod
def mse(target, forecast):
    """Return the mean of the squared differences ``target - forecast``."""
    residual = target - forecast
    return np.mean(np.square(residual))
|
||||
|
||||
@staticmethod
def abs_error(target, forecast):
    """Return the sum of the absolute differences ``|target - forecast|``."""
    residual = target - forecast
    return np.sum(np.abs(residual))
|
||||
|
||||
@staticmethod
def quantile_loss(target, quantile_forecast, q):
    """
    Return twice the summed absolute value of
    ``(quantile_forecast - target) * ((target <= quantile_forecast) - q)``.
    """
    at_or_below = target <= quantile_forecast
    weighted = (quantile_forecast - target) * (at_or_below - q)
    return 2.0 * np.sum(np.abs(weighted))
|
||||
|
||||
@staticmethod
def coverage(target, quantile_forecast):
    """Return the fraction of targets strictly below the quantile forecast."""
    is_below = target < quantile_forecast
    return np.mean(is_below)
|
||||
|
||||
@staticmethod
def mase(target, forecast, seasonal_error):
    r"""
    .. math::

        mase = mean(|Y - Y_hat|) / seasonal_error

    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf

    A zero ``seasonal_error`` yields 0 instead of a division by zero.
    """
    zero_scale = seasonal_error == 0
    mean_abs_error = np.mean(np.abs(target - forecast))
    # zero the numerator and bump the denominator to 1 when the scale is 0
    numerator = mean_abs_error * (1 - zero_scale)
    denominator = seasonal_error + zero_scale
    return numerator / denominator
|
||||
|
||||
@staticmethod
def mape(target, forecast):
    r"""
    .. math::

        mape = mean(|Y - Y_hat| / |Y|))

    Entries with a zero target contribute 0 to the mean.
    """
    abs_target = np.abs(target)
    zero_target = abs_target == 0

    # zero the numerator and bump the denominator to 1 where the target is 0
    ratios = (np.abs(target - forecast) * (1 - zero_target)) / (
        abs_target + zero_target
    )
    return np.mean(ratios)
|
||||
|
||||
@staticmethod
def smape(target, forecast):
    r"""
    .. math::

        smape = mean(2 * |Y - Y_hat| / (|Y| + |Y_hat|))

    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf

    Entries where both target and forecast are zero contribute 0.
    """
    scale = np.abs(target) + np.abs(forecast)
    zero_scale = scale == 0

    # zero the numerator and bump the denominator to 1 where the scale is 0
    ratios = (np.abs(target - forecast) * (1 - zero_scale)) / (scale + zero_scale)
    return 2 * np.mean(ratios)
|
||||
|
||||
@staticmethod
def owa(
    target: np.ndarray,
    forecast: np.ndarray,
    past_data: np.ndarray,
    seasonal_error: float,
    start_date: pd.Timestamp,
) -> float:
    r"""
    .. math::

        owa = 0.5*(smape/smape_naive + mase/mase_naive)

    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf

    The naive reference forecast is produced by ``naive_2`` from
    ``past_data`` for ``len(target)`` steps at frequency
    ``start_date.freqstr``.
    """
    # avoid import error due to circular dependency
    from gluonts.model.naive_2 import naive_2

    # calculate the forecast of the seasonal naive predictor
    naive_median_fcst = naive_2(past_data, len(target), freq=start_date.freqstr)

    smape_ratio = Evaluator.smape(target, forecast) / Evaluator.smape(
        target, naive_median_fcst
    )
    mase_ratio = Evaluator.mase(target, forecast, seasonal_error) / Evaluator.mase(
        target, naive_median_fcst, seasonal_error
    )
    return 0.5 * (smape_ratio + mase_ratio)
|
||||
|
||||
@staticmethod
def msis(target, lower_quantile, upper_quantile, seasonal_error, alpha):
    r"""
    .. math::

        msis = mean(U - L + 2/alpha * (L-Y) * I[Y<L] + 2/alpha * (Y-U) * I[Y>U]) /seasonal_error

    https://www.m4.unic.ac.cy/wp-content/uploads/2018/03/M4-Competitors-Guide.pdf

    A zero ``seasonal_error`` yields 0 instead of a division by zero.
    """
    penalty = 2.0 / alpha
    width = upper_quantile - lower_quantile
    below = penalty * (lower_quantile - target) * (target < lower_quantile)
    above = penalty * (target - upper_quantile) * (target > upper_quantile)
    numerator = np.mean(width + below + above)

    # zero the numerator and bump the denominator to 1 when the scale is 0
    zero_scale = seasonal_error == 0
    return (numerator * (1 - zero_scale)) / (seasonal_error + zero_scale)
|
||||
|
||||
@staticmethod
def abs_target_sum(target):
    """Return the sum of the absolute target values."""
    magnitudes = np.abs(target)
    return np.sum(magnitudes)
|
||||
|
||||
@staticmethod
def abs_target_mean(target):
    """Return the mean of the absolute target values."""
    magnitudes = np.abs(target)
    return np.mean(magnitudes)
|
||||
|
||||
|
||||
class MultivariateEvaluator(Evaluator):
    """
    The MultivariateEvaluator class owns functionality for evaluating
    multidimensional target arrays of shape
    (target_dimensionality, prediction_length).

    Evaluations of individual dimensions will be stored with the corresponding
    dimension prefix and contain the metrics calculated by only this dimension.
    Metrics with the plain metric name correspond to metrics calculated over
    all dimensions.
    Additionally, the user can provide additional aggregation functions that
    first aggregate the target and forecast over dimensions and then calculate
    the metric. These metrics will be prefixed with m_<aggregation_fun_name>_

    The evaluation dimensions can be set by the user.

    Example:
        {'0_MSE': 0.004307240342677687, # MSE of dimension 0
        '0_abs_error': 1.6246897801756859,
        '1_MSE': 0.003949341769475723, # MSE of dimension 1
        '1_abs_error': 1.5052175521850586,
        'MSE': 0.004128291056076705, # MSE of all dimensions
        'abs_error': 3.1299073323607445,
        'm_sum_MSE': 0.02 # MSE of aggregated target and aggregated forecast
        (if target_agg_funcs is set).
        'm_sum_abs_error': 4.2}
    """

    def __init__(
        self,
        quantiles: Iterable[Union[float, str]] = np.linspace(0.1, 0.9, 9),
        seasonality: Optional[int] = None,
        alpha: float = 0.05,
        eval_dims: Optional[List[int]] = None,
        target_agg_funcs: Optional[Dict[str, Callable]] = None,
    ) -> None:
        """

        Parameters
        ----------
        quantiles
            list of strings of the form 'p10' or floats in [0, 1] with the
            quantile levels
        seasonality
            seasonality to use for seasonal_error, if nothing is passed uses
            the default seasonality for the given series frequency as
            returned by `get_seasonality`
        alpha
            parameter of the MSIS metric that defines the CI,
            e.g., for alpha=0.05 the 95% CI is considered in the metric.
        eval_dims
            dimensions of the target that will be evaluated. ``None``
            (default) evaluates every dimension.
        target_agg_funcs
            pass key-value pairs that define aggregation functions over the
            dimension axis. Useful to compute metrics over aggregated target
            and forecast (typically sum or mean). ``None`` (default) means
            no aggregation functions.
        """
        super().__init__(quantiles=quantiles, seasonality=seasonality, alpha=alpha)
        self._eval_dims = eval_dims
        # A fresh dict per instance: a ``{}`` default in the signature would
        # be one object shared by every evaluator created without the argument.
        self.target_agg_funcs = {} if target_agg_funcs is None else target_agg_funcs

    @staticmethod
    def extract_target_by_dim(
        it_iterator: Iterator[pd.DataFrame], dim: int
    ) -> Iterator[pd.DataFrame]:
        """Yield column ``dim`` of every target in ``it_iterator``."""
        for i in it_iterator:
            yield (i[dim])

    @staticmethod
    def extract_forecast_by_dim(
        forecast_iterator: Iterator[Forecast], dim: int
    ) -> Iterator[Forecast]:
        """Yield ``forecast.copy_dim(dim)`` for every forecast."""
        for forecast in forecast_iterator:
            yield forecast.copy_dim(dim)

    @staticmethod
    def extract_aggregate_target(
        it_iterator: Iterator[pd.DataFrame], agg_fun: Callable
    ) -> Iterator[pd.DataFrame]:
        """Yield every target aggregated with ``agg_fun`` along ``axis=1``."""
        for i in it_iterator:
            yield i.agg(agg_fun, axis=1)

    @staticmethod
    def extract_aggregate_forecast(
        forecast_iterator: Iterator[Forecast], agg_fun: Callable
    ) -> Iterator[Forecast]:
        """Yield ``forecast.copy_aggregate(agg_fun)`` for every forecast."""
        for forecast in forecast_iterator:
            yield forecast.copy_aggregate(agg_fun)

    @staticmethod
    def peek(iterator: Iterator[Any]) -> Tuple[Any, Iterator[Any]]:
        """
        Return the first element together with an iterator that still
        yields that element first, followed by the remaining ones.
        """
        peeked_object = next(iterator)
        iterator = chain([peeked_object], iterator)
        return peeked_object, iterator

    @staticmethod
    def get_target_dimensionality(forecast: Forecast) -> int:
        """Return ``forecast.dim()`` after asserting that it is larger than 1."""
        target_dim = forecast.dim()
        assert target_dim > 1, (
            f"the dimensionality of the forecast should be larger than 1, "
            f"but got {target_dim}. "
            f"Please use the Evaluator to evaluate 1D forecasts."
        )
        return target_dim

    def get_eval_dims(self, target_dimensionality: int) -> List[int]:
        """
        Return the dimensions to evaluate: the user-provided ``eval_dims``
        or, if none were given, all of ``range(target_dimensionality)``.
        """
        eval_dims = (
            self._eval_dims
            if self._eval_dims is not None
            else list(range(0, target_dimensionality))
        )
        assert max(eval_dims) < target_dimensionality, (
            f"eval dims should range from 0 to target_dimensionality - 1, "
            f"but got max eval_dim {max(eval_dims)}"
        )
        return eval_dims

    def calculate_aggregate_multivariate_metrics(
        self,
        ts_iterator: Iterator[pd.DataFrame],
        forecast_iterator: Iterator[Forecast],
        agg_fun: Callable,
    ) -> Dict[str, float]:
        """

        Parameters
        ----------
        ts_iterator
            Iterator over time series
        forecast_iterator
            Iterator over forecasts
        agg_fun
            aggregation function
        Returns
        -------
        Dict[str, float]
            dictionary with aggregate datasets metrics
        """
        agg_metrics, _ = super(MultivariateEvaluator, self).__call__(
            self.extract_aggregate_target(ts_iterator, agg_fun),
            self.extract_aggregate_forecast(forecast_iterator, agg_fun),
        )
        return agg_metrics

    def calculate_aggregate_vector_metrics(
        self, all_agg_metrics: Dict[str, float], all_metrics_per_ts: pd.DataFrame,
    ) -> Dict[str, float]:
        """

        Parameters
        ----------
        all_agg_metrics
            dictionary with aggregate metrics of individual dimensions
        all_metrics_per_ts
            DataFrame containing metrics for all time series of all evaluated
            dimensions

        Returns
        -------
        Dict[str, float]
            dictionary with aggregate metrics (of individual (evaluated)
            dimensions and the entire vector)
        """
        vector_aggregate_metrics, _ = self.get_aggregate_metrics(all_metrics_per_ts)
        # NOTE: ``all_agg_metrics`` is updated in place and also returned.
        for key, value in vector_aggregate_metrics.items():
            all_agg_metrics[key] = value
        return all_agg_metrics

    def __call__(
        self,
        ts_iterator: Iterable[pd.DataFrame],
        fcst_iterator: Iterable[Forecast],
        num_series=None,
    ) -> Tuple[Dict[str, float], pd.DataFrame]:
        """
        Evaluate every selected dimension separately, then the whole vector,
        then (optionally) the aggregated target/forecast.

        Parameters
        ----------
        ts_iterator
            iterable of multivariate targets
        fcst_iterator
            iterable of multivariate forecasts
        num_series
            accepted for interface compatibility; not used by this method

        Returns
        -------
        dict
            Aggregate metrics: ``<dim>_<metric>`` per evaluated dimension,
            plain ``<metric>`` over all evaluated dimensions, and
            ``m_<agg_fun_name>_<metric>`` per aggregation function
        pd.DataFrame
            Concatenated per-time-series metrics of all evaluated dimensions
        """
        ts_iterator = iter(ts_iterator)
        fcst_iterator = iter(fcst_iterator)

        all_agg_metrics = dict()
        all_metrics_per_ts = list()

        peeked_forecast, fcst_iterator = self.peek(fcst_iterator)
        target_dimensionality = self.get_target_dimensionality(peeked_forecast)
        eval_dims = self.get_eval_dims(target_dimensionality)

        # One independent copy of each iterator per dimension, plus one per
        # aggregation function (those are taken from the end of the tuple).
        ts_iterator_set = tee(
            ts_iterator, target_dimensionality + len(self.target_agg_funcs)
        )
        fcst_iterator_set = tee(
            fcst_iterator, target_dimensionality + len(self.target_agg_funcs)
        )

        for dim in eval_dims:
            agg_metrics, metrics_per_ts = super(MultivariateEvaluator, self).__call__(
                self.extract_target_by_dim(ts_iterator_set[dim], dim),
                self.extract_forecast_by_dim(fcst_iterator_set[dim], dim),
            )

            all_metrics_per_ts.append(metrics_per_ts)

            for metric, value in agg_metrics.items():
                all_agg_metrics[f"{dim}_{metric}"] = value

        all_metrics_per_ts = pd.concat(all_metrics_per_ts)
        all_agg_metrics = self.calculate_aggregate_vector_metrics(
            all_agg_metrics, all_metrics_per_ts
        )

        if self.target_agg_funcs:
            multivariate_metrics = {
                agg_fun_name: self.calculate_aggregate_multivariate_metrics(
                    ts_iterator_set[-(index + 1)],
                    fcst_iterator_set[-(index + 1)],
                    agg_fun,
                )
                for index, (agg_fun_name, agg_fun) in enumerate(
                    self.target_agg_funcs.items()
                )
            }

            for key, metric_dict in multivariate_metrics.items():
                prefix = f"m_{key}_"
                for metric, value in metric_dict.items():
                    all_agg_metrics[prefix + metric] = value

        return all_agg_metrics, all_metrics_per_ts
|
||||
|
||||
|
||||
# Module-level slot holding the evaluator used by ``_worker_fun``.
# This is required for the multiprocessing to work.
_worker_evaluator: Optional[Evaluator] = None


def _worker_init(evaluator: Evaluator):
    """Store ``evaluator`` in the module-level ``_worker_evaluator`` slot."""
    globals()["_worker_evaluator"] = evaluator
|
||||
|
||||
|
||||
def _worker_fun(inp: tuple):
    """
    Compute the per-series metrics for one ``(ts, forecast)`` pair using the
    evaluator stored in the module-level ``_worker_evaluator``.
    """
    series, fcst = inp
    evaluator = _worker_evaluator
    assert isinstance(
        evaluator, Evaluator
    ), "Something went wrong with the worker initialization."
    return evaluator.get_metrics_per_ts(series, fcst)
|
||||
@@ -1,3 +0,0 @@
|
||||
def assert_pts(condition: bool, message: str, *args, **kwargs) -> None:
    """
    Raise ``Exception`` with ``message.format(*args, **kwargs)`` when
    ``condition`` is falsy; otherwise do nothing.
    """
    if condition:
        return
    raise Exception(message.format(*args, **kwargs))
|
||||
@@ -1,23 +1,4 @@
|
||||
from .holiday import (
|
||||
SPECIAL_DATE_FEATURES,
|
||||
SpecialDateFeatureSet,
|
||||
CustomDateFeatureSet,
|
||||
CustomHolidayFeatureSet,
|
||||
squared_exponential_kernel,
|
||||
exponential_kernel,
|
||||
)
|
||||
from .lag import get_lags_for_frequency, get_fourier_lags_for_frequency
|
||||
from .time_feature import (
|
||||
DayOfMonth,
|
||||
DayOfWeek,
|
||||
DayOfYear,
|
||||
HourOfDay,
|
||||
MinuteOfHour,
|
||||
MonthOfYear,
|
||||
TimeFeature,
|
||||
WeekOfYear,
|
||||
FourierDateFeatures,
|
||||
time_features_from_frequency_str,
|
||||
fourier_time_features_from_frequency_str,
|
||||
)
|
||||
from .utils import get_granularity, get_seasonality
|
||||
|
||||
+13
-232
@@ -1,221 +1,9 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from typing import List, Callable
|
||||
|
||||
from typing import Callable, List
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.tseries.holiday import (
|
||||
TH,
|
||||
SU,
|
||||
EasterMonday,
|
||||
GoodFriday,
|
||||
Holiday,
|
||||
USColumbusDay,
|
||||
USLaborDay,
|
||||
USMartinLutherKingJr,
|
||||
USMemorialDay,
|
||||
USPresidentsDay,
|
||||
USThanksgivingDay,
|
||||
)
|
||||
from pandas.tseries.offsets import DateOffset, Day, Easter
|
||||
from pandas.tseries.holiday import Holiday
|
||||
|
||||
# This is 183 to cover half a year (in both directions), also for leap years
# plus a week and a half to cover holidays offset by a week e.g. easter etc
MAX_WINDOW = 192


def distance_to_holiday(holiday):
    """
    Build a function that maps a timestamp to its signed distance in days
    from an occurrence of ``holiday`` found within ``MAX_WINDOW`` days.
    """

    def distance_to_day(index):
        half_window = pd.Timedelta(days=MAX_WINDOW)
        holiday_date = holiday.dates(index - half_window, index + half_window)
        assert (
            len(holiday_date) != 0
        ), f"No closest holiday for the date index {index} found."
        # It sometimes returns two dates if it is exactly half a year after the
        # holiday. In this case, the smaller distance (182 days) is returned.
        first_match = holiday_date[0]
        return (index - first_match).days

    return distance_to_day
|
||||
|
||||
|
||||
# Holiday rules that are defined here rather than imported.
EasterSunday = Holiday(
    "Easter Sunday", month=1, day=1, offset=[Easter(), Day(0)]
)
NewYearsDay = Holiday("New Years Day", month=1, day=1)
SuperBowl = Holiday(
    "Superbowl", month=2, day=1, offset=DateOffset(weekday=SU(1))
)
MothersDay = Holiday(
    "Mothers Day", month=5, day=1, offset=DateOffset(weekday=SU(2))
)
IndependenceDay = Holiday("Independence Day", month=7, day=4)
ChristmasEve = Holiday("Christmas", month=12, day=24)
ChristmasDay = Holiday("Christmas", month=12, day=25)
NewYearsEve = Holiday("New Years Eve", month=12, day=31)
BlackFriday = Holiday(
    "Black Friday",
    month=11,
    day=1,
    offset=[pd.DateOffset(weekday=TH(4)), Day(1)],
)
CyberMonday = Holiday(
    "Cyber Monday",
    month=11,
    day=1,
    offset=[pd.DateOffset(weekday=TH(4)), Day(4)],
)


# Feature names used as keys of SPECIAL_DATE_FEATURES.
NEW_YEARS_DAY = "new_years_day"
MARTIN_LUTHER_KING_DAY = "martin_luther_king_day"
SUPERBOWL = "superbowl"
PRESIDENTS_DAY = "presidents_day"
GOOD_FRIDAY = "good_friday"
EASTER_SUNDAY = "easter_sunday"
EASTER_MONDAY = "easter_monday"
MOTHERS_DAY = "mothers_day"
INDEPENDENCE_DAY = "independence_day"
LABOR_DAY = "labor_day"
MEMORIAL_DAY = "memorial_day"
COLUMBUS_DAY = "columbus_day"
THANKSGIVING = "thanksgiving"
CHRISTMAS_EVE = "christmas_eve"
CHRISTMAS_DAY = "christmas_day"
NEW_YEARS_EVE = "new_years_eve"
BLACK_FRIDAY = "black_friday"
CYBER_MONDAY = "cyber_monday"


# Maps every feature name to the distance function of its holiday rule.
SPECIAL_DATE_FEATURES = {
    feature_name: distance_to_holiday(holiday_rule)
    for feature_name, holiday_rule in (
        (NEW_YEARS_DAY, NewYearsDay),
        (MARTIN_LUTHER_KING_DAY, USMartinLutherKingJr),
        (SUPERBOWL, SuperBowl),
        (PRESIDENTS_DAY, USPresidentsDay),
        (GOOD_FRIDAY, GoodFriday),
        (EASTER_SUNDAY, EasterSunday),
        (EASTER_MONDAY, EasterMonday),
        (MOTHERS_DAY, MothersDay),
        (INDEPENDENCE_DAY, IndependenceDay),
        (LABOR_DAY, USLaborDay),
        (MEMORIAL_DAY, USMemorialDay),
        (COLUMBUS_DAY, USColumbusDay),
        (THANKSGIVING, USThanksgivingDay),
        (CHRISTMAS_EVE, ChristmasEve),
        (CHRISTMAS_DAY, ChristmasDay),
        (NEW_YEARS_EVE, NewYearsEve),
        (BLACK_FRIDAY, BlackFriday),
        (CYBER_MONDAY, CyberMonday),
    )
}
|
||||
|
||||
|
||||
# Kernel functions
|
||||
def indicator(distance):
    """Return 1.0 when ``distance`` equals 0 and 0.0 otherwise."""
    is_zero = distance == 0
    return float(is_zero)
|
||||
|
||||
|
||||
def exponential_kernel(alpha=1.0, tol=1e-9):
    """
    Build a kernel ``exp(-alpha * |distance|)`` whose values that do not
    exceed ``tol`` are replaced by 0.0.
    """

    def kernel(distance):
        weight = np.exp(-alpha * np.abs(distance))
        return weight if weight > tol else 0.0

    return kernel
|
||||
|
||||
|
||||
def squared_exponential_kernel(alpha=1.0, tol=1e-9):
    """
    Build a kernel ``exp(-alpha * |distance| ** 2)`` whose values that do
    not exceed ``tol`` are replaced by 0.0.
    """

    def kernel(distance):
        weight = np.exp(-alpha * np.abs(distance) ** 2)
        return weight if weight > tol else 0.0

    return kernel
|
||||
|
||||
|
||||
class SpecialDateFeatureSet:
    """
    Computes holiday features. Calling a SpecialDateFeatureSet on a pandas
    Series with Datetimeindex returns a 2D array with one row per requested
    holiday and one column per date (see the examples below).

    Note that for lower than daily granularity the distance to the holiday is
    still computed on a per-day basis.

    Example use:

        >>> from pts.features import (
        ...    squared_exponential_kernel,
        ...    SpecialDateFeatureSet,
        ...    CHRISTMAS_DAY,
        ...    CHRISTMAS_EVE
        ... )
        >>> import pandas as pd
        >>> sfs = SpecialDateFeatureSet([CHRISTMAS_EVE, CHRISTMAS_DAY])
        >>> date_indices = pd.date_range(
        ...     start="2016-12-24",
        ...     end="2016-12-31",
        ...     freq='D'
        ... )
        >>> sfs(date_indices)
        array([[1., 0., 0., 0., 0., 0., 0., 0.],
               [0., 1., 0., 0., 0., 0., 0., 0.]])

    Example use for using a squared exponential kernel:

        >>> kernel = squared_exponential_kernel(alpha=1.0)
        >>> sfs = SpecialDateFeatureSet([CHRISTMAS_EVE, CHRISTMAS_DAY], kernel)
        >>> sfs(date_indices)
        array([[1.00000000e+00, 3.67879441e-01, 1.83156389e-02, 1.23409804e-04,
                1.12535175e-07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
               [3.67879441e-01, 1.00000000e+00, 3.67879441e-01, 1.83156389e-02,
                1.23409804e-04, 1.12535175e-07, 0.00000000e+00, 0.00000000e+00]])

    """

    def __init__(
        self,
        feature_names: List[str],
        kernel_function: Callable[[int], int] = indicator,
    ):
        """
        Parameters
        ----------
        feature_names
            list of strings with holiday names for which features should be created.
        kernel_function
            kernel function to pass the feature value based
            on distance in days. Can be indicator function (default),
            exponential_kernel, squared_exponential_kernel or user defined.
        """
        self.feature_names = feature_names
        self.num_features = len(feature_names)
        self.kernel_function = kernel_function

    def __call__(self, dates):
        """
        Transform a pandas series with timestamps to holiday features.

        Parameters
        ----------
        dates
            Pandas series with Datetimeindex timestamps.
        """
        feature_rows = []
        for feat_name in self.feature_names:
            # one row per holiday: kernel of the distance for every date
            per_date = [
                self.kernel_function(SPECIAL_DATE_FEATURES[feat_name](index))
                for index in dates
            ]
            feature_rows.append(np.hstack(per_date))
        return np.vstack(feature_rows)
|
||||
from gluonts.time_feature.holiday import indicator, distance_to_holiday
|
||||
|
||||
|
||||
class CustomDateFeatureSet:
|
||||
@@ -230,7 +18,7 @@ class CustomDateFeatureSet:
|
||||
Example use:
|
||||
|
||||
>>> import pandas as pd
|
||||
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
|
||||
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
|
||||
... pd.to_datetime('20200101', format='%Y%m%d')])
|
||||
>>> date_indices = pd.date_range(
|
||||
... start="2019-11-24",
|
||||
@@ -245,7 +33,7 @@ class CustomDateFeatureSet:
|
||||
Example use for using a squared exponential kernel:
|
||||
|
||||
>>> kernel = squared_exponential_kernel(alpha=0.5)
|
||||
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
|
||||
>>> cfs = CustomDateFeatureSet([pd.to_datetime('20191129', format='%Y%m%d'),
|
||||
... pd.to_datetime('20200101', format='%Y%m%d')], kernel)
|
||||
>>> cfs(date_indices)
|
||||
array([[3.72665317e-06, 3.35462628e-04, 1.11089965e-02, 1.35335283e-01,
|
||||
@@ -287,20 +75,14 @@ class CustomDateFeatureSet:
|
||||
dates
|
||||
Pandas series with Datetimeindex timestamps.
|
||||
"""
|
||||
return (
|
||||
np.vstack(
|
||||
[
|
||||
np.hstack(
|
||||
[
|
||||
self.kernel_function((index - ref_date).days)
|
||||
for index in dates
|
||||
]
|
||||
)
|
||||
for ref_date in self.reference_dates
|
||||
]
|
||||
)
|
||||
.sum(0, keepdims=True)
|
||||
)
|
||||
return np.vstack(
|
||||
[
|
||||
np.hstack(
|
||||
[self.kernel_function((index - ref_date).days) for index in dates]
|
||||
)
|
||||
for ref_date in self.reference_dates
|
||||
]
|
||||
).sum(0, keepdims=True)
|
||||
|
||||
|
||||
class CustomHolidayFeatureSet:
|
||||
@@ -383,4 +165,3 @@ class CustomHolidayFeatureSet:
|
||||
for custom_holiday in self.custom_holidays
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -1,139 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
from typing import List, Optional
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
|
||||
from .utils import get_granularity
|
||||
|
||||
|
||||
def _make_lags(middle: int, delta: int) -> np.ndarray:
|
||||
"""
|
||||
Create a set of lags around a middle point including +/- delta
|
||||
"""
|
||||
return np.arange(middle - delta, middle + delta + 1).tolist()
|
||||
|
||||
|
||||
def get_lags_for_frequency(
|
||||
freq_str: str, lag_ub: int = 1200, num_lags: Optional[int] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Generates a list of lags that that are appropriate for the given frequency string.
|
||||
|
||||
By default all frequencies have the following lags: [1, 2, 3, 4, 5, 6, 7].
|
||||
Remaining lags correspond to the same `season` (+/- `delta`) in previous `k` cycles.
|
||||
Here `delta` and `k` are chosen according to the existing code.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
freq_str
|
||||
Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.
|
||||
|
||||
lag_ub
|
||||
The maximum value for a lag.
|
||||
|
||||
num_lags
|
||||
Maximum number of lags; by default all generated lags are returned
|
||||
"""
|
||||
|
||||
multiple, granularity = get_granularity(freq_str)
|
||||
|
||||
# Lags are target values at the same `season` (+/- delta) but in the previous cycle.
|
||||
def _make_lags_for_minute(multiple, num_cycles=3):
|
||||
# We use previous ``num_cycles`` hours to generate lags
|
||||
return [_make_lags(k * 60 // multiple, 2) for k in range(1, num_cycles + 1)]
|
||||
|
||||
def _make_lags_for_hour(multiple, num_cycles=7):
|
||||
# We use previous ``num_cycles`` days to generate lags
|
||||
return [_make_lags(k * 24 // multiple, 1) for k in range(1, num_cycles + 1)]
|
||||
|
||||
def _make_lags_for_day(multiple, num_cycles=4):
|
||||
# We use previous ``num_cycles`` weeks to generate lags
|
||||
# We use the last month (in addition to 4 weeks) to generate lag.
|
||||
return [_make_lags(k * 7 // multiple, 1) for k in range(1, num_cycles + 1)] + [
|
||||
_make_lags(30 // multiple, 1)
|
||||
]
|
||||
|
||||
def _make_lags_for_week(multiple, num_cycles=3):
|
||||
# We use previous ``num_cycles`` years to generate lags
|
||||
# Additionally, we use previous 4, 8, 12 weeks
|
||||
return [_make_lags(k * 52 // multiple, 1) for k in range(1, num_cycles + 1)] + [
|
||||
[4 // multiple, 8 // multiple, 12 // multiple]
|
||||
]
|
||||
|
||||
def _make_lags_for_month(multiple, num_cycles=3):
|
||||
# We use previous ``num_cycles`` years to generate lags
|
||||
return [_make_lags(k * 12 // multiple, 1) for k in range(1, num_cycles + 1)]
|
||||
|
||||
# multiple, granularity = get_granularity(freq_str)
|
||||
offset = to_offset(freq_str)
|
||||
|
||||
if offset.name == "M":
|
||||
lags = _make_lags_for_month(offset.n)
|
||||
elif offset.name == "W-SUN" or offset.name == "W-MON":
|
||||
lags = _make_lags_for_week(offset.n)
|
||||
elif offset.name == "D":
|
||||
lags = _make_lags_for_day(offset.n) + _make_lags_for_week(offset.n / 7.0)
|
||||
elif offset.name == "B":
|
||||
# todo find good lags for business day
|
||||
lags = []
|
||||
elif offset.name == "H":
|
||||
lags = (
|
||||
_make_lags_for_hour(offset.n)
|
||||
+ _make_lags_for_day(offset.n / 24.0)
|
||||
+ _make_lags_for_week(offset.n / (24.0 * 7))
|
||||
)
|
||||
# minutes
|
||||
elif offset.name == "T":
|
||||
lags = (
|
||||
_make_lags_for_minute(offset.n)
|
||||
+ _make_lags_for_hour(offset.n / 60.0)
|
||||
+ _make_lags_for_day(offset.n / (60.0 * 24))
|
||||
+ _make_lags_for_week(offset.n / (60.0 * 24 * 7))
|
||||
)
|
||||
else:
|
||||
raise Exception("invalid frequency")
|
||||
|
||||
# flatten lags list and filter
|
||||
lags = [int(lag) for sub_list in lags for lag in sub_list if 7 < lag <= lag_ub]
|
||||
lags = [1, 2, 3, 4, 5, 6, 7] + sorted(list(set(lags)))
|
||||
|
||||
return lags[:num_lags]
|
||||
|
||||
|
||||
def get_fourier_lags_for_frequency(freq_str: str, num_lags: Optional[int] = None) -> List[int]:
    """
    Return the short list of lags used together with Fourier time features.

    Parameters
    ----------
    freq_str
        Frequency string understood by ``pandas.tseries.frequencies.to_offset``.
    num_lags
        Maximum number of lags to return; by default all lags are returned.

    Returns
    -------
    List[int]
        Sorted, de-duplicated lags; ``[1]`` for granularities without a
        dedicated entry.
    """
    granularity = to_offset(freq_str).name

    monthly = [1, 12]
    hourly = [1, 24, 168]
    minutely = [1, 4, 12, 24, 48]
    # pandas reports minutes as "T" in older releases and "min" in newer
    # ones (likewise "H"/"h" and "M"/"ME"), so both spellings are listed.
    lags_by_granularity = {
        "M": monthly,
        "ME": monthly,
        "D": [1, 7, 14],
        "B": [1, 2],
        "H": hourly,
        "h": hourly,
        "min": minutely,
        "T": minutely,
    }
    lags = lags_by_granularity.get(granularity, [1])

    # use less lags
    output_lags = sorted({int(lag) for lag in lags})
    return output_lags[:num_lags]
|
||||
@@ -1,206 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
|
||||
from pts.core.component import validated
|
||||
from .utils import get_granularity
|
||||
|
||||
|
||||
class TimeFeature(ABC):
    """
    Abstract callable that maps a ``pd.DatetimeIndex`` to an array of
    feature values.
    """

    @validated()
    def __init__(self, normalized: bool = True):
        # Stored for subclasses, which read it inside ``__call__``.
        self.normalized = normalized

    @abstractmethod
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        """Compute the feature for the timestamps in ``index``."""
|
||||
|
||||
|
||||
class MinuteOfHour(TimeFeature):
    """
    Minute of hour, scaled into [-0.5, 0.5] when ``normalized`` is set and
    returned as plain floats otherwise.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        minutes = index.minute
        if not self.normalized:
            return minutes.map(float)
        # minutes run 0..59, so dividing by 59 spans exactly one unit
        return minutes / 59.0 - 0.5
|
||||
|
||||
|
||||
class HourOfDay(TimeFeature):
    """
    Hour of day, scaled into [-0.5, 0.5] when ``normalized`` is set and
    returned as plain floats otherwise.
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        hours = index.hour
        if not self.normalized:
            return hours.map(float)
        # hours run 0..23, so dividing by 23 spans exactly one unit
        return hours / 23.0 - 0.5
|
||||
|
||||
|
||||
class DayOfWeek(TimeFeature):
    """
    Day of week encoded as value between [-0.5, 0.5]
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # dayofweek runs 0..6, so dividing by 6 spans exactly one unit
            return index.dayofweek / 6.0 - 0.5
        else:
            return index.dayofweek.map(float)
|
||||
|
||||
|
||||
class DayOfMonth(TimeFeature):
    """
    Day of month encoded as value between [-0.5, 0.5]
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # index.day is 1-based (1..31); shift to 0..30 before scaling so
            # the result stays inside the documented [-0.5, 0.5] range.
            return (index.day - 1) / 30.0 - 0.5
        else:
            return index.day.map(float)
|
||||
|
||||
|
||||
class DayOfYear(TimeFeature):
    """
    Day of year encoded as value between [-0.5, 0.5]
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # dayofyear is 1-based and reaches 366 in leap years; shift to
            # 0..365 before scaling so the result stays inside [-0.5, 0.5].
            return (index.dayofyear - 1) / 365.0 - 0.5
        else:
            return index.dayofyear.map(float)
|
||||
|
||||
|
||||
class MonthOfYear(TimeFeature):
    """
    Month of year encoded as value between [-0.5, 0.5]
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.normalized:
            # index.month is 1-based (1..12); shift to 0..11 before scaling
            # so the result stays inside the documented [-0.5, 0.5] range.
            return (index.month - 1) / 11.0 - 0.5
        else:
            return index.month.map(float)
|
||||
|
||||
|
||||
class WeekOfYear(TimeFeature):
    """
    Week of year encoded as value between [-0.5, 0.5]
    """

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # ``pd.Int64Index`` no longer exists in pandas >= 2.0; a plain
        # ``pd.Index`` built from an int64 array works on old and new pandas.
        weeks = pd.Index(index.isocalendar().week.to_numpy(dtype="int64"))
        if self.normalized:
            # ISO weeks are 1-based and can reach 53; shift to 0..52 before
            # scaling so the result stays inside [-0.5, 0.5].
            return (weeks - 1) / 52.0 - 0.5
        else:
            return weeks.map(float)
|
||||
|
||||
|
||||
class FourierDateFeatures(TimeFeature):
    """
    Cosine/sine encoding of one calendar attribute of a ``pd.DatetimeIndex``.

    Calling the instance returns an array with two rows: the cosine and the
    sine of ``2 * pi * value / (max(value) + 1)``, where ``value`` is the
    chosen attribute and the maximum is taken over the given index.
    """

    @validated()
    def __init__(self, freq: str) -> None:
        super().__init__()
        # reoccurring freq
        freqs = [
            "month",
            "day",
            "hour",
            "minute",
            "weekofyear",
            "weekday",
            "dayofweek",
            "dayofyear",
            "daysinmonth",
        ]

        # Raised explicitly (instead of via ``assert``) so the check also
        # runs under ``python -O``; the exception type is unchanged.
        if freq not in freqs:
            raise AssertionError(f"freq must be one of {freqs}, got {freq!r}")
        self.freq = freq

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        if self.freq == "weekofyear":
            # ``DatetimeIndex.weekofyear`` was removed in pandas 2.0; use the
            # same ``isocalendar()`` accessor as ``WeekOfYear``.
            values = index.isocalendar().week.to_numpy(dtype="int64")
        else:
            values = np.asarray(getattr(index, self.freq))

        if values.size == 0:
            # Nothing to encode; keep the (cos, sin) row layout.
            return np.empty((2, 0))

        num_values = values.max() + 1
        steps = values * 2.0 * np.pi / num_values
        return np.vstack([np.cos(steps), np.sin(steps)])
|
||||
|
||||
|
||||
def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.

    Parameters
    ----------

    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    Raises
    ------
    RuntimeError
        If the granularity parsed from ``freq_str`` is not supported.
    """
    _, granularity = get_granularity(freq_str)

    daily = [DayOfWeek, DayOfMonth, DayOfYear]
    minutely = [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]
    features_by_granularity = {
        "M": [MonthOfYear],
        "W": [DayOfMonth, WeekOfYear],
        "D": daily,
        "B": daily,
        "H": [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear],
        "min": minutely,
        "T": minutely,
    }

    feature_classes = features_by_granularity.get(granularity)
    if feature_classes is None:
        supported_freq_msg = f"""
        Unsupported frequency {freq_str}

        The following frequencies are supported:

            M   - monthly
            W   - weekly
            D   - daily
            B   - business days
            H   - hourly
            min - minutely (alias: T)
        """
        raise RuntimeError(supported_freq_msg)

    return [cls() for cls in feature_classes]
|
||||
|
||||
|
||||
def fourier_time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns the ``FourierDateFeatures`` instances used for the given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string understood by ``pandas.tseries.frequencies.to_offset``.

    Raises
    ------
    AssertionError
        If the frequency has no entry in the feature table.
    """
    offset = to_offset(freq_str)
    granularity = offset.name

    # Every weekly anchor (W-SUN, W-MON, W-TUE, ...) shares one table entry.
    is_weekly = granularity == "W" or granularity.startswith("W-")
    key = "W" if is_weekly else granularity

    # Legacy pandas offset names ("M", "H", "T") and the names reported by
    # newer pandas releases ("ME", "h", "min") are both listed.
    features = {
        "M": ["weekofyear"],
        "ME": ["weekofyear"],
        "W": ["daysinmonth", "weekofyear"],
        "D": ["dayofweek"],
        "B": ["dayofweek", "dayofyear"],
        "H": ["hour", "dayofweek"],
        "h": ["hour", "dayofweek"],
        "min": ["minute", "hour", "dayofweek"],
        "T": ["minute", "hour", "dayofweek"],
    }

    # Raised explicitly (instead of via ``assert``) so the check also runs
    # under ``python -O``; the exception type and message are unchanged.
    if key not in features:
        raise AssertionError(f"freq {granularity} not supported")

    time_features: List[TimeFeature] = [
        FourierDateFeatures(freq=freq) for freq in features[key]
    ]
    return time_features
|
||||
@@ -1,65 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
def get_granularity(freq_str: str) -> Tuple[int, str]:
    """
    Splits a frequency string such as "7D" into the multiple 7 and the base
    granularity "D".

    Parameters
    ----------

    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    Returns
    -------
    Tuple[int, str]
        The multiple (1 when the string has no leading digits) and the
        granularity.

    Raises
    ------
    AssertionError
        If ``freq_str`` does not match the expected pattern.
    """
    freq_regex = r"\s*((\d+)?)\s*([^\d]\w*)"
    m = re.match(freq_regex, freq_str)
    # Raised explicitly (instead of via ``assert``) so a malformed string is
    # still rejected under ``python -O``; the exception type is unchanged.
    if m is None:
        raise AssertionError("Cannot parse frequency string: %s" % freq_str)

    # The first group is the optional wrapper around the digits and is unused.
    _, digits, granularity = m.groups()
    multiple = int(digits) if digits is not None else 1
    return multiple, granularity
|
||||
|
||||
|
||||
@lru_cache()
def get_seasonality(freq: str) -> int:
    """
    Returns the default seasonality for a given freq str. E.g. for

      2H -> 12

    The base seasonality of the granularity is divided by the multiple in
    ``freq``. Granularities without an entry in the table, and multiples that
    do not divide the base seasonality, yield 1.

    Raises
    ------
    AssertionError
        If ``freq`` does not match the expected pattern.
    """
    match = re.match(r"(\d*)(\w+)", freq)
    # Raised explicitly (instead of via ``assert``) so a malformed string is
    # still rejected under ``python -O``; the exception type is unchanged.
    if not match:
        raise AssertionError("Cannot match freq regex")
    mult, base_freq = match.groups()
    multiple = int(mult) if mult else 1

    seasonalities = {"H": 24, "D": 1, "W": 1, "M": 12, "B": 5}
    seasonality = seasonalities.get(base_freq, 1)

    if seasonality % multiple != 0:
        # The multiple does not divide the base seasonality: silently fall
        # back to a seasonality of 1.
        return 1
    return seasonality // multiple
|
||||
@@ -1,5 +1,2 @@
|
||||
from .estimator import Estimator, PTSEstimator
|
||||
from .forecast import Forecast, SampleForecast, QuantileForecast, DistributionForecast
|
||||
from .predictor import Predictor, PTSPredictor
|
||||
from .quantile import Quantile
|
||||
from .utils import get_module_forward_input_names, copy_parameters, weighted_average
|
||||
from .utils import get_module_forward_input_names, weighted_average
|
||||
from .estimator import PyTorchEstimator
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
from pts.model.utils import get_module_forward_input_names
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts import Trainer
|
||||
from pts.dataset import FieldName
|
||||
from pts.feature import (
|
||||
from gluonts.dataset.field_names import FieldName
|
||||
from gluonts.time_feature import (
|
||||
TimeFeature,
|
||||
get_lags_for_frequency,
|
||||
time_features_from_frequency_str,
|
||||
)
|
||||
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
|
||||
from pts.modules import DistributionOutput, StudentTOutput
|
||||
from pts.transform import (
|
||||
from gluonts.transform import (
|
||||
Transformation,
|
||||
Chain,
|
||||
RemoveFields,
|
||||
@@ -26,10 +24,19 @@ from pts.transform import (
|
||||
InstanceSplitter,
|
||||
ExpectedNumInstanceSampler,
|
||||
)
|
||||
from gluonts.torch.support.util import copy_parameters
|
||||
from gluonts.torch.model.predictor import PyTorchPredictor
|
||||
from gluonts.torch.modules.distribution_output import DistributionOutput
|
||||
from gluonts.model.predictor import Predictor
|
||||
|
||||
from pts import Trainer
|
||||
from pts.model import PyTorchEstimator
|
||||
from pts.modules import StudentTOutput
|
||||
|
||||
from .deepar_network import DeepARTrainingNetwork, DeepARPredictionNetwork
|
||||
|
||||
|
||||
class DeepAREstimator(PTSEstimator):
|
||||
class DeepAREstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
freq: str,
|
||||
@@ -115,10 +122,14 @@ class DeepAREstimator(PTSEstimator):
|
||||
)
|
||||
+ [
|
||||
AsNumpyArray(
|
||||
field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long,
|
||||
field=FieldName.FEAT_STATIC_CAT,
|
||||
expected_ndim=1,
|
||||
dtype=np.long,
|
||||
),
|
||||
AsNumpyArray(
|
||||
field=FieldName.FEAT_STATIC_REAL, expected_ndim=1, dtype=self.dtype,
|
||||
field=FieldName.FEAT_STATIC_REAL,
|
||||
expected_ndim=1,
|
||||
dtype=self.dtype,
|
||||
),
|
||||
AsNumpyArray(
|
||||
field=FieldName.TARGET,
|
||||
@@ -218,13 +229,14 @@ class DeepAREstimator(PTSEstimator):
|
||||
).to(device)
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
input_names = get_module_forward_input_names(prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
input_names=input_names,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
freq=self.freq,
|
||||
prediction_length=self.prediction_length,
|
||||
device=device,
|
||||
dtype=self.dtype,
|
||||
)
|
||||
|
||||
@@ -5,9 +5,10 @@ import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Distribution
|
||||
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from gluonts.torch.modules.distribution_output import DistributionOutput
|
||||
from pts.model import weighted_average
|
||||
from pts.modules import DistributionOutput, MeanScaler, NOPScaler, FeatureEmbedder
|
||||
from pts.modules import MeanScaler, NOPScaler, FeatureEmbedder
|
||||
|
||||
|
||||
def prod(xs):
|
||||
@@ -18,7 +19,6 @@ def prod(xs):
|
||||
|
||||
|
||||
class DeepARNetwork(nn.Module):
|
||||
|
||||
@validated()
|
||||
def __init__(
|
||||
self,
|
||||
@@ -144,7 +144,7 @@ class DeepARNetwork(nn.Module):
|
||||
past_time_feat[:, self.history_length - self.context_length :, ...],
|
||||
future_time_feat,
|
||||
),
|
||||
dim=1
|
||||
dim=1,
|
||||
)
|
||||
sequence = torch.cat((past_target, future_target), dim=1)
|
||||
sequence_length = self.history_length + self.prediction_length
|
||||
@@ -154,7 +154,7 @@ class DeepARNetwork(nn.Module):
|
||||
sequence=sequence,
|
||||
sequence_length=sequence_length,
|
||||
indices=self.lags_seq,
|
||||
subsequences_length=subsequences_length
|
||||
subsequences_length=subsequences_length,
|
||||
)
|
||||
|
||||
# scale is computed on the context length last units of the past target
|
||||
|
||||
@@ -10,7 +10,7 @@ from pts.feature import (
|
||||
fourier_time_features_from_frequency_str,
|
||||
get_fourier_lags_for_frequency,
|
||||
)
|
||||
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
|
||||
from pts.model import PyTorchEstimator, PyTorchPredictor, copy_parameters
|
||||
from pts.modules import DistributionOutput, LowRankMultivariateNormalOutput
|
||||
from pts.transform import (
|
||||
Transformation,
|
||||
@@ -34,7 +34,7 @@ from pts.transform import (
|
||||
from .deepvar_network import DeepVARTrainingNetwork, DeepVARPredictionNetwork
|
||||
|
||||
|
||||
class DeepVAREstimator(PTSEstimator):
|
||||
class DeepVAREstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
@@ -199,7 +199,9 @@ class DeepVAREstimator(PTSEstimator):
|
||||
field_name="target_dimension_indicator",
|
||||
target_field=FieldName.TARGET,
|
||||
),
|
||||
AsNumpyArray(field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long),
|
||||
AsNumpyArray(
|
||||
field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long
|
||||
),
|
||||
AsNumpyArray(field=FieldName.FEAT_STATIC_REAL, expected_ndim=1),
|
||||
InstanceSplitter(
|
||||
target_field=FieldName.TARGET,
|
||||
@@ -242,7 +244,7 @@ class DeepVAREstimator(PTSEstimator):
|
||||
transformation: Transformation,
|
||||
trained_network: DeepVARTrainingNetwork,
|
||||
device: torch.device,
|
||||
) -> PTSPredictor:
|
||||
) -> PyTorchPredictor:
|
||||
prediction_network = DeepVARPredictionNetwork(
|
||||
input_size=self.input_size,
|
||||
target_dim=self.target_dim,
|
||||
@@ -263,7 +265,7 @@ class DeepVAREstimator(PTSEstimator):
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple, Union
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from pts.model import weighted_average
|
||||
from pts.modules import DistributionOutput, MeanScaler, NOPScaler, FeatureEmbedder
|
||||
|
||||
@@ -250,7 +250,10 @@ class DeepVARTrainingNetwork(nn.Module):
|
||||
subsequences_length = self.context_length
|
||||
else:
|
||||
time_feat = torch.cat(
|
||||
(past_time_feat[:, -self.context_length :, ...], future_time_feat,),
|
||||
(
|
||||
past_time_feat[:, -self.context_length :, ...],
|
||||
future_time_feat,
|
||||
),
|
||||
dim=1,
|
||||
)
|
||||
sequence = torch.cat((past_target_cdf, future_target_cdf), dim=1)
|
||||
@@ -285,7 +288,9 @@ class DeepVARTrainingNetwork(nn.Module):
|
||||
return outputs, states, scale, lags_scaled, inputs
|
||||
|
||||
def distr(
|
||||
self, rnn_outputs: torch.Tensor, scale: torch.Tensor,
|
||||
self,
|
||||
rnn_outputs: torch.Tensor,
|
||||
scale: torch.Tensor,
|
||||
):
|
||||
"""
|
||||
Returns the distribution of DeepVAR with respect to the RNN outputs.
|
||||
@@ -382,7 +387,8 @@ class DeepVARTrainingNetwork(nn.Module):
|
||||
# put together target sequence
|
||||
# (batch_size, seq_len, target_dim)
|
||||
target = torch.cat(
|
||||
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf), dim=1,
|
||||
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf),
|
||||
dim=1,
|
||||
)
|
||||
|
||||
# assert_shape(target, (-1, seq_len, self.target_dim))
|
||||
@@ -507,7 +513,8 @@ class DeepVARPredictionNetwork(DeepVARTrainingNetwork):
|
||||
)
|
||||
|
||||
distr, distr_args = self.distr(
|
||||
rnn_outputs=rnn_outputs, scale=repeated_scale,
|
||||
rnn_outputs=rnn_outputs,
|
||||
scale=repeated_scale,
|
||||
)
|
||||
|
||||
# (batch_size, 1, target_dim)
|
||||
@@ -524,7 +531,12 @@ class DeepVARPredictionNetwork(DeepVARTrainingNetwork):
|
||||
|
||||
# (batch_size, num_samples, prediction_length, target_dim)
|
||||
return samples.reshape(
|
||||
(-1, self.num_parallel_samples, self.prediction_length, self.target_dim,)
|
||||
(
|
||||
-1,
|
||||
self.num_parallel_samples,
|
||||
self.prediction_length,
|
||||
self.target_dim,
|
||||
)
|
||||
)
|
||||
|
||||
def forward(
|
||||
|
||||
+82
-80
@@ -1,73 +1,38 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import NamedTuple
|
||||
from typing import NamedTuple, Optional
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from gluonts.core.component import validated
|
||||
from gluonts.dataset.common import Dataset
|
||||
from gluonts.dataset.loader import TrainDataLoader, ValidationDataLoader
|
||||
from gluonts.model.estimator import Estimator
|
||||
from gluonts.torch.model.predictor import PyTorchPredictor
|
||||
from gluonts.torch.batchify import batchify
|
||||
from gluonts.transform import SelectFields, Transformation
|
||||
|
||||
from pts import Trainer
|
||||
from pts.dataset import Dataset, TransformedIterableDataset
|
||||
from pts.transform import Transformation
|
||||
from .predictor import Predictor
|
||||
from .utils import get_module_forward_input_names
|
||||
|
||||
|
||||
class Estimator(ABC):
    """
    Abstract interface for objects that are trained on a dataset and produce
    a ``Predictor``.
    """

    # Declared here without values; nothing in this class assigns them.
    prediction_length: int
    freq: str

    @abstractmethod
    def train(self, training_data: Dataset) -> Predictor:
        """Train on ``training_data`` and return the resulting predictor."""
|
||||
|
||||
|
||||
class DummyEstimator(Estimator):
    """
    An `Estimator` whose training step does no work: the `Predictor` is
    built once in the constructor and handed back by every call to `train`.

    Parameters
    ----------
    predictor_cls
        `Predictor` class to instantiate.
    **kwargs
        Keyword arguments forwarded to the predictor constructor.
    """

    def __init__(self, predictor_cls: type, **kwargs) -> None:
        # Built eagerly, so constructor errors surface here rather than in train().
        self.predictor = predictor_cls(**kwargs)

    def train(self, training_data: Dataset) -> Predictor:
        """Return the pre-constructed predictor; ``training_data`` is not read."""
        return self.predictor
|
||||
from pts.model import get_module_forward_input_names
|
||||
|
||||
|
||||
class TrainOutput(NamedTuple):
|
||||
transformation: Transformation
|
||||
trained_net: nn.Module
|
||||
predictor: Predictor
|
||||
predictor: PyTorchPredictor
|
||||
|
||||
|
||||
class PTSEstimator(Estimator):
|
||||
def __init__(self, trainer: Trainer, dtype: np.dtype = np.float32) -> None:
|
||||
class PyTorchEstimator(Estimator):
|
||||
@validated()
|
||||
def __init__(
|
||||
self, trainer: Trainer, lead_time: int = 0, dtype: np.dtype = np.float32
|
||||
) -> None:
|
||||
super().__init__(lead_time=lead_time)
|
||||
self.trainer = trainer
|
||||
self.dtype = dtype
|
||||
|
||||
@abstractmethod
|
||||
def create_transformation(self) -> Transformation:
|
||||
"""
|
||||
Create and return the transformation needed for training and inference.
|
||||
@@ -78,9 +43,8 @@ class PTSEstimator(Estimator):
|
||||
The transformation that will be applied entry-wise to datasets,
|
||||
at training and inference time.
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def create_training_network(self, device: torch.device) -> nn.Module:
|
||||
"""
|
||||
Create and return the network used for training (i.e., computing the
|
||||
@@ -91,15 +55,14 @@ class PTSEstimator(Estimator):
|
||||
nn.Module
|
||||
The network that computes the loss given input data.
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def create_predictor(
|
||||
self,
|
||||
transformation: Transformation,
|
||||
trained_network: nn.Module,
|
||||
device: torch.device,
|
||||
) -> Predictor:
|
||||
) -> PyTorchPredictor:
|
||||
"""
|
||||
Create and return a predictor object.
|
||||
|
||||
@@ -108,32 +71,56 @@ class PTSEstimator(Estimator):
|
||||
Predictor
|
||||
A predictor wrapping a `nn.Module` used for inference.
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
def train_model(self, training_data: Dataset) -> TrainOutput:
|
||||
def train_model(
|
||||
self,
|
||||
training_data: Dataset,
|
||||
validation_data: Optional[Dataset] = None,
|
||||
num_workers: Optional[int] = None,
|
||||
num_prefetch: Optional[int] = None,
|
||||
shuffle_buffer_length: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> TrainOutput:
|
||||
transformation = self.create_transformation()
|
||||
transformation.estimate(iter(training_data))
|
||||
|
||||
training_iter_dataset = TransformedIterableDataset(
|
||||
dataset=training_data,
|
||||
is_train=True,
|
||||
transform=transformation
|
||||
)
|
||||
|
||||
training_data_loader = DataLoader(
|
||||
training_iter_dataset,
|
||||
batch_size=self.trainer.batch_size,
|
||||
num_workers=self.trainer.num_workers,
|
||||
pin_memory=self.trainer.pin_memory
|
||||
)
|
||||
|
||||
# ensure that the training network is created on the same device
|
||||
trained_net = self.create_training_network(self.trainer.device)
|
||||
|
||||
input_names = get_module_forward_input_names(trained_net)
|
||||
|
||||
training_data_loader = TrainDataLoader(
|
||||
dataset=training_data,
|
||||
transform=transformation + SelectFields(input_names),
|
||||
batch_size=self.trainer.batch_size,
|
||||
stack_fn=partial(
|
||||
batchify,
|
||||
device=self.trainer.device,
|
||||
),
|
||||
num_workers=num_workers,
|
||||
num_prefetch=num_prefetch,
|
||||
shuffle_buffer_length=shuffle_buffer_length,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
validation_data_loader = None
|
||||
if validation_data is not None:
|
||||
validation_data_loader = ValidationDataLoader(
|
||||
dataset=validation_data,
|
||||
transform=transformation + SelectFields(input_names),
|
||||
batch_size=self.trainer.batch_size,
|
||||
stack_fn=partial(
|
||||
batchify,
|
||||
device=self.trainer.device,
|
||||
),
|
||||
num_workers=num_workers,
|
||||
num_prefetch=num_prefetch,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.trainer(
|
||||
net=trained_net,
|
||||
input_names=get_module_forward_input_names(trained_net),
|
||||
data_loader=training_data_loader,
|
||||
train_iter=training_data_loader,
|
||||
validation_iter=validation_data_loader,
|
||||
)
|
||||
|
||||
return TrainOutput(
|
||||
@@ -144,5 +131,20 @@ class PTSEstimator(Estimator):
|
||||
),
|
||||
)
|
||||
|
||||
def train(self, training_data: Dataset) -> Predictor:
|
||||
return self.train_model(training_data).predictor
|
||||
def train(
|
||||
self,
|
||||
training_data: Dataset,
|
||||
validation_data: Optional[Dataset] = None,
|
||||
num_workers: Optional[int] = None,
|
||||
num_prefetch: Optional[int] = None,
|
||||
shuffle_buffer_length: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> PyTorchPredictor:
|
||||
return self.train_model(
|
||||
training_data,
|
||||
validation_data,
|
||||
num_workers,
|
||||
num_prefetch,
|
||||
shuffle_buffer_length,
|
||||
**kwargs,
|
||||
).predictor
|
||||
|
||||
@@ -1,552 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Dict, List, Optional, Set, Union, Callable
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
from pydantic import BaseModel, Field
|
||||
from torch.distributions import Distribution
|
||||
|
||||
from .quantile import Quantile
|
||||
|
||||
|
||||
class OutputType(str, Enum):
    """
    Closed set of output kinds; each member is also a ``str`` equal to its
    own name.
    """

    mean = "mean"
    samples = "samples"
    quantiles = "quantiles"
|
||||
|
||||
|
||||
class Config(BaseModel):
    """
    Pydantic model holding forecast output settings: the number of samples,
    the requested output types and the quantile levels.
    """

    # Can also be supplied under the alias ``num_eval_samples``.
    num_samples: int = Field(100, alias="num_eval_samples")
    output_types: Set[OutputType] = {"quantiles", "mean"}
    # FIXME: validate list elements
    quantiles: List[str] = ["0.1", "0.5", "0.9"]

    class Config:
        # Accept both the field name and its alias when populating the model.
        allow_population_by_field_name = True
        # store additional fields
        extra = "allow"
|
||||
|
||||
|
||||
class Forecast(ABC):
    """
    Abstract base class for forecast objects.

    Subclasses are expected to provide the attributes annotated below and to
    implement `quantile`; the remaining methods are built on top of those.
    """

    start_date: pd.Timestamp
    freq: str
    item_id: Optional[str]
    info: Optional[Dict]
    prediction_length: int
    mean: np.ndarray
    # cache for the `index` property, filled on first access
    _index = None

    @abstractmethod
    def quantile(self, q: Union[float, str]) -> np.ndarray:
        """
        Computes a quantile from the predicted distribution.

        Parameters
        ----------
        q
            Quantile to compute.

        Returns
        -------
        numpy.ndarray
            Value of the quantile across the prediction range.
        """
        pass

    def quantile_ts(self, q: Union[float, str]) -> pd.Series:
        """
        Returns the quantile `q` as a pandas.Series indexed by `index`.
        """
        return pd.Series(data=self.quantile(q), index=self.index)

    @property
    def median(self) -> np.ndarray:
        """
        The 0.5 quantile of the forecast.
        """
        return self.quantile(0.5)

    def plot(
        self,
        prediction_intervals=(50.0, 90.0),
        show_mean=False,
        color="b",
        label=None,
        output_file=None,
        *args,
        **kwargs,
    ):
        """
        Plots the median of the forecast as well as confidence bounds.
        (requires matplotlib and pandas).

        Parameters
        ----------
        prediction_intervals : float or list of floats in [0, 100]
            Confidence interval size(s). If a list, it will stack the error
            plots for each confidence interval. Only relevant for error styles
            with "ci" in the name.
        show_mean : boolean
            Whether to also show the mean of the forecast.
        color : matplotlib color name or dictionary
            The color used for plotting the forecast.
        label : string
            A label (prefix) that is used for the forecast
        output_file : str or None, default None
            Output path for the plot file. If None, plot is not saved to file.
        args :
            Other arguments are passed to main plot() call
        kwargs :
            Other keyword arguments are passed to main plot() call
        """

        # matplotlib==2.0.* gives errors in Brazil builds and has to be
        # imported locally
        import matplotlib.pyplot as plt

        # The documented contract allows a single number; wrap it so that the
        # loops below can iterate over it.
        if isinstance(prediction_intervals, (int, float, np.number)):
            prediction_intervals = (prediction_intervals,)

        label_prefix = "" if label is None else label + "-"

        for c in prediction_intervals:
            assert 0.0 <= c <= 100.0

        # Every interval c contributes the percentiles 50 -/+ c/2; the median
        # (50) is always included.
        ps = [50.0] + [
            50.0 + f * c / 2.0 for c in prediction_intervals for f in [-1.0, +1.0]
        ]
        percentiles_sorted = sorted(set(ps))

        def alpha_for_percentile(p):
            return (p / 100.0) ** 0.3

        ps_data = [self.quantile(p / 100.0) for p in percentiles_sorted]
        i_p50 = len(percentiles_sorted) // 2

        p50_data = ps_data[i_p50]
        p50_series = pd.Series(data=p50_data, index=self.index)
        p50_series.plot(color=color, ls="-", label=f"{label_prefix}median")

        if show_mean:
            # Use the `mean` attribute every forecast provides rather than
            # sample storage, which only sample-based forecasts have.
            pd.Series(data=self.mean, index=self.index).plot(
                *args, color=color, ls=":", label=f"{label_prefix}mean", **kwargs,
            )

        # Pair the i-th lowest percentile with the i-th highest one and shade
        # the band between them.
        for i in range(len(percentiles_sorted) // 2):
            ptile = percentiles_sorted[i]
            alpha = alpha_for_percentile(ptile)
            plt.fill_between(
                self.index,
                ps_data[i],
                ps_data[-i - 1],
                *args,
                facecolor=color,
                alpha=alpha,
                interpolate=True,
                **kwargs,
            )
            # Hack to create labels for the error intervals.
            # Doesn't actually plot anything, because we only pass a single data point
            pd.Series(data=p50_data[:1], index=self.index[:1]).plot(
                *args,
                color=color,
                alpha=alpha,
                linewidth=10,
                label=f"{label_prefix}{100 - ptile * 2}%",
                **kwargs,
            )
        if output_file:
            plt.savefig(output_file)

    @property
    def index(self) -> pd.DatetimeIndex:
        """
        Time index of the forecast: `prediction_length` periods of frequency
        `freq` starting at `start_date`. Built once and then cached.
        """
        if self._index is None:
            self._index = pd.date_range(
                self.start_date, periods=self.prediction_length, freq=self.freq
            )
        return self._index

    def as_json_dict(self, config: "Config") -> dict:
        """
        Builds a dictionary with the parts of the forecast selected by
        ``config.output_types``.

        The "samples" entry is an empty list here; subclasses that hold
        samples are expected to override it.
        """
        result = {}

        if OutputType.mean in config.output_types:
            result["mean"] = self.mean.tolist()

        if OutputType.quantiles in config.output_types:
            quantiles = map(Quantile.parse, config.quantiles)

            result["quantiles"] = {
                quantile.name: self.quantile(quantile.value).tolist()
                for quantile in quantiles
            }

        if OutputType.samples in config.output_types:
            result["samples"] = []

        return result
|
||||
|
||||
|
||||
class SampleForecast(Forecast):
    """
    A `Forecast` object, where the predicted distribution is represented
    internally as samples.

    Parameters
    ----------
    samples
        Array of size (num_samples, prediction_length); a 3-dimensional
        array with a trailing target dimension is accepted as well.
    start_date
        start of the forecast
    freq
        forecast frequency
    item_id
        optional identifier of the forecasted item
    info
        additional information that the forecaster may provide e.g. estimated
        parameters, number of iterations ran etc.
    """

    def __init__(
        self,
        samples: Union[torch.Tensor, np.ndarray],
        start_date: pd.Timestamp,
        freq: str,
        item_id: Optional[str] = None,
        info: Optional[Dict] = None,
    ) -> None:
        assert isinstance(
            samples, (np.ndarray, torch.Tensor)
        ), "samples should be either a numpy array or an torch tensor"
        assert (
            len(np.shape(samples)) == 2 or len(np.shape(samples)) == 3
        ), "samples should be a 2-dimensional or 3-dimensional array. Dimensions found: {}".format(
            len(np.shape(samples))
        )
        # Tensors are moved to host memory; `detach` makes the conversion work
        # for tensors that still require grad.
        self.samples = (
            samples
            if isinstance(samples, np.ndarray)
            else samples.detach().cpu().numpy()
        )
        self._sorted_samples_value = None
        self._mean = None
        self._dim = None
        self.item_id = item_id
        self.info = info

        assert isinstance(
            start_date, pd.Timestamp
        ), "start_date should be a pandas Timestamp object"
        self.start_date = start_date

        assert isinstance(freq, str), "freq should be a string"
        self.freq = freq

    @property
    def _sorted_samples(self):
        # Samples sorted along the sample axis; computed once on first use.
        if self._sorted_samples_value is None:
            self._sorted_samples_value = np.sort(self.samples, axis=0)
        return self._sorted_samples_value

    @property
    def num_samples(self):
        """
        The number of samples representing the forecast.
        """
        return self.samples.shape[0]

    @property
    def prediction_length(self):
        """
        Time length of the forecast.
        """
        return self.samples.shape[1]

    @property
    def mean(self) -> np.ndarray:
        """
        Forecast mean (average over the sample axis), cached after the first
        computation.
        """
        if self._mean is None:
            self._mean = np.mean(self.samples, axis=0)
        return self._mean

    @property
    def mean_ts(self) -> pd.Series:
        """
        Forecast mean, as a pandas.Series object.
        """
        return pd.Series(data=self.mean, index=self.index)

    def quantile(self, q: Union[float, str]) -> np.ndarray:
        """
        Returns the empirical quantile `q`: the sorted sample whose rank is
        closest to ``(num_samples - 1) * q``.
        """
        q = Quantile.parse(q).value
        sample_idx = int(np.round((self.num_samples - 1) * q))
        return self._sorted_samples[sample_idx, :]

    def copy_dim(self, dim: int) -> "SampleForecast":
        """
        Returns a new Forecast object with only the selected sub-dimension.

        Parameters
        ----------
        dim
            The returned forecast object will only represent this dimension.
        """
        if len(self.samples.shape) == 2:
            samples = self.samples
        else:
            target_dim = self.samples.shape[2]
            # Only the upper bound is enforced here.
            assert dim < target_dim, (
                f"must set 0 <= dim < target_dim, but got dim={dim},"
                f" target_dim={target_dim}"
            )
            samples = self.samples[:, :, dim]

        return SampleForecast(
            samples=samples,
            start_date=self.start_date,
            freq=self.freq,
            item_id=self.item_id,
            info=self.info,
        )

    def copy_aggregate(self, agg_fun: Callable) -> "SampleForecast":
        """
        Returns a new Forecast object with a time series aggregated over the
        dimension axis.

        Parameters
        ----------
        agg_fun
            Aggregation function that defines the aggregation operation
            (typically mean or sum). It is called as ``agg_fun(samples, axis=2)``.
        """
        if len(self.samples.shape) == 2:
            samples = self.samples
        else:
            # Aggregate over target dimension axis
            samples = agg_fun(self.samples, axis=2)
        return SampleForecast(
            samples=samples,
            start_date=self.start_date,
            freq=self.freq,
            item_id=self.item_id,
            info=self.info,
        )

    def dim(self) -> int:
        """
        Returns the dimensionality of the forecast object.
        """
        if self._dim is not None:
            return self._dim
        if len(self.samples.shape) == 2:
            # univariate target
            # shape: (num_samples, prediction_length)
            return 1
        # multivariate target
        # shape: (num_samples, prediction_length, target_dim)
        return self.samples.shape[2]

    def as_json_dict(self, config: "Config") -> dict:
        """
        Same as the base implementation, but fills the "samples" entry with
        the stored samples when they are requested.
        """
        result = super().as_json_dict(config)

        if OutputType.samples in config.output_types:
            result["samples"] = self.samples.tolist()

        return result

    def __repr__(self):
        # The opening parenthesis is closed by the last element only, so the
        # joined string is balanced.
        return ", ".join(
            [
                f"SampleForecast({self.samples!r}",
                f"{self.start_date!r}",
                f"{self.freq!r}",
                f"item_id={self.item_id!r}",
                f"info={self.info!r})",
            ]
        )
|
||||
|
||||
|
||||
class QuantileForecast(Forecast):
    """
    A Forecast that contains arrays (i.e. time series) for quantiles and mean

    Parameters
    ----------
    forecast_arrays
        An array of forecasts
    start_date
        start of the forecast
    freq
        forecast frequency
    forecast_keys
        A list of quantiles of the form '0.1', '0.9', etc.,
        and potentially 'mean'. Each entry corresponds to one array in
        forecast_arrays.
    item_id
        optional identifier of the forecasted item
    info
        additional information that the forecaster may provide e.g. estimated
        parameters, number of iterations ran etc.
    """

    def __init__(
        self,
        forecast_arrays: np.ndarray,
        start_date: pd.Timestamp,
        freq: str,
        forecast_keys: List[str],
        item_id: Optional[str] = None,
        info: Optional[Dict] = None,
    ) -> None:
        self.forecast_array = forecast_arrays
        # NOTE(review): the `freq` keyword of pd.Timestamp is deprecated in
        # recent pandas releases -- confirm the supported pandas version.
        self.start_date = pd.Timestamp(start_date, freq=freq)
        self.freq = freq

        # normalize keys
        self.forecast_keys = [
            Quantile.from_str(key).name if key != "mean" else key
            for key in forecast_keys
        ]
        self.item_id = item_id
        self.info = info
        self._dim = None

        shape = self.forecast_array.shape
        assert shape[0] == len(self.forecast_keys), (
            f"The forecast_array (shape={shape}) should have the same "
            f"length as the forecast_keys (len={len(self.forecast_keys)})."
        )
        self.prediction_length = shape[-1]
        # Lookup from normalized key to the array stored at the same position.
        self._forecast_dict = {
            k: self.forecast_array[i] for i, k in enumerate(self.forecast_keys)
        }

        # Returned for any key that was not provided.
        self._nan_out = np.array([np.nan] * self.prediction_length)

    def quantile(self, q: Union[float, str]) -> np.ndarray:
        """
        Returns the stored array for quantile `q`, or an all-NaN array of
        length `prediction_length` if that quantile is not available.
        """
        q_str = Quantile.parse(q).name
        # We return nan here such that evaluation runs through
        return self._forecast_dict.get(q_str, self._nan_out)

    @property
    def mean(self) -> np.ndarray:
        """
        Forecast mean; all-NaN if no "mean" key was provided.
        """
        return self._forecast_dict.get("mean", self._nan_out)

    def dim(self) -> int:
        """
        Returns the dimensionality of the forecast object.
        """
        if self._dim is not None:
            return self._dim
        if len(self.forecast_array.shape) == 2:
            # 1D target: first axis has one entry per forecast key, last
            # axis is the prediction length.
            return 1
        # Otherwise the second axis is taken as the target dimension.
        return self.forecast_array.shape[1]

    def __repr__(self):
        # The opening parenthesis is closed by the last element only, so the
        # joined string is balanced.
        return ", ".join(
            [
                f"QuantileForecast({self.forecast_array!r}",
                f"start_date={self.start_date!r}",
                f"freq={self.freq!r}",
                f"forecast_keys={self.forecast_keys!r}",
                f"item_id={self.item_id!r}",
                f"info={self.info!r})",
            ]
        )
|
||||
|
||||
|
||||
class DistributionForecast(Forecast):
    """
    A `Forecast` object that uses a distribution directly.
    This can for instance be used to represent marginal probability
    distributions for each time point -- although joint distributions are
    also possible, e.g. when using MultiVariateGaussian).

    Parameters
    ----------
    distribution
        Distribution object. This should represent the entire prediction
        length, i.e., if we draw `num_samples` samples from the distribution,
        the sample shape should be

           samples = trans_dist.sample(num_samples)
           samples.shape -> (num_samples, prediction_length)

    start_date
        start of the forecast
    freq
        forecast frequency
    item_id
        optional identifier of the forecasted item
    info
        additional information that the forecaster may provide e.g. estimated
        parameters, number of iterations ran etc.
    """

    def __init__(
        self,
        distribution: Distribution,
        start_date: pd.Timestamp,
        freq: str,
        item_id: Optional[str] = None,
        info: Optional[Dict] = None,
    ) -> None:
        self.distribution = distribution
        # Batch and event shape are concatenated; the leading entry is used
        # as the prediction length.
        batch_shape = self.distribution.batch_shape
        event_shape = self.distribution.event_shape
        self.shape = batch_shape + event_shape
        self.prediction_length = self.shape[0]
        self.item_id = item_id
        self.info = info

        assert isinstance(
            start_date, pd.Timestamp
        ), "start_date should be a pandas Timestamp object"
        self.start_date = start_date

        assert isinstance(freq, str), "freq should be a string"
        self.freq = freq
        self._mean = None

    @property
    def mean(self) -> np.ndarray:
        """
        Forecast mean, taken from the distribution and cached as a numpy
        array.
        """
        if self._mean is None:
            self._mean = self.distribution.mean.cpu().numpy()
        return self._mean

    @property
    def mean_ts(self) -> pd.Series:
        """
        Forecast mean, as a pandas.Series object.
        """
        mean_values = self.mean
        return pd.Series(data=mean_values, index=self.index)

    def quantile(self, level: Union[float, str]) -> np.ndarray:
        """
        Evaluates the inverse CDF of the distribution at `level` and returns
        the result as a numpy array.
        """
        level_value = Quantile.parse(level).value
        level_tensor = torch.tensor([level_value])
        return self.distribution.icdf(level_tensor).cpu().numpy()

    def to_sample_forecast(self, num_samples: int = 200) -> SampleForecast:
        """
        Draws `num_samples` samples from the distribution and wraps them in a
        `SampleForecast` carrying the same metadata.
        """
        drawn = self.distribution.sample((num_samples,))
        return SampleForecast(
            samples=drawn,
            start_date=self.start_date,
            freq=self.freq,
            item_id=self.item_id,
            info=self.info,
        )
|
||||
@@ -1,195 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Iterator, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.dataset import InferenceDataLoader, DataEntry, FieldName
|
||||
from pts.modules import DistributionOutput
|
||||
from .forecast import Forecast, DistributionForecast, QuantileForecast, SampleForecast
|
||||
|
||||
OutputTransform = Callable[[DataEntry, np.ndarray], np.ndarray]
|
||||
|
||||
|
||||
def _extract_instances(x: Any) -> Any:
    """
    Helper generator to extract individual instances from batched
    network results.

    For a tensor (or numpy array) `a`
      _extract_instances(a) -> [a[0], a[1], ...]

    For (nested) tuples of tensors `(a, (b, c))`
      _extract_instances((a, (b, c)) -> [(a[0], (b[0], c[0])), (a[1], (b[1], c[1])), ...]

    Lists are handled like tuples but yield lists. `None` yields `None`
    forever, so it only makes sense nested next to a finite entry.

    Raises
    ------
    AssertionError
        If `x` (or a nested element) has an unsupported type.
    """
    if isinstance(x, (np.ndarray, torch.Tensor)):
        for i in range(x.shape[0]):
            yield x[i]
    elif isinstance(x, tuple):
        # zip stops at the shortest member, which bounds the infinite
        # generator produced for None entries.
        for m in zip(*[_extract_instances(y) for y in x]):
            yield tuple(m)
    elif isinstance(x, list):
        for m in zip(*[_extract_instances(y) for y in x]):
            yield list(m)
    elif x is None:
        while True:
            yield None
    else:
        # Raised explicitly (instead of `assert False`) so that the check is
        # not stripped when Python runs with -O.
        raise AssertionError(
            f"cannot extract instances from object of type {type(x).__name__}"
        )
|
||||
|
||||
|
||||
class ForecastGenerator(ABC):
    """
    Interface for classes that turn the output of a prediction network into
    `Forecast` objects.
    """

    @abstractmethod
    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[Forecast]:
        """
        Produce forecasts for the batches of `inference_data_loader`.

        Implementations in this module feed the batch entries named in
        `input_names` to `prediction_net`, optionally pass the result through
        `output_transform`, and yield one forecast per batch item.
        """
        ...
|
||||
|
||||
|
||||
class DistributionForecastGenerator(ForecastGenerator):
    """
    Generator that wraps the network output into `DistributionForecast`
    objects, using `distr_output` to build one distribution per batch item.
    """

    def __init__(self, distr_output: DistributionOutput) -> None:
        self.distr_output = distr_output

    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[DistributionForecast]:
        """
        Yield one `DistributionForecast` per item of every batch.

        `num_samples` is accepted for interface compatibility and not used.
        """
        for batch in inference_data_loader:
            net_inputs = [batch[name] for name in input_names]
            outputs = prediction_net(*net_inputs)
            if output_transform is not None:
                outputs = output_transform(batch, outputs)

            # All distributions of the batch are built before the first
            # forecast is yielded.
            distributions = []
            for distr_args in _extract_instances(outputs):
                distributions.append(self.distr_output.distribution(*distr_args))

            emitted = 0
            for position, distr in enumerate(distributions):
                start = batch["forecast_start"][position]
                if FieldName.ITEM_ID in batch:
                    item_id = batch[FieldName.ITEM_ID][position]
                else:
                    item_id = None
                if "info" in batch:
                    info = batch["info"][position]
                else:
                    info = None
                yield DistributionForecast(
                    distr, start_date=start, freq=freq, item_id=item_id, info=info,
                )
                emitted = position + 1
            # every batch item must have produced exactly one forecast
            assert emitted == len(batch["forecast_start"])
|
||||
|
||||
|
||||
class QuantileForecastGenerator(ForecastGenerator):
    """
    Generator that wraps the network output into `QuantileForecast` objects,
    labelling the arrays of every item with the configured `quantiles`.
    """

    def __init__(self, quantiles: List[str]) -> None:
        self.quantiles = quantiles

    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[Forecast]:
        """
        Yield one `QuantileForecast` per item of every batch.

        `num_samples` is accepted for interface compatibility and not used.
        """
        for batch in inference_data_loader:
            net_inputs = [batch[name] for name in input_names]
            outputs = prediction_net(*net_inputs).cpu().numpy()
            if output_transform is not None:
                outputs = output_transform(batch, outputs)

            emitted = 0
            for position, quantile_arrays in enumerate(outputs):
                start = batch["forecast_start"][position]
                if FieldName.ITEM_ID in batch:
                    item_id = batch[FieldName.ITEM_ID][position]
                else:
                    item_id = None
                if "info" in batch:
                    info = batch["info"][position]
                else:
                    info = None
                yield QuantileForecast(
                    quantile_arrays,
                    start_date=start,
                    freq=freq,
                    item_id=item_id,
                    info=info,
                    forecast_keys=self.quantiles,
                )
                emitted = position + 1
            # every batch item must have produced exactly one forecast
            assert emitted == len(batch["forecast_start"])
|
||||
|
||||
|
||||
class SampleForecastGenerator(ForecastGenerator):
    """
    Generator that wraps the network output into `SampleForecast` objects,
    optionally re-running the network until enough samples are collected.
    """

    @validated()
    def __init__(self):
        pass

    def __call__(
        self,
        inference_data_loader: InferenceDataLoader,
        prediction_net: nn.Module,
        input_names: List[str],
        freq: str,
        output_transform: Optional[OutputTransform],
        num_samples: Optional[int],
        **kwargs
    ) -> Iterator[Forecast]:
        """
        Yield one `SampleForecast` per item of every batch.

        When `num_samples` is truthy, the network is evaluated repeatedly on
        the same inputs until at least that many samples per item exist, and
        each item is then cut down to exactly `num_samples` samples.
        """
        for batch in inference_data_loader:
            net_inputs = [batch[name] for name in input_names]
            outputs = prediction_net(*net_inputs).cpu().numpy()
            if output_transform is not None:
                outputs = output_transform(batch, outputs)

            if num_samples:
                gathered = [outputs]
                # the leading axis of one item's output is used as its
                # sample count
                n_gathered = outputs[0].shape[0]
                while n_gathered < num_samples:
                    outputs = prediction_net(*net_inputs).cpu().numpy()
                    if output_transform is not None:
                        outputs = output_transform(batch, outputs)
                    gathered.append(outputs)
                    n_gathered += outputs[0].shape[0]
                # regroup per item, join the runs, drop surplus samples
                outputs = [
                    np.concatenate(per_item)[:num_samples]
                    for per_item in zip(*gathered)
                ]
                assert len(outputs[0]) == num_samples

            emitted = 0
            for position, item_samples in enumerate(outputs):
                start = batch["forecast_start"][position]
                if FieldName.ITEM_ID in batch:
                    item_id = batch[FieldName.ITEM_ID][position]
                else:
                    item_id = None
                if "info" in batch:
                    info = batch["info"][position]
                else:
                    info = None
                yield SampleForecast(
                    item_samples,
                    start_date=start,
                    freq=freq,
                    item_id=item_id,
                    info=info,
                )
                emitted = position + 1
            # every batch item must have produced exactly one forecast
            assert emitted == len(batch["forecast_start"])
|
||||
@@ -6,7 +6,7 @@ import torch.nn as nn
|
||||
|
||||
from pts import Trainer
|
||||
from pts.dataset import FieldName
|
||||
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
|
||||
from pts.model import PyTorchEstimator, Predictor, PyTorchPredictor, copy_parameters
|
||||
from pts.transform import (
|
||||
InstanceSplitter,
|
||||
Transformation,
|
||||
@@ -19,7 +19,7 @@ from pts.transform import (
|
||||
from .lstnet_network import LSTNetTrain, LSTNetPredict
|
||||
|
||||
|
||||
class LSTNetEstimator(PTSEstimator):
|
||||
class LSTNetEstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
freq: str,
|
||||
@@ -110,7 +110,7 @@ class LSTNetEstimator(PTSEstimator):
|
||||
transformation: Transformation,
|
||||
trained_network: LSTNetTrain,
|
||||
device: torch.device,
|
||||
) -> PTSPredictor:
|
||||
) -> PyTorchPredictor:
|
||||
prediction_network = LSTNetPredict(
|
||||
num_series=self.num_series,
|
||||
channels=self.channels,
|
||||
@@ -131,7 +131,7 @@ class LSTNetEstimator(PTSEstimator):
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -110,7 +110,7 @@ class LSTNetBase(nn.Module):
|
||||
) -> torch.Tensor:
|
||||
scaled_past_target, scale = self.scaler(
|
||||
past_target[..., -self.context_length :], # [B, C, T]
|
||||
past_observed_values[..., -self.context_length :] # [B, C, T]
|
||||
past_observed_values[..., -self.context_length :], # [B, C, T]
|
||||
)
|
||||
|
||||
# CNN
|
||||
@@ -121,7 +121,7 @@ class LSTNetBase(nn.Module):
|
||||
# RNN
|
||||
r = c.permute(2, 0, 1) # [F (T), B, C]
|
||||
_, r = self.rnn(r) # [1, B, H]
|
||||
r = self.dropout(r.squeeze(0)) # [B, H]
|
||||
r = self.dropout(r.squeeze(0)) # [B, H]
|
||||
|
||||
# Skip-RNN
|
||||
skip_c = c[..., -self.conv_skip * self.skip_size :]
|
||||
@@ -174,7 +174,7 @@ class LSTNetTrain(LSTNetBase):
|
||||
if self.horizon:
|
||||
future_target = future_target[..., -1:]
|
||||
|
||||
loss = self.loss_fn(ret*scale, future_target)
|
||||
loss = self.loss_fn(ret * scale, future_target)
|
||||
return loss
|
||||
|
||||
|
||||
@@ -183,6 +183,6 @@ class LSTNetPredict(LSTNetBase):
|
||||
self, past_target: torch.Tensor, past_observed_values: torch.Tensor
|
||||
) -> torch.Tensor:
|
||||
ret, scale = super().forward(past_target, past_observed_values)
|
||||
ret = (ret*scale).permute(0, 2, 1)
|
||||
ret = (ret * scale).permute(0, 2, 1)
|
||||
|
||||
return ret.unsqueeze(1)
|
||||
|
||||
@@ -164,6 +164,7 @@ class NBEATSEnsembleEstimator(Estimator):
|
||||
**kwargs
|
||||
Arguments passed down to the individual estimators.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
freq: str,
|
||||
|
||||
@@ -5,7 +5,7 @@ import torch.nn as nn
|
||||
|
||||
from pts import Trainer
|
||||
from pts.dataset import FieldName
|
||||
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
|
||||
from pts.model import PyTorchEstimator, Predictor, PyTorchPredictor, copy_parameters
|
||||
from pts.transform import (
|
||||
InstanceSplitter,
|
||||
Transformation,
|
||||
@@ -20,7 +20,7 @@ from .n_beats_network import (
|
||||
)
|
||||
|
||||
|
||||
class NBEATSEstimator(PTSEstimator):
|
||||
class NBEATSEstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
freq: str,
|
||||
@@ -124,10 +124,14 @@ class NBEATSEstimator(PTSEstimator):
|
||||
# conditioning part and a to-predict part, for each training example.
|
||||
def create_transformation(self) -> Transformation:
|
||||
return Chain(
|
||||
[ RemoveFields(
|
||||
field_names=[FieldName.FEAT_STATIC_REAL,
|
||||
FieldName.FEAT_DYNAMIC_REAL,
|
||||
FieldName.FEAT_DYNAMIC_CAT]),
|
||||
[
|
||||
RemoveFields(
|
||||
field_names=[
|
||||
FieldName.FEAT_STATIC_REAL,
|
||||
FieldName.FEAT_DYNAMIC_REAL,
|
||||
FieldName.FEAT_DYNAMIC_CAT,
|
||||
]
|
||||
),
|
||||
InstanceSplitter(
|
||||
target_field=FieldName.TARGET,
|
||||
is_pad_field=FieldName.IS_PAD,
|
||||
@@ -137,11 +141,11 @@ class NBEATSEstimator(PTSEstimator):
|
||||
past_length=self.context_length,
|
||||
future_length=self.prediction_length,
|
||||
time_series_fields=[],
|
||||
)
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
def create_training_network(self, device: torch.device) -> NBEATSTrainingNetwork:
|
||||
|
||||
def create_training_network(self, device: torch.device) -> NBEATSTrainingNetwork:
|
||||
return NBEATSTrainingNetwork(
|
||||
prediction_length=self.prediction_length,
|
||||
context_length=self.context_length,
|
||||
@@ -156,10 +160,9 @@ class NBEATSEstimator(PTSEstimator):
|
||||
freq=self.freq,
|
||||
).to(device)
|
||||
|
||||
|
||||
def create_predictor(
|
||||
self,
|
||||
transformation: Transformation,
|
||||
self,
|
||||
transformation: Transformation,
|
||||
trained_network: nn.Module,
|
||||
device: torch.device,
|
||||
) -> Predictor:
|
||||
@@ -172,12 +175,12 @@ class NBEATSEstimator(PTSEstimator):
|
||||
num_block_layers=self.num_block_layers,
|
||||
expansion_coefficient_lengths=self.expansion_coefficient_lengths,
|
||||
sharing=self.sharing,
|
||||
stack_types=self.stack_types
|
||||
stack_types=self.stack_types,
|
||||
).to(device)
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -258,7 +258,8 @@ class NBEATSNetwork(nn.Module):
|
||||
flag = denominator == 0
|
||||
|
||||
return (200 / self.prediction_length) * torch.mean(
|
||||
(torch.abs(future_target - forecast) * torch.logical_not(flag)) / (denominator + flag),
|
||||
(torch.abs(future_target - forecast) * torch.logical_not(flag))
|
||||
/ (denominator + flag),
|
||||
dim=1,
|
||||
)
|
||||
|
||||
@@ -269,7 +270,8 @@ class NBEATSNetwork(nn.Module):
|
||||
flag = denominator == 0
|
||||
|
||||
return (100 / self.prediction_length) * torch.mean(
|
||||
(torch.abs(future_target - forecast) * torch.logical_not(flag)) / (denominator + flag),
|
||||
(torch.abs(future_target - forecast) * torch.logical_not(flag))
|
||||
/ (denominator + flag),
|
||||
dim=1,
|
||||
)
|
||||
|
||||
@@ -292,9 +294,10 @@ class NBEATSNetwork(nn.Module):
|
||||
)
|
||||
flag = seasonal_error == 0
|
||||
|
||||
return (torch.mean(torch.abs(future_target - forecast), dim=1) * torch.logical_not(flag)) / (
|
||||
seasonal_error + flag
|
||||
)
|
||||
return (
|
||||
torch.mean(torch.abs(future_target - forecast), dim=1)
|
||||
* torch.logical_not(flag)
|
||||
) / (seasonal_error + flag)
|
||||
|
||||
|
||||
class NBEATSTrainingNetwork(NBEATSNetwork):
|
||||
@@ -342,4 +345,3 @@ class NBEATSPredictionNetwork(NBEATSNetwork):
|
||||
forecasts = super().forward(past_target=past_target)
|
||||
|
||||
return forecasts.unsqueeze(1)
|
||||
|
||||
|
||||
@@ -1,190 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from pydoc import locate
|
||||
from typing import Iterator, Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import pts
|
||||
from pts.core.serde import dump_json, fqname_for, load_json
|
||||
from pts.dataset import Dataset, DataEntry, InferenceDataLoader
|
||||
from pts.transform import Transformation
|
||||
from .forecast import Forecast
|
||||
from .forecast_generator import ForecastGenerator, SampleForecastGenerator
|
||||
from .utils import get_module_forward_input_names
|
||||
|
||||
OutputTransform = Callable[[DataEntry, np.ndarray], np.ndarray]
|
||||
|
||||
|
||||
class Predictor(ABC):
    """
    Abstract base class for predictors.

    Parameters
    ----------
    prediction_length
        Stored on the instance as `prediction_length`.
    freq
        Stored on the instance as `freq`.
    """

    __version__: str = pts.__version__

    def __init__(self, prediction_length: int, freq: str) -> None:
        self.prediction_length = prediction_length
        self.freq = freq

    @abstractmethod
    def predict(self, dataset: Dataset, **kwargs) -> Iterator[Forecast]:
        """
        Compute forecasts for the entries of `dataset`.
        """
        pass

    def serialize(self, path: Path) -> None:
        """
        Write the predictor's class name (``type.txt``) and version
        information (``version.json``) into the directory `path`.
        """
        # serialize Predictor type
        with (path / "type.txt").open("w") as fp:
            fp.write(fqname_for(self.__class__))
        with (path / "version.json").open("w") as fp:
            json.dump(
                {"model": self.__version__, "pts": pts.__version__}, fp
            )

    @classmethod
    def deserialize(
        cls, path: Path, device: Optional[torch.device] = None
    ) -> "Predictor":
        """
        Load a serialized predictor from the given path

        Parameters
        ----------
        path
            Path to the serialized files predictor.
        device
            Optional pytorch device to be used with the predictor.
            If nothing is passed will use the GPU if available and CPU otherwise.

        Raises
        ------
        IOError
            If the class named in ``type.txt`` cannot be located or is not a
            subclass of `Predictor`.
        """
        # deserialize Predictor type; strip so that a trailing newline in the
        # file does not break the lookup
        with (path / "type.txt").open("r") as fp:
            type_name = fp.readline().strip()
        tpe = locate(type_name)

        # `locate` returns None when the name cannot be resolved, and may
        # return a non-class object; neither can be passed to issubclass.
        if not isinstance(tpe, type):
            raise IOError(
                f"Cannot locate predictor class '{type_name}' "
                f"referenced in {path / 'type.txt'}"
            )

        # ensure that predictor_cls is a subtype of Predictor
        if not issubclass(tpe, Predictor):
            raise IOError(
                f"Class {fqname_for(tpe)} is not "
                f"a subclass of {fqname_for(Predictor)}"
            )
        # call deserialize() for the concrete Predictor type
        return tpe.deserialize(path, device)
|
||||
|
||||
|
||||
class PTSPredictor(Predictor):
    """
    Predictor that wraps a PyTorch prediction network.

    ``predict`` feeds batches from an ``InferenceDataLoader`` through the
    network via the configured ``forecast_generator``; ``serialize`` and
    ``deserialize`` persist and restore the network, the input
    transformation and the remaining constructor parameters.
    """

    def __init__(
        self,
        prediction_net: nn.Module,
        batch_size: int,
        prediction_length: int,
        freq: str,
        device: torch.device,
        input_transform: Transformation,
        # NOTE: this default instance is created once, when the class body
        # is executed, and is shared by every predictor relying on it.
        forecast_generator: ForecastGenerator = SampleForecastGenerator(),
        output_transform: Optional[OutputTransform] = None,
        dtype: np.dtype = np.float32,
    ) -> None:
        super().__init__(prediction_length, freq)
        # names of the network's forward() inputs, derived from the module
        self.input_names = get_module_forward_input_names(prediction_net)
        self.prediction_net = prediction_net
        self.batch_size = batch_size
        self.input_transform = input_transform
        self.forecast_generator = forecast_generator
        self.output_transform = output_transform
        self.device = device
        self.dtype = dtype

    def predict(
        self, dataset: Dataset, num_samples: Optional[int] = None
    ) -> Iterator[Forecast]:
        """
        Lazily yield the forecasts produced for ``dataset``.

        The network is switched to eval mode and run under
        ``torch.no_grad()``. ``num_samples`` is passed through to the
        forecast generator.
        """
        inference_data_loader = InferenceDataLoader(
            dataset,
            self.input_transform,
            self.batch_size,
            device=self.device,
            dtype=self.dtype,
        )

        self.prediction_net.eval()

        with torch.no_grad():
            yield from self.forecast_generator(
                inference_data_loader=inference_data_loader,
                prediction_net=self.prediction_net,
                input_names=self.input_names,
                freq=self.freq,
                output_transform=self.output_transform,
                num_samples=num_samples,
            )

    def serialize(self, path: Path) -> None:
        """
        Write the predictor below ``path``.

        In addition to the files written by ``Predictor.serialize`` this
        creates ``prediction_net-network.json``, ``prediction_net`` (the
        state dict), ``input_transform.json``, ``output_transform.json`` and
        ``parameters.json``.
        """
        super().serialize(path)

        # serialize network
        model_name = 'prediction_net'
        with (path / f"{model_name}-network.json").open("w") as fp:
            print(dump_json(self.prediction_net), file=fp)
        torch.save(self.prediction_net.state_dict(), path / "prediction_net")

        # serialize input transformation chain
        with (path / "input_transform.json").open("w") as fp:
            print(dump_json(self.input_transform), file=fp)

        # serialize output transformation chain
        # NOTE(review): deserialize() below does not read this file back.
        with (path / "output_transform.json").open("w") as fp:
            print(dump_json(self.output_transform), file=fp)

        # serialize all remaining constructor parameters
        with (path / "parameters.json").open("w") as fp:
            parameters = dict(
                batch_size=self.batch_size,
                prediction_length=self.prediction_length,
                freq=self.freq,
                dtype=self.dtype,
                forecast_generator=self.forecast_generator,
                input_names=self.input_names,
            )
            print(dump_json(parameters), file=fp)

    @classmethod
    def deserialize(
        cls, path: Path, device: Optional[torch.device] = None
    ) -> "PTSPredictor":
        """
        Restore a predictor written by ``serialize``.

        Parameters
        ----------
        path
            Directory containing the serialized files.
        device
            Device for the restored network and the inference data. When
            ``None``, the GPU is used if available and the CPU otherwise.
        """
        # Resolve the fallback here: passing None on would leave both the
        # network and the data loader without a concrete device.
        if device is None:
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )

        # deserialize constructor parameters
        with (path / "parameters.json").open("r") as fp:
            parameters = load_json(fp.read())

        # deserialize transformation chain
        with (path / "input_transform.json").open("r") as fp:
            transformation = load_json(fp.read())

        # deserialize prediction network
        model_name = 'prediction_net'
        with (path / f"{model_name}-network.json").open("r") as fp:
            prediction_net = load_json(fp.read())

        # map_location lets weights saved on a GPU be restored on a machine
        # without one.
        # NOTE(review): torch.load unpickles the file; only load predictors
        # from trusted sources.
        prediction_net.load_state_dict(
            torch.load(path / "prediction_net", map_location=device)
        )
        # keep the network on the same device the data loader will use
        prediction_net.to(device)

        # input_names is derived from the prediction_net
        if "input_names" in parameters:
            del parameters["input_names"]

        parameters["device"] = device

        return PTSPredictor(
            input_transform=transformation,
            prediction_net=prediction_net,
            **parameters
        )
|
||||
@@ -1,98 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
import re
|
||||
from typing import NamedTuple, Union
|
||||
|
||||
|
||||
class Quantile(NamedTuple):
    """
    A quantile level, held both as a float and as the string used to name
    it (for example in metric names).
    """

    # numeric level, expected to lie in [0, 1]
    value: float
    # textual form of the level, used to build the metric names below
    name: str

    @property
    def loss_name(self):
        """Name of the quantile-loss metric for this level."""
        return f"QuantileLoss[{self.name}]"

    @property
    def weighted_loss_name(self):
        """Name of the weighted quantile-loss metric for this level."""
        return f"wQuantileLoss[{self.name}]"

    @property
    def coverage_name(self):
        """Name of the coverage metric for this level."""
        return f"Coverage[{self.name}]"

    @classmethod
    def checked(cls, value: float, name: str) -> "Quantile":
        """
        Build a quantile after validating that ``value`` lies in [0, 1].

        Raises
        ------
        ValueError
            If ``value`` is outside [0, 1] (this includes NaN).
        """
        if not 0 <= value <= 1:
            raise ValueError(
                f"quantile value should be in [0, 1] but found {value}"
            )

        return cls(value, name)

    @classmethod
    def from_float(cls, quantile: float) -> "Quantile":
        """Build a quantile from a float; its name is ``str(quantile)``."""
        assert isinstance(quantile, float)
        return cls.checked(value=quantile, name=str(quantile))

    @classmethod
    def from_str(cls, quantile: str) -> "Quantile":
        """
        Build a quantile from a string.

        Accepts either a float literal such as ``'0.1'`` (the name keeps the
        string exactly as given) or a two-digit percentile such as ``'p10'``
        (the name becomes the string form of the resulting float).

        Raises
        ------
        ValueError
            If the string has neither form, or the float is outside [0, 1].
        """
        assert isinstance(quantile, str)

        # Only the float conversion is guarded, so that a range error from
        # checked() can never be mistaken for a parsing failure.
        try:
            value = float(quantile)
        except ValueError:
            m = re.match(r"^p(\d{2})$", quantile)

            if m is None:
                raise ValueError(
                    "Quantile string should be of the form "
                    f'"p10", "p50", ... or "0.1", "0.5", ... but found {quantile}'
                ) from None

            # two digits always give a value in [0, 0.99]; no range check
            quantile_float: float = int(m.group(1)) / 100
            return cls(value=quantile_float, name=str(quantile_float))

        return cls.checked(value=value, name=quantile)

    @classmethod
    def parse(cls, quantile: Union["Quantile", float, str]) -> "Quantile":
        """Produces equivalent float and string representation of a given
        quantile level.

        >>> Quantile.parse(0.1)
        Quantile(value=0.1, name='0.1')

        >>> Quantile.parse('0.2')
        Quantile(value=0.2, name='0.2')

        >>> Quantile.parse('0.20')
        Quantile(value=0.2, name='0.20')

        >>> Quantile.parse('p99')
        Quantile(value=0.99, name='0.99')

        >>> Quantile.parse(1)
        Quantile(value=1.0, name='1.0')

        Parameters
        ----------
        quantile
            Quantile, can be a number, a str representing a float e.g. '0.1'
            or a quantile string of the form 'p10'.

        Returns
        -------
        Quantile
            A tuple containing both a float and a string representation of the
            input quantile level.
        """
        if isinstance(quantile, Quantile):
            return quantile

        # Integers (0 and 1 are valid levels) are treated like floats.
        # bool is an int subclass but not a quantile level, so it is left to
        # from_str, which rejects it as before.
        if isinstance(quantile, (int, float)) and not isinstance(quantile, bool):
            return cls.from_float(float(quantile))

        return cls.from_str(quantile)
|
||||
@@ -3,4 +3,3 @@ from .simple_feedforward_network import (
|
||||
SimpleFeedForwardTrainingNetwork,
|
||||
SimpleFeedForwardPredictionNetwork,
|
||||
)
|
||||
|
||||
|
||||
@@ -3,29 +3,39 @@ from typing import List, Optional
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts import Trainer
|
||||
from pts.dataset import FieldName
|
||||
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
|
||||
from pts.modules import DistributionOutput, StudentTOutput
|
||||
from pts.transform import (
|
||||
from gluonts.torch.support.util import copy_parameters
|
||||
from gluonts.torch.model.predictor import PyTorchPredictor
|
||||
from gluonts.torch.modules.distribution_output import DistributionOutput
|
||||
from gluonts.model.predictor import Predictor
|
||||
from gluonts.dataset.field_names import FieldName
|
||||
from gluonts.time_feature import (
|
||||
TimeFeature,
|
||||
get_lags_for_frequency,
|
||||
time_features_from_frequency_str,
|
||||
)
|
||||
from gluonts.transform import (
|
||||
Transformation,
|
||||
Chain,
|
||||
InstanceSplitter,
|
||||
ExpectedNumInstanceSampler,
|
||||
)
|
||||
from pts import Trainer
|
||||
from pts.model import PyTorchEstimator
|
||||
from pts.modules import StudentTOutput
|
||||
|
||||
from .simple_feedforward_network import (
|
||||
SimpleFeedForwardTrainingNetwork,
|
||||
SimpleFeedForwardPredictionNetwork,
|
||||
)
|
||||
|
||||
|
||||
class SimpleFeedForwardEstimator(PTSEstimator):
|
||||
class SimpleFeedForwardEstimator(PyTorchEstimator):
|
||||
"""
|
||||
SimpleFeedForwardEstimator shows how to build a simple MLP model predicting
|
||||
the next target time-steps given the previous ones.
|
||||
|
||||
Given that we want to define a pytorch model trainable by SGD, we inherit the
|
||||
parent class `PTSEstimator` that handles most of the logic for fitting a
|
||||
parent class `PyTorchEstimator` that handles most of the logic for fitting a
|
||||
neural-network.
|
||||
|
||||
We thus only have to define:
|
||||
@@ -148,7 +158,7 @@ class SimpleFeedForwardEstimator(PTSEstimator):
|
||||
transformation: Transformation,
|
||||
trained_network: nn.Module,
|
||||
device: torch.device,
|
||||
) -> PTSPredictor:
|
||||
) -> PyTorchPredictor:
|
||||
prediction_network = SimpleFeedForwardPredictionNetwork(
|
||||
num_hidden_dimensions=self.num_hidden_dimensions,
|
||||
prediction_length=self.prediction_length,
|
||||
@@ -161,7 +171,7 @@ class SimpleFeedForwardEstimator(PTSEstimator):
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -4,8 +4,10 @@ import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Distribution
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.modules import MeanScaler, NOPScaler, DistributionOutput, LambdaLayer
|
||||
from gluonts.core.component import validated
|
||||
from gluonts.torch.modules.distribution_output import DistributionOutput
|
||||
from gluonts.torch.modules.lambda_layer import LambdaLayer
|
||||
from pts.modules import MeanScaler, NOPScaler
|
||||
|
||||
|
||||
class SimpleFeedForwardNetworkBase(nn.Module):
|
||||
@@ -35,6 +37,7 @@ class SimpleFeedForwardNetworkBase(nn.Module):
|
||||
Distribution to fit.
|
||||
kwargs
|
||||
"""
|
||||
|
||||
@validated()
|
||||
def __init__(
|
||||
self,
|
||||
@@ -60,7 +63,7 @@ class SimpleFeedForwardNetworkBase(nn.Module):
|
||||
if i == 0:
|
||||
input_size = context_length
|
||||
else:
|
||||
input_size = dims[i-1]
|
||||
input_size = dims[i - 1]
|
||||
modules += [nn.Linear(input_size, units), nn.ReLU()]
|
||||
if self.batch_normalization:
|
||||
modules.append(nn.BatchNorm1d(units))
|
||||
@@ -83,7 +86,7 @@ class SimpleFeedForwardNetworkBase(nn.Module):
|
||||
past_target,
|
||||
torch.ones_like(past_target), # TODO: pass the actual observed here
|
||||
)
|
||||
|
||||
|
||||
mlp_outputs = self.mlp(scaled_target)
|
||||
distr_args = self.distr_args_proj(mlp_outputs)
|
||||
return self.distr_output.distribution(
|
||||
|
||||
@@ -9,7 +9,7 @@ from pts.feature import (
|
||||
fourier_time_features_from_frequency_str,
|
||||
get_fourier_lags_for_frequency,
|
||||
)
|
||||
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
|
||||
from pts.model import PyTorchEstimator, PyTorchPredictor, copy_parameters
|
||||
from pts.transform import (
|
||||
Transformation,
|
||||
Chain,
|
||||
@@ -27,7 +27,7 @@ from pts.transform import (
|
||||
from .tempflow_network import TempFlowTrainingNetwork, TempFlowPredictionNetwork
|
||||
|
||||
|
||||
class TempFlowEstimator(PTSEstimator):
|
||||
class TempFlowEstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
@@ -49,7 +49,6 @@ class TempFlowEstimator(PTSEstimator):
|
||||
n_hidden=2,
|
||||
conditioning_length: int = 200,
|
||||
dequantize: bool = False,
|
||||
|
||||
scaling: bool = True,
|
||||
pick_incomplete: bool = False,
|
||||
lags_seq: Optional[List[int]] = None,
|
||||
@@ -100,10 +99,16 @@ class TempFlowEstimator(PTSEstimator):
|
||||
def create_transformation(self) -> Transformation:
|
||||
return Chain(
|
||||
[
|
||||
AsNumpyArray(field=FieldName.TARGET, expected_ndim=2,),
|
||||
AsNumpyArray(
|
||||
field=FieldName.TARGET,
|
||||
expected_ndim=2,
|
||||
),
|
||||
# maps the target to (1, T)
|
||||
# if the target data is uni dimensional
|
||||
ExpandDimArray(field=FieldName.TARGET, axis=None,),
|
||||
ExpandDimArray(
|
||||
field=FieldName.TARGET,
|
||||
axis=None,
|
||||
),
|
||||
AddObservedValuesIndicator(
|
||||
target_field=FieldName.TARGET,
|
||||
output_field=FieldName.OBSERVED_VALUES,
|
||||
@@ -176,7 +181,7 @@ class TempFlowEstimator(PTSEstimator):
|
||||
transformation: Transformation,
|
||||
trained_network: TempFlowTrainingNetwork,
|
||||
device: torch.device,
|
||||
) -> PTSPredictor:
|
||||
) -> PyTorchPredictor:
|
||||
prediction_network = TempFlowPredictionNetwork(
|
||||
input_size=self.input_size,
|
||||
target_dim=self.target_dim,
|
||||
@@ -202,7 +207,7 @@ class TempFlowEstimator(PTSEstimator):
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -3,13 +3,12 @@ from typing import List, Optional, Tuple, Union
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from pts.model import weighted_average
|
||||
from pts.modules import RealNVP, MAF, FlowOutput, MeanScaler, NOPScaler
|
||||
|
||||
|
||||
class TempFlowTrainingNetwork(nn.Module):
|
||||
|
||||
@validated()
|
||||
def __init__(
|
||||
self,
|
||||
@@ -55,7 +54,10 @@ class TempFlowTrainingNetwork(nn.Module):
|
||||
batch_first=True,
|
||||
)
|
||||
|
||||
flow_cls = {"RealNVP": RealNVP, "MAF": MAF,}[flow_type]
|
||||
flow_cls = {
|
||||
"RealNVP": RealNVP,
|
||||
"MAF": MAF,
|
||||
}[flow_type]
|
||||
self.flow = flow_cls(
|
||||
input_size=target_dim,
|
||||
n_blocks=n_blocks,
|
||||
@@ -377,7 +379,8 @@ class TempFlowTrainingNetwork(nn.Module):
|
||||
# put together target sequence
|
||||
# (batch_size, seq_len, target_dim)
|
||||
target = torch.cat(
|
||||
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf), dim=1,
|
||||
(past_target_cdf[:, -self.context_length :, ...], future_target_cdf),
|
||||
dim=1,
|
||||
)
|
||||
|
||||
# assert_shape(target, (-1, seq_len, self.target_dim))
|
||||
@@ -519,7 +522,12 @@ class TempFlowPredictionNetwork(TempFlowTrainingNetwork):
|
||||
|
||||
# (batch_size, num_samples, prediction_length, target_dim)
|
||||
return samples.reshape(
|
||||
(-1, self.num_parallel_samples, self.prediction_length, self.target_dim,)
|
||||
(
|
||||
-1,
|
||||
self.num_parallel_samples,
|
||||
self.prediction_length,
|
||||
self.target_dim,
|
||||
)
|
||||
)
|
||||
|
||||
def forward(
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .transformer_estimator import TransformerEstimator
|
||||
from .transformer_estimator import TransformerEstimator
|
||||
|
||||
@@ -11,7 +11,7 @@ from pts.feature import (
|
||||
fourier_time_features_from_frequency_str,
|
||||
get_fourier_lags_for_frequency,
|
||||
)
|
||||
from pts.model import PTSEstimator, Predictor, PTSPredictor, copy_parameters
|
||||
from pts.model import PyTorchEstimator, Predictor, PyTorchPredictor, copy_parameters
|
||||
from pts.modules import DistributionOutput, StudentTOutput
|
||||
from pts.transform import (
|
||||
Transformation,
|
||||
@@ -32,7 +32,7 @@ from .transformer_network import (
|
||||
)
|
||||
|
||||
|
||||
class TransformerEstimator(PTSEstimator):
|
||||
class TransformerEstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
@@ -75,7 +75,9 @@ class TransformerEstimator(PTSEstimator):
|
||||
self.embedding_dimension = embedding_dimension
|
||||
self.num_parallel_samples = num_parallel_samples
|
||||
self.lags_seq = (
|
||||
lags_seq if lags_seq is not None else get_fourier_lags_for_frequency(freq_str=freq)
|
||||
lags_seq
|
||||
if lags_seq is not None
|
||||
else get_fourier_lags_for_frequency(freq_str=freq)
|
||||
)
|
||||
self.time_features = (
|
||||
time_features
|
||||
@@ -117,7 +119,9 @@ class TransformerEstimator(PTSEstimator):
|
||||
field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=np.long
|
||||
),
|
||||
AsNumpyArray(
|
||||
field=FieldName.FEAT_STATIC_REAL, expected_ndim=1, dtype=self.dtype,
|
||||
field=FieldName.FEAT_STATIC_REAL,
|
||||
expected_ndim=1,
|
||||
dtype=self.dtype,
|
||||
),
|
||||
AsNumpyArray(
|
||||
field=FieldName.TARGET,
|
||||
@@ -220,7 +224,7 @@ class TransformerEstimator(PTSEstimator):
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from pts.modules import DistributionOutput, MeanScaler, NOPScaler, FeatureEmbedder
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ def prod(xs):
|
||||
|
||||
|
||||
class TransformerNetwork(nn.Module):
|
||||
|
||||
@validated()
|
||||
def __init__(
|
||||
self,
|
||||
@@ -72,7 +71,8 @@ class TransformerNetwork(nn.Module):
|
||||
self.proj_dist_args = distr_output.get_args_proj(d_model)
|
||||
|
||||
self.embedder = FeatureEmbedder(
|
||||
cardinalities=cardinality, embedding_dims=embedding_dimension,
|
||||
cardinalities=cardinality,
|
||||
embedding_dims=embedding_dimension,
|
||||
)
|
||||
|
||||
if scaling:
|
||||
@@ -82,7 +82,8 @@ class TransformerNetwork(nn.Module):
|
||||
|
||||
# mask
|
||||
self.register_buffer(
|
||||
"tgt_mask", self.transformer.generate_square_subsequent_mask(prediction_length)
|
||||
"tgt_mask",
|
||||
self.transformer.generate_square_subsequent_mask(prediction_length),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -154,9 +155,7 @@ class TransformerNetwork(nn.Module):
|
||||
else:
|
||||
time_feat = torch.cat(
|
||||
(
|
||||
past_time_feat[
|
||||
:, self.history_length - self.context_length :, ...
|
||||
],
|
||||
past_time_feat[:, self.history_length - self.context_length :, ...],
|
||||
future_time_feat,
|
||||
),
|
||||
dim=1,
|
||||
@@ -177,7 +176,7 @@ class TransformerNetwork(nn.Module):
|
||||
# scale shape is (batch_size, 1, *target_shape)
|
||||
_, scale = self.scaler(
|
||||
past_target[:, -self.context_length :, ...],
|
||||
past_observed_values[:, -self.context_length :, ...]
|
||||
past_observed_values[:, -self.context_length :, ...],
|
||||
)
|
||||
embedded_cat = self.embedder(feat_static_cat)
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from pts.feature import (
|
||||
fourier_time_features_from_frequency_str,
|
||||
get_fourier_lags_for_frequency,
|
||||
)
|
||||
from pts.model import PTSEstimator, PTSPredictor, copy_parameters
|
||||
from pts.model import PyTorchEstimator, PyTorchPredictor, copy_parameters
|
||||
from pts.transform import (
|
||||
Transformation,
|
||||
Chain,
|
||||
@@ -24,10 +24,13 @@ from pts.transform import (
|
||||
SetFieldIfNotPresent,
|
||||
TargetDimIndicator,
|
||||
)
|
||||
from .transformer_tempflow_network import TransformerTempFlowTrainingNetwork, TransformerTempFlowPredictionNetwork
|
||||
from .transformer_tempflow_network import (
|
||||
TransformerTempFlowTrainingNetwork,
|
||||
TransformerTempFlowPredictionNetwork,
|
||||
)
|
||||
|
||||
|
||||
class TransformerTempFlowEstimator(PTSEstimator):
|
||||
class TransformerTempFlowEstimator(PyTorchEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
@@ -52,7 +55,6 @@ class TransformerTempFlowEstimator(PTSEstimator):
|
||||
n_hidden=2,
|
||||
conditioning_length: int = 200,
|
||||
dequantize: bool = False,
|
||||
|
||||
scaling: bool = True,
|
||||
pick_incomplete: bool = False,
|
||||
lags_seq: Optional[List[int]] = None,
|
||||
@@ -108,10 +110,16 @@ class TransformerTempFlowEstimator(PTSEstimator):
|
||||
def create_transformation(self) -> Transformation:
|
||||
return Chain(
|
||||
[
|
||||
AsNumpyArray(field=FieldName.TARGET, expected_ndim=2,),
|
||||
AsNumpyArray(
|
||||
field=FieldName.TARGET,
|
||||
expected_ndim=2,
|
||||
),
|
||||
# maps the target to (1, T)
|
||||
# if the target data is uni dimensional
|
||||
ExpandDimArray(field=FieldName.TARGET, axis=None,),
|
||||
ExpandDimArray(
|
||||
field=FieldName.TARGET,
|
||||
axis=None,
|
||||
),
|
||||
AddObservedValuesIndicator(
|
||||
target_field=FieldName.TARGET,
|
||||
output_field=FieldName.OBSERVED_VALUES,
|
||||
@@ -156,7 +164,9 @@ class TransformerTempFlowEstimator(PTSEstimator):
|
||||
]
|
||||
)
|
||||
|
||||
def create_training_network(self, device: torch.device) -> TransformerTempFlowTrainingNetwork:
|
||||
def create_training_network(
|
||||
self, device: torch.device
|
||||
) -> TransformerTempFlowTrainingNetwork:
|
||||
return TransformerTempFlowTrainingNetwork(
|
||||
input_size=self.input_size,
|
||||
target_dim=self.target_dim,
|
||||
@@ -187,7 +197,7 @@ class TransformerTempFlowEstimator(PTSEstimator):
|
||||
transformation: Transformation,
|
||||
trained_network: TransformerTempFlowTrainingNetwork,
|
||||
device: torch.device,
|
||||
) -> PTSPredictor:
|
||||
) -> PyTorchPredictor:
|
||||
prediction_network = TransformerTempFlowPredictionNetwork(
|
||||
input_size=self.input_size,
|
||||
target_dim=self.target_dim,
|
||||
@@ -216,7 +226,7 @@ class TransformerTempFlowEstimator(PTSEstimator):
|
||||
|
||||
copy_parameters(trained_network, prediction_network)
|
||||
|
||||
return PTSPredictor(
|
||||
return PyTorchPredictor(
|
||||
input_transform=transformation,
|
||||
prediction_net=prediction_network,
|
||||
batch_size=self.trainer.batch_size,
|
||||
|
||||
@@ -3,12 +3,11 @@ from typing import List, Optional, Tuple
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from pts.modules import RealNVP, MAF, FlowOutput, MeanScaler, NOPScaler
|
||||
|
||||
|
||||
class TransformerTempFlowTrainingNetwork(nn.Module):
|
||||
|
||||
@validated()
|
||||
def __init__(
|
||||
self,
|
||||
@@ -61,7 +60,10 @@ class TransformerTempFlowTrainingNetwork(nn.Module):
|
||||
activation=act_type,
|
||||
)
|
||||
|
||||
flow_cls = {"RealNVP": RealNVP, "MAF": MAF,}[flow_type]
|
||||
flow_cls = {
|
||||
"RealNVP": RealNVP,
|
||||
"MAF": MAF,
|
||||
}[flow_type]
|
||||
self.flow = flow_cls(
|
||||
input_size=target_dim,
|
||||
n_blocks=n_blocks,
|
||||
@@ -146,9 +148,7 @@ class TransformerTempFlowTrainingNetwork(nn.Module):
|
||||
future_time_feat: Optional[torch.Tensor],
|
||||
future_target_cdf: Optional[torch.Tensor],
|
||||
target_dimension_indicator: torch.Tensor,
|
||||
) -> Tuple[
|
||||
torch.Tensor, torch.Tensor, torch.Tensor,
|
||||
]:
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]:
|
||||
"""
|
||||
Unrolls the RNN encoder over past and, if present, future data.
|
||||
Returns outputs and state of the encoder, plus the scale of
|
||||
@@ -204,7 +204,10 @@ class TransformerTempFlowTrainingNetwork(nn.Module):
|
||||
subsequences_length = self.context_length
|
||||
else:
|
||||
time_feat = torch.cat(
|
||||
(past_time_feat[:, -self.context_length :, ...], future_time_feat,),
|
||||
(
|
||||
past_time_feat[:, -self.context_length :, ...],
|
||||
future_time_feat,
|
||||
),
|
||||
dim=1,
|
||||
)
|
||||
sequence = torch.cat((past_target_cdf, future_target_cdf), dim=1)
|
||||
@@ -516,7 +519,12 @@ class TransformerTempFlowPredictionNetwork(TransformerTempFlowTrainingNetwork):
|
||||
|
||||
# (batch_size, num_samples, prediction_length, target_dim)
|
||||
return samples.reshape(
|
||||
(-1, self.num_parallel_samples, self.prediction_length, self.target_dim,)
|
||||
(
|
||||
-1,
|
||||
self.num_parallel_samples,
|
||||
self.prediction_length,
|
||||
self.target_dim,
|
||||
)
|
||||
)
|
||||
|
||||
def forward(
|
||||
|
||||
+27
-22
@@ -7,30 +7,35 @@ import torch.nn as nn
|
||||
|
||||
def get_module_forward_input_names(module: nn.Module):
    """
    Return the parameter names of ``module.forward`` in declaration order.

    Variadic parameters (``*args`` and ``**kwargs``) are left out.
    """
    params = inspect.signature(module.forward).parameters
    # Compare parameter kinds instead of matching the "*" prefix of the
    # parameter's string form.
    variadic = (
        inspect.Parameter.VAR_POSITIONAL,
        inspect.Parameter.VAR_KEYWORD,
    )
    return [name for name, param in params.items() if param.kind not in variadic]


def copy_parameters(net_source: nn.Module, net_dest: nn.Module) -> None:
    """Load the state dict of ``net_source`` into ``net_dest``."""
    net_dest.load_state_dict(net_source.state_dict())
|
||||
|
||||
|
||||
def weighted_average(
    x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None
) -> torch.Tensor:
    """
    Computes the weighted average of a given tensor across a given dim, masking
    values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Parameters
    ----------
    x
        Input tensor, of which the average must be computed.
    weights
        Weights tensor, of the same shape as `x`. When omitted, the plain
        mean of `x` is returned.
    dim
        The dim along which to average `x`. ``None`` averages over all
        elements.

    Returns
    -------
    Tensor:
        The tensor with values averaged along the specified `dim`.
    """
    # ``dim`` is compared against None explicitly: a truthiness test would
    # treat ``dim=0`` as "no dim" and silently reduce over every axis.
    if weights is None:
        return x.mean() if dim is None else x.mean(dim=dim)

    product = x * weights
    # entries with zero weight contribute exactly 0, even if x is nan/inf
    masked = torch.where(weights != 0, product, torch.zeros_like(product))

    if dim is None:
        total = masked.sum()
        total_weight = weights.sum()
    else:
        total = masked.sum(dim=dim)
        total_weight = weights.sum(dim=dim)

    # the divisor is floored at 1, so an all-zero weight slice yields 0
    return total / torch.clamp(total_weight, min=1.0)
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
from .distribution_output import (
|
||||
ArgProj,
|
||||
Output,
|
||||
DistributionOutput,
|
||||
NormalOutput,
|
||||
StudentTOutput,
|
||||
BetaOutput,
|
||||
@@ -20,5 +17,4 @@ from .distribution_output import (
|
||||
)
|
||||
from .feature import FeatureEmbedder, FeatureAssembler
|
||||
from .flows import RealNVP, MAF
|
||||
from .lambda_layer import LambdaLayer
|
||||
from .scaler import MeanScaler, NOPScaler
|
||||
|
||||
@@ -19,7 +19,8 @@ from torch.distributions import (
|
||||
MultivariateNormal,
|
||||
TransformedDistribution,
|
||||
AffineTransform,
|
||||
Poisson)
|
||||
Poisson,
|
||||
)
|
||||
|
||||
from pts.distributions import (
|
||||
ZeroInflatedPoisson,
|
||||
@@ -29,79 +30,13 @@ from pts.distributions import (
|
||||
ImplicitQuantile,
|
||||
TransformedImplicitQuantile,
|
||||
)
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from gluonts.torch.modules.distribution_output import (
|
||||
DistributionOutput,
|
||||
LambdaLayer,
|
||||
PtArgProj,
|
||||
)
|
||||
from pts.modules.iqn_modules import ImplicitQuantileModule
|
||||
from .lambda_layer import LambdaLayer
|
||||
|
||||
|
||||
class ArgProj(nn.Module):
    """
    Projects an input tensor onto one linear output per entry of
    ``args_dim`` and passes the results through ``domain_map``.
    """

    def __init__(
        self,
        in_features: int,
        args_dim: Dict[str, int],
        domain_map: Callable[..., Tuple[torch.Tensor]],
        dtype: np.dtype = np.float32,
        prefix: Optional[str] = None,
        **kwargs,
    ):
        # ``prefix`` is accepted but not used by this class
        super().__init__(**kwargs)
        self.args_dim = args_dim
        self.dtype = dtype

        # one linear layer per argument, in the order of ``args_dim``
        projections = nn.ModuleList()
        for out_features in args_dim.values():
            projections.append(nn.Linear(in_features, out_features))
        self.proj = projections

        self.domain_map = domain_map

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
        """Apply every projection to ``x`` and map the raw outputs."""
        raw_outputs = []
        for layer in self.proj:
            raw_outputs.append(layer(x))
        return self.domain_map(*raw_outputs)
|
||||
|
||||
|
||||
class Output(ABC):
    """
    Base class describing how network outputs are projected to arguments.

    Subclasses provide ``args_dim`` and the ``domain_map`` classmethod;
    ``get_args_proj`` builds the matching ``ArgProj`` layer.
    """

    in_features: int
    args_dim: Dict[str, int]
    _dtype: np.dtype = np.float32

    @property
    def dtype(self):
        """Dtype handed to the projection layer."""
        return self._dtype

    @dtype.setter
    def dtype(self, dtype: np.dtype):
        self._dtype = dtype

    def get_args_proj(self, in_features: int, prefix: Optional[str] = None) -> ArgProj:
        """Create an ``ArgProj`` for ``in_features`` inputs using this output's settings."""
        projection_settings = dict(
            in_features=in_features,
            args_dim=self.args_dim,
            # domain_map is wrapped in a LambdaLayer before being handed over
            domain_map=LambdaLayer(self.domain_map),
            prefix=prefix,
            dtype=self.dtype,
        )
        return ArgProj(**projection_settings)

    @abstractclassmethod
    def domain_map(cls, *args: torch.Tensor):
        """Map raw projection outputs to arguments; defined by subclasses."""
        pass
|
||||
|
||||
|
||||
class DistributionOutput(Output, ABC):
    """
    Output that builds a distribution of class ``distr_cls`` from projected
    arguments, optionally rescaled by ``scale``.
    """

    distr_cls: type

    @validated()
    def __init__(self) -> None:
        pass

    def distribution(
        self, distr_args, scale: Optional[torch.Tensor] = None
    ) -> Distribution:
        """
        Instantiate ``distr_cls`` with ``distr_args``.

        When ``scale`` is given, the distribution is wrapped in a
        ``TransformedDistribution`` with an affine transform of location 0
        and the given scale; otherwise it is returned as-is.
        """
        base_distr = self.distr_cls(*distr_args)
        if scale is not None:
            rescale = AffineTransform(loc=0, scale=scale)
            return TransformedDistribution(base_distr, [rescale])
        return base_distr
|
||||
|
||||
|
||||
class IndependentDistributionOutput(DistributionOutput):
|
||||
@@ -364,7 +299,9 @@ class PiecewiseLinearOutput(DistributionOutput):
|
||||
return gamma.squeeze(axis=-1), slopes_proj, knot_spacings_proj
|
||||
|
||||
def distribution(
|
||||
self, distr_args, scale: Optional[torch.Tensor] = None,
|
||||
self,
|
||||
distr_args,
|
||||
scale: Optional[torch.Tensor] = None,
|
||||
) -> PiecewiseLinear:
|
||||
if scale is None:
|
||||
return self.distr_cls(*distr_args)
|
||||
@@ -415,7 +352,11 @@ class NormalMixtureOutput(DistributionOutput):
|
||||
class LowRankMultivariateNormalOutput(DistributionOutput):
|
||||
@validated()
|
||||
def __init__(
|
||||
self, dim: int, rank: int, sigma_init: float = 1.0, sigma_minimum: float = 1e-3,
|
||||
self,
|
||||
dim: int,
|
||||
rank: int,
|
||||
sigma_init: float = 1.0,
|
||||
sigma_minimum: float = 1e-3,
|
||||
) -> None:
|
||||
self.distr_cls = LowRankMultivariateNormal
|
||||
self.dim = dim
|
||||
@@ -508,25 +449,16 @@ class FlowOutput(DistributionOutput):
|
||||
return (self.dim,)
|
||||
|
||||
|
||||
class QuantileArgProj(ArgProj):
|
||||
class QuantilePtArgProj(PtArgProj):
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
output_domain_cls: nn.Module,
|
||||
args_dim: Dict[str, int],
|
||||
domain_map: Callable[..., Tuple[torch.Tensor]],
|
||||
dtype: np.dtype = np.float32,
|
||||
prefix: Optional[str] = None,
|
||||
**kwargs,
|
||||
self,
|
||||
in_features: int,
|
||||
output_domain_cls: nn.Module,
|
||||
args_dim: Dict[str, int],
|
||||
domain_map: Callable[..., Tuple[torch.Tensor]],
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
in_features,
|
||||
args_dim,
|
||||
domain_map,
|
||||
dtype,
|
||||
prefix,
|
||||
**kwargs
|
||||
)
|
||||
super().__init__(in_features, args_dim, domain_map, **kwargs)
|
||||
self.output_domain_cls = output_domain_cls
|
||||
self.proj = ImplicitQuantileModule(in_features, output_domain_cls)
|
||||
|
||||
@@ -535,8 +467,8 @@ class QuantileArgProj(ArgProj):
|
||||
forecast_length = x.shape[1]
|
||||
device = x.device
|
||||
taus = torch.rand(size=(batch_size, forecast_length), device=device)
|
||||
self.register_buffer('taus', taus)
|
||||
self.register_buffer('nn_ouput', x.clone().detach())
|
||||
self.register_buffer("taus", taus)
|
||||
self.register_buffer("nn_ouput", x.clone().detach())
|
||||
predicted_quantiles = self.proj(x, taus)
|
||||
return self.domain_map(predicted_quantiles)
|
||||
|
||||
@@ -548,6 +480,7 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
|
||||
output_domain_cls: type = nn.Module
|
||||
quantile_arg_proj: type = nn.Module
|
||||
|
||||
@validated()
|
||||
def __init__(self, output_domain: str) -> None:
|
||||
super().__init__()
|
||||
self.set_output_domain_map(output_domain)
|
||||
@@ -559,14 +492,17 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
|
||||
"Positive": nn.Softplus,
|
||||
"Real": nn.Identity,
|
||||
}
|
||||
assert output_domain in available_domain_map_cls.keys(), \
|
||||
"Only the following output domains are allowed: {}".format(available_domain_map_cls.keys())
|
||||
assert (
|
||||
output_domain in available_domain_map_cls.keys()
|
||||
), "Only the following output domains are allowed: {}".format(
|
||||
available_domain_map_cls.keys()
|
||||
)
|
||||
output_domain_cls = available_domain_map_cls[output_domain]
|
||||
cls.output_domain_cls = output_domain_cls
|
||||
|
||||
@classmethod
|
||||
def set_args_proj(cls):
|
||||
cls.quantile_arg_proj = QuantileArgProj(
|
||||
cls.quantile_arg_proj = QuantilePtArgProj(
|
||||
in_features=cls.in_features,
|
||||
output_domain_cls=cls.output_domain_cls,
|
||||
args_dim=cls.args_dim,
|
||||
@@ -584,11 +520,13 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
|
||||
cls.set_args_proj()
|
||||
return cls.quantile_arg_proj
|
||||
|
||||
def get_args_proj(self, in_features: int, prefix: Optional[str] = None) :
|
||||
def get_args_proj(self, in_features: int, prefix: Optional[str] = None):
|
||||
return self.args_proj(in_features)
|
||||
|
||||
def distribution(
|
||||
self, distr_args, scale: Optional[torch.Tensor] = None,
|
||||
self,
|
||||
distr_args,
|
||||
scale: Optional[torch.Tensor] = None,
|
||||
) -> ImplicitQuantile:
|
||||
|
||||
args_proj = self.get_args_proj(self.in_features)
|
||||
@@ -597,7 +535,8 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
|
||||
implicit_quantile_function=implicit_quantile_function,
|
||||
taus=list(args_proj.buffers())[0],
|
||||
nn_output=list(args_proj.buffers())[1],
|
||||
predicted_quantiles=distr_args)
|
||||
predicted_quantiles=distr_args,
|
||||
)
|
||||
if scale is None:
|
||||
return distr
|
||||
else:
|
||||
@@ -608,6 +547,3 @@ class ImplicitQuantileOutput(IndependentDistributionOutput):
|
||||
@property
|
||||
def event_shape(self) -> Tuple:
|
||||
return ()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,11 @@ import torch.nn as nn
|
||||
|
||||
|
||||
class FeatureEmbedder(nn.Module):
|
||||
def __init__(self, cardinalities: List[int], embedding_dims: List[int],) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
cardinalities: List[int],
|
||||
embedding_dims: List[int],
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.__num_features = len(cardinalities)
|
||||
|
||||
@@ -52,6 +52,6 @@ class QuantileLayer(nn.Module):
|
||||
integers = torch.repeat_interleave(
|
||||
torch.arange(0, self.n_cos_embedding).unsqueeze(dim=0),
|
||||
repeats=tau.shape[-1],
|
||||
dim=0
|
||||
dim=0,
|
||||
).to(tau.device)
|
||||
return torch.cos(pi * tau.unsqueeze(dim=-1) * integers)
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class LambdaLayer(nn.Module):
    """
    Module whose forward pass simply delegates to a user supplied callable.

    Parameters
    ----------
    function
        Callable invoked as ``function(x, *args)`` on every forward pass.
    """

    def __init__(self, function):
        super().__init__()
        self._func = function

    def forward(self, x, *args):
        """Call the wrapped callable with ``x`` and any extra positionals."""
        wrapped = self._func
        result = wrapped(x, *args)
        return result
|
||||
@@ -37,7 +37,7 @@ class Scaler(ABC, nn.Module):
|
||||
Tensor
|
||||
Tensor containing the "scaled" data, shape: (N, T, C) or (N, C, T).
|
||||
Tensor
|
||||
Tensor containing the scale, of shape (N, C) if ``keepdim == False``,
|
||||
Tensor containing the scale, of shape (N, C) if ``keepdim == False``,
|
||||
and shape (N, 1, C) or (N, C, 1) if ``keepdim == True``.
|
||||
"""
|
||||
|
||||
|
||||
+23
-18
@@ -1,14 +1,17 @@
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
from tqdm import tqdm
|
||||
|
||||
from gluonts.core.component import validated
|
||||
from gluonts.dataset.loader import TrainDataLoader, ValidationDataLoader
|
||||
|
||||
|
||||
class Trainer:
|
||||
@validated()
|
||||
def __init__(
|
||||
self,
|
||||
epochs: int = 100,
|
||||
@@ -18,7 +21,7 @@ class Trainer:
|
||||
pin_memory: bool = False,
|
||||
learning_rate: float = 1e-3,
|
||||
weight_decay: float = 1e-6,
|
||||
device: Optional[torch.device] = None,
|
||||
device: Optional[Union[torch.device, str]] = None,
|
||||
) -> None:
|
||||
self.epochs = epochs
|
||||
self.batch_size = batch_size
|
||||
@@ -30,26 +33,26 @@ class Trainer:
|
||||
self.pin_memory = pin_memory
|
||||
|
||||
def __call__(
|
||||
self, net: nn.Module, input_names: List[str], data_loader: DataLoader
|
||||
self,
|
||||
net: nn.Module,
|
||||
train_iter: TrainDataLoader,
|
||||
validation_iter: Optional[ValidationDataLoader] = None,
|
||||
) -> None:
|
||||
optimizer = torch.optim.Adam(
|
||||
net.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
|
||||
)
|
||||
|
||||
writer = SummaryWriter()
|
||||
#writer.add_graph(net)
|
||||
|
||||
for epoch_no in range(self.epochs):
|
||||
# mark epoch start time
|
||||
tic = time.time()
|
||||
avg_epoch_loss = 0.0
|
||||
|
||||
with tqdm(data_loader) as it:
|
||||
with tqdm(train_iter) as it:
|
||||
for batch_no, data_entry in enumerate(it, start=1):
|
||||
optimizer.zero_grad()
|
||||
inputs = [data_entry[k].to(self.device) for k in input_names]
|
||||
#inputs = [data_entry[k].to(self.device) for k in input_names]
|
||||
|
||||
output = net(*inputs)
|
||||
output = net(*data_entry.values())
|
||||
if isinstance(output, (list, tuple)):
|
||||
loss = output[0]
|
||||
else:
|
||||
@@ -63,18 +66,20 @@ class Trainer:
|
||||
},
|
||||
refresh=False,
|
||||
)
|
||||
n_iter = epoch_no*self.num_batches_per_epoch + batch_no
|
||||
writer.add_scalar('Loss/train', loss.item(), n_iter)
|
||||
n_iter = epoch_no * self.num_batches_per_epoch + batch_no
|
||||
#.add_scalar("Loss/train", loss.item(), n_iter)
|
||||
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
if self.num_batches_per_epoch == batch_no:
|
||||
for name, param in net.named_parameters():
|
||||
writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
|
||||
# for name, param in net.named_parameters():
|
||||
# writer.add_histogram(
|
||||
# name, param.clone().cpu().data.numpy(), n_iter
|
||||
# )
|
||||
break
|
||||
|
||||
# mark epoch end time and log time cost of current epoch
|
||||
toc = time.time()
|
||||
|
||||
writer.close()
|
||||
|
||||
#writer.close()
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
from .convert import (
|
||||
AsNumpyArray,
|
||||
ExpandDimArray,
|
||||
VstackFeatures,
|
||||
ConcatFeatures,
|
||||
SwapAxes,
|
||||
ListFeatures,
|
||||
TargetDimIndicator,
|
||||
SampleTargetDim,
|
||||
CDFtoGaussianTransform,
|
||||
cdf_to_gaussian_forward_transform,
|
||||
)
|
||||
from .dataset import TransformedDataset
|
||||
from .feature import (
|
||||
target_transformation_length,
|
||||
AddObservedValuesIndicator,
|
||||
AddConstFeature,
|
||||
AddTimeFeatures,
|
||||
AddAgeFeature,
|
||||
)
|
||||
from .field import (
|
||||
RemoveFields,
|
||||
RenameFields,
|
||||
SetField,
|
||||
SetFieldIfNotPresent,
|
||||
SelectFields,
|
||||
)
|
||||
from .sampler import (
|
||||
InstanceSampler,
|
||||
UniformSplitSampler,
|
||||
TestSplitSampler,
|
||||
ExpectedNumInstanceSampler,
|
||||
BucketInstanceSampler,
|
||||
ContinuousTimePointSampler,
|
||||
ContinuousTimeUniformSampler,
|
||||
)
|
||||
from .split import (
|
||||
shift_timestamp,
|
||||
InstanceSplitter,
|
||||
CanonicalInstanceSplitter,
|
||||
ContinuousTimeInstanceSplitter,
|
||||
)
|
||||
from .transform import (
|
||||
Transformation,
|
||||
Chain,
|
||||
Identity,
|
||||
MapTransformation,
|
||||
SimpleTransformation,
|
||||
AdhocTransform,
|
||||
FlatMapTransformation,
|
||||
FilterTransformation,
|
||||
)
|
||||
@@ -1,713 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from typing import Iterator, List, Tuple, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from scipy.special import erf, erfinv
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.dataset import DataEntry
|
||||
from pts.exception import assert_pts
|
||||
from .transform import (
|
||||
SimpleTransformation,
|
||||
MapTransformation,
|
||||
FlatMapTransformation,
|
||||
)
|
||||
|
||||
|
||||
class AsNumpyArray(SimpleTransformation):
    """
    Converts the value of a field into a numpy array.

    Parameters
    ----------
    field
        Name of the field whose value is converted (the entry is updated
        in place).
    expected_ndim
        Minimum number of dimensions the converted array must have. The
        check is ``value.ndim >= expected_ndim``; when it fails,
        ``assert_pts`` is called with a false condition and a descriptive
        message.
    dtype
        numpy dtype to use.
    """

    @validated()
    def __init__(
        self, field: str, expected_ndim: int, dtype: np.dtype = np.float32
    ) -> None:
        self.field = field
        self.expected_ndim = expected_ndim
        self.dtype = dtype

    def transform(self, data: DataEntry) -> DataEntry:
        """Replace ``data[self.field]`` by its numpy array and return ``data``."""
        value = data[self.field]
        if isinstance(value, (int, float, np.number)):
            # Scalars are not iterable, so the list() round trip used below
            # would fail; convert them directly. Plain ints and numpy scalar
            # types are accepted in addition to floats.
            value = np.asarray(value, dtype=self.dtype)
        else:
            # Calling np.asarray on the raw value produced "ValueError:
            # setting an array element with a sequence" in our tests, hence
            # the explicit list() conversion, see
            # https://stackoverflow.com/questions/43863748/
            value = np.asarray(list(value), dtype=self.dtype)
        assert_pts(
            value.ndim >= self.expected_ndim,
            'Input for field "{self.field}" does not have the required '
            "dimension (field: {self.field}, ndim observed: {value.ndim}, "
            "expected ndim: {self.expected_ndim})",
            value=value,
            self=self,
        )
        data[self.field] = value
        return data
|
||||
|
||||
|
||||
class ExpandDimArray(SimpleTransformation):
    """
    Insert a new axis into a field via ``np.expand_dims``.

    When ``axis`` is ``None`` the data entry is returned untouched.

    Parameters
    ----------
    field
        Field in dictionary to use
    axis
        Axis to expand (see np.expand_dims for details)
    """

    @validated()
    def __init__(self, field: str, axis: Optional[int] = None) -> None:
        self.field = field
        self.axis = axis

    def transform(self, data: DataEntry) -> DataEntry:
        # Guard clause: nothing to do without a configured axis.
        if self.axis is None:
            return data
        expanded = np.expand_dims(data[self.field], axis=self.axis)
        data[self.field] = expanded
        return data
|
||||
|
||||
|
||||
class VstackFeatures(SimpleTransformation):
    """
    Combine several fields into one by stacking them with ``np.vstack``.

    Input fields whose value is ``None`` are skipped.

    Parameters
    ----------
    output_field
        Field name to use for the output
    input_fields
        Fields to stack together
    drop_inputs
        If set to true the input fields will be dropped (except a field
        that is also the output field).
    """

    @validated()
    def __init__(
        self, output_field: str, input_fields: List[str], drop_inputs: bool = True,
    ) -> None:
        self.output_field = output_field
        self.input_fields = input_fields
        if drop_inputs:
            # Never drop the output field itself, even if it is also an input.
            self.cols_to_drop = [
                name for name in self.input_fields if name != output_field
            ]
        else:
            self.cols_to_drop = []

    def transform(self, data: DataEntry) -> DataEntry:
        present = [
            data[name] for name in self.input_fields if data[name] is not None
        ]
        data[self.output_field] = np.vstack(present)
        for name in self.cols_to_drop:
            del data[name]
        return data
|
||||
|
||||
|
||||
class ConcatFeatures(SimpleTransformation):
    """
    Combine several fields into one by joining them with ``np.concatenate``.

    Input fields whose value is ``None`` are skipped.

    Parameters
    ----------
    output_field
        Field name to use for the output
    input_fields
        Fields to concatenate
    drop_inputs
        If set to true the input fields will be dropped (except a field
        that is also the output field).
    """

    @validated()
    def __init__(
        self, output_field: str, input_fields: List[str], drop_inputs: bool = True,
    ) -> None:
        self.output_field = output_field
        self.input_fields = input_fields
        if drop_inputs:
            # Never drop the output field itself, even if it is also an input.
            self.cols_to_drop = [
                name for name in self.input_fields if name != output_field
            ]
        else:
            self.cols_to_drop = []

    def transform(self, data: DataEntry) -> DataEntry:
        present = [
            data[name] for name in self.input_fields if data[name] is not None
        ]
        data[self.output_field] = np.concatenate(present)
        for name in self.cols_to_drop:
            del data[name]
        return data
|
||||
|
||||
|
||||
class SwapAxes(SimpleTransformation):
    """
    Apply `np.swapaxes` to the given fields.

    Parameters
    ----------
    input_fields
        Fields to apply the swap to
    axes
        Pair of axes to exchange
    """

    @validated()
    def __init__(self, input_fields: List[str], axes: Tuple[int, int]) -> None:
        self.input_fields = input_fields
        self.axis1, self.axis2 = axes

    def transform(self, data: DataEntry) -> DataEntry:
        for name in self.input_fields:
            swapped = self.swap(data[name])
            data[name] = swapped
        return data

    def swap(self, v):
        """
        Swap the configured axes of an array, or of every array in a
        (possibly nested) list; any other type raises ``ValueError``.
        """
        if isinstance(v, np.ndarray):
            return np.swapaxes(v, self.axis1, self.axis2)
        if not isinstance(v, list):
            raise ValueError(
                f"Unexpected field type {type(v).__name__}, expected "
                f"np.ndarray or list[np.ndarray]"
            )
        # Lists are handled element-wise, recursing into nested lists.
        return [self.swap(item) for item in v]
|
||||
|
||||
|
||||
class ListFeatures(SimpleTransformation):
    """
    Gather the values of several fields into a list stored in a new field.

    Parameters
    ----------
    output_field
        Field name for output
    input_fields
        Fields to combine into list
    drop_inputs
        If true the input fields will be removed from the result (except a
        field that is also the output field).
    """

    @validated()
    def __init__(
        self, output_field: str, input_fields: List[str], drop_inputs: bool = True,
    ) -> None:
        self.output_field = output_field
        self.input_fields = input_fields
        if drop_inputs:
            # Never drop the output field itself, even if it is also an input.
            self.cols_to_drop = [
                name for name in self.input_fields if name != output_field
            ]
        else:
            self.cols_to_drop = []

    def transform(self, data: DataEntry) -> DataEntry:
        gathered = [data[name] for name in self.input_fields]
        data[self.output_field] = gathered
        for name in self.cols_to_drop:
            del data[name]
        return data
|
||||
|
||||
|
||||
class TargetDimIndicator(SimpleTransformation):
    """
    Label-encoding of the target dimensions.

    Parameters
    ----------
    field_name
        Field that receives the integer labels.
    target_field
        Field whose first axis length determines how many labels are made.
    """

    @validated()
    def __init__(self, field_name: str, target_field: str) -> None:
        self.field_name = field_name
        self.target_field = target_field

    def transform(self, data: DataEntry) -> DataEntry:
        # One label (0, 1, ...) per entry along the first axis of the target.
        num_labels = data[self.target_field].shape[0]
        data[self.field_name] = np.arange(0, num_labels)
        return data
|
||||
|
||||
|
||||
class SampleTargetDim(FlatMapTransformation):
    """
    Samples random dimensions from the target at training time.

    Parameters
    ----------
    field_name
        Field holding the array of target dimension indices.
    target_field
        Base name of the target; the ``past_``/``future_`` prefixed fields
        are sub-selected.
    observed_values_field
        Base name of the observed-values indicator; its ``past_``/``future_``
        prefixed fields are sub-selected as well.
    num_samples
        Number of dimensions kept.
    shuffle
        Whether the dimension indices are shuffled (in place) before the
        first ``num_samples`` are taken.
    """

    @validated()
    def __init__(
        self,
        field_name: str,
        target_field: str,
        observed_values_field: str,
        num_samples: int,
        shuffle: bool = True,
    ) -> None:
        self.field_name = field_name
        self.target_field = target_field
        self.observed_values_field = observed_values_field
        self.num_samples = num_samples
        self.shuffle = shuffle

    def flatmap_transform(
        self, data: DataEntry, is_train: bool, slice_future_target: bool = True
    ) -> Iterator[DataEntry]:
        # Outside of training the entry is passed through unchanged.
        if not is_train:
            yield data
            return

        # (target_dim,)
        selected_dims = data[self.field_name]
        if self.shuffle:
            np.random.shuffle(selected_dims)
        selected_dims = selected_dims[: self.num_samples]
        data[self.field_name] = selected_dims

        # (seq_len, target_dim) -> (seq_len, num_samples)
        for base_name in (self.target_field, self.observed_values_field):
            for prefix in ("past", "future"):
                key = f"{prefix}_{base_name}"
                data[key] = data[key][:, selected_dims]

        yield data
|
||||
|
||||
|
||||
class CDFtoGaussianTransform(MapTransformation):
    """
    Marginal transformation that transforms the target via an empirical CDF
    to a standard gaussian as described here: https://arxiv.org/abs/1910.03002

    To be used in conjunction with a multivariate gaussian to form a copula.
    Note that this transformation is currently intended for multivariate
    targets only.
    """

    @validated()
    def __init__(
        self,
        target_dim: int,
        target_field: str,
        observed_values_field: str,
        cdf_suffix="_cdf",
        max_context_length: Optional[int] = None,
    ) -> None:
        """
        Constructor for CDFtoGaussianTransform.

        Parameters
        ----------
        target_dim
            Dimensionality of the target.
        target_field
            Field that will be transformed.
        observed_values_field
            Field that indicates observed values.
        cdf_suffix
            Suffix to mark the field with the transformed target.
        max_context_length
            Sets the maximum context length for the empirical CDF.
        """
        self.target_field = target_field
        self.past_target_field = "past_" + self.target_field
        self.future_target_field = "future_" + self.target_field
        self.past_observed_field = f"past_{observed_values_field}"
        # Field names under which the intermediate results are stored.
        self.sort_target_field = f"past_{target_field}_sorted"
        self.slopes_field = "slopes"
        self.intercepts_field = "intercepts"
        self.cdf_suffix = cdf_suffix
        self.max_context_length = max_context_length
        self.target_dim = target_dim

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        """
        Add the gaussianized past and future targets to ``data``.

        The sorted past target, slopes and intercepts are written to
        ``data`` first; then, for both the past and the future target, the
        empirical CDF value is mapped through the standard gaussian
        quantile function and stored under ``<field> + cdf_suffix``.
        """
        self._preprocess_data(data, is_train=is_train)
        self._calc_pw_linear_params(data)

        for target_field in [self.past_target_field, self.future_target_field]:
            data[target_field + self.cdf_suffix] = self.standard_gaussian_ppf(
                self._empirical_cdf_forward_transform(
                    data[self.sort_target_field],
                    data[target_field],
                    data[self.slopes_field],
                    data[self.intercepts_field],
                )
            )
        return data

    def _preprocess_data(self, data: DataEntry, is_train: bool):
        """
        Performs several preprocess operations for computing the empirical CDF.
        1) Reshaping the data.
        2) Normalizing the target length.
        3) Adding noise to avoid zero slopes (training only)
        4) Sorting the target to compute the empirical CDF

        The result is stored in ``data[self.sort_target_field]``; nothing is
        returned.

        Parameters
        ----------
        data
            DataEntry with input data. Besides the past target and past
            observed-values fields, the ``"past_is_pad"`` field is read.
        is_train
            if is_train is True, this function adds noise to the target to
            avoid zero slopes in the piece-wise linear function.
        """
        # (target_length, target_dim)
        past_target_vec = data[self.past_target_field].copy()

        # pick only observed values
        target_length, target_dim = past_target_vec.shape

        # (target_length, target_dim)
        # A value counts as observed when its indicator is positive and the
        # corresponding time step is not padding.
        past_observed = (data[self.past_observed_field] > 0) * (
            data["past_is_pad"].reshape((-1, 1)) == 0
        )
        assert past_observed.ndim == 2
        assert target_dim == self.target_dim

        # Keep only the time steps where every dimension is observed.
        past_target_vec = past_target_vec[past_observed.min(axis=1)]

        assert past_target_vec.ndim == 2
        assert past_target_vec.shape[1] == self.target_dim

        expected_length = (
            target_length
            if self.max_context_length is None
            else self.max_context_length
        )

        # NOTE(review): this compares the length *before* unobserved rows
        # were removed. If rows were dropped above while
        # max_context_length is None, no filling happens and the shape
        # assertion further down would fail -- confirm this is intended.
        if target_length != expected_length:
            # Fills values in the case where past_target_vec.shape[-1] <
            # target_length
            # as dataset.loader.BatchBuffer does not support varying shapes
            past_target_vec = CDFtoGaussianTransform._fill(
                past_target_vec, expected_length
            )

        # sorts along the time dimension to compute empirical CDF of each
        # dimension
        if is_train:
            past_target_vec = self._add_noise(past_target_vec)

        past_target_vec.sort(axis=0)

        assert past_target_vec.shape == (expected_length, self.target_dim)

        data[self.sort_target_field] = past_target_vec

    def _calc_pw_linear_params(self, data: DataEntry):
        """
        Calculates the piece-wise linear parameters to interpolate between
        the observed values in the empirical CDF.

        One current limitation is that we use a zero slope line as the last
        piece. Thus, we cannot forecast anything higher than the highest
        observed value.

        The slopes and intercepts are written to ``data[self.slopes_field]``
        and ``data[self.intercepts_field]``; nothing is returned.

        Parameters
        ----------
        data
            Input data entry containing a sorted target field.
        """
        sorted_target = data[self.sort_target_field]
        sorted_target_length, target_dim = sorted_target.shape

        # Quantile levels 0, 1/n, ..., (n-1)/n, repeated for every dimension.
        quantiles = np.stack(
            [np.arange(sorted_target_length) for _ in range(target_dim)], axis=1,
        ) / float(sorted_target_length)

        x_diff = np.diff(sorted_target, axis=0)
        y_diff = np.diff(quantiles, axis=0)

        # Calculate slopes of the pw-linear pieces.
        # Pieces between identical sorted values get slope zero. Note that
        # y_diff / x_diff is still evaluated for every element before
        # np.where selects, so the division also runs where x_diff == 0.
        slopes = np.where(x_diff == 0.0, np.zeros_like(x_diff), y_diff / x_diff)

        # Append the zero-slope last piece so slopes has one row per
        # sorted value.
        zeroes = np.zeros_like(np.expand_dims(slopes[0, :], axis=0))
        slopes = np.append(slopes, zeroes, axis=0)

        # Calculate intercepts of the pw-linear pieces.
        intercepts = quantiles - slopes * sorted_target

        # Populate new fields with the piece-wise linear parameters.
        data[self.slopes_field] = slopes
        data[self.intercepts_field] = intercepts

    def _empirical_cdf_forward_transform(
        self,
        sorted_values: np.ndarray,
        values: np.ndarray,
        slopes: np.ndarray,
        intercepts: np.ndarray,
    ) -> np.ndarray:
        """
        Applies the empirical CDF forward transformation.

        Parameters
        ----------
        sorted_values
            Sorted target vector.
        values
            Values (real valued) that will be transformed to empirical CDF
            values.
        slopes
            Slopes of the piece-wise linear function.
        intercepts
            Intercepts of the piece-wise linear function.

        Returns
        -------
        quantiles
            Empirical CDF quantiles clipped to
            ``[cutoff, 1 - cutoff]``, where ``cutoff`` is the winsorized
            cutoff computed from the number of sorted values.

        """
        m = sorted_values.shape[0]
        quantiles = self._forward_transform(sorted_values, values, slopes, intercepts)

        quantiles = np.clip(
            quantiles, self.winsorized_cutoff(m), 1 - self.winsorized_cutoff(m)
        )
        return quantiles

    @staticmethod
    def _add_noise(x: np.array) -> np.array:
        """
        Return ``x`` plus zero-mean gaussian noise.

        The noise scale of each row is 0.2 times the standard deviation of
        that row, computed along axis 1.
        """
        scale_noise = 0.2
        std = np.sqrt(
            (np.square(x - x.mean(axis=1, keepdims=True))).mean(axis=1, keepdims=True)
        )
        noise = np.random.normal(
            loc=np.zeros_like(x), scale=np.ones_like(x) * std * scale_noise
        )
        x = x + noise
        return x

    @staticmethod
    def _search_sorted(sorted_vec: np.array, to_insert_vec: np.array) -> np.array:
        """
        Finds the indices of the active piece-wise linear function.

        Parameters
        ----------
        sorted_vec
            Sorted target vector.
        to_insert_vec
            Vector for which the indices of the active linear functions
            will be computed

        Returns
        -------
        indices
            Indices mapping to the active linear function.
        """
        indices_left = np.searchsorted(sorted_vec, to_insert_vec, side="left")
        indices_right = np.searchsorted(sorted_vec, to_insert_vec, side="right")

        # For runs of equal values take the middle of the run, then step
        # back by one to get the piece to the left of the insertion point.
        indices = indices_left + (indices_right - indices_left) // 2
        indices = indices - 1
        # Restrict to the valid index range [0, len(sorted_vec) - 1].
        indices = np.minimum(indices, len(sorted_vec) - 1)
        indices[indices < 0] = 0
        return indices

    def _forward_transform(
        self,
        sorted_vec: np.array,
        target: np.array,
        slopes: np.array,
        intercepts: np.array,
    ) -> np.array:
        """
        Applies the forward transformation to the marginals of the multivariate
        target. Target (real valued) -> empirical cdf [0, 1]

        Parameters
        ----------
        sorted_vec
            Sorted (past) target vector.
        target
            Target that will be transformed.
        slopes
            Slopes of the piece-wise linear function.
        intercepts
            Intercepts of the piece-wise linear function

        Returns
        -------
        transformed_target
            Transformed target vector.
        """
        transformed = list()
        # The inputs are transposed so that each iteration handles one
        # row of the transposed arrays (one marginal).
        for sorted, t, slope, intercept in zip(
            sorted_vec.transpose(),
            target.transpose(),
            slopes.transpose(),
            intercepts.transpose(),
        ):
            indices = self._search_sorted(sorted, t)
            transformed_value = slope[indices] * t + intercept[indices]
            transformed.append(transformed_value)
        return np.array(transformed).transpose()

    @staticmethod
    def standard_gaussian_cdf(x: np.array) -> np.array:
        """Standard gaussian CDF, computed as ``(erf(x / sqrt(2)) + 1) / 2``."""
        u = x / (np.sqrt(2.0))
        return (erf(u) + 1.0) / 2.0

    @staticmethod
    def standard_gaussian_ppf(y: np.array) -> np.array:
        """
        Standard gaussian quantile function, ``sqrt(2) * erfinv(2 * y - 1)``.

        ``y`` is clipped to ``[1e-6, 1 - 1e-6]`` first.
        """
        y_clipped = np.clip(y, a_min=1.0e-6, a_max=1.0 - 1.0e-6)
        return np.sqrt(2.0) * erfinv(2.0 * y_clipped - 1.0)

    @staticmethod
    def winsorized_cutoff(m: np.array) -> np.array:
        """
        Compute the cutoff used to truncate the empirical CDF estimator to
        reduce variance as described here: https://arxiv.org/abs/0903.0649

        Parameters
        ----------
        m
            Value the cutoff is computed from. The caller in this class
            passes the number of sorted values (``sorted_values.shape[0]``).

        Returns
        -------
        res
            ``1 / (4 * m ** 0.25 * sqrt(3.14 * log(m)))``; asserted to lie
            strictly between 0 and 1.
        """
        res = 1 / (4 * m ** 0.25 * np.sqrt(3.14 * np.log(m)))
        assert 0 < res < 1
        return res

    @staticmethod
    def _fill(target: np.ndarray, expected_length: int) -> np.ndarray:
        """
        Makes sure target has exactly expected_length time-units by
        repeating it, truncating it, or using zeros.

        Parameters
        ----------
        target : shape (seq_len, dim)
        expected_length

        Returns
        -------
        array of shape (expected_length, dim)
        """

        current_length, target_dim = target.shape
        if current_length == 0:
            # todo handle the case with no observation better,
            # we could use dataset statistics but for now we use zeros
            filled_target = np.zeros((expected_length, target_dim))
        elif current_length < expected_length:
            # Repeat the target often enough to cover expected_length, then
            # cut off the surplus rows at the end.
            filled_target = np.vstack(
                [target for _ in range(expected_length // current_length + 1)]
            )
            filled_target = filled_target[:expected_length]
        elif current_length > expected_length:
            # Keep only the last expected_length rows.
            filled_target = target[-expected_length:]
        else:
            filled_target = target

        assert filled_target.shape == (expected_length, target_dim)

        return filled_target
|
||||
|
||||
|
||||
def cdf_to_gaussian_forward_transform(
    input_batch: DataEntry, outputs: torch.Tensor
) -> np.ndarray:
    """
    Forward transformation of the CDFtoGaussianTransform.

    Every sample slice ``outputs[:, i, :, :]`` is mapped through the standard
    gaussian CDF and then through the inverse of the piece-wise linear
    empirical CDF; the result is written back into ``outputs`` in place.

    Parameters
    ----------
    input_batch
        Input data to the predictor. The fields ``"past_target_sorted"``,
        ``"slopes"`` and ``"intercepts"`` are read.
    outputs
        Predictor outputs, a 4-dimensional array whose second axis indexes
        the samples.

    Returns
    -------
    outputs
        Forward transformed outputs (the same object that was passed in).

    """

    def _empirical_cdf_inverse_transform(
        batch_target_sorted: torch.Tensor,
        batch_predictions: torch.Tensor,
        slopes: torch.Tensor,
        intercepts: torch.Tensor,
    ) -> np.ndarray:
        """
        Apply the inverse transformation of the empirical CDF.

        Parameters
        ----------
        batch_target_sorted
            Sorted targets of the input batch.
        batch_predictions
            Predictions of the underlying probability distribution
        slopes
            Slopes of the piece-wise linear function.
        intercepts
            Intercepts of the piece-wise linear function.

        Returns
        -------
        outputs
            Inverse transformed predictions.

        """
        slopes = slopes.cpu().numpy()
        intercepts = intercepts.cpu().numpy()

        batch_target_sorted = batch_target_sorted.cpu().numpy()
        _, num_timesteps, _ = batch_target_sorted.shape

        # Locate the linear piece each predicted quantile falls into and
        # restrict the piece index to the valid range [0, num_timesteps - 1].
        indices = np.floor(batch_predictions * num_timesteps)
        indices = np.clip(indices, 0, num_timesteps - 1)
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin gives the same dtype.
        indices = indices.astype(int)

        # Gather the slopes once; they serve as both mask and divisor.
        active_slopes = np.take_along_axis(slopes, indices, axis=1)

        # Invert y = slope * x + intercept where the slope is non-zero; on
        # flat pieces fall back to the sorted target value of that piece.
        transformed = np.where(
            active_slopes != 0.0,
            (batch_predictions - np.take_along_axis(intercepts, indices, axis=1))
            / active_slopes,
            np.take_along_axis(batch_target_sorted, indices, axis=1),
        )
        return transformed

    # applies inverse cdf to all outputs
    _, samples, _, _ = outputs.shape
    for sample_index in range(0, samples):
        outputs[:, sample_index, :, :] = _empirical_cdf_inverse_transform(
            input_batch["past_target_sorted"],
            CDFtoGaussianTransform.standard_gaussian_cdf(
                outputs[:, sample_index, :, :]
            ),
            input_batch["slopes"],
            input_batch["intercepts"],
        )
    return outputs
|
||||
@@ -1,47 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from typing import Iterator, List
|
||||
|
||||
from pts.dataset import DataEntry, Dataset
|
||||
from .transform import Chain, Transformation
|
||||
|
||||
|
||||
class TransformedDataset(Dataset):
    """
    Dataset view that applies a chain of transformations to every entry of
    an underlying dataset.

    Only transformations that behave the same at training and prediction
    time (``SimpleTransformation``-like) are supported: the chain is always
    invoked with ``is_train=True``.

    Parameters
    ----------
    base_dataset
        Dataset whose entries are transformed.
    transformations
        Transformations to apply; they are combined into a single ``Chain``.
    """

    def __init__(
        self, base_dataset: Dataset, transformations: List[Transformation]
    ) -> None:
        self.base_dataset = base_dataset
        self.transformations = Chain(transformations)

    def __iter__(self) -> Iterator[DataEntry]:
        # The chain is re-applied to the base dataset on every iteration.
        transformed = self.transformations(self.base_dataset, is_train=True)
        for entry in transformed:
            yield entry

    def __len__(self):
        # There is no cached size: count by walking the transformed stream.
        count = 0
        for _ in self:
            count += 1
        return count
|
||||
@@ -1,257 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.dataset import DataEntry
|
||||
from pts.feature import TimeFeature
|
||||
from .split import shift_timestamp
|
||||
from .transform import SimpleTransformation, MapTransformation
|
||||
|
||||
|
||||
def target_transformation_length(
    target: np.ndarray, pred_length: int, is_train: bool
) -> int:
    """
    Return the length along the time axis that a derived feature must have.

    Parameters
    ----------
    target
        Target array; its last axis is taken as the time axis.
    pred_length
        Number of additional future steps needed at prediction time.
    is_train
        If ``True`` only the observed length is returned, otherwise
        ``pred_length`` is added on top of it.

    Returns
    -------
    int
        ``target.shape[-1]`` when training, else
        ``target.shape[-1] + pred_length``.
    """
    observed_length = target.shape[-1]
    if is_train:
        return observed_length
    return observed_length + pred_length
|
||||
|
||||
|
||||
class AddObservedValuesIndicator(SimpleTransformation):
    """
    Replaces missing values in a numpy array (NaNs) with a dummy value and adds
    an "observed"-indicator that is ``1`` when values are observed and ``0``
    when values are missing.

    Parameters
    ----------
    target_field
        Field for which missing values will be replaced
    output_field
        Field name to use for the indicator
    dummy_value
        Value to use for replacing missing values.
    convert_nans
        If set to true (default) missing values will be replaced. Otherwise
        they will not be replaced. In any case the indicator is included in the
        result.
    dtype
        Numpy dtype of the indicator array.
    """

    @validated()
    def __init__(
        self,
        target_field: str,
        output_field: str,
        dummy_value: int = 0,
        convert_nans: bool = True,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.dummy_value = dummy_value
        self.target_field = target_field
        self.output_field = output_field
        self.convert_nans = convert_nans
        self.dtype = dtype

    def transform(self, data: DataEntry) -> DataEntry:
        value = data[self.target_field]
        # Compute the NaN mask once; it drives both the replacement and the
        # indicator.
        nan_entries = np.isnan(value)

        if self.convert_nans and nan_entries.any():
            # Work on a copy so the array held by the source dataset keeps
            # its NaNs (an in-place write would silently alter the dataset
            # for every later pass over it).
            value = value.copy()
            value[nan_entries] = self.dummy_value

        data[self.target_field] = value
        # Invert bool array so that missing values are zeros and store as float
        data[self.output_field] = np.invert(nan_entries).astype(self.dtype)
        return data
|
||||
|
||||
|
||||
class AddConstFeature(MapTransformation):
    """
    Adds a dynamic feature that holds one constant value at every time step.

    The feature has shape ``(1, T)``. With ``is_train=True`` ``T`` equals the
    length of the ``target_field`` array; with ``is_train=False`` it is that
    length plus ``pred_length``.

    Parameters
    ----------
    output_field
        Field name for output.
    target_field
        Field containing the target array. The length of this array will be
        used.
    pred_length
        Prediction length (needed because features have to be available in
        the future).
    const
        Constant value to use.
    dtype
        Numpy dtype to use for resulting array.
    """

    @validated()
    def __init__(
        self,
        output_field: str,
        target_field: str,
        pred_length: int,
        const: float = 1.0,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.output_field = output_field
        self.target_field = target_field
        self.pred_length = pred_length
        self.const = const
        self.dtype = dtype

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        num_steps = target_transformation_length(
            data[self.target_field], self.pred_length, is_train=is_train
        )
        ones_row = np.ones(shape=(1, num_steps), dtype=self.dtype)
        data[self.output_field] = self.const * ones_row
        return data
|
||||
|
||||
|
||||
class AddTimeFeatures(MapTransformation):
    """
    Adds a set of time features.

    With ``is_train=True`` the feature matrix has the same length as the
    ``target`` field; with ``is_train=False`` it has length
    ``len(target) + pred_length``.

    Features are computed once over a cached date range and sliced per
    entry; the cache is rebuilt whenever an entry falls outside of it.

    Parameters
    ----------
    start_field
        Field with the start time stamp of the time series
    target_field
        Field with the array containing the time series values
    output_field
        Field name for result.
    time_features
        list of time features to use.
    pred_length
        Prediction length
    """

    @validated()
    def __init__(
        self,
        start_field: str,
        target_field: str,
        output_field: str,
        time_features: List[TimeFeature],
        pred_length: int,
    ) -> None:
        self.date_features = time_features
        self.pred_length = pred_length
        self.start_field = start_field
        self.target_field = target_field
        self.output_field = output_field
        # Cache state, filled lazily by ``_update_cache``.
        self._min_time_point: pd.Timestamp = None
        self._max_time_point: pd.Timestamp = None
        self._full_range_date_features: np.ndarray = None
        self._date_index: pd.DatetimeIndex = None

    def _update_cache(self, start: pd.Timestamp, length: int) -> None:
        end = shift_timestamp(start, length)
        cache_exists = self._min_time_point is not None

        # Nothing to do when the requested window is already covered.
        if (
            cache_exists
            and self._min_time_point <= start
            and end <= self._max_time_point
        ):
            return

        if not cache_exists:
            self._min_time_point = start
            self._max_time_point = end

        # Grow the cached range with a 50-step margin on both sides.
        self._min_time_point = min(shift_timestamp(start, -50), self._min_time_point)
        self._max_time_point = max(shift_timestamp(end, 50), self._max_time_point)
        self.full_date_range = pd.date_range(
            self._min_time_point, self._max_time_point, freq=start.freq
        )

        if self.date_features:
            self._full_range_date_features = np.vstack(
                [feat(self.full_date_range) for feat in self.date_features]
            )
        else:
            self._full_range_date_features = None

        # Maps a time stamp to its position within the cached range.
        self._date_index = pd.Series(
            index=self.full_date_range, data=np.arange(len(self.full_date_range)),
        )

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        start = data[self.start_field]
        length = target_transformation_length(
            data[self.target_field], self.pred_length, is_train=is_train
        )
        self._update_cache(start, length)
        i0 = self._date_index[start]

        if self.date_features:
            features = self._full_range_date_features[..., i0 : i0 + length]
        else:
            # No time features configured: the output field is set to None.
            features = None

        data[self.output_field] = features
        return data
|
||||
|
||||
|
||||
class AddAgeFeature(MapTransformation):
    """
    Adds an 'age' feature to the data_entry.

    The age feature starts with a small value at the start of the time series
    and grows over time.

    With ``is_train=True`` the age feature has the same length as the
    ``target`` field; with ``is_train=False`` it has length
    ``len(target) + pred_length``. The stored array has shape ``(1, length)``.

    Parameters
    ----------
    target_field
        Field with target values (array) of time series
    output_field
        Field name to use for the output.
    pred_length
        Prediction length
    log_scale
        If set to true the age feature grows logarithmically
        (``log10(2 + t)``), otherwise linearly over time.
    dtype
        Numpy dtype used for the underlying step counter.
    """

    @validated()
    def __init__(
        self,
        target_field: str,
        output_field: str,
        pred_length: int,
        log_scale: bool = True,
        dtype: np.dtype = np.float32,
    ) -> None:
        self.pred_length = pred_length
        self.target_field = target_field
        self.feature_name = output_field
        self.log_scale = log_scale
        self._age_feature = np.zeros(0)
        self.dtype = dtype

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        length = target_transformation_length(
            data[self.target_field], self.pred_length, is_train=is_train
        )

        steps = np.arange(length, dtype=self.dtype)
        age = np.log10(2.0 + steps) if self.log_scale else steps

        data[self.feature_name] = age.reshape((1, length))
        return data
|
||||
@@ -1,118 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.dataset import DataEntry
|
||||
from .transform import SimpleTransformation, MapTransformation
|
||||
|
||||
|
||||
class RenameFields(SimpleTransformation):
    """
    Rename fields of a data entry according to a mapping.

    Parameters
    ----------
    mapping
        Name mapping `input_name -> output_name`. Every output name may be
        used only once.
    """

    @validated()
    def __init__(self, mapping: Dict[str, str]) -> None:
        self.mapping = mapping
        # Two inputs mapped onto the same output name would overwrite each other.
        for new_key, count in Counter(mapping.values()).items():
            assert count == 1, f"Mapped key {new_key} occurs multiple time"

    def transform(self, data: DataEntry):
        for key, new_key in self.mapping.items():
            if key in data:
                # Never clobber a field that already exists under the new name.
                assert new_key not in data
                data[new_key] = data.pop(key)
        return data
|
||||
|
||||
|
||||
class RemoveFields(SimpleTransformation):
    """
    Remove the listed fields from a data entry; absent fields are ignored.

    Parameters
    ----------
    field_names
        Names of the fields to delete.
    """

    @validated()
    def __init__(self, field_names: List[str]) -> None:
        self.field_names = field_names

    def transform(self, data: DataEntry) -> DataEntry:
        for name in self.field_names:
            # ``pop`` with a default is a no-op when the field is missing.
            data.pop(name, None)
        return data
|
||||
|
||||
|
||||
class SetField(SimpleTransformation):
    """
    Unconditionally store a fixed value under a field of the data entry,
    overwriting whatever was there before.

    Parameters
    ----------
    output_field
        Name of the field that will be set
    value
        Value to be set
    """

    @validated()
    def __init__(self, output_field: str, value: Any) -> None:
        self.value = value
        self.output_field = output_field

    def transform(self, data: DataEntry) -> DataEntry:
        field_name, field_value = self.output_field, self.value
        data[field_name] = field_value
        return data
|
||||
|
||||
|
||||
class SetFieldIfNotPresent(SimpleTransformation):
    """
    Store a fixed value under a field of the data entry only when that field
    does not exist yet; an existing value is left untouched.

    Parameters
    ----------
    field
        Name of the field that will be set
    value
        Value to be set
    """

    @validated()
    def __init__(self, field: str, value: Any) -> None:
        self.output_field = field
        self.value = value

    def transform(self, data: DataEntry) -> DataEntry:
        # ``setdefault`` writes only when the key is missing.
        data.setdefault(self.output_field, self.value)
        return data
|
||||
|
||||
|
||||
class SelectFields(MapTransformation):
    """
    Keep only the listed fields and drop everything else.

    A new dictionary is returned; a listed field that is missing from the
    entry raises ``KeyError``.

    Parameters
    ----------
    input_fields
        List of fields to keep.
    """

    @validated()
    def __init__(self, input_fields: List[str]) -> None:
        self.input_fields = input_fields

    def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
        selected = {}
        for name in self.input_fields:
            selected[name] = data[name]
        return selected
|
||||
@@ -1,176 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.dataset.stat import ScaleHistogram
|
||||
|
||||
|
||||
class InstanceSampler(ABC):
    """
    Interface for samplers that pick the time indices at which training
    instances are generated.

    A sampler is called with a time series and the valid index bounds
    ``a`` and ``b`` and should return a set of indices ``a <= i <= b``.

    Parameters
    ----------
    ts
        target that should be sampled with shape (dim, seq_len)
    a
        first index of the target that can be sampled
    b
        last index of the target that can be sampled

    Returns
    -------
    np.ndarray
        Selected points to sample
    """

    @abstractmethod
    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        """Return the sampled indices; must be implemented by subclasses."""
|
||||
|
||||
|
||||
class UniformSplitSampler(InstanceSampler):
    """
    Selects every index in ``[a, b]`` independently with the same fixed
    probability.

    Parameters
    ----------
    p
        Probability of selecting a time point
    """

    def __init__(self, p: float) -> None:
        self.p = p

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        assert a <= b, "First index must be less than or equal to the last index."

        window_size = b - a + 1
        # One uniform draw per candidate index; keep those that fall below p.
        selected = np.random.random_sample(window_size) < self.p
        (offsets,) = np.nonzero(selected)
        return a + offsets
|
||||
|
||||
|
||||
class TestSplitSampler(InstanceSampler):
    """
    Sampler used for prediction: ignores the series and the lower bound and
    always returns the single index ``b``, i.e. the forecast point of the
    time series.
    """

    def __init__(self) -> None:
        """Stateless sampler; there is nothing to configure."""

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        last_point = [b]
        return np.array(last_point)
|
||||
|
||||
|
||||
class ExpectedNumInstanceSampler(InstanceSampler):
    """
    Tracks the running average window length over all calls and picks the
    per-point probability so that, on average, `num_instances` training
    examples are generated per time series.

    Parameters
    ----------
    num_instances
        number of training examples generated per time series on average
    """

    @validated()
    def __init__(self, num_instances: float) -> None:
        self.num_instances = num_instances
        # Running statistics over every window seen so far.
        self.total_length = 0
        self.n = 0

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        self.n += 1
        self.total_length += b - a + 1
        avg_length = self.total_length / self.n

        # Delegate the actual draw to a uniform sampler with the derived rate.
        probability = self.num_instances / avg_length
        return UniformSplitSampler(probability)(ts, a, b)
|
||||
|
||||
|
||||
class BucketInstanceSampler(InstanceSampler):
    """
    This sample can be used when working with a set of time series that have a
    skewed distributions. For instance, if the dataset contains many time series
    with small values and few with large values.

    The probability of sampling from bucket i is the inverse of its number of
    elements.

    Parameters
    ----------
    scale_histogram
        The histogram of scale for the time series. Here scale is the mean abs
        value of the time series.
    """

    def __init__(self, scale_histogram: ScaleHistogram) -> None:
        # probability of sampling a bucket i is the inverse of its number of
        # elements
        self.scale_histogram = scale_histogram
        # Reusable table of candidate indices; grown on demand in __call__.
        self.lookup = np.arange(2 ** 13)

    def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
        """
        Sample indices from ``[a, b]``.

        Each index is kept independently with probability
        ``1 / scale_histogram.count(ts)``.

        Returns
        -------
        np.ndarray
            The selected indices (the annotation previously claimed ``None``
            although an array has always been returned).
        """
        # Double the lookup table until it is longer than the series.
        while ts.shape[-1] >= len(self.lookup):
            self.lookup = np.arange(2 * len(self.lookup))
        p = 1.0 / self.scale_histogram.count(ts)
        mask = np.random.uniform(low=0.0, high=1.0, size=b - a + 1) < p
        indices = self.lookup[a : a + len(mask)][mask]
        return indices
|
||||
|
||||
|
||||
class ContinuousTimePointSampler(ABC):
    """
    Base class for "continuous time" samplers which, given a lower and an
    upper bound, sample "points" (events) in continuous time from the
    specified interval.

    Parameters
    ----------
    num_instances
        Stored on the sampler for use by concrete implementations.
    """

    def __init__(self, num_instances: int) -> None:
        self.num_instances = num_instances

    @abstractmethod
    def __call__(self, a: float, b: float) -> np.ndarray:
        """
        Returns random points in the real interval between :code:`a` and
        :code:`b`.

        Parameters
        ----------
        a
            The lower bound (minimum time value that a sampled point can take)
        b
            Upper bound. Must be greater than a.
        """
|
||||
|
||||
|
||||
class ContinuousTimeUniformSampler(ContinuousTimePointSampler):
    """
    Draws ``num_instances`` points uniformly at random from the continuous
    interval between :code:`a` and :code:`b`.
    """

    def __call__(self, a: float, b: float) -> np.ndarray:
        assert a <= b, "Interval start time must be before interval end time."
        # Rescale unit-interval draws onto [a, b).
        unit_draws = np.random.rand(self.num_instances)
        return unit_draws * (b - a) + a
|
||||
@@ -1,529 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pts.core.component import validated
|
||||
from pts.dataset import DataEntry, FieldName
|
||||
from .sampler import InstanceSampler, ContinuousTimePointSampler
|
||||
from .transform import FlatMapTransformation
|
||||
|
||||
|
||||
def shift_timestamp(ts: pd.Timestamp, offset: int) -> pd.Timestamp:
    """
    Return ``ts`` moved by ``offset`` periods of its own frequency.

    Thin wrapper around pandas ``ts + offset`` arithmetic; caching and
    exception handling live in ``_shift_timestamp_helper``.
    """
    # The frequency is passed separately so that it becomes part of the
    # helper's cache key.
    freq = ts.freq
    return _shift_timestamp_helper(ts, freq, offset)
|
||||
|
||||
|
||||
@lru_cache(maxsize=10000)
|
||||
def _shift_timestamp_helper(ts: pd.Timestamp, freq: str, offset: int) -> pd.Timestamp:
|
||||
"""
|
||||
We are using this helper function which explicitly uses the frequency as a
|
||||
parameter, because the frequency is not included in the hash of a time
|
||||
stamp.
|
||||
|
||||
I.e.
|
||||
pd.Timestamp(x, freq='1D') and pd.Timestamp(x, freq='1min')
|
||||
|
||||
hash to the same value.
|
||||
"""
|
||||
try:
|
||||
# this line looks innocent, but can create a date which is out of
|
||||
# bounds values over year 9999 raise a ValueError
|
||||
# values over 2262-04-11 raise a pandas OutOfBoundsDatetime
|
||||
return ts + offset * ts.freq
|
||||
except (ValueError, pd._libs.OutOfBoundsDatetime) as ex:
|
||||
raise Exception(ex)
|
||||
|
||||
|
||||
class InstanceSplitter(FlatMapTransformation):
    """
    Selects training instances, by slicing the target and other time series
    like arrays at random points in training mode or at the last time point in
    prediction mode. Assumption is that all time like arrays start at the same
    time point.

    The target and each time_series_field is removed and instead two
    corresponding fields with prefix `past_` and `future_` are included, e.g.

    target -> past_target and future_target

    If the target array is one-dimensional, the resulting instance has shape
    (len_target). In the multi-dimensional case, the instance has shape (dim,
    len_target).

    The transformation also adds a field 'past_is_pad' that indicates whether
    values were padded or not.

    Convention: time axis is always the last axis.

    Parameters
    ----------
    target_field
        field containing the target
    is_pad_field
        output field indicating whether padding happened
    start_field
        field containing the start date of the time series
    forecast_start_field
        output field that will contain the time point where the forecast starts
    train_sampler
        instance sampler that provides sampling indices given a time-series
    past_length
        length of the target seen before making prediction
    future_length
        length of the target that must be predicted
    time_first
        whether to have time series output in (time, dimension) or in
        (dimension, time) layout
    time_series_fields
        fields that contains time-series, they are split in the same interval
        as the target
    pick_incomplete
        whether training examples can be sampled with only a part of
        past_length time-units present for the time series. This is useful to
        train models for cold-start. In such case, is_pad_out contains an
        indicator whether data is padded or not.
    """

    @validated()
    def __init__(
        self,
        target_field: str,
        is_pad_field: str,
        start_field: str,
        forecast_start_field: str,
        train_sampler: InstanceSampler,
        past_length: int,
        future_length: int,
        time_first: bool = True,
        time_series_fields: Optional[List[str]] = None,
        pick_incomplete: bool = True,
    ) -> None:

        assert future_length > 0

        self.train_sampler = train_sampler
        self.past_length = past_length
        self.future_length = future_length
        self.time_first = time_first
        self.ts_fields = time_series_fields if time_series_fields is not None else []
        self.target_field = target_field
        self.is_pad_field = is_pad_field
        self.start_field = start_field
        self.forecast_start_field = forecast_start_field
        self.pick_incomplete = pick_incomplete

    def _past(self, col_name):
        return f"past_{col_name}"

    def _future(self, col_name):
        return f"future_{col_name}"

    def flatmap_transform(self, data: DataEntry, is_train: bool) -> Iterator[DataEntry]:
        pl = self.future_length
        slice_cols = self.ts_fields + [self.target_field]
        target = data[self.target_field]
        len_target = target.shape[-1]

        if self.pick_incomplete:
            minimum_length = self.future_length
        else:
            minimum_length = self.past_length + self.future_length

        if is_train:
            lower_bound = 0 if self.pick_incomplete else self.past_length
            upper_bound = len_target - self.future_length

            # We currently cannot handle time series that are
            # too short during training, so we just skip these.
            # If we want to include them we would need to pad and to
            # mask the loss.
            if len_target < minimum_length:
                sampled_indices = np.array([], dtype=int)
            else:
                sampled_indices = self.train_sampler(target, lower_bound, upper_bound)
        else:
            assert self.pick_incomplete or len_target >= self.past_length
            # Prediction: a single split right after the last observation.
            sampled_indices = np.array([len_target], dtype=int)

        for i in sampled_indices:
            # Number of leading steps missing to fill a full past window.
            pad_length = max(self.past_length - i, 0)
            if not self.pick_incomplete:
                assert pad_length == 0, f"pad_length should be zero, got {pad_length}"

            d = data.copy()
            for ts_field in slice_cols:
                series = d[ts_field]
                if pad_length > 0:
                    # Too little history: left-pad with zeros of matching dtype.
                    pad_block = np.zeros(
                        series.shape[:-1] + (pad_length,), dtype=series.dtype,
                    )
                    past_piece = np.concatenate([pad_block, series[..., :i]], axis=-1)
                else:
                    # Enough history: truncate to past_length
                    past_piece = series[..., i - self.past_length : i]
                future_piece = series[..., i : i + pl]

                if self.time_first:
                    past_piece = past_piece.transpose()
                    future_piece = future_piece.transpose()

                d[self._past(ts_field)] = past_piece
                d[self._future(ts_field)] = future_piece
                del d[ts_field]

            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1

            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(d[self.start_field], i)
            yield d
|
||||
|
||||
|
||||
class CanonicalInstanceSplitter(FlatMapTransformation):
    """
    Selects instances, by slicing the target and other time series
    like arrays at random points in training mode or at the last time point in
    prediction mode. Assumption is that all time like arrays start at the same
    time point.

    In training mode, the returned instances contain past_`target_field`
    as well as past_`time_series_fields`.

    In prediction mode, one can set `use_prediction_features` to get
    future_`time_series_fields`.

    If the target array is one-dimensional, the `target_field` in the resulting
    instance has shape (`instance_length`). In the multi-dimensional case, the
    instance has shape (`dim`, `instance_length`), where `dim` can also take a
    value of 1.

    In the case of insufficient number of time series values, the
    transformation also adds a field 'past_is_pad' that indicates whether
    values were padded or not, and the value is padded with
    `default_pad_value` with a default value 0.
    This is done only if `allow_target_padding` is `True`,
    and the length of `target` is smaller than `instance_length`.

    Parameters
    ----------
    target_field
        fields that contains time-series
    is_pad_field
        output field indicating whether padding happened
    start_field
        field containing the start date of the time series
    forecast_start_field
        field containing the forecast start date
    instance_sampler
        instance sampler that provides sampling indices given a time-series
    instance_length
        length of the target seen before making prediction
    time_first
        whether to have time series output in (time, dimension) or in
        (dimension, time) layout
    time_series_fields
        fields that contains time-series, they are split in the same interval
        as the target; ``None`` (default) means no additional fields
    allow_target_padding
        flag to allow padding
    pad_value
        value to be used for padding
    use_prediction_features
        flag to indicate if prediction range features should be returned
    prediction_length
        length of the prediction range, must be set if
        use_prediction_features is True
    """

    def __init__(
        self,
        target_field: str,
        is_pad_field: str,
        start_field: str,
        forecast_start_field: str,
        instance_sampler: InstanceSampler,
        instance_length: int,
        time_first: bool = True,
        time_series_fields: Optional[List[str]] = None,
        allow_target_padding: bool = False,
        pad_value: float = 0.0,
        use_prediction_features: bool = False,
        prediction_length: Optional[int] = None,
    ) -> None:
        self.instance_sampler = instance_sampler
        self.instance_length = instance_length
        self.time_first = time_first
        # A fresh list per instance instead of one mutable default shared by
        # every splitter constructed without this argument.
        self.dynamic_feature_fields = (
            time_series_fields if time_series_fields is not None else []
        )
        self.target_field = target_field
        self.allow_target_padding = allow_target_padding
        self.pad_value = pad_value
        self.is_pad_field = is_pad_field
        self.start_field = start_field
        self.forecast_start_field = forecast_start_field

        assert (
            not use_prediction_features or prediction_length is not None
        ), "You must specify `prediction_length` if `use_prediction_features`"

        self.use_prediction_features = use_prediction_features
        self.prediction_length = prediction_length

    def _past(self, col_name):
        return f"past_{col_name}"

    def _future(self, col_name):
        return f"future_{col_name}"

    def flatmap_transform(self, data: DataEntry, is_train: bool) -> Iterator[DataEntry]:
        ts_fields = self.dynamic_feature_fields + [self.target_field]
        ts_target = data[self.target_field]

        len_target = ts_target.shape[-1]

        if is_train:
            if len_target < self.instance_length:
                sampling_indices = (
                    # Returning [] for all time series will cause this to be in loop forever!
                    [len_target]
                    if self.allow_target_padding
                    else []
                )
            else:
                sampling_indices = self.instance_sampler(
                    ts_target, self.instance_length, len_target
                )
        else:
            # Prediction: a single instance ending at the last observation.
            sampling_indices = [len_target]

        for i in sampling_indices:
            d = data.copy()

            # Number of leading steps missing to fill a full instance window.
            pad_length = max(self.instance_length - i, 0)

            # update start field
            d[self.start_field] = shift_timestamp(
                data[self.start_field], i - self.instance_length
            )

            # set is_pad field
            is_pad = np.zeros(self.instance_length)
            if pad_length > 0:
                is_pad[:pad_length] = 1
            d[self.is_pad_field] = is_pad

            # update time series fields
            for ts_field in ts_fields:
                full_ts = data[ts_field]
                if pad_length > 0:
                    pad_pre = self.pad_value * np.ones(
                        shape=full_ts.shape[:-1] + (pad_length,)
                    )
                    past_ts = np.concatenate([pad_pre, full_ts[..., :i]], axis=-1)
                else:
                    past_ts = full_ts[..., (i - self.instance_length) : i]

                past_ts = past_ts.transpose() if self.time_first else past_ts
                d[self._past(ts_field)] = past_ts

                # Future values are only emitted for feature fields, never
                # for the target, and only at prediction time.
                if self.use_prediction_features and not is_train:
                    if not ts_field == self.target_field:
                        future_ts = full_ts[..., i : i + self.prediction_length]
                        future_ts = (
                            future_ts.transpose() if self.time_first else future_ts
                        )
                        d[self._future(ts_field)] = future_ts

                del d[ts_field]

            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], self.instance_length
            )

            yield d
|
||||
|
||||
|
||||
class ContinuousTimeInstanceSplitter(FlatMapTransformation):
    """
    Selects training instances by slicing "intervals" from a continuous-time
    process instantiation. Concretely, the input data is expected to describe an
    instantiation from a point (or jump) process, with the "target"
    identifying inter-arrival times and other features (marks), as described
    in detail below.

    The splitter will then take random points in continuous time from each
    given observation, and return a (variable-length) array of points in
    the past (context) and the future (prediction) intervals.

    The transformation is analogous to its discrete counterpart
    `InstanceSplitter` except that

    - It does not allow "incomplete" records. That is, the past and future
      intervals sampled are always complete
    - Outputs a (T, C) layout.
    - Does not accept `time_series_fields` (i.e., only accepts target fields) as these
      would typically not be available in TPP data.

    The target arrays are expected to have (2, T) layout where the first axis
    corresponds to the (i) interarrival times between consecutive points, in
    order and (ii) integer identifiers of marks (from {0, 1, ..., :code:`num_marks`}).
    The returned arrays will have (T, 2) layout.

    For example, the array below corresponds to a target array where points with timestamps
    0.5, 1.1, and 1.5 were observed belonging to categories (marks) 3, 1 and 0
    respectively: :code:`[[0.5, 0.6, 0.4], [3, 1, 0]]`.

    An observation that contains no points at all (a target with zero columns)
    is accepted and yields empty past/future arrays.

    Parameters
    ----------
    past_interval_length
        length of the interval seen before making prediction
    future_interval_length
        length of the interval that must be predicted
    train_sampler
        sampler of forecast start times; during training it is called as
        ``train_sampler(lower, upper)`` with the bounds of the admissible range
    target_field
        field containing the target
    start_field
        field containing the start date of the point process observation
    end_field
        field containing the end date of the point process observation
    forecast_start_field
        output field that will contain the time point where the forecast starts
    """

    def __init__(
        self,
        past_interval_length: float,
        future_interval_length: float,
        train_sampler: ContinuousTimePointSampler,
        target_field: str = FieldName.TARGET,
        start_field: str = FieldName.START,
        end_field: str = "end",
        forecast_start_field: str = FieldName.FORECAST_START,
    ) -> None:

        assert (
            future_interval_length > 0
        ), "Prediction interval must have length greater than 0."

        self.train_sampler = train_sampler
        self.past_interval_length = past_interval_length
        self.future_interval_length = future_interval_length
        self.target_field = target_field
        self.start_field = start_field
        self.end_field = end_field
        self.forecast_start_field = forecast_start_field

    # noinspection PyMethodMayBeStatic
    def _mask_sorted(self, a: np.ndarray, lb: float, ub: float):
        """
        Return the indices ``i`` of the sorted array ``a`` with ``lb <= a[i] < ub``.

        Both bounds are located with ``np.searchsorted`` (default left side), so
        ``a`` must be sorted in ascending order.
        """
        start = np.searchsorted(a, lb)
        end = np.searchsorted(a, ub)
        return np.arange(start, end)

    def flatmap_transform(self, data: DataEntry, is_train: bool) -> Iterator[DataEntry]:
        """
        Yield one entry per sampled forecast start time.

        Each yielded entry contains ``past_<target_field>`` (points that fall in
        the past interval, in (T, C) layout with inter-arrival times re-based to
        the interval start), ``past_valid_length``, the forecast start timestamp
        and every input field other than target/start/end. When ``is_train`` is
        true, ``future_<target_field>`` and ``future_valid_length`` are added.

        During training, an observation shorter than the past plus future
        interval lengths yields nothing. Otherwise the start times come from
        ``train_sampler``. At prediction time the single start time is the end
        of the observation.
        """
        start = data[self.start_field]
        end = data[self.end_field]
        assert start.freq == end.freq

        # length of the observation expressed in units of the frequency
        freq_delta = start.freq.delta
        total_interval_length = (end - start) / freq_delta

        # sample forecast start times in continuous time
        if is_train:
            if total_interval_length < (
                self.future_interval_length + self.past_interval_length
            ):
                sampling_times: np.ndarray = np.array([])
            else:
                sampling_times = self.train_sampler(
                    self.past_interval_length,
                    total_interval_length - self.future_interval_length,
                )
        else:
            sampling_times = np.array([total_interval_length])

        ia_times = data[self.target_field][0, :]
        marks = data[self.target_field][1:, :]

        # absolute arrival times, relative to the start of the observation
        ts = np.cumsum(ia_times)
        # An observation without any point is valid, so the consistency check
        # only applies when at least one arrival time exists (``ts[-1]`` would
        # otherwise raise an IndexError).
        assert len(ts) == 0 or ts[-1] < total_interval_length, (
            "Target interarrival times provided are inconsistent with "
            "start and end timestamps."
        )

        # select field names that will be included in outputs
        keep_cols = {
            k: v
            for k, v in data.items()
            if k not in [self.target_field, self.start_field, self.end_field]
        }

        for future_start in sampling_times:

            r: DataEntry = dict()

            past_start = future_start - self.past_interval_length
            future_end = future_start + self.future_interval_length

            assert past_start >= 0

            past_mask = self._mask_sorted(ts, past_start, future_start)

            # inter-arrival times measured from the start of the past interval
            past_ia_times = np.diff(np.r_[0, ts[past_mask] - past_start])[np.newaxis]

            r[f"past_{self.target_field}"] = np.concatenate(
                [past_ia_times, marks[:, past_mask]], axis=0
            ).transpose()

            r["past_valid_length"] = np.array([len(past_mask)])

            r[self.forecast_start_field] = start + freq_delta * future_start

            if is_train:  # include the future only if is_train
                assert future_end <= total_interval_length

                future_mask = self._mask_sorted(ts, future_start, future_end)

                # inter-arrival times measured from the forecast start
                future_ia_times = np.diff(np.r_[0, ts[future_mask] - future_start])[
                    np.newaxis
                ]

                r[f"future_{self.target_field}"] = np.concatenate(
                    [future_ia_times, marks[:, future_mask]], axis=0
                ).transpose()

                r["future_valid_length"] = np.array([len(future_mask)])

            # include other fields
            r.update(keep_cols.copy())

            yield r
|
||||
@@ -16,7 +16,7 @@ from abc import ABC, abstractmethod
|
||||
from functools import reduce
|
||||
from typing import Callable, Iterator, Iterable, List
|
||||
|
||||
from pts.core.component import validated
|
||||
from gluonts.core.component import validated
|
||||
from pts.dataset import DataEntry
|
||||
|
||||
MAX_IDLE_TRANSFORMS = 100
|
||||
@@ -43,6 +43,7 @@ class Chain(Transformation):
|
||||
"""
|
||||
Chain multiple transformations together.
|
||||
"""
|
||||
|
||||
@validated()
|
||||
def __init__(self, trans: List[Transformation]) -> None:
|
||||
self.transformations = []
|
||||
|
||||
@@ -16,7 +16,8 @@ setup(
|
||||
zip_safe=True,
|
||||
python_requires=">=3.6",
|
||||
install_requires = [
|
||||
'torch>=1.5.0',
|
||||
'torch>=1.7.0',
|
||||
# GluonTS is distributed on PyPI under the name "gluonts".
'gluonts>=0.6.4',
|
||||
'holidays',
|
||||
'numpy',
|
||||
'pandas>=1.1',
|
||||
@@ -24,7 +25,6 @@ setup(
|
||||
'tqdm',
|
||||
'pydantic',
|
||||
'matplotlib',
|
||||
'python-rapidjson',
|
||||
'tensorboard',
|
||||
],
|
||||
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# First-party imports
|
||||
from pts.dataset import FieldName
|
||||
|
||||
|
||||
def test_dataset_fields():
    """Check that the feature constants of ``FieldName`` equal their literal field names."""
    static_cat = FieldName.FEAT_STATIC_CAT
    assert "feat_static_cat" == static_cat, "Error in the FieldName 'feat_static_cat'."

    static_real = FieldName.FEAT_STATIC_REAL
    assert "feat_static_real" == static_real, "Error in the FieldName 'feat_static_real'."

    dynamic_cat = FieldName.FEAT_DYNAMIC_CAT
    assert "feat_dynamic_cat" == dynamic_cat, "Error in the FieldName 'feat_dynamic_cat'."

    dynamic_real = FieldName.FEAT_DYNAMIC_REAL
    assert (
        "feat_dynamic_real" == dynamic_real
    ), "Error in the FieldName 'feat_dynamic_real'."
|
||||
@@ -1,129 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Third-party imports
|
||||
import pytest
|
||||
|
||||
# First-party imports
|
||||
from pts.dataset import ListDataset, MultivariateGrouper
|
||||
|
||||
UNIVARIATE_TS = [
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-07", "target": [5, 6, 7, 8]},
|
||||
],
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-08", "target": [5, 6, 7, 8]},
|
||||
],
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-07", "target": [0]},
|
||||
],
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-01", "target": [0]},
|
||||
],
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-08", "target": [5, 6, 7, 8]},
|
||||
],
|
||||
]
|
||||
|
||||
MULTIVARIATE_TS = [
|
||||
[{"start": "2014-09-07", "target": [[1, 2, 3, 4], [5, 6, 7, 8]]}],
|
||||
[{"start": "2014-09-07", "target": [[1, 2, 3, 4, 2.5], [6.5, 5, 6, 7, 8]],}],
|
||||
[{"start": "2014-09-07", "target": [[1, 2, 3, 4], [0, 0, 0, 0]]}],
|
||||
[
|
||||
{
|
||||
"start": "2014-09-01",
|
||||
"target": [
|
||||
[2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 1, 2, 3, 4],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
],
|
||||
}
|
||||
],
|
||||
[{"start": "2014-09-07", "target": [[1, 2, 3, 4, 0], [0, 5, 6, 7, 8]]}],
|
||||
]
|
||||
|
||||
TRAIN_FILL_RULE = [np.mean, np.mean, np.mean, np.mean, lambda x: 0.0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "univariate_ts, multivariate_ts, train_fill_rule",
    zip(UNIVARIATE_TS, MULTIVARIATE_TS, TRAIN_FILL_RULE),
)
def test_multivariate_grouper_train(
    univariate_ts, multivariate_ts, train_fill_rule
) -> None:
    """
    Group univariate series with the given train fill rule and compare the
    first grouped entry (target and start) with the expected multivariate entry.
    """
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts, freq="1D", one_dim_target=False)

    grouper = MultivariateGrouper(train_fill_rule=train_fill_rule)

    # targets are arrays, so the comparison is element-wise
    grouped_target = list(grouper(univariate_ds))[0]["target"]
    expected_target = list(multivariate_ds)[0]["target"]
    assert (grouped_target == expected_target).all()

    grouped_start = list(grouper(univariate_ds))[0]["start"]
    expected_start = list(multivariate_ds)[0]["start"]
    assert grouped_start == expected_start
|
||||
|
||||
|
||||
UNIVARIATE_TS_TEST = [
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-07", "target": [5, 6, 7, 8]},
|
||||
{"start": "2014-09-08", "target": [0, 1, 2, 3]},
|
||||
{"start": "2014-09-08", "target": [4, 5, 6, 7]},
|
||||
],
|
||||
[
|
||||
{"start": "2014-09-07", "target": [1, 2, 3, 4]},
|
||||
{"start": "2014-09-07", "target": [5, 6, 7, 8]},
|
||||
{"start": "2014-09-08", "target": [0, 1, 2, 3]},
|
||||
{"start": "2014-09-08", "target": [4, 5, 6, 7]},
|
||||
],
|
||||
]
|
||||
|
||||
MULTIVARIATE_TS_TEST = [
|
||||
[
|
||||
{"start": "2014-09-07", "target": [[1, 2, 3, 4], [5, 6, 7, 8]]},
|
||||
{"start": "2014-09-07", "target": [[0, 0, 1, 2, 3], [0, 4, 5, 6, 7]]},
|
||||
],
|
||||
[
|
||||
{"start": "2014-09-07", "target": [[5, 6, 7, 8]]},
|
||||
{"start": "2014-09-07", "target": [[0, 4, 5, 6, 7]]},
|
||||
],
|
||||
]
|
||||
|
||||
TEST_FILL_RULE = [lambda x: 0.0, lambda x: 0.0]
|
||||
MAX_TARGET_DIM = [2, 1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "univariate_ts, multivariate_ts, test_fill_rule, max_target_dim",
    zip(UNIVARIATE_TS_TEST, MULTIVARIATE_TS_TEST, TEST_FILL_RULE, MAX_TARGET_DIM,),
)
def test_multivariate_grouper_test(
    univariate_ts, multivariate_ts, test_fill_rule, max_target_dim
) -> None:
    """
    Group univariate series in test mode (two test dates, limited target
    dimension) and compare every grouped entry with the expected one.
    """
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts, freq="1D", one_dim_target=False)

    grouper = MultivariateGrouper(
        test_fill_rule=test_fill_rule, num_test_dates=2, max_target_dim=max_target_dim,
    )

    paired_entries = zip(grouper(univariate_ds), multivariate_ds)
    for actual_entry, expected_entry in paired_entries:
        targets_match = actual_entry["target"] == expected_entry["target"]
        assert targets_match.all()

        assert actual_entry["start"] == expected_entry["start"]
|
||||
@@ -1,21 +0,0 @@
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from pts.dataset import ProcessStartField
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "freq, expected",
    [
        ("B", "2019-11-01"),
        ("W", "2019-11-03"),
        ("M", "2019-11-30"),
        ("12M", "2019-11-30"),
        ("A-DEC", "2019-12-31"),
    ],
)
def test_process_start_field(freq, expected):
    """
    Process a fixed raw timestamp string with ``ProcessStartField.process`` and
    compare the result with the expected timestamp for each frequency.
    """
    raw_start = "2019-11-01 12:34:56"

    processed = ProcessStartField.process(raw_start, freq)

    assert processed == pd.Timestamp(expected, freq)
|
||||
@@ -1,340 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
import unittest
|
||||
from typing import cast
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# First-party imports
|
||||
from pts.dataset import DataEntry, Dataset
|
||||
from pts.dataset.stat import (
|
||||
DatasetStatistics,
|
||||
ScaleHistogram,
|
||||
calculate_dataset_statistics,
|
||||
)
|
||||
|
||||
|
||||
def make_dummy_dynamic_feat(target, num_features) -> np.ndarray:
    """
    Build dummy dynamic features from ``target``.

    Row ``k`` (0-based) of the result is ``target * (k + 1)``; the rows are
    stacked vertically into one array with ``num_features`` rows.
    """
    scaled_rows = []
    for multiplier in range(1, num_features + 1):
        scaled_rows.append(target * multiplier)
    return np.vstack(scaled_rows)
|
||||
|
||||
|
||||
# default values for TimeSeries field
|
||||
# Default start timestamp of the generated series (daily frequency).
start = pd.Timestamp("1985-01-02", freq="1D")
# Default target: 20 random integers in [0, 10). The generator is not seeded
# here, so the values differ between runs; only the length (20) is fixed.
target = np.random.randint(0, 10, 20)
# Default static categorical feature vector.
fsc = [0, 1]
# Default static real-valued feature vector.
fsr = [0.1, 0.2]
|
||||
|
||||
|
||||
def make_time_series(
    start=start,
    target=target,
    feat_static_cat=fsc,
    feat_static_real=fsr,
    num_feat_dynamic_cat=1,
    num_feat_dynamic_real=1,
) -> DataEntry:
    """
    Assemble a data entry with all six standard fields.

    The dynamic features are derived from ``target`` with
    ``make_dummy_dynamic_feat`` (cast to ``int64`` for the categorical ones and
    to ``float`` for the real ones). A dynamic field is ``None`` when the
    requested number of features is not positive.
    """
    feat_dynamic_cat = None
    if num_feat_dynamic_cat > 0:
        cat_features = make_dummy_dynamic_feat(target, num_feat_dynamic_cat)
        feat_dynamic_cat = cat_features.astype("int64")

    feat_dynamic_real = None
    if num_feat_dynamic_real > 0:
        real_features = make_dummy_dynamic_feat(target, num_feat_dynamic_real)
        feat_dynamic_real = real_features.astype("float")

    return {
        "start": start,
        "target": target,
        "feat_static_cat": feat_static_cat,
        "feat_static_real": feat_static_real,
        "feat_dynamic_cat": feat_dynamic_cat,
        "feat_dynamic_real": feat_dynamic_real,
    }
|
||||
|
||||
|
||||
def ts(
    start,
    target,
    feat_static_cat=None,
    feat_static_real=None,
    feat_dynamic_cat=None,
    feat_dynamic_real=None,
) -> DataEntry:
    """
    Assemble a data entry that contains ``start`` and ``target`` plus only
    those feature fields that were actually provided (i.e. are not ``None``).
    """
    entry = {"start": start, "target": target}

    # kept in this order so the resulting dict has a stable key order
    optional_fields = (
        ("feat_static_cat", feat_static_cat),
        ("feat_static_real", feat_static_real),
        ("feat_dynamic_cat", feat_dynamic_cat),
        ("feat_dynamic_real", feat_dynamic_real),
    )
    for field_name, value in optional_fields:
        if value is not None:
            entry[field_name] = value

    return entry
|
||||
|
||||
|
||||
class DatasetStatisticsTest(unittest.TestCase):
    """Tests of ``calculate_dataset_statistics`` on well-formed datasets."""

    def test_dataset_statistics(self) -> None:
        """Compare the computed statistics of a three-series dataset with hand-built expectations."""
        n = 2
        T = 10

        # use integers to avoid float conversion that can fail comparison
        np.random.seed(0)
        targets = np.random.randint(0, 10, (n, T))

        # expected histogram: one entry per non-empty series plus one empty series
        scale_histogram = ScaleHistogram()
        for row in targets:
            scale_histogram.add(row)
        scale_histogram.add([])

        expected = DatasetStatistics(
            integer_dataset=True,
            num_time_series=n + 1,
            num_time_observations=targets.size,
            mean_target_length=T * 2 / 3,
            min_target=targets.min(),
            mean_target=targets.mean(),
            mean_abs_target=targets.mean(),
            max_target=targets.max(),
            feat_static_real=[{0.1}, {0.2, 0.3}],
            feat_static_cat=[{1}, {2, 3}],
            num_feat_dynamic_real=2,
            num_feat_dynamic_cat=2,
            num_missing_values=0,
            scale_histogram=scale_histogram,
        )

        first_series = make_time_series(
            target=targets[0, :],
            feat_static_cat=[1, 2],
            feat_static_real=[0.1, 0.2],
            num_feat_dynamic_cat=2,
            num_feat_dynamic_real=2,
        )
        second_series = make_time_series(
            target=targets[1, :],
            feat_static_cat=[1, 3],
            feat_static_real=[0.1, 0.3],
            num_feat_dynamic_cat=2,
            num_feat_dynamic_real=2,
        )
        empty_series = make_time_series(
            target=np.array([]),
            feat_static_cat=[1, 3],
            feat_static_real=[0.1, 0.3],
            num_feat_dynamic_cat=2,
            num_feat_dynamic_real=2,
        )

        # FIXME: the cast below is a hack to make mypy happy
        timeseries = cast(Dataset, [first_series, second_series, empty_series])

        found = calculate_dataset_statistics(timeseries)

        assert expected == found

    def test_dataset_histogram(self) -> None:
        """Check the scale histogram bin counts for constant series with values 0 .. 2 ** N - 2."""
        # generates 2 ** N - 1 timeseries with constant increasing values
        N = 6
        n = 2 ** N - 1
        T = 5
        # row i is constant and equal to i
        targets = np.ones((n, T)) * np.arange(n).reshape(n, 1)

        # FIXME: the cast below is a hack to make mypy happy
        timeseries = cast(Dataset, [make_time_series(target=row) for row in targets])

        found = calculate_dataset_statistics(timeseries)

        hist = found.scale_histogram.bin_counts
        for bin_index in range(N):
            assert bin_index in hist
            assert hist[bin_index] == 2 ** bin_index
|
||||
|
||||
|
||||
class DatasetStatisticsExceptions(unittest.TestCase):
    """Tests of the errors raised by ``calculate_dataset_statistics`` on malformed datasets."""

    def test_dataset_statistics_exceptions(self) -> None:
        """Feed a series of invalid datasets and check the error message of each failure."""

        def expect_error(message_regex, dataset) -> None:
            # computing the statistics must raise with a message matching the regex
            with self.assertRaisesRegex(Exception, message_regex):
                calculate_dataset_statistics(dataset)

        expect_error(message_regex="Time series dataset is empty!", dataset=[])

        expect_error(
            message_regex="Only empty time series found in the dataset!",
            dataset=[make_time_series(target=np.random.randint(0, 10, 0))],
        )

        # infinite target (check currently disabled)
        # expect_error(
        #     message_regex="Target values have to be finite (e.g., not inf, -inf, "
        #     "or None) and cannot exceed single precision floating "
        #     "point range.",
        #     dataset=[make_time_series(target=np.full(20, np.inf))]
        # )

        # different number of feat_dynamic_{cat, real}
        expect_error(
            message_regex="Found instances with different number of features in "
            "feat_dynamic_cat, found one with 2 and another with 1.",
            dataset=[
                make_time_series(num_feat_dynamic_cat=2),
                make_time_series(num_feat_dynamic_cat=1),
            ],
        )
        expect_error(
            message_regex="Found instances with different number of features in "
            "feat_dynamic_cat, found one with 0 and another with 1.",
            dataset=[
                make_time_series(num_feat_dynamic_cat=0),
                make_time_series(num_feat_dynamic_cat=1),
            ],
        )
        expect_error(
            message_regex="feat_dynamic_cat was found for some instances but not others.",
            dataset=[
                make_time_series(num_feat_dynamic_cat=1),
                make_time_series(num_feat_dynamic_cat=0),
            ],
        )
        expect_error(
            message_regex="Found instances with different number of features in "
            "feat_dynamic_real, found one with 2 and another with 1.",
            dataset=[
                make_time_series(num_feat_dynamic_real=2),
                make_time_series(num_feat_dynamic_real=1),
            ],
        )
        expect_error(
            message_regex="Found instances with different number of features in "
            "feat_dynamic_real, found one with 0 and another with 1.",
            dataset=[
                make_time_series(num_feat_dynamic_real=0),
                make_time_series(num_feat_dynamic_real=1),
            ],
        )
        expect_error(
            message_regex="feat_dynamic_real was found for some instances but not others.",
            dataset=[
                make_time_series(num_feat_dynamic_real=1),
                make_time_series(num_feat_dynamic_real=0),
            ],
        )

        # infinite feat_dynamic_{cat,real}
        inf_dynamic_feat = np.full((2, len(target)), np.inf)
        expect_error(
            message_regex="Features values have to be finite and cannot exceed single "
            "precision floating point range.",
            dataset=[
                ts(
                    start,
                    target,
                    feat_dynamic_cat=inf_dynamic_feat,
                    feat_static_cat=[0, 1],
                )
            ],
        )
        expect_error(
            message_regex="Features values have to be finite and cannot exceed single "
            "precision floating point range.",
            dataset=[
                ts(
                    start,
                    target,
                    feat_dynamic_real=inf_dynamic_feat,
                    feat_static_cat=[0, 1],
                )
            ],
        )

        # feat_dynamic_{cat, real} different length from target
        expect_error(
            message_regex="Each feature in feat_dynamic_cat has to have the same length as the "
            "target. Found an instance with feat_dynamic_cat of length 1 and a "
            "target of length 20.",
            dataset=[
                ts(
                    start=start,
                    target=target,
                    feat_static_cat=[0, 1],
                    feat_dynamic_cat=np.ones((1, 1)),
                )
            ],
        )
        expect_error(
            message_regex="Each feature in feat_dynamic_real has to have the same length as the "
            "target. Found an instance with feat_dynamic_real of length 1 and a "
            "target of length 20.",
            dataset=[
                ts(
                    start=start,
                    target=target,
                    feat_static_cat=[0, 1],
                    feat_dynamic_real=np.ones((1, 1)),
                )
            ],
        )

        # feat_static_{cat, real} different length
        expect_error(
            message_regex="Not all feat_static_cat vectors have the same length 2 != 1.",
            dataset=[
                ts(start=start, target=target, feat_static_cat=[0, 1]),
                ts(start=start, target=target, feat_static_cat=[1]),
            ],
        )
        expect_error(
            message_regex="Not all feat_static_real vectors have the same length 2 != 1.",
            dataset=[
                ts(start=start, target=target, feat_static_real=[0, 1]),
                ts(start=start, target=target, feat_static_real=[1]),
            ],
        )

        # consistent datasets must be processed without raising
        two_dynamic_cat = [
            make_time_series(num_feat_dynamic_cat=2),
            make_time_series(num_feat_dynamic_cat=2),
        ]
        # FIXME: the cast below is a hack to make mypy happy
        calculate_dataset_statistics(cast(Dataset, two_dynamic_cat))

        no_dynamic_cat = [
            make_time_series(num_feat_dynamic_cat=0),
            make_time_series(num_feat_dynamic_cat=0),
        ]
        # FIXME: the cast below is a hack to make mypy happy
        calculate_dataset_statistics(cast(Dataset, no_dynamic_cat))
|
||||
@@ -1,649 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
# First-party imports
|
||||
from pts.evaluation import (
|
||||
Evaluator,
|
||||
MultivariateEvaluator,
|
||||
)
|
||||
from pts.feature import get_seasonality
|
||||
from pts.model.forecast import QuantileForecast, SampleForecast
|
||||
|
||||
QUANTILES = [str(q / 10.0) for q in range(1, 10)]
|
||||
|
||||
|
||||
def data_iterator(ts):
    """
    Yield the elements of ``ts`` one by one, in order.

    :param ts: list of pd.Series or pd.DataFrame (any iterable is accepted)
    :return: generator over the elements of ``ts``
    """
    # iterate directly instead of indexing through range(len(ts)); this also
    # works for inputs that do not support len() or integer indexing
    yield from ts
|
||||
|
||||
|
||||
def fcst_iterator(fcst, start_dates, freq):
    """
    Yield one ``SampleForecast`` per entry of ``fcst``.

    :param fcst: list of numpy arrays with the sample paths
    :param start_dates: forecast start date of each entry, indexed like ``fcst``
    :param freq: frequency passed on to every forecast
    :return: generator of ``SampleForecast`` objects
    """
    for position, sample_paths in enumerate(fcst):
        yield SampleForecast(
            samples=sample_paths, start_date=start_dates[position], freq=freq
        )
|
||||
|
||||
|
||||
def iterator(it):
    """
    Convenience function to toggle whether dataset and forecasts are consumed
    as iterators or as iterables.

    :param it: any iterable
    :return: ``it`` wrapped as an iterator (``iter(it)``)
    """
    as_iterator = iter(it)
    return as_iterator
|
||||
|
||||
|
||||
def iterable(it):
    """
    Convenience function to toggle whether dataset and forecasts are consumed
    as iterators or as iterables.

    :param it: any iterable
    :return: the elements of ``it`` collected into a new list
    """
    materialized = list(it)
    return materialized
|
||||
|
||||
|
||||
def naive_forecaster(ts, prediction_length, num_samples=100, target_dim=0):
    """
    Repeat the value observed just before the prediction range as the forecast.

    :param ts: pandas.Series (or DataFrame for a multivariate target)
    :param prediction_length: number of time steps to forecast
    :param num_samples: number of sample paths
    :param target_dim: number of axes of target (0: scalar, 1: array, ...)
    :return: np.array with dimension (num_samples, prediction_length),
        followed by the axes of the target when ``target_dim`` > 0
    """
    # naive prediction: last observed value
    last_observed = ts.values[-prediction_length - 1]
    assert len(last_observed.shape) == target_dim

    # the target's own axes are kept as they are (repeated once)
    repetitions = (num_samples, prediction_length) + (1,) * target_dim
    return np.tile(last_observed, repetitions)
|
||||
|
||||
|
||||
def naive_multivariate_forecaster(ts, prediction_length, num_samples=100):
    """Run ``naive_forecaster`` for a target with one axis (``target_dim=1``)."""
    multivariate_samples = naive_forecaster(
        ts, prediction_length, num_samples, target_dim=1
    )
    return multivariate_samples
|
||||
|
||||
|
||||
def calculate_metrics(
    timeseries,
    evaluator,
    ts_datastructure,
    has_nans=False,
    forecaster=naive_forecaster,
    input_type=iterator,
):
    """
    Forecast every row of ``timeseries`` and evaluate the forecasts.

    :param timeseries: 2-d array, one time series per row
    :param evaluator: callable taking (data iterator, forecast iterator) and
        returning the pair (aggregate metrics, per-item metrics)
    :param ts_datastructure: constructor called as ``ts_datastructure(values, index=index)``
        (e.g. ``pd.Series``)
    :param has_nans: if True, positions 1 and 7 of the first series are set to NaN
        (on a copy; the caller's array is left untouched)
    :param forecaster: callable ``(ts, prediction_length, num_samples)`` returning sample paths
    :param input_type: wrapper that decides whether data and forecasts are
        handed to the evaluator as iterators or as lists
    :return: (agg_df, item_df) as returned by ``evaluator``
    """
    num_timeseries = timeseries.shape[0]
    num_timestamps = timeseries.shape[1]

    if has_nans:
        # copy first so that the array owned by the caller (typically a
        # module-level test fixture) is not modified in place
        timeseries = timeseries.copy()
        timeseries[0, 1] = np.nan
        timeseries[0, 7] = np.nan

    num_samples = 100
    prediction_length = 3
    freq = "1D"

    ts_start_dates = (
        []
    )  # starting date of each time series - can be different in general
    pd_timeseries = []  # list of pandas.DataFrame
    samples = []  # list of forecast samples
    start_dates = []  # start date of the prediction range
    for i in range(num_timeseries):
        ts_start_dates.append(pd.Timestamp(year=2018, month=1, day=1, hour=1))
        index = pd.date_range(ts_start_dates[i], periods=num_timestamps, freq=freq)

        pd_timeseries.append(ts_datastructure(timeseries[i], index=index))
        samples.append(forecaster(pd_timeseries[i], prediction_length, num_samples))
        # the prediction range covers the last ``prediction_length`` timestamps;
        # reuse the index built above instead of creating the same range again
        start_dates.append(index[-prediction_length])

    # data iterator
    data_iter = input_type(data_iterator(pd_timeseries))
    fcst_iter = input_type(fcst_iterator(samples, start_dates, freq))

    # evaluate
    agg_df, item_df = evaluator(data_iter, fcst_iter)
    return agg_df, item_df
|
||||
|
||||
|
||||
TIMESERIES_M4 = [
|
||||
np.array(
|
||||
[
|
||||
[
|
||||
2.943_013,
|
||||
2.822_251,
|
||||
4.196_222,
|
||||
1.328_664,
|
||||
4.947_390,
|
||||
3.333_131,
|
||||
1.479_800,
|
||||
2.265_094,
|
||||
3.413_493,
|
||||
3.497_607,
|
||||
],
|
||||
[
|
||||
-0.126_781_2,
|
||||
3.057_412_2,
|
||||
1.901_594_4,
|
||||
2.772_549_5,
|
||||
3.312_853_1,
|
||||
4.411_818_0,
|
||||
3.709_025_2,
|
||||
4.322_028,
|
||||
2.565_359,
|
||||
3.074_308,
|
||||
],
|
||||
[
|
||||
2.542_998,
|
||||
2.336_757,
|
||||
1.417_916,
|
||||
1.335_139,
|
||||
2.523_035,
|
||||
3.645_589,
|
||||
3.382_819,
|
||||
2.075_960,
|
||||
2.643_869,
|
||||
2.772_456,
|
||||
],
|
||||
[
|
||||
0.315_685_6,
|
||||
1.892_312_1,
|
||||
2.476_861_2,
|
||||
3.511_628_6,
|
||||
4.384_346_5,
|
||||
2.960_685_6,
|
||||
4.897_572_5,
|
||||
3.280_125,
|
||||
4.768_556,
|
||||
4.958_616,
|
||||
],
|
||||
[
|
||||
2.205_877_3,
|
||||
0.782_759_4,
|
||||
2.401_420_8,
|
||||
2.385_643_4,
|
||||
4.845_818_2,
|
||||
3.102_322_9,
|
||||
3.567_723_7,
|
||||
4.878_143,
|
||||
3.735_245,
|
||||
2.218_113,
|
||||
],
|
||||
]
|
||||
),
|
||||
np.array(
|
||||
[
|
||||
[
|
||||
13.11301,
|
||||
13.16225,
|
||||
14.70622,
|
||||
12.00866,
|
||||
15.79739,
|
||||
14.35313,
|
||||
12.66980,
|
||||
13.62509,
|
||||
14.94349,
|
||||
15.19761,
|
||||
],
|
||||
[
|
||||
10.04322,
|
||||
13.39741,
|
||||
12.41159,
|
||||
13.45255,
|
||||
14.16285,
|
||||
15.43182,
|
||||
14.89903,
|
||||
15.68203,
|
||||
14.09536,
|
||||
14.77431,
|
||||
],
|
||||
[
|
||||
12.71300,
|
||||
12.67676,
|
||||
11.92792,
|
||||
12.01514,
|
||||
13.37303,
|
||||
14.66559,
|
||||
14.57282,
|
||||
13.43596,
|
||||
14.17387,
|
||||
14.47246,
|
||||
],
|
||||
[
|
||||
10.48569,
|
||||
12.23231,
|
||||
12.98686,
|
||||
14.19163,
|
||||
15.23435,
|
||||
13.98069,
|
||||
16.08757,
|
||||
14.64012,
|
||||
16.29856,
|
||||
16.65862,
|
||||
],
|
||||
[
|
||||
12.37588,
|
||||
11.12276,
|
||||
12.91142,
|
||||
13.06564,
|
||||
15.69582,
|
||||
14.12232,
|
||||
14.75772,
|
||||
16.23814,
|
||||
15.26524,
|
||||
13.91811,
|
||||
],
|
||||
]
|
||||
),
|
||||
]
|
||||
|
||||
RES_M4 = [
|
||||
{
|
||||
"MASE": 0.816_837_618,
|
||||
"MAPE": 0.324_517_430_685_928_1,
|
||||
"sMAPE": 0.326_973_268_4,
|
||||
"seasonal_error": np.array(
|
||||
[1.908_101, 1.258_838, 0.63018, 1.238_201, 1.287_771]
|
||||
),
|
||||
},
|
||||
{
|
||||
"MASE": 0.723_948_2,
|
||||
"MAPE": 0.063_634_129_851_747_6,
|
||||
"sMAPE": 0.065_310_85,
|
||||
"seasonal_error": np.array(
|
||||
[1.867_847, 1.315_505, 0.602_587_4, 1.351_535, 1.339_179]
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("timeseries, res", zip(TIMESERIES_M4, RES_M4))
def test_MASE_sMAPE_M4(timeseries, res):
    """
    Evaluate the naive forecaster on the fixture series and compare MASE, MAPE,
    sMAPE (relative tolerance 0.001) and the per-item seasonal errors (summed
    absolute deviation below 0.001) with the reference values in ``res``.
    """
    ts_datastructure = pd.Series
    evaluator = Evaluator(quantiles=QUANTILES)
    agg_df, item_df = calculate_metrics(timeseries, evaluator, ts_datastructure)

    mase_rel_error = abs((agg_df["MASE"] - res["MASE"]) / res["MASE"])
    assert mase_rel_error < 0.001, (
        "Scores for the metric MASE do not match: "
        "\nexpected: {} \nobtained: {}".format(res["MASE"], agg_df["MASE"])
    )

    mape_rel_error = abs((agg_df["MAPE"] - res["MAPE"]) / res["MAPE"])
    assert mape_rel_error < 0.001, (
        "Scores for the metric MAPE do not match: \nexpected: {} "
        "\nobtained: {}".format(res["MAPE"], agg_df["MAPE"])
    )

    smape_rel_error = abs((agg_df["sMAPE"] - res["sMAPE"]) / res["sMAPE"])
    assert smape_rel_error < 0.001, (
        "Scores for the metric sMAPE do not match: \nexpected: {} "
        "\nobtained: {}".format(res["sMAPE"], agg_df["sMAPE"])
    )

    seasonal_error_gap = sum(
        abs(item_df["seasonal_error"].values - res["seasonal_error"])
    )
    assert seasonal_error_gap < 0.001, (
        "Scores for the metric seasonal_error do not match: \nexpected: {} "
        "\nobtained: {}".format(res["seasonal_error"], item_df["seasonal_error"].values)
    )
|
||||
|
||||
|
||||
# Input arrays for test_metrics / test_metrics_mp: two all-ones arrays, two
# 0..49 ramps, and one array whose first row is entirely NaN.
TIMESERIES = [
    np.ones(shape=(5, 10), dtype=np.float64),
    np.ones(shape=(5, 10), dtype=np.float64),
    np.arange(50, dtype=np.float64).reshape((5, 10)),
    np.arange(50, dtype=np.float64).reshape((5, 10)),
    np.array([10 * [np.nan], 10 * [1.0]]),
]
|
||||
|
||||
# Expected aggregate metrics, one dict per TIMESERIES entry (same order).
RES = [
    dict(
        MSE=0.0, abs_error=0.0, abs_target_sum=15.0, abs_target_mean=1.0,
        seasonal_error=0.0, MASE=0.0, MAPE=0.0, sMAPE=0.0, MSIS=0.0,
        RMSE=0.0, NRMSE=0.0, ND=0.0, MAE_Coverage=0.5,
    ),
    dict(
        MSE=0.0, abs_error=0.0, abs_target_sum=14.0, abs_target_mean=1.0,
        seasonal_error=0.0, MASE=0.0, MAPE=0.0, sMAPE=0.0, MSIS=0.0,
        RMSE=0.0, NRMSE=0.0, ND=0.0, MAE_Coverage=0.5,
    ),
    dict(
        MSE=4.666_666_666_666, abs_error=30.0, abs_target_sum=420.0,
        abs_target_mean=28.0, seasonal_error=1.0, MASE=2.0,
        MAPE=0.103_112_211_532_524_85, sMAPE=0.113_254_049_3, MSIS=80.0,
        RMSE=2.160_246_899_469_286_9, NRMSE=0.077_151_674_981_045_956,
        ND=0.071_428_571_428_571_42, MAE_Coverage=0.5,
    ),
    dict(
        MSE=5.033_333_333_333_3, abs_error=29.0, abs_target_sum=413.0,
        abs_target_mean=28.1, seasonal_error=1.0, MASE=2.1,
        MAPE=0.113_032_846_453_159_77, sMAPE=0.125_854_781_903_299_57, MSIS=84.0,
        RMSE=2.243_509_156_061_845_6, NRMSE=0.079_840_183_489_745_39,
        ND=0.070_217_917_675_544_79, MAE_Coverage=0.5,
    ),
    dict(
        MSE=0.0, abs_error=0.0, abs_target_sum=3.0, abs_target_mean=1.0,
        seasonal_error=0.0, MASE=0.0, MAPE=0.0, sMAPE=0.0, MSIS=0.0,
        RMSE=0.0, NRMSE=0.0, ND=0.0, MAE_Coverage=0.5,
    ),
]
|
||||
|
||||
# Per-case `has_nans` flag handed to calculate_metrics by test_metrics and
# test_metrics_mp; zipped position-by-position with TIMESERIES and RES.
HAS_NANS = [False, True, False, True, True]
|
||||
|
||||
|
||||
# Per-case `input_type` argument handed to calculate_metrics; `iterable` and
# `iterator` are defined outside this excerpt.
INPUT_TYPE = [iterable, iterable, iterator, iterator, iterable]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "timeseries, res, has_nans, input_type",
    zip(TIMESERIES, RES, HAS_NANS, INPUT_TYPE),
)
def test_metrics(timeseries, res, has_nans, input_type):
    """Check aggregate metrics from an Evaluator built with num_workers=0.

    Only metrics that have an entry in ``res`` are compared (absolute
    tolerance 0.001); any other metric the evaluator reports is ignored.
    """
    agg_metrics, _ = calculate_metrics(
        timeseries,
        Evaluator(quantiles=QUANTILES, num_workers=0),
        pd.Series,
        has_nans=has_nans,
        input_type=input_type,
    )

    for metric, score in agg_metrics.items():
        if metric not in res:
            continue
        expected = res[metric]
        assert abs(score - expected) < 0.001, (
            "Scores for the metric {} do not match: \nexpected: {} "
            "\nobtained: {}".format(metric, expected, score)
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "timeseries, res, has_nans, input_type",
    zip(TIMESERIES, RES, HAS_NANS, INPUT_TYPE),
)
def test_metrics_mp(timeseries, res, has_nans, input_type):
    """Check aggregate metrics from an Evaluator built with num_workers=4.

    Only metrics that have an entry in ``res`` are compared (absolute
    tolerance 0.001); any other metric the evaluator reports is ignored.
    """
    # num_workers=4 is meant to exercise the multiprocessing evaluator.
    agg_metrics, _ = calculate_metrics(
        timeseries,
        Evaluator(quantiles=QUANTILES, num_workers=4),
        pd.Series,
        has_nans=has_nans,
        input_type=input_type,
    )

    for metric, score in agg_metrics.items():
        if metric not in res:
            continue
        expected = res[metric]
        assert abs(score - expected) < 0.001, (
            "Scores for the metric {} do not match: \nexpected: {} "
            "\nobtained: {}".format(metric, expected, score)
        )
|
||||
|
||||
|
||||
# Inputs for test_metrics_multivariate: three all-ones (5, 10, 2) arrays followed
# by three arrays stacking a 0..49 ramp and a 50..99 ramp along the last axis.
# Every entry is built separately so no two cases share an array object.
TIMESERIES_MULTIVARIATE = [
    *(np.ones((5, 10, 2), dtype=np.float64) for _ in range(3)),
    *(
        np.stack(
            [
                np.arange(0, 50, dtype=np.float64).reshape(5, 10),
                np.arange(50, 100, dtype=np.float64).reshape(5, 10),
            ],
            axis=2,
        )
        for _ in range(3)
    ),
]
|
||||
|
||||
# Expected aggregate metrics, one dict per TIMESERIES_MULTIVARIATE entry (same order).
RES_MULTIVARIATE = [
    # Written as a literal because "0_MSE" / "1_MSE" are not valid keyword names.
    {
        "MSE": 0.0, "0_MSE": 0.0, "1_MSE": 0.0, "abs_error": 0.0,
        "abs_target_sum": 15.0, "abs_target_mean": 1.0, "seasonal_error": 0.0,
        "MASE": 0.0, "sMAPE": 0.0, "MSIS": 0.0, "RMSE": 0.0, "NRMSE": 0.0,
        "ND": 0.0, "MAE_Coverage": 0.5, "m_sum_MSE": 0.0,
    },
    dict(
        MSE=0.0, abs_error=0.0, abs_target_sum=15.0, abs_target_mean=1.0,
        seasonal_error=0.0, MASE=0.0, sMAPE=0.0, MSIS=0.0, RMSE=0.0,
        NRMSE=0.0, ND=0.0, MAE_Coverage=0.5, m_sum_MSE=0.0,
    ),
    dict(
        MSE=0.0, abs_error=0.0, abs_target_sum=30.0, abs_target_mean=1.0,
        seasonal_error=0.0, MASE=0.0, sMAPE=0.0, MSIS=0.0, RMSE=0.0,
        NRMSE=0.0, ND=0.0, MAE_Coverage=0.5, m_sum_MSE=0.0,
    ),
    dict(
        MSE=4.666_666_666_666, abs_error=30.0, abs_target_sum=420.0,
        abs_target_mean=28.0, seasonal_error=1.0, MASE=2.0,
        sMAPE=0.113_254_049_3, MSIS=80.0, RMSE=2.160_246_899_469_286_9,
        NRMSE=0.077_151_674_981_045_956, ND=0.071_428_571_428_571_42,
        MAE_Coverage=0.5, m_sum_MSE=18.666_666_666_666,
    ),
    dict(
        MSE=4.666_666_666_666, abs_error=30.0, abs_target_sum=1170.0,
        abs_target_mean=78.0, seasonal_error=1.0, MASE=2.0,
        sMAPE=0.026_842_301_756_499_45, MSIS=80.0, RMSE=2.160_246_899_469_286_9,
        NRMSE=0.027_695_473_070_119_065, ND=0.025_641_025_641_025_64,
        MAE_Coverage=0.5, m_sum_MSE=18.666_666_666_666,
    ),
    dict(
        MSE=4.666_666_666_666, abs_error=60.0, abs_target_sum=1590.0,
        abs_target_mean=53.0, seasonal_error=1.0, MASE=2.0,
        sMAPE=0.070_048_175_528_249_73, MSIS=80.0, RMSE=2.160_246_899_469_286_9,
        NRMSE=0.040_759_375_461_684_65, ND=0.037_735_849_056_603_77,
        MAE_Coverage=0.5, m_sum_MSE=18.666_666_666_666,
    ),
]
|
||||
|
||||
# Per-case `has_nans` flag for test_metrics_multivariate (no case sets it).
HAS_NANS_MULTIVARIATE = [False, False, False, False, False, False]
|
||||
|
||||
# Per-case `eval_dims` argument for MultivariateEvaluator in
# test_metrics_multivariate; the last case passes None.
EVAL_DIMS = [[0], [1], [0, 1], [0], [1], None]
|
||||
|
||||
# Per-case `input_type` argument for test_metrics_multivariate.
# NOTE(review): this rebinds the module-level INPUT_TYPE assigned earlier for the
# univariate tests. Their parametrize decorators have already consumed the
# earlier list, so only code below this line sees the six-entry version.
INPUT_TYPE = [iterable, iterable, iterator, iterator, iterable, iterator]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "timeseries, res, has_nans, eval_dims, input_type",
    zip(
        TIMESERIES_MULTIVARIATE,
        RES_MULTIVARIATE,
        HAS_NANS_MULTIVARIATE,
        EVAL_DIMS,
        INPUT_TYPE,
    ),
)
def test_metrics_multivariate(timeseries, res, has_nans, eval_dims, input_type):
    """Check aggregate metrics produced by MultivariateEvaluator.

    The evaluator is restricted to ``eval_dims`` and given a ``"sum"`` target
    aggregation. Only metrics that have an entry in ``res`` are compared
    (absolute tolerance 0.001); any other reported metric is ignored.
    """
    multivariate_evaluator = MultivariateEvaluator(
        quantiles=QUANTILES,
        eval_dims=eval_dims,
        target_agg_funcs={"sum": np.sum},
    )
    agg_metrics, _ = calculate_metrics(
        timeseries,
        multivariate_evaluator,
        pd.DataFrame,
        has_nans=has_nans,
        forecaster=naive_multivariate_forecaster,
        input_type=input_type,
    )

    for metric, score in agg_metrics.items():
        if metric not in res:
            continue
        expected = res[metric]
        assert abs(score - expected) < 0.001, (
            "Scores for the metric {} do not match: \nexpected: {} "
            "\nobtained: {}".format(metric, expected, score)
        )
|
||||
|
||||
|
||||
def test_evaluation_with_QuantileForecast():
    """Evaluate a QuantileForecast that only carries the "0.5" key.

    The evaluator is asked for quantiles 0.1, 0.2 and 0.5; the test only
    requires that the aggregated ``wQuantileLoss[0.5]`` is finite.
    """
    first_day = "2012-01-11"
    observed = [2.4, 1.0, 3.0, 4.4, 5.5, 4.9] * 11
    ts = pd.Series(
        index=pd.date_range(start=first_day, freq="1D", periods=len(observed)),
        data=observed,
    )

    evaluator = Evaluator(quantiles=("0.1", "0.2", "0.5"))

    median_only = QuantileForecast(
        start_date=pd.Timestamp("2012-01-11"),
        freq="D",
        forecast_arrays=np.array([[2.4, 9.0, 3.0, 2.4, 5.5, 4.9] * 10]),
        forecast_keys=["0.5"],
    )

    agg_metric, _ = evaluator(iter([ts]), iter([median_only]))

    loss = agg_metric["wQuantileLoss[0.5]"]
    assert np.isfinite(loss)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "freq, expected_seasonality",
    [
        # H multiples
        ("1H", 24), ("H", 24), ("2H", 12), ("3H", 8), ("4H", 6), ("15H", 1),
        # B, W and M multiples
        ("5B", 1), ("1B", 5), ("2W", 1), ("3M", 4),
        # D multiples
        ("1D", 1), ("7D", 1), ("8D", 1),
    ],
)
def test_get_seasonality(freq, expected_seasonality):
    """get_seasonality must return the listed seasonality for each frequency string."""
    seasonality = get_seasonality(freq)
    assert seasonality == expected_seasonality
|
||||
@@ -1,311 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
|
||||
from pts.feature import get_lags_for_frequency
|
||||
|
||||
# These are the expected lags for common frequencies and corner cases.
|
||||
# By default all frequencies have the following lags: [1, 2, 3, 4, 5, 6, 7].
|
||||
# Remaining lags correspond to the same `season` (+/- `delta`) in previous `k` cycles.
|
||||
expected_lags = {
    # (apart from the default lags) centered around each of the last 3 hours (delta = 2)
    "min": [
        *range(1, 8),
        58, 59, 60, 61, 62,
        118, 119, 120, 121, 122,
        178, 179, 180, 181, 182,
    ],
    # centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1)
    "15min": [
        *range(1, 15),
        95, 96, 97, 191, 192, 193, 287, 288, 289, 383, 384, 385,
        479, 480, 481, 575, 576, 577, 671, 672, 673,
    ],
    # centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1)
    # + 3 weeks (delta = 1)
    "30min": [
        *range(1, 9),
        47, 48, 49, 95, 96, 97, 143, 144, 145, 191, 192, 193,
        239, 240, 241, 287, 288, 289, 335, 336, 337,
        671, 672, 673, 1007, 1008, 1009,
    ],
    # centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1)
    # + last 6 weeks (delta = 1)
    "59min": [
        *range(1, 8),
        23, 24, 25, 47, 48, 49, 72, 73, 74, 96, 97, 98,
        121, 122, 123, 145, 146, 147, 169, 170, 171,
        340, 341, 342, 511, 512, 513, 682, 683, 684, 731, 732, 733,
    ],
    # centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1)
    # + last 6 weeks (delta = 1)
    "61min": [
        *range(1, 8),
        22, 23, 24, 46, 47, 48, 69, 70, 71, 93, 94, 95,
        117, 118, 119, 140, 141, 142, 164, 165, 166,
        329, 330, 331, 494, 495, 496, 659, 660, 661, 707, 708, 709,
    ],
    # centered around each of the last 3 hours (delta = 2) + last 7 days (delta = 1)
    # + last 6 weeks (delta = 1)
    "H": [
        *range(1, 8),
        23, 24, 25, 47, 48, 49, 71, 72, 73, 95, 96, 97,
        119, 120, 121, 143, 144, 145, 167, 168, 169,
        335, 336, 337, 503, 504, 505, 671, 672, 673, 719, 720, 721,
    ],
    # centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1)
    # + last 1 month (delta = 1) + last 8th and 12th weeks (delta = 0)
    "6H": [
        *range(1, 10),
        11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28, 29,
        55, 56, 57, 83, 84, 85, 111, 112, 113,
        119, 120, 121,
        224, 336,
    ],
    # centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1)
    # + last 1 month (delta = 1) + last 8th and 12th weeks (delta = 0)
    # + last year (delta = 1)
    "12H": [
        *range(1, 16),
        27, 28, 29, 41, 42, 43, 55, 56, 57,
        59, 60, 61,
        112, 168,
        727, 728, 729,
    ],
    # centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1)
    # + last 1 month (delta = 1) + last 8th and 12th weeks (delta = 0)
    # + last 3 years (delta = 1)
    "23H": [
        *range(1, 9),
        13, 14, 15, 20, 21, 22, 28, 29,
        30, 31, 32,
        58, 87,
        378, 379, 380, 758, 759, 760, 1138, 1139, 1140,
    ],
    # centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1)
    # + last 1 month (delta = 1) + last 8th and 12th weeks (delta = 0)
    # + last 3 years (delta = 1)
    "25H": [
        *range(1, 8),
        12, 13, 14, 19, 20, 21, 25, 26, 27,
        28, 29,
        53, 80,
        348, 349, 350, 697, 698, 699, 1047, 1048, 1049,
    ],
    # centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1)
    # + last 1 month (delta = 1) + last 8th and 12th weeks (delta = 0)
    # + last 3 years (delta = 1)
    "D": [
        *range(1, 9),
        13, 14, 15, 20, 21, 22, 27, 28, 29,
        30, 31,
        56, 84,
        363, 364, 365, 727, 728, 729, 1091, 1092, 1093,
    ],
    # centered around each of the last 7 days (delta = 1) + last 4 weeks (delta = 1)
    # + last 1 month (delta = 1) + last 8th and 12th weeks (delta = 0)
    # + last 3 years (delta = 1)
    "2D": [
        *range(1, 12),
        13, 14, 15,
        16,
        28, 42,
        181, 182, 183, 363, 364, 365, 545, 546, 547,
    ],
    # centered around each of the last 3 months (delta = 0) + last 3 years (delta = 1)
    # (assuming 52 weeks per year)
    "6D": [*range(1, 8), 9, 14, 59, 60, 61, 120, 121, 122, 181, 182, 183],
    # centered around each of the last 3 months (delta = 0) + last 3 years (delta = 1)
    # (assuming 52 weeks per year)
    "W": [*range(1, 9), 12, 51, 52, 53, 103, 104, 105, 155, 156, 157],
    # centered around each of the last 3 months (delta = 0) + last 3 years (delta = 1)
    # (assuming 52 weeks per year)
    "8D": [*range(1, 8), 10, 44, 45, 46, 90, 91, 92, 135, 136, 137],
    # centered around each of the last 3 years (delta = 1)
    "4W": [*range(1, 8), 12, 13, 14, 25, 26, 27, 38, 39, 40],
    # centered around each of the last 3 years (delta = 1)
    "3W": [*range(1, 8), 16, 17, 18, 33, 34, 35, 51, 52, 53],
    # centered around each of the last 3 years (delta = 1)
    "5W": [*range(1, 8), 9, 10, 11, 19, 20, 21, 30, 31, 32],
    # centered around each of the last 3 years (delta = 1)
    "M": [*range(1, 8), 11, 12, 13, 23, 24, 25, 35, 36, 37],
    # default
    "6M": list(range(1, 8)),
    # default
    "12M": list(range(1, 8)),
}
|
||||
|
||||
# An explicit multiple of 1 ("1min", "1H", ...) shares the list of the bare unit.
for freq in ("min", "H", "D", "W", "M"):
    expected_lags["1" + freq] = expected_lags[freq]

# Frequencies with more than one spelling share the list of their equivalent.
expected_lags.update(
    {
        "60min": expected_lags["1H"],
        "24H": expected_lags["1D"],
        "7D": expected_lags["1W"],
    }
)
|
||||
|
||||
|
||||
def test_lags():
    """get_lags_for_frequency must reproduce expected_lags for every listed frequency."""
    frequencies = (
        "min", "1min", "15min", "30min", "59min", "60min", "61min",
        "H", "1H", "6H", "12H", "23H", "24H", "25H",
        "D", "1D", "2D", "6D", "7D", "8D",
        "W", "1W", "3W", "4W", "5W",
        "M", "6M", "12M",
    )
    mismatch = "lags do not match for the frequency '{}':\nexpected: {},\nprovided: {}"

    for freq_str in frequencies:
        provided = get_lags_for_frequency(freq_str)
        expected = expected_lags[freq_str]
        assert provided == expected, mismatch.format(freq_str, expected, provided)
|
||||
@@ -17,8 +17,8 @@ from torch.nn.utils import clip_grad_norm_
|
||||
from torch.optim import SGD
|
||||
from torch.utils.data import TensorDataset, DataLoader
|
||||
|
||||
from gluonts.torch.modules.distribution_output import DistributionOutput
|
||||
from pts.modules import (
|
||||
DistributionOutput,
|
||||
StudentTOutput,
|
||||
BetaOutput,
|
||||
NegativeBinomialOutput,
|
||||
|
||||
@@ -11,13 +11,14 @@ from torch.nn.utils import clip_grad_norm_
|
||||
from torch.optim import SGD
|
||||
from torch.utils.data import TensorDataset, DataLoader
|
||||
|
||||
from gluonts.dataset.repository.datasets import get_dataset
|
||||
from gluonts.evaluation import Evaluator
|
||||
from gluonts.evaluation.backtest import make_evaluation_predictions
|
||||
from gluonts.torch.modules.distribution_output import DistributionOutput
|
||||
from pts import Trainer
|
||||
from pts.dataset.repository import get_dataset
|
||||
from pts.evaluation import make_evaluation_predictions, Evaluator
|
||||
from pts.model.deepar import DeepAREstimator
|
||||
from pts.model.simple_feedforward import SimpleFeedForwardEstimator
|
||||
from pts.modules import (
|
||||
DistributionOutput,
|
||||
ImplicitQuantileOutput
|
||||
)
|
||||
|
||||
@@ -172,7 +173,7 @@ def test_training_with_implicit_quantile_output():
|
||||
)
|
||||
forecasts = list(forecast_it)
|
||||
tss = list(ts_it)
|
||||
evaluator = Evaluator()
|
||||
evaluator = Evaluator(num_workers=0)
|
||||
agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(dataset.test))
|
||||
|
||||
assert agg_metrics["MSE"] > 0
|
||||
@@ -220,7 +221,7 @@ def test_instanciation_of_args_proj():
|
||||
)
|
||||
forecasts = list(forecast_it)
|
||||
tss = list(ts_it)
|
||||
evaluator = Evaluator()
|
||||
evaluator = Evaluator(num_workers=0)
|
||||
agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(dataset.test))
|
||||
assert distr_output.method_calls == 2
|
||||
|
||||
|
||||
@@ -1,808 +0,0 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
# A copy of the License is located at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is distributed
|
||||
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
# express or implied. See the License for the specific language governing
|
||||
# permissions and limitations under the License.
|
||||
|
||||
# Standard library imports
|
||||
from typing import Tuple
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from pts import transform
|
||||
|
||||
# First-party imports
|
||||
from pts.dataset import (
|
||||
ProcessStartField,
|
||||
FieldName,
|
||||
ListDataset,
|
||||
DataEntry,
|
||||
calculate_dataset_statistics,
|
||||
ScaleHistogram,
|
||||
)
|
||||
from pts.feature import time_feature
|
||||
|
||||
# Frequency string "1D". NOTE(review): not referenced in the visible part of this
# file - confirm it is still needed.
FREQ = "1D"
|
||||
|
||||
# Parameter grid shared by the parametrized transformation tests below.
# NOTE(review): the random targets are drawn without a seed, so their values
# differ from run to run.
TEST_VALUES = dict(
    is_train=[True, False],
    target=[np.zeros(0), np.random.rand(13), np.random.rand(100)],
    start=[
        ProcessStartField.process("2012-01-02", freq="1D"),
        ProcessStartField.process("1994-02-19 20:01:02", freq="3D"),
    ],
    use_prediction_features=[True, False],
    allow_target_padding=[True, False],
)
|
||||
|
||||
|
||||
def test_align_timestamp():
    """ProcessStartField.process must align timestamps to the given frequency.

    Each case is (input timestamp, frequency, expected ``str()`` of the result).
    """
    cases = [
        ("2012-03-05 09:13:12", "min", "2012-03-05 09:13:00"),
        ("2012-03-05 09:13:12", "2min", "2012-03-05 09:12:00"),
        ("2012-03-05 09:13:12", "H", "2012-03-05 09:00:00"),
        ("2012-03-05 09:13:12", "D", "2012-03-05 00:00:00"),
        ("2012-03-05 09:13:12", "W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:13:12", "4W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:13:12", "M", "2012-03-31 00:00:00"),
        ("2012-03-05 09:13:12", "3M", "2012-03-31 00:00:00"),
        ("2012-03-05 09:13:12", "Y", "2012-12-31 00:00:00"),
        ("2012-03-05 09:14:11", "min", "2012-03-05 09:14:00"),
        ("2012-03-05 09:14:11", "2min", "2012-03-05 09:14:00"),
        ("2012-03-05 09:14:11", "H", "2012-03-05 09:00:00"),
        ("2012-03-05 09:14:11", "D", "2012-03-05 00:00:00"),
        ("2012-03-05 09:14:11", "W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:14:11", "4W", "2012-03-11 00:00:00"),
        ("2012-03-05 09:14:11", "M", "2012-03-31 00:00:00"),
        ("2012-03-05 09:14:11", "3M", "2012-03-31 00:00:00"),
    ]

    # The whole table is checked twice - presumably so a second call with the
    # same arguments is exercised as well (TODO confirm the intent).
    for _ in range(2):
        for date_str, freq, expected in cases:
            aligned = str(ProcessStartField.process(date_str, freq=freq))
            assert aligned == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_AddTimeFeatures(start, target, is_train: bool):
    """AddTimeFeatures must emit one row per time feature over the right span.

    With ``is_train`` the output covers ``len(target)`` steps; otherwise it is
    extended by ``pred_length``. Row 0 must equal DayOfWeek and row 1
    DayOfMonth, both evaluated on a date range starting at ``start``.
    """
    pred_length = 13
    t = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="myout",
        pred_length=pred_length,
        time_features=[time_feature.DayOfWeek(), time_feature.DayOfMonth()],
    )

    data = {"start": start, "target": target}
    res = t.map_transform(data, is_train=is_train)
    mat = res["myout"]
    expected_length = len(target) + (0 if is_train else pred_length)
    assert mat.shape == (2, expected_length)
    tmp_idx = pd.date_range(start=start, freq=start.freq, periods=expected_length)
    # np.all replaces the np.alltrue alias, which NumPy 2.0 removed.
    assert np.all(mat[0] == time_feature.DayOfWeek()(tmp_idx))
    assert np.all(mat[1] == time_feature.DayOfMonth()(tmp_idx))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_AddTimeFeatures_empty_time_features(start, target, is_train: bool):
    """With an empty ``time_features`` list the output field must be None."""
    add_time_features = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="myout",
        pred_length=13,
        time_features=[],
    )

    transformed = add_time_features.map_transform(
        {"start": start, "target": target}, is_train=is_train
    )

    assert transformed["myout"] is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_AddAgeFeatures(start, target, is_train: bool):
    """AddAgeFeature with ``log_scale=True`` must yield log10(2 + step index).

    The feature spans ``len(target)`` steps when training and ``pred_length``
    more otherwise.
    """
    pred_length = 13
    add_age = transform.AddAgeFeature(
        pred_length=pred_length,
        target_field=FieldName.TARGET,
        output_field="age",
        log_scale=True,
    )

    out = add_age.map_transform({"start": start, "target": target}, is_train=is_train)

    if is_train:
        expected_length = len(target)
    else:
        expected_length = len(target) + pred_length
    assert out["age"].shape[-1] == expected_length

    expected_age = np.log10(2.0 + np.arange(expected_length)).reshape(
        (1, expected_length)
    )
    assert np.allclose(out["age"], expected_age)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pick_incomplete", TEST_VALUES["allow_target_padding"])
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
def test_InstanceSplitter(start, target, is_train: bool, pick_incomplete: bool):
    """Check instance count and field lengths produced by InstanceSplitter.

    Outside training, a target shorter than ``past_length`` with
    ``pick_incomplete`` disabled is expected to raise AssertionError.
    """
    train_length, pred_length = 100, 13
    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.UniformSplitSampler(p=1.0),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["some_time_feature"],
        pick_incomplete=pick_incomplete,
    )

    entry = {
        "start": start,
        "target": target,
        "some_time_feature": np.arange(len(target) + 100),
        "some_other_col": "ABC",
    }

    if not (is_train or pick_incomplete) and len(target) < train_length:
        with pytest.raises(AssertionError):
            list(splitter.flatmap_transform(entry, is_train=is_train))
        return

    instances = list(splitter.flatmap_transform(entry, is_train=is_train))

    if is_train:
        required_past = 0 if pick_incomplete else train_length
        expected_count = max(0, len(target) - pred_length + 1 - required_past)
    else:
        expected_count = 1
    assert len(instances) == expected_count

    # The future target is empty outside training; the future feature is not.
    expected_future_target = pred_length if is_train else 0
    for instance in instances:
        assert "target" not in instance
        assert "some_time_feature" not in instance
        assert "some_other_col" in instance

        assert len(instance["past_some_time_feature"]) == train_length
        assert len(instance["past_target"]) == train_length

        assert len(instance["future_target"]) == expected_future_target
        assert len(instance["future_some_time_feature"]) == pred_length
|
||||
|
||||
# expected_length = len(target) + (0 if is_train else pred_length)
|
||||
# assert len(out['age']) == expected_length
|
||||
# assert np.alltrue(out['age'] == np.log10(2.0 + np.arange(expected_length)))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
@pytest.mark.parametrize("target", TEST_VALUES["target"])
@pytest.mark.parametrize("start", TEST_VALUES["start"])
@pytest.mark.parametrize(
    "use_prediction_features", TEST_VALUES["use_prediction_features"]
)
@pytest.mark.parametrize("allow_target_padding", TEST_VALUES["allow_target_padding"])
def test_CanonicalInstanceSplitter(
    start,
    target,
    is_train: bool,
    use_prediction_features: bool,
    allow_target_padding: bool,
):
    """Check instance count and field lengths produced by CanonicalInstanceSplitter.

    In training, at least one instance is expected when target padding is
    allowed; outside training exactly one instance is expected.
    """
    train_length, pred_length = 100, 13
    splitter = transform.CanonicalInstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=transform.UniformSplitSampler(p=1.0),
        instance_length=train_length,
        prediction_length=pred_length,
        time_series_fields=["some_time_feature"],
        allow_target_padding=allow_target_padding,
        use_prediction_features=use_prediction_features,
    )

    entry = {
        "start": start,
        "target": target,
        "some_time_feature": np.arange(len(target) + 100),
        "some_other_col": "ABC",
    }

    instances = list(splitter.flatmap_transform(entry, is_train=is_train))

    if is_train:
        min_num_instances = 1 if allow_target_padding else 0
        expected_count = max(min_num_instances, len(target) - train_length + 1)
    else:
        expected_count = 1
    assert len(instances) == expected_count

    check_future_feature = use_prediction_features and not is_train
    for instance in instances:
        assert "target" not in instance
        assert "future_target" not in instance
        assert "some_time_feature" not in instance
        assert "some_other_col" in instance

        assert len(instance["past_some_time_feature"]) == train_length
        assert len(instance["past_target"]) == train_length

        if check_future_feature:
            assert len(instance["future_some_time_feature"]) == pred_length
|
||||
|
||||
|
||||
def test_Transformation():
    """Run a five-step transformation chain over a one-series dataset in training mode.

    NOTE(review): every produced entry is only printed; nothing is asserted, so
    the test can fail only if the chain raises.
    """
    train_length = 100
    pred_length = 10

    ds = ListDataset(
        [{"start": "2012-01-01", "target": [0.2] * train_length}], freq="1D"
    )

    add_time_features = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=FieldName.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=FieldName.TARGET, output_field="observed_values"
    )
    stack_features = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    split_instances = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
    )

    chain = transform.Chain(
        trans=[
            add_time_features,
            add_age,
            add_observed,
            stack_features,
            split_instances,
        ]
    )

    for entry in chain(iter(ds), is_train=True):
        print(entry)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_train", TEST_VALUES["is_train"])
def test_multi_dim_transformation(is_train):
    """
    Run the feature chain over a two-dimensional target containing NaNs.

    The checks are the same in training and prediction mode except for the
    expected ``future_target`` shape: ``(2, 2)`` when training, ``(2, 0)``
    otherwise.
    """
    train_length = 10
    pred_length = 2

    # The dataset is built with the string "NaN" in the missing slots ...
    first_dim: list = list(np.arange(1, 11, 1))
    first_dim[-1] = "NaN"

    second_dim: list = list(np.arange(11, 21, 1))
    second_dim[0] = "NaN"

    ds = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": [first_dim, second_dim]}],
        freq="1D",
        one_dim_target=False,
    )

    # ... and the local lists are switched to real NaNs afterwards so they can
    # serve as the reference values in the assertions below.
    first_dim[-1] = np.nan
    second_dim[0] = np.nan

    add_time_features = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=FieldName.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=FieldName.TARGET,
        output_field="observed_values",
        convert_nans=False,
    )
    stack_features = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    split_instances = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
        time_first=False,
    )

    t = transform.Chain(
        trans=[
            add_time_features,
            add_age,
            add_observed,
            stack_features,
            split_instances,
        ]
    )

    training = bool(is_train)
    expected_future_shape = (2, 2) if training else (2, 0)

    for u in t(iter(ds), is_train=training):
        assert_shape(u["past_target"], (2, 10))
        assert_shape(u["past_dynamic_feat"], (4, 10))
        assert_shape(u["past_observed_values"], (2, 10))
        assert_shape(u["future_target"], expected_future_shape)

        assert_padded_array(
            u["past_observed_values"],
            np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9]),
            u["past_is_pad"],
        )
        assert_padded_array(
            u["past_target"], np.array([first_dim, second_dim]), u["past_is_pad"],
        )
|
||||
|
||||
|
||||
def test_ExpectedNumInstanceSampler():
    """
    Check the scale histogram of instances drawn via
    ``ExpectedNumInstanceSampler``.

    The dataset from ``make_dataset`` is split ``repetition`` times in
    training mode; bucket ``i`` (for ``1 <= i < N``) is expected to be hit
    exactly ``2 ** i * repetition`` times.
    """
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        pick_incomplete=True,
    )
    t = transform.Chain(trans=[splitter])

    scale_hist = ScaleHistogram()

    repetition = 2
    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            past = data["past_target"]
            # for simplicity, discard values that are zeros to avoid
            # confusion with padding
            scale_hist.add(past[past > 0])

    expected_values = {
        bucket: 2 ** bucket * repetition for bucket in range(1, N)
    }

    assert expected_values == scale_hist.bin_counts
|
||||
|
||||
|
||||
def test_BucketInstanceSampler():
    """
    Check that ``BucketInstanceSampler`` yields roughly equal counts per
    scale bucket.

    The sampler is built from the dataset's own scale histogram. After
    ``repetition`` training passes, each bucket ``i`` (for ``1 <= i < N``) is
    expected to hold about ``repetition`` samples, within a 30% tolerance.
    """
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    dataset_stats = calculate_dataset_statistics(ds)

    t = transform.Chain(
        trans=[
            transform.InstanceSplitter(
                target_field=FieldName.TARGET,
                is_pad_field=FieldName.IS_PAD,
                start_field=FieldName.START,
                forecast_start_field=FieldName.FORECAST_START,
                train_sampler=transform.BucketInstanceSampler(
                    dataset_stats.scale_histogram
                ),
                past_length=train_length,
                future_length=pred_length,
                pick_incomplete=True,
            )
        ]
    )

    scale_hist = ScaleHistogram()

    repetition = 200
    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            target_values = data["past_target"]
            # for simplicity, discard values that are zeros to avoid
            # confusion with padding
            target_values = target_values[target_values > 0]
            scale_hist.add(target_values)

    expected_values = {i: repetition for i in range(1, N)}
    found_values = scale_hist.bin_counts

    for i in range(1, N):
        # The absolute deviation must stay below 30% of the expected count.
        # The comparison previously sat inside abs(), which took abs() of a
        # boolean instead of the difference.
        deviation = abs(expected_values[i] - found_values[i])
        assert deviation < expected_values[i] * 0.3
|
||||
|
||||
|
||||
def test_cdf_to_gaussian_transformation():
    """
    Round-trip check for ``CDFtoGaussianTransform``.

    The transformed ``past_target_cdf`` is fed back through
    ``cdf_to_gaussian_forward_transform`` as if it were model output, and the
    result must be close to the original target.
    """

    def make_test_data():
        # Shuffled 1-D values, duplicated into a two-dimensional target.
        target = np.array(
            [0, 0, 0, 0, 10, 10, 20, 20, 30, 30, 40, 50, 59, 60, 60, 70, 80, 90, 100,]
        ).tolist()
        np.random.shuffle(target)

        length = len(target)
        multi_dim_target = np.array([target, target]).transpose()
        past_is_pad = np.array([[0] * length]).transpose()
        past_observed_target = np.array([[1] * length, [1] * length]).transpose()

        # Mimic output from InstanceSplitter
        entry = {
            "start": "2012-01-01",
            "target": multi_dim_target,
            "past_target": multi_dim_target,
            "future_target": multi_dim_target,
            "past_is_pad": past_is_pad,
            f"past_{FieldName.OBSERVED_VALUES}": past_observed_target,
        }
        return ListDataset(data_iter=[entry], freq="1D", one_dim_target=False)

    def make_fake_output(u: DataEntry):
        # Prepend two singleton axes to the transformed past target.
        with_one_axis = np.expand_dims(u["past_target_cdf"], axis=0)
        return np.expand_dims(with_one_axis, axis=0)

    ds = make_test_data()

    cdf_transform = transform.CDFtoGaussianTransform(
        target_field=FieldName.TARGET,
        observed_values_field=FieldName.OBSERVED_VALUES,
        max_context_length=20,
        target_dim=2,
    )
    t = transform.Chain(trans=[cdf_transform])

    for u in t(iter(ds), is_train=False):
        fake_output = make_fake_output(u)

        # Fake transformation chain output: add a leading axis and convert
        # the auxiliary fields to tensors.
        u["past_target_sorted"] = torch.tensor(
            np.expand_dims(u["past_target_sorted"], axis=0)
        )
        u["slopes"] = torch.tensor(np.expand_dims(u["slopes"], axis=0))
        u["intercepts"] = torch.tensor(np.expand_dims(u["intercepts"], axis=0))

        back_transformed = transform.cdf_to_gaussian_forward_transform(
            u, fake_output
        )

        # fake_output carries two leading singleton axes; index both away.
        back_transformed = back_transformed[0][0]

        original_target = u["target"]

        # Original target and back-transformed target should be the same
        assert np.allclose(original_target, back_transformed)
|
||||
|
||||
|
||||
def test_gaussian_cdf():
    """
    Compare ``CDFtoGaussianTransform.standard_gaussian_cdf`` with
    ``scipy.stats.norm.cdf`` on extreme and densely spaced inputs.

    The test is skipped when scipy cannot be imported.
    """
    try:
        from scipy.stats import norm
    except ImportError:
        # Only a missing scipy should skip the test; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        pytest.skip("scipy not installed skipping test for erf")

    x = np.array(
        [-1000, -100, -10] + np.linspace(-2, 2, 1001).tolist() + [10, 100, 1000]
    )
    y_gluonts = transform.CDFtoGaussianTransform.standard_gaussian_cdf(x)
    y_scipy = norm.cdf(x)

    assert np.allclose(y_gluonts, y_scipy, atol=1e-7)
|
||||
|
||||
|
||||
def test_gaussian_ppf():
    """
    Compare ``CDFtoGaussianTransform.standard_gaussian_ppf`` with
    ``scipy.stats.norm.ppf`` on 1001 probabilities in ``[0.0001, 0.9999]``.

    The test is skipped when scipy cannot be imported.
    """
    try:
        from scipy.stats import norm
    except ImportError:
        # Only a missing scipy should skip the test; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        pytest.skip("scipy not installed skipping test for erf")

    x = np.linspace(0.0001, 0.9999, 1001)
    y_gluonts = transform.CDFtoGaussianTransform.standard_gaussian_ppf(x)
    y_scipy = norm.ppf(x)

    assert np.allclose(y_gluonts, y_scipy, atol=1e-7)
|
||||
|
||||
|
||||
def test_target_dim_indicator():
    """
    ``TargetDimIndicator`` on a four-row target must write ``[0, 1, 2, 3]``
    into the ``"target_dimensions"`` field.
    """
    target = np.array([0, 2, 3, 10]).tolist()
    multi_dim_target = np.array([target, target, target, target])

    dataset = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": multi_dim_target}],
        freq="1D",
        one_dim_target=False,
    )

    indicator = transform.TargetDimIndicator(
        target_field=FieldName.TARGET, field_name="target_dimensions"
    )
    t = transform.Chain(trans=[indicator])

    expected_dimensions = np.array([0, 1, 2, 3])
    for data_entry in t(dataset, is_train=True):
        assert (data_entry["target_dimensions"] == expected_dimensions).all()
|
||||
|
||||
|
||||
@pytest.fixture
def point_process_dataset():
    """
    Fixture: a one-entry ``ListDataset`` describing a marked point process.

    The target has two rows, inter-arrival times and integer marks, and the
    entry spans 2011-01-01 00:00 to 03:00 at hourly frequency.
    """
    ia_times = np.array([0.2, 0.7, 0.2, 0.5, 0.3, 0.3, 0.2, 0.1])
    marks = np.array([0, 1, 2, 0, 1, 2, 2, 2])

    entry = {
        # Row 0: inter-arrival times, row 1: marks.
        "target": np.c_[ia_times, marks].T,
        "start": pd.Timestamp("2011-01-01 00:00:00", freq="H"),
        "end": pd.Timestamp("2011-01-01 03:00:00", freq="H"),
    }

    return ListDataset([entry], freq="H", one_dim_target=False)
|
||||
|
||||
|
||||
class MockContinuousTimeSampler(transform.ContinuousTimePointSampler):
    """
    Test double that ignores its call arguments and always returns the
    values it was constructed with.
    """

    # noinspection PyMissingConstructor,PyUnusedLocal
    def __init__(self, ret_values, *args, **kwargs):
        # The parent constructor is deliberately not called; any additional
        # arguments are accepted and discarded.
        self._ret_values = ret_values

    def __call__(self, *args, **kwargs):
        """Return the stored values as a new numpy array."""
        return np.array(self._ret_values)
|
||||
|
||||
|
||||
def test_ctsplitter_mask_sorted(point_process_dataset):
    """
    Check the indices returned by
    ``ContinuousTimeInstanceSplitter._mask_sorted`` for a lower bound of 1
    and an upper bound of 2.

    The results are compared for exact equality. A ``zip``-based element
    comparison stops at the shorter sequence and would accept an empty or
    truncated result.
    """
    d = next(iter(point_process_dataset))

    ia_times = d["target"][0, :]

    # Cumulative inter-arrival times, i.e. sorted arrival times.
    ts = np.cumsum(ia_times)

    splitter = transform.ContinuousTimeInstanceSplitter(
        2, 1, train_sampler=transform.ContinuousTimeUniformSampler(num_instances=10),
    )

    # no boundary conditions
    res = splitter._mask_sorted(ts, 1, 2)
    assert np.array_equal(res, [2, 3, 4])

    # lower bound equal, exclusive of upper bound
    res = splitter._mask_sorted(np.array([1, 2, 3, 4, 5, 6]), 1, 2)
    assert np.array_equal(res, [0])
|
||||
|
||||
|
||||
def test_ctsplitter_no_train_last_point(point_process_dataset):
    """
    In prediction mode the splitter must emit only ``past_*`` fields, with
    six valid past points for this fixture.
    """
    sampler = transform.ContinuousTimeUniformSampler(num_instances=10)
    splitter = transform.ContinuousTimeInstanceSplitter(
        2, 1, train_sampler=sampler,
    )

    d_out = next(iter(splitter(point_process_dataset, is_train=False)))

    # No future fields are produced when not training ...
    assert "future_target" not in d_out
    assert "future_valid_length" not in d_out
    # ... but the past fields are.
    assert "past_target" in d_out
    assert "past_valid_length" in d_out

    assert d_out["past_valid_length"] == 6

    expected_ia_times = [0.1, 0.5, 0.3, 0.3, 0.2, 0.1]
    assert np.allclose(expected_ia_times, d_out["past_target"][..., 0], atol=0.01)
|
||||
|
||||
|
||||
def test_ctsplitter_train_correct(point_process_dataset):
    """
    With a mocked sampler returning 1.01, 1.5 and 1.99, check the valid
    lengths of all three training instances and the exact past/future
    targets of the first one.
    """
    mock_sampler = MockContinuousTimeSampler(
        ret_values=[1.01, 1.5, 1.99], num_instances=3
    )
    splitter = transform.ContinuousTimeInstanceSplitter(
        1, 1, train_sampler=mock_sampler,
    )

    outputs = list(splitter(point_process_dataset, is_train=True))
    first, second, third = outputs[0], outputs[1], outputs[2]

    assert first["past_valid_length"] == 2
    assert first["future_valid_length"] == 3

    expected_past = np.array([[0.19, 0.7], [0, 1]]).T
    expected_future = np.array([[0.09, 0.5, 0.3], [2, 0, 1]]).T
    assert np.allclose(first["past_target"], expected_past)
    assert np.allclose(first["future_target"], expected_future)

    assert second["past_valid_length"] == 2
    assert second["future_valid_length"] == 4

    assert third["past_valid_length"] == 3
    assert third["future_valid_length"] == 3
|
||||
|
||||
|
||||
def test_ctsplitter_train_correct_out_count(point_process_dataset):
    """
    Five shuffled copies of the fixture entry, each split at three mocked
    sample points, must yield ``5 * 3`` training instances.
    """

    # produce new TPP data by shuffling existing TS instance
    def shuffle_iterator(num_duplications=5):
        for entry in point_process_dataset:
            for _ in range(num_duplications):
                shuffled = dict(entry)
                # Permute along the second axis, keeping each column intact.
                shuffled["target"] = np.random.permutation(shuffled["target"].T).T
                yield shuffled

    mock_sampler = MockContinuousTimeSampler(
        ret_values=[1.01, 1.5, 1.99], num_instances=3
    )
    splitter = transform.ContinuousTimeInstanceSplitter(
        1, 1, train_sampler=mock_sampler,
    )

    outputs = list(splitter(shuffle_iterator(), is_train=True))

    assert len(outputs) == 5 * 3
|
||||
|
||||
|
||||
def test_ctsplitter_train_samples_correct_times(point_process_dataset):
    """
    Every ``forecast_start`` produced in training mode must lie between
    01:15 and 01:45 (inclusive) on 2011-01-01.
    """
    splitter = transform.ContinuousTimeInstanceSplitter(
        1.25, 1.25, train_sampler=transform.ContinuousTimeUniformSampler(20)
    )

    earliest = pd.Timestamp("2011-01-01 01:15:00")
    latest = pd.Timestamp("2011-01-01 01:45:00")

    for d in splitter(point_process_dataset, is_train=True):
        assert earliest <= d["forecast_start"] <= latest
|
||||
|
||||
|
||||
def test_ctsplitter_train_short_intervals(point_process_dataset):
    """
    With 0.01-long past and future intervals, every training instance must
    have zero valid lengths and empty past/future targets.
    """
    mock_sampler = MockContinuousTimeSampler(
        ret_values=[1.01, 1.5, 1.99], num_instances=3
    )
    splitter = transform.ContinuousTimeInstanceSplitter(
        0.01, 0.01, train_sampler=mock_sampler,
    )

    for d in splitter(point_process_dataset, is_train=True):
        assert d["future_valid_length"] == d["past_valid_length"] == 0

        # An array with any zero-length axis has a shape product of zero.
        past_size = np.prod(np.shape(d["past_target"]))
        future_size = np.prod(np.shape(d["future_target"]))
        assert past_size == 0
        assert future_size == 0
|
||||
|
||||
|
||||
def make_dataset(N, train_length):
    """
    Build a daily ``ListDataset`` of ``2 ** N - 1`` constant series.

    Series ``i`` has length ``train_length`` and holds the value ``i`` at
    every step; all series start on 2012-01-01.
    """
    # generates 2 ** N - 1 timeseries with constant increasing values
    n = 2 ** N - 1

    targets = np.ones((n, train_length))
    for index, row in enumerate(targets):
        # ``row`` is a view into ``targets``, so this scales in place.
        row *= index

    entries = [{"start": "2012-01-01", "target": targets[i, :]} for i in range(n)]

    return ListDataset(data_iter=entries, freq="1D")
|
||||
|
||||
|
||||
def assert_shape(array: np.array, reference_shape: Tuple[int, int]):
    """
    Assert that ``array.shape`` equals ``reference_shape``.

    Raises ``AssertionError`` with both shapes in the message on mismatch.
    """
    actual_shape = array.shape
    message = f"Shape should be {reference_shape} but found {actual_shape}."
    assert actual_shape == reference_shape, message
|
||||
|
||||
|
||||
def assert_padded_array(
    sampled_array: np.array, reference_array: np.array, padding_array: np.array
):
    """
    Assert that the non-padded part of ``sampled_array`` matches
    ``reference_array``.

    ``padding_array`` is summed to get the number of padded leading columns.
    Those columns are dropped from the sample, and the reference is rolled
    right by the same amount and trimmed, before the two are compared
    element-wise. NaNs in matching positions count as equal.

    Neither input array is modified.

    Raises
    ------
    AssertionError
        If the compared parts differ.
    """
    num_padded = int(np.sum(padding_array))

    # Copy the slice: the NaN replacement below would otherwise write
    # through the view into the caller's array.
    sampled_no_padding = np.array(sampled_array[:, num_padded:])

    # np.roll returns a new array, so the caller's reference is untouched.
    rolled_reference = np.roll(reference_array, num_padded, axis=1)
    reference_no_padding = rolled_reference[:, num_padded:]

    # Convert nans to dummy value for assertion because
    # np.nan == np.nan -> False.
    reference_no_padding[np.isnan(reference_no_padding)] = 9999.0
    sampled_no_padding[np.isnan(sampled_no_padding)] = 9999.0

    reference_no_padding = np.array(reference_no_padding, dtype=np.float32)

    assert (sampled_no_padding == reference_no_padding).all(), (
        f"Sampled and reference arrays do not match. "
        f"Got {sampled_no_padding} but should be {reference_no_padding}."
    )
|
||||
Reference in New Issue
Block a user