Files
catalyst/zipline/modelling/term.py
T
Scott Sanderson b89fc0c028 BUG: Fix error from RequiredWindowLengthMixin.
WindowLengthNotSpecified expects an argument.
2015-08-04 01:41:03 -04:00

293 lines
8.8 KiB
Python

"""
Base class for Filters, Factors and Classifiers
"""
from numpy import (
empty,
float64,
full,
nan,
)
from weakref import WeakValueDictionary
from zipline.errors import (
InputTermNotAtomic,
TermInputsNotSpecified,
WindowLengthNotPositive,
WindowLengthNotSpecified,
)
from zipline.utils.lazyval import lazyval
NotSpecified = (object(),)
class Term(object):
"""
Base class for terms in an FFC API compute graph.
"""
inputs = NotSpecified
window_length = NotSpecified
domain = None
dtype = float64
_term_cache = WeakValueDictionary()
def __new__(cls,
inputs=None,
window_length=None,
domain=None,
dtype=None,
*args,
**kwargs):
"""
Memoized constructor for Terms.
Caching previously-constructed Terms is useful because it allows us to
only compute equivalent sub-expressions once when traversing an FFC
dependency graph.
Caching previously-constructed Terms is **sane** because terms and
their inputs are both conceptually immutable.
"""
if inputs is None:
inputs = tuple(cls.inputs)
else:
inputs = tuple(inputs)
if window_length is None:
window_length = cls.window_length
if domain is None:
domain = cls.domain
if dtype is None:
dtype = cls.dtype
identity = cls.static_identity(
inputs=inputs,
window_length=window_length,
domain=domain,
dtype=dtype,
*args, **kwargs
)
try:
return cls._term_cache[identity]
except KeyError:
new_instance = cls._term_cache[identity] = \
super(Term, cls).__new__(cls)._init(
inputs=inputs,
window_length=window_length,
domain=domain,
dtype=dtype,
*args, **kwargs
)
return new_instance
def __init__(self, *args, **kwargs):
"""
Noop constructor to play nicely with our caching __new__. Subclasses
should implement _init instead of this method.
When a class' __new__ returns an instance of that class, Python will
automatically call __init__ on the object, even if a new object wasn't
actually constructed. Because we memoize instances, we often return an
object that was already initialized from __new__, in which case we
don't want to call __init__ again.
Subclasses that need to initialize new instances should override _init,
which is guaranteed to be called only once.
"""
pass
def _init(self, inputs, window_length, domain, dtype):
self.inputs = inputs
self.window_length = window_length
self.domain = domain
self.dtype = dtype
self._validate()
return self
@classmethod
def static_identity(cls, inputs, window_length, domain, dtype):
"""
Return the identity of the Term that would be constructed from the
given arguments.
Identities that compare equal will cause us to return a cached instance
rather than constructing a new one. We do this primarily because it
makes dependency resolution easier.
This is a classmethod so that it can be called from Term.__new__ to
determine whether to produce a new instance.
"""
return (cls, inputs, window_length, domain, dtype)
def _validate(self):
"""
Assert that this term is well-formed. This should be called exactly
once, at the end of Term._init().
"""
if self.inputs is NotSpecified:
raise TermInputsNotSpecified(termname=type(self).__name__)
if self.window_length is NotSpecified:
raise WindowLengthNotSpecified(termname=type(self).__name__)
if self.window_length:
for child in self.inputs:
if not child.atomic:
raise InputTermNotAtomic(parent=self, child=child)
@lazyval
def atomic(self):
"""
Whether or not this term has dependencies.
If term.atomic is truthy, it should have dataset and dtype attributes.
"""
return len(self.inputs) == 0
@lazyval
def windowed(self):
"""
Whether or not this term represents a trailing window computation.
If term.windowed is truthy, its compute_from_windows method will be
called with instances of AdjustedArray as inputs.
If term.windowed is falsey, its compute_from_baseline will be called
with instances of np.ndarray as inputs.
"""
return (
self.window_length is not NotSpecified
and self.window_length > 0
)
@lazyval
def extra_input_rows(self):
"""
The number of extra rows needed for each of our inputs to compute this
term.
"""
return max(0, self.window_length - 1)
def compute_from_windows(self, windows, mask):
"""
Subclasses should implement this for computations requiring moving
windows of continually-adjusting data.
"""
raise NotImplementedError()
def compute_from_arrays(self, arrays, mask):
"""
Subclasses should implement this for computations that can be expressed
directly as array computations.
"""
raise NotImplementedError()
def __repr__(self):
return (
"{type}({inputs}, window_length={window_length})"
).format(
type=type(self).__name__,
inputs=self.inputs,
window_length=self.window_length,
)
# TODO: Move mixins to a separate file?
class SingleInputMixin(object):
def _validate(self):
num_inputs = len(self.inputs)
if num_inputs != 1:
raise ValueError(
"{typename} expects only one input, "
"but received {num_inputs} instead.".format(
typename=type(self).__name__,
num_inputs=num_inputs
)
)
return super(SingleInputMixin, self)._validate()
class RequiredWindowLengthMixin(object):
def _validate(self):
if self.windowed or self.window_length is NotSpecified:
return super(RequiredWindowLengthMixin, self)._validate()
raise WindowLengthNotPositive(window_length=self.window_length)
class CustomTermMixin(object):
"""
Mixin for user-defined rolling-window Terms.
Implements `compute_from_windows` in terms of a user-defined `compute`
function, which is mapped over the input windows.
Used by CustomFactor, CustomFilter, CustomClassifier, etc.
"""
def compute(self, today, assets, out, *arrays):
"""
Override this method with a function that writes a value into `out`.
"""
raise NotImplementedError()
def compute_from_windows(self, windows, mask):
"""
Call the user's `compute` function on each window with a pre-built
output array.
"""
# TODO: Make mask available to user's `compute`.
compute = self.compute
dates, assets = mask.index, mask.columns
out = full(mask.shape, nan, dtype=self.dtype)
with self.ctx:
# TODO: Consider pre-filtering columns that are all-nan at each
# time-step?
for idx, date in enumerate(dates):
compute(
date,
assets,
out[idx],
*(next(w) for w in windows)
)
out[~mask.values] = nan
return out
class TestingTermMixin(object):
"""
Mixin for Term subclasses testing engines that asserts all inputs are
correctly shaped.
Used by TestingTerm, TestingFilter, TestingClassifier, etc.
"""
def compute_from_windows(self, windows, mask):
assert self.window_length > 0
dates, assets = mask.index, mask.columns
outbuf = empty(mask.shape, dtype=self.dtype)
for idx, _ in enumerate(dates):
result = self.from_windows(*(next(w) for w in windows))
assert result.shape == (len(assets),)
outbuf[idx] = result
for window in windows:
try:
next(window)
except StopIteration:
pass
else:
raise AssertionError("window %s was not exhausted" % window)
return outbuf
def compute_from_arrays(self, arrays, mask):
assert self.window_length == 0
outbuf = empty(mask.shape, dtype=self.dtype)
for array in arrays:
assert array.shape == outbuf.shape
outbuf[:] = self.from_arrays(*arrays)
return outbuf