Files
catalyst/zipline/modelling/factor/factor.py
T
Scott Sanderson 8eb09f70d7 BUG: Don't crash when raising UnsupportedDataType.
Before this was raising a KeyError on failure to format the message
properly.
2015-09-21 13:20:26 -04:00

449 lines
13 KiB
Python

"""
factor.py
"""
from operator import attrgetter

from numpy import (
    apply_along_axis,
    float64,
    isnan,
    nan,
)
from scipy.stats import rankdata

from zipline.errors import (
    UnknownRankMethod,
    UnsupportedDataType,
)
from zipline.lib.rank import rankdata_2d_ordinal
from zipline.modelling.term import (
    CustomTermMixin,
    NotSpecified,
    RequiredWindowLengthMixin,
    SingleInputMixin,
    Term,
)
from zipline.modelling.expression import (
    BadBinaryOperator,
    COMPARISONS,
    is_comparison,
    MATH_BINOPS,
    method_name_for_op,
    NUMERIC_TYPES,
    NumericalExpression,
    NUMEXPR_MATH_FUNCS,
    UNARY_OPS,
)
from zipline.modelling.filter import (
    NumExprFilter,
    PercentileFilter,
)
from zipline.utils.control_flow import nullctx
_RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal'])
def binop_return_type(op):
    """
    Pick the expression class produced by applying the binary operator `op`.

    Returns NumExprFilter when `is_comparison(op)` is true and NumExprFactor
    otherwise.
    """
    return NumExprFilter if is_comparison(op) else NumExprFactor
def binary_operator(op):
    """
    Build the left-binding method implementing the binary operator `op` on a
    Factor subclass (the kind of function used for methods like __add__).

    The generated method constructs its result with the class chosen by
    `binop_return_type(op)`, except when `other` is a NumExprFactor and
    `self` is not, in which case it defers to the commuted operator method
    of `other`.  It raises BadBinaryOperator when `self` is not a
    NumExprFactor and `other` is neither a Factor nor an instance of
    NUMERIC_TYPES.
    """
    # Resolved once per operator.  Used to hand the operation over to the
    # commuted implementation on a NumExprFactor operand, which knows how to
    # merge inputs.
    get_commuted_method = attrgetter(method_name_for_op(op, commute=True))

    def binary_operator(self, other):
        # Must be looked up at call time: the classes returned by
        # binop_return_type do not exist yet when this factory is invoked
        # from the class body of Factor.
        result_type = binop_return_type(op)

        if isinstance(self, NumExprFactor):
            lhs, rhs, merged_inputs = self.build_binary_op(op, other)
            combined = "({left}) {op} ({right})".format(
                left=lhs,
                op=op,
                right=rhs,
            )
            return result_type(combined, merged_inputs)

        if isinstance(other, NumExprFactor):
            # Call the reflected operator on `other` with ourself as its
            # argument.
            return get_commuted_method(other)(self)

        if isinstance(other, Factor):
            if self is other:
                # Same term on both sides: bind it once and reuse x_0.
                return result_type("x_0 {op} x_0".format(op=op), (self,))
            return result_type("x_0 {op} x_1".format(op=op), (self, other))

        if isinstance(other, NUMERIC_TYPES):
            # Numeric constants are written straight into the expression.
            return result_type(
                "x_0 {op} ({constant})".format(op=op, constant=other),
                binds=(self,),
            )

        raise BadBinaryOperator(op, self, other)

    return binary_operator
def reflected_binary_operator(op):
    """
    Build the right-binding method implementing the binary operator `op` on
    a Factor (the kind of function used for methods like __radd__).

    Comparison operators are not accepted: the factory asserts that
    `is_comparison(op)` is false.  The generated method always constructs a
    NumExprFactor, and raises BadBinaryOperator when `self` is not a
    NumericalExpression and `other` is not an instance of NUMERIC_TYPES.
    """
    assert not is_comparison(op)

    def reflected_binary_operator(self, other):
        if isinstance(self, NumericalExpression):
            own_expr, their_expr, merged_inputs = self.build_binary_op(
                op, other
            )
            # In a reflected call `other` is the left-hand operand.
            combined = "({left}) {op} ({right})".format(
                left=their_expr,
                right=own_expr,
                op=op,
            )
            return NumExprFactor(combined, merged_inputs)

        # Numbers are the only case left to handle here: for every other
        # valid operand the corresponding left-binding method is called
        # instead.
        if isinstance(other, NUMERIC_TYPES):
            return NumExprFactor(
                "{constant} {op} x_0".format(op=op, constant=other),
                binds=(self,),
            )

        raise BadBinaryOperator(op, other, self)

    return reflected_binary_operator
def unary_operator(op):
    """
    Build the method implementing the unary operator `op` on Factors.

    Raises ValueError for any operator other than '-'.
    """
    # Negation is the only unary operator currently supported for all of our
    # possible input types.
    supported = {'-'}
    if op not in supported:
        raise ValueError("Invalid unary operator %s." % op)

    def unary_operator(self):
        # NumExprFactor is referenced at call time because it is not defined
        # yet when this factory runs.
        if not isinstance(self, NumericalExpression):
            # Plain factor: bind it as the only input, x_0.
            return NumExprFactor("{op}x_0".format(op=op), (self,))

        # Existing expression: wrap it and keep its inputs.
        wrapped = "{op}({expr})".format(op=op, expr=self._expr)
        return NumExprFactor(wrapped, self.inputs)

    return unary_operator
def function_application(func):
    """
    Build the method that applies the math function named `func` to a
    Factor, for use on Factor subclasses.

    Raises ValueError when `func` is not in NUMEXPR_MATH_FUNCS.
    """
    if func not in NUMEXPR_MATH_FUNCS:
        raise ValueError("Unsupported mathematical function '%s'" % func)

    def mathfunc(self):
        if not isinstance(self, NumericalExpression):
            # Plain factor: bind it as the only input, x_0.
            return NumExprFactor("{func}(x_0)".format(func=func), (self,))

        # Existing expression: wrap it in the call and keep its inputs.
        wrapped = "{func}({expr})".format(func=func, expr=self._expr)
        return NumExprFactor(wrapped, self.inputs)

    return mathfunc
class Factor(Term):
    """
    A transformation yielding a timeseries of scalar values associated with an
    Asset.

    The class body below generates the operator methods: one method per
    operator in MATH_BINOPS and per comparison other than '==', a reflected
    method per operator in MATH_BINOPS, `__neg__`, and one method per name in
    NUMEXPR_MATH_FUNCS.  Use `eq` to build an '==' expression.
    """
    dtype = float64

    # Dynamically add functions for creating NumExprFactor/NumExprFilter
    # instances.  The generated methods are written into the class namespace
    # through `locals()`.
    clsdict = locals()
    clsdict.update(
        {
            method_name_for_op(op): binary_operator(op)
            # Don't override __eq__ because it breaks comparisons on tuples of
            # Factors.
            for op in MATH_BINOPS.union(COMPARISONS - {'=='})
        }
    )
    # Reflected (right-binding) variants exist for math operators only.
    clsdict.update(
        {
            method_name_for_op(op, commute=True): reflected_binary_operator(op)
            for op in MATH_BINOPS
        }
    )
    # unary_operator only accepts '-', hence the fixed '__neg__' key.
    clsdict.update(
        {
            '__neg__': unary_operator(op)
            for op in UNARY_OPS
        }
    )
    # One method per supported math function, named after the function.
    clsdict.update(
        {
            funcname: function_application(funcname)
            for funcname in NUMEXPR_MATH_FUNCS
        }
    )

    # Alias the true-division hooks to the division methods stored above
    # (presumably generated for '/'; confirm against method_name_for_op).
    __truediv__ = clsdict['__div__']
    __rtruediv__ = clsdict['__rdiv__']

    # Method form of '==', provided because __eq__ is deliberately left
    # alone above.
    eq = binary_operator('==')

    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
        within each row.

        Parameters
        ----------
        method : str, {'ordinal', 'min', 'max', 'dense', 'average'}
            The method used to assign ranks to tied elements. See
            `scipy.stats.rankdata` for a full description of the semantics for
            each ranking method. Default is 'ordinal'.
        ascending : bool, optional
            Whether to return sorted rank in ascending or descending order.
            Default is True.  Descending ranks are produced by ranking
            `-self`.
        mask : optional
            Passed through unchanged to `Rank`.  Default is `NotSpecified`.

        Returns
        -------
        ranks : zipline.modelling.factor.Rank
            A new factor that will compute the sorted indices of the data
            produced by `self`.

        Notes
        -----
        The default value for `method` is different from the default for
        `scipy.stats.rankdata`.  See that function's documentation for a full
        description of the valid inputs to `method`.

        Missing or non-existent data on a given day will cause an asset to be
        given a rank of NaN for that day.

        See Also
        --------
        scipy.stats.rankdata
        zipline.lib.rank
        zipline.modelling.factor.Rank
        """
        return Rank(self if ascending else -self, method=method, mask=mask)

    def top(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the top N asset values of self each day.

        Implemented as `self.rank(ascending=False, mask=mask) <= N`.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : optional
            Passed through unchanged to `rank`.  Default is `NotSpecified`.

        Returns
        -------
        filter : zipline.modelling.filter.Filter
        """
        return self.rank(ascending=False, mask=mask) <= N

    def bottom(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the bottom N asset values of self each day.

        Implemented as `self.rank(ascending=True, mask=mask) <= N`.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : optional
            Passed through unchanged to `rank`.  Default is `NotSpecified`.

        Returns
        -------
        filter : zipline.modelling.filter.Filter
        """
        return self.rank(ascending=True, mask=mask) <= N

    def percentile_between(self,
                           min_percentile,
                           max_percentile,
                           mask=NotSpecified):
        """
        Construct a new Filter representing entries from the output of this
        Factor that fall within the percentile range defined by min_percentile
        and max_percentile.

        Parameters
        ----------
        min_percentile : float [0.0, 100.0]
        max_percentile : float [0.0, 100.0]
        mask : optional
            Passed through unchanged to `PercentileFilter`.  Default is
            `NotSpecified`.

        Returns
        -------
        out : zipline.modelling.filter.PercentileFilter
            A new filter that will compute the specified percentile-range mask.

        See Also
        --------
        zipline.modelling.filter.PercentileFilter
        """
        return PercentileFilter(
            self,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
            mask=mask,
        )
class NumExprFactor(NumericalExpression, Factor):
    """
    A Factor whose values come from evaluating a numexpr expression.

    Parameters
    ----------
    expr : string
        Expression text suitable for passing to numexpr.  Every variable in
        it should be named "x_i", with i the index of the matching factor
        input in `binds`.
    binds : tuple
        The factors used as inputs to the expression.

    Notes
    -----
    Instances are produced by numerical operators such as `+` and `-`
    applied to Factors; users should rarely need to construct a
    NumExprFactor directly.
    """
class Rank(SingleInputMixin, Factor):
    """
    A Factor representing the row-wise rank data of another Factor.

    Parameters
    ----------
    factor : zipline.modelling.factor.Factor
        The factor on which to compute ranks.
    method : str, {'average', 'min', 'max', 'dense', 'ordinal'}
        The method used to assign ranks to tied elements.  See
        `scipy.stats.rankdata` for a full description of the semantics for
        each ranking method.
    mask
        Passed through to the parent constructor as `mask`.
        NOTE(review): presumably selects which entries take part in the
        ranking; confirm against Term.

    See Also
    --------
    scipy.stats.rankdata : Underlying ranking algorithm.
    zipline.modelling.factor.Factor.rank : Method-style interface to the
        same functionality.

    Notes
    -----
    Most users should call Factor.rank rather than directly construct an
    instance of this class.
    """
    window_length = 0
    dtype = float64

    def __new__(cls, factor, method, mask):
        return super(Rank, cls).__new__(
            cls,
            inputs=(factor,),
            method=method,
            mask=mask,
        )

    def _init(self, method, *args, **kwargs):
        # Remember the ranking method, then let the parent classes consume
        # the remaining arguments.
        self._method = method
        return super(Rank, self)._init(*args, **kwargs)

    @classmethod
    def static_identity(cls, method, *args, **kwargs):
        # The ranking method is part of the identity, alongside whatever the
        # parent classes contribute.
        return (
            super(Rank, cls).static_identity(*args, **kwargs),
            method,
        )

    def _validate(self):
        """
        Verify that the stored rank method is valid.

        Raises
        ------
        UnknownRankMethod
            If the stored method is not one of `_RANK_METHODS`.
        """
        if self._method not in _RANK_METHODS:
            raise UnknownRankMethod(
                method=self._method,
                choices=set(_RANK_METHODS),
            )
        return super(Rank, self)._validate()

    def _compute(self, arrays, dates, assets, mask):
        """
        For each row in the input, compute a like-shaped array of per-row
        ranks.

        Entries excluded by `mask`, and entries that are already NaN in the
        input, receive a rank of NaN.  The result is always float64.
        """
        data = arrays[0].copy()
        data[~mask] = nan

        # After masking, every entry that must not be ranked is NaN: both
        # the masked-out cells and any NaN that was already in the input.
        missing = isnan(data)

        # OPTIMIZATION: Fast path the default case with our own specialized
        # Cython implementation.
        if self._method == 'ordinal':
            result = rankdata_2d_ordinal(data)
        else:
            # FUTURE OPTIMIZATION:
            # Write a less general "apply to rows" method that doesn't do all
            # the extra work that apply_along_axis does.
            # NOTE(review): newer SciPy releases changed how rankdata treats
            # NaN input (see its `nan_policy` parameter); verify the
            # behaviour against the installed version.
            result = apply_along_axis(rankdata, 1, data, method=self._method)

        # rankdata can return integer ranks for methods other than
        # 'average'.  NaN cannot be stored in an integer array, so make sure
        # we hold floats before writing NaN below.
        if result.dtype != float64:
            result = result.astype(float64)

        # The ranking routines sort NaN values into last place, but we want
        # our NaNs to propagate, so explicitly re-apply them.
        result[missing] = nan
        return result

    def __repr__(self):
        return "{type}({input_}, method='{method}', mask={mask})".format(
            type=type(self).__name__,
            input_=self.inputs[0],
            method=self._method,
            mask=self.mask,
        )
class CustomFactor(RequiredWindowLengthMixin, CustomTermMixin, Factor):
    """
    Base class for user-defined Factors operating on windows of raw data.

    TODO: This is basically the most important class to document in the whole
    FFC API...

    Only CustomFactors of type float64 are currently supported; validation
    rejects any other dtype.
    """
    ctx = nullctx()

    def _validate(self):
        """
        Check that this factor's dtype is float64, then run the parent
        validation.

        Raises
        ------
        UnsupportedDataType
            If `self.dtype` is not float64.
        """
        if self.dtype == float64:
            return super(CustomFactor, self)._validate()
        raise UnsupportedDataType(dtype=self.dtype)