Files
catalyst/zipline/modelling/filter.py
T
Scott Sanderson 26fd6fda8b ENH/BUG: Modeling API enhancements.
- Fixes an error where Modeling API data known as of the close of `day
  N` would be shown to algorithms during `before_trading_start` as of
  the close of the same day.  Algorithms should now only receive data
  during `before_trading_start/handle_data` that was known as of the
  simulation time at which the function would be called.

- All Term instances now have a `mask` attribute that must be a `Filter`
  or an instance of `AssetExists()`.  `mask` can be used to specify that
  a Factor should be computed in a manner that ignores the values that
  were not `True` in the mask.

- Changed the interface for `FFCLoader.load_adjusted_array` and
  `Term._compute` from `(columns, mask)`, with mask as a DataFrame, to
  `(columns, dates, assets, mask)`, where mask is a numpy array.  This
  is primarily to avoid having to reconstruct extra DataFrames when
  using masks produced by non-`AssetExists` filters.

- Adds `BoundColumn.latest`, which gives the most-recently-known value
  of a column.
2015-09-16 01:47:11 -04:00

191 lines
5.7 KiB
Python

"""
filter.py
"""
from numpy import (
bool_,
float64,
nan,
nanpercentile,
)
from itertools import chain
from operator import attrgetter
from zipline.errors import (
BadPercentileBounds,
)
from zipline.modelling.term import (
SingleInputMixin,
Term,
)
from zipline.modelling.expression import (
BadBinaryOperator,
FILTER_BINOPS,
method_name_for_op,
NumericalExpression,
)
def concat_tuples(*tuples):
    """
    Flatten any number of tuples into a single tuple.

    The elements of each argument are laid end to end, in argument order.
    Calling with no arguments yields the empty tuple.
    """
    flattened = chain.from_iterable(tuples)
    return tuple(flattened)
def binary_operator(op):
    """
    Build the method implementing binary operator `op` for Filter classes.

    The returned function is itself named "binary_operator" and is meant to
    be installed on a class under a name such as __and__ or __or__.
    """
    # Accessor for the commuted form of this operator's method.  It is used
    # when the right-hand operand is a NumericalExpression, so that the
    # expression's own operator implementation performs the combination.
    get_commuted_method = attrgetter(method_name_for_op(op, commute=True))

    def binary_operator(self, other):
        # Left operand is already an expression: let it build the combined
        # expression pieces and inputs.
        if isinstance(self, NumericalExpression):
            left, right, merged_inputs = self.build_binary_op(op, other)
            expr = "({left}) {op} ({right})".format(
                left=left,
                op=op,
                right=right,
            )
            return NumExprFilter(expr, merged_inputs)

        # NumericalExpression overrides numerical ops to correctly handle
        # merging of inputs, so hand ourselves to its right-binding operator.
        if isinstance(other, NumericalExpression):
            commuted_method = get_commuted_method(other)
            return commuted_method(self)

        if isinstance(other, Filter):
            # Combining a term with itself needs only a single input slot.
            if self is other:
                template, inputs = "x_0 {op} x_0", (self,)
            else:
                template, inputs = "x_0 {op} x_1", (self, other)
            return NumExprFilter(template.format(op=op), inputs)

        # bool is a subclass of int, so True/False are handled here too.
        if isinstance(other, int):
            expr = "x_0 {op} ({constant})".format(op=op, constant=int(other))
            return NumExprFilter(expr, binds=(self,))

        raise BadBinaryOperator(op, self, other)

    return binary_operator
class Filter(Term):
    """
    A Term that evaluates a boolean predicate over a universe of Assets.

    One method per operator in FILTER_BINOPS is installed on this class; each
    is produced by ``binary_operator`` and stored under the name given by
    ``method_name_for_op``.
    """
    dtype = bool_

    # Write the generated operator methods directly into the class namespace
    # while the class body is still executing.
    clsdict = locals()
    clsdict.update(
        (method_name_for_op(op), binary_operator(op))
        for op in FILTER_BINOPS
    )
class NumExprFilter(NumericalExpression, Filter):
    """
    A Filter whose values come from evaluating a numexpr expression.
    """

    def _compute(self, arrays, dates, assets, mask):
        """
        Evaluate the expression through the parent implementation, then
        combine the result with `mask` using `&`.
        """
        parent_compute = super(NumExprFilter, self)._compute
        unmasked = parent_compute(arrays, dates, assets, mask)
        return unmasked & mask
class PercentileFilter(SingleInputMixin, Filter):
    """
    A Filter representing assets falling between percentile bounds of a Factor.

    Parameters
    ----------
    factor : zipline.modelling.factor.Factor
        The factor over which to compute percentile bounds.
    min_percentile : float [0.0, 100.0]
        The minimum percentile rank of an asset that will pass the filter.
        Must be strictly less than `max_percentile`.
    max_percentile : float [0.0, 100.0]
        The maximum percentile rank of an asset that will pass the filter.
    mask
        Forwarded unchanged as the `mask` of the constructed term.
        NOTE(review): presumably a Filter or AssetExists instance -- confirm
        against Term.
    """
    window_length = 0

    def __new__(cls, factor, min_percentile, max_percentile, mask):
        """
        Construct the filter with `factor` as its only input.
        """
        return super(PercentileFilter, cls).__new__(
            cls,
            inputs=(factor,),
            mask=mask,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
        )

    def _init(self, min_percentile, max_percentile, *args, **kwargs):
        """
        Store the percentile bounds, then defer to the parent `_init` with the
        remaining arguments.
        """
        self._min_percentile = min_percentile
        self._max_percentile = max_percentile
        return super(PercentileFilter, self)._init(*args, **kwargs)

    @classmethod
    def static_identity(cls, min_percentile, max_percentile, *args, **kwargs):
        """
        Return the parent's static identity extended with both percentile
        bounds, as a 3-tuple.
        """
        return (
            super(PercentileFilter, cls).static_identity(*args, **kwargs),
            min_percentile,
            max_percentile,
        )

    def _validate(self):
        """
        Ensure that our percentile bounds are well-formed.

        Raises
        ------
        BadPercentileBounds
            Unless 0.0 <= min_percentile < max_percentile <= 100.0.
        """
        if not 0.0 <= self._min_percentile < self._max_percentile <= 100.0:
            raise BadPercentileBounds(
                min_percentile=self._min_percentile,
                max_percentile=self._max_percentile,
            )
        return super(PercentileFilter, self)._validate()

    def _compute(self, arrays, dates, assets, mask):
        """
        For each row in the input, compute a mask of all values falling between
        the given percentiles.

        Entries where `mask` is False are replaced with NaN before the
        percentiles are taken, so they do not influence the bounds and are
        False in the result.
        """
        # TODO: Review whether there's a better way of handling small numbers
        # of columns.

        # astype() returns a newly-allocated array by default, so the caller's
        # array is never modified and no separate copy() is required.
        data = arrays[0].astype(float64)
        data[~mask] = nan

        # FIXME: np.nanpercentile **should** support computing multiple bounds
        # at once, but there's a bug in the logic for multiple bounds in numpy
        # 1.9.2.  It will be fixed in 1.10.
        # c.f. https://github.com/numpy/numpy/pull/5981
        lower_bounds = nanpercentile(
            data,
            self._min_percentile,
            axis=1,
            keepdims=True,
        )
        upper_bounds = nanpercentile(
            data,
            self._max_percentile,
            axis=1,
            keepdims=True,
        )
        # Comparisons against NaN are False, which drops masked-out entries.
        return (lower_bounds <= data) & (data <= upper_bounds)