TST: Add tests for winsorize factor

This commit is contained in:
Ana Ruelas
2017-03-06 14:08:51 -05:00
parent 309ec73faa
commit b4e97bc9d8
4 changed files with 236 additions and 58 deletions
+169 -10
View File
@@ -24,7 +24,7 @@ from numpy.random import randn, seed
import pandas as pd
from scipy.stats.mstats import winsorize as scipy_winsorize
from zipline.errors import UnknownRankMethod
from zipline.errors import BadPercentileBounds, UnknownRankMethod
from zipline.lib.labelarray import LabelArray
from zipline.lib.rank import masked_rankdata_2d
from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
@@ -710,12 +710,157 @@ class FactorTestCase(BasePipelineTestCase):
check=partial(check_allclose, atol=0.001),
)
def test_winsorize_hand_computed(self):
    """
    Check ``Factor.winsorize`` against results worked out by hand.

    Exercises plain, masked, grouped, and grouped-and-masked terms, with
    the groups supplied by both an integer and a string classifier.
    """
    factor = self.f
    row_filter = Mask()
    int_groups = C()
    str_groups = C(dtype=categorical_dtype, missing_value=None)

    factor_data = array([
        [1., 2., 3., 4., 5., 6.],
        [1., 8., 27., 64., 125., 216.],
        [6., 5., 4., 3., 2., 1.]
    ])
    # Each row excludes exactly one asset, moving along the diagonal.
    filter_data = array(
        [[False, True, True, True, True, True],
         [True, False, True, True, True, True],
         [True, True, False, True, True, True]],
        dtype=bool,
    )
    # On every row the first three assets are group 1, the rest group 2.
    int_group_data = array(
        [[1, 1, 1, 2, 2, 2],
         [1, 1, 1, 2, 2, 2],
         [1, 1, 1, 2, 2, 2]],
        dtype=int64_dtype,
    )
    str_group_data = LabelArray(
        int_group_data.astype(str).astype(object),
        missing_value=None,
    )

    make_term = factor.winsorize
    # Percentile bounds shared by every grouped term.
    grouped_bounds = {'min_percentile': 0.34, 'max_percentile': 0.66}

    terms = {
        'winsor_1': make_term(min_percentile=0.33, max_percentile=0.67),
        'winsor_2': make_term(min_percentile=0.49, max_percentile=1),
        'winsor_3': make_term(min_percentile=0, max_percentile=.67),
        'masked': make_term(
            min_percentile=0.33,
            max_percentile=0.67,
            mask=row_filter,
        ),
        'grouped': make_term(groupby=int_groups, **grouped_bounds),
        'grouped_str': make_term(groupby=str_groups, **grouped_bounds),
        'grouped_masked': make_term(
            mask=row_filter,
            groupby=int_groups,
            **grouped_bounds
        ),
        'grouped_masked_str': make_term(
            mask=row_filter,
            groupby=str_groups,
            **grouped_bounds
        ),
    }

    grouped_expected = array([
        [2., 2., 2., 5., 5., 5.],
        [8., 8., 8., 125., 125., 125.],
        [5., 5., 5., 2., 2., 2.]
    ])
    grouped_masked_expected = array([
        [nan, 2., 3., 5., 5., 5.],
        [1.0, nan, 27., 125., 125., 125.],
        [6.0, 5., nan, 2., 2., 2.]
    ])
    expected = {
        'winsor_1': array([
            [2., 2., 3., 4., 5., 5.],
            [8., 8., 27., 64., 125., 125.],
            [5., 5., 4., 3., 2., 2.]
        ]),
        'winsor_2': array([
            [3.0, 3., 3., 4., 5., 6.],
            [27., 27., 27., 64., 125., 216.],
            [6.0, 5., 4., 3., 3., 3.]
        ]),
        'winsor_3': array([
            [1., 2., 3., 4., 5., 5.],
            [1., 8., 27., 64., 125., 125.],
            [5., 5., 4., 3., 2., 1.]
        ]),
        'masked': array([
            [nan, 3., 3., 4., 5., 5.],
            [27., nan, 27., 64., 125., 125.],
            [5.0, 5., nan, 3., 2., 2.]
        ]),
        'grouped': grouped_expected,
        'grouped_masked': grouped_masked_expected,
        # Changing the classifier dtype shouldn't affect anything.
        'grouped_str': grouped_expected,
        'grouped_masked_str': grouped_masked_expected,
    }

    self.check_terms(
        terms,
        expected,
        initial_workspace={
            factor: factor_data,
            int_groups: int_group_data,
            str_groups: str_group_data,
            row_filter: filter_data,
        },
        mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        check=partial(check_allclose, atol=0.001),
    )
def test_winsorize_bad_bounds(self):
    """
    Every invalid percentile pair must raise ``BadPercentileBounds``.
    """
    factor = self.f
    invalid_bounds = (
        (-.1, 1),   # minimum below 0
        (0, 95),    # maximum above 1
        (5, 95),    # both above 1
        (5, 5),     # both above 1, and minimum equal to maximum
        (.6, .4),   # minimum greater than maximum
    )
    for lower, upper in invalid_bounds:
        with self.assertRaises(BadPercentileBounds):
            factor.winsorize(min_percentile=lower, max_percentile=upper)
@parameter_space(
seed_value=range(1, 2),
normalizer_name_and_func=[
('demean', lambda row: row - nanmean(row)),
('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
('winsorize', lambda row: scipy_winsorize(row, limits=0.05)),
('demean', {}, lambda row: row - nanmean(row)),
('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
(
'winsorize',
{"min_percentile": 0.25, "max_percentile": 0.75},
lambda row: scipy_winsorize(
row,
limits=0.25,
)
),
],
add_nulls_to_factor=(False, True,),
)
@@ -724,9 +869,9 @@ class FactorTestCase(BasePipelineTestCase):
normalizer_name_and_func,
add_nulls_to_factor):
name, func = normalizer_name_and_func
name, kwargs, func = normalizer_name_and_func
shape = (7, 7)
shape = (20, 20)
# All Trues.
nomask = self.ones_mask(shape=shape)
@@ -757,7 +902,7 @@ class FactorTestCase(BasePipelineTestCase):
c = C()
c_with_nulls = OtherC()
m = Mask()
method = getattr(f, name)
method = partial(getattr(f, name), **kwargs)
terms = {
'vanilla': method(),
'masked': method(mask=m),
@@ -1054,7 +1199,7 @@ class ShortReprTestCase(TestCase):
self.assertEqual(r, "GroupedRowTransform('zscore')")
def test_winsorize(self):
    """
    ``short_repr`` of a winsorized factor should name the transform.
    """
    # winsorize() takes its percentile bounds as required arguments, so
    # they are passed explicitly; a bare ``winsorize()`` call would fail
    # before the repr is ever built.
    r = F().winsorize(min_percentile=.05, max_percentile=.95).short_repr()
    self.assertEqual(r, "GroupedRowTransform('winsorize')")
@@ -1068,8 +1213,22 @@ class TestWindowSafety(TestCase):
self.assertFalse(F(window_safe=False).demean().window_safe)
self.assertTrue(F(window_safe=True).demean().window_safe)
def test_winsorize_is_window_safe(self):
    """
    Winsorizing a window-safe factor yields a window-safe term.
    """
    # The percentile bounds are required arguments, and a default ``F()``
    # is not window safe, so the input is built with ``window_safe=True``.
    self.assertTrue(
        F(window_safe=True).winsorize(
            min_percentile=.05,
            max_percentile=.95
        ).window_safe
    )
def test_winsorize_is_window_safe_if_input_is_window_safe(self):
    """
    A winsorized factor is window safe only when its input factor is.
    """
    bounds = {'min_percentile': .05, 'max_percentile': .95}

    # Default and explicitly unsafe inputs both give an unsafe result.
    self.assertFalse(F().winsorize(**bounds).window_safe)
    self.assertFalse(F(window_safe=False).winsorize(**bounds).window_safe)

    # A window-safe input carries its safety through the transform.
    self.assertTrue(F(window_safe=True).winsorize(**bounds).window_safe)
class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
+2 -2
View File
@@ -570,8 +570,8 @@ class BadPercentileBounds(ZiplineError):
are invalid.
"""
msg = (
"Percentile bounds must fall between 0.0 and 100.0, and min must be "
"less than max."
"Percentile bounds must fall between 0.0 and {upper_bound}, and min "
"must be less than max."
"\nInputs were min={min_percentile}, max={max_percentile}."
)
+64 -46
View File
@@ -4,12 +4,12 @@ factor.py
from functools import wraps
from operator import attrgetter
from numbers import Number
from math import ceil
from numpy import empty_like, inf, nan, where
from scipy.stats import rankdata
from scipy.stats.mstats import winsorize as scipy_winsorize
from zipline.errors import UnknownRankMethod
from zipline.errors import BadPercentileBounds, UnknownRankMethod
from zipline.lib.normalize import naive_grouped_rowwise_apply
from zipline.lib.rank import masked_rankdata_2d, rankdata_1d_descending
from zipline.pipeline.api_utils import restrict_to_dtype
@@ -833,19 +833,25 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
regression_length=regression_length,
mask=mask,
)
@expect_types(
min_percentile=(int, float),
max_percentile=(int, float),
mask=(Filter, NotSpecifiedType),
groupby=(Classifier, NotSpecifiedType),
)
@float64_only
def winsorize(self,
limits,
inclusive=(True, True),
min_percentile,
max_percentile,
mask=NotSpecified,
groupby=NotSpecified):
"""
Construct a Factor returns a winsorized row for results. Winsorizing
clips the input values to fixed percentiles. The (limits[0])th lowest
values are set to the value at the (limits[0])th percentile. The values
above the (limits[1])th percentiles are set to the value at the
(limits[1])th percentile. This is useful when limiting the impact of
extreme values.
Construct a new Factor that winsorizes each row of this Factor's
results. Winsorizing changes values ranked below the minimum percentile
to the value at the minimum percentile. Similarly, values ranked above
the maximum percentile are changed to the value at the maximum
percentile. This is useful when limiting the impact of extreme values.
If ``mask`` is supplied, ignore values where ``mask`` returns False
when computing the percentile cutoffs, and output NaN
@@ -857,14 +863,14 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
Parameters
----------
limits : None, tuple of float, optional
A tuple of two values between 0 and 100 inclusive. This is the
percentage to cut from each tail of the array. A value of None
can be used to indicate an open limit.
inclusive : a tuple of bool, optional
A bool indicating whether the data on each side should be
rounded(True) or truncated(False). A value of None can be used if
one side is not being winsorized. Default is (False, False).
min_percentile: float, int
Entries with values at or below this percentile will be replaced
with the (len(inp) * min_percentile)th lowest value. If low values
should not be clipped, use 0.
max_percentile: float, int
Entries with values at or above this percentile will be replaced
with the (len(inp) * max_percentile)th lowest value. If high
values should not be clipped, use 1.
mask : zipline.pipeline.Filter, optional
A Filter defining values to ignore when winsorizing.
groupby : zipline.pipeline.Classifier, optional
@@ -881,34 +887,43 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
price = USEquityPricing.close.latest
columns={
'PRICE': price,
'WINSOR_1: price.winsorize(limits=25),
'WINSOR_2': price.winsorize(limits=(50, None)),
'WINSOR_3': price.winsorize(
limits=25, inclusive=(False, False)
'WINSOR_1': price.winsorize(
min_percentile=0.25, max_percentile=0.75
),
'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)),
'WINSOR_5': price.winsorize(limits=(20, 40)),
'WINSOR_2': price.winsorize(
min_percentile=0.50, max_percentile=1.0
),
'WINSOR_3': price.winsorize(
min_percentile=0.0, max_percentile=0.5
),
}
Given a pipeline with columns, defined above, the result for a
given day could look like:
'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5'
Asset_1 1 2 4 3 2 2
Asset_2 2 2 4 3 2 2
Asset_3 3 3 4 3 3 2
Asset_4 4 4 4 4 4 4
Asset_5 5 5 5 4 4 4
Asset_6 6 5 5 4 4 4
'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3'
Asset_1 1 2 4 1
Asset_2 2 2 4 2
Asset_3 3 3 4 3
Asset_4 4 4 4 3
Asset_5 5 5 5 3
Asset_6 6 5 6 3
See Also
--------
:func:`scipy.stats.mstats.winsorize`
:meth:`pandas.DataFrame.groupby`
"""
if not 0.0 <= min_percentile < max_percentile <= 1.0:
raise BadPercentileBounds(
min_percentile=min_percentile,
max_percentile=max_percentile,
upper_bound=1.0,
)
return GroupedRowTransform(
transform=winsorize,
transform_args=(limits, inclusive),
transform_args=(min_percentile, max_percentile),
factor=self,
groupby=groupby,
dtype=self.dtype,
@@ -1616,18 +1631,21 @@ def zscore(row):
return (row - nanmean(row)) / nanstd(row)
def winsorize(row, min_percentile, max_percentile):
    """
    Clip both tails of ``row`` to the values found at the given percentiles.

    This implementation is based on scipy.stats.mstats.winsorize

    Parameters
    ----------
    row : np.ndarray
        Values to winsorize; indexed as a 1D array. The input is copied and
        never modified.
    min_percentile : float or int
        Fraction of the row to clip from the low end. The
        ``int(min_percentile * row.size)`` smallest entries are replaced
        with the next-smallest value. Values <= 0 leave the low tail alone.
    max_percentile : float or int
        Fraction of the row below the high cutoff. Entries ranked at or
        after position ``ceil(row.size * max_percentile)`` are replaced
        with the value just before that position. Values >= 1 leave the
        high tail alone.

    Returns
    -------
    winsorized : np.ndarray
        A copy of ``row`` with the tails clipped.
    """
    a = row.copy()
    num = a.size
    # Nothing to rank in an empty row, and indexing a cutoff would fail.
    if num == 0:
        return a

    # NOTE(review): NaNs get no special treatment here -- argsort places
    # them last and they count toward ``num``. Confirm callers mask them
    # out if that is not the intended behavior.
    idx = a.argsort()

    if min_percentile > 0:
        lowidx = int(min_percentile * num)
        a[idx[:lowidx]] = a[idx[lowidx]]

    if max_percentile < 1:
        # math.ceil returns a float on Python 2; a slice bound must be an
        # integer, so cast explicitly.
        upidx = int(ceil(num * max_percentile))
        # upidx could return as the length of the array, in this case
        # no modification to the right tail is necessary.
        if upidx < num:
            a[idx[upidx:]] = a[idx[upidx - 1]]

    return a
+1
View File
@@ -334,6 +334,7 @@ class PercentileFilter(SingleInputMixin, Filter):
raise BadPercentileBounds(
min_percentile=self._min_percentile,
max_percentile=self._max_percentile,
upper_bound=100.0
)
return super(PercentileFilter, self)._validate()