mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-28 07:48:45 +08:00
TST: Add tests for winsorize factor
This commit is contained in:
+169
-10
@@ -24,7 +24,7 @@ from numpy.random import randn, seed
|
||||
import pandas as pd
|
||||
from scipy.stats.mstats import winsorize as scipy_winsorize
|
||||
|
||||
from zipline.errors import UnknownRankMethod
|
||||
from zipline.errors import BadPercentileBounds, UnknownRankMethod
|
||||
from zipline.lib.labelarray import LabelArray
|
||||
from zipline.lib.rank import masked_rankdata_2d
|
||||
from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
|
||||
@@ -710,12 +710,157 @@ class FactorTestCase(BasePipelineTestCase):
|
||||
check=partial(check_allclose, atol=0.001),
|
||||
)
|
||||
|
||||
def test_winsorize_hand_computed(self):
|
||||
"""
|
||||
Test the hand-computed example in factor.winsorize.
|
||||
"""
|
||||
f = self.f
|
||||
m = Mask()
|
||||
c = C()
|
||||
str_c = C(dtype=categorical_dtype, missing_value=None)
|
||||
|
||||
factor_data = array([
|
||||
[1., 2., 3., 4., 5., 6.],
|
||||
[1., 8., 27., 64., 125., 216.],
|
||||
[6., 5., 4., 3., 2., 1.]
|
||||
])
|
||||
filter_data = array(
|
||||
[[False, True, True, True, True, True],
|
||||
[True, False, True, True, True, True],
|
||||
[True, True, False, True, True, True]],
|
||||
dtype=bool,
|
||||
)
|
||||
classifier_data = array(
|
||||
[[1, 1, 1, 2, 2, 2],
|
||||
[1, 1, 1, 2, 2, 2],
|
||||
[1, 1, 1, 2, 2, 2]],
|
||||
dtype=int64_dtype,
|
||||
)
|
||||
string_classifier_data = LabelArray(
|
||||
classifier_data.astype(str).astype(object),
|
||||
missing_value=None,
|
||||
)
|
||||
|
||||
terms = {
|
||||
'winsor_1': f.winsorize(
|
||||
min_percentile=0.33,
|
||||
max_percentile=0.67
|
||||
),
|
||||
'winsor_2': f.winsorize(
|
||||
min_percentile=0.49,
|
||||
max_percentile=1
|
||||
),
|
||||
'winsor_3': f.winsorize(
|
||||
min_percentile=0,
|
||||
max_percentile=.67
|
||||
),
|
||||
'masked': f.winsorize(
|
||||
min_percentile=0.33,
|
||||
max_percentile=0.67,
|
||||
mask=m
|
||||
),
|
||||
'grouped': f.winsorize(
|
||||
min_percentile=0.34,
|
||||
max_percentile=0.66,
|
||||
groupby=c
|
||||
),
|
||||
'grouped_str': f.winsorize(
|
||||
min_percentile=0.34,
|
||||
max_percentile=0.66,
|
||||
groupby=str_c
|
||||
),
|
||||
'grouped_masked': f.winsorize(
|
||||
min_percentile=0.34,
|
||||
max_percentile=0.66,
|
||||
mask=m,
|
||||
groupby=c
|
||||
),
|
||||
'grouped_masked_str': f.winsorize(
|
||||
min_percentile=0.34,
|
||||
max_percentile=0.66,
|
||||
mask=m,
|
||||
groupby=str_c
|
||||
),
|
||||
}
|
||||
expected = {
|
||||
'winsor_1': array([
|
||||
[2., 2., 3., 4., 5., 5.],
|
||||
[8., 8., 27., 64., 125., 125.],
|
||||
[5., 5., 4., 3., 2., 2.]
|
||||
]),
|
||||
'winsor_2': array([
|
||||
[3.0, 3., 3., 4., 5., 6.],
|
||||
[27., 27., 27., 64., 125., 216.],
|
||||
[6.0, 5., 4., 3., 3., 3.]
|
||||
]),
|
||||
'winsor_3': array([
|
||||
[1., 2., 3., 4., 5., 5.],
|
||||
[1., 8., 27., 64., 125., 125.],
|
||||
[5., 5., 4., 3., 2., 1.]
|
||||
]),
|
||||
'masked': array([
|
||||
[nan, 3., 3., 4., 5., 5.],
|
||||
[27., nan, 27., 64., 125., 125.],
|
||||
[5.0, 5., nan, 3., 2., 2.]
|
||||
]),
|
||||
'grouped': array([
|
||||
[2., 2., 2., 5., 5., 5.],
|
||||
[8., 8., 8., 125., 125., 125.],
|
||||
[5., 5., 5., 2., 2., 2.]
|
||||
]),
|
||||
'grouped_masked': array([
|
||||
[nan, 2., 3., 5., 5., 5.],
|
||||
[1.0, nan, 27., 125., 125., 125.],
|
||||
[6.0, 5., nan, 2., 2., 2.]
|
||||
]),
|
||||
}
|
||||
# Changing the classifier dtype shouldn't affect anything.
|
||||
expected['grouped_str'] = expected['grouped']
|
||||
expected['grouped_masked_str'] = expected['grouped_masked']
|
||||
|
||||
self.check_terms(
|
||||
terms,
|
||||
expected,
|
||||
initial_workspace={
|
||||
f: factor_data,
|
||||
c: classifier_data,
|
||||
str_c: string_classifier_data,
|
||||
m: filter_data,
|
||||
},
|
||||
mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
|
||||
check=partial(check_allclose, atol=0.001),
|
||||
)
|
||||
|
||||
def test_winsorize_bad_bounds(self):
|
||||
"""
|
||||
Test out of bounds input for factor.winsorize.
|
||||
"""
|
||||
f = self.f
|
||||
|
||||
bad_percentiles = [
|
||||
(-.1, 1),
|
||||
(0, 95),
|
||||
(5, 95),
|
||||
(5, 5),
|
||||
(.6, .4)
|
||||
]
|
||||
for min_, max_ in bad_percentiles:
|
||||
with self.assertRaises(BadPercentileBounds):
|
||||
f.winsorize(min_percentile=min_, max_percentile=max_)
|
||||
|
||||
@parameter_space(
|
||||
seed_value=range(1, 2),
|
||||
normalizer_name_and_func=[
|
||||
('demean', lambda row: row - nanmean(row)),
|
||||
('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
|
||||
('winsorize', lambda row: scipy_winsorize(row, limits=0.05)),
|
||||
('demean', {}, lambda row: row - nanmean(row)),
|
||||
('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
|
||||
(
|
||||
'winsorize',
|
||||
{"min_percentile": 0.25, "max_percentile": 0.75},
|
||||
lambda row: scipy_winsorize(
|
||||
row,
|
||||
limits=0.25,
|
||||
)
|
||||
),
|
||||
],
|
||||
add_nulls_to_factor=(False, True,),
|
||||
)
|
||||
@@ -724,9 +869,9 @@ class FactorTestCase(BasePipelineTestCase):
|
||||
normalizer_name_and_func,
|
||||
add_nulls_to_factor):
|
||||
|
||||
name, func = normalizer_name_and_func
|
||||
name, kwargs, func = normalizer_name_and_func
|
||||
|
||||
shape = (7, 7)
|
||||
shape = (20, 20)
|
||||
|
||||
# All Trues.
|
||||
nomask = self.ones_mask(shape=shape)
|
||||
@@ -757,7 +902,7 @@ class FactorTestCase(BasePipelineTestCase):
|
||||
c = C()
|
||||
c_with_nulls = OtherC()
|
||||
m = Mask()
|
||||
method = getattr(f, name)
|
||||
method = partial(getattr(f, name), **kwargs)
|
||||
terms = {
|
||||
'vanilla': method(),
|
||||
'masked': method(mask=m),
|
||||
@@ -1054,7 +1199,7 @@ class ShortReprTestCase(TestCase):
|
||||
self.assertEqual(r, "GroupedRowTransform('zscore')")
|
||||
|
||||
def test_winsorize(self):
|
||||
r = F().winsorize().short_repr()
|
||||
r = F().winsorize(min_percentile=.05, max_percentile=.95).short_repr()
|
||||
self.assertEqual(r, "GroupedRowTransform('winsorize')")
|
||||
|
||||
|
||||
@@ -1068,8 +1213,22 @@ class TestWindowSafety(TestCase):
|
||||
self.assertFalse(F(window_safe=False).demean().window_safe)
|
||||
self.assertTrue(F(window_safe=True).demean().window_safe)
|
||||
|
||||
def test_winsorize_is_window_safe(self):
|
||||
self.assertTrue(F().winsorize().window_safe)
|
||||
def test_winsorize_is_window_safe_if_input_is_window_safe(self):
|
||||
self.assertFalse(
|
||||
F().winsorize(min_percentile=.05, max_percentile=.95).window_safe
|
||||
)
|
||||
self.assertFalse(
|
||||
F(window_safe=False).winsorize(
|
||||
min_percentile=.05,
|
||||
max_percentile=.95
|
||||
).window_safe
|
||||
)
|
||||
self.assertTrue(
|
||||
F(window_safe=True).winsorize(
|
||||
min_percentile=.05,
|
||||
max_percentile=.95
|
||||
).window_safe
|
||||
)
|
||||
|
||||
|
||||
class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
|
||||
|
||||
+2
-2
@@ -570,8 +570,8 @@ class BadPercentileBounds(ZiplineError):
|
||||
are invalid.
|
||||
"""
|
||||
msg = (
|
||||
"Percentile bounds must fall between 0.0 and 100.0, and min must be "
|
||||
"less than max."
|
||||
"Percentile bounds must fall between 0.0 and {upper_bound}, and min "
|
||||
"must be less than max."
|
||||
"\nInputs were min={min_percentile}, max={max_percentile}."
|
||||
)
|
||||
|
||||
|
||||
@@ -4,12 +4,12 @@ factor.py
|
||||
from functools import wraps
|
||||
from operator import attrgetter
|
||||
from numbers import Number
|
||||
from math import ceil
|
||||
|
||||
from numpy import empty_like, inf, nan, where
|
||||
from scipy.stats import rankdata
|
||||
from scipy.stats.mstats import winsorize as scipy_winsorize
|
||||
|
||||
from zipline.errors import UnknownRankMethod
|
||||
from zipline.errors import BadPercentileBounds, UnknownRankMethod
|
||||
from zipline.lib.normalize import naive_grouped_rowwise_apply
|
||||
from zipline.lib.rank import masked_rankdata_2d, rankdata_1d_descending
|
||||
from zipline.pipeline.api_utils import restrict_to_dtype
|
||||
@@ -833,19 +833,25 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
|
||||
regression_length=regression_length,
|
||||
mask=mask,
|
||||
)
|
||||
|
||||
@expect_types(
|
||||
min_percentile=(int, float),
|
||||
max_percentile=(int, float),
|
||||
mask=(Filter, NotSpecifiedType),
|
||||
groupby=(Classifier, NotSpecifiedType),
|
||||
)
|
||||
@float64_only
|
||||
def winsorize(self,
|
||||
limits,
|
||||
inclusive=(True, True),
|
||||
min_percentile,
|
||||
max_percentile,
|
||||
mask=NotSpecified,
|
||||
groupby=NotSpecified):
|
||||
"""
|
||||
Construct a Factor returns a winsorized row for results. Winsorizing
|
||||
clips the input values to fixed percentiles. The (limits[0])th lowest
|
||||
values are set to the value at the (limits[0])th percentile. The values
|
||||
above the (limits[1])th percentiles are set to the value at the
|
||||
(limits[1])th percentile. This is useful when limiting the impact of
|
||||
extreme values.
|
||||
Construct a Factor that returns a winsorized row. Winsorizing changes values
|
||||
ranked less than the minimum percentile to the value at the minimum
|
||||
percentile. Similarly, values ranking above the maximum percentile will
|
||||
be changed to the value at the maximum percentile. This is useful
|
||||
when limiting the impact of extreme values.
|
||||
|
||||
If ``mask`` is supplied, ignore values where ``mask`` returns False
|
||||
when computing percentile cutoffs, and output NaN
|
||||
@@ -857,14 +863,14 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
limits : None, tuple of float, optional
|
||||
A tuple of two values between 0 and 100 inclusive. This is the
|
||||
percentage to cut from each tail of the array. A value of None
|
||||
can be used to indicate an open limit.
|
||||
inclusive : a tuple of bool, optional
|
||||
A bool indicating whether the data on each side should be
|
||||
rounded(True) or truncated(False). A value of None can be used if
|
||||
one side is not being winsorized. Default is (False, False).
|
||||
min_percentile: float, int
|
||||
Entries with values at or below this percentile will be replaced
|
||||
with the (len(inp) * min_percentile)th lowest value. If low values
|
||||
should not be clipped, use 0.
|
||||
max_percentile: float, int
|
||||
Entries with values at or above this percentile will be replaced
|
||||
with the (len(inp) * max_percentile)th lowest value. If high
|
||||
values should not be clipped, use 1.
|
||||
mask : zipline.pipeline.Filter, optional
|
||||
A Filter defining values to ignore when winsorizing.
|
||||
groupby : zipline.pipeline.Classifier, optional
|
||||
@@ -881,34 +887,43 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
|
||||
price = USEquityPricing.close.latest
|
||||
columns={
|
||||
'PRICE': price,
|
||||
'WINSOR_1: price.winsorize(limits=25),
|
||||
'WINSOR_2': price.winsorize(limits=(50, None)),
|
||||
'WINSOR_3': price.winsorize(
|
||||
limits=25, inclusive=(False, False)
|
||||
'WINSOR_1': price.winsorize(
|
||||
min_percentile=0.25, max_percentile=0.75
|
||||
),
|
||||
'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)),
|
||||
'WINSOR_5': price.winsorize(limits=(20, 40)),
|
||||
'WINSOR_2': price.winsorize(
|
||||
min_percentile=0.50, max_percentile=1.0
|
||||
),
|
||||
'WINSOR_3': price.winsorize(
|
||||
min_percentile=0.0, max_percentile=0.5
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
Given a pipeline with columns, defined above, the result for a
|
||||
given day could look like:
|
||||
|
||||
'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5'
|
||||
Asset_1 1 2 4 3 2 2
|
||||
Asset_2 2 2 4 3 2 2
|
||||
Asset_3 3 3 4 3 3 2
|
||||
Asset_4 4 4 4 4 4 4
|
||||
Asset_5 5 5 5 4 4 4
|
||||
Asset_6 6 5 5 4 4 4
|
||||
'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3'
|
||||
Asset_1 1 2 4 1
|
||||
Asset_2 2 2 4 2
|
||||
Asset_3 3 3 4 3
|
||||
Asset_4 4 4 4 3
|
||||
Asset_5 5 5 5 3
|
||||
Asset_6 6 5 6 3
|
||||
|
||||
See Also
|
||||
--------
|
||||
:func:`scipy.stats.mstats.winsorize`
|
||||
:meth:`pandas.DataFrame.groupby`
|
||||
"""
|
||||
if not 0.0 <= min_percentile < max_percentile <= 1.0:
|
||||
raise BadPercentileBounds(
|
||||
min_percentile=min_percentile,
|
||||
max_percentile=max_percentile,
|
||||
upper_bound=1.0,
|
||||
)
|
||||
return GroupedRowTransform(
|
||||
transform=winsorize,
|
||||
transform_args=(limits, inclusive),
|
||||
transform_args=(min_percentile, max_percentile),
|
||||
factor=self,
|
||||
groupby=groupby,
|
||||
dtype=self.dtype,
|
||||
@@ -1616,18 +1631,21 @@ def zscore(row):
|
||||
return (row - nanmean(row)) / nanstd(row)
|
||||
|
||||
|
||||
def winsorize(row, limits, inclusive):
|
||||
if isinstance(limits, int) or isinstance(limits, float):
|
||||
limits = limits / 100.
|
||||
if isinstance(limits, tuple):
|
||||
if limits[0] is not None:
|
||||
limit_0 = limits[0] / 100.
|
||||
else:
|
||||
limit_0 = None
|
||||
if limits[1] is not None:
|
||||
limit_1 = limits[1] / 100
|
||||
else:
|
||||
limit_1 = None
|
||||
limits = (limit_0, limit_1)
|
||||
def winsorize(row, min_percentile, max_percentile):
|
||||
"""
|
||||
This implementation is based on scipy.stats.mstats.winsorize
|
||||
"""
|
||||
a = row.copy()
|
||||
num = a.size
|
||||
idx = a.argsort()
|
||||
if min_percentile > 0:
|
||||
lowidx = int(min_percentile * num)
|
||||
a[idx[:lowidx]] = a[idx[lowidx]]
|
||||
if max_percentile < 1:
|
||||
upidx = ceil(num * max_percentile)
|
||||
# upidx could return as the length of the array, in this case
|
||||
# no modification to the right tail is necessary.
|
||||
if upidx < num:
|
||||
a[idx[upidx:]] = a[idx[upidx - 1]]
|
||||
|
||||
return scipy_winsorize(row, limits=limits, inclusive=inclusive)
|
||||
return a
|
||||
|
||||
@@ -334,6 +334,7 @@ class PercentileFilter(SingleInputMixin, Filter):
|
||||
raise BadPercentileBounds(
|
||||
min_percentile=self._min_percentile,
|
||||
max_percentile=self._max_percentile,
|
||||
upper_bound=100.0
|
||||
)
|
||||
return super(PercentileFilter, self)._validate()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user