diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py index 9d5fb04a..46245222 100644 --- a/tests/pipeline/test_factor.py +++ b/tests/pipeline/test_factor.py @@ -24,7 +24,7 @@ from numpy.random import randn, seed import pandas as pd from scipy.stats.mstats import winsorize as scipy_winsorize -from zipline.errors import UnknownRankMethod +from zipline.errors import BadPercentileBounds, UnknownRankMethod from zipline.lib.labelarray import LabelArray from zipline.lib.rank import masked_rankdata_2d from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply @@ -710,12 +710,157 @@ class FactorTestCase(BasePipelineTestCase): check=partial(check_allclose, atol=0.001), ) + def test_winsorize_hand_computed(self): + """ + Test the hand-computed example in factor.winsorize. + """ + f = self.f + m = Mask() + c = C() + str_c = C(dtype=categorical_dtype, missing_value=None) + + factor_data = array([ + [1., 2., 3., 4., 5., 6.], + [1., 8., 27., 64., 125., 216.], + [6., 5., 4., 3., 2., 1.] 
+ ]) + filter_data = array( + [[False, True, True, True, True, True], + [True, False, True, True, True, True], + [True, True, False, True, True, True]], + dtype=bool, + ) + classifier_data = array( + [[1, 1, 1, 2, 2, 2], + [1, 1, 1, 2, 2, 2], + [1, 1, 1, 2, 2, 2]], + dtype=int64_dtype, + ) + string_classifier_data = LabelArray( + classifier_data.astype(str).astype(object), + missing_value=None, + ) + + terms = { + 'winsor_1': f.winsorize( + min_percentile=0.33, + max_percentile=0.67 + ), + 'winsor_2': f.winsorize( + min_percentile=0.49, + max_percentile=1 + ), + 'winsor_3': f.winsorize( + min_percentile=0, + max_percentile=.67 + ), + 'masked': f.winsorize( + min_percentile=0.33, + max_percentile=0.67, + mask=m + ), + 'grouped': f.winsorize( + min_percentile=0.34, + max_percentile=0.66, + groupby=c + ), + 'grouped_str': f.winsorize( + min_percentile=0.34, + max_percentile=0.66, + groupby=str_c + ), + 'grouped_masked': f.winsorize( + min_percentile=0.34, + max_percentile=0.66, + mask=m, + groupby=c + ), + 'grouped_masked_str': f.winsorize( + min_percentile=0.34, + max_percentile=0.66, + mask=m, + groupby=str_c + ), + } + expected = { + 'winsor_1': array([ + [2., 2., 3., 4., 5., 5.], + [8., 8., 27., 64., 125., 125.], + [5., 5., 4., 3., 2., 2.] + ]), + 'winsor_2': array([ + [3.0, 3., 3., 4., 5., 6.], + [27., 27., 27., 64., 125., 216.], + [6.0, 5., 4., 3., 3., 3.] + ]), + 'winsor_3': array([ + [1., 2., 3., 4., 5., 5.], + [1., 8., 27., 64., 125., 125.], + [5., 5., 4., 3., 2., 1.] + ]), + 'masked': array([ + [nan, 3., 3., 4., 5., 5.], + [27., nan, 27., 64., 125., 125.], + [5.0, 5., nan, 3., 2., 2.] + ]), + 'grouped': array([ + [2., 2., 2., 5., 5., 5.], + [8., 8., 8., 125., 125., 125.], + [5., 5., 5., 2., 2., 2.] + ]), + 'grouped_masked': array([ + [nan, 2., 3., 5., 5., 5.], + [1.0, nan, 27., 125., 125., 125.], + [6.0, 5., nan, 2., 2., 2.] + ]), + } + # Changing the classifier dtype shouldn't affect anything. 
+ expected['grouped_str'] = expected['grouped'] + expected['grouped_masked_str'] = expected['grouped_masked'] + + self.check_terms( + terms, + expected, + initial_workspace={ + f: factor_data, + c: classifier_data, + str_c: string_classifier_data, + m: filter_data, + }, + mask=self.build_mask(self.ones_mask(shape=factor_data.shape)), + check=partial(check_allclose, atol=0.001), + ) + + def test_winsorize_bad_bounds(self): + """ + Test out of bounds input for factor.winsorize. + """ + f = self.f + + bad_percentiles = [ + (-.1, 1), + (0, 95), + (5, 95), + (5, 5), + (.6, .4) + ] + for min_, max_ in bad_percentiles: + with self.assertRaises(BadPercentileBounds): + f.winsorize(min_percentile=min_, max_percentile=max_) + @parameter_space( seed_value=range(1, 2), normalizer_name_and_func=[ - ('demean', lambda row: row - nanmean(row)), - ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)), - ('winsorize', lambda row: scipy_winsorize(row, limits=0.05)), + ('demean', {}, lambda row: row - nanmean(row)), + ('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)), + ( + 'winsorize', + {"min_percentile": 0.25, "max_percentile": 0.75}, + lambda row: scipy_winsorize( + row, + limits=0.25, + ) + ), ], add_nulls_to_factor=(False, True,), ) @@ -724,9 +869,9 @@ class FactorTestCase(BasePipelineTestCase): normalizer_name_and_func, add_nulls_to_factor): - name, func = normalizer_name_and_func + name, kwargs, func = normalizer_name_and_func - shape = (7, 7) + shape = (20, 20) # All Trues. 
nomask = self.ones_mask(shape=shape) @@ -757,7 +902,7 @@ class FactorTestCase(BasePipelineTestCase): c = C() c_with_nulls = OtherC() m = Mask() - method = getattr(f, name) + method = partial(getattr(f, name), **kwargs) terms = { 'vanilla': method(), 'masked': method(mask=m), @@ -1054,7 +1199,7 @@ class ShortReprTestCase(TestCase): self.assertEqual(r, "GroupedRowTransform('zscore')") def test_winsorize(self): - r = F().winsorize().short_repr() + r = F().winsorize(min_percentile=.05, max_percentile=.95).short_repr() self.assertEqual(r, "GroupedRowTransform('winsorize')") @@ -1068,8 +1213,22 @@ class TestWindowSafety(TestCase): self.assertFalse(F(window_safe=False).demean().window_safe) self.assertTrue(F(window_safe=True).demean().window_safe) - def test_winsorize_is_window_safe(self): - self.assertTrue(F().winsorize().window_safe) + def test_winsorize_is_window_safe_if_input_is_window_safe(self): + self.assertFalse( + F().winsorize(min_percentile=.05, max_percentile=.95).window_safe + ) + self.assertFalse( + F(window_safe=False).winsorize( + min_percentile=.05, + max_percentile=.95 + ).window_safe + ) + self.assertTrue( + F(window_safe=True).winsorize( + min_percentile=.05, + max_percentile=.95 + ).window_safe + ) class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase): diff --git a/zipline/errors.py b/zipline/errors.py index 656f554e..17626df3 100644 --- a/zipline/errors.py +++ b/zipline/errors.py @@ -570,8 +570,8 @@ class BadPercentileBounds(ZiplineError): are invalid. """ msg = ( - "Percentile bounds must fall between 0.0 and 100.0, and min must be " - "less than max." + "Percentile bounds must fall between 0.0 and {upper_bound}, and min " + "must be less than max." "\nInputs were min={min_percentile}, max={max_percentile}." 
+ ) diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py index 8f422dd0..9b552ef9 100644 --- a/zipline/pipeline/factors/factor.py +++ b/zipline/pipeline/factors/factor.py @@ -4,12 +4,12 @@ factor.py from functools import wraps from operator import attrgetter from numbers import Number +from math import ceil from numpy import empty_like, inf, nan, where from scipy.stats import rankdata -from scipy.stats.mstats import winsorize as scipy_winsorize -from zipline.errors import UnknownRankMethod +from zipline.errors import BadPercentileBounds, UnknownRankMethod from zipline.lib.normalize import naive_grouped_rowwise_apply from zipline.lib.rank import masked_rankdata_2d, rankdata_1d_descending from zipline.pipeline.api_utils import restrict_to_dtype @@ -833,19 +833,25 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): regression_length=regression_length, mask=mask, ) + + @expect_types( + min_percentile=(int, float), + max_percentile=(int, float), + mask=(Filter, NotSpecifiedType), + groupby=(Classifier, NotSpecifiedType), + ) @float64_only def winsorize(self, - limits, - inclusive=(True, True), + min_percentile, + max_percentile, mask=NotSpecified, groupby=NotSpecified): """ - Construct a Factor returns a winsorized row for results. Winsorizing - clips the input values to fixed percentiles. The (limits[0])th lowest - values are set to the value at the (limits[0])th percentile. The values - above the (limits[1])th percentiles are set to the value at the - (limits[1])th percentile. This is useful when limiting the impact of - extreme values. + Construct a Factor that returns a winsorized row. Winsorizing changes values + ranked less than the minimum percentile to the value at the minimum + percentile. Similarly, values ranking above the maximum percentile will + be changed to the value at the maximum percentile. This is useful + when limiting the impact of extreme values. 
If ``mask`` is supplied, ignore values where ``mask`` returns False when computing row means and standard deviations, and output NaN @@ -857,14 +863,14 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): Parameters ---------- - limits : None, tuple of float, optional - A tuple of two values between 0 and 100 inclusive. This is the - percentage to cut from each tail of the array. A value of None - can be used to indicate an open limit. - inclusive : a tuple of bool, optional - A bool indicating whether the data on each side should be - rounded(True) or truncated(False). A value of None can be used if - one side is not being winsorized. Default is (False, False). + min_percentile: float, int + Entries with values at or below this percentile will be replaced + with the (len(inp) * min_percentile)th lowest value. If low values + should not be clipped, use 0. + max_percentile: float, int + Entries with values at or above this percentile will be replaced + with the (len(inp) * max_percentile)th lowest value. If high + values should not be clipped, use 1. mask : zipline.pipeline.Filter, optional A Filter defining values to ignore when winsorizing. 
+ groupby : zipline.pipeline.Classifier, optional @@ -881,34 +887,43 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): price = USEquityPricing.close.latest columns={ 'PRICE': price, - 'WINSOR_1: price.winsorize(limits=25), - 'WINSOR_2': price.winsorize(limits=(50, None)), - 'WINSOR_3': price.winsorize( - limits=25, inclusive=(False, False) + 'WINSOR_1': price.winsorize( + min_percentile=0.25, max_percentile=0.75 ), - 'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)), - 'WINSOR_5': price.winsorize(limits=(20, 40)), + 'WINSOR_2': price.winsorize( + min_percentile=0.50, max_percentile=1.0 + ), + 'WINSOR_3': price.winsorize( + min_percentile=0.0, max_percentile=0.5 + ), + } Given a pipeline with columns, defined above, the result for a given day could look like: - 'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5' - Asset_1 1 2 4 3 2 2 - Asset_2 2 2 4 3 2 2 - Asset_3 3 3 4 3 3 2 - Asset_4 4 4 4 4 4 4 - Asset_5 5 5 5 4 4 4 - Asset_6 6 5 5 4 4 4 + 'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' + Asset_1 1 2 4 1 + Asset_2 2 2 4 2 + Asset_3 3 3 4 3 + Asset_4 4 4 4 3 + Asset_5 5 5 5 3 + Asset_6 6 5 6 3 See Also -------- :func:`scipy.stats.mstats.winsorize` :meth:`pandas.DataFrame.groupby` """ + if not 0.0 <= min_percentile < max_percentile <= 1.0: + raise BadPercentileBounds( + min_percentile=min_percentile, + max_percentile=max_percentile, + upper_bound=1.0, + ) return GroupedRowTransform( transform=winsorize, - transform_args=(limits, inclusive), + transform_args=(min_percentile, max_percentile), factor=self, groupby=groupby, dtype=self.dtype, @@ -1616,18 +1631,21 @@ def zscore(row): return (row - nanmean(row)) / nanstd(row) -def winsorize(row, limits, inclusive): - if isinstance(limits, int) or isinstance(limits, float): - limits = limits / 100. - if isinstance(limits, tuple): - if limits[0] is not None: - limit_0 = limits[0] / 100. 
- else: - limit_0 = None - if limits[1] is not None: - limit_1 = limits[1] / 100 - else: - limit_1 = None - limits = (limit_0, limit_1) +def winsorize(row, min_percentile, max_percentile): + """ + This implementation is based on scipy.stats.mstats.winsorize + """ + a = row.copy() + num = a.size + idx = a.argsort() + if min_percentile > 0: + lowidx = int(min_percentile * num) + a[idx[:lowidx]] = a[idx[lowidx]] + if max_percentile < 1: + upidx = ceil(num * max_percentile) + # upidx could return as the length of the array, in this case + # no modification to the right tail is necessary. + if upidx < num: + a[idx[upidx:]] = a[idx[upidx - 1]] - return scipy_winsorize(row, limits=limits, inclusive=inclusive) + return a diff --git a/zipline/pipeline/filters/filter.py b/zipline/pipeline/filters/filter.py index 864e18f9..b4e3809d 100644 --- a/zipline/pipeline/filters/filter.py +++ b/zipline/pipeline/filters/filter.py @@ -334,6 +334,7 @@ class PercentileFilter(SingleInputMixin, Filter): raise BadPercentileBounds( min_percentile=self._min_percentile, max_percentile=self._max_percentile, + upper_bound=100.0 ) return super(PercentileFilter, self)._validate()