TST: Add tests for winsorize factor

2026-06-28 07:48:45 +08:00 · 2017-03-06 14:08:51 -05:00
parent 309ec73faa
commit b4e97bc9d8
4 changed files with 236 additions and 58 deletions
@@ -24,7 +24,7 @@ from numpy.random import randn, seed
 import pandas as pd
 from scipy.stats.mstats import winsorize as scipy_winsorize

-from zipline.errors import UnknownRankMethod
+from zipline.errors import BadPercentileBounds, UnknownRankMethod
 from zipline.lib.labelarray import LabelArray
 from zipline.lib.rank import masked_rankdata_2d
 from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
@@ -710,12 +710,157 @@ class FactorTestCase(BasePipelineTestCase):
            check=partial(check_allclose, atol=0.001),
        )

+    def test_winsorize_hand_computed(self):
+        """
+        Test the hand-computed example in factor.winsorize.
+        """
+        f = self.f
+        m = Mask()
+        c = C()
+        str_c = C(dtype=categorical_dtype, missing_value=None)
+
+        factor_data = array([
+            [1.,     2.,  3.,  4.,   5.,   6.],
+            [1.,     8., 27., 64., 125., 216.],
+            [6.,     5.,  4.,  3.,   2.,   1.]
+        ])
+        filter_data = array(
+            [[False, True, True, True, True, True],
+             [True, False, True, True, True, True],
+             [True, True, False, True, True, True]],
+            dtype=bool,
+        )
+        classifier_data = array(
+            [[1, 1, 1, 2, 2, 2],
+             [1, 1, 1, 2, 2, 2],
+             [1, 1, 1, 2, 2, 2]],
+            dtype=int64_dtype,
+        )
+        string_classifier_data = LabelArray(
+            classifier_data.astype(str).astype(object),
+            missing_value=None,
+        )
+
+        terms = {
+            'winsor_1': f.winsorize(
+                min_percentile=0.33,
+                max_percentile=0.67
+            ),
+            'winsor_2': f.winsorize(
+                min_percentile=0.49,
+                max_percentile=1
+            ),
+            'winsor_3': f.winsorize(
+                min_percentile=0,
+                max_percentile=.67
+            ),
+            'masked': f.winsorize(
+                min_percentile=0.33,
+                max_percentile=0.67,
+                mask=m
+            ),
+            'grouped': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                groupby=c
+            ),
+            'grouped_str': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                groupby=str_c
+            ),
+            'grouped_masked': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                mask=m,
+                groupby=c
+            ),
+            'grouped_masked_str': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                mask=m,
+                groupby=str_c
+            ),
+        }
+        expected = {
+            'winsor_1': array([
+                [2.,    2.,    3.,    4.,    5.,    5.],
+                [8.,    8.,   27.,   64.,  125.,  125.],
+                [5.,    5.,    4.,    3.,    2.,    2.]
+            ]),
+            'winsor_2': array([
+                [3.0,    3.,    3.,    4.,    5.,    6.],
+                [27.,   27.,   27.,   64.,  125.,  216.],
+                [6.0,    5.,    4.,    3.,    3.,    3.]
+            ]),
+            'winsor_3': array([
+                [1.,    2.,    3.,    4.,    5.,    5.],
+                [1.,    8.,   27.,   64.,  125.,  125.],
+                [5.,    5.,    4.,    3.,    2.,    1.]
+            ]),
+            'masked': array([
+                [nan,    3.,    3.,    4.,    5.,    5.],
+                [27.,   nan,   27.,   64.,  125.,  125.],
+                [5.0,    5.,    nan,    3.,    2.,   2.]
+            ]),
+            'grouped': array([
+                [2.,    2.,    2.,    5.,    5.,    5.],
+                [8.,    8.,    8.,  125.,  125.,  125.],
+                [5.,    5.,    5.,    2.,    2.,    2.]
+            ]),
+            'grouped_masked': array([
+                [nan,    2.,    3.,    5.,    5.,    5.],
+                [1.0,   nan,   27.,  125.,  125.,  125.],
+                [6.0,    5.,    nan,    2.,    2.,   2.]
+            ]),
+        }
+        # Changing the classifier dtype shouldn't affect anything.
+        expected['grouped_str'] = expected['grouped']
+        expected['grouped_masked_str'] = expected['grouped_masked']
+
+        self.check_terms(
+            terms,
+            expected,
+            initial_workspace={
+                f: factor_data,
+                c: classifier_data,
+                str_c: string_classifier_data,
+                m: filter_data,
+            },
+            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
+            check=partial(check_allclose, atol=0.001),
+        )
+
+    def test_winsorize_bad_bounds(self):
+        """
+        Test out of bounds input for factor.winsorize.
+        """
+        f = self.f
+
+        bad_percentiles = [
+            (-.1, 1),
+            (0, 95),
+            (5, 95),
+            (5, 5),
+            (.6, .4)
+        ]
+        for min_, max_ in bad_percentiles:
+            with self.assertRaises(BadPercentileBounds):
+                f.winsorize(min_percentile=min_, max_percentile=max_)
+
    @parameter_space(
        seed_value=range(1, 2),
        normalizer_name_and_func=[
-            ('demean', lambda row: row - nanmean(row)),
-            ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
-            ('winsorize', lambda row: scipy_winsorize(row, limits=0.05)),
+            ('demean', {}, lambda row: row - nanmean(row)),
+            ('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
+            (
+                'winsorize',
+                {"min_percentile": 0.25, "max_percentile": 0.75},
+                lambda row: scipy_winsorize(
+                    row,
+                    limits=0.25,
+                )
+            ),
        ],
        add_nulls_to_factor=(False, True,),
    )
@@ -724,9 +869,9 @@ class FactorTestCase(BasePipelineTestCase):
                                       normalizer_name_and_func,
                                       add_nulls_to_factor):

-        name, func = normalizer_name_and_func
+        name, kwargs, func = normalizer_name_and_func

-        shape = (7, 7)
+        shape = (20, 20)

        # All Trues.
        nomask = self.ones_mask(shape=shape)
@@ -757,7 +902,7 @@ class FactorTestCase(BasePipelineTestCase):
        c = C()
        c_with_nulls = OtherC()
        m = Mask()
-        method = getattr(f, name)
+        method = partial(getattr(f, name), **kwargs)
        terms = {
            'vanilla': method(),
            'masked': method(mask=m),
@@ -1054,7 +1199,7 @@ class ShortReprTestCase(TestCase):
        self.assertEqual(r, "GroupedRowTransform('zscore')")

    def test_winsorize(self):
-        r = F().winsorize().short_repr()
+        r = F().winsorize(min_percentile=.05, max_percentile=.95).short_repr()
        self.assertEqual(r, "GroupedRowTransform('winsorize')")


@@ -1068,8 +1213,22 @@ class TestWindowSafety(TestCase):
        self.assertFalse(F(window_safe=False).demean().window_safe)
        self.assertTrue(F(window_safe=True).demean().window_safe)

-    def test_winsorize_is_window_safe(self):
-        self.assertTrue(F().winsorize().window_safe)
+    def test_winsorize_is_window_safe_if_input_is_window_safe(self):
+        self.assertFalse(
+            F().winsorize(min_percentile=.05, max_percentile=.95).window_safe
+        )
+        self.assertFalse(
+            F(window_safe=False).winsorize(
+                min_percentile=.05,
+                max_percentile=.95
+            ).window_safe
+        )
+        self.assertTrue(
+            F(window_safe=True).winsorize(
+                min_percentile=.05,
+                max_percentile=.95
+            ).window_safe
+        )


 class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
@@ -570,8 +570,8 @@ class BadPercentileBounds(ZiplineError):
    are invalid.
    """
    msg = (
-        "Percentile bounds must fall between 0.0 and 100.0, and min must be "
-        "less than max."
+        "Percentile bounds must fall between 0.0 and {upper_bound}, and min "
+        "must be less than max."
        "\nInputs were min={min_percentile}, max={max_percentile}."
    )

@@ -4,12 +4,12 @@ factor.py
 from functools import wraps
 from operator import attrgetter
 from numbers import Number
+from math import ceil

 from numpy import empty_like, inf, nan, where
 from scipy.stats import rankdata
-from scipy.stats.mstats import winsorize as scipy_winsorize

-from zipline.errors import UnknownRankMethod
+from zipline.errors import BadPercentileBounds, UnknownRankMethod
 from zipline.lib.normalize import naive_grouped_rowwise_apply
 from zipline.lib.rank import masked_rankdata_2d, rankdata_1d_descending
 from zipline.pipeline.api_utils import restrict_to_dtype
@@ -833,19 +833,25 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
            regression_length=regression_length,
            mask=mask,
        )
+
+    @expect_types(
+        min_percentile=(int, float),
+        max_percentile=(int, float),
+        mask=(Filter, NotSpecifiedType),
+        groupby=(Classifier, NotSpecifiedType),
+    )
    @float64_only
    def winsorize(self,
-                  limits,
-                  inclusive=(True, True),
+                  min_percentile,
+                  max_percentile,
                  mask=NotSpecified,
                  groupby=NotSpecified):
        """
-        Construct a Factor returns a winsorized row for results. Winsorizing
-        clips the input values to fixed percentiles. The (limits[0])th lowest
-        values are set to the value at the (limits[0])th percentile. The values
-        above the (limits[1])th percentiles are set to the value at the
-        (limits[1])th percentile. This is useful when limiting the impact of
-        extreme values.
+        Construct a Factor that returns a winsorized row. Winsorizing changes
+        values ranked below the minimum percentile to the value at the minimum
+        percentile. Similarly, values ranking above the maximum percentile will
+        be changed to the value at the maximum percentile. This is useful
+        when limiting the impact of extreme values.

        If ``mask`` is supplied, ignore values where ``mask`` returns False
        when computing row means and standard deviations, and output NaN
@@ -857,14 +863,14 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):

        Parameters
        ----------
-        limits : None, tuple of float, optional
-            A tuple of two values between 0 and 100 inclusive. This is the
-            percentage to cut from each tail of the array. A value of None
-            can be used to indicate an open limit.
-        inclusive : a tuple of bool, optional
-            A bool indicating whether the data on each side should be
-            rounded(True) or truncated(False). A value of None can be used if
-            one side is not being winsorized. Default is (False, False).
+        min_percentile : float, int
+            Entries with values at or below this percentile will be replaced
+            with the (len(inp) * min_percentile)th lowest value. If low values
+            should not be clipped, use 0.
+        max_percentile : float, int
+            Entries with values at or above this percentile will be replaced
+            with the (len(inp) * max_percentile)th lowest value. If high
+            values should not be clipped, use 1.
        mask : zipline.pipeline.Filter, optional
            A Filter defining values to ignore when winsorizing.
        groupby : zipline.pipeline.Classifier, optional
@@ -881,34 +887,43 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
        price = USEquityPricing.close.latest
        columns={
            'PRICE': price,
-            'WINSOR_1: price.winsorize(limits=25),
-            'WINSOR_2': price.winsorize(limits=(50, None)),
-            'WINSOR_3': price.winsorize(
-                limits=25, inclusive=(False, False)
+            'WINSOR_1': price.winsorize(
+                min_percentile=0.25, max_percentile=0.75
            ),
-            'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)),
-            'WINSOR_5': price.winsorize(limits=(20, 40)),
+            'WINSOR_2': price.winsorize(
+                min_percentile=0.50, max_percentile=1.0
+            ),
+            'WINSOR_3': price.winsorize(
+                min_percentile=0.0, max_percentile=0.5
+            ),
+
        }

        Given a pipeline with columns, defined above, the result for a
        given day could look like:

-                'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5'
-        Asset_1    1        2          4          3          2          2
-        Asset_2    2        2          4          3          2          2
-        Asset_3    3        3          4          3          3          2
-        Asset_4    4        4          4          4          4          4
-        Asset_5    5        5          5          4          4          4
-        Asset_6    6        5          5          4          4          4
+                'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3'
+        Asset_1    1        2          4          1
+        Asset_2    2        2          4          2
+        Asset_3    3        3          4          3
+        Asset_4    4        4          4          3
+        Asset_5    5        5          5          3
+        Asset_6    6        5          6          3

        See Also
        --------
        :func:`scipy.stats.mstats.winsorize`
        :meth:`pandas.DataFrame.groupby`
        """
+        if not 0.0 <= min_percentile < max_percentile <= 1.0:
+            raise BadPercentileBounds(
+                min_percentile=min_percentile,
+                max_percentile=max_percentile,
+                upper_bound=1.0,
+            )
        return GroupedRowTransform(
            transform=winsorize,
-            transform_args=(limits, inclusive),
+            transform_args=(min_percentile, max_percentile),
            factor=self,
            groupby=groupby,
            dtype=self.dtype,
@@ -1616,18 +1631,21 @@ def zscore(row):
    return (row - nanmean(row)) / nanstd(row)


-def winsorize(row, limits, inclusive):
-    if isinstance(limits, int) or isinstance(limits, float):
-        limits = limits / 100.
-    if isinstance(limits, tuple):
-        if limits[0] is not None:
-            limit_0 = limits[0] / 100.
-        else:
-            limit_0 = None
-        if limits[1] is not None:
-            limit_1 = limits[1] / 100
-        else:
-            limit_1 = None
-        limits = (limit_0, limit_1)
+def winsorize(row, min_percentile, max_percentile):
+    """
+    This implementation is based on scipy.stats.mstats.winsorize
+    """
+    a = row.copy()
+    num = a.size
+    idx = a.argsort()
+    if min_percentile > 0:
+        lowidx = int(min_percentile * num)
+        a[idx[:lowidx]] = a[idx[lowidx]]
+    if max_percentile < 1:
+        upidx = ceil(num * max_percentile)
+        # ceil() can make upidx equal to the length of the array; in that
+        # case no modification to the right tail is necessary.
+        if upidx < num:
+            a[idx[upidx:]] = a[idx[upidx - 1]]

-    return scipy_winsorize(row, limits=limits, inclusive=inclusive)
+    return a
@@ -334,6 +334,7 @@ class PercentileFilter(SingleInputMixin, Filter):
            raise BadPercentileBounds(
                min_percentile=self._min_percentile,
                max_percentile=self._max_percentile,
+                upper_bound=100.0
            )
        return super(PercentileFilter, self)._validate()