From 9e3404646ee415560ea4a07de77eb29aa4262886 Mon Sep 17 00:00:00 2001 From: Andrey Portnoy Date: Fri, 3 Jun 2016 17:23:27 -0700 Subject: [PATCH] add groupby to rank, top, and bottom --- tests/pipeline/test_factor.py | 201 +++++++++++++++++++++++++++++ zipline/pipeline/factors/factor.py | 42 ++++-- 2 files changed, 235 insertions(+), 8 deletions(-) diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py index e34ffb56..5de7a6be 100644 --- a/tests/pipeline/test_factor.py +++ b/tests/pipeline/test_factor.py @@ -336,6 +336,207 @@ class FactorTestCase(BasePipelineTestCase): for method in results: check_arrays(expected[method], results[method]) + def test_grouped_rank_ascending(self, factor_dtype=float64_dtype): + + f = F(dtype=factor_dtype) + c = C() + str_c = C(dtype=categorical_dtype, missing_value=None) + + # Generated with: + # data = arange(25).reshape(5, 5).transpose() % 4 + data = array([[0, 1, 2, 3, 0], + [1, 2, 3, 0, 1], + [2, 3, 0, 1, 2], + [3, 0, 1, 2, 3], + [0, 1, 2, 3, 0]], dtype=factor_dtype) + + # Generated with: + # classifier_data = arange(25).reshape(5, 5).transpose() % 2 + classifier_data = array([[0, 1, 0, 1, 0], + [1, 0, 1, 0, 1], + [0, 1, 0, 1, 0], + [1, 0, 1, 0, 1], + [0, 1, 0, 1, 0]], dtype=int64_dtype) + string_classifier_data = LabelArray( + classifier_data.astype(str).astype(object), + missing_value=None, + ) + + expected_grouped_ranks = { + 'ordinal': array( + [[1., 1., 3., 2., 2.], + [1., 2., 3., 1., 2.], + [2., 2., 1., 1., 3.], + [2., 1., 1., 2., 3.], + [1., 1., 3., 2., 2.]] + ), + 'average': array( + [[1.5, 1., 3., 2., 1.5], + [1.5, 2., 3., 1., 1.5], + [2.5, 2., 1., 1., 2.5], + [2.5, 1., 1., 2., 2.5], + [1.5, 1., 3., 2., 1.5]] + ), + 'min': array( + [[1., 1., 3., 2., 1.], + [1., 2., 3., 1., 1.], + [2., 2., 1., 1., 2.], + [2., 1., 1., 2., 2.], + [1., 1., 3., 2., 1.]] + ), + 'max': array( + [[2., 1., 3., 2., 2.], + [2., 2., 3., 1., 2.], + [3., 2., 1., 1., 3.], + [3., 1., 1., 2., 3.], + [2., 1., 3., 2., 2.]] + ), + 
'dense': array( + [[1., 1., 2., 2., 1.], + [1., 2., 2., 1., 1.], + [2., 2., 1., 1., 2.], + [2., 1., 1., 2., 2.], + [1., 1., 2., 2., 1.]] + ), + } + + def check(terms): + graph = TermGraph(terms) + results = self.run_graph( + graph, + initial_workspace={ + f: data, + c: classifier_data, + str_c: string_classifier_data, + }, + mask=self.build_mask(ones((5, 5))), + ) + + for method in terms: + check_arrays(results[method], expected_grouped_ranks[method]) + + # Not specifying the value of ascending param should default to True + check({ + meth: f.rank(method=meth, groupby=c) + for meth in expected_grouped_ranks + }) + check({ + meth: f.rank(method=meth, groupby=str_c) + for meth in expected_grouped_ranks + }) + check({ + meth: f.rank(method=meth, groupby=c, ascending=True) + for meth in expected_grouped_ranks + }) + check({ + meth: f.rank(method=meth, groupby=str_c, ascending=True) + for meth in expected_grouped_ranks + }) + + # Not passing a method should default to ordinal + check({'ordinal': f.rank(groupby=c)}) + check({'ordinal': f.rank(groupby=str_c)}) + check({'ordinal': f.rank(groupby=c, ascending=True)}) + check({'ordinal': f.rank(groupby=str_c, ascending=True)}) + + def test_grouped_rank_descending(self, factor_dtype=float64_dtype): + + f = F(dtype=factor_dtype) + c = C() + str_c = C(dtype=categorical_dtype, missing_value=None) + + # Generated with: + # data = arange(25).reshape(5, 5).transpose() % 4 + data = array([[0, 1, 2, 3, 0], + [1, 2, 3, 0, 1], + [2, 3, 0, 1, 2], + [3, 0, 1, 2, 3], + [0, 1, 2, 3, 0]], dtype=factor_dtype) + + # Generated with: + # classifier_data = arange(25).reshape(5, 5).transpose() % 2 + classifier_data = array([[0, 1, 0, 1, 0], + [1, 0, 1, 0, 1], + [0, 1, 0, 1, 0], + [1, 0, 1, 0, 1], + [0, 1, 0, 1, 0]], dtype=int64_dtype) + + string_classifier_data = LabelArray( + classifier_data.astype(str).astype(object), + missing_value=None, + ) + + expected_grouped_ranks = { + 'ordinal': array( + [[2., 2., 1., 1., 3.], + [2., 1., 1., 2., 3.], + 
[1., 1., 3., 2., 2.], + [1., 2., 3., 1., 2.], + [2., 2., 1., 1., 3.]] + ), + 'average': array( + [[2.5, 2., 1., 1., 2.5], + [2.5, 1., 1., 2., 2.5], + [1.5, 1., 3., 2., 1.5], + [1.5, 2., 3., 1., 1.5], + [2.5, 2., 1., 1., 2.5]] + ), + 'min': array( + [[2., 2., 1., 1., 2.], + [2., 1., 1., 2., 2.], + [1., 1., 3., 2., 1.], + [1., 2., 3., 1., 1.], + [2., 2., 1., 1., 2.]] + ), + 'max': array( + [[3., 2., 1., 1., 3.], + [3., 1., 1., 2., 3.], + [2., 1., 3., 2., 2.], + [2., 2., 3., 1., 2.], + [3., 2., 1., 1., 3.]] + ), + 'dense': array( + [[2., 2., 1., 1., 2.], + [2., 1., 1., 2., 2.], + [1., 1., 2., 2., 1.], + [1., 2., 2., 1., 1.], + [2., 2., 1., 1., 2.]] + ), + } + + def check(terms): + graph = TermGraph(terms) + results = self.run_graph( + graph, + initial_workspace={ + f: data, + c: classifier_data, + str_c: string_classifier_data, + }, + mask=self.build_mask(ones((5, 5))), + ) + + for method in terms: + check_arrays(results[method], expected_grouped_ranks[method]) + + check({ + meth: f.rank(method=meth, groupby=c, ascending=False) + for meth in expected_grouped_ranks + }) + check({ + meth: f.rank(method=meth, groupby=str_c, ascending=False) + for meth in expected_grouped_ranks + }) + + # Not passing a method should default to ordinal + check({'ordinal': f.rank(groupby=c, ascending=False)}) + check({'ordinal': f.rank(groupby=str_c, ascending=False)}) + + # TODO finish this + # @for_each_factor_dtype + # def test_grouped_rank_after_mask(self, name, factor_dtype): + # pass + @parameterized.expand([ # Test cases computed by doing: # from numpy.random import seed, randn diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py index 52403bcc..cfcc0a5d 100644 --- a/zipline/pipeline/factors/factor.py +++ b/zipline/pipeline/factors/factor.py @@ -6,6 +6,7 @@ from operator import attrgetter from numbers import Number from numpy import inf, where +from scipy.stats import rankdata from zipline.errors import UnknownRankMethod from zipline.lib.normalize 
import naive_grouped_rowwise_apply @@ -581,7 +582,11 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): window_safe=True, ) - def rank(self, method='ordinal', ascending=True, mask=NotSpecified): + def rank(self, + method='ordinal', + ascending=True, + mask=NotSpecified, + groupby=NotSpecified): """ Construct a new Factor representing the sorted rank of each column within each row. @@ -599,6 +604,8 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): A Filter representing assets to consider when computing ranks. If mask is supplied, ranks are computed ignoring any asset/date pairs for which `mask` produces a value of False. + groupby : zipline.pipeline.Classifier, optional + A classifier defining partitions over which to perform ranking. Returns ------- @@ -620,7 +627,21 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): :func:`scipy.stats.rankdata` :class:`zipline.pipeline.factors.factor.Rank` """ - return Rank(self, method=method, ascending=ascending, mask=mask) + + if groupby is NotSpecified: + return Rank(self, method=method, ascending=ascending, mask=mask) + + else: + def rank(row): + return rankdata(row if ascending else -row, method=method) + + return GroupedRowTransform( + transform=rank, + factor=self, + mask=mask, + groupby=groupby, + window_safe=True, + ) @expect_types( target=Term, correlation_length=int, mask=(Filter, NotSpecifiedType), @@ -913,7 +934,7 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): """ return self.quantiles(bins=10, mask=mask) - def top(self, N, mask=NotSpecified): + def top(self, N, mask=NotSpecified, groupby=NotSpecified): """ Construct a Filter matching the top N asset values of self each day. @@ -925,14 +946,16 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): A Filter representing assets to consider when computing ranks. If mask is supplied, top values are computed ignoring any asset/date pairs for which `mask` produces a value of False. 
+ groupby : zipline.pipeline.Classifier, optional + A classifier defining partitions over which to perform ranking. Returns ------- filter : zipline.pipeline.filters.Filter """ - return self.rank(ascending=False, mask=mask) <= N + return self.rank(ascending=False, mask=mask, groupby=groupby) <= N - def bottom(self, N, mask=NotSpecified): + def bottom(self, N, mask=NotSpecified, groupby=NotSpecified): """ Construct a Filter matching the bottom N asset values of self each day. @@ -944,12 +967,14 @@ class Factor(RestrictedDTypeMixin, ComputableTerm): A Filter representing assets to consider when computing ranks. If mask is supplied, bottom values are computed ignoring any asset/date pairs for which `mask` produces a value of False. + groupby : zipline.pipeline.Classifier, optional + A classifier defining partitions over which to perform ranking. Returns ------- filter : zipline.pipeline.Filter """ - return self.rank(ascending=True, mask=mask) <= N + return self.rank(ascending=True, mask=mask, groupby=groupby) <= N def percentile_between(self, min_percentile, @@ -1075,7 +1100,7 @@ class GroupedRowTransform(Factor): Factor. This is most often useful for normalization operators like ``zscore`` or - ``demean``. + ``demean``, or for grouped ranking via ``rank``. Parameters ---------- @@ -1093,12 +1118,13 @@ class GroupedRowTransform(Factor): ----- Users should rarely construct instances of this factor directly. Instead, they should construct instances via factor normalization methods like - ``zscore`` and ``demean``. + ``zscore`` and ``demean``, or via ``rank`` with ``groupby``. See Also -------- zipline.pipeline.factors.Factor.zscore zipline.pipeline.factors.Factor.demean + zipline.pipeline.factors.Factor.rank """ window_length = 0