From b85eb36da82230b12390d83a0654a3f2464ae0f2 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 21 Mar 2016 18:33:32 -0400
Subject: [PATCH 01/18] TEST: Add test for demean example.

---
 tests/pipeline/test_factor.py     | 95 +++++++++++++++++++++++++++++--
 zipline/testing/core.py           | 10 +++-
 zipline/utils/functional.py       | 61 ++++++++++++++++++++
 zipline/utils/input_validation.py | 26 ++++-----
 4 files changed, 173 insertions(+), 19 deletions(-)

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index 2e27eb2f..e7dab7cc 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -2,6 +2,7 @@
 Tests for Factor terms.
 """
 from itertools import product
+from six import iteritems
 from nose_parameterized import parameterized
 
 from numpy import (
@@ -32,6 +33,7 @@ from zipline.testing import (
     check_arrays,
     parameter_space,
 )
+from zipline.utils.functional import dzip_exact
 from zipline.utils.numpy_utils import (
     datetime64ns_dtype,
     float64_dtype,
@@ -423,6 +425,91 @@ class FactorTestCase(BasePipelineTestCase):
 
         check_arrays(float_result, datetime_result)
 
+    def test_normalizations_hand_computed(self):
+        """
+        Test the hand-computed example in factor.demean.
+        """
+        f = self.f
+        m = Mask()
+        c = C()
+
+        factor_data = array(
+            [[1.0, 2.0, 3.0, 4.0],
+             [1.5, 2.5, 3.5, 1.0],
+             [2.0, 3.0, 4.0, 1.5],
+             [2.5, 3.5, 1.0, 2.0]],
+        )
+        filter_data = array(
+            [[False, True, True, True],
+             [True, False, True, True],
+             [True, True, False, True],
+             [True, True, True, False]],
+            dtype=bool,
+        )
+        classifier_data = array(
+            [[1, 1, 2, 2],
+             [1, 1, 2, 2],
+             [1, 1, 2, 2],
+             [1, 1, 2, 2]],
+            dtype=int,
+        )
+
+        terms = {
+            'vanilla': f.demean(),
+            'masked': f.demean(mask=m),
+            'grouped': f.demean(groupby=c),
+            'grouped_masked': f.demean(mask=m, groupby=c),
+        }
+        expected = {
+            'vanilla': array(
+                [[-1.500, -0.500,  0.500,  1.500],
+                 [-0.625,  0.375,  1.375, -1.125],
+                 [-0.625,  0.375,  1.375, -1.125],
+                 [0.250,   1.250, -1.250, -0.250]],
+            ),
+            'masked': array(
+                [[nan,    -1.000,  0.000,  1.000],
+                 [-0.500,    nan,  1.500, -1.000],
+                 [-0.166,  0.833,    nan, -0.666],
+                 [0.166,   1.166, -1.333,    nan]],
+            ),
+            'grouped': array(
+                [[-0.500, 0.500, -0.500,  0.500],
+                 [-0.500, 0.500,  1.250, -1.250],
+                 [-0.500, 0.500,  1.250, -1.250],
+                 [-0.500, 0.500, -0.500,  0.500]],
+            ),
+            'grouped_masked': array(
+                [[nan,     0.000, -0.500,  0.500],
+                 [0.000,     nan,  1.250, -1.250],
+                 [-0.500,  0.500,    nan,  0.000],
+                 [-0.500,  0.500,  0.000,    nan]]
+            )
+        }
+
+        graph = TermGraph(terms)
+        results = self.run_graph(
+            graph,
+            initial_workspace={
+                f: factor_data,
+                c: classifier_data,
+                m: filter_data,
+            },
+            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
+        )
+
+        for key, (res, exp) in iteritems(dzip_exact(results, expected)):
+            check_allclose(
+                res,
+                exp,
+                # The hand-computed values aren't very precise (in particular,
+                # we truncate repeating decimals at 3 places). This is just
+                # asserting that the example isn't misleading by being totally
+                # wrong.
+                atol=0.001,
+                err_msg="Mismatch for %r" % key
+            )
+
     @parameter_space(
         seed_value=range(1, 2),
         normalizer_name_and_func=[
@@ -431,10 +518,10 @@ class FactorTestCase(BasePipelineTestCase):
         ],
         add_nulls_to_factor=(False, True,)
     )
-    def test_normalizations(self,
-                            seed_value,
-                            normalizer_name_and_func,
-                            add_nulls_to_factor):
+    def test_normalizations_randomized(self,
+                                       seed_value,
+                                       normalizer_name_and_func,
+                                       add_nulls_to_factor):
 
         name, func = normalizer_name_and_func
 
diff --git a/zipline/testing/core.py b/zipline/testing/core.py
index 15d1a703..8bc05bc0 100644
--- a/zipline/testing/core.py
+++ b/zipline/testing/core.py
@@ -568,8 +568,14 @@ def check_allclose(actual,
     """
     if type(actual) != type(desired):
         raise AssertionError("%s != %s" % (type(actual), type(desired)))
-    return assert_allclose(actual, desired, rtol=rtol, atol=atol,
-                           err_msg=err_msg, verbose=verbose)
+    return assert_allclose(
+        actual,
+        desired,
+        atol=atol,
+        rtol=rtol,
+        err_msg=err_msg,
+        verbose=verbose,
+    )
 
 
 def check_arrays(x, y, err_msg='', verbose=True):
diff --git a/zipline/utils/functional.py b/zipline/utils/functional.py
index 420cd604..80ac1878 100644
--- a/zipline/utils/functional.py
+++ b/zipline/utils/functional.py
@@ -1,3 +1,8 @@
+from operator import methodcaller
+from six.moves import map
+from pprint import pformat
+
+
 def mapall(funcs, seq):
     """
     Parameters
@@ -20,3 +25,59 @@ def mapall(funcs, seq):
     for func in funcs:
         for elem in seq:
             yield func(elem)
+
+
+def same(*values):
+    """
+    Check if all values in a sequence are equal.
+
+    Returns True on empty sequences.
+
+    Example
+    -------
+    >>> same(1, 1, 1, 1)
+    True
+    >>> same(1, 2, 1)
+    False
+    >>> same()
+    True
+    """
+    if not values:
+        return True
+    first, rest = values[0], values[1:]
+    return all(value == first for value in rest)
+
+
+def _format_unequal_keys(dicts):
+    return pformat([sorted(d.keys()) for d in dicts])
+
+
+def dzip_exact(*dicts):
+    """
+    Parameters
+    ----------
+    *dicts : iterable[dict]
+        A sequence of dicts all sharing the same keys.
+
+    Returns
+    -------
+    zipped : dict
+        A dict whose keys are the union of all keys in *dicts, and whose values
+        are tuples of length len(dicts) containing the result of looking up
+        each key in each dict.
+
+    Raises
+    ------
+    ValueError
+        If dicts don't all have the same keys.
+
+    Example
+    -------
+    >>> dzip_exact({'a': 1, 'b': 2}, {'a': 3, 'b': 4})
+    {'a': (1, 3), 'b': (2, 4)}
+    """
+    if not same(*map(methodcaller('viewkeys'), dicts)):
+        raise ValueError(
+            "dict keys not all equal:\n\n%s" % _format_unequal_keys(dicts)
+        )
+    return {k: tuple(d[k] for d in dicts) for k in dicts[0]}
diff --git a/zipline/utils/input_validation.py b/zipline/utils/input_validation.py
index d04869fc..692cfaf7 100644
--- a/zipline/utils/input_validation.py
+++ b/zipline/utils/input_validation.py
@@ -321,6 +321,19 @@ def optional(type_):
     return (type_, type(None))
 
 
+def _expect_element(collection):
+    template = (
+        "%(funcname)s() expected a value in {collection} "
+        "for argument '%(argname)s', but got %(actual)s instead."
+    ).format(collection=collection)
+    return make_check(
+        ValueError,
+        template,
+        complement(op.contains(collection)),
+        repr,
+    )
+
+
 def expect_element(*_pos, **named):
     """
     Preprocessing decorator that verifies inputs are elements of some
@@ -391,16 +404,3 @@ def coerce(from_, to, **to_kwargs):
 
 
 coerce_string = partial(coerce, string_types)
-
-
-def _expect_element(collection):
-    template = (
-        "%(funcname)s() expected a value in {collection} "
-        "for argument '%(argname)s', but got %(actual)s instead."
-    ).format(collection=collection)
-    return make_check(
-        ValueError,
-        template,
-        complement(op.contains(collection)),
-        repr,
-    )

From 124555234098763427f7fe12de0fd87d8f8144ed Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Tue, 22 Mar 2016 13:30:43 -0400
Subject: [PATCH 02/18] DEV: Add expect_dimensions preprocessor.

---
 tests/utils/test_preprocess.py    | 38 ++++++++++++++++++++++++++-
 zipline/utils/input_validation.py | 43 +++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/tests/utils/test_preprocess.py b/tests/utils/test_preprocess.py
index 2626cedb..343966ab 100644
--- a/tests/utils/test_preprocess.py
+++ b/tests/utils/test_preprocess.py
@@ -6,12 +6,13 @@ from types import FunctionType
 from unittest import TestCase
 
 from nose_parameterized import parameterized
-from numpy import arange, dtype
+from numpy import arange, array, dtype
 import pytz
 from six import PY3
 
 from zipline.utils.preprocess import call, preprocess
 from zipline.utils.input_validation import (
+    expect_dimensions,
     ensure_timezone,
     expect_element,
     expect_dtypes,
@@ -367,3 +368,38 @@ class PreprocessTestCase(TestCase):
         with self.assertRaises(TypeError) as e:
             f('a')
         self.assertIs(e.exception, error)
+
+    def test_expect_dimensions(self):
+
+        @expect_dimensions(x=2)
+        def foo(x, y):
+            return x[0, 0]
+
+        self.assertEqual(foo(arange(1).reshape(1, 1), 10), 0)
+
+        with self.assertRaises(ValueError) as e:
+            foo(arange(1), 1)
+        errmsg = str(e.exception)
+        expected = (
+            "{qualname}() expected a 2-D array for argument 'x', but got"
+            " a 1-D array instead.".format(qualname=qualname(foo))
+        )
+        self.assertEqual(errmsg, expected)
+
+        with self.assertRaises(ValueError) as e:
+            foo(arange(1).reshape(1, 1, 1), 1)
+        errmsg = str(e.exception)
+        expected = (
+            "{qualname}() expected a 2-D array for argument 'x', but got"
+            " a 3-D array instead.".format(qualname=qualname(foo))
+        )
+        self.assertEqual(errmsg, expected)
+
+        with self.assertRaises(ValueError) as e:
+            foo(array(0), 1)
+        errmsg = str(e.exception)
+        expected = (
+            "{qualname}() expected a 2-D array for argument 'x', but got"
+            " a scalar instead.".format(qualname=qualname(foo))
+        )
+        self.assertEqual(errmsg, expected)
diff --git a/zipline/utils/input_validation.py b/zipline/utils/input_validation.py
index 692cfaf7..5588518a 100644
--- a/zipline/utils/input_validation.py
+++ b/zipline/utils/input_validation.py
@@ -366,6 +366,49 @@ def expect_element(*_pos, **named):
     return preprocess(**valmap(_expect_element, named))
 
 
+def expect_dimensions(**dimensions):
+    """
+    Preprocessing decorator that verifies inputs are numpy arrays with a
+    specific dimensionality.
+
+    Usage
+    -----
+    >>> from numpy import array
+    >>> @expect_dimensions(x=1, y=2)
+    ... def foo(x, y):
+    ...    return x[0] + y[0, 0]
+    ...
+    >>> foo(array([1, 1]), array([[1, 1], [2, 2]]))
+    2
+    >>> foo(array([1, 1]), array([1, 1]))
+    Traceback (most recent call last):
+       ...
+    ValueError: foo() expected a 2-D array for argument 'y', but got a 1-D array instead.  # noqa
+    """
+    def _expect_dimension(expected_ndim):
+        def _check(func, argname, argvalue):
+            funcname = _qualified_name(func)
+            actual_ndim = argvalue.ndim
+            if actual_ndim != expected_ndim:
+                if actual_ndim == 0:
+                    actual_repr = 'scalar'
+                else:
+                    actual_repr = "%d-D array" % actual_ndim
+                raise ValueError(
+                    "{func}() expected a {expected:d}-D array"
+                    " for argument {argname!r}, but got a {actual}"
+                    " instead.".format(
+                        func=funcname,
+                        expected=expected_ndim,
+                        argname=argname,
+                        actual=actual_repr,
+                    )
+                )
+            return argvalue
+        return _check
+    return preprocess(**valmap(_expect_dimension, dimensions))
+
+
 def coerce(from_, to, **to_kwargs):
     """
     A preprocessing decorator that coerces inputs of a given type by passing

From 1f237d43a3e676dcb045b39a169a012316a6203e Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Tue, 22 Mar 2016 14:02:51 -0400
Subject: [PATCH 03/18] MAINT: Make preprocessor factories closures.

---
 zipline/utils/input_validation.py | 134 +++++++++++++++---------------
 1 file changed, 65 insertions(+), 69 deletions(-)

diff --git a/zipline/utils/input_validation.py b/zipline/utils/input_validation.py
index 5588518a..51708dd0 100644
--- a/zipline/utils/input_validation.py
+++ b/zipline/utils/input_validation.py
@@ -159,41 +159,43 @@ def expect_dtypes(*_pos, **named):
                     name=name, dtype=dtype,
                 )
             )
+
+    def _expect_dtype(_dtype_or_dtype_tuple):
+        """
+        Factory for dtype-checking functions that work with the @preprocess
+        decorator.
+        """
+        # Slightly different messages for dtype and tuple of dtypes.
+        if isinstance(_dtype_or_dtype_tuple, tuple):
+            allowed_dtypes = _dtype_or_dtype_tuple
+        else:
+            allowed_dtypes = (_dtype_or_dtype_tuple,)
+        template = (
+            "%(funcname)s() expected a value with dtype {dtype_str} "
+            "for argument '%(argname)s', but got %(actual)r instead."
+        ).format(dtype_str=' or '.join(repr(d.name) for d in allowed_dtypes))
+
+        def check_dtype(value):
+            return getattr(value, 'dtype', None) not in allowed_dtypes
+
+        def display_bad_value(value):
+            # If the bad value has a dtype, but it's wrong, show the dtype
+            # name.
+            try:
+                return value.dtype.name
+            except AttributeError:
+                return value
+
+        return make_check(
+            exc_type=TypeError,
+            template=template,
+            pred=check_dtype,
+            actual=display_bad_value,
+        )
+
     return preprocess(**valmap(_expect_dtype, named))
 
 
-def _expect_dtype(_dtype_or_dtype_tuple):
-    """
-    Factory for dtype-checking functions that work the @preprocess decorator.
-    """
-    # Slightly different messages for dtype and tuple of dtypes.
-    if isinstance(_dtype_or_dtype_tuple, tuple):
-        allowed_dtypes = _dtype_or_dtype_tuple
-    else:
-        allowed_dtypes = (_dtype_or_dtype_tuple,)
-    template = (
-        "%(funcname)s() expected a value with dtype {dtype_str} "
-        "for argument '%(argname)s', but got %(actual)r instead."
-    ).format(dtype_str=' or '.join(repr(d.name) for d in allowed_dtypes))
-
-    def check_dtype(value):
-        return getattr(value, 'dtype', None) not in allowed_dtypes
-
-    def display_bad_value(value):
-        # If the bad value has a dtype, but it's wrong, show the dtype name.
-        try:
-            return value.dtype.name
-        except AttributeError:
-            return value
-
-    return make_check(
-        exc_type=TypeError,
-        template=template,
-        pred=check_dtype,
-        actual=display_bad_value,
-    )
-
-
 def expect_types(*_pos, **named):
     """
     Preprocessing decorator that verifies inputs have expected types.
@@ -223,6 +225,26 @@ def expect_types(*_pos, **named):
                 )
             )
 
+    def _expect_type(type_):
+        # Slightly different messages for type and tuple of types.
+        _template = (
+            "%(funcname)s() expected a value of type {type_or_types} "
+            "for argument '%(argname)s', but got %(actual)s instead."
+        )
+        if isinstance(type_, tuple):
+            template = _template.format(
+                type_or_types=' or '.join(map(_qualified_name, type_))
+            )
+        else:
+            template = _template.format(type_or_types=_qualified_name(type_))
+
+        return make_check(
+            TypeError,
+            template,
+            lambda v: not isinstance(v, type_),
+            compose(_qualified_name, type),
+        )
+
     return preprocess(**valmap(_expect_type, named))
 
 
@@ -273,30 +295,6 @@ def make_check(exc_type, template, pred, actual):
     return _check
 
 
-def _expect_type(type_):
-    """
-    Factory for type-checking functions that work the @preprocess decorator.
-    """
-    # Slightly different messages for type and tuple of types.
-    _template = (
-        "%(funcname)s() expected a value of type {type_or_types} "
-        "for argument '%(argname)s', but got %(actual)s instead."
-    )
-    if isinstance(type_, tuple):
-        template = _template.format(
-            type_or_types=' or '.join(map(_qualified_name, type_))
-        )
-    else:
-        template = _template.format(type_or_types=_qualified_name(type_))
-
-    return make_check(
-        TypeError,
-        template,
-        lambda v: not isinstance(v, type_),
-        compose(_qualified_name, type),
-    )
-
-
 def optional(type_):
     """
     Helper for use with `expect_types` when an input can be `type_` or `None`.
@@ -321,19 +319,6 @@ def optional(type_):
     return (type_, type(None))
 
 
-def _expect_element(collection):
-    template = (
-        "%(funcname)s() expected a value in {collection} "
-        "for argument '%(argname)s', but got %(actual)s instead."
-    ).format(collection=collection)
-    return make_check(
-        ValueError,
-        template,
-        complement(op.contains(collection)),
-        repr,
-    )
-
-
 def expect_element(*_pos, **named):
     """
     Preprocessing decorator that verifies inputs are elements of some
@@ -363,6 +348,17 @@ def expect_element(*_pos, **named):
     if _pos:
         raise TypeError("expect_element() only takes keyword arguments.")
 
+    def _expect_element(collection):
+        template = (
+            "%(funcname)s() expected a value in {collection} "
+            "for argument '%(argname)s', but got %(actual)s instead."
+        ).format(collection=collection)
+        return make_check(
+            ValueError,
+            template,
+            complement(op.contains(collection)),
+            repr,
+        )
     return preprocess(**valmap(_expect_element, named))
 
 

From d0625e8a8d58ef4a3584d22de63370b7475cbbbb Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Tue, 22 Mar 2016 20:54:45 -0400
Subject: [PATCH 04/18] MAINT: Move ignore_nanwarnings to numpy_utils.

---
 zipline/pipeline/factors/technical.py |  2 +-
 zipline/utils/control_flow.py         | 38 --------------------------
 zipline/utils/numpy_utils.py          | 39 +++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 39 deletions(-)

diff --git a/zipline/pipeline/factors/technical.py b/zipline/pipeline/factors/technical.py
index 3eb7eda2..f67471fc 100644
--- a/zipline/pipeline/factors/technical.py
+++ b/zipline/pipeline/factors/technical.py
@@ -23,7 +23,7 @@ from numexpr import evaluate
 
 from zipline.pipeline.data import USEquityPricing
 from zipline.pipeline.mixins import SingleInputMixin
-from zipline.utils.control_flow import ignore_nanwarnings
+from zipline.utils.numpy_utils import ignore_nanwarnings
 from zipline.utils.input_validation import expect_types
 from zipline.utils.math_utils import (
     nanargmax,
diff --git a/zipline/utils/control_flow.py b/zipline/utils/control_flow.py
index 24fe3fcb..253a71b3 100644
--- a/zipline/utils/control_flow.py
+++ b/zipline/utils/control_flow.py
@@ -2,10 +2,6 @@
 Control flow utilities.
 """
 from six import iteritems
-from warnings import (
-    catch_warnings,
-    filterwarnings,
-)
 
 
 class nullctx(object):
@@ -23,40 +19,6 @@ class nullctx(object):
         return False
 
 
-class WarningContext(object):
-    """
-    Re-entrant contextmanager for contextually managing warnings.
-    """
-    def __init__(self, *warning_specs):
-        self._warning_specs = warning_specs
-        self._catchers = []
-
-    def __enter__(self):
-        catcher = catch_warnings()
-        catcher.__enter__()
-        self._catchers.append(catcher)
-        for args, kwargs in self._warning_specs:
-            filterwarnings(*args, **kwargs)
-        return catcher
-
-    def __exit__(self, *exc_info):
-        catcher = self._catchers.pop()
-        return catcher.__exit__(*exc_info)
-
-
-def ignore_nanwarnings():
-    """
-    Helper for building a WarningContext that ignores warnings from numpy's
-    nanfunctions.
-    """
-    return WarningContext(
-        (
-            ('ignore',),
-            {'category': RuntimeWarning, 'module': 'numpy.lib.nanfunctions'},
-        )
-    )
-
-
 def invert(d):
     """
     Invert a dictionary into a dictionary of sets.
diff --git a/zipline/utils/numpy_utils.py b/zipline/utils/numpy_utils.py
index 79bca175..a873d7c0 100644
--- a/zipline/utils/numpy_utils.py
+++ b/zipline/utils/numpy_utils.py
@@ -2,6 +2,11 @@
 Utilities for working with numpy arrays.
 """
 from datetime import datetime
+from warnings import (
+    catch_warnings,
+    filterwarnings,
+)
+
 from numpy import (
     broadcast,
     busday_count,
@@ -219,3 +224,37 @@ def busday_count_mask_NaT(begindates,
     # Fill in entries where either comparison was NaT with nan in the output.
     out[beginmask | endmask] = nan
     return out
+
+
+class WarningContext(object):
+    """
+    Re-usable contextmanager for contextually managing warnings.
+    """
+    def __init__(self, *warning_specs):
+        self._warning_specs = warning_specs
+        self._catchers = []
+
+    def __enter__(self):
+        catcher = catch_warnings()
+        catcher.__enter__()
+        self._catchers.append(catcher)
+        for args, kwargs in self._warning_specs:
+            filterwarnings(*args, **kwargs)
+        return self
+
+    def __exit__(self, *exc_info):
+        catcher = self._catchers.pop()
+        return catcher.__exit__(*exc_info)
+
+
+def ignore_nanwarnings():
+    """
+    Helper for building a WarningContext that ignores warnings from numpy's
+    nanfunctions.
+    """
+    return WarningContext(
+        (
+            ('ignore',),
+            {'category': RuntimeWarning, 'module': 'numpy.lib.nanfunctions'},
+        )
+    )

From 16c5aecba6fcf54f4d20c68db308223501954033 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Tue, 22 Mar 2016 21:00:32 -0400
Subject: [PATCH 05/18] DEV: Add utility for permuting rows in an array.

Useful for testing rank-order functions on arrays.
---
 zipline/testing/__init__.py |  1 +
 zipline/testing/core.py     | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/zipline/testing/__init__.py b/zipline/testing/__init__.py
index 16bc73d7..34bf9db8 100644
--- a/zipline/testing/__init__.py
+++ b/zipline/testing/__init__.py
@@ -21,6 +21,7 @@ from .core import (  # noqa
     make_trade_panel_for_asset_info,
     num_days_in_range,
     parameter_space,
+    permute_rows,
     powerset,
     product_upper_triangle,
     seconds_to_timestamp,
diff --git a/zipline/testing/core.py b/zipline/testing/core.py
index 8bc05bc0..c759480b 100644
--- a/zipline/testing/core.py
+++ b/zipline/testing/core.py
@@ -31,6 +31,7 @@ from zipline.finance.order import ORDER_STATUS
 from zipline.pipeline.engine import SimplePipelineEngine
 from zipline.pipeline.loaders.testing import make_seeded_random_loader
 from zipline.utils import security_list
+from zipline.utils.input_validation import expect_dimensions
 from zipline.utils.tradingcalendar import trading_days
 
 
@@ -891,6 +892,22 @@ def parameter_space(**params):
     return decorator
 
 
+@expect_dimensions(array=2)
+def permute_rows(seed, array):
+    """
+    Shuffle each row in ``array`` based on permutations generated by ``seed``.
+
+    Parameters
+    ----------
+    seed : int
+        Seed for numpy.random.RandomState
+    array : np.ndarray[ndim=2]
+        Array over which to apply permutations.
+    """
+    rand = np.random.RandomState(seed)
+    return np.apply_along_axis(rand.permutation, 1, array)
+
+
 @nottest
 def make_test_handler(testcase, *args, **kwargs):
     """

From 872b84e09a1bb78ff1b7d53ad405188411b43283 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 21 Mar 2016 22:26:58 -0400
Subject: [PATCH 06/18] ENH: Implement Factor.quantiles.

---
 tests/pipeline/test_factor.py              | 188 +++++++++++++++++++--
 zipline/lib/quantiles.py                   |  17 ++
 zipline/pipeline/classifiers/__init__.py   |   9 +-
 zipline/pipeline/classifiers/classifier.py |  26 ++-
 zipline/pipeline/factors/factor.py         |  33 +++-
 zipline/testing/core.py                    |   9 +-
 6 files changed, 261 insertions(+), 21 deletions(-)
 create mode 100644 zipline/lib/quantiles.py

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index e7dab7cc..3a049bc1 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -1,10 +1,11 @@
 """
 Tests for Factor terms.
 """
+from functools import partial
 from itertools import product
-from six import iteritems
 from nose_parameterized import parameterized
 
+from toolz import compose
 from numpy import (
     apply_along_axis,
     arange,
@@ -13,9 +14,8 @@ from numpy import (
     empty,
     eye,
     nan,
-    nanmean,
-    nanstd,
     ones,
+    rot90,
     where,
 )
 from numpy.random import randn, seed
@@ -32,6 +32,7 @@ from zipline.testing import (
     check_allclose,
     check_arrays,
     parameter_space,
+    permute_rows,
 )
 from zipline.utils.functional import dzip_exact
 from zipline.utils.numpy_utils import (
@@ -40,6 +41,7 @@ from zipline.utils.numpy_utils import (
     int64_dtype,
     NaTns,
 )
+from zipline.utils.math_utils import nanmean, nanstd
 
 from .base import BasePipelineTestCase
 
@@ -50,6 +52,12 @@ class F(Factor):
     window_length = 0
 
 
+class OtherF(Factor):
+    dtype = float64_dtype
+    inputs = ()
+    window_length = 0
+
+
 class C(Classifier):
     dtype = int64_dtype
     missing_value = -1
@@ -498,7 +506,7 @@ class FactorTestCase(BasePipelineTestCase):
             mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
         )
 
-        for key, (res, exp) in iteritems(dzip_exact(results, expected)):
+        for key, (res, exp) in dzip_exact(results, expected).items():
             check_allclose(
                 res,
                 exp,
@@ -516,7 +524,7 @@ class FactorTestCase(BasePipelineTestCase):
             ('demean', lambda row: row - nanmean(row)),
             ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
         ],
-        add_nulls_to_factor=(False, True,)
+        add_nulls_to_factor=(False, True,),
     )
     def test_normalizations_randomized(self,
                                        seed_value,
@@ -532,9 +540,9 @@ class FactorTestCase(BasePipelineTestCase):
         # Falses on main diagonal.
         eyemask = self.eye_mask(shape=shape)
         # Falses on other diagonal.
-        eyemask_T = eyemask.T
+        eyemask90 = rot90(eyemask)
         # Falses on both diagonals.
-        xmask = eyemask & eyemask_T
+        xmask = eyemask & eyemask90
 
         # Block of random data.
         factor_data = self.randn_data(seed=seed_value, shape=shape)
@@ -548,7 +556,7 @@ class FactorTestCase(BasePipelineTestCase):
         # With -1s on main diagonal.
         classifier_data_eyenulls = where(eyemask, classifier_data, -1)
         # With -1s on opposite diagonal.
-        classifier_data_eyenulls_T = where(eyemask_T, classifier_data, -1)
+        classifier_data_eyenulls90 = where(eyemask90, classifier_data, -1)
         # With -1s on both diagonals.
         classifier_data_xnulls = where(xmask, classifier_data, -1)
 
@@ -581,8 +589,8 @@ class FactorTestCase(BasePipelineTestCase):
             # If the classifier has nulls, we should get NaNs in the
             # corresponding locations in the output.
             'grouped_with_nulls': where(
-                eyemask_T,
-                grouped_apply(factor_data, classifier_data_eyenulls_T, func),
+                eyemask90,
+                grouped_apply(factor_data, classifier_data_eyenulls90, func),
                 nan,
             ),
             # Passing a mask with a classifier should behave as though the
@@ -613,7 +621,7 @@ class FactorTestCase(BasePipelineTestCase):
             initial_workspace={
                 f: factor_data,
                 c: classifier_data,
-                c_with_nulls: classifier_data_eyenulls_T,
+                c_with_nulls: classifier_data_eyenulls90,
                 Mask(): eyemask,
             },
             mask=self.build_mask(nomask),
@@ -640,3 +648,161 @@ class FactorTestCase(BasePipelineTestCase):
         ).format(normalizer=method_name)
 
         self.assertEqual(errmsg, expected)
+
+    @parameter_space(seed=[1, 2, 3])
+    def test_quantiles_unmasked(self, seed):
+        permute = partial(permute_rows, seed)
+
+        shape = (6, 6)
+
+        # Shuffle the input rows to verify that we don't depend on the order.
+        # Div by 2 to ensure that we don't depend on inputs being integral.
+        factor_data = permute(arange(36, dtype=float).reshape(shape)) / 2.0
+
+        f = self.f
+        terms = {
+            '2': f.quantiles(bins=2),
+            '3': f.quantiles(bins=3),
+            '6': f.quantiles(bins=6),
+        }
+
+        # Apply the same shuffle we applied to the input rows to our
+        # expectations. Doing it this way makes it obvious that our
+        # expectation corresponds to our input, while still testing against
+        # a range of input orderings.
+        permuted_array = compose(permute, partial(array, dtype=int))
+        expected = {
+            # The values in the input are all increasing, so the first half of
+            # each row should be in the bottom bucket, and the second half
+            # should be in the top bucket.
+            '2': permuted_array([[0, 0, 0, 1, 1, 1],
+                                 [0, 0, 0, 1, 1, 1],
+                                 [0, 0, 0, 1, 1, 1],
+                                 [0, 0, 0, 1, 1, 1],
+                                 [0, 0, 0, 1, 1, 1],
+                                 [0, 0, 0, 1, 1, 1]]),
+            # Similar for three buckets.
+            '3': permuted_array([[0, 0, 1, 1, 2, 2],
+                                 [0, 0, 1, 1, 2, 2],
+                                 [0, 0, 1, 1, 2, 2],
+                                 [0, 0, 1, 1, 2, 2],
+                                 [0, 0, 1, 1, 2, 2],
+                                 [0, 0, 1, 1, 2, 2]]),
+            # In the limiting case, we just have every column different.
+            '6': permuted_array([[0, 1, 2, 3, 4, 5],
+                                 [0, 1, 2, 3, 4, 5],
+                                 [0, 1, 2, 3, 4, 5],
+                                 [0, 1, 2, 3, 4, 5],
+                                 [0, 1, 2, 3, 4, 5],
+                                 [0, 1, 2, 3, 4, 5]]),
+        }
+
+        graph = TermGraph(terms)
+        results = self.run_graph(
+            graph,
+            initial_workspace={
+                f: factor_data,
+            },
+            mask=self.build_mask(self.ones_mask(shape=shape)),
+        )
+
+        for key, (res, exp) in dzip_exact(results, expected).items():
+            check_arrays(res, exp)
+
+    @parameter_space(seed=[1, 2, 3])
+    def test_quantiles_masked(self, seed):
+        permute = partial(permute_rows, seed)
+
+        # 7 x 7 so that we divide evenly into 2/3/6-tiles after including the
+        # nan value in each row.
+        shape = (7, 7)
+
+        # Shuffle the input rows to verify that we don't depend on the order.
+        # Div by 2 to ensure that we don't depend on inputs being integral.
+        factor_data = permute(arange(49, dtype=float).reshape(shape)) / 2.0
+        factor_data_w_nans = where(
+            permute(rot90(self.eye_mask(shape=shape))),
+            factor_data,
+            nan,
+        )
+        mask_data = permute(self.eye_mask(shape=shape))
+
+        f = F()
+        f_nans = OtherF()
+        m = Mask()
+
+        terms = {
+            '2_masked': f.quantiles(bins=2, mask=m),
+            '3_masked': f.quantiles(bins=3, mask=m),
+            '6_masked': f.quantiles(bins=6, mask=m),
+            '2_nans': f_nans.quantiles(bins=2),
+            '3_nans': f_nans.quantiles(bins=3),
+            '6_nans': f_nans.quantiles(bins=6),
+        }
+
+        # Apply the same shuffle we applied to the input rows to our
+        # expectations. Doing it this way makes it obvious that our
+        # expectation corresponds to our input, while still testing against
+        # a range of input orderings.
+        permuted_array = compose(permute, partial(array, dtype=int))
+        expected = {
+            # Expected results here are the same as in test_quantiles_unmasked,
+            # except with diagonals of -1s interpolated to match the effects of
+            # masking and/or input nans.
+            '2_masked': permuted_array([[-1, 0,  0,  0,  1,  1,  1],
+                                        [0, -1,  0,  0,  1,  1,  1],
+                                        [0,  0, -1,  0,  1,  1,  1],
+                                        [0,  0,  0, -1,  1,  1,  1],
+                                        [0,  0,  0,  1, -1,  1,  1],
+                                        [0,  0,  0,  1,  1, -1,  1],
+                                        [0,  0,  0,  1,  1,  1, -1]]),
+            '3_masked': permuted_array([[-1, 0,  0,  1,  1,  2,  2],
+                                        [0, -1,  0,  1,  1,  2,  2],
+                                        [0,  0, -1,  1,  1,  2,  2],
+                                        [0,  0,  1, -1,  1,  2,  2],
+                                        [0,  0,  1,  1, -1,  2,  2],
+                                        [0,  0,  1,  1,  2, -1,  2],
+                                        [0,  0,  1,  1,  2,  2, -1]]),
+            '6_masked': permuted_array([[-1, 0,  1,  2,  3,  4,  5],
+                                        [0, -1,  1,  2,  3,  4,  5],
+                                        [0,  1, -1,  2,  3,  4,  5],
+                                        [0,  1,  2, -1,  3,  4,  5],
+                                        [0,  1,  2,  3, -1,  4,  5],
+                                        [0,  1,  2,  3,  4, -1,  5],
+                                        [0,  1,  2,  3,  4,  5, -1]]),
+            '2_nans': permuted_array([[0,  0,  0,  1,  1,  1, -1],
+                                      [0,  0,  0,  1,  1, -1,  1],
+                                      [0,  0,  0,  1, -1,  1,  1],
+                                      [0,  0,  0, -1,  1,  1,  1],
+                                      [0,  0, -1,  0,  1,  1,  1],
+                                      [0, -1,  0,  0,  1,  1,  1],
+                                      [-1, 0,  0,  0,  1,  1,  1]]),
+            '3_nans': permuted_array([[0,  0,  1,  1,  2,  2, -1],
+                                      [0,  0,  1,  1,  2, -1,  2],
+                                      [0,  0,  1,  1, -1,  2,  2],
+                                      [0,  0,  1, -1,  1,  2,  2],
+                                      [0,  0, -1,  1,  1,  2,  2],
+                                      [0, -1,  0,  1,  1,  2,  2],
+                                      [-1, 0,  0,  1,  1,  2,  2]]),
+            '6_nans': permuted_array([[0,  1,  2,  3,  4,  5, -1],
+                                      [0,  1,  2,  3,  4, -1,  5],
+                                      [0,  1,  2,  3, -1,  4,  5],
+                                      [0,  1,  2, -1,  3,  4,  5],
+                                      [0,  1, -1,  2,  3,  4,  5],
+                                      [0, -1,  1,  2,  3,  4,  5],
+                                      [-1, 0,  1,  2,  3,  4,  5]]),
+        }
+
+        graph = TermGraph(terms)
+        results = self.run_graph(
+            graph,
+            initial_workspace={
+                f: factor_data,
+                f_nans: factor_data_w_nans,
+                m: mask_data,
+            },
+            mask=self.build_mask(self.ones_mask(shape=shape)),
+        )
+
+        for key, (res, exp) in dzip_exact(results, expected).items():
+            check_arrays(res, exp)
diff --git a/zipline/lib/quantiles.py b/zipline/lib/quantiles.py
new file mode 100644
index 00000000..470153d0
--- /dev/null
+++ b/zipline/lib/quantiles.py
@@ -0,0 +1,17 @@
+"""
+Algorithms for computing quantiles on numpy arrays.
+"""
+from numpy.lib import apply_along_axis
+from pandas import qcut
+
+
+def quantiles(data, nbins_or_partition_bounds):
+    """
+    Compute rowwise array quantiles on an input.
+    """
+    return apply_along_axis(
+        qcut,
+        1,
+        data,
+        q=nbins_or_partition_bounds, labels=False,
+    )
diff --git a/zipline/pipeline/classifiers/__init__.py b/zipline/pipeline/classifiers/__init__.py
index b512100f..cc2e242b 100644
--- a/zipline/pipeline/classifiers/__init__.py
+++ b/zipline/pipeline/classifiers/__init__.py
@@ -1,8 +1,15 @@
-from .classifier import Classifier, CustomClassifier, Everything, Latest
+from .classifier import (
+    Classifier,
+    CustomClassifier,
+    Quantiles,
+    Everything,
+    Latest,
+)
 
 __all__ = [
     'Classifier',
     'CustomClassifier',
     'Everything',
     'Latest',
+    'Quantiles',
 ]
diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py
index 6777e03e..0d6db642 100644
--- a/zipline/pipeline/classifiers/classifier.py
+++ b/zipline/pipeline/classifiers/classifier.py
@@ -1,8 +1,9 @@
 """
 classifier.py
 """
-from numpy import zeros, where
+from numpy import where, isnan, nan, zeros
 
+from zipline.lib.quantiles import quantiles
 from zipline.pipeline.term import ComputableTerm
 from zipline.utils.numpy_utils import int64_dtype
 
@@ -10,7 +11,8 @@ from ..mixins import (
     CustomTermMixin,
     LatestMixin,
     PositiveWindowLengthMixin,
-    RestrictedDTypeMixin
+    RestrictedDTypeMixin,
+    SingleInputMixin,
 )
 
 
@@ -44,6 +46,26 @@ class Everything(Classifier):
         )
 
 
+class Quantiles(SingleInputMixin, Classifier):
+    """
+    A classifier computing quantiles over an input.
+    """
+    params = ('bins',)
+    dtype = int64_dtype
+    window_length = 0
+    missing_value = -1
+
+    def _compute(self, arrays, dates, assets, mask):
+        data = arrays[0]
+        bins = self.params['bins']
+        to_bin = where(mask, data, nan)
+        result = quantiles(to_bin, bins)
+        # Write self.missing_value into nan locations, whether they were
+        # generated by our input mask or not.
+        result[isnan(result)] = self.missing_value
+        return result.astype(int64_dtype)
+
+
 class CustomClassifier(PositiveWindowLengthMixin, CustomTermMixin, Classifier):
     """
     Base class for user-defined Classifiers.
diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
index b9054e5d..774ed7e4 100644
--- a/zipline/pipeline/factors/factor.py
+++ b/zipline/pipeline/factors/factor.py
@@ -5,13 +5,13 @@ from functools import wraps
 from operator import attrgetter
 from numbers import Number
 
-from numpy import inf, where, nanstd
+from numpy import inf, where
 from toolz import curry
 
 from zipline.errors import UnknownRankMethod
 from zipline.lib.normalize import naive_grouped_rowwise_apply
 from zipline.lib.rank import masked_rankdata_2d
-from zipline.pipeline.classifiers import Classifier, Everything
+from zipline.pipeline.classifiers import Classifier, Everything, Quantiles
 from zipline.pipeline.mixins import (
     CustomTermMixin,
     LatestMixin,
@@ -43,7 +43,7 @@ from zipline.pipeline.filters import (
     NullFilter,
 )
 from zipline.utils.input_validation import expect_types
-from zipline.utils.math_utils import nanmean
+from zipline.utils.math_utils import nanmean, nanstd
 from zipline.utils.numpy_utils import (
     bool_dtype,
     coerce_to_dtype,
@@ -685,6 +685,33 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
         """
         return Rank(self, method=method, ascending=ascending, mask=mask)
 
+    @expect_types(bins=int, mask=(Filter, NotSpecifiedType))
+    def quantiles(self, bins, mask=NotSpecified):
+        """
+        Construct a Classifier computing quantiles of the output of ``self``.
+
+        Every non-NaN data point in the output is labelled with an integer
+        value from 0 to (bins - 1).  NaNs are labelled with -1.
+
+        If ``mask`` is supplied, ignore data points in locations for which
+        ``mask`` produces False, and emit a label of -1 at those locations.
+
+        Parameters
+        ----------
+        bins : int
+            Number of bin labels to compute.
+        mask : zipline.pipeline.Filter, optional
+            Mask of values to ignore when computing quantiles.
+
+        Returns
+        -------
+        quantiles : zipline.pipeline.classifiers.Quantiles
+            A Classifier producing integer labels ranging from 0 to (bins - 1).
+        """
+        if mask is NotSpecified:
+            mask = self.mask
+        return Quantiles(inputs=(self,), bins=bins, mask=mask)
+
     def top(self, N, mask=NotSpecified):
         """
         Construct a Filter matching the top N asset values of self each day.
diff --git a/zipline/testing/core.py b/zipline/testing/core.py
index c759480b..3d96c4ec 100644
--- a/zipline/testing/core.py
+++ b/zipline/testing/core.py
@@ -409,7 +409,7 @@ def make_trade_panel_for_asset_info(dates,
                                     volume_step_by_date,
                                     volume_step_by_sid):
     """
     Convert an asset info frame into a panel of trades, writing NaNs for
     locations where assets did not exist.
     """
     sids = list(asset_info.index)
@@ -579,7 +579,7 @@ def check_allclose(actual,
     )
 
 
-def check_arrays(x, y, err_msg='', verbose=True):
+def check_arrays(x, y, err_msg='', verbose=True, check_dtypes=True):
     """
     Wrapper around np.testing.assert_array_equal that also verifies that inputs
     are ndarrays.
@@ -588,8 +588,9 @@ def check_arrays(x, y, err_msg='', verbose=True):
     --------
     np.assert_array_equal
     """
-    if type(x) != type(y):
-        raise AssertionError("%s != %s" % (type(x), type(y)))
+    assert type(x) == type(y), "{x} != {y}".format(x=type(x), y=type(y))
+    assert x.dtype == y.dtype, "{x.dtype} != {y.dtype}".format(x=x, y=y)
+
     return assert_array_equal(x, y, err_msg=err_msg, verbose=True)
 
 

From 5ed1a4fcd1e98c413c916b51b544a7b4c3b3a6a9 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Tue, 22 Mar 2016 22:08:50 -0400
Subject: [PATCH 07/18] ENH: Add quartiles/quintiles/deciles.

They're all syntactic sugar for the equivalent invocations of quantiles.
---
 tests/pipeline/test_factor.py      | 16 +++++++
 zipline/pipeline/factors/factor.py | 72 ++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index 3a049bc1..3a7d4819 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -806,3 +806,19 @@ class FactorTestCase(BasePipelineTestCase):
 
         for key, (res, exp) in dzip_exact(results, expected).items():
             check_arrays(res, exp)
+
+    def test_quantile_helpers(self):
+        f = self.f
+        m = Mask()
+
+        self.assertIs(f.quartiles(), f.quantiles(bins=4))
+        self.assertIs(f.quartiles(mask=m), f.quantiles(bins=4, mask=m))
+        self.assertIsNot(f.quartiles(), f.quartiles(mask=m))
+
+        self.assertIs(f.quintiles(), f.quantiles(bins=5))
+        self.assertIs(f.quintiles(mask=m), f.quantiles(bins=5, mask=m))
+        self.assertIsNot(f.quintiles(), f.quintiles(mask=m))
+
+        self.assertIs(f.deciles(), f.quantiles(bins=10))
+        self.assertIs(f.deciles(mask=m), f.quantiles(bins=10, mask=m))
+        self.assertIsNot(f.deciles(), f.deciles(mask=m))
diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
index 774ed7e4..27e0e476 100644
--- a/zipline/pipeline/factors/factor.py
+++ b/zipline/pipeline/factors/factor.py
@@ -712,6 +712,78 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
             mask = self.mask
         return Quantiles(inputs=(self,), bins=bins, mask=mask)
 
+    @expect_types(mask=(Filter, NotSpecifiedType))
+    def quartiles(self, mask=NotSpecified):
+        """
+        Construct a Classifier computing quartiles over the output of ``self``.
+
+        Every non-NaN data point in the output is labelled with a value of
+        either 0, 1, 2, or 3, corresponding to the first, second, third, or
+        fourth quartile over each row.  NaN data points are labelled with -1.
+
+        If ``mask`` is supplied, ignore data points in locations for which
+        ``mask`` produces False, and emit a label of -1 at those locations.
+
+        Parameters
+        ----------
+        mask : zipline.pipeline.Filter, optional
+            Mask of values to ignore when computing quartiles.
+
+        Returns
+        -------
+        quartiles : zipline.pipeline.classifiers.Quantiles
+            A Classifier producing integer labels ranging from 0 to 3.
+        """
+        return self.quantiles(bins=4, mask=mask)
+
+    @expect_types(mask=(Filter, NotSpecifiedType))
+    def quintiles(self, mask=NotSpecified):
+        """
+        Construct a Classifier computing quintile labels on ``self``.
+
+        Every non-NaN data point in the output is labelled with a value of
+        either 0, 1, 2, 3, or 4, corresponding to quintiles over each row.
+        NaN data points are labelled with -1.
+
+        If ``mask`` is supplied, ignore data points in locations for which
+        ``mask`` produces False, and emit a label of -1 at those locations.
+
+        Parameters
+        ----------
+        mask : zipline.pipeline.Filter, optional
+            Mask of values to ignore when computing quintiles.
+
+        Returns
+        -------
+        quintiles : zipline.pipeline.classifiers.Quantiles
+            A Classifier producing integer labels ranging from 0 to 4.
+        """
+        return self.quantiles(bins=5, mask=mask)
+
+    @expect_types(mask=(Filter, NotSpecifiedType))
+    def deciles(self, mask=NotSpecified):
+        """
+        Construct a Classifier computing decile labels on ``self``.
+
+        Every non-NaN data point in the output is labelled with a value from 0
+        to 9 corresponding to deciles over each row.  NaN data points are
+        labelled with -1.
+
+        If ``mask`` is supplied, ignore data points in locations for which
+        ``mask`` produces False, and emit a label of -1 at those locations.
+
+        Parameters
+        ----------
+        mask : zipline.pipeline.Filter, optional
+            Mask of values to ignore when computing deciles.
+
+        Returns
+        -------
+        deciles : zipline.pipeline.classifiers.Quantiles
+            A Classifier producing integer labels ranging from 0 to 4.
+        """
+        return self.quantiles(bins=10, mask=mask)
+
     def top(self, N, mask=NotSpecified):
         """
         Construct a Filter matching the top N asset values of self each day.

From 758d6c74fc86c99c4d3b63713f39db66df745bfa Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 23 Mar 2016 12:04:58 -0400
Subject: [PATCH 08/18] ENH: Add isnull and notnull for classifiers.

---
 tests/pipeline/test_classifier.py          | 43 ++++++++++++++++++++++
 zipline/pipeline/classifiers/classifier.py | 13 +++++++
 2 files changed, 56 insertions(+)
 create mode 100644 tests/pipeline/test_classifier.py

diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py
new file mode 100644
index 00000000..6e36ce30
--- /dev/null
+++ b/tests/pipeline/test_classifier.py
@@ -0,0 +1,43 @@
+import numpy as np
+
+from zipline.pipeline import Classifier, TermGraph
+from zipline.testing import check_arrays, parameter_space
+from zipline.utils.numpy_utils import int64_dtype
+
+from .base import BasePipelineTestCase
+
+
+class ClassifierTestCase(BasePipelineTestCase):
+
+    @parameter_space(mv=[-1, 0, 1, 999])
+    def test_isnull(self, mv):
+
+        class C(Classifier):
+            dtype = int64_dtype
+            missing_value = mv
+            inputs = ()
+            window_length = 0
+
+        # There's no significance to the values here other than that they
+        # contain a mix of missing and non-missing values.
+        data = np.array([[-1,  1,  0, 2],
+                         [3,   0,  1, 0],
+                         [-5,  0, -1, 0],
+                         [-3,  1,  2, 2]], dtype=int)
+
+        c = C()
+        graph = TermGraph(
+            {
+                'isnull': c.isnull(),
+                'notnull': c.notnull()
+            }
+        )
+
+        results = self.run_graph(
+            graph,
+            initial_workspace={c: data},
+            mask=self.build_mask(self.ones_mask(shape=data.shape)),
+        )
+
+        check_arrays(results['isnull'], (data == mv))
+        check_arrays(results['notnull'], (data != mv))
diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py
index 0d6db642..7a584eaa 100644
--- a/zipline/pipeline/classifiers/classifier.py
+++ b/zipline/pipeline/classifiers/classifier.py
@@ -7,6 +7,7 @@ from zipline.lib.quantiles import quantiles
 from zipline.pipeline.term import ComputableTerm
 from zipline.utils.numpy_utils import int64_dtype
 
+from ..filters import NullFilter
 from ..mixins import (
     CustomTermMixin,
     LatestMixin,
@@ -28,6 +29,18 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
     """
     ALLOWED_DTYPES = (int64_dtype,)  # Used by RestrictedDTypeMixin
 
+    def isnull(self):
+        """
+        A Filter producing True for values where this term has missing data.
+        """
+        return NullFilter(self)
+
+    def notnull(self):
+        """
+        A Filter producing True for values where this term has complete data.
+        """
+        return ~self.isnull()
+
 
 class Everything(Classifier):
     """

From 18bd7010b58c768f02b39cf1b7a5b5a43f0a9ce6 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 23 Mar 2016 16:00:52 -0400
Subject: [PATCH 09/18] ENH: Improve short_reprs of classifier/normalizer.

GroupedRowTransform now shows the name of its transform, and Quantiles
shows the number of quantiles.

These are used by Pipeline.show_graph().
---
 tests/pipeline/test_factor.py              | 15 +++++++++++++++
 zipline/pipeline/classifiers/classifier.py |  3 +++
 zipline/pipeline/factors/factor.py         | 21 +++++++++++++++++++--
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index 3a7d4819..a884049b 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -4,6 +4,7 @@ Tests for Factor terms.
 from functools import partial
 from itertools import product
 from nose_parameterized import parameterized
+from unittest import TestCase
 
 from toolz import compose
 from numpy import (
@@ -822,3 +823,17 @@ class FactorTestCase(BasePipelineTestCase):
         self.assertIs(f.deciles(), f.quantiles(bins=10))
         self.assertIs(f.deciles(mask=m), f.quantiles(bins=10, mask=m))
         self.assertIsNot(f.deciles(), f.deciles(mask=m))
+
+
+class ShortReprTestCase(TestCase):
+    """
+    Tests for short_repr methods of Factors.
+    """
+
+    def test_demean(self):
+        r = F().demean().short_repr()
+        self.assertEqual(r, "GroupedRowTransform('demean')")
+
+    def test_zscore(self):
+        r = F().zscore().short_repr()
+        self.assertEqual(r, "GroupedRowTransform('zscore')")
diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py
index 7a584eaa..fcd2accd 100644
--- a/zipline/pipeline/classifiers/classifier.py
+++ b/zipline/pipeline/classifiers/classifier.py
@@ -78,6 +78,9 @@ class Quantiles(SingleInputMixin, Classifier):
         result[isnan(result)] = self.missing_value
         return result.astype(int64_dtype)
 
+    def short_repr(self):
+        return type(self).__name__ + '(%d)' % self.params['bins']
+
 
 class CustomClassifier(PositiveWindowLengthMixin, CustomTermMixin, Classifier):
     """
diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
index 27e0e476..3310523b 100644
--- a/zipline/pipeline/factors/factor.py
+++ b/zipline/pipeline/factors/factor.py
@@ -576,8 +576,13 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
         --------
         :meth:`pandas.DataFrame.groupby`
         """
+        # This is a named function so that it has a __name__ for use in the
+        # graph repr of GroupedRowTransform.
+        def demean(row):
+            return row - nanmean(row)
+
         return GroupedRowTransform(
-            transform=lambda row: row - nanmean(row),
+            transform=demean,
             factor=self,
             mask=mask,
             groupby=groupby,
@@ -637,8 +642,13 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
         --------
         :meth:`pandas.DataFrame.groupby`
         """
+        # This is a named function so that it has a __name__ for use in the
+        # graph repr of GroupedRowTransform.
+        def zscore(row):
+            return (row - nanmean(row)) / nanstd(row)
+
         return GroupedRowTransform(
-            transform=lambda row: (row - nanmean(row)) / nanstd(row),
+            transform=zscore,
             factor=self,
             mask=mask,
             groupby=groupby,
@@ -1022,6 +1032,13 @@ class GroupedRowTransform(Factor):
             self.missing_value,
         )
 
+    @property
+    def transform_name(self):
+        return self._transform.__name__
+
+    def short_repr(self):
+        return type(self).__name__ + '(%r)' % self.transform_name
+
 
 class Rank(SingleInputMixin, Factor):
     """

From 39507efed2d7ddedda3197a0ed3e3e8b565e3b66 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 23 Mar 2016 16:18:19 -0400
Subject: [PATCH 10/18] DOC: Add whatsnew for quantiles.

---
 docs/source/whatsnew/0.8.5.txt | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/source/whatsnew/0.8.5.txt b/docs/source/whatsnew/0.8.5.txt
index 6fd25cbd..49bf161c 100644
--- a/docs/source/whatsnew/0.8.5.txt
+++ b/docs/source/whatsnew/0.8.5.txt
@@ -32,13 +32,20 @@ Enhancements
   factors use the new ``CashBuybackAuthorizations`` and
   ``ShareBuybackAuthorizations`` datasets, respectively. (:issue:`1022`).
 
+* Implemented :class:`zipline.pipeline.Classifier`, a new core pipeline API
+  term representing grouping keys.  Classifiers are primarily used by passing
+  them as the ``groupby`` parameter to factor normalization methods.
+
 * Added factor normalization methods:
   :meth:`zipline.pipeline.Factor.demean` and
   :meth:`zipline.pipeline.Factor.zscore`. (:issue:`1046`)
 
-* Implemented :class:`zipline.pipeline.Classifier`, a new core pipeline API
-  term representing grouping keys.  Classifiers are primarily used by passing
-  them as the ``groupby`` parameter to factor normalization methods.
+* Added :meth:`zipline.pipeline.Factor.quantiles`, a method for computing a
+  Classifier from a Factor by partitioning into equally-sized buckets. Also
+  added helpers for common quantile sizes
+  (:meth:`zipline.pipeline.Factor.quartiles`,
+  :meth:`zipline.pipeline.Factor.quintiles`, and
+  :meth:`zipline.pipeline.Factor.deciles`) (:issue:`1075`).
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~

From 9e0ad5de15e0849cdad4a7b12004d0e5855ae519 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Thu, 24 Mar 2016 11:54:19 -0400
Subject: [PATCH 11/18] DOC: Fix bincount in docstring.

---
 zipline/pipeline/factors/factor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
index 3310523b..4edf66d4 100644
--- a/zipline/pipeline/factors/factor.py
+++ b/zipline/pipeline/factors/factor.py
@@ -790,7 +790,7 @@ class Factor(RestrictedDTypeMixin, ComputableTerm):
         Returns
         -------
         deciles : zipline.pipeline.classifiers.Quantiles
-            A Classifier producing integer labels ranging from 0 to 4.
+            A Classifier producing integer labels ranging from 0 to 9.
         """
         return self.quantiles(bins=10, mask=mask)
 

From a932628627200fda63497494058f0743015100d0 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Thu, 24 Mar 2016 15:00:04 -0400
Subject: [PATCH 12/18] BUG: Use six viewkeys.

---
 zipline/utils/functional.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/zipline/utils/functional.py b/zipline/utils/functional.py
index 80ac1878..46985c37 100644
--- a/zipline/utils/functional.py
+++ b/zipline/utils/functional.py
@@ -1,7 +1,8 @@
-from operator import methodcaller
-from six.moves import map
 from pprint import pformat
 
+from six import viewkeys
+from six.moves import map
+
 
 def mapall(funcs, seq):
     """
@@ -76,7 +77,7 @@ def dzip_exact(*dicts):
     >>> dzip_exact({'a': 1, 'b': 2}, {'a': 3, 'b': 4})
     {'a': (1, 3), 'b': (2, 4)}
     """
-    if not same(*map(methodcaller('viewkeys'), dicts)):
+    if not same(*map(viewkeys, dicts)):
         raise ValueError(
             "dict keys not all equal:\n\n%s" % _format_unequal_keys(dicts)
         )

From c6e58af51b6608cd640423671370340ec11d35d6 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Fri, 25 Mar 2016 15:24:26 -0400
Subject: [PATCH 13/18] TEST: Test quantiles with better input.

Take the log of arange so that we know we don't depend on linearity of
the input.
---
 tests/pipeline/test_factor.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index a884049b..2b9783aa 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -14,6 +14,7 @@ from numpy import (
     datetime64,
     empty,
     eye,
+    log1p,
     nan,
     ones,
     rot90,
@@ -657,8 +658,9 @@ class FactorTestCase(BasePipelineTestCase):
         shape = (6, 6)
 
         # Shuffle the input rows to verify that we don't depend on the order.
-        # Div by 2 to ensure that we don't depend on inputs being integral.
-        factor_data = permute(arange(36, dtype=float).reshape(shape)) / 2.0
+        # Take the log to ensure that we don't depend on linear scaling or
+        # integrality of inputs
+        factor_data = permute(log1p(arange(36, dtype=float).reshape(shape)))
 
         f = self.f
         terms = {
@@ -719,8 +721,9 @@ class FactorTestCase(BasePipelineTestCase):
         shape = (7, 7)
 
         # Shuffle the input rows to verify that we don't depend on the order.
-        # Div by 2 to ensure that we don't depend on inputs being integral.
-        factor_data = permute(arange(49, dtype=float).reshape(shape)) / 2.0
+        # Take the log to ensure that we don't depend on linear scaling or
+        # integrality of inputs
+        factor_data = permute(log1p(arange(49, dtype=float).reshape(shape)))
         factor_data_w_nans = where(
             permute(rot90(self.eye_mask(shape=shape))),
             factor_data,

From 92feaa3a7d6b456e7b6310ef594690ccb75388c1 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 28 Mar 2016 10:21:57 -0400
Subject: [PATCH 14/18] BUG/TEST: Dict key order isn't guaranteed.

---
 zipline/utils/functional.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/zipline/utils/functional.py b/zipline/utils/functional.py
index 46985c37..05a1d632 100644
--- a/zipline/utils/functional.py
+++ b/zipline/utils/functional.py
@@ -74,8 +74,9 @@ def dzip_exact(*dicts):
 
     Example
     -------
-    >>> dzip_exact({'a': 1, 'b': 2}, {'a': 3, 'b': 4})
-    {'a': (1, 3), 'b': (2, 4)}
+    >>> result = dzip_exact({'a': 1, 'b': 2}, {'a': 3, 'b': 4})
+    >>> result == {'a': (1, 3), 'b': (2, 4)}
+    True
     """
     if not same(*map(viewkeys, dicts)):
         raise ValueError(

From fe22bde99873458dcbbe6e08e4ff1a9233e12c76 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 28 Mar 2016 11:34:58 -0400
Subject: [PATCH 15/18] TEST: Test uneven buckets in quantiles.

---
 tests/pipeline/test_factor.py | 41 +++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index 2b9783aa..de30db62 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -811,6 +811,47 @@ class FactorTestCase(BasePipelineTestCase):
         for key, (res, exp) in dzip_exact(results, expected).items():
             check_arrays(res, exp)
 
+    def test_quantiles_uneven_buckets(self):
+        permute = partial(permute_rows, 5)
+        shape = (5, 5)
+
+        factor_data = permute(log1p(arange(25, dtype=float).reshape(shape)))
+        mask_data = permute(self.eye_mask(shape=shape))
+
+        f = F()
+        m = Mask()
+
+        terms = {
+            '3_masked': f.quantiles(bins=3, mask=m),
+            '7_masked': f.quantiles(bins=20, mask=m),
+        }
+
+        expected = {
+            '3_masked': [[-1, 0,  0,  1,  2],
+                         [0, -1,  0,  1,  2],
+                         [0,  0, -1,  1,  2],
+                         [0,  0,  1, -1,  2],
+                         [0,  0,  1,  2, -1]],
+            '7_masked': [[-1, 0,  2,  4,  6],
+                         [0, -1,  2,  4,  6],
+                         [0,  2, -1,  4,  6],
+                         [0,  2,  4, -1,  6],
+                         [0,  2,  4,  6, -1]],
+        }
+
+        graph = TermGraph(terms)
+        results = self.run_graph(
+            graph,
+            initial_workspace={
+                f: factor_data,
+                m: mask_data,
+            },
+            mask=self.build_mask(self.ones_mask(shape=shape)),
+        )
+
+        for key, (res, exp) in dzip_exact(results, expected).items():
+            check_arrays(res, exp)
+
     def test_quantile_helpers(self):
         f = self.f
         m = Mask()

From 076868f5a11cc751be178e87604b8848c059ed88 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 28 Mar 2016 11:56:15 -0400
Subject: [PATCH 16/18] MAINT: Refactor shared code into test method.

---
 tests/pipeline/base.py        |  14 +-
 tests/pipeline/test_factor.py | 238 ++++++++++++++++------------------
 2 files changed, 122 insertions(+), 130 deletions(-)

diff --git a/tests/pipeline/base.py b/tests/pipeline/base.py
index 414091c0..e8261ccb 100644
--- a/tests/pipeline/base.py
+++ b/tests/pipeline/base.py
@@ -13,10 +13,11 @@ from pandas import date_range, Int64Index, DataFrame
 from pandas.util.testing import assert_series_equal
 from six import iteritems
 
-from zipline.pipeline import Pipeline
+from zipline.pipeline import Pipeline, TermGraph
 from zipline.pipeline.engine import SimplePipelineEngine
 from zipline.pipeline.term import AssetExists
 from zipline.testing import (
+    check_arrays,
     ExplodingObject,
     gen_calendars,
     make_simple_equity_info,
@@ -24,6 +25,7 @@ from zipline.testing import (
     tmp_asset_finder,
 )
 
+from zipline.utils.functional import dzip_exact
 from zipline.utils.numpy_utils import (
     NaTD,
     make_datetime64D
@@ -125,6 +127,16 @@ class BasePipelineTestCase(TestCase):
             initial_workspace,
         )
 
+    def check_terms(self, terms, expected, initial_workspace, mask):
+        """
+        Compile the given terms into a TermGraph, compute it with
+        initial_workspace, and compare the results with ``expected``.
+        """
+        graph = TermGraph(terms)
+        results = self.run_graph(graph, initial_workspace, mask)
+        for key, (res, exp) in dzip_exact(results, expected).items():
+            check_arrays(res, exp)
+
     def build_mask(self, array):
         """
         Helper for constructing an AssetExists mask from a boolean-coercible
diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index de30db62..b39de5cc 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -617,9 +617,9 @@ class FactorTestCase(BasePipelineTestCase):
             )
         }
 
-        graph = TermGraph(terms)
-        results = self.run_graph(
-            graph,
+        self.check_terms(
+            terms=terms,
+            expected=expected,
             initial_workspace={
                 f: factor_data,
                 c: classifier_data,
@@ -629,9 +629,6 @@ class FactorTestCase(BasePipelineTestCase):
             mask=self.build_mask(nomask),
         )
 
-        for key in expected:
-            check_arrays(expected[key], results[key])
-
     @parameter_space(method_name=['demean', 'zscore'])
     def test_cant_normalize_non_float(self, method_name):
         class DateFactor(Factor):
@@ -663,55 +660,49 @@ class FactorTestCase(BasePipelineTestCase):
         factor_data = permute(log1p(arange(36, dtype=float).reshape(shape)))
 
         f = self.f
-        terms = {
-            '2': f.quantiles(bins=2),
-            '3': f.quantiles(bins=3),
-            '6': f.quantiles(bins=6),
-        }
 
         # Apply the same shuffle we applied to the input rows to our
         # expectations. Doing it this way makes it obvious that our
         # expectation corresponds to our input, while still testing against
         # a range of input orderings.
         permuted_array = compose(permute, partial(array, dtype=int))
-        expected = {
-            # The values in the input are all increasing, so the first half of
-            # each row should be in the bottom bucket, and the second half
-            # should be in the top bucket.
-            '2': permuted_array([[0, 0, 0, 1, 1, 1],
-                                 [0, 0, 0, 1, 1, 1],
-                                 [0, 0, 0, 1, 1, 1],
-                                 [0, 0, 0, 1, 1, 1],
-                                 [0, 0, 0, 1, 1, 1],
-                                 [0, 0, 0, 1, 1, 1]]),
-            # Similar for three buckets.
-            '3': permuted_array([[0, 0, 1, 1, 2, 2],
-                                 [0, 0, 1, 1, 2, 2],
-                                 [0, 0, 1, 1, 2, 2],
-                                 [0, 0, 1, 1, 2, 2],
-                                 [0, 0, 1, 1, 2, 2],
-                                 [0, 0, 1, 1, 2, 2]]),
-            # In the limiting case, we just have every column different.
-            '6': permuted_array([[0, 1, 2, 3, 4, 5],
-                                 [0, 1, 2, 3, 4, 5],
-                                 [0, 1, 2, 3, 4, 5],
-                                 [0, 1, 2, 3, 4, 5],
-                                 [0, 1, 2, 3, 4, 5],
-                                 [0, 1, 2, 3, 4, 5]]),
-        }
-
-        graph = TermGraph(terms)
-        results = self.run_graph(
-            graph,
+        self.check_terms(
+            terms={
+                '2': f.quantiles(bins=2),
+                '3': f.quantiles(bins=3),
+                '6': f.quantiles(bins=6),
+            },
             initial_workspace={
                 f: factor_data,
             },
+            expected={
+                # The values in the input are all increasing, so the first half
+                # of each row should be in the bottom bucket, and the second
+                # half should be in the top bucket.
+                '2': permuted_array([[0, 0, 0, 1, 1, 1],
+                                     [0, 0, 0, 1, 1, 1],
+                                     [0, 0, 0, 1, 1, 1],
+                                     [0, 0, 0, 1, 1, 1],
+                                     [0, 0, 0, 1, 1, 1],
+                                     [0, 0, 0, 1, 1, 1]]),
+                # Similar for three buckets.
+                '3': permuted_array([[0, 0, 1, 1, 2, 2],
+                                     [0, 0, 1, 1, 2, 2],
+                                     [0, 0, 1, 1, 2, 2],
+                                     [0, 0, 1, 1, 2, 2],
+                                     [0, 0, 1, 1, 2, 2],
+                                     [0, 0, 1, 1, 2, 2]]),
+                # In the limiting case, we just have every column different.
+                '6': permuted_array([[0, 1, 2, 3, 4, 5],
+                                     [0, 1, 2, 3, 4, 5],
+                                     [0, 1, 2, 3, 4, 5],
+                                     [0, 1, 2, 3, 4, 5],
+                                     [0, 1, 2, 3, 4, 5],
+                                     [0, 1, 2, 3, 4, 5]]),
+            },
             mask=self.build_mask(self.ones_mask(shape=shape)),
         )
 
-        for key, (res, exp) in dzip_exact(results, expected).items():
-            check_arrays(res, exp)
-
     @parameter_space(seed=[1, 2, 3])
     def test_quantiles_masked(self, seed):
         permute = partial(permute_rows, seed)
@@ -735,82 +726,77 @@ class FactorTestCase(BasePipelineTestCase):
         f_nans = OtherF()
         m = Mask()
 
-        terms = {
-            '2_masked': f.quantiles(bins=2, mask=m),
-            '3_masked': f.quantiles(bins=3, mask=m),
-            '6_masked': f.quantiles(bins=6, mask=m),
-            '2_nans': f_nans.quantiles(bins=2),
-            '3_nans': f_nans.quantiles(bins=3),
-            '6_nans': f_nans.quantiles(bins=6),
-        }
-
         # Apply the same shuffle we applied to the input rows to our
         # expectations. Doing it this way makes it obvious that our
         # expectation corresponds to our input, while still testing against
         # a range of input orderings.
         permuted_array = compose(permute, partial(array, dtype=int))
-        expected = {
-            # Expected results here are the same as in test_quantiles_masked,
-            # except with diagonals of -1s interpolated to match the effects of
-            # masking and/or input nans.
-            '2_masked': permuted_array([[-1, 0,  0,  0,  1,  1,  1],
-                                        [0, -1,  0,  0,  1,  1,  1],
-                                        [0,  0, -1,  0,  1,  1,  1],
-                                        [0,  0,  0, -1,  1,  1,  1],
-                                        [0,  0,  0,  1, -1,  1,  1],
-                                        [0,  0,  0,  1,  1, -1,  1],
-                                        [0,  0,  0,  1,  1,  1, -1]]),
-            '3_masked': permuted_array([[-1, 0,  0,  1,  1,  2,  2],
-                                        [0, -1,  0,  1,  1,  2,  2],
-                                        [0,  0, -1,  1,  1,  2,  2],
-                                        [0,  0,  1, -1,  1,  2,  2],
-                                        [0,  0,  1,  1, -1,  2,  2],
-                                        [0,  0,  1,  1,  2, -1,  2],
-                                        [0,  0,  1,  1,  2,  2, -1]]),
-            '6_masked': permuted_array([[-1, 0,  1,  2,  3,  4,  5],
-                                        [0, -1,  1,  2,  3,  4,  5],
-                                        [0,  1, -1,  2,  3,  4,  5],
-                                        [0,  1,  2, -1,  3,  4,  5],
-                                        [0,  1,  2,  3, -1,  4,  5],
-                                        [0,  1,  2,  3,  4, -1,  5],
-                                        [0,  1,  2,  3,  4,  5, -1]]),
-            '2_nans': permuted_array([[0,  0,  0,  1,  1,  1, -1],
-                                      [0,  0,  0,  1,  1, -1,  1],
-                                      [0,  0,  0,  1, -1,  1,  1],
-                                      [0,  0,  0, -1,  1,  1,  1],
-                                      [0,  0, -1,  0,  1,  1,  1],
-                                      [0, -1,  0,  0,  1,  1,  1],
-                                      [-1, 0,  0,  0,  1,  1,  1]]),
-            '3_nans': permuted_array([[0,  0,  1,  1,  2,  2, -1],
-                                      [0,  0,  1,  1,  2, -1,  2],
-                                      [0,  0,  1,  1, -1,  2,  2],
-                                      [0,  0,  1, -1,  1,  2,  2],
-                                      [0,  0, -1,  1,  1,  2,  2],
-                                      [0, -1,  0,  1,  1,  2,  2],
-                                      [-1, 0,  0,  1,  1,  2,  2]]),
-            '6_nans': permuted_array([[0,  1,  2,  3,  4,  5, -1],
-                                      [0,  1,  2,  3,  4, -1,  5],
-                                      [0,  1,  2,  3, -1,  4,  5],
-                                      [0,  1,  2, -1,  3,  4,  5],
-                                      [0,  1, -1,  2,  3,  4,  5],
-                                      [0, -1,  1,  2,  3,  4,  5],
-                                      [-1, 0,  1,  2,  3,  4,  5]]),
-        }
 
-        graph = TermGraph(terms)
-        results = self.run_graph(
-            graph,
+        self.check_terms(
+            terms={
+                '2_masked': f.quantiles(bins=2, mask=m),
+                '3_masked': f.quantiles(bins=3, mask=m),
+                '6_masked': f.quantiles(bins=6, mask=m),
+                '2_nans': f_nans.quantiles(bins=2),
+                '3_nans': f_nans.quantiles(bins=3),
+                '6_nans': f_nans.quantiles(bins=6),
+            },
             initial_workspace={
                 f: factor_data,
                 f_nans: factor_data_w_nans,
                 m: mask_data,
             },
+            expected={
+                # Expected results here are the same as in
+                # test_quantiles_unmasked, except with diagonals of -1s
+                # interpolated to match the effects of masking and/or input
+                # nans.
+                '2_masked': permuted_array([[-1, 0,  0,  0,  1,  1,  1],
+                                            [0, -1,  0,  0,  1,  1,  1],
+                                            [0,  0, -1,  0,  1,  1,  1],
+                                            [0,  0,  0, -1,  1,  1,  1],
+                                            [0,  0,  0,  1, -1,  1,  1],
+                                            [0,  0,  0,  1,  1, -1,  1],
+                                            [0,  0,  0,  1,  1,  1, -1]]),
+                '3_masked': permuted_array([[-1, 0,  0,  1,  1,  2,  2],
+                                            [0, -1,  0,  1,  1,  2,  2],
+                                            [0,  0, -1,  1,  1,  2,  2],
+                                            [0,  0,  1, -1,  1,  2,  2],
+                                            [0,  0,  1,  1, -1,  2,  2],
+                                            [0,  0,  1,  1,  2, -1,  2],
+                                            [0,  0,  1,  1,  2,  2, -1]]),
+                '6_masked': permuted_array([[-1, 0,  1,  2,  3,  4,  5],
+                                            [0, -1,  1,  2,  3,  4,  5],
+                                            [0,  1, -1,  2,  3,  4,  5],
+                                            [0,  1,  2, -1,  3,  4,  5],
+                                            [0,  1,  2,  3, -1,  4,  5],
+                                            [0,  1,  2,  3,  4, -1,  5],
+                                            [0,  1,  2,  3,  4,  5, -1]]),
+                '2_nans': permuted_array([[0,  0,  0,  1,  1,  1, -1],
+                                          [0,  0,  0,  1,  1, -1,  1],
+                                          [0,  0,  0,  1, -1,  1,  1],
+                                          [0,  0,  0, -1,  1,  1,  1],
+                                          [0,  0, -1,  0,  1,  1,  1],
+                                          [0, -1,  0,  0,  1,  1,  1],
+                                          [-1, 0,  0,  0,  1,  1,  1]]),
+                '3_nans': permuted_array([[0,  0,  1,  1,  2,  2, -1],
+                                          [0,  0,  1,  1,  2, -1,  2],
+                                          [0,  0,  1,  1, -1,  2,  2],
+                                          [0,  0,  1, -1,  1,  2,  2],
+                                          [0,  0, -1,  1,  1,  2,  2],
+                                          [0, -1,  0,  1,  1,  2,  2],
+                                          [-1, 0,  0,  1,  1,  2,  2]]),
+                '6_nans': permuted_array([[0,  1,  2,  3,  4,  5, -1],
+                                          [0,  1,  2,  3,  4, -1,  5],
+                                          [0,  1,  2,  3, -1,  4,  5],
+                                          [0,  1,  2, -1,  3,  4,  5],
+                                          [0,  1, -1,  2,  3,  4,  5],
+                                          [0, -1,  1,  2,  3,  4,  5],
+                                          [-1, 0,  1,  2,  3,  4,  5]]),
+            },
             mask=self.build_mask(self.ones_mask(shape=shape)),
         )
 
-        for key, (res, exp) in dzip_exact(results, expected).items():
-            check_arrays(res, exp)
-
     def test_quantiles_uneven_buckets(self):
         permute = partial(permute_rows, 5)
         shape = (5, 5)
@@ -821,37 +807,31 @@ class FactorTestCase(BasePipelineTestCase):
         f = F()
         m = Mask()
 
-        terms = {
-            '3_masked': f.quantiles(bins=3, mask=m),
-            '7_masked': f.quantiles(bins=20, mask=m),
-        }
-
-        expected = {
-            '3_masked': [[-1, 0,  0,  1,  2],
-                         [0, -1,  0,  1,  2],
-                         [0,  0, -1,  1,  2],
-                         [0,  0,  1, -1,  2],
-                         [0,  0,  1,  2, -1]],
-            '7_masked': [[-1, 0,  2,  4,  6],
-                         [0, -1,  2,  4,  6],
-                         [0,  2, -1,  4,  6],
-                         [0,  2,  4, -1,  6],
-                         [0,  2,  4,  6, -1]],
-        }
-
-        graph = TermGraph(terms)
-        results = self.run_graph(
-            graph,
+        permuted_array = compose(permute, partial(array, dtype=int))
+        self.check_terms(
+            terms={
+                '3_masked': f.quantiles(bins=3, mask=m),
+                '7_masked': f.quantiles(bins=7, mask=m),
+            },
             initial_workspace={
                 f: factor_data,
                 m: mask_data,
             },
+            expected={
+                '3_masked': permuted_array([[-1, 0,  0,  1,  2],
+                                            [0, -1,  0,  1,  2],
+                                            [0,  0, -1,  1,  2],
+                                            [0,  0,  1, -1,  2],
+                                            [0,  0,  1,  2, -1]]),
+                '7_masked': permuted_array([[-1, 0,  2,  4,  6],
+                                            [0, -1,  2,  4,  6],
+                                            [0,  2, -1,  4,  6],
+                                            [0,  2,  4, -1,  6],
+                                            [0,  2,  4,  6, -1]]),
+            },
             mask=self.build_mask(self.ones_mask(shape=shape)),
         )
 
-        for key, (res, exp) in dzip_exact(results, expected).items():
-            check_arrays(res, exp)
-
     def test_quantile_helpers(self):
         f = self.f
         m = Mask()

From 0ebb72fe0d662da3d638c952e79a70f8b682b274 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 28 Mar 2016 12:21:58 -0400
Subject: [PATCH 17/18] TEST: Explicitly use int64 everywhere.

Otherwise these tests will fail on 32-bit systems.
---
 tests/pipeline/test_factor.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index b39de5cc..c5873c1f 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -461,7 +461,7 @@ class FactorTestCase(BasePipelineTestCase):
              [1, 1, 2, 2],
              [1, 1, 2, 2],
              [1, 1, 2, 2]],
-            dtype=int,
+            dtype=int64_dtype,
         )
 
         terms = {
@@ -553,7 +553,7 @@ class FactorTestCase(BasePipelineTestCase):
 
         # Cycles of 0, 1, 2, 0, 1, 2, ...
         classifier_data = (
-            (self.arange_data(shape=shape, dtype=int) + seed_value) % 3
+            (self.arange_data(shape=shape, dtype=int64_dtype) + seed_value) % 3
         )
         # With -1s on main diagonal.
         classifier_data_eyenulls = where(eyemask, classifier_data, -1)
@@ -665,7 +665,7 @@ class FactorTestCase(BasePipelineTestCase):
         # expectations. Doing it this way makes it obvious that our
         # expectation corresponds to our input, while still testing against
         # a range of input orderings.
-        permuted_array = compose(permute, partial(array, dtype=int))
+        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
         self.check_terms(
             terms={
                 '2': f.quantiles(bins=2),
@@ -730,7 +730,7 @@ class FactorTestCase(BasePipelineTestCase):
         # expectations. Doing it this way makes it obvious that our
         # expectation corresponds to our input, while still testing against
         # a range of input orderings.
-        permuted_array = compose(permute, partial(array, dtype=int))
+        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
 
         self.check_terms(
             terms={
@@ -807,7 +807,7 @@ class FactorTestCase(BasePipelineTestCase):
         f = F()
         m = Mask()
 
-        permuted_array = compose(permute, partial(array, dtype=int))
+        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
         self.check_terms(
             terms={
                 '3_masked': f.quantiles(bins=3, mask=m),

From 9a04621781b425ef1293350014f7ec34b0d5860e Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Mon, 28 Mar 2016 15:46:28 -0400
Subject: [PATCH 18/18] ENH: Add eq and __ne__ to Classifier.

---
 tests/pipeline/test_classifier.py          | 103 ++++++++++++++++++---
 zipline/pipeline/classifiers/classifier.py |  47 +++++++++-
 zipline/pipeline/expression.py             |   4 +-
 zipline/pipeline/filters/filter.py         |   2 +-
 4 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py
index 6e36ce30..a4b25178 100644
--- a/tests/pipeline/test_classifier.py
+++ b/tests/pipeline/test_classifier.py
@@ -1,7 +1,7 @@
 import numpy as np
 
-from zipline.pipeline import Classifier, TermGraph
-from zipline.testing import check_arrays, parameter_space
+from zipline.pipeline import Classifier
+from zipline.testing import parameter_space
 from zipline.utils.numpy_utils import int64_dtype
 
 from .base import BasePipelineTestCase
@@ -18,26 +18,103 @@ class ClassifierTestCase(BasePipelineTestCase):
             inputs = ()
             window_length = 0
 
+        c = C()
+
         # There's no significance to the values here other than that they
         # contain a mix of missing and non-missing values.
         data = np.array([[-1,  1,  0, 2],
                          [3,   0,  1, 0],
                          [-5,  0, -1, 0],
-                         [-3,  1,  2, 2]], dtype=int)
+                         [-3,  1,  2, 2]], dtype=int64_dtype)
 
-        c = C()
-        graph = TermGraph(
-            {
+        self.check_terms(
+            terms={
                 'isnull': c.isnull(),
                 'notnull': c.notnull()
-            }
-        )
-
-        results = self.run_graph(
-            graph,
+            },
+            expected={
+                'isnull': data == mv,
+                'notnull': data != mv,
+            },
             initial_workspace={c: data},
             mask=self.build_mask(self.ones_mask(shape=data.shape)),
         )
 
-        check_arrays(results['isnull'], (data == mv))
-        check_arrays(results['notnull'], (data != mv))
+    @parameter_space(compval=[0, 1, 999])
+    def test_eq(self, compval):
+
+        class C(Classifier):
+            dtype = int64_dtype
+            missing_value = -1
+            inputs = ()
+            window_length = 0
+
+        c = C()
+
+        # There's no significance to the values here other than that they
+        # contain a mix of the comparison value and other values.
+        data = np.array([[-1,  1,  0, 2],
+                         [3,   0,  1, 0],
+                         [-5,  0, -1, 0],
+                         [-3,  1,  2, 2]], dtype=int64_dtype)
+
+        self.check_terms(
+            terms={
+                'eq': c.eq(compval),
+            },
+            expected={
+                'eq': (data == compval),
+            },
+            initial_workspace={c: data},
+            mask=self.build_mask(self.ones_mask(shape=data.shape)),
+        )
+
+    @parameter_space(missing=[-1, 0, 1])
+    def test_disallow_comparison_to_missing_value(self, missing):
+        class C(Classifier):
+            dtype = int64_dtype
+            missing_value = missing
+            inputs = ()
+            window_length = 0
+
+        with self.assertRaises(ValueError) as e:
+            C().eq(missing)
+        errmsg = str(e.exception)
+        self.assertEqual(
+            errmsg,
+            "Comparison against self.missing_value ({v}) in C.eq().\n"
+            "Missing values have NaN semantics, so the requested comparison"
+            " would always produce False.\n"
+            "Use the isnull() method to check for missing values.".format(
+                v=missing,
+            ),
+        )
+
+    @parameter_space(compval=[0, 1, 999], missing=[-1, 0, 999])
+    def test_not_equal(self, compval, missing):
+
+        class C(Classifier):
+            dtype = int64_dtype
+            missing_value = missing
+            inputs = ()
+            window_length = 0
+
+        c = C()
+
+        # There's no significance to the values here other than that they
+        # contain a mix of the comparison value and other values.
+        data = np.array([[-1,  1,  0, 2],
+                         [3,   0,  1, 0],
+                         [-5,  0, -1, 0],
+                         [-3,  1,  2, 2]], dtype=int64_dtype)
+
+        self.check_terms(
+            terms={
+                'ne': c != compval,
+            },
+            expected={
+                'ne': (data != compval) & (data != C.missing_value),
+            },
+            initial_workspace={c: data},
+            mask=self.build_mask(self.ones_mask(shape=data.shape)),
+        )
diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py
index fcd2accd..c4d77ec9 100644
--- a/zipline/pipeline/classifiers/classifier.py
+++ b/zipline/pipeline/classifiers/classifier.py
@@ -1,13 +1,16 @@
 """
 classifier.py
 """
+from numbers import Number
+
 from numpy import where, isnan, nan, zeros
 
 from zipline.lib.quantiles import quantiles
 from zipline.pipeline.term import ComputableTerm
+from zipline.utils.input_validation import expect_types
 from zipline.utils.numpy_utils import int64_dtype
 
-from ..filters import NullFilter
+from ..filters import NullFilter, NumExprFilter
 from ..mixins import (
     CustomTermMixin,
     LatestMixin,
@@ -41,6 +44,48 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
         """
         return ~self.isnull()
 
+    # We explicitly don't support classifier to classifier comparisons, since
+    # the numbers likely don't mean the same thing. This may be relaxed in the
+    # future, but for now we're starting conservatively.
+    @expect_types(other=Number)
+    def eq(self, other):
+        """
+        Construct a Filter returning True for asset/date pairs where the output
+        of ``self`` matches ``other``.
+        """
+        # We treat this as an error because missing_values have NaN semantics,
+        # which means this would return an array of all False, which is almost
+        # certainly not what the user wants.
+        if other == self.missing_value:
+            raise ValueError(
+                "Comparison against self.missing_value ({value}) in"
+                " {typename}.eq().\n"
+                "Missing values have NaN semantics, so the "
+                "requested comparison would always produce False.\n"
+                "Use the isnull() method to check for missing values.".format(
+                    value=other,
+                    typename=(type(self).__name__),
+                )
+            )
+        return NumExprFilter.create(
+            "x_0 == {other}".format(other=int(other)),
+            binds=(self,),
+        )
+
+    @expect_types(other=Number)
+    def __ne__(self, other):
+        """
+        Construct a Filter returning True for asset/date pairs where the output
+        of ``self`` is neither ``other`` nor ``self.missing_value``.
+        """
+        return NumExprFilter.create(
+            "((x_0 != {other}) & (x_0 != {missing}))".format(
+                other=int(other),
+                missing=self.missing_value,
+            ),
+            binds=(self,),
+        )
+
 
 class Everything(Classifier):
     """
diff --git a/zipline/pipeline/expression.py b/zipline/pipeline/expression.py
index 1f7b976d..0aa83d59 100644
--- a/zipline/pipeline/expression.py
+++ b/zipline/pipeline/expression.py
@@ -8,7 +8,7 @@ from numbers import Number
 import numexpr
 from numexpr.necompiler import getExprNames
 from numpy import (
-    empty,
+    full,
     inf,
 )
 
@@ -229,7 +229,7 @@ class NumericalExpression(ComputableTerm):
         """
         Compute our stored expression string with numexpr.
         """
-        out = empty(mask.shape, dtype=self.dtype)
+        out = full(mask.shape, self.missing_value, dtype=self.dtype)
         # This writes directly into our output buffer.
         numexpr.evaluate(
             self._expr,
diff --git a/zipline/pipeline/filters/filter.py b/zipline/pipeline/filters/filter.py
index a2e2c6e0..5c3a8783 100644
--- a/zipline/pipeline/filters/filter.py
+++ b/zipline/pipeline/filters/filter.py
@@ -82,7 +82,7 @@ def binary_operator(op):
             )
         elif isinstance(other, int):  # Note that this is true for bool as well
             return NumExprFilter.create(
-                "x_0 {op} ({constant})".format(op=op, constant=int(other)),
+                "x_0 {op} {constant}".format(op=op, constant=int(other)),
                 binds=(self,),
             )
         raise BadBinaryOperator(op, self, other)