From f635a142896f2cfcd0a2cc2d6238cb2a41928d38 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Thu, 3 Mar 2016 22:32:22 -0500
Subject: [PATCH] ENH: Add `isnull` and `notnull` methods to Factor.

---
 tests/pipeline/test_adjusted_array.py | 40 +++++-------
 tests/pipeline/test_factor.py         | 74 +++++++++++++++++++++-
 tests/pipeline/test_filter.py         | 12 +++-
 zipline/lib/rank.pyx                  | 23 +++++--
 zipline/pipeline/factors/factor.py    | 88 +++++++++++++++++++++++----
 zipline/pipeline/filters/__init__.py  |  3 +-
 zipline/pipeline/filters/filter.py    | 22 +++++++
 7 files changed, 215 insertions(+), 47 deletions(-)

diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py
index e2a96e53..f7be0b72 100644
--- a/tests/pipeline/test_adjusted_array.py
+++ b/tests/pipeline/test_adjusted_array.py
@@ -23,11 +23,11 @@ from zipline.lib.adjustment import (
 )
 from zipline.lib.adjusted_array import AdjustedArray, NOMASK
 from zipline.utils.numpy_utils import (
+    coerce_to_dtype,
     datetime64ns_dtype,
     default_missing_value_for_dtype,
     float64_dtype,
     int64_dtype,
-    make_datetime64ns,
 )
 from zipline.utils.test_utils import check_arrays, parameter_space
 
@@ -62,18 +62,6 @@ def valid_window_lengths(underlying_buffer_length):
     return iter(range(1, underlying_buffer_length + 1))
 
 
-def value_with_dtype(dtype, value):
-    """
-    Make a value with the specified numpy dtype.
-    """
-    name = dtype.name
-    if name.startswith('datetime64'):
-        if name != 'datetime64[ns]':
-            raise TypeError("Expected datetime64[ns], but got %s." % name)
-        return make_datetime64ns(value)
-    return dtype.type(value)
-
-
 def _gen_unadjusted_cases(dtype):
 
     nrows = 6
@@ -124,7 +112,7 @@ def _gen_multiplicative_adjustment_cases(dtype):
 
     # Note that row indices are inclusive!
     adjustments[1] = [
-        adjustment_type(0, 0, 0, 0, value_with_dtype(dtype, 2)),
+        adjustment_type(0, 0, 0, 0, coerce_to_dtype(dtype, 2)),
     ]
     buffer_as_of[1] = array([[2, 1, 1],
                              [1, 1, 1],
@@ -137,8 +125,8 @@ def _gen_multiplicative_adjustment_cases(dtype):
     buffer_as_of[2] = buffer_as_of[1]
 
     adjustments[3] = [
-        adjustment_type(1, 2, 1, 1, value_with_dtype(dtype, 3)),
-        adjustment_type(0, 1, 0, 0, value_with_dtype(dtype, 4)),
+        adjustment_type(1, 2, 1, 1, coerce_to_dtype(dtype, 3)),
+        adjustment_type(0, 1, 0, 0, coerce_to_dtype(dtype, 4)),
     ]
     buffer_as_of[3] = array([[8, 1, 1],
                              [4, 3, 1],
@@ -148,7 +136,7 @@ def _gen_multiplicative_adjustment_cases(dtype):
                              [1, 1, 1]], dtype=dtype)
 
     adjustments[4] = [
-        adjustment_type(0, 3, 2, 2, value_with_dtype(dtype, 5))
+        adjustment_type(0, 3, 2, 2, coerce_to_dtype(dtype, 5))
     ]
     buffer_as_of[4] = array([[8, 1, 5],
                              [4, 3, 5],
@@ -158,8 +146,8 @@ def _gen_multiplicative_adjustment_cases(dtype):
                              [1, 1, 1]], dtype=dtype)
 
     adjustments[5] = [
-        adjustment_type(0, 4, 1, 1, value_with_dtype(dtype, 6)),
-        adjustment_type(2, 2, 2, 2, value_with_dtype(dtype, 7)),
+        adjustment_type(0, 4, 1, 1, coerce_to_dtype(dtype, 6)),
+        adjustment_type(2, 2, 2, 2, coerce_to_dtype(dtype, 7)),
     ]
     buffer_as_of[5] = array([[8,  6,  5],
                              [4, 18,  5],
@@ -191,7 +179,7 @@ def _gen_overwrite_adjustment_cases(dtype):
 
     # Note that row indices are inclusive!
     adjustments[1] = [
-        adjustment_type(0, 0, 0, 0, value_with_dtype(dtype, 1)),
+        adjustment_type(0, 0, 0, 0, coerce_to_dtype(dtype, 1)),
     ]
     buffer_as_of[1] = array([[1, 2, 2],
                              [2, 2, 2],
@@ -204,8 +192,8 @@ def _gen_overwrite_adjustment_cases(dtype):
     buffer_as_of[2] = buffer_as_of[1]
 
     adjustments[3] = [
-        adjustment_type(1, 2, 1, 1, value_with_dtype(dtype, 3)),
-        adjustment_type(0, 1, 0, 0, value_with_dtype(dtype, 4)),
+        adjustment_type(1, 2, 1, 1, coerce_to_dtype(dtype, 3)),
+        adjustment_type(0, 1, 0, 0, coerce_to_dtype(dtype, 4)),
     ]
     buffer_as_of[3] = array([[4, 2, 2],
                              [4, 3, 2],
@@ -215,7 +203,7 @@ def _gen_overwrite_adjustment_cases(dtype):
                              [2, 2, 2]], dtype=dtype)
 
     adjustments[4] = [
-        adjustment_type(0, 3, 2, 2, value_with_dtype(dtype, 5))
+        adjustment_type(0, 3, 2, 2, coerce_to_dtype(dtype, 5))
     ]
     buffer_as_of[4] = array([[4, 2, 5],
                              [4, 3, 5],
@@ -225,8 +213,8 @@ def _gen_overwrite_adjustment_cases(dtype):
                              [2, 2, 2]], dtype=dtype)
 
     adjustments[5] = [
-        adjustment_type(0, 4, 1, 1, value_with_dtype(dtype, 6)),
-        adjustment_type(2, 2, 2, 2, value_with_dtype(dtype, 7)),
+        adjustment_type(0, 4, 1, 1, coerce_to_dtype(dtype, 6)),
+        adjustment_type(2, 2, 2, 2, coerce_to_dtype(dtype, 7)),
     ]
     buffer_as_of[5] = array([[4,  6,  5],
                              [4,  6,  5],
@@ -335,7 +323,7 @@ class AdjustedArrayTestCase(TestCase):
         window_length=[2, 3],
     )
     def test_masking(self, dtype, missing_value, window_length):
-        missing_value = value_with_dtype(dtype, missing_value)
+        missing_value = coerce_to_dtype(dtype, missing_value)
         baseline_ints = arange(15).reshape(5, 3)
         baseline = baseline_ints.astype(dtype)
         mask = (baseline_ints % 2).astype(bool)
diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
index 9e480b63..d6713bc8 100644
--- a/tests/pipeline/test_factor.py
+++ b/tests/pipeline/test_factor.py
@@ -22,10 +22,15 @@ from zipline.pipeline.factors import (
     Returns,
     RSI,
 )
-from zipline.utils.test_utils import check_allclose, check_arrays
+from zipline.utils.test_utils import (
+    check_allclose,
+    check_arrays,
+    parameter_space,
+)
 from zipline.utils.numpy_utils import (
     datetime64ns_dtype,
     float64_dtype,
+    int64_dtype,
     NaTns,
 )
 
@@ -59,6 +64,73 @@ class FactorTestCase(BasePipelineTestCase):
         with self.assertRaises(UnknownRankMethod):
             self.f.rank("not a real rank method")
 
+    @parameter_space(method_name=['isnan', 'notnan', 'isfinite'])
+    def test_float64_only_ops(self, method_name):
+        class NotFloat(Factor):
+            dtype = datetime64ns_dtype
+            inputs = ()
+            window_length = 0
+
+        nf = NotFloat()
+        meth = getattr(nf, method_name)
+        with self.assertRaises(TypeError):
+            meth()
+
+    @parameter_space(custom_missing_value=[-1, 0])
+    def test_isnull_int_dtype(self, custom_missing_value):
+
+        class CustomMissingValue(Factor):
+            dtype = int64_dtype
+            window_length = 0
+            missing_value = custom_missing_value
+            inputs = ()
+
+        factor = CustomMissingValue()
+
+        data = arange(25).reshape(5, 5)
+        data[eye(5, dtype=bool)] = custom_missing_value
+
+        graph = TermGraph(
+            {
+                'isnull': factor.isnull(),
+                'notnull': factor.notnull(),
+            }
+        )
+
+        results = self.run_graph(
+            graph,
+            initial_workspace={factor: data},
+            mask=self.build_mask(ones((5, 5))),
+        )
+        check_arrays(results['isnull'], eye(5, dtype=bool))
+        check_arrays(results['notnull'], ~eye(5, dtype=bool))
+
+    def test_isnull_datetime_dtype(self):
+        class DatetimeFactor(Factor):
+            dtype = datetime64ns_dtype
+            window_length = 0
+            inputs = ()
+
+        factor = DatetimeFactor()
+
+        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
+        data[eye(5, dtype=bool)] = NaTns
+
+        graph = TermGraph(
+            {
+                'isnull': factor.isnull(),
+                'notnull': factor.notnull(),
+            }
+        )
+
+        results = self.run_graph(
+            graph,
+            initial_workspace={factor: data},
+            mask=self.build_mask(ones((5, 5))),
+        )
+        check_arrays(results['isnull'], eye(5, dtype=bool))
+        check_arrays(results['notnull'], ~eye(5, dtype=bool))
+
     @for_each_factor_dtype
     def test_rank_ascending(self, name, factor_dtype):
 
diff --git a/tests/pipeline/test_filter.py b/tests/pipeline/test_filter.py
index 2f77e859..28f6e2c7 100644
--- a/tests/pipeline/test_filter.py
+++ b/tests/pipeline/test_filter.py
@@ -345,10 +345,14 @@ class FilterTestCase(BasePipelineTestCase):
         data[diag] = nan
 
         results = self.run_graph(
-            TermGraph({'isnan': self.f.isnan()}),
+            TermGraph({
+                'isnan': self.f.isnan(),
+                'isnull': self.f.isnull(),
+            }),
             initial_workspace={self.f: data},
         )
         check_arrays(results['isnan'], diag)
+        check_arrays(results['isnull'], diag)
 
     def test_notnan(self):
         data = self.randn_data(seed=10)
@@ -356,10 +360,14 @@ class FilterTestCase(BasePipelineTestCase):
         data[diag] = nan
 
         results = self.run_graph(
-            TermGraph({'notnan': self.f.notnan()}),
+            TermGraph({
+                'notnan': self.f.notnan(),
+                'notnull': self.f.notnull(),
+            }),
             initial_workspace={self.f: data},
         )
         check_arrays(results['notnan'], ~diag)
+        check_arrays(results['notnull'], ~diag)
 
     def test_isfinite(self):
         data = self.randn_data(seed=10)
diff --git a/zipline/lib/rank.pyx b/zipline/lib/rank.pyx
index 84a57d07..e9a045a4 100644
--- a/zipline/lib/rank.pyx
+++ b/zipline/lib/rank.pyx
@@ -17,10 +17,26 @@ from numpy cimport (
 from numpy import apply_along_axis, float64, isnan, nan
 from scipy.stats import rankdata
 
+from zipline.utils.numpy_utils import (
+    is_float,
+    float64_dtype,
+    int64_dtype,
+    datetime64ns_dtype,
+)
+
 
 import_array()
 
 
+cpdef ismissing(ndarray data, object missing_value):
+    """
+    Generic ismissing function that handles quirks with NaN.
+    """
+    if is_float(data) and isnan(missing_value):
+        return isnan(data)
+    return (data == missing_value)
+
+
 def masked_rankdata_2d(ndarray data,
                        ndarray mask,
                        object missing_value,
@@ -35,12 +51,7 @@ def masked_rankdata_2d(ndarray data,
             "Can't compute rankdata on array of dtype %r." % dtype_name
         )
 
-    cdef ndarray missing_locations = ~mask
-    # Mask out any entries that are equal to the missing value.
-    if dtype_name == 'float64' and isnan(missing_value):
-        missing_locations |= isnan(data)
-    else:
-        missing_locations |= (data == missing_value)
+    cdef ndarray missing_locations = (~mask | ismissing(data, missing_value))
 
     # Interpret the bytes of integral data as floats for sorting.
     data = data.copy().view(float64)
diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
index 31649dec..a5d12cf7 100644
--- a/zipline/pipeline/factors/factor.py
+++ b/zipline/pipeline/factors/factor.py
@@ -1,10 +1,11 @@
 """
 factor.py
 """
+from functools import wraps
 from operator import attrgetter
 from numbers import Number
 
-from numpy import float64, inf
+from numpy import inf
 from toolz import curry
 
 from zipline.errors import (
@@ -32,30 +33,43 @@ from zipline.pipeline.expression import (
 from zipline.pipeline.filters import (
     NumExprFilter,
     PercentileFilter,
+    NullFilter,
 )
 from zipline.utils.control_flow import nullctx
 from zipline.utils.numpy_utils import (
     bool_dtype,
+    coerce_to_dtype,
     datetime64ns_dtype,
     float64_dtype,
     int64_dtype,
 )
-from zipline.utils.preprocess import preprocess
 
 
 _RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal'])
 
 
-def numbers_to_float64(func, argname, argvalue):
+def coerce_numbers_to_my_dtype(f):
     """
-    Preprocessor for converting numerical inputs into floats.
+    A decorator for methods whose signature is f(self, other) that coerces
+    ``other`` to ``self.dtype``.
 
-    This is used in the binary operator constructors for Factor so that
-    `2 + Factor()` has the same behavior as `2.0 + Factor()`.
+    This is used to make comparison operations between numbers and `Factor`
+    instances work independently of whether the user supplies a float or
+    integer literal.
+
+    For example, if I write::
+
+        my_filter = my_factor > 3
+
+    my_factor probably has dtype float64, but 3 is an int, so we want to coerce
+    to float64 before doing the comparison.
     """
-    if isinstance(argvalue, Number):
-        return float64(argvalue)
-    return argvalue
+    @wraps(f)
+    def method(self, other):
+        if isinstance(other, Number):
+            other = coerce_to_dtype(self.dtype, other)
+        return f(self, other)
+    return method
 
 
 @curry
@@ -148,9 +162,9 @@ def binary_operator(op):
     # NumericalExpression operator.
     commuted_method_getter = attrgetter(method_name_for_op(op, commute=True))
 
-    @preprocess(other=numbers_to_float64)
     @with_doc("Binary Operator: '%s'" % op)
     @with_name(method_name_for_op(op))
+    @coerce_numbers_to_my_dtype
     def binary_operator(self, other):
         # This can't be hoisted up a scope because the types returned by
         # binop_return_type aren't defined when the top-level function is
@@ -207,8 +221,8 @@ def reflected_binary_operator(op):
     """
     assert not is_comparison(op)
 
-    @preprocess(other=numbers_to_float64)
     @with_name(method_name_for_op(op, commute=True))
+    @coerce_numbers_to_my_dtype
     def reflected_binary_operator(self, other):
 
         if isinstance(self, NumericalExpression):
@@ -304,6 +318,28 @@ def function_application(func):
     return mathfunc
 
 
+def if_not_float64_tell_caller_to_use_isnull(f):
+    """
+    Factor method decorator that checks if self.dtype if float64.
+
+    If the factor instance is of another dtype, this raises a TypeError
+    directing the user to `isnull` or `notnull` instead.
+    """
+    @wraps(f)
+    def wrapped_method(self, *args, **kwargs):
+        if self.dtype != float64_dtype:
+            raise TypeError(
+                "{meth}() was called on a factor of dtype {dtype}.\n"
+                "{meth}() is only defined for dtype float64."
+                "To filter missing data, use isnull() or notnull().".format(
+                    meth=f.__name__,
+                    dtype=self.dtype,
+                ),
+            )
+        return f(self, *args, **kwargs)
+    return wrapped_method
+
+
 FACTOR_DTYPES = frozenset([datetime64ns_dtype, float64_dtype, int64_dtype])
 
 
@@ -476,6 +512,34 @@ class Factor(CompositeTerm):
             mask=mask,
         )
 
+    def isnull(self):
+        """
+        A Filter producing True for values where this Factor has missing data.
+
+        Equivalent to self.isnan() when ``self.dtype`` is float64.
+        Otherwise equivalent to ``self.eq(self.missing_value)``.
+
+        Returns
+        -------
+        filter : zipline.pipeline.filters.Filter
+        """
+        if self.dtype == float64_dtype:
+            # Using isnan is more efficient when possible because we can fold
+            # the isnan computation with other NumExpr expressions.
+            return self.isnan()
+        else:
+            return NullFilter(self)
+
+    def notnull(self):
+        """
+        A Filter producing True for values where this Factor has complete data.
+
+        Equivalent to ``~self.isnan()` when ``self.dtype`` is float64.
+        Otherwise equivalent to ``(self != self.missing_value)``.
+        """
+        return ~self.isnull()
+
+    @if_not_float64_tell_caller_to_use_isnull
     def isnan(self):
         """
         A Filter producing True for all values where this Factor is NaN.
@@ -486,6 +550,7 @@ class Factor(CompositeTerm):
         """
         return self != self
 
+    @if_not_float64_tell_caller_to_use_isnull
     def notnan(self):
         """
         A Filter producing True for values where this Factor is not NaN.
@@ -496,6 +561,7 @@ class Factor(CompositeTerm):
         """
         return ~self.isnan()
 
+    @if_not_float64_tell_caller_to_use_isnull
     def isfinite(self):
         """
         A Filter producing True for values where this Factor is anything but
diff --git a/zipline/pipeline/filters/__init__.py b/zipline/pipeline/filters/__init__.py
index 9ff7a0eb..2f18840c 100644
--- a/zipline/pipeline/filters/__init__.py
+++ b/zipline/pipeline/filters/__init__.py
@@ -1,9 +1,10 @@
-from .filter import Filter, NumExprFilter, PercentileFilter
+from .filter import Filter, NumExprFilter, NullFilter, PercentileFilter
 from .latest import Latest
 
 __all__ = [
     'Filter',
     'Latest',
     'NumExprFilter',
+    'NullFilter',
     'PercentileFilter',
 ]
diff --git a/zipline/pipeline/filters/filter.py b/zipline/pipeline/filters/filter.py
index 86eaa3a9..822c60e8 100644
--- a/zipline/pipeline/filters/filter.py
+++ b/zipline/pipeline/filters/filter.py
@@ -13,6 +13,7 @@ from zipline.errors import (
     BadPercentileBounds,
     UnsupportedDataType,
 )
+from zipline.lib.rank import ismissing
 from zipline.pipeline.mixins import (
     CustomTermMixin,
     PositiveWindowLengthMixin,
@@ -173,6 +174,27 @@ class NumExprFilter(NumericalExpression, Filter):
         ) & mask
 
 
+class NullFilter(SingleInputMixin, Filter):
+    """
+    A Filter indicating whether an input input values are missing.
+
+    Parameters
+    ----------
+    factor zipline.pipeline.factor.Factor
+        The factor to compare with null.
+    """
+    window_length = 0
+
+    def __new__(cls, factor):
+        return super(NullFilter, cls).__new__(
+            cls,
+            inputs=(factor,),
+        )
+
+    def _compute(self, arrays, dates, assets, mask):
+        return ismissing(arrays[0], self.inputs[0].missing_value)
+
+
 class PercentileFilter(SingleInputMixin, Filter):
     """
     A Filter representing assets falling between percentile bounds of a Factor.