ENH: Add support for Classifiers.

Classifiers are computations that represent grouping keys. They can be used in conjuction with normalization functions like ``zscore`` or ``demean`` to perform normalizations over subsets of a dataset. Notable changes: - Added ``demean()`` and ``zscore()`` methods to ``Factor``. - Added a classifier versions of ``Latest`` and ``CustomTermMixin``. The .latest attribute of int64 dataset columns no produces a classifier by default. - Added ``Everything``, a classifier that maps all data to the same value. - Added ``zipline.lib.normalize``, which implements a naive, pure-Python grouped normalize function. This will likely be moved to Cython in a subsequent PR.
2026-06-27 22:21:37 +08:00 · 2016-03-07 18:14:03 -05:00
parent 1f0e1e8908
commit 53d3b0855b
14 changed files with 674 additions and 34 deletions
@@ -147,6 +147,24 @@ class BasePipelineTestCase(TestCase):
        """
        return arange(prod(shape), dtype=dtype).reshape(shape)

+    @with_default_shape
+    def randn_data(self, seed, shape):
+        """
+        Build a block of testing data from a seeded RandomState.
+        """
+        return np.random.RandomState(seed).randn(*shape)
+
+    @with_default_shape
+    def eye_mask(self, shape):
+        """
+        Build a mask using np.eye.
+        """
+        return ~np.eye(*shape, dtype=bool)
+
+    @with_default_shape
+    def ones_mask(self, shape):
+        return np.ones(shape, dtype=bool)
+

 class EventLoaderCommonMixin(object):
    @abc.abstractproperty
@@ -5,19 +5,24 @@ from itertools import product
 from nose_parameterized import parameterized

 from numpy import (
+    apply_along_axis,
    arange,
    array,
    datetime64,
    empty,
    eye,
    nan,
+    nanmean,
+    nanstd,
    ones,
+    where,
 )
 from numpy.random import randn, seed

 from zipline.errors import UnknownRankMethod
 from zipline.lib.rank import masked_rankdata_2d
-from zipline.pipeline import Factor, Filter, TermGraph
+from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
+from zipline.pipeline import Classifier, Factor, Filter, TermGraph
 from zipline.pipeline.factors import (
    Returns,
    RSI,
@@ -43,6 +48,20 @@ class F(Factor):
    window_length = 0


+class C(Classifier):
+    dtype = int64_dtype
+    missing_value = -1
+    inputs = ()
+    window_length = 0
+
+
+class OtherC(Classifier):
+    dtype = int64_dtype
+    missing_value = -1
+    inputs = ()
+    window_length = 0
+
+
 class Mask(Filter):
    inputs = ()
    window_length = 0
@@ -403,3 +422,134 @@ class FactorTestCase(BasePipelineTestCase):
        )

        check_arrays(float_result, datetime_result)
+
+    @parameter_space(
+        seed_value=range(1, 2),
+        normalizer_name_and_func=[
+            ('demean', lambda row: row - nanmean(row)),
+            ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
+        ],
+        add_nulls_to_factor=(False, True,)
+    )
+    def test_normalizations(self,
+                            seed_value,
+                            normalizer_name_and_func,
+                            add_nulls_to_factor):
+
+        name, func = normalizer_name_and_func
+
+        shape = (7, 7)
+
+        # All Trues.
+        nomask = self.ones_mask(shape=shape)
+        # Falses on main diagonal.
+        eyemask = self.eye_mask(shape=shape)
+        # Falses on other diagonal.
+        eyemask_T = eyemask.T
+        # Falses on both diagonals.
+        xmask = eyemask & eyemask_T
+
+        # Block of random data.
+        factor_data = self.randn_data(seed=seed_value, shape=shape)
+        if add_nulls_to_factor:
+            factor_data = where(eyemask, factor_data, nan)
+
+        # Cycles of 0, 1, 2, 0, 1, 2, ...
+        classifier_data = (
+            (self.arange_data(shape=shape, dtype=int) + seed_value) % 3
+        )
+        # With -1s on main diagonal.
+        classifier_data_eyenulls = where(eyemask, classifier_data, -1)
+        # With -1s on opposite diagonal.
+        classifier_data_eyenulls_T = where(eyemask_T, classifier_data, -1)
+        # With -1s on both diagonals.
+        classifier_data_xnulls = where(xmask, classifier_data, -1)
+
+        f = self.f
+        c = C()
+        c_with_nulls = OtherC()
+        m = Mask()
+        method = getattr(f, name)
+        terms = {
+            'vanilla': method(),
+            'masked': method(mask=m),
+            'grouped': method(groupby=c),
+            'grouped_with_nulls': method(groupby=c_with_nulls),
+            'both': method(mask=m, groupby=c),
+            'both_with_nulls': method(mask=m, groupby=c_with_nulls),
+        }
+
+        expected = {
+            'vanilla': apply_along_axis(func, 1, factor_data,),
+            'masked': where(
+                eyemask,
+                grouped_apply(factor_data, eyemask, func),
+                nan,
+            ),
+            'grouped': grouped_apply(
+                factor_data,
+                classifier_data,
+                func,
+            ),
+            # If the classifier has nulls, we should get NaNs in the
+            # corresponding locations in the output.
+            'grouped_with_nulls': where(
+                eyemask_T,
+                grouped_apply(factor_data, classifier_data_eyenulls_T, func),
+                nan,
+            ),
+            # Passing a mask with a classifier should behave as though the
+            # classifier had nulls where the mask was False.
+            'both': where(
+                eyemask,
+                grouped_apply(
+                    factor_data,
+                    classifier_data_eyenulls,
+                    func,
+                ),
+                nan,
+            ),
+            'both_with_nulls': where(
+                xmask,
+                grouped_apply(
+                    factor_data,
+                    classifier_data_xnulls,
+                    func,
+                ),
+                nan,
+            )
+        }
+
+        graph = TermGraph(terms)
+        results = self.run_graph(
+            graph,
+            initial_workspace={
+                f: factor_data,
+                c: classifier_data,
+                c_with_nulls: classifier_data_eyenulls_T,
+                Mask(): eyemask,
+            },
+            mask=self.build_mask(nomask),
+        )
+
+        for key in expected:
+            check_arrays(expected[key], results[key])
+
+    @parameter_space(normalizer=['demean', 'zscore'])
+    def test_cant_normalize_non_float(self, normalizer):
+        class DateFactor(Factor):
+            dtype = datetime64ns_dtype
+            inputs = ()
+            window_length = 0
+
+        d = DateFactor()
+        with self.assertRaises(TypeError) as e:
+            getattr(d, normalizer)()
+
+        errmsg = str(e.exception)
+        expected = (
+            "{normalizer}() is only defined on Factors of dtype float64,"
+            " but it was called on a Factor of dtype datetime64[ns]."
+        ).format(normalizer=normalizer)
+
+        self.assertEqual(errmsg, expected)
@@ -13,7 +13,7 @@ from zipline.errors import (
    UnsupportedDType,
    WindowLengthNotSpecified,
 )
-from zipline.pipeline import Factor, Filter, TermGraph
+from zipline.pipeline import Classifier, Factor, Filter, TermGraph
 from zipline.pipeline.data import Column, DataSet
 from zipline.pipeline.data.testing import TestingDataSet
 from zipline.pipeline.term import AssetExists, NotSpecified
@@ -344,10 +344,12 @@ class ObjectIdentityTestCase(TestCase):
            SomeFactor(dtype=complex128_dtype)

    def test_latest_on_different_dtypes(self):
-        factor_dtypes = (int64_dtype, float64_dtype, datetime64ns_dtype)
+        factor_dtypes = (float64_dtype, datetime64ns_dtype)
        for column in TestingDataSet.columns:
            if column.dtype == bool_dtype:
                self.assertIsInstance(column.latest, Filter)
+            elif column.dtype == int64_dtype:
+                self.assertIsInstance(column.latest, Classifier)
            elif column.dtype in factor_dtypes:
                self.assertIsInstance(column.latest, Factor)
            else:
@@ -4,7 +4,7 @@ import doctest
 from unittest import TestCase

 from zipline import testing
-from zipline.lib import adjustment
+from zipline.lib import adjustment, normalize
 from zipline.pipeline import (
    engine,
    expression,
@@ -86,3 +86,6 @@ class DoctestTestCase(TestCase):

    def test_functional_docs(self):
        self._check_docs(functional)
+
+    def test_normalize_docs(self):
+        self._check_docs(normalize)
@@ -0,0 +1,45 @@
+import numpy as np
+
+
+def naive_grouped_rowwise_apply(data, group_labels, func, out=None):
+    """
+    Simple implementation of grouped row-wise function application.
+
+    Parameters
+    ----------
+    data : ndarray[ndim=2]
+        Input array over which to apply a grouped function.
+    group_labels : ndarray[ndim=2, dtype=int64]
+        Labels to use to bucket inputs from array.
+        Should be the same shape as array.
+    func : function[ndarray[ndim=1]] -> function[ndarray[ndim=1]]
+        Function to apply to pieces of each row in array.
+    out : ndarray, optional
+        Array into which to write output.  If not supplied, a new array of the
+        same shape as ``data`` is allocated and returned.
+
+    Example
+    -------
+    >>> data = np.array([[1., 2., 3.],
+    ...                  [2., 3., 4.],
+    ...                  [5., 6., 7.]])
+    >>> labels = np.array([[0, 0, 1],
+    ...                    [0, 1, 0],
+    ...                    [1, 0, 2]])
+    >>> naive_grouped_rowwise_apply(data, labels, lambda row: row - row.min())
+    array([[ 0.,  1.,  0.],
+           [ 0.,  0.,  2.],
+           [ 0.,  0.,  0.]])
+    >>> naive_grouped_rowwise_apply(data, labels, lambda row: row / row.sum())
+    array([[ 0.33333333,  0.66666667,  1.        ],
+           [ 0.33333333,  1.        ,  0.66666667],
+           [ 1.        ,  1.        ,  1.        ]])
+    """
+    if out is None:
+        out = np.empty_like(data)
+
+    for (row, label_row, out_row) in zip(data, group_labels, out):
+        for label in np.unique(label_row):
+            locs = (label_row == label)
+            out_row[locs] = func(row[locs])
+    return out
@@ -1,7 +1,7 @@
 from __future__ import print_function
 from zipline.assets import AssetFinder

-from .classifier import Classifier
+from .classifiers import Classifier
 from .engine import SimplePipelineEngine
 from .factors import Factor, CustomFactor
 from .filters import Filter
@@ -0,0 +1,9 @@
+from .classifier import Classifier, CustomClassifier, Everything
+from latest import Latest
+
+__all__ = [
+    'Classifier',
+    'CustomClassifier',
+    'Everything',
+    'Latest',
+]
@@ -0,0 +1,54 @@
+"""
+classifier.py
+"""
+from numpy import zeros, where
+
+from zipline.errors import UnsupportedDataType
+from zipline.pipeline.term import ComputableTerm
+from zipline.utils.numpy_utils import int64_dtype
+
+from ..mixins import CustomTermMixin, PositiveWindowLengthMixin
+
+
+class Classifier(ComputableTerm):
+
+    def _validate(self):
+        # Run superclass validation first so that we handle `dtype not passed`
+        # before this.
+        retval = super(Classifier, self)._validate()
+        # TODO: Support strings here.
+        if self.dtype != int64_dtype:
+            raise UnsupportedDataType(
+                typename=type(self).__name__,
+                dtype=self.dtype
+            )
+        return retval
+
+
+class Everything(Classifier):
+    """
+    A trivial classifier that classifies everything the same.
+    """
+    dtype = int64_dtype
+    window_length = 0
+    inputs = ()
+    missing_value = -1
+
+    def _compute(self, arrays, dates, assets, mask):
+        return where(
+            mask,
+            zeros(shape=mask.shape, dtype=int64_dtype),
+            self.missing_value,
+        )
+
+
+class CustomClassifier(PositiveWindowLengthMixin, CustomTermMixin, Classifier):
+    """
+    Base class for user-defined Classifiers.
+
+    See Also
+    --------
+    zipline.pipeline.CustomFactor
+    zipline.pipeline.CustomFilter
+    """
+    pass
@@ -0,0 +1,29 @@
+"""
+Classifier that produces the most most recently-known value of a
+integer-valued column.
+"""
+from zipline.utils.numpy_utils import int64_dtype
+
+from .classifier import CustomClassifier
+from ..mixins import SingleInputMixin
+
+
+class Latest(SingleInputMixin, CustomClassifier):
+    """
+    Filter producing the most recently-known value of `inputs[0]` on each day.
+    """
+    window_length = 1
+
+    def compute(self, today, assets, out, data):
+        out[:] = data[-1]
+
+    def _validate(self):
+        if self.inputs[0].dtype != int64_dtype:
+            raise TypeError(
+                "{name} expected an input of dtype int64, "
+                "but got {not_bool} instead.".format(
+                    name=type(self).__name__,
+                    not_bool=self.inputs[0].dtype,
+                )
+            )
+        super(Latest, self)._validate()
@@ -16,6 +16,7 @@ from zipline.pipeline.term import (
 from zipline.utils.input_validation import ensure_dtype
 from zipline.utils.numpy_utils import (
    bool_dtype,
+    int64_dtype,
    NoDefaultMissingValue,
 )
 from zipline.utils.preprocess import preprocess
@@ -93,16 +94,20 @@ class BoundColumn(LoadableTerm):
    A column of data that's been concretely bound to a particular dataset.

    Instances of this class are dynamically created upon access to attributes
-    of DataSets.
+    of DataSets (for example, USEquityPricing.close is an instance of this
+    class).

    Attributes
    ----------
    dtype : numpy.dtype
        The dtype of data produced when this column is loaded.
    latest : zipline.pipeline.data.Factor or zipline.pipeline.data.Filter
-        A Filter/Factor computing the most recently known value of this column
-        on each date. Produces a Filter if self.dtype == ``np.bool_``,
-        otherwise produces a Factor.
+        A Filter, Factor, or Classifier computing the most recently known value
+        of this column on each date.
+
+        Produces a Filter if self.dtype == ``np.bool_``.
+        Produces a Classifier if self.dtype == ``np.int64``
+        Otherwise produces a Factor.
    dataset : zipline.pipeline.data.DataSet
        The dataset to which this column is bound.
    name : str
@@ -162,6 +167,8 @@ class BoundColumn(LoadableTerm):
    def latest(self):
        if self.dtype == bool_dtype:
            from zipline.pipeline.filters import Latest
+        elif self.dtype == int64_dtype:
+            from zipline.pipeline.classifiers import Latest
        else:
            from zipline.pipeline.factors import Latest
        return Latest(
@@ -5,20 +5,27 @@ from functools import wraps
 from operator import attrgetter
 from numbers import Number

-from numpy import inf
+from numpy import inf, where, nanstd
 from toolz import curry

 from zipline.errors import (
    UnknownRankMethod,
    UnsupportedDataType,
 )
+from zipline.lib.normalize import naive_grouped_rowwise_apply
 from zipline.lib.rank import masked_rankdata_2d
+from zipline.pipeline.classifiers import Classifier, Everything
 from zipline.pipeline.mixins import (
    CustomTermMixin,
    PositiveWindowLengthMixin,
    SingleInputMixin,
 )
-from zipline.pipeline.term import ComputableTerm, NotSpecified, Term
+from zipline.pipeline.term import (
+    ComputableTerm,
+    NotSpecified,
+    NotSpecifiedType,
+    Term,
+)
 from zipline.pipeline.expression import (
    BadBinaryOperator,
    COMPARISONS,
@@ -31,11 +38,13 @@ from zipline.pipeline.expression import (
    unary_op_name,
 )
 from zipline.pipeline.filters import (
+    Filter,
    NumExprFilter,
    PercentileFilter,
    NullFilter,
 )
-from zipline.utils.control_flow import nullctx
+from zipline.utils.input_validation import expect_types
+from zipline.utils.math_utils import nanmean
 from zipline.utils.numpy_utils import (
    bool_dtype,
    coerce_to_dtype,
@@ -43,6 +52,7 @@ from zipline.utils.numpy_utils import (
    float64_dtype,
    int64_dtype,
 )
+from zipline.utils.preprocess import preprocess


 _RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal'])
@@ -319,26 +329,67 @@ def function_application(func):
    return mathfunc


-def if_not_float64_tell_caller_to_use_isnull(f):
+def restrict_to_dtype(dtype, message_template):
    """
-    Factor method decorator that checks if self.dtype if float64.
+    A factory for decorators that restricting Factor methods to only be
+    callable on Factors with a specific dtype.

-    If the factor instance is of another dtype, this raises a TypeError
-    directing the user to `isnull` or `notnull` instead.
+    This is conceptually similar to
+    zipline.utils.input_validation.expect_dtypes, but provides more flexibility
+    for providing error messages that are specifically targeting Factor
+    methods.
+
+    Parameters
+    ----------
+    dtype : numpy.dtype
+        The dtype on which the decorated method may be called.
+    message_template : str
+        A template for the error message to be raised.
+        `message_template.format` will be called with keyword arguments
+        `method_name`, `expected_dtype`, and `received_dtype`.
+
+    Usage
+    -----
+    @restrict_to_dtype(
+        dtype=float64_dtype,
+        message_template=(
+            "{method_name}() was called on a factor of dtype {received_dtype}."
+            "{method_name}() requires factors of dtype{expected_dtype}."
+
+        ),
+    )
+    def some_factor_method(self, ...):
+        self.stuff_that_requires_being_float64(...)
    """
-    @wraps(f)
-    def wrapped_method(self):
-        if self.dtype != float64_dtype:
+    def processor(factor_method, _, factor_instance):
+        factor_dtype = factor_instance.dtype
+        if factor_dtype != dtype:
            raise TypeError(
-                "{meth}() was called on a factor of dtype {dtype}.\n"
-                "{meth}() is only defined for dtype float64."
-                "To filter missing data, use isnull() or notnull().".format(
-                    meth=f.__name__,
-                    dtype=self.dtype,
-                ),
+                message_template.format(
+                    method_name=factor_method.__name__,
+                    expected_dtype=dtype.name,
+                    received_dtype=factor_dtype,
+                )
            )
-        return f(self)
-    return wrapped_method
+        return factor_instance
+    return preprocess(self=processor)
+
+if_not_float64_tell_caller_to_use_isnull = restrict_to_dtype(
+    dtype=float64_dtype,
+    message_template=(
+        "{method_name}() was called on a factor of dtype {received_dtype}.\n"
+        "{method_name}() is only defined for dtype {expected_dtype}."
+        "To filter missing data, use isnull() or notnull()."
+    )
+)
+
+float64_only = restrict_to_dtype(
+    dtype=float64_dtype,
+    message_template=(
+        "{method_name}() is only defined on Factors of dtype {expected_dtype},"
+        " but it was called on a Factor of dtype {received_dtype}."
+    )
+)


 FACTOR_DTYPES = frozenset([datetime64ns_dtype, float64_dtype, int64_dtype])
@@ -395,6 +446,190 @@ class Factor(ComputableTerm):
            )
        return retval

+    @expect_types(
+        mask=(Filter, NotSpecifiedType),
+        groupby=(Classifier, NotSpecifiedType),
+    )
+    @float64_only
+    def demean(self, mask=NotSpecified, groupby=NotSpecified):
+        """
+        Construct a Factor that computes ``self`` and subtracts the mean from
+        row of the result.
+
+        If ``mask`` is supplied, ignore values where ``mask`` returns False
+        when computing row means, and output NaN anywhere the mask is False.
+
+        If ``groupby`` is supplied, compute by partitioning each row based on
+        the values produced by ``groupby``, de-meaning the partitioned arrays,
+        and stitching the sub-results back together.
+
+        Parameters
+        ----------
+        mask : zipline.pipeline.Filter, optional
+            A Filter defining values to ignore when computing means.
+        groupby : zipline.pipeline.Classifier, optional
+            A classifier defining partitions over which to compute means.
+
+        Example
+        -------
+        Let ``f`` be a Factor which would produce the following output::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13    1.0    2.0    3.0    4.0
+            2017-03-14    1.5    2.5    3.5    1.0
+            2017-03-15    2.0    3.0    4.0    1.5
+            2017-03-16    2.5    3.5    1.0    2.0
+
+        Let ``c`` be a Classifier producing the following output::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13      1      1      2      2
+            2017-03-14      1      1      2      2
+            2017-03-15      1      1      2      2
+            2017-03-16      1      1      2      2
+
+        Let ``m`` be a Filter producing the following output::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13  False   True   True   True
+            2017-03-14   True  False   True   True
+            2017-03-15   True   True  False   True
+            2017-03-16   True   True   True  False
+
+        Then ``f.demean()`` will subtract the mean from each row produced by
+        ``f``.
+
+        ::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13 -1.500 -0.500  0.500  1.500
+            2017-03-14 -0.625  0.375  1.375 -1.125
+            2017-03-15 -0.625  0.375  1.375 -1.125
+            2017-03-16  0.250  1.250 -1.250 -0.250
+
+        ``f.demean(mask=m)`` will subtract the mean from each row, but means
+        will be calculated ignoring values on the diagonal, and NaNs will
+        written to the diagonal in the output. Diagonal values are ignored
+        because they are the locations where the mask ``m`` produced False.
+
+        ::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13    NaN -1.000  0.000  1.000
+            2017-03-14 -0.500    NaN  1.500 -1.000
+            2017-03-15 -0.166  0.833    NaN -0.666
+            2017-03-16  0.166  1.166 -1.333    NaN
+
+        ``f.demean(groupby=c)`` will subtract the group-mean of AAPL/MSFT and
+        MCD/BK from their respective entries.  The AAPL/MSFT are grouped
+        together because both assets always produce 1 in the output of the
+        classifier ``c``.  Similarly, MCD/BK are grouped together because they
+        always produce 2.
+
+        ::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13 -0.500  0.500 -0.500  0.500
+            2017-03-14 -0.500  0.500  1.250 -1.250
+            2017-03-15 -0.500  0.500  1.250 -1.250
+            2017-03-16 -0.500  0.500 -0.500  0.500
+
+        ``f.demean(mask=m, groupby=c)`` will also subtract the group-mean of
+        AAPL/MSFT and MCD/BK, but means will be calculated ignoring values on
+        the diagonal , and NaNs will be written to the diagonal in the output.
+
+        ::
+
+                         AAPL   MSFT    MCD     BK
+            2017-03-13    NaN  0.000 -0.500  0.500
+            2017-03-14  0.000    NaN  1.250 -1.250
+            2017-03-15 -0.500  0.500    NaN  0.000
+            2017-03-16 -0.500  0.500  0.000    NaN
+
+        Notes
+        -----
+        Mean is sensitive to the magnitudes of outliers. When working with
+        factor that can potentially produce large outliers, it is often useful
+        to use the ``mask`` parameter to discard values at the extremes of the
+        distribution::
+
+            >>> base = MyFactor(...)
+            >>> normalized = base.demean(mask=base.percentile_between(1, 99))
+
+        ``demean()`` is only supported on Factors of dtype float64.
+
+        See Also
+        --------
+        :meth:`pandas.DataFrame.groupby`
+        """
+        return GroupedRowTransform(
+            transform=lambda row: row - nanmean(row),
+            factor=self,
+            mask=mask,
+            groupby=groupby,
+        )
+
+    @expect_types(
+        mask=(Filter, NotSpecifiedType),
+        groupby=(Classifier, NotSpecifiedType),
+    )
+    @float64_only
+    def zscore(self, mask=NotSpecified, groupby=NotSpecified):
+        """
+        Construct a Factor that Z-Scores each day's results.
+
+        The Z-Score of a row is defined as::
+
+            (row - row.mean()) / row.stddev()
+
+        If ``mask`` is supplied, ignore values where ``mask`` returns False
+        when computing row means and standard deviations, and output NaN
+        anywhere the mask is False.
+
+        If ``groupby`` is supplied, compute by partitioning each row based on
+        the values produced by ``groupby``, z-scoring the partitioned arrays,
+        and stitching the sub-results back together.
+
+        Parameters
+        ----------
+        mask : zipline.pipeline.Filter, optional
+            A Filter defining values to ignore when Z-Scoring.
+        groupby : zipline.pipeline.Classifier, optional
+            A classifier defining partitions over which to compute Z-Scores.
+
+        Returns
+        -------
+        zscored : zipline.pipeline.Factor
+            A Factor producing that z-scores the output of self.
+
+        Notes
+        -----
+        Mean and standard deviation are sensitive to the magnitudes of
+        outliers. When working with factor that can potentially produce large
+        outliers, it is often useful to use the ``mask`` parameter to discard
+        values at the extremes of the distribution::
+
+            >>> base = MyFactor(...)
+            >>> normalized = base.zscore(mask=base.percentile_between(1, 99))
+
+        ``zscore()`` is only supported on Factors of dtype float64.
+
+        Example
+        -------
+        See :meth:`~zipline.pipeline.factors.Factor.demean` for an in-depth
+        example of the semantics for ``mask`` and ``groupby``.
+
+        See Also
+        --------
+        :meth:`pandas.DataFrame.groupby`
+        """
+        return GroupedRowTransform(
+            transform=lambda row: (row - nanmean(row)) / nanstd(row),
+            factor=self,
+            mask=mask,
+            groupby=groupby,
+        )
+
    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
@@ -431,9 +666,8 @@ class Factor(ComputableTerm):

        See Also
        --------
-        scipy.stats.rankdata
-        zipline.lib.rank.masked_rankdata_2d
-        zipline.pipeline.factors.factor.Rank
+        :func:`scipy.stats.rankdata`
+        :class:`zipline.pipeline.factors.factor.Rank`
        """
        return Rank(self, method=method, ascending=ascending, mask=mask)

@@ -592,6 +826,90 @@ class NumExprFactor(NumericalExpression, Factor):
    pass


+class GroupedRowTransform(Factor):
+    """
+    A Factor that transforms an input factor by applying a row-wise
+    shape-preserving transformation on classifier-defined groups of that
+    Factor.
+
+    This is most often useful for normalization operators like ``zscore`` or
+    ``demean``.
+
+    Parameters
+    ----------
+    transform : function[ndarray[ndim=1] -> ndarray[ndim=1]]
+        Function to apply over each row group.
+    factor : zipline.pipeline.Factor
+        The factor providing baseline data to transform.
+    mask : zipline.pipeline.Filter
+        Mask of entries to ignore when calculating transforms.
+    groupby : zipline.pipeline.Classifier
+        Classifier partitioning ``factor`` into groups to use when calculating
+        means.
+
+    Notes
+    -----
+    Users should rarely construct instances of this factor directly.  Instead,
+    they should construct instances via factor normalization methods like
+    ``zscore`` and ``demean``.
+
+    See Also
+    --------
+    zipline.pipeline.factors.Factor.zscore
+    zipline.pipeline.factors.Factor.demean
+    """
+    window_length = 0
+
+    def __new__(cls, transform, factor, mask, groupby):
+
+        if mask is NotSpecified:
+            mask = factor.mask
+        else:
+            mask = mask & factor.mask
+
+        if groupby is NotSpecified:
+            groupby = Everything(mask=mask)
+
+        return super(GroupedRowTransform, cls).__new__(
+            GroupedRowTransform,
+            transform=transform,
+            inputs=(factor, groupby),
+            missing_value=factor.missing_value,
+            mask=mask,
+            dtype=factor.dtype,
+        )
+
+    def _init(self, transform, *args, **kwargs):
+        self._transform = transform
+        return super(GroupedRowTransform, self)._init(*args, **kwargs)
+
+    @classmethod
+    def static_identity(cls, transform, *args, **kwargs):
+        return (
+            super(GroupedRowTransform, cls).static_identity(*args, **kwargs),
+            transform,
+        )
+
+    def _compute(self, arrays, dates, assets, mask):
+        data = arrays[0]
+        null_group_value = self.inputs[1].missing_value
+        group_labels = where(
+            mask,
+            arrays[1],
+            null_group_value,
+        )
+
+        return where(
+            group_labels != null_group_value,
+            naive_grouped_rowwise_apply(
+                data=data,
+                group_labels=group_labels,
+                func=self._transform,
+            ),
+            self.missing_value,
+        )
+
+
 class Rank(SingleInputMixin, Factor):
    """
    A Factor representing the row-wise rank data of another Factor.
@@ -607,8 +925,8 @@ class Rank(SingleInputMixin, Factor):

    See Also
    --------
-    scipy.stats.rankdata : Underlying ranking algorithm.
-    zipline.factors.Factor.rank : Method-style interface to same functionality.
+    :func:`scipy.stats.rankdata`
+    :class:`Factor.rank`

    Notes
    -----
@@ -778,4 +1096,3 @@ class CustomFactor(PositiveWindowLengthMixin, CustomTermMixin, Factor):
        median_low15 = MedianValue([USEquityPricing.low], window_length=15)
    '''
    dtype = float64_dtype
-    ctx = nullctx()
@@ -182,7 +182,7 @@ class NullFilter(SingleInputMixin, Filter):

    Parameters
    ----------
-    factor : zipline.pipeline.factor.Factor
+    factor : zipline.pipeline.Factor
        The factor to compare against its missing_value.
    """
    window_length = 0
@@ -2,6 +2,8 @@
 Mixins classes for use with Filters and Factors.
 """
 from numpy import full_like
+
+from zipline.utils.control_flow import nullctx
 from zipline.errors import WindowLengthNotPositive

 from .term import NotSpecified
@@ -43,6 +45,8 @@ class CustomTermMixin(object):

    Used by CustomFactor, CustomFilter, CustomClassifier, etc.
    """
+    ctx = nullctx()
+
    def __new__(cls,
                inputs=NotSpecified,
                window_length=NotSpecified,
@@ -28,6 +28,8 @@ NotSpecified = sentinel(
    'Singleton sentinel value used for Term defaults.',
 )

+NotSpecifiedType = type(NotSpecified)
+

 class Term(with_metaclass(ABCMeta, object)):
    """