From ba590266c98f15cd77dd2c7ac938c55f4e451b59 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 11:49:37 -0400
Subject: [PATCH 01/11] MAINT: Simplify ArrayPredicate.

Just use `params` instead of custom `_init` and `_static_identity`.
---
 zipline/pipeline/filters/filter.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/zipline/pipeline/filters/filter.py b/zipline/pipeline/filters/filter.py
index 523c8927..be1959f9 100644
--- a/zipline/pipeline/filters/filter.py
+++ b/zipline/pipeline/filters/filter.py
@@ -432,6 +432,7 @@ class ArrayPredicate(SingleInputMixin, Filter):
     opargs : tuple[hashable]
         Additional argument to apply to ``op``.
     """
+    params = ('op', 'opargs')
     window_length = 0
 
     @expect_types(term=Term, opargs=tuple)
@@ -445,22 +446,10 @@ class ArrayPredicate(SingleInputMixin, Filter):
             mask=term.mask,
         )
 
-    def _init(self, op, opargs, *args, **kwargs):
-        self._op = op
-        self._opargs = opargs
-        return super(ArrayPredicate, self)._init(*args, **kwargs)
-
-    @classmethod
-    def _static_identity(cls, op, opargs, *args, **kwargs):
-        return (
-            super(ArrayPredicate, cls)._static_identity(*args, **kwargs),
-            op,
-            opargs,
-        )
-
     def _compute(self, arrays, dates, assets, mask):
+        params = self.params
         data = arrays[0]
-        return self._op(data, *self._opargs) & mask
+        return params['op'](data, *params['opargs']) & mask
 
 
 class Latest(LatestMixin, CustomFilter):

From 8b17b7d4b3722864a26fc738b580235d79558bc1 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 11:50:37 -0400
Subject: [PATCH 02/11] MAINT: Add validator to `validate_column`.

Who validates the validators?
---
 zipline/pipeline/pipeline.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/zipline/pipeline/pipeline.py b/zipline/pipeline/pipeline.py
index 78ee5ed5..e7c6b4de 100644
--- a/zipline/pipeline/pipeline.py
+++ b/zipline/pipeline/pipeline.py
@@ -229,6 +229,7 @@ class Pipeline(object):
             raise AssertionError("Unknown graph format %r." % format)
 
     @staticmethod
+    @expect_types(term=Term, column_name=str)
     def validate_column(column_name, term):
         if term.ndim == 1:
             raise UnsupportedPipelineOutput(column_name=column_name, term=term)

From e995e6f2edbc895040c4c84a7baeef471ef225b9 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 11:50:59 -0400
Subject: [PATCH 03/11] ENH: Add `relabel` method to string classifiers.

- Adds a `map` method to `LabelArray` that maps a unary function over
  the categories of a LabelArray, shrinking the underlying codes if
  possible.

- Adds a new `.relabel` method to string-dtype classifiers that maps a
  unary function over the unique elements of the underlying LabelArray.
  This is useful for things like cleaning noisy label data.
---
 tests/pipeline/test_classifier.py          |  64 +++++++++++
 tests/test_labelarray.py                   | 126 +++++++++++++++++++++
 zipline/lib/_factorize.pyx                 |  31 ++---
 zipline/lib/labelarray.py                  |  60 ++++++++++
 zipline/pipeline/classifiers/classifier.py |  66 ++++++++++-
 5 files changed, 332 insertions(+), 15 deletions(-)

diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py
index a716c19d..e70479ea 100644
--- a/tests/pipeline/test_classifier.py
+++ b/tests/pipeline/test_classifier.py
@@ -468,6 +468,70 @@ class ClassifierTestCase(BasePipelineTestCase):
         )
         self.assertEqual(errmsg, expected)
 
+    @parameter_space(
+        __fail_fast=True,
+        labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
+        relabel_func=[
+            lambda s: s[0],
+            lambda s: str(len(s)),
+            lambda s: str(len([c for c in s if c == 'a'])),
+            lambda s: None,
+        ]
+    )
+    def test_relabel_strings(self, relabel_func, labelarray_dtype):
+
+        class C(Classifier):
+            inputs = ()
+            dtype = categorical_dtype
+            missing_value = None
+            window_length = 0
+
+        c = C()
+
+        raw = np.asarray(
+            [['a', 'aa', 'aaa', 'abab'],
+             ['bab', 'aba', 'aa', 'bb'],
+             ['a', 'aba', 'abaa', 'abaab'],
+             ['a', 'aa', 'aaa', 'aaaa']],
+            dtype=labelarray_dtype,
+        )
+        raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw)
+
+        data = LabelArray(raw, missing_value=None)
+
+        terms = {
+            'relabeled': c.relabel(relabel_func),
+        }
+        expected_results = {
+            'relabeled': LabelArray(raw_relabeled, missing_value=None),
+        }
+
+        self.check_terms(
+            terms,
+            expected_results,
+            initial_workspace={c: data},
+            mask=self.build_mask(self.ones_mask(shape=data.shape)),
+        )
+
+    def test_relabel_int_classifier_not_yet_supported(self):
+        class C(Classifier):
+            inputs = ()
+            dtype = int64_dtype
+            missing_value = -1
+            window_length = 0
+
+        c = C()
+
+        with self.assertRaises(TypeError) as e:
+            c.relabel(lambda x: 0 / 0)  # Function should never be called.
+
+        result = str(e.exception)
+        expected = (
+            "relabel() is only defined on Classifiers producing strings "
+            "but it was called on a Classifier of dtype int64."
+        )
+        self.assertEqual(result, expected)
+
 
 class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
     def test_reversability_categorical(self):
diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py
index 854da975..6d5681b6 100644
--- a/tests/test_labelarray.py
+++ b/tests/test_labelarray.py
@@ -109,6 +109,65 @@ class LabelArrayTestCase(ZiplineTestCase):
             np_contains(strs) & notmissing,
         )
 
+    @parameter_space(
+        __fail_fast=True,
+        f=[
+            lambda s: str(len(s)),
+            lambda s: s[0],
+            lambda s: ''.join(reversed(s)),
+            lambda s: '',
+        ]
+    )
+    def test_map(self, f):
+        data = np.array(
+            [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
+             ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
+             ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
+             ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
+            dtype=object,
+        )
+        la = LabelArray(data, missing_value=None)
+
+        numpy_transformed = np.vectorize(f)(data)
+        la_transformed = la.map(f).as_string_array()
+
+        assert_equal(numpy_transformed, la_transformed)
+
+    def test_map_ignores_missing_value(self):
+        data = np.array(['A', 'B', 'C'], dtype=object)
+        la = LabelArray(data, missing_value='A')
+
+        def increment_char(c):
+            return chr(ord(c) + 1)
+
+        result = la.map(increment_char)
+        expected = LabelArray(['A', 'C', 'D'], missing_value='A')
+        assert_equal(result.as_string_array(), expected.as_string_array())
+
+    @parameter_space(
+        __fail_fast=True,
+        f=[
+            lambda s: 0,
+            lambda s: 0.0,
+            lambda s: object(),
+        ]
+    )
+    def test_map_requires_f_to_return_a_string(self, f):
+        la = LabelArray(self.strs, missing_value=None)
+
+        with self.assertRaises(TypeError):
+            la.map(f)
+
+    def test_map_can_only_return_none_if_missing_value_is_none(self):
+
+        # Should work.
+        la = LabelArray(self.strs, missing_value=None)
+        la.map(lambda x: None)
+
+        la = LabelArray(self.strs, missing_value="__MISSING__")
+        with self.assertRaises(TypeError):
+            la.map(lambda x: None)
+
     @parameter_space(
         __fail_fast=True,
         missing_value=('', 'a', 'not in the array', None),
@@ -436,6 +495,73 @@ class LabelArrayTestCase(ZiplineTestCase):
         assert_equal(arr.itemsize, 2)
         self.check_roundtrip(arr)
 
+    def test_map_shrinks_code_storage_if_possible(self):
+        arr = LabelArray(
+            # Drop the last value so we fit in a uint16 with None as a missing
+            # value.
+            self.create_categories(16, plus_one=False)[:-1],
+            missing_value=None,
+        )
+
+        self.assertEqual(arr.itemsize, 2)
+
+        def either_A_or_B(s):
+            return ('A', 'B')[sum(ord(c) for c in s) % 2]
+
+        result = arr.map(either_A_or_B)
+
+        self.assertEqual(set(result.categories), {'A', 'B', None})
+        self.assertEqual(result.itemsize, 1)
+
+        assert_equal(
+            np.vectorize(either_A_or_B)(arr.as_string_array()),
+            result.as_string_array(),
+        )
+
+    def test_map_never_increases_code_storage_size(self):
+        # This tests a pathological case where a user maps an impure function
+        # that returns a different label on every invocation, which in a naive
+        # implementation could cause us to need to **increase** the size of our
+        # codes after a map.
+        #
+        # This doesn't happen, however, because we guarantee that the user's
+        # mapping function will be called on each unique category exactly once,
+        # which means we can never increase the number of categories in the
+        # LabelArray after mapping.
+
+        # Using all but one of the categories so that we still fit in a uint8
+        # with an extra category for None as a missing value.
+        categories = self.create_categories(8, plus_one=False)[:-1]
+
+        larger_categories = self.create_categories(16, plus_one=False)
+
+        # Double the length of the categories so that we have to increase the
+        # required size after our map.
+        categories_twice = categories + categories
+
+        arr = LabelArray(categories_twice, missing_value=None)
+        assert_equal(arr.itemsize, 1)
+
+        gen_unique_categories = iter(larger_categories)
+
+        def new_string_every_time(c):
+            # Return a new unique category every time so that every result is
+            # different.
+            return next(gen_unique_categories)
+
+        result = arr.map(new_string_every_time)
+
+        # Result should still be of size 1.
+        assert_equal(result.itemsize, 1)
+
+        # Result should be the first `len(categories)` entries from the larger
+        # categories, repeated twice.
+        expected = LabelArray(
+            larger_categories[:len(categories)] * 2,
+            missing_value=None,
+        )
+        assert_equal(result.as_string_array(), expected.as_string_array())
+
     def manual_narrow_condense_back_to_valid_size_slow(self):
         """This test is really slow so we don't want it run by default.
         """
diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx
index 463b3dc6..e14a2039 100644
--- a/zipline/lib/_factorize.pyx
+++ b/zipline/lib/_factorize.pyx
@@ -14,6 +14,23 @@ cdef inline double log2(double d):
     return log(d) / log(2);
 
 
+cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
+    """Choose the smallest numpy unsigned int dtype that can hold ``size``.
+    """
+    if maxval < 1:
+        # lim x -> 0 log2(x) == -infinity so we floor at uint8
+        return np.uint8
+    else:
+        # The number of bits required to hold the codes up to ``maxval`` is
+        # log2(maxval). The number of bits per byte is 8. We cannot have
+        # fractional bytes so we need to round up. Finally, we can only have
+        # integers with widths 1, 2, 4, or 8 so we need to round up to the
+        # next value by looking up the next largest size in ``_int_sizes``.
+        return unsigned_int_dtype_with_size_in_bytes(
+            _int_sizes[int(np.ceil(log2(maxval) / 8))]
+        )
+
+
 ctypedef fused unsigned_integral:
     np.uint8_t
     np.uint16_t
@@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values,
         raise ValueError('nvalues larger than uint64')
 
     length = len(categories_array)
-    if length < 1:
-        # lim x -> 0 log2(x) == -infinity so we floor at uint8
-        narrowest_dtype = np.uint8
-    else:
-        # The number of bits required to hold the codes up to ``length`` is
-        # log2(length). The number of bits per bytes is 8. We cannot have
-        # fractional bytes so we need to round up. Finally, we can only have
-        # integers with widths 1, 2, 4, or 8 so so we need to round up to the
-        # next value by looking up the next largest size in ``_int_sizes``.
-        narrowest_dtype = unsigned_int_dtype_with_size_in_bytes(
-            _int_sizes[int(np.ceil(log2(length) / 8))]
-        )
-
+    narrowest_dtype = smallest_uint_that_can_hold(length)
     if codes.dtype != narrowest_dtype:
         # condense the codes down to the narrowest dtype possible
         codes = codes.astype(narrowest_dtype)
diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py
index c9db21e6..68358d81 100644
--- a/zipline/lib/labelarray.py
+++ b/zipline/lib/labelarray.py
@@ -29,6 +29,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
 from ._factorize import (
     factorize_strings,
     factorize_strings_known_categories,
+    smallest_uint_that_can_hold,
 )
 
 
@@ -136,6 +137,7 @@ class LabelArray(ndarray):
     http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
     """
     SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
+    SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)
 
     @preprocess(
         values=coerce(list, partial(np.asarray, dtype=object)),
@@ -565,6 +567,64 @@ class LabelArray(ndarray):
         # locations in our indices.
         return results[self.as_int_array()]
 
+    def map(self, f):
+        """
+        Map a function from str -> str element-wise over ``self``.
+
+        ``f`` will be applied exactly once to each non-missing unique value in
+        ``self``. Missing values will always map to ``self.missing_value``.
+        """
+        # f() should only return None if None is our missing value.
+        if self.missing_value is None:
+            allowed_outtypes = self.SUPPORTED_SCALAR_TYPES
+        else:
+            allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES
+
+        def f_to_use(x,
+                     missing_value=self.missing_value,
+                     otypes=allowed_outtypes):
+
+            if x == missing_value:
+                return x
+
+            ret = f(x)
+
+            if not isinstance(ret, otypes):
+                raise TypeError(
+                    "Expected f() to return a string. Got %s." % (
+                        type(ret).__name__
+                    )
+                )
+
+            return ret
+
+        new_categories_with_duplicates = (
+            np.vectorize(f_to_use, otypes=[object])(self.categories)
+        )
+
+        # If f() maps multiple inputs to the same output, then we can end up
+        # with the same code duplicated multiple times. Compress the categories
+        # by running them through np.unique, and then use the reverse lookup
+        # table to compress codes as well.
+        new_categories, bloated_reverse_index = np.unique(
+            new_categories_with_duplicates,
+            return_inverse=True
+        )
+
+        # `reverse_index` will always be a 64 bit integer even if we can hold a
+        # smaller array.
+        reverse_index = bloated_reverse_index.astype(
+            smallest_uint_that_can_hold(len(new_categories))
+        )
+        new_codes = np.take(reverse_index, self.as_int_array())
+
+        return self.from_codes_and_metadata(
+            new_codes,
+            new_categories,
+            dict(zip(new_categories, range(len(new_categories)))),
+            missing_value=self.missing_value,
+        )
+
     def startswith(self, prefix):
         """
         Element-wise startswith.
diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py
index 810430fa..f3c2e22c 100644
--- a/zipline/pipeline/classifiers/classifier.py
+++ b/zipline/pipeline/classifiers/classifier.py
@@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype
 from zipline.pipeline.sentinels import NotSpecified
 from zipline.pipeline.term import ComputableTerm
 from zipline.utils.compat import unicode
-from zipline.utils.input_validation import expect_types
+from zipline.utils.input_validation import expect_types, expect_dtypes
 from zipline.utils.memoize import classlazyval
 from zipline.utils.numpy_utils import (
     categorical_dtype,
@@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype(
     dtype=categorical_dtype,
     message_template=(
         "{method_name}() is only defined on Classifiers producing strings"
-        " but it was called on a Factor of dtype {received_dtype}."
+        " but it was called on a Classifier of dtype {received_dtype}."
     )
 )
 
@@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
             opargs=(pattern,),
         )
 
+    # TODO: Support relabeling for integer dtypes.
+    @string_classifiers_only
+    def relabel(self, relabeler):
+        """
+        Convert ``self`` into a new classifier by mapping a function over each
+        element produced by ``self``.
+
+        Parameters
+        ----------
+        relabeler : function[str -> str or None]
+            A function to apply to each unique value produced by ``self``.
+
+        Returns
+        -------
+        relabeled : Classifier
+            A classifier produced by applying ``relabeler`` to each unique
+            value produced by ``self``.
+        """
+        return Relabel(term=self, relabeler=relabeler)
+
     def element_of(self, choices):
         """
         Construct a Filter indicating whether values are in ``choices``.
@@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier):
         return type(self).__name__ + '(%d)' % self.params['bins']
 
 
+class Relabel(SingleInputMixin, Classifier):
+    """
+    A classifier applying a relabeling function on the result of another
+    classifier.
+
+    Parameters
+    ----------
+    term : zipline.pipeline.Classifier
+        Term producing the input to be relabeled.
+    relabeler : function[str -> str or None]
+        Function to apply to each unique value produced by ``term``.
+    """
+    window_length = 0
+    params = ('relabeler',)
+
+    # TODO: Support relabeling for integer dtypes.
+    @expect_dtypes(term=categorical_dtype)
+    @expect_types(term=Classifier)
+    def __new__(cls, term, relabeler):
+        return super(Relabel, cls).__new__(
+            cls,
+            inputs=(term,),
+            dtype=term.dtype,
+            mask=term.mask,
+            relabeler=relabeler,
+        )
+
+    def _compute(self, arrays, dates, assets, mask):
+        relabeler = self.params['relabeler']
+        data = arrays[0]
+
+        if isinstance(data, LabelArray):
+            result = data.map(relabeler)
+            result[~mask] = data.missing_value
+        else:
+            raise TypeError(
+                "Relabeling is not currently supported for "
+                "int-dtype classifiers."
+            )
+        return result
+
+
 class CustomClassifier(PositiveWindowLengthMixin,
                        StandardOutputs,
                        CustomTermMixin,

From cfe4df8f2bd4bcde3478991dd7cec1ea628f9216 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 14:16:17 -0400
Subject: [PATCH 04/11] TEST: Test map ignores missing with None.

---
 tests/test_labelarray.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py
index 6d5681b6..de73eed9 100644
--- a/tests/test_labelarray.py
+++ b/tests/test_labelarray.py
@@ -133,15 +133,16 @@ class LabelArrayTestCase(ZiplineTestCase):
 
         assert_equal(numpy_transformed, la_transformed)
 
-    def test_map_ignores_missing_value(self):
-        data = np.array(['A', 'B', 'C'], dtype=object)
-        la = LabelArray(data, missing_value='A')
+    @parameter_space(missing=['A', None])
+    def test_map_ignores_missing_value(self, missing):
+        data = np.array([missing, 'B', 'C'], dtype=object)
+        la = LabelArray(data, missing_value=missing)
 
         def increment_char(c):
             return chr(ord(c) + 1)
 
         result = la.map(increment_char)
-        expected = LabelArray(['A', 'C', 'D'], missing_value='A')
+        expected = LabelArray([missing, 'C', 'D'], missing_value=missing)
         assert_equal(result.as_string_array(), expected.as_string_array())
 
     @parameter_space(

From a5794858252d91c8f846159b5716e776d6f6cbc0 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 15:23:27 -0400
Subject: [PATCH 05/11] MAINT: s/TypeError/NotImplementedError.

---
 zipline/pipeline/classifiers/classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py
index f3c2e22c..de5da2f0 100644
--- a/zipline/pipeline/classifiers/classifier.py
+++ b/zipline/pipeline/classifiers/classifier.py
@@ -437,7 +437,7 @@ class Relabel(SingleInputMixin, Classifier):
             result = data.map(relabeler)
             result[~mask] = data.missing_value
         else:
-            raise TypeError(
+            raise NotImplementedError(
                 "Relabeling is not currently supported for "
                 "int-dtype classifiers."
             )

From ad10349992f6da5444221df77c6b23aa1197d288 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 15:28:15 -0400
Subject: [PATCH 06/11] TEST: Test map returning None.

---
 tests/test_labelarray.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py
index de73eed9..4b001035 100644
--- a/tests/test_labelarray.py
+++ b/tests/test_labelarray.py
@@ -163,7 +163,12 @@ class LabelArrayTestCase(ZiplineTestCase):
 
         # Should work.
         la = LabelArray(self.strs, missing_value=None)
-        la.map(lambda x: None)
+        result = la.map(lambda x: None)
+
+        check_arrays(
+            result,
+            LabelArray(np.full_like(self.strs, None), missing_value=None),
+        )
 
         la = LabelArray(self.strs, missing_value="__MISSING__")
         with self.assertRaises(TypeError):

From e49f4c6149e6473c81eef19c9783372b68c61dc0 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 17:07:19 -0400
Subject: [PATCH 07/11] ENH: Improve error message on bad return.

---
 tests/test_labelarray.py  | 2 +-
 zipline/lib/labelarray.py | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py
index 4b001035..fc6673c9 100644
--- a/tests/test_labelarray.py
+++ b/tests/test_labelarray.py
@@ -153,7 +153,7 @@ class LabelArrayTestCase(ZiplineTestCase):
             lambda s: object(),
         ]
     )
-    def test_map_requires_f_to_return_a_string(self, f):
+    def test_map_requires_f_to_return_a_string_or_none(self, f):
         la = LabelArray(self.strs, missing_value=None)
 
         with self.assertRaises(TypeError):
diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py
index 68358d81..4db75672 100644
--- a/zipline/lib/labelarray.py
+++ b/zipline/lib/labelarray.py
@@ -591,8 +591,12 @@ class LabelArray(ndarray):
 
             if not isinstance(ret, otypes):
                 raise TypeError(
-                    "Expected f() to return a string. Got %s." % (
-                        type(ret).__name__
+                    "LabelArray.map expected function {f} to return a string"
+                    " or None, but got {type} instead.\n"
+                    "Value was {value}.".format(
+                        f=f.__name__,
+                        type=type(ret).__name__,
+                        value=ret,
                     )
                 )
 

From cbe52cea88b9a5b64272f9bce2a29776f3abbad7 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 17:32:47 -0400
Subject: [PATCH 08/11] BUG: Fix NoneType comparisons in PY3.

---
 tests/pipeline/test_classifier.py |  2 +-
 zipline/lib/labelarray.py         | 37 +++++++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py
index e70479ea..5bfc31b0 100644
--- a/tests/pipeline/test_classifier.py
+++ b/tests/pipeline/test_classifier.py
@@ -472,7 +472,7 @@ class ClassifierTestCase(BasePipelineTestCase):
         __fail_fast=True,
         labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
         relabel_func=[
-            lambda s: s[0],
+            lambda s: str(s[0]),
             lambda s: str(len(s)),
             lambda s: str(len([c for c in s if c == 'a'])),
             lambda s: None,
diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py
index 4db75672..bd3ce2d6 100644
--- a/zipline/lib/labelarray.py
+++ b/zipline/lib/labelarray.py
@@ -1,7 +1,7 @@
 """
 An ndarray subclass for working with arrays of strings.
 """
-from functools import partial
+from functools import partial, total_ordering
 from operator import eq, ne
 import re
 
@@ -584,8 +584,13 @@ class LabelArray(ndarray):
                      missing_value=self.missing_value,
                      otypes=allowed_outtypes):
 
+            # Don't call f on the missing value; those locations don't exist
+            # semantically. We return _sortable_sentinel rather than None
+            # because the np.unique call below sorts the categories array,
+            # which raises an error on Python 3 because None and str aren't
+            # comparable.
             if x == missing_value:
-                return x
+                return _sortable_sentinel
 
             ret = f(x)
 
@@ -600,6 +605,9 @@ class LabelArray(ndarray):
                     )
                 )
 
+            if ret == missing_value:
+                return _sortable_sentinel
+
             return ret
 
         new_categories_with_duplicates = (
@@ -610,14 +618,21 @@ class LabelArray(ndarray):
         # with the same code duplicated multiple times. Compress the categories
         # by running them through np.unique, and then use the reverse lookup
         # table to compress codes as well.
-        new_categories, bloated_reverse_index = np.unique(
+        new_categories, bloated_inverse_index = np.unique(
             new_categories_with_duplicates,
             return_inverse=True
         )
 
+        if new_categories[0] == _sortable_sentinel:
+            # f_to_use return _sortable_sentinel for locations that should be
+            # missing values in our output. Since np.unique returns the uniques
+            # in sorted order, and since _sortable_sentinel sorts before any
+            # string, we only need to check the first array entry.
+            new_categories[0] = self.missing_value
+
         # `reverse_index` will always be a 64 bit integer even if we can hold a
         # smaller array.
-        reverse_index = bloated_reverse_index.astype(
+        reverse_index = bloated_inverse_index.astype(
             smallest_uint_that_can_hold(len(new_categories))
         )
         new_codes = np.take(reverse_index, self.as_int_array())
@@ -714,3 +729,17 @@ class LabelArray(ndarray):
             element of self was an element of ``container``.
         """
         return self.map_predicate(container.__contains__)
+
+
+@total_ordering
+class _SortableSentinel(object):
+    """Dummy object that sorts before any other python object.
+    """
+    def __eq__(self, other):
+        return isinstance(other, _SortableSentinel)
+
+    def __lt__(self, other):
+        return not isinstance(other, _SortableSentinel)
+
+
+_sortable_sentinel = _SortableSentinel()

From 8a042937d9dd536495a6be26c7f1f30f5c9480c0 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 18:21:13 -0400
Subject: [PATCH 09/11] TST: Add test for missing values in relabel.

---
 tests/pipeline/test_classifier.py | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py
index 5bfc31b0..fa47df87 100644
--- a/tests/pipeline/test_classifier.py
+++ b/tests/pipeline/test_classifier.py
@@ -513,6 +513,58 @@ class ClassifierTestCase(BasePipelineTestCase):
             mask=self.build_mask(self.ones_mask(shape=data.shape)),
         )
 
+    @parameter_space(
+        __fail_fast=True,
+        missing_value=[None, 'M'],
+    )
+    def test_relabel_missing_value_interactions(self, missing_value):
+
+        mv = missing_value
+
+        class C(Classifier):
+            inputs = ()
+            dtype = categorical_dtype
+            missing_value = mv
+            window_length = 0
+
+        c = C()
+
+        def relabel_func(s):
+            if s == 'B':
+                return mv
+            return ''.join([s, s])
+
+        raw = np.asarray(
+            [['A', 'B', 'C', mv],
+             [mv, 'A', 'B', 'C'],
+             ['C', mv, 'A', 'B'],
+             ['B', 'C', mv, 'A']],
+            dtype=categorical_dtype,
+        )
+        data = LabelArray(raw, missing_value=mv)
+
+        expected_relabeled_raw = np.asarray(
+            [['AA', mv, 'CC', mv],
+             [mv, 'AA', mv, 'CC'],
+             ['CC', mv, 'AA', mv],
+             [mv, 'CC', mv, 'AA']],
+            dtype=categorical_dtype,
+        )
+
+        terms = {
+            'relabeled': c.relabel(relabel_func),
+        }
+        expected_results = {
+            'relabeled': LabelArray(expected_relabeled_raw, missing_value=mv),
+        }
+
+        self.check_terms(
+            terms,
+            expected_results,
+            initial_workspace={c: data},
+            mask=self.build_mask(self.ones_mask(shape=data.shape)),
+        )
+
     def test_relabel_int_classifier_not_yet_supported(self):
         class C(Classifier):
             inputs = ()

From 609ae752d0ea396e058caf76338cc3b3ade671b4 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 18:22:30 -0400
Subject: [PATCH 10/11] DOC: Fix typo in docstring.

---
 zipline/lib/_factorize.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx
index e14a2039..3b84662e 100644
--- a/zipline/lib/_factorize.pyx
+++ b/zipline/lib/_factorize.pyx
@@ -15,7 +15,7 @@ cdef inline double log2(double d):
 
 
 cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
-    """Choose the smallest numpy unsigned int dtype that can hold ``size``.
+    """Choose the smallest numpy unsigned int dtype that can hold ``maxval``.
     """
     if maxval < 1:
         # lim x -> 0 log2(x) == -infinity so we floor at uint8

From ec2ec49914950cc42b29a96d8ba09b34775a2c81 Mon Sep 17 00:00:00 2001
From: Scott Sanderson <ssanderson@quantopian.com>
Date: Wed, 7 Jun 2017 20:17:58 -0400
Subject: [PATCH 11/11] MAINT: Make _sortable_sentinel a singleton.

---
 zipline/lib/labelarray.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py
index bd3ce2d6..ff121ff9 100644
--- a/zipline/lib/labelarray.py
+++ b/zipline/lib/labelarray.py
@@ -11,6 +11,7 @@ import pandas as pd
 from toolz import compose
 
 from zipline.utils.compat import unicode
+from zipline.utils.functional import instance
 from zipline.utils.preprocess import preprocess
 from zipline.utils.sentinel import sentinel
 from zipline.utils.input_validation import (
@@ -623,7 +624,7 @@ class LabelArray(ndarray):
             return_inverse=True
         )
 
-        if new_categories[0] == _sortable_sentinel:
+        if new_categories[0] is _sortable_sentinel:
             # f_to_use return _sortable_sentinel for locations that should be
             # missing values in our output. Since np.unique returns the uniques
             # in sorted order, and since _sortable_sentinel sorts before any
@@ -731,15 +732,13 @@ class LabelArray(ndarray):
         return self.map_predicate(container.__contains__)
 
 
+@instance  # This makes _sortable_sentinel a singleton instance.
 @total_ordering
-class _SortableSentinel(object):
+class _sortable_sentinel(object):
     """Dummy object that sorts before any other python object.
     """
     def __eq__(self, other):
-        return isinstance(other, _SortableSentinel)
+        return self is other
 
     def __lt__(self, other):
-        return not isinstance(other, _SortableSentinel)
-
-
-_sortable_sentinel = _SortableSentinel()
+        return True