ENH: Add relabel method to string classifiers.

- Adds a `map` method to `LabelArray` that maps a unary function over the categories of a LabelArray, shrinking the underyling codes if possible. - Adds a new `.relabel` method to string-dtype classifiers that maps a unary function over the unique elements of the underlying LabelArray. This is useful for things like cleaning noisy label data.
2026-06-30 04:17:29 +08:00 · 2017-06-07 11:50:59 -04:00
parent 8b17b7d4b3
commit e995e6f2ed
5 changed files with 332 additions and 15 deletions
@@ -468,6 +468,70 @@ class ClassifierTestCase(BasePipelineTestCase):
        )
        self.assertEqual(errmsg, expected)

+    @parameter_space(
+        __fail_fast=True,
+        labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
+        relabel_func=[
+            lambda s: s[0],
+            lambda s: str(len(s)),
+            lambda s: str(len([c for c in s if c == 'a'])),
+            lambda s: None,
+        ]
+    )
+    def test_relabel_strings(self, relabel_func, labelarray_dtype):
+
+        class C(Classifier):
+            inputs = ()
+            dtype = categorical_dtype
+            missing_value = None
+            window_length = 0
+
+        c = C()
+
+        raw = np.asarray(
+            [['a', 'aa', 'aaa', 'abab'],
+             ['bab', 'aba', 'aa', 'bb'],
+             ['a', 'aba', 'abaa', 'abaab'],
+             ['a', 'aa', 'aaa', 'aaaa']],
+            dtype=labelarray_dtype,
+        )
+        raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw)
+
+        data = LabelArray(raw, missing_value=None)
+
+        terms = {
+            'relabeled': c.relabel(relabel_func),
+        }
+        expected_results = {
+            'relabeled': LabelArray(raw_relabeled, missing_value=None),
+        }
+
+        self.check_terms(
+            terms,
+            expected_results,
+            initial_workspace={c: data},
+            mask=self.build_mask(self.ones_mask(shape=data.shape)),
+        )
+
+    def test_relabel_int_classifier_not_yet_supported(self):
+        class C(Classifier):
+            inputs = ()
+            dtype = int64_dtype
+            missing_value = -1
+            window_length = 0
+
+        c = C()
+
+        with self.assertRaises(TypeError) as e:
+            c.relabel(lambda x: 0 / 0)  # Function should never be called.
+
+        result = str(e.exception)
+        expected = (
+            "relabel() is only defined on Classifiers producing strings "
+            "but it was called on a Classifier of dtype int64."
+        )
+        self.assertEqual(result, expected)
+

 class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
    def test_reversability_categorical(self):
@@ -109,6 +109,65 @@ class LabelArrayTestCase(ZiplineTestCase):
            np_contains(strs) & notmissing,
        )

+    @parameter_space(
+        __fail_fast=True,
+        f=[
+            lambda s: str(len(s)),
+            lambda s: s[0],
+            lambda s: ''.join(reversed(s)),
+            lambda s: '',
+        ]
+    )
+    def test_map(self, f):
+        data = np.array(
+            [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
+             ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
+             ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
+             ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
+            dtype=object,
+        )
+        la = LabelArray(data, missing_value=None)
+
+        numpy_transformed = np.vectorize(f)(data)
+        la_transformed = la.map(f).as_string_array()
+
+        assert_equal(numpy_transformed, la_transformed)
+
+    def test_map_ignores_missing_value(self):
+        data = np.array(['A', 'B', 'C'], dtype=object)
+        la = LabelArray(data, missing_value='A')
+
+        def increment_char(c):
+            return chr(ord(c) + 1)
+
+        result = la.map(increment_char)
+        expected = LabelArray(['A', 'C', 'D'], missing_value='A')
+        assert_equal(result.as_string_array(), expected.as_string_array())
+
+    @parameter_space(
+        __fail_fast=True,
+        f=[
+            lambda s: 0,
+            lambda s: 0.0,
+            lambda s: object(),
+        ]
+    )
+    def test_map_requires_f_to_return_a_string(self, f):
+        la = LabelArray(self.strs, missing_value=None)
+
+        with self.assertRaises(TypeError):
+            la.map(f)
+
+    def test_map_can_only_return_none_if_missing_value_is_none(self):
+
+        # Should work.
+        la = LabelArray(self.strs, missing_value=None)
+        la.map(lambda x: None)
+
+        la = LabelArray(self.strs, missing_value="__MISSING__")
+        with self.assertRaises(TypeError):
+            la.map(lambda x: None)
+
    @parameter_space(
        __fail_fast=True,
        missing_value=('', 'a', 'not in the array', None),
@@ -436,6 +495,73 @@ class LabelArrayTestCase(ZiplineTestCase):
        assert_equal(arr.itemsize, 2)
        self.check_roundtrip(arr)

+    def test_map_shrinks_code_storage_if_possible(self):
+        arr = LabelArray(
+            # Drop the last value so we fit in a uint16 with None as a missing
+            # value.
+            self.create_categories(16, plus_one=False)[:-1],
+            missing_value=None,
+        )
+
+        self.assertEqual(arr.itemsize, 2)
+
+        def either_A_or_B(s):
+            return ('A', 'B')[sum(ord(c) for c in s) % 2]
+
+        result = arr.map(either_A_or_B)
+
+        self.assertEqual(set(result.categories), {'A', 'B', None})
+        self.assertEqual(result.itemsize, 1)
+
+        assert_equal(
+            np.vectorize(either_A_or_B)(arr.as_string_array()),
+            result.as_string_array(),
+        )
+
+    def test_map_never_increases_code_storage_size(self):
+        # This tests a pathological case where a user maps an impure function
+        # that returns a different label on every invocation, which in a naive
+        # implementation could cause us to need to **increase** the size of our
+        # codes after a map.
+        #
+        # This doesn't happen, however, because we guarantee that the user's
+        # mapping function will be called on each unique category exactly once,
+        # which means we can never increase the number of categories in the
+        # LabelArray after mapping.
+
+        # Using all but one of the categories so that we still fit in a uint8
+        # with an extra category for None as a missing value.
+        categories = self.create_categories(8, plus_one=False)[:-1]
+
+        larger_categories = self.create_categories(16, plus_one=False)
+
+        # Double the length of the categories so that we have to increase the
+        # required size after our map.
+        categories_twice = categories + categories
+
+        arr = LabelArray(categories_twice, missing_value=None)
+        assert_equal(arr.itemsize, 1)
+
+        gen_unique_categories = iter(larger_categories)
+
+        def new_string_every_time(c):
+            # Return a new unique category every time so that every result is
+            # different.
+            return next(gen_unique_categories)
+
+        result = arr.map(new_string_every_time)
+
+        # Result should still be of size 1.
+        assert_equal(result.itemsize, 1)
+
+        # Result should be the first `len(categories)` entries from the larger
+        # categories, repeated twice.
+        expected = LabelArray(
+            larger_categories[:len(categories)] * 2,
+            missing_value=None,
+        )
+        assert_equal(result.as_string_array(), expected.as_string_array())
+
    def manual_narrow_condense_back_to_valid_size_slow(self):
        """This test is really slow so we don't want it run by default.
        """
@@ -14,6 +14,23 @@ cdef inline double log2(double d):
    return log(d) / log(2);


+cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
+    """Choose the smallest numpy unsigned int dtype that can hold ``size``.
+    """
+    if maxval < 1:
+        # lim x -> 0 log2(x) == -infinity so we floor at uint8
+        return np.uint8
+    else:
+        # The number of bits required to hold the codes up to ``length`` is
+        # log2(length). The number of bits per bytes is 8. We cannot have
+        # fractional bytes so we need to round up. Finally, we can only have
+        # integers with widths 1, 2, 4, or 8 so so we need to round up to the
+        # next value by looking up the next largest size in ``_int_sizes``.
+        return unsigned_int_dtype_with_size_in_bytes(
+            _int_sizes[int(np.ceil(log2(maxval) / 8))]
+        )
+
+
 ctypedef fused unsigned_integral:
    np.uint8_t
    np.uint16_t
@@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values,
        raise ValueError('nvalues larger than uint64')

    length = len(categories_array)
-    if length < 1:
-        # lim x -> 0 log2(x) == -infinity so we floor at uint8
-        narrowest_dtype = np.uint8
-    else:
-        # The number of bits required to hold the codes up to ``length`` is
-        # log2(length). The number of bits per bytes is 8. We cannot have
-        # fractional bytes so we need to round up. Finally, we can only have
-        # integers with widths 1, 2, 4, or 8 so so we need to round up to the
-        # next value by looking up the next largest size in ``_int_sizes``.
-        narrowest_dtype = unsigned_int_dtype_with_size_in_bytes(
-            _int_sizes[int(np.ceil(log2(length) / 8))]
-        )
-
+    narrowest_dtype = smallest_uint_that_can_hold(length)
    if codes.dtype != narrowest_dtype:
        # condense the codes down to the narrowest dtype possible
        codes = codes.astype(narrowest_dtype)
@@ -29,6 +29,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
 from ._factorize import (
    factorize_strings,
    factorize_strings_known_categories,
+    smallest_uint_that_can_hold,
 )


@@ -136,6 +137,7 @@ class LabelArray(ndarray):
    http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
    """
    SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
+    SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)

    @preprocess(
        values=coerce(list, partial(np.asarray, dtype=object)),
@@ -565,6 +567,64 @@ class LabelArray(ndarray):
        # locations in our indices.
        return results[self.as_int_array()]

+    def map(self, f):
+        """
+        Map a function from str -> str element-wise over ``self``.
+
+        ``f`` will be applied exactly once to each non-missing unique value in
+        ``self``. Missing values will always map to ``self.missing_value``.
+        """
+        # f() should only return None if None is our missing value.
+        if self.missing_value is None:
+            allowed_outtypes = self.SUPPORTED_SCALAR_TYPES
+        else:
+            allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES
+
+        def f_to_use(x,
+                     missing_value=self.missing_value,
+                     otypes=allowed_outtypes):
+
+            if x == missing_value:
+                return x
+
+            ret = f(x)
+
+            if not isinstance(ret, otypes):
+                raise TypeError(
+                    "Expected f() to return a string. Got %s." % (
+                        type(ret).__name__
+                    )
+                )
+
+            return ret
+
+        new_categories_with_duplicates = (
+            np.vectorize(f_to_use, otypes=[object])(self.categories)
+        )
+
+        # If f() maps multiple inputs to the same output, then we can end up
+        # with the same code duplicated multiple times. Compress the categories
+        # by running them through np.unique, and then use the reverse lookup
+        # table to compress codes as well.
+        new_categories, bloated_reverse_index = np.unique(
+            new_categories_with_duplicates,
+            return_inverse=True
+        )
+
+        # `reverse_index` will always be a 64 bit integer even if we can hold a
+        # smaller array.
+        reverse_index = bloated_reverse_index.astype(
+            smallest_uint_that_can_hold(len(new_categories))
+        )
+        new_codes = np.take(reverse_index, self.as_int_array())
+
+        return self.from_codes_and_metadata(
+            new_codes,
+            new_categories,
+            dict(zip(new_categories, range(len(new_categories)))),
+            missing_value=self.missing_value,
+        )
+
    def startswith(self, prefix):
        """
        Element-wise startswith.
@@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype
 from zipline.pipeline.sentinels import NotSpecified
 from zipline.pipeline.term import ComputableTerm
 from zipline.utils.compat import unicode
-from zipline.utils.input_validation import expect_types
+from zipline.utils.input_validation import expect_types, expect_dtypes
 from zipline.utils.memoize import classlazyval
 from zipline.utils.numpy_utils import (
    categorical_dtype,
@@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype(
    dtype=categorical_dtype,
    message_template=(
        "{method_name}() is only defined on Classifiers producing strings"
-        " but it was called on a Factor of dtype {received_dtype}."
+        " but it was called on a Classifier of dtype {received_dtype}."
    )
 )

@@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
            opargs=(pattern,),
        )

+    # TODO: Support relabeling for integer dtypes.
+    @string_classifiers_only
+    def relabel(self, relabeler):
+        """
+        Convert ``self`` into a new classifier by mapping a function over each
+        element produced by ``self``.
+
+        Parameters
+        ----------
+        relabeler : function[str -> str or None]
+            A function to apply to each unique value produced by ``self``.
+
+        Returns
+        -------
+        relabeled : Classifier
+            A classifier produced by applying ``relabeler`` to each unique
+            value produced by ``self``.
+        """
+        return Relabel(term=self, relabeler=relabeler)
+
    def element_of(self, choices):
        """
        Construct a Filter indicating whether values are in ``choices``.
@@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier):
        return type(self).__name__ + '(%d)' % self.params['bins']


+class Relabel(SingleInputMixin, Classifier):
+    """
+    A classifier applying a relabeling function on the result of another
+    classifier.
+
+    Parameters
+    ----------
+    arg : zipline.pipeline.Classifier
+        Term produceing the input to be relabeled.
+    relabel_func : function(LabelArray) -> LabelArray
+        Function to apply to the result of `term`.
+    """
+    window_length = 0
+    params = ('relabeler',)
+
+    # TODO: Support relabeling for integer dtypes.
+    @expect_dtypes(term=categorical_dtype)
+    @expect_types(term=Classifier)
+    def __new__(cls, term, relabeler):
+        return super(Relabel, cls).__new__(
+            cls,
+            inputs=(term,),
+            dtype=term.dtype,
+            mask=term.mask,
+            relabeler=relabeler,
+        )
+
+    def _compute(self, arrays, dates, assets, mask):
+        relabeler = self.params['relabeler']
+        data = arrays[0]
+
+        if isinstance(data, LabelArray):
+            result = data.map(relabeler)
+            result[~mask] = data.missing_value
+        else:
+            raise TypeError(
+                "Relabeling is not currently supported for "
+                "int-dtype classifiers."
+            )
+        return result
+
+
 class CustomClassifier(PositiveWindowLengthMixin,
                       StandardOutputs,
                       CustomTermMixin,