diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index a716c19d..e70479ea 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -468,6 +468,70 @@ class ClassifierTestCase(BasePipelineTestCase): ) self.assertEqual(errmsg, expected) + @parameter_space( + __fail_fast=True, + labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype), + relabel_func=[ + lambda s: s[0], + lambda s: str(len(s)), + lambda s: str(len([c for c in s if c == 'a'])), + lambda s: None, + ] + ) + def test_relabel_strings(self, relabel_func, labelarray_dtype): + + class C(Classifier): + inputs = () + dtype = categorical_dtype + missing_value = None + window_length = 0 + + c = C() + + raw = np.asarray( + [['a', 'aa', 'aaa', 'abab'], + ['bab', 'aba', 'aa', 'bb'], + ['a', 'aba', 'abaa', 'abaab'], + ['a', 'aa', 'aaa', 'aaaa']], + dtype=labelarray_dtype, + ) + raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw) + + data = LabelArray(raw, missing_value=None) + + terms = { + 'relabeled': c.relabel(relabel_func), + } + expected_results = { + 'relabeled': LabelArray(raw_relabeled, missing_value=None), + } + + self.check_terms( + terms, + expected_results, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + + def test_relabel_int_classifier_not_yet_supported(self): + class C(Classifier): + inputs = () + dtype = int64_dtype + missing_value = -1 + window_length = 0 + + c = C() + + with self.assertRaises(TypeError) as e: + c.relabel(lambda x: 0 / 0) # Function should never be called. + + result = str(e.exception) + expected = ( + "relabel() is only defined on Classifiers producing strings " + "but it was called on a Classifier of dtype int64." + ) + self.assertEqual(result, expected) + class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase): def test_reversability_categorical(self): diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py index 854da975..6d5681b6 100644 --- a/tests/test_labelarray.py +++ b/tests/test_labelarray.py @@ -109,6 +109,65 @@ class LabelArrayTestCase(ZiplineTestCase): np_contains(strs) & notmissing, ) + @parameter_space( + __fail_fast=True, + f=[ + lambda s: str(len(s)), + lambda s: s[0], + lambda s: ''.join(reversed(s)), + lambda s: '', + ] + ) + def test_map(self, f): + data = np.array( + [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'], + ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'], + ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'], + ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']], + dtype=object, + ) + la = LabelArray(data, missing_value=None) + + numpy_transformed = np.vectorize(f)(data) + la_transformed = la.map(f).as_string_array() + + assert_equal(numpy_transformed, la_transformed) + + def test_map_ignores_missing_value(self): + data = np.array(['A', 'B', 'C'], dtype=object) + la = LabelArray(data, missing_value='A') + + def increment_char(c): + return chr(ord(c) + 1) + + result = la.map(increment_char) + expected = LabelArray(['A', 'C', 'D'], missing_value='A') + assert_equal(result.as_string_array(), expected.as_string_array()) + + @parameter_space( + __fail_fast=True, + f=[ + lambda s: 0, + lambda s: 0.0, + lambda s: object(), + ] + ) + def test_map_requires_f_to_return_a_string(self, f): + la = LabelArray(self.strs, missing_value=None) + + with self.assertRaises(TypeError): + la.map(f) + + def test_map_can_only_return_none_if_missing_value_is_none(self): + + # Should work. + la = LabelArray(self.strs, missing_value=None) + la.map(lambda x: None) + + la = LabelArray(self.strs, missing_value="__MISSING__") + with self.assertRaises(TypeError): + la.map(lambda x: None) + @parameter_space( __fail_fast=True, missing_value=('', 'a', 'not in the array', None), @@ -436,6 +495,73 @@ class LabelArrayTestCase(ZiplineTestCase): assert_equal(arr.itemsize, 2) self.check_roundtrip(arr) + def test_map_shrinks_code_storage_if_possible(self): + arr = LabelArray( + # Drop the last value so we fit in a uint16 with None as a missing + # value. + self.create_categories(16, plus_one=False)[:-1], + missing_value=None, + ) + + self.assertEqual(arr.itemsize, 2) + + def either_A_or_B(s): + return ('A', 'B')[sum(ord(c) for c in s) % 2] + + result = arr.map(either_A_or_B) + + self.assertEqual(set(result.categories), {'A', 'B', None}) + self.assertEqual(result.itemsize, 1) + + assert_equal( + np.vectorize(either_A_or_B)(arr.as_string_array()), + result.as_string_array(), + ) + + def test_map_never_increases_code_storage_size(self): + # This tests a pathological case where a user maps an impure function + # that returns a different label on every invocation, which in a naive + # implementation could cause us to need to **increase** the size of our + # codes after a map. + # + # This doesn't happen, however, because we guarantee that the user's + # mapping function will be called on each unique category exactly once, + # which means we can never increase the number of categories in the + # LabelArray after mapping. + + # Using all but one of the categories so that we still fit in a uint8 + # with an extra category for None as a missing value. + categories = self.create_categories(8, plus_one=False)[:-1] + + larger_categories = self.create_categories(16, plus_one=False) + + # Double the length of the categories so that we have to increase the + # required size after our map. + categories_twice = categories + categories + + arr = LabelArray(categories_twice, missing_value=None) + assert_equal(arr.itemsize, 1) + + gen_unique_categories = iter(larger_categories) + + def new_string_every_time(c): + # Return a new unique category every time so that every result is + # different. + return next(gen_unique_categories) + + result = arr.map(new_string_every_time) + + # Result should still be of size 1. + assert_equal(result.itemsize, 1) + + # Result should be the first `len(categories)` entries from the larger + # categories, repeated twice. + expected = LabelArray( + larger_categories[:len(categories)] * 2, + missing_value=None, + ) + assert_equal(result.as_string_array(), expected.as_string_array()) + def manual_narrow_condense_back_to_valid_size_slow(self): """This test is really slow so we don't want it run by default. """ diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx index 463b3dc6..e14a2039 100644 --- a/zipline/lib/_factorize.pyx +++ b/zipline/lib/_factorize.pyx @@ -14,6 +14,23 @@ cdef inline double log2(double d): return log(d) / log(2); +cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval): + """Choose the smallest numpy unsigned int dtype that can hold ``size``. + """ + if maxval < 1: + # lim x -> 0 log2(x) == -infinity so we floor at uint8 + return np.uint8 + else: + # The number of bits required to hold the codes up to ``length`` is + # log2(length). The number of bits per bytes is 8. We cannot have + # fractional bytes so we need to round up. Finally, we can only have + # integers with widths 1, 2, 4, or 8 so so we need to round up to the + # next value by looking up the next largest size in ``_int_sizes``. + return unsigned_int_dtype_with_size_in_bytes( + _int_sizes[int(np.ceil(log2(maxval) / 8))] + ) + + ctypedef fused unsigned_integral: np.uint8_t np.uint16_t @@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values, raise ValueError('nvalues larger than uint64') length = len(categories_array) - if length < 1: - # lim x -> 0 log2(x) == -infinity so we floor at uint8 - narrowest_dtype = np.uint8 - else: - # The number of bits required to hold the codes up to ``length`` is - # log2(length). The number of bits per bytes is 8. We cannot have - # fractional bytes so we need to round up. Finally, we can only have - # integers with widths 1, 2, 4, or 8 so so we need to round up to the - # next value by looking up the next largest size in ``_int_sizes``. - narrowest_dtype = unsigned_int_dtype_with_size_in_bytes( - _int_sizes[int(np.ceil(log2(length) / 8))] - ) - + narrowest_dtype = smallest_uint_that_can_hold(length) if codes.dtype != narrowest_dtype: # condense the codes down to the narrowest dtype possible codes = codes.astype(narrowest_dtype) diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index c9db21e6..68358d81 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -29,6 +29,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning from ._factorize import ( factorize_strings, factorize_strings_known_categories, + smallest_uint_that_can_hold, ) @@ -136,6 +137,7 @@ class LabelArray(ndarray): http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html """ SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None)) + SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode) @preprocess( values=coerce(list, partial(np.asarray, dtype=object)), @@ -565,6 +567,64 @@ class LabelArray(ndarray): # locations in our indices. return results[self.as_int_array()] + def map(self, f): + """ + Map a function from str -> str element-wise over ``self``. + + ``f`` will be applied exactly once to each non-missing unique value in + ``self``. Missing values will always map to ``self.missing_value``. + """ + # f() should only return None if None is our missing value. + if self.missing_value is None: + allowed_outtypes = self.SUPPORTED_SCALAR_TYPES + else: + allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES + + def f_to_use(x, + missing_value=self.missing_value, + otypes=allowed_outtypes): + + if x == missing_value: + return x + + ret = f(x) + + if not isinstance(ret, otypes): + raise TypeError( + "Expected f() to return a string. Got %s." % ( + type(ret).__name__ + ) + ) + + return ret + + new_categories_with_duplicates = ( + np.vectorize(f_to_use, otypes=[object])(self.categories) + ) + + # If f() maps multiple inputs to the same output, then we can end up + # with the same code duplicated multiple times. Compress the categories + # by running them through np.unique, and then use the reverse lookup + # table to compress codes as well. + new_categories, bloated_reverse_index = np.unique( + new_categories_with_duplicates, + return_inverse=True + ) + + # `reverse_index` will always be a 64 bit integer even if we can hold a + # smaller array. + reverse_index = bloated_reverse_index.astype( + smallest_uint_that_can_hold(len(new_categories)) + ) + new_codes = np.take(reverse_index, self.as_int_array()) + + return self.from_codes_and_metadata( + new_codes, + new_categories, + dict(zip(new_categories, range(len(new_categories)))), + missing_value=self.missing_value, + ) + def startswith(self, prefix): """ Element-wise startswith. diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py index 810430fa..f3c2e22c 100644 --- a/zipline/pipeline/classifiers/classifier.py +++ b/zipline/pipeline/classifiers/classifier.py @@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype from zipline.pipeline.sentinels import NotSpecified from zipline.pipeline.term import ComputableTerm from zipline.utils.compat import unicode -from zipline.utils.input_validation import expect_types +from zipline.utils.input_validation import expect_types, expect_dtypes from zipline.utils.memoize import classlazyval from zipline.utils.numpy_utils import ( categorical_dtype, @@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype( dtype=categorical_dtype, message_template=( "{method_name}() is only defined on Classifiers producing strings" - " but it was called on a Factor of dtype {received_dtype}." + " but it was called on a Classifier of dtype {received_dtype}." ) ) @@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): opargs=(pattern,), ) + # TODO: Support relabeling for integer dtypes. + @string_classifiers_only + def relabel(self, relabeler): + """ + Convert ``self`` into a new classifier by mapping a function over each + element produced by ``self``. + + Parameters + ---------- + relabeler : function[str -> str or None] + A function to apply to each unique value produced by ``self``. + + Returns + ------- + relabeled : Classifier + A classifier produced by applying ``relabeler`` to each unique + value produced by ``self``. + """ + return Relabel(term=self, relabeler=relabeler) + def element_of(self, choices): """ Construct a Filter indicating whether values are in ``choices``. @@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier): return type(self).__name__ + '(%d)' % self.params['bins'] +class Relabel(SingleInputMixin, Classifier): + """ + A classifier applying a relabeling function on the result of another + classifier. + + Parameters + ---------- + arg : zipline.pipeline.Classifier + Term produceing the input to be relabeled. + relabel_func : function(LabelArray) -> LabelArray + Function to apply to the result of `term`. + """ + window_length = 0 + params = ('relabeler',) + + # TODO: Support relabeling for integer dtypes. + @expect_dtypes(term=categorical_dtype) + @expect_types(term=Classifier) + def __new__(cls, term, relabeler): + return super(Relabel, cls).__new__( + cls, + inputs=(term,), + dtype=term.dtype, + mask=term.mask, + relabeler=relabeler, + ) + + def _compute(self, arrays, dates, assets, mask): + relabeler = self.params['relabeler'] + data = arrays[0] + + if isinstance(data, LabelArray): + result = data.map(relabeler) + result[~mask] = data.missing_value + else: + raise TypeError( + "Relabeling is not currently supported for " + "int-dtype classifiers." + ) + return result + + class CustomClassifier(PositiveWindowLengthMixin, StandardOutputs, CustomTermMixin,