From ba590266c98f15cd77dd2c7ac938c55f4e451b59 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 11:49:37 -0400 Subject: [PATCH 01/11] MAINT: Simplify ArrayPredicate. Just use `params` instead of custom `_init` and `_static_identity`. --- zipline/pipeline/filters/filter.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/zipline/pipeline/filters/filter.py b/zipline/pipeline/filters/filter.py index 523c8927..be1959f9 100644 --- a/zipline/pipeline/filters/filter.py +++ b/zipline/pipeline/filters/filter.py @@ -432,6 +432,7 @@ class ArrayPredicate(SingleInputMixin, Filter): opargs : tuple[hashable] Additional argument to apply to ``op``. """ + params = ('op', 'opargs') window_length = 0 @expect_types(term=Term, opargs=tuple) @@ -445,22 +446,10 @@ class ArrayPredicate(SingleInputMixin, Filter): mask=term.mask, ) - def _init(self, op, opargs, *args, **kwargs): - self._op = op - self._opargs = opargs - return super(ArrayPredicate, self)._init(*args, **kwargs) - - @classmethod - def _static_identity(cls, op, opargs, *args, **kwargs): - return ( - super(ArrayPredicate, cls)._static_identity(*args, **kwargs), - op, - opargs, - ) - def _compute(self, arrays, dates, assets, mask): + params = self.params data = arrays[0] - return self._op(data, *self._opargs) & mask + return params['op'](data, *params['opargs']) & mask class Latest(LatestMixin, CustomFilter): From 8b17b7d4b3722864a26fc738b580235d79558bc1 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 11:50:37 -0400 Subject: [PATCH 02/11] MAINT: Add validator to `validate_column`. Who validates the validators? --- zipline/pipeline/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zipline/pipeline/pipeline.py b/zipline/pipeline/pipeline.py index 78ee5ed5..e7c6b4de 100644 --- a/zipline/pipeline/pipeline.py +++ b/zipline/pipeline/pipeline.py @@ -229,6 +229,7 @@ class Pipeline(object): raise AssertionError("Unknown graph format %r." 
% format) @staticmethod + @expect_types(term=Term, column_name=str) def validate_column(column_name, term): if term.ndim == 1: raise UnsupportedPipelineOutput(column_name=column_name, term=term) From e995e6f2edbc895040c4c84a7baeef471ef225b9 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 11:50:59 -0400 Subject: [PATCH 03/11] ENH: Add `relabel` method to string classifiers. - Adds a `map` method to `LabelArray` that maps a unary function over the categories of a LabelArray, shrinking the underlying codes if possible. - Adds a new `.relabel` method to string-dtype classifiers that maps a unary function over the unique elements of the underlying LabelArray. This is useful for things like cleaning noisy label data. --- tests/pipeline/test_classifier.py | 64 +++++++++++ tests/test_labelarray.py | 126 +++++++++++++++++++++ zipline/lib/_factorize.pyx | 31 ++--- zipline/lib/labelarray.py | 60 ++++++++++ zipline/pipeline/classifiers/classifier.py | 66 ++++++++++- 5 files changed, 332 insertions(+), 15 deletions(-) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index a716c19d..e70479ea 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -468,6 +468,70 @@ class ClassifierTestCase(BasePipelineTestCase): ) self.assertEqual(errmsg, expected) + @parameter_space( + __fail_fast=True, + labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype), + relabel_func=[ + lambda s: s[0], + lambda s: str(len(s)), + lambda s: str(len([c for c in s if c == 'a'])), + lambda s: None, + ] + ) + def test_relabel_strings(self, relabel_func, labelarray_dtype): + + class C(Classifier): + inputs = () + dtype = categorical_dtype + missing_value = None + window_length = 0 + + c = C() + + raw = np.asarray( + [['a', 'aa', 'aaa', 'abab'], + ['bab', 'aba', 'aa', 'bb'], + ['a', 'aba', 'abaa', 'abaab'], + ['a', 'aa', 'aaa', 'aaaa']], + dtype=labelarray_dtype, + ) + raw_relabeled = np.vectorize(relabel_func, 
otypes=[object])(raw) + + data = LabelArray(raw, missing_value=None) + + terms = { + 'relabeled': c.relabel(relabel_func), + } + expected_results = { + 'relabeled': LabelArray(raw_relabeled, missing_value=None), + } + + self.check_terms( + terms, + expected_results, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + + def test_relabel_int_classifier_not_yet_supported(self): + class C(Classifier): + inputs = () + dtype = int64_dtype + missing_value = -1 + window_length = 0 + + c = C() + + with self.assertRaises(TypeError) as e: + c.relabel(lambda x: 0 / 0) # Function should never be called. + + result = str(e.exception) + expected = ( + "relabel() is only defined on Classifiers producing strings " + "but it was called on a Classifier of dtype int64." + ) + self.assertEqual(result, expected) + class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase): def test_reversability_categorical(self): diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py index 854da975..6d5681b6 100644 --- a/tests/test_labelarray.py +++ b/tests/test_labelarray.py @@ -109,6 +109,65 @@ class LabelArrayTestCase(ZiplineTestCase): np_contains(strs) & notmissing, ) + @parameter_space( + __fail_fast=True, + f=[ + lambda s: str(len(s)), + lambda s: s[0], + lambda s: ''.join(reversed(s)), + lambda s: '', + ] + ) + def test_map(self, f): + data = np.array( + [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'], + ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'], + ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'], + ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']], + dtype=object, + ) + la = LabelArray(data, missing_value=None) + + numpy_transformed = np.vectorize(f)(data) + la_transformed = la.map(f).as_string_array() + + assert_equal(numpy_transformed, la_transformed) + + def test_map_ignores_missing_value(self): + data = np.array(['A', 'B', 'C'], dtype=object) + la = LabelArray(data, missing_value='A') + + def increment_char(c): + 
return chr(ord(c) + 1) + + result = la.map(increment_char) + expected = LabelArray(['A', 'C', 'D'], missing_value='A') + assert_equal(result.as_string_array(), expected.as_string_array()) + + @parameter_space( + __fail_fast=True, + f=[ + lambda s: 0, + lambda s: 0.0, + lambda s: object(), + ] + ) + def test_map_requires_f_to_return_a_string(self, f): + la = LabelArray(self.strs, missing_value=None) + + with self.assertRaises(TypeError): + la.map(f) + + def test_map_can_only_return_none_if_missing_value_is_none(self): + + # Should work. + la = LabelArray(self.strs, missing_value=None) + la.map(lambda x: None) + + la = LabelArray(self.strs, missing_value="__MISSING__") + with self.assertRaises(TypeError): + la.map(lambda x: None) + @parameter_space( __fail_fast=True, missing_value=('', 'a', 'not in the array', None), @@ -436,6 +495,73 @@ class LabelArrayTestCase(ZiplineTestCase): assert_equal(arr.itemsize, 2) self.check_roundtrip(arr) + def test_map_shrinks_code_storage_if_possible(self): + arr = LabelArray( + # Drop the last value so we fit in a uint16 with None as a missing + # value. + self.create_categories(16, plus_one=False)[:-1], + missing_value=None, + ) + + self.assertEqual(arr.itemsize, 2) + + def either_A_or_B(s): + return ('A', 'B')[sum(ord(c) for c in s) % 2] + + result = arr.map(either_A_or_B) + + self.assertEqual(set(result.categories), {'A', 'B', None}) + self.assertEqual(result.itemsize, 1) + + assert_equal( + np.vectorize(either_A_or_B)(arr.as_string_array()), + result.as_string_array(), + ) + + def test_map_never_increases_code_storage_size(self): + # This tests a pathological case where a user maps an impure function + # that returns a different label on every invocation, which in a naive + # implementation could cause us to need to **increase** the size of our + # codes after a map. 
+ # + # This doesn't happen, however, because we guarantee that the user's + # mapping function will be called on each unique category exactly once, + # which means we can never increase the number of categories in the + # LabelArray after mapping. + + # Using all but one of the categories so that we still fit in a uint8 + # with an extra category for None as a missing value. + categories = self.create_categories(8, plus_one=False)[:-1] + + larger_categories = self.create_categories(16, plus_one=False) + + # Double the length of the categories so that we have to increase the + # required size after our map. + categories_twice = categories + categories + + arr = LabelArray(categories_twice, missing_value=None) + assert_equal(arr.itemsize, 1) + + gen_unique_categories = iter(larger_categories) + + def new_string_every_time(c): + # Return a new unique category every time so that every result is + # different. + return next(gen_unique_categories) + + result = arr.map(new_string_every_time) + + # Result should still be of size 1. + assert_equal(result.itemsize, 1) + + # Result should be the first `len(categories)` entries from the larger + # categories, repeated twice. + expected = LabelArray( + larger_categories[:len(categories)] * 2, + missing_value=None, + ) + assert_equal(result.as_string_array(), expected.as_string_array()) + def manual_narrow_condense_back_to_valid_size_slow(self): """This test is really slow so we don't want it run by default. """ diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx index 463b3dc6..e14a2039 100644 --- a/zipline/lib/_factorize.pyx +++ b/zipline/lib/_factorize.pyx @@ -14,6 +14,23 @@ cdef inline double log2(double d): return log(d) / log(2); +cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval): + """Choose the smallest numpy unsigned int dtype that can hold ``size``. 
+ """ + if maxval < 1: + # lim x -> 0 log2(x) == -infinity so we floor at uint8 + return np.uint8 + else: + # The number of bits required to hold the codes up to ``length`` is + # log2(length). The number of bits per bytes is 8. We cannot have + # fractional bytes so we need to round up. Finally, we can only have + # integers with widths 1, 2, 4, or 8 so so we need to round up to the + # next value by looking up the next largest size in ``_int_sizes``. + return unsigned_int_dtype_with_size_in_bytes( + _int_sizes[int(np.ceil(log2(maxval) / 8))] + ) + + ctypedef fused unsigned_integral: np.uint8_t np.uint16_t @@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values, raise ValueError('nvalues larger than uint64') length = len(categories_array) - if length < 1: - # lim x -> 0 log2(x) == -infinity so we floor at uint8 - narrowest_dtype = np.uint8 - else: - # The number of bits required to hold the codes up to ``length`` is - # log2(length). The number of bits per bytes is 8. We cannot have - # fractional bytes so we need to round up. Finally, we can only have - # integers with widths 1, 2, 4, or 8 so so we need to round up to the - # next value by looking up the next largest size in ``_int_sizes``. 
- narrowest_dtype = unsigned_int_dtype_with_size_in_bytes( - _int_sizes[int(np.ceil(log2(length) / 8))] - ) - + narrowest_dtype = smallest_uint_that_can_hold(length) if codes.dtype != narrowest_dtype: # condense the codes down to the narrowest dtype possible codes = codes.astype(narrowest_dtype) diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index c9db21e6..68358d81 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -29,6 +29,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning from ._factorize import ( factorize_strings, factorize_strings_known_categories, + smallest_uint_that_can_hold, ) @@ -136,6 +137,7 @@ class LabelArray(ndarray): http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html """ SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None)) + SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode) @preprocess( values=coerce(list, partial(np.asarray, dtype=object)), @@ -565,6 +567,64 @@ class LabelArray(ndarray): # locations in our indices. return results[self.as_int_array()] + def map(self, f): + """ + Map a function from str -> str element-wise over ``self``. + + ``f`` will be applied exactly once to each non-missing unique value in + ``self``. Missing values will always map to ``self.missing_value``. + """ + # f() should only return None if None is our missing value. + if self.missing_value is None: + allowed_outtypes = self.SUPPORTED_SCALAR_TYPES + else: + allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES + + def f_to_use(x, + missing_value=self.missing_value, + otypes=allowed_outtypes): + + if x == missing_value: + return x + + ret = f(x) + + if not isinstance(ret, otypes): + raise TypeError( + "Expected f() to return a string. Got %s." 
% ( + type(ret).__name__ + ) + ) + + return ret + + new_categories_with_duplicates = ( + np.vectorize(f_to_use, otypes=[object])(self.categories) + ) + + # If f() maps multiple inputs to the same output, then we can end up + # with the same code duplicated multiple times. Compress the categories + # by running them through np.unique, and then use the reverse lookup + # table to compress codes as well. + new_categories, bloated_reverse_index = np.unique( + new_categories_with_duplicates, + return_inverse=True + ) + + # `reverse_index` will always be a 64 bit integer even if we can hold a + # smaller array. + reverse_index = bloated_reverse_index.astype( + smallest_uint_that_can_hold(len(new_categories)) + ) + new_codes = np.take(reverse_index, self.as_int_array()) + + return self.from_codes_and_metadata( + new_codes, + new_categories, + dict(zip(new_categories, range(len(new_categories)))), + missing_value=self.missing_value, + ) + def startswith(self, prefix): """ Element-wise startswith. diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py index 810430fa..f3c2e22c 100644 --- a/zipline/pipeline/classifiers/classifier.py +++ b/zipline/pipeline/classifiers/classifier.py @@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype from zipline.pipeline.sentinels import NotSpecified from zipline.pipeline.term import ComputableTerm from zipline.utils.compat import unicode -from zipline.utils.input_validation import expect_types +from zipline.utils.input_validation import expect_types, expect_dtypes from zipline.utils.memoize import classlazyval from zipline.utils.numpy_utils import ( categorical_dtype, @@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype( dtype=categorical_dtype, message_template=( "{method_name}() is only defined on Classifiers producing strings" - " but it was called on a Factor of dtype {received_dtype}." + " but it was called on a Classifier of dtype {received_dtype}." 
) ) @@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): opargs=(pattern,), ) + # TODO: Support relabeling for integer dtypes. + @string_classifiers_only + def relabel(self, relabeler): + """ + Convert ``self`` into a new classifier by mapping a function over each + element produced by ``self``. + + Parameters + ---------- + relabeler : function[str -> str or None] + A function to apply to each unique value produced by ``self``. + + Returns + ------- + relabeled : Classifier + A classifier produced by applying ``relabeler`` to each unique + value produced by ``self``. + """ + return Relabel(term=self, relabeler=relabeler) + def element_of(self, choices): """ Construct a Filter indicating whether values are in ``choices``. @@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier): return type(self).__name__ + '(%d)' % self.params['bins'] +class Relabel(SingleInputMixin, Classifier): + """ + A classifier applying a relabeling function on the result of another + classifier. + + Parameters + ---------- + term : zipline.pipeline.Classifier + Term producing the input to be relabeled. + relabeler : function[str -> str or None] + Function to apply to each unique value produced by `term`. + """ + window_length = 0 + params = ('relabeler',) + + # TODO: Support relabeling for integer dtypes. + @expect_dtypes(term=categorical_dtype) + @expect_types(term=Classifier) + def __new__(cls, term, relabeler): + return super(Relabel, cls).__new__( + cls, + inputs=(term,), + dtype=term.dtype, + mask=term.mask, + relabeler=relabeler, + ) + + def _compute(self, arrays, dates, assets, mask): + relabeler = self.params['relabeler'] + data = arrays[0] + + if isinstance(data, LabelArray): + result = data.map(relabeler) + result[~mask] = data.missing_value + else: + raise TypeError( + "Relabeling is not currently supported for " + "int-dtype classifiers." 
+ ) + return result + + class CustomClassifier(PositiveWindowLengthMixin, StandardOutputs, CustomTermMixin, From cfe4df8f2bd4bcde3478991dd7cec1ea628f9216 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 14:16:17 -0400 Subject: [PATCH 04/11] TEST: Test map ignores missing with None. --- tests/test_labelarray.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py index 6d5681b6..de73eed9 100644 --- a/tests/test_labelarray.py +++ b/tests/test_labelarray.py @@ -133,15 +133,16 @@ class LabelArrayTestCase(ZiplineTestCase): assert_equal(numpy_transformed, la_transformed) - def test_map_ignores_missing_value(self): - data = np.array(['A', 'B', 'C'], dtype=object) - la = LabelArray(data, missing_value='A') + @parameter_space(missing=['A', None]) + def test_map_ignores_missing_value(self, missing): + data = np.array([missing, 'B', 'C'], dtype=object) + la = LabelArray(data, missing_value=missing) def increment_char(c): return chr(ord(c) + 1) result = la.map(increment_char) - expected = LabelArray(['A', 'C', 'D'], missing_value='A') + expected = LabelArray([missing, 'C', 'D'], missing_value=missing) assert_equal(result.as_string_array(), expected.as_string_array()) @parameter_space( From a5794858252d91c8f846159b5716e776d6f6cbc0 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 15:23:27 -0400 Subject: [PATCH 05/11] MAINT: s/TypeError/NotImplementedError. 
--- zipline/pipeline/classifiers/classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py index f3c2e22c..de5da2f0 100644 --- a/zipline/pipeline/classifiers/classifier.py +++ b/zipline/pipeline/classifiers/classifier.py @@ -437,7 +437,7 @@ class Relabel(SingleInputMixin, Classifier): result = data.map(relabeler) result[~mask] = data.missing_value else: - raise TypeError( + raise NotImplementedError( "Relabeling is not currently supported for " "int-dtype classifiers." ) From ad10349992f6da5444221df77c6b23aa1197d288 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 15:28:15 -0400 Subject: [PATCH 06/11] TEST: Test map returning None. --- tests/test_labelarray.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py index de73eed9..4b001035 100644 --- a/tests/test_labelarray.py +++ b/tests/test_labelarray.py @@ -163,7 +163,12 @@ class LabelArrayTestCase(ZiplineTestCase): # Should work. la = LabelArray(self.strs, missing_value=None) - la.map(lambda x: None) + result = la.map(lambda x: None) + + check_arrays( + result, + LabelArray(np.full_like(self.strs, None), missing_value=None), + ) la = LabelArray(self.strs, missing_value="__MISSING__") with self.assertRaises(TypeError): From e49f4c6149e6473c81eef19c9783372b68c61dc0 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 17:07:19 -0400 Subject: [PATCH 07/11] ENH: Improve error message on bad return. 
--- tests/test_labelarray.py | 2 +- zipline/lib/labelarray.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py index 4b001035..fc6673c9 100644 --- a/tests/test_labelarray.py +++ b/tests/test_labelarray.py @@ -153,7 +153,7 @@ class LabelArrayTestCase(ZiplineTestCase): lambda s: object(), ] ) - def test_map_requires_f_to_return_a_string(self, f): + def test_map_requires_f_to_return_a_string_or_none(self, f): la = LabelArray(self.strs, missing_value=None) with self.assertRaises(TypeError): diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index 68358d81..4db75672 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -591,8 +591,12 @@ class LabelArray(ndarray): if not isinstance(ret, otypes): raise TypeError( - "Expected f() to return a string. Got %s." % ( - type(ret).__name__ + "LabelArray.map expected function {f} to return a string" + " or None, but got {type} instead.\n" + "Value was {value}.".format( + f=f.__name__, + type=type(ret).__name__, + value=ret, ) ) From cbe52cea88b9a5b64272f9bce2a29776f3abbad7 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 17:32:47 -0400 Subject: [PATCH 08/11] BUG: Fix NoneType comparisons in PY3. 
--- tests/pipeline/test_classifier.py | 2 +- zipline/lib/labelarray.py | 37 +++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index e70479ea..5bfc31b0 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -472,7 +472,7 @@ class ClassifierTestCase(BasePipelineTestCase): __fail_fast=True, labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype), relabel_func=[ - lambda s: s[0], + lambda s: str(s[0]), lambda s: str(len(s)), lambda s: str(len([c for c in s if c == 'a'])), lambda s: None, diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index 4db75672..bd3ce2d6 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -1,7 +1,7 @@ """ An ndarray subclass for working with arrays of strings. """ -from functools import partial +from functools import partial, total_ordering from operator import eq, ne import re @@ -584,8 +584,13 @@ class LabelArray(ndarray): missing_value=self.missing_value, otypes=allowed_outtypes): + # Don't call f on the missing value; those locations don't exist + # semantically. We return _sortable_sentinel rather than None + # because the np.unique call below sorts the categories array, + # which raises an error on Python 3 because None and str aren't + # comparable. if x == missing_value: - return x + return _sortable_sentinel ret = f(x) @@ -600,6 +605,9 @@ class LabelArray(ndarray): ) ) + if ret == missing_value: + return _sortable_sentinel + return ret new_categories_with_duplicates = ( @@ -610,14 +618,21 @@ class LabelArray(ndarray): # with the same code duplicated multiple times. Compress the categories # by running them through np.unique, and then use the reverse lookup # table to compress codes as well. 
- new_categories, bloated_reverse_index = np.unique( + new_categories, bloated_inverse_index = np.unique( new_categories_with_duplicates, return_inverse=True ) + if new_categories[0] == _sortable_sentinel: + # f_to_use return _sortable_sentinel for locations that should be + # missing values in our output. Since np.unique returns the uniques + # in sorted order, and since _sortable_sentinel sorts before any + # string, we only need to check the first array entry. + new_categories[0] = self.missing_value + # `reverse_index` will always be a 64 bit integer even if we can hold a # smaller array. - reverse_index = bloated_reverse_index.astype( + reverse_index = bloated_inverse_index.astype( smallest_uint_that_can_hold(len(new_categories)) ) new_codes = np.take(reverse_index, self.as_int_array()) @@ -714,3 +729,17 @@ class LabelArray(ndarray): element of self was an element of ``container``. """ return self.map_predicate(container.__contains__) + + +@total_ordering +class _SortableSentinel(object): + """Dummy object that sorts before any other python object. + """ + def __eq__(self, other): + return isinstance(other, _SortableSentinel) + + def __lt__(self, other): + return not isinstance(other, _SortableSentinel) + + +_sortable_sentinel = _SortableSentinel() From 8a042937d9dd536495a6be26c7f1f30f5c9480c0 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 18:21:13 -0400 Subject: [PATCH 09/11] TST: Add test for missing values in relabel. 
--- tests/pipeline/test_classifier.py | 52 +++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index 5bfc31b0..fa47df87 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -513,6 +513,58 @@ class ClassifierTestCase(BasePipelineTestCase): mask=self.build_mask(self.ones_mask(shape=data.shape)), ) + @parameter_space( + __fail_fast=True, + missing_value=[None, 'M'], + ) + def test_relabel_missing_value_interactions(self, missing_value): + + mv = missing_value + + class C(Classifier): + inputs = () + dtype = categorical_dtype + missing_value = mv + window_length = 0 + + c = C() + + def relabel_func(s): + if s == 'B': + return mv + return ''.join([s, s]) + + raw = np.asarray( + [['A', 'B', 'C', mv], + [mv, 'A', 'B', 'C'], + ['C', mv, 'A', 'B'], + ['B', 'C', mv, 'A']], + dtype=categorical_dtype, + ) + data = LabelArray(raw, missing_value=mv) + + expected_relabeled_raw = np.asarray( + [['AA', mv, 'CC', mv], + [mv, 'AA', mv, 'CC'], + ['CC', mv, 'AA', mv], + [mv, 'CC', mv, 'AA']], + dtype=categorical_dtype, + ) + + terms = { + 'relabeled': c.relabel(relabel_func), + } + expected_results = { + 'relabeled': LabelArray(expected_relabeled_raw, missing_value=mv), + } + + self.check_terms( + terms, + expected_results, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + def test_relabel_int_classifier_not_yet_supported(self): class C(Classifier): inputs = () From 609ae752d0ea396e058caf76338cc3b3ade671b4 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 18:22:30 -0400 Subject: [PATCH 10/11] DOC: Fix typo in docstring. 
--- zipline/lib/_factorize.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx index e14a2039..3b84662e 100644 --- a/zipline/lib/_factorize.pyx +++ b/zipline/lib/_factorize.pyx @@ -15,7 +15,7 @@ cdef inline double log2(double d): cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval): - """Choose the smallest numpy unsigned int dtype that can hold ``size``. + """Choose the smallest numpy unsigned int dtype that can hold ``maxval``. """ if maxval < 1: # lim x -> 0 log2(x) == -infinity so we floor at uint8 From ec2ec49914950cc42b29a96d8ba09b34775a2c81 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 20:17:58 -0400 Subject: [PATCH 11/11] MAINT: Make _sortable_sentinel a singleton. --- zipline/lib/labelarray.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index bd3ce2d6..ff121ff9 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -11,6 +11,7 @@ import pandas as pd from toolz import compose from zipline.utils.compat import unicode +from zipline.utils.functional import instance from zipline.utils.preprocess import preprocess from zipline.utils.sentinel import sentinel from zipline.utils.input_validation import ( @@ -623,7 +624,7 @@ class LabelArray(ndarray): return_inverse=True ) - if new_categories[0] == _sortable_sentinel: + if new_categories[0] is _sortable_sentinel: # f_to_use return _sortable_sentinel for locations that should be # missing values in our output. Since np.unique returns the uniques # in sorted order, and since _sortable_sentinel sorts before any @@ -731,15 +732,13 @@ class LabelArray(ndarray): return self.map_predicate(container.__contains__) +@instance # This makes _sortable_sentinel a singleton instance. 
@total_ordering -class _SortableSentinel(object): +class _sortable_sentinel(object): """Dummy object that sorts before any other python object. """ def __eq__(self, other): - return isinstance(other, _SortableSentinel) + return self is other def __lt__(self, other): - return not isinstance(other, _SortableSentinel) - - -_sortable_sentinel = _SortableSentinel() + return True