From 3b8a6b543ebefabb49ea0c12039df73abdd775f2 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 7 Jun 2017 17:32:47 -0400 Subject: [PATCH] BUG: Fix NoneType comparisons in PY3. --- tests/pipeline/test_classifier.py | 2 +- zipline/lib/labelarray.py | 37 +++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index e70479ea..5bfc31b0 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -472,7 +472,7 @@ class ClassifierTestCase(BasePipelineTestCase): __fail_fast=True, labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype), relabel_func=[ - lambda s: s[0], + lambda s: str(s[0]), lambda s: str(len(s)), lambda s: str(len([c for c in s if c == 'a'])), lambda s: None, diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index 4db75672..bd3ce2d6 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -1,7 +1,7 @@ """ An ndarray subclass for working with arrays of strings. """ -from functools import partial +from functools import partial, total_ordering from operator import eq, ne import re @@ -584,8 +584,13 @@ class LabelArray(ndarray): missing_value=self.missing_value, otypes=allowed_outtypes): + # Don't call f on the missing value; those locations don't exist + # semantically. We return _sortable_sentinel rather than None + # because the np.unique call below sorts the categories array, + # which raises an error on Python 3 because None and str aren't + # comparable. if x == missing_value: - return x + return _sortable_sentinel ret = f(x) @@ -600,6 +605,9 @@ class LabelArray(ndarray): ) ) + if ret == missing_value: + return _sortable_sentinel + return ret new_categories_with_duplicates = ( @@ -610,14 +618,21 @@ class LabelArray(ndarray): # with the same code duplicated multiple times. 
Compress the categories # by running them through np.unique, and then use the reverse lookup # table to compress codes as well. - new_categories, bloated_reverse_index = np.unique( + new_categories, bloated_inverse_index = np.unique( new_categories_with_duplicates, return_inverse=True ) + if new_categories[0] == _sortable_sentinel: + # f_to_use returns _sortable_sentinel for locations that should be + # missing values in our output. Since np.unique returns the uniques + # in sorted order, and since _sortable_sentinel sorts before any + # string, we only need to check the first array entry. + new_categories[0] = self.missing_value + # `reverse_index` will always be a 64 bit integer even if we can hold a # smaller array. - reverse_index = bloated_reverse_index.astype( + reverse_index = bloated_inverse_index.astype( smallest_uint_that_can_hold(len(new_categories)) ) new_codes = np.take(reverse_index, self.as_int_array()) @@ -714,3 +729,17 @@ class LabelArray(ndarray): element of self was an element of ``container``. """ return self.map_predicate(container.__contains__) + + +@total_ordering +class _SortableSentinel(object): + """Dummy object that sorts before any other python object. + """ + def __eq__(self, other): + return isinstance(other, _SortableSentinel) + + def __lt__(self, other): + return not isinstance(other, _SortableSentinel) + + +_sortable_sentinel = _SortableSentinel()