diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index 625ed4a5..aba52225 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -51,7 +51,7 @@ class ClassifierTestCase(BasePipelineTestCase): mask=self.build_mask(self.ones_mask(shape=data.shape)), ) - @parameter_space(mv=[None]) + @parameter_space(mv=['0', None]) def test_string_isnull(self, mv): class C(Classifier): @@ -126,7 +126,7 @@ class ClassifierTestCase(BasePipelineTestCase): class C(Classifier): dtype = categorical_dtype - missing_value = None + missing_value = '' inputs = () window_length = 0 @@ -162,7 +162,7 @@ class ClassifierTestCase(BasePipelineTestCase): ) def test_disallow_comparison_to_missing_value(self, missing, dtype_): if dtype_ == categorical_dtype: - missing = None + missing = str(missing) class C(Classifier): dtype = dtype_ @@ -224,7 +224,7 @@ class ClassifierTestCase(BasePipelineTestCase): class C(Classifier): dtype = categorical_dtype - missing_value = None + missing_value = missing inputs = () window_length = 0 @@ -245,7 +245,7 @@ class ClassifierTestCase(BasePipelineTestCase): expected = ( (data.as_int_array() != data.reverse_categories.get(compval, -1)) & - (data.as_int_array() != data.reverse_categories[missing]) + (data.as_int_array() != data.reverse_categories[C.missing_value]) ) self.check_terms( @@ -271,6 +271,7 @@ class ClassifierTestCase(BasePipelineTestCase): labelarray_dtype): if labelarray_dtype == bytes_dtype: compval = compval.encode('utf-8') + missing = missing.encode('utf-8') startswith_re = b'^' + compval + b'.*' endswith_re = b'.*' + compval + b'$' @@ -282,7 +283,7 @@ class ClassifierTestCase(BasePipelineTestCase): class C(Classifier): dtype = categorical_dtype - missing_value = None + missing_value = missing inputs = () window_length = 0 @@ -337,7 +338,7 @@ class ClassifierTestCase(BasePipelineTestCase): class C(Classifier): dtype = categorical_dtype - missing_value = None + missing_value = missing inputs = () window_length = 0 @@ -417,7 +418,7 @@ class ClassifierTestCase(BasePipelineTestCase): Test that element_of raises a useful error if we attempt to pass it an array of choices that include the classifier's missing_value. """ - missing = None + missing = "not in the array" class C(Classifier): dtype = categorical_dtype @@ -432,7 +433,7 @@ class ClassifierTestCase(BasePipelineTestCase): c.element_of(bad_elems) errmsg = str(e.exception) expected = ( - "Found self.missing_value (None) in choices" + "Found self.missing_value ('not in the array') in choices" " supplied to C.element_of().\n" "Missing values have NaN semantics, so the requested" " comparison would always produce False.\n" @@ -446,7 +447,7 @@ class ClassifierTestCase(BasePipelineTestCase): class C(Classifier): dtype = dtype_ - missing_value = None if dtype_ is categorical_dtype else -1 + missing_value = dtype.type('1') inputs = () window_length = 0 diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 80cdbb5b..c10b15c5 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -55,6 +55,15 @@ class EventDataSet(DataSet): previous_string = Column(dtype=categorical_dtype, missing_value=None) next_string = Column(dtype=categorical_dtype, missing_value=None) + previous_string_custom_missing = Column( + dtype=categorical_dtype, + missing_value=u"<>", + ) + next_string_custom_missing = Column( + dtype=categorical_dtype, + missing_value=u"<>", + ) + critical_dates = pd.to_datetime([ '2014-01-05', @@ -280,6 +289,7 @@ class EventsLoaderTestCase(WithAssetFinder, EventDataSet.next_float: 'float', EventDataSet.next_int: 'int', EventDataSet.next_string: 'string', + EventDataSet.next_string_custom_missing: 'string' } cls.previous_value_columns = { EventDataSet.previous_datetime: 'datetime', @@ -287,6 +297,7 @@ class EventsLoaderTestCase(WithAssetFinder, EventDataSet.previous_float: 'float', EventDataSet.previous_int: 'int', EventDataSet.previous_string: 'string', + EventDataSet.previous_string_custom_missing: 'string' } cls.loader = cls.make_loader( events=cls.raw_events, @@ -366,11 +377,6 @@ class EventsLoaderTestCase(WithAssetFinder, # If we've seen event 1 but not event 2, event 1 should # win. self.assertEqual(computed_value, v1) - elif column.dtype == categorical_dtype: - # XXX: The value in the output from pandas will be np.nan, - # but we currently only support None as the missing - # value for string columns. - self.assertTrue(np.isnan(computed_value)) else: # If we haven't seen either event, then we should have # column.missing_value. @@ -408,11 +414,6 @@ class EventsLoaderTestCase(WithAssetFinder, # If we've seen event 1 but not event 2, event 1 should # win. self.assertEqual(computed_value, v2) - elif column.dtype == categorical_dtype: - # XXX: The value in the output from pandas will be np.nan, - # but we currently only support None as the missing - # value for string columns. - self.assertTrue(np.isnan(computed_value)) else: # If we haven't seen either event, then we should have # column.missing_value. diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index 89d9368c..c58dd4e0 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -742,7 +742,7 @@ class SubDataSetTestCase(TestCase): window_length = 5 inputs = [SomeDataSet.foo, SomeDataSet.bar] outputs = outputs_ - missing_value = None if dtype_ is categorical_dtype else -1 + missing_value = dtype_.type('123') expected_error = ( "SomeClassifier does not support custom outputs, " diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index 92f40175..0cf1e967 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -284,28 +284,11 @@ class LabelArray(ndarray): """ if len(self.shape) > 1: raise ValueError("Can't convert a 2D array to a categorical.") - - missing_code = self.reverse_categories[self.missing_value] - raw_codes = self.as_int_array() - # As of pandas 0.18, putting null values in pandas categoricals is - # deprecated. The preferred representation is to pass -1 as the code - # for missing values. - if missing_code == 0: - # This is just a performance optimization. It should produce the - # same results as below. - codes = raw_codes - 1 - categories = self.categories[1:] - else: - # subtract 1 for anything greater than the missing code, and set - # the missing code to -1. - codes = raw_codes.copy() - codes[codes > missing_code] -= 1 - codes[codes == missing_code] = -1 - categories = self.categories[self.categories != self.missing_value] - return pd.Categorical.from_codes( - codes, - categories, + self.as_int_array(), + # We need to make a copy because pandas >= 0.17 fails if this + # buffer isn't writeable. + self.categories.copy(), ordered=False, name=name, ) diff --git a/zipline/pipeline/data/testing.py b/zipline/pipeline/data/testing.py index 0359390d..52873685 100644 --- a/zipline/pipeline/data/testing.py +++ b/zipline/pipeline/data/testing.py @@ -32,3 +32,7 @@ class TestingDataSet(DataSet): dtype=categorical_dtype, missing_value=None, ) + categorical_default_NULL_string = Column( + dtype=categorical_dtype, + missing_value=u'<>', + ) diff --git a/zipline/pipeline/term.py b/zipline/pipeline/term.py index f7e4a35e..6cecd147 100644 --- a/zipline/pipeline/term.py +++ b/zipline/pipeline/term.py @@ -28,6 +28,7 @@ from zipline.errors import ( WindowLengthNotSpecified, ) from zipline.lib.adjusted_array import can_represent_dtype +from zipline.lib.labelarray import LabelArray from zipline.utils.input_validation import expect_types from zipline.utils.memoize import lazyval from zipline.utils.numpy_utils import ( @@ -755,10 +756,12 @@ def _assert_valid_categorical_missing_value(value): Raises a TypeError if the value is cannot be used as the missing_value for a categorical_dtype Term. - - Currently, only None is supported as a missing value. """ - if value is not None: + label_types = LabelArray.SUPPORTED_SCALAR_TYPES + if not isinstance(value, label_types): raise TypeError( - "Categorical terms must have missing values of None." + "Categorical terms must have missing values of type " + "{types}.".format( + types=' or '.join([t.__name__ for t in label_types]), + ) )