Revert "MAINT: Remove support for custom string Column missing values."

This reverts commit 1b1e842e2339d6d0ee40cdfe34dcd27b4e4a7c0c.
2026-06-30 08:44:01 +08:00 · 2016-07-21 17:19:51 -04:00
parent 16f4944232
commit 0ff13e7fdc
6 changed files with 38 additions and 46 deletions
@@ -51,7 +51,7 @@ class ClassifierTestCase(BasePipelineTestCase):
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

-    @parameter_space(mv=[None])
+    @parameter_space(mv=['0', None])
    def test_string_isnull(self, mv):

        class C(Classifier):
@@ -126,7 +126,7 @@ class ClassifierTestCase(BasePipelineTestCase):

        class C(Classifier):
            dtype = categorical_dtype
-            missing_value = None
+            missing_value = ''
            inputs = ()
            window_length = 0

@@ -162,7 +162,7 @@ class ClassifierTestCase(BasePipelineTestCase):
    )
    def test_disallow_comparison_to_missing_value(self, missing, dtype_):
        if dtype_ == categorical_dtype:
-            missing = None
+            missing = str(missing)

        class C(Classifier):
            dtype = dtype_
@@ -224,7 +224,7 @@ class ClassifierTestCase(BasePipelineTestCase):

        class C(Classifier):
            dtype = categorical_dtype
-            missing_value = None
+            missing_value = missing
            inputs = ()
            window_length = 0

@@ -245,7 +245,7 @@ class ClassifierTestCase(BasePipelineTestCase):

        expected = (
            (data.as_int_array() != data.reverse_categories.get(compval, -1)) &
-            (data.as_int_array() != data.reverse_categories[missing])
+            (data.as_int_array() != data.reverse_categories[C.missing_value])
        )

        self.check_terms(
@@ -271,6 +271,7 @@ class ClassifierTestCase(BasePipelineTestCase):
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode('utf-8')
+            missing = missing.encode('utf-8')

            startswith_re = b'^' + compval + b'.*'
            endswith_re = b'.*' + compval + b'$'
@@ -282,7 +283,7 @@ class ClassifierTestCase(BasePipelineTestCase):

        class C(Classifier):
            dtype = categorical_dtype
-            missing_value = None
+            missing_value = missing
            inputs = ()
            window_length = 0

@@ -337,7 +338,7 @@ class ClassifierTestCase(BasePipelineTestCase):

        class C(Classifier):
            dtype = categorical_dtype
-            missing_value = None
+            missing_value = missing
            inputs = ()
            window_length = 0

@@ -417,7 +418,7 @@ class ClassifierTestCase(BasePipelineTestCase):
        Test that element_of raises a useful error if we attempt to pass it an
        array of choices that include the classifier's missing_value.
        """
-        missing = None
+        missing = "not in the array"

        class C(Classifier):
            dtype = categorical_dtype
@@ -432,7 +433,7 @@ class ClassifierTestCase(BasePipelineTestCase):
                c.element_of(bad_elems)
            errmsg = str(e.exception)
            expected = (
-                "Found self.missing_value (None) in choices"
+                "Found self.missing_value ('not in the array') in choices"
                " supplied to C.element_of().\n"
                "Missing values have NaN semantics, so the requested"
                " comparison would always produce False.\n"
@@ -446,7 +447,7 @@ class ClassifierTestCase(BasePipelineTestCase):

        class C(Classifier):
            dtype = dtype_
-            missing_value = None if dtype_ is categorical_dtype else -1
+            missing_value = dtype.type('1')
            inputs = ()
            window_length = 0

@@ -55,6 +55,15 @@ class EventDataSet(DataSet):
    previous_string = Column(dtype=categorical_dtype, missing_value=None)
    next_string = Column(dtype=categorical_dtype, missing_value=None)

+    previous_string_custom_missing = Column(
+        dtype=categorical_dtype,
+        missing_value=u"<<NULL>>",
+    )
+    next_string_custom_missing = Column(
+        dtype=categorical_dtype,
+        missing_value=u"<<NULL>>",
+    )
+

 critical_dates = pd.to_datetime([
    '2014-01-05',
@@ -280,6 +289,7 @@ class EventsLoaderTestCase(WithAssetFinder,
            EventDataSet.next_float: 'float',
            EventDataSet.next_int: 'int',
            EventDataSet.next_string: 'string',
+            EventDataSet.next_string_custom_missing: 'string'
        }
        cls.previous_value_columns = {
            EventDataSet.previous_datetime: 'datetime',
@@ -287,6 +297,7 @@ class EventsLoaderTestCase(WithAssetFinder,
            EventDataSet.previous_float: 'float',
            EventDataSet.previous_int: 'int',
            EventDataSet.previous_string: 'string',
+            EventDataSet.previous_string_custom_missing: 'string'
        }
        cls.loader = cls.make_loader(
            events=cls.raw_events,
@@ -366,11 +377,6 @@ class EventsLoaderTestCase(WithAssetFinder,
                    # If we've seen event 1 but not event 2, event 1 should
                    # win.
                    self.assertEqual(computed_value, v1)
-                elif column.dtype == categorical_dtype:
-                    # XXX: The value in the output from pandas will be np.nan,
-                    #      but we currently only support None as the missing
-                    #      value for string columns.
-                    self.assertTrue(np.isnan(computed_value))
                else:
                    # If we haven't seen either event, then we should have
                    # column.missing_value.
@@ -408,11 +414,6 @@ class EventsLoaderTestCase(WithAssetFinder,
                    # If we've seen event 1 but not event 2, event 1 should
                    # win.
                    self.assertEqual(computed_value, v2)
-                elif column.dtype == categorical_dtype:
-                    # XXX: The value in the output from pandas will be np.nan,
-                    #      but we currently only support None as the missing
-                    #      value for string columns.
-                    self.assertTrue(np.isnan(computed_value))
                else:
                    # If we haven't seen either event, then we should have
                    # column.missing_value.
@@ -742,7 +742,7 @@ class SubDataSetTestCase(TestCase):
            window_length = 5
            inputs = [SomeDataSet.foo, SomeDataSet.bar]
            outputs = outputs_
-            missing_value = None if dtype_ is categorical_dtype else -1
+            missing_value = dtype_.type('123')

        expected_error = (
            "SomeClassifier does not support custom outputs, "
@@ -284,28 +284,11 @@ class LabelArray(ndarray):
        """
        if len(self.shape) > 1:
            raise ValueError("Can't convert a 2D array to a categorical.")
-
-        missing_code = self.reverse_categories[self.missing_value]
-        raw_codes = self.as_int_array()
-        # As of pandas 0.18, putting null values in pandas categoricals is
-        # deprecated. The preferred representation is to pass -1 as the code
-        # for missing values.
-        if missing_code == 0:
-            # This is just a performance optimization. It should produce the
-            # same results as below.
-            codes = raw_codes - 1
-            categories = self.categories[1:]
-        else:
-            # subtract 1 for anything greater than the missing code, and set
-            # the missing code to -1.
-            codes = raw_codes.copy()
-            codes[codes > missing_code] -= 1
-            codes[codes == missing_code] = -1
-            categories = self.categories[self.categories != self.missing_value]
-
        return pd.Categorical.from_codes(
-            codes,
-            categories,
+            self.as_int_array(),
+            # We need to make a copy because pandas >= 0.17 fails if this
+            # buffer isn't writeable.
+            self.categories.copy(),
            ordered=False,
            name=name,
        )
@@ -32,3 +32,7 @@ class TestingDataSet(DataSet):
        dtype=categorical_dtype,
        missing_value=None,
    )
+    categorical_default_NULL_string = Column(
+        dtype=categorical_dtype,
+        missing_value=u'<<NULL>>',
+    )
@@ -28,6 +28,7 @@ from zipline.errors import (
    WindowLengthNotSpecified,
 )
 from zipline.lib.adjusted_array import can_represent_dtype
+from zipline.lib.labelarray import LabelArray
 from zipline.utils.input_validation import expect_types
 from zipline.utils.memoize import lazyval
 from zipline.utils.numpy_utils import (
@@ -755,10 +756,12 @@ def _assert_valid_categorical_missing_value(value):

    Raises a TypeError if the value cannot be used as the missing_value for
    a categorical_dtype Term.
-
-    Currently, only None is supported as a missing value.
    """
-    if value is not None:
+    label_types = LabelArray.SUPPORTED_SCALAR_TYPES
+    if not isinstance(value, label_types):
        raise TypeError(
-            "Categorical terms must have missing values of None."
+            "Categorical terms must have missing values of type "
+            "{types}.".format(
+                types=' or '.join([t.__name__ for t in label_types]),
+            )
        )