Revert "MAINT: Remove support for custom string Column missing values."

This reverts commit 1b1e842e2339d6d0ee40cdfe34dcd27b4e4a7c0c.
This commit is contained in:
Scott Sanderson
2016-07-21 17:19:51 -04:00
parent 16f4944232
commit 0ff13e7fdc
6 changed files with 38 additions and 46 deletions
+11 -10
View File
@@ -51,7 +51,7 @@ class ClassifierTestCase(BasePipelineTestCase):
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
@parameter_space(mv=[None])
@parameter_space(mv=['0', None])
def test_string_isnull(self, mv):
class C(Classifier):
@@ -126,7 +126,7 @@ class ClassifierTestCase(BasePipelineTestCase):
class C(Classifier):
dtype = categorical_dtype
missing_value = None
missing_value = ''
inputs = ()
window_length = 0
@@ -162,7 +162,7 @@ class ClassifierTestCase(BasePipelineTestCase):
)
def test_disallow_comparison_to_missing_value(self, missing, dtype_):
if dtype_ == categorical_dtype:
missing = None
missing = str(missing)
class C(Classifier):
dtype = dtype_
@@ -224,7 +224,7 @@ class ClassifierTestCase(BasePipelineTestCase):
class C(Classifier):
dtype = categorical_dtype
missing_value = None
missing_value = missing
inputs = ()
window_length = 0
@@ -245,7 +245,7 @@ class ClassifierTestCase(BasePipelineTestCase):
expected = (
(data.as_int_array() != data.reverse_categories.get(compval, -1)) &
(data.as_int_array() != data.reverse_categories[missing])
(data.as_int_array() != data.reverse_categories[C.missing_value])
)
self.check_terms(
@@ -271,6 +271,7 @@ class ClassifierTestCase(BasePipelineTestCase):
labelarray_dtype):
if labelarray_dtype == bytes_dtype:
compval = compval.encode('utf-8')
missing = missing.encode('utf-8')
startswith_re = b'^' + compval + b'.*'
endswith_re = b'.*' + compval + b'$'
@@ -282,7 +283,7 @@ class ClassifierTestCase(BasePipelineTestCase):
class C(Classifier):
dtype = categorical_dtype
missing_value = None
missing_value = missing
inputs = ()
window_length = 0
@@ -337,7 +338,7 @@ class ClassifierTestCase(BasePipelineTestCase):
class C(Classifier):
dtype = categorical_dtype
missing_value = None
missing_value = missing
inputs = ()
window_length = 0
@@ -417,7 +418,7 @@ class ClassifierTestCase(BasePipelineTestCase):
Test that element_of raises a useful error if we attempt to pass it an
array of choices that include the classifier's missing_value.
"""
missing = None
missing = "not in the array"
class C(Classifier):
dtype = categorical_dtype
@@ -432,7 +433,7 @@ class ClassifierTestCase(BasePipelineTestCase):
c.element_of(bad_elems)
errmsg = str(e.exception)
expected = (
"Found self.missing_value (None) in choices"
"Found self.missing_value ('not in the array') in choices"
" supplied to C.element_of().\n"
"Missing values have NaN semantics, so the requested"
" comparison would always produce False.\n"
@@ -446,7 +447,7 @@ class ClassifierTestCase(BasePipelineTestCase):
class C(Classifier):
dtype = dtype_
missing_value = None if dtype_ is categorical_dtype else -1
missing_value = dtype.type('1')
inputs = ()
window_length = 0
+11 -10
View File
@@ -55,6 +55,15 @@ class EventDataSet(DataSet):
previous_string = Column(dtype=categorical_dtype, missing_value=None)
next_string = Column(dtype=categorical_dtype, missing_value=None)
previous_string_custom_missing = Column(
dtype=categorical_dtype,
missing_value=u"<<NULL>>",
)
next_string_custom_missing = Column(
dtype=categorical_dtype,
missing_value=u"<<NULL>>",
)
critical_dates = pd.to_datetime([
'2014-01-05',
@@ -280,6 +289,7 @@ class EventsLoaderTestCase(WithAssetFinder,
EventDataSet.next_float: 'float',
EventDataSet.next_int: 'int',
EventDataSet.next_string: 'string',
EventDataSet.next_string_custom_missing: 'string'
}
cls.previous_value_columns = {
EventDataSet.previous_datetime: 'datetime',
@@ -287,6 +297,7 @@ class EventsLoaderTestCase(WithAssetFinder,
EventDataSet.previous_float: 'float',
EventDataSet.previous_int: 'int',
EventDataSet.previous_string: 'string',
EventDataSet.previous_string_custom_missing: 'string'
}
cls.loader = cls.make_loader(
events=cls.raw_events,
@@ -366,11 +377,6 @@ class EventsLoaderTestCase(WithAssetFinder,
# If we've seen event 1 but not event 2, event 1 should
# win.
self.assertEqual(computed_value, v1)
elif column.dtype == categorical_dtype:
# XXX: The value in the output from pandas will be np.nan,
# but we currently only support None as the missing
# value for string columns.
self.assertTrue(np.isnan(computed_value))
else:
# If we haven't seen either event, then we should have
# column.missing_value.
@@ -408,11 +414,6 @@ class EventsLoaderTestCase(WithAssetFinder,
# If we've seen event 1 but not event 2, event 1 should
# win.
self.assertEqual(computed_value, v2)
elif column.dtype == categorical_dtype:
# XXX: The value in the output from pandas will be np.nan,
# but we currently only support None as the missing
# value for string columns.
self.assertTrue(np.isnan(computed_value))
else:
# If we haven't seen either event, then we should have
# column.missing_value.
+1 -1
View File
@@ -742,7 +742,7 @@ class SubDataSetTestCase(TestCase):
window_length = 5
inputs = [SomeDataSet.foo, SomeDataSet.bar]
outputs = outputs_
missing_value = None if dtype_ is categorical_dtype else -1
missing_value = dtype_.type('123')
expected_error = (
"SomeClassifier does not support custom outputs, "
+4 -21
View File
@@ -284,28 +284,11 @@ class LabelArray(ndarray):
"""
if len(self.shape) > 1:
raise ValueError("Can't convert a 2D array to a categorical.")
missing_code = self.reverse_categories[self.missing_value]
raw_codes = self.as_int_array()
# As of pandas 0.18, putting null values in pandas categoricals is
# deprecated. The preferred representation is to pass -1 as the code
# for missing values.
if missing_code == 0:
# This is just a performance optimization. It should produce the
# same results as below.
codes = raw_codes - 1
categories = self.categories[1:]
else:
# subtract 1 for anything greater than the missing code, and set
# the missing code to -1.
codes = raw_codes.copy()
codes[codes > missing_code] -= 1
codes[codes == missing_code] = -1
categories = self.categories[self.categories != self.missing_value]
return pd.Categorical.from_codes(
codes,
categories,
self.as_int_array(),
# We need to make a copy because pandas >= 0.17 fails if this
# buffer isn't writeable.
self.categories.copy(),
ordered=False,
name=name,
)
+4
View File
@@ -32,3 +32,7 @@ class TestingDataSet(DataSet):
dtype=categorical_dtype,
missing_value=None,
)
categorical_default_NULL_string = Column(
dtype=categorical_dtype,
missing_value=u'<<NULL>>',
)
+7 -4
View File
@@ -28,6 +28,7 @@ from zipline.errors import (
WindowLengthNotSpecified,
)
from zipline.lib.adjusted_array import can_represent_dtype
from zipline.lib.labelarray import LabelArray
from zipline.utils.input_validation import expect_types
from zipline.utils.memoize import lazyval
from zipline.utils.numpy_utils import (
@@ -755,10 +756,12 @@ def _assert_valid_categorical_missing_value(value):
Raises a TypeError if the value cannot be used as the missing_value for
a categorical_dtype Term.
Currently, only None is supported as a missing value.
"""
if value is not None:
label_types = LabelArray.SUPPORTED_SCALAR_TYPES
if not isinstance(value, label_types):
raise TypeError(
"Categorical terms must have missing values of None."
"Categorical terms must have missing values of type "
"{types}.".format(
types=' or '.join([t.__name__ for t in label_types]),
)
)