mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-30 08:44:01 +08:00
Revert "MAINT: Remove support for custom string Column missing values."
This reverts commit 1b1e842e2339d6d0ee40cdfe34dcd27b4e4a7c0c.
This commit is contained in:
@@ -51,7 +51,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
mask=self.build_mask(self.ones_mask(shape=data.shape)),
|
||||
)
|
||||
|
||||
@parameter_space(mv=[None])
|
||||
@parameter_space(mv=['0', None])
|
||||
def test_string_isnull(self, mv):
|
||||
|
||||
class C(Classifier):
|
||||
@@ -126,7 +126,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
|
||||
class C(Classifier):
|
||||
dtype = categorical_dtype
|
||||
missing_value = None
|
||||
missing_value = ''
|
||||
inputs = ()
|
||||
window_length = 0
|
||||
|
||||
@@ -162,7 +162,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
)
|
||||
def test_disallow_comparison_to_missing_value(self, missing, dtype_):
|
||||
if dtype_ == categorical_dtype:
|
||||
missing = None
|
||||
missing = str(missing)
|
||||
|
||||
class C(Classifier):
|
||||
dtype = dtype_
|
||||
@@ -224,7 +224,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
|
||||
class C(Classifier):
|
||||
dtype = categorical_dtype
|
||||
missing_value = None
|
||||
missing_value = missing
|
||||
inputs = ()
|
||||
window_length = 0
|
||||
|
||||
@@ -245,7 +245,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
|
||||
expected = (
|
||||
(data.as_int_array() != data.reverse_categories.get(compval, -1)) &
|
||||
(data.as_int_array() != data.reverse_categories[missing])
|
||||
(data.as_int_array() != data.reverse_categories[C.missing_value])
|
||||
)
|
||||
|
||||
self.check_terms(
|
||||
@@ -271,6 +271,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
labelarray_dtype):
|
||||
if labelarray_dtype == bytes_dtype:
|
||||
compval = compval.encode('utf-8')
|
||||
missing = missing.encode('utf-8')
|
||||
|
||||
startswith_re = b'^' + compval + b'.*'
|
||||
endswith_re = b'.*' + compval + b'$'
|
||||
@@ -282,7 +283,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
|
||||
class C(Classifier):
|
||||
dtype = categorical_dtype
|
||||
missing_value = None
|
||||
missing_value = missing
|
||||
inputs = ()
|
||||
window_length = 0
|
||||
|
||||
@@ -337,7 +338,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
|
||||
class C(Classifier):
|
||||
dtype = categorical_dtype
|
||||
missing_value = None
|
||||
missing_value = missing
|
||||
inputs = ()
|
||||
window_length = 0
|
||||
|
||||
@@ -417,7 +418,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
Test that element_of raises a useful error if we attempt to pass it an
|
||||
array of choices that include the classifier's missing_value.
|
||||
"""
|
||||
missing = None
|
||||
missing = "not in the array"
|
||||
|
||||
class C(Classifier):
|
||||
dtype = categorical_dtype
|
||||
@@ -432,7 +433,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
c.element_of(bad_elems)
|
||||
errmsg = str(e.exception)
|
||||
expected = (
|
||||
"Found self.missing_value (None) in choices"
|
||||
"Found self.missing_value ('not in the array') in choices"
|
||||
" supplied to C.element_of().\n"
|
||||
"Missing values have NaN semantics, so the requested"
|
||||
" comparison would always produce False.\n"
|
||||
@@ -446,7 +447,7 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
|
||||
class C(Classifier):
|
||||
dtype = dtype_
|
||||
missing_value = None if dtype_ is categorical_dtype else -1
|
||||
missing_value = dtype.type('1')
|
||||
inputs = ()
|
||||
window_length = 0
|
||||
|
||||
|
||||
@@ -55,6 +55,15 @@ class EventDataSet(DataSet):
|
||||
previous_string = Column(dtype=categorical_dtype, missing_value=None)
|
||||
next_string = Column(dtype=categorical_dtype, missing_value=None)
|
||||
|
||||
previous_string_custom_missing = Column(
|
||||
dtype=categorical_dtype,
|
||||
missing_value=u"<<NULL>>",
|
||||
)
|
||||
next_string_custom_missing = Column(
|
||||
dtype=categorical_dtype,
|
||||
missing_value=u"<<NULL>>",
|
||||
)
|
||||
|
||||
|
||||
critical_dates = pd.to_datetime([
|
||||
'2014-01-05',
|
||||
@@ -280,6 +289,7 @@ class EventsLoaderTestCase(WithAssetFinder,
|
||||
EventDataSet.next_float: 'float',
|
||||
EventDataSet.next_int: 'int',
|
||||
EventDataSet.next_string: 'string',
|
||||
EventDataSet.next_string_custom_missing: 'string'
|
||||
}
|
||||
cls.previous_value_columns = {
|
||||
EventDataSet.previous_datetime: 'datetime',
|
||||
@@ -287,6 +297,7 @@ class EventsLoaderTestCase(WithAssetFinder,
|
||||
EventDataSet.previous_float: 'float',
|
||||
EventDataSet.previous_int: 'int',
|
||||
EventDataSet.previous_string: 'string',
|
||||
EventDataSet.previous_string_custom_missing: 'string'
|
||||
}
|
||||
cls.loader = cls.make_loader(
|
||||
events=cls.raw_events,
|
||||
@@ -366,11 +377,6 @@ class EventsLoaderTestCase(WithAssetFinder,
|
||||
# If we've seen event 1 but not event 2, event 1 should
|
||||
# win.
|
||||
self.assertEqual(computed_value, v1)
|
||||
elif column.dtype == categorical_dtype:
|
||||
# XXX: The value in the output from pandas will be np.nan,
|
||||
# but we currently only support None as the missing
|
||||
# value for string columns.
|
||||
self.assertTrue(np.isnan(computed_value))
|
||||
else:
|
||||
# If we haven't seen either event, then we should have
|
||||
# column.missing_value.
|
||||
@@ -408,11 +414,6 @@ class EventsLoaderTestCase(WithAssetFinder,
|
||||
# If we've seen event 1 but not event 2, event 1 should
|
||||
# win.
|
||||
self.assertEqual(computed_value, v2)
|
||||
elif column.dtype == categorical_dtype:
|
||||
# XXX: The value in the output from pandas will be np.nan,
|
||||
# but we currently only support None as the missing
|
||||
# value for string columns.
|
||||
self.assertTrue(np.isnan(computed_value))
|
||||
else:
|
||||
# If we haven't seen either event, then we should have
|
||||
# column.missing_value.
|
||||
|
||||
@@ -742,7 +742,7 @@ class SubDataSetTestCase(TestCase):
|
||||
window_length = 5
|
||||
inputs = [SomeDataSet.foo, SomeDataSet.bar]
|
||||
outputs = outputs_
|
||||
missing_value = None if dtype_ is categorical_dtype else -1
|
||||
missing_value = dtype_.type('123')
|
||||
|
||||
expected_error = (
|
||||
"SomeClassifier does not support custom outputs, "
|
||||
|
||||
@@ -284,28 +284,11 @@ class LabelArray(ndarray):
|
||||
"""
|
||||
if len(self.shape) > 1:
|
||||
raise ValueError("Can't convert a 2D array to a categorical.")
|
||||
|
||||
missing_code = self.reverse_categories[self.missing_value]
|
||||
raw_codes = self.as_int_array()
|
||||
# As of pandas 0.18, putting null values in pandas categoricals is
|
||||
# deprecated. The preferred representation is to pass -1 as the code
|
||||
# for missing values.
|
||||
if missing_code == 0:
|
||||
# This is just a performance optimization. It should produce the
|
||||
# same results as below.
|
||||
codes = raw_codes - 1
|
||||
categories = self.categories[1:]
|
||||
else:
|
||||
# subtract 1 for anything greater than the missing code, and set
|
||||
# the missing code to -1.
|
||||
codes = raw_codes.copy()
|
||||
codes[codes > missing_code] -= 1
|
||||
codes[codes == missing_code] = -1
|
||||
categories = self.categories[self.categories != self.missing_value]
|
||||
|
||||
return pd.Categorical.from_codes(
|
||||
codes,
|
||||
categories,
|
||||
self.as_int_array(),
|
||||
# We need to make a copy because pandas >= 0.17 fails if this
|
||||
# buffer isn't writeable.
|
||||
self.categories.copy(),
|
||||
ordered=False,
|
||||
name=name,
|
||||
)
|
||||
|
||||
@@ -32,3 +32,7 @@ class TestingDataSet(DataSet):
|
||||
dtype=categorical_dtype,
|
||||
missing_value=None,
|
||||
)
|
||||
categorical_default_NULL_string = Column(
|
||||
dtype=categorical_dtype,
|
||||
missing_value=u'<<NULL>>',
|
||||
)
|
||||
|
||||
@@ -28,6 +28,7 @@ from zipline.errors import (
|
||||
WindowLengthNotSpecified,
|
||||
)
|
||||
from zipline.lib.adjusted_array import can_represent_dtype
|
||||
from zipline.lib.labelarray import LabelArray
|
||||
from zipline.utils.input_validation import expect_types
|
||||
from zipline.utils.memoize import lazyval
|
||||
from zipline.utils.numpy_utils import (
|
||||
@@ -755,10 +756,12 @@ def _assert_valid_categorical_missing_value(value):
|
||||
|
||||
Raises a TypeError if the value cannot be used as the missing_value for
|
||||
a categorical_dtype Term.
|
||||
|
||||
Currently, only None is supported as a missing value.
|
||||
"""
|
||||
if value is not None:
|
||||
label_types = LabelArray.SUPPORTED_SCALAR_TYPES
|
||||
if not isinstance(value, label_types):
|
||||
raise TypeError(
|
||||
"Categorical terms must have missing values of None."
|
||||
"Categorical terms must have missing values of type "
|
||||
"{types}.".format(
|
||||
types=' or '.join([t.__name__ for t in label_types]),
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user