From 2431aaefb58a16ec6b45c2801091936567fb096c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 10 May 2016 16:56:32 -0400 Subject: [PATCH 1/3] BUG: Fix bad error message for element_of. It referred to the wrong method name (`is_element`). --- tests/pipeline/test_classifier.py | 4 ++-- zipline/pipeline/classifiers/classifier.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index de111e97..3eb02020 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -434,7 +434,7 @@ class ClassifierTestCase(BasePipelineTestCase): errmsg = str(e.exception) expected = ( "Found self.missing_value ('not in the array') in choices" - " supplied to C.is_element().\n" + " supplied to C.element_of().\n" "Missing values have NaN semantics, so the requested" " comparison would always produce False.\n" "Use the isnull() method to check for missing values.\n" @@ -447,7 +447,7 @@ class ClassifierTestCase(BasePipelineTestCase): class C(Classifier): dtype = dtype_ - missing_value = '' + missing_value = dtype.type('1') inputs = () window_length = 0 diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py index 05a99593..6a64e106 100644 --- a/zipline/pipeline/classifiers/classifier.py +++ b/zipline/pipeline/classifiers/classifier.py @@ -246,7 +246,7 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): if self.missing_value in choices: raise ValueError( "Found self.missing_value ({mv!r}) in choices supplied to" - " {typename}.is_element().\n" + " {typename}.{meth_name}().\n" "Missing values have NaN semantics, so the" " requested comparison would always produce False.\n" "Use the isnull() method to check for missing values.\n" @@ -254,6 +254,7 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): mv=self.missing_value, typename=(type(self).__name__), choices=sorted(choices), + meth_name=self.element_of.__name__, ) ) From f7e9281b14a9f63f5067e5f155dccdda27a1992a Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 10 May 2016 12:40:36 -0400 Subject: [PATCH 2/3] BUG: Fix groupby with string columns. The previous algorithm assumed that the group labels were integers. It produced nonsense with LabelArrays (though sadly didn't crash because numpy promotes None and void to object). --- tests/pipeline/test_factor.py | 13 +++++++++++++ zipline/lib/labelarray.py | 4 ++++ zipline/pipeline/factors/factor.py | 26 +++++++++++++++++++------- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py index c5873c1f..4eee9f2e 100644 --- a/tests/pipeline/test_factor.py +++ b/tests/pipeline/test_factor.py @@ -23,6 +23,7 @@ from numpy import ( from numpy.random import randn, seed from zipline.errors import UnknownRankMethod +from zipline.lib.labelarray import LabelArray from zipline.lib.rank import masked_rankdata_2d from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply from zipline.pipeline import Classifier, Factor, Filter, TermGraph @@ -38,6 +39,7 @@ from zipline.testing import ( ) from zipline.utils.functional import dzip_exact from zipline.utils.numpy_utils import ( + categorical_dtype, datetime64ns_dtype, float64_dtype, int64_dtype, @@ -442,6 +444,7 @@ class FactorTestCase(BasePipelineTestCase): f = self.f m = Mask() c = C() + str_c = C(dtype=categorical_dtype) factor_data = array( [[1.0, 2.0, 3.0, 4.0], @@ -463,12 +466,18 @@ class FactorTestCase(BasePipelineTestCase): [1, 1, 2, 2]], dtype=int64_dtype, ) + string_classifier_data = LabelArray( + classifier_data.astype(str).astype(object), + missing_value=None, + ) terms = { 'vanilla': f.demean(), 'masked': f.demean(mask=m), 'grouped': f.demean(groupby=c), + 'grouped_str': f.demean(groupby=str_c), 'grouped_masked': f.demean(mask=m, groupby=c), + 'grouped_masked_str': f.demean(mask=m, groupby=str_c), } expected = { 'vanilla': array( @@ -496,6 +505,9 @@ class FactorTestCase(BasePipelineTestCase): [-0.500, 0.500, 0.000, nan]] ) } + # Changing the classifier dtype shouldn't affect anything. + expected['grouped_str'] = expected['grouped'] + expected['grouped_masked_str'] = expected['grouped_masked'] graph = TermGraph(terms) results = self.run_graph( @@ -503,6 +515,7 @@ class FactorTestCase(BasePipelineTestCase): initial_workspace={ f: factor_data, c: classifier_data, + str_c: string_classifier_data, m: filter_data, }, mask=self.build_mask(self.ones_mask(shape=factor_data.shape)), diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py index b707f0cd..a3ef5027 100644 --- a/zipline/lib/labelarray.py +++ b/zipline/lib/labelarray.py @@ -211,6 +211,10 @@ class LabelArray(ndarray): # This is a property because it should be immutable. return self._missing_value + @property + def missing_value_code(self): + return self.reverse_categories[self.missing_value] + def has_label(self, value): return value in self.reverse_categories diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py index a76002a9..6a0b0b08 100644 --- a/zipline/pipeline/factors/factor.py +++ b/zipline/pipeline/factors/factor.py @@ -47,6 +47,7 @@ from zipline.utils.input_validation import expect_types from zipline.utils.math_utils import nanmean, nanstd from zipline.utils.numpy_utils import ( bool_dtype, + categorical_dtype, coerce_to_dtype, datetime64ns_dtype, float64_dtype, @@ -939,15 +940,26 @@ class GroupedRowTransform(Factor): def _compute(self, arrays, dates, assets, mask): data = arrays[0] - null_group_value = self.inputs[1].missing_value - group_labels = where( - mask, - arrays[1], - null_group_value, - ) + groupby_expr = self.inputs[1] + if groupby_expr.dtype == int64_dtype: + group_labels = arrays[1] + null_label = self.inputs[1].missing_value + elif groupby_expr.dtype == categorical_dtype: + # Coerce our LabelArray into an isomorphic array of ints. This is + # necessary because np.where doesn't know about LabelArrays or the + # void dtype. + group_labels = arrays[1].as_int_array() + null_label = arrays[1].missing_value_code + else: + raise TypeError( + "Unexpected groupby dtype: %s." % groupby_expr.dtype + ) + + # Make a copy with the null code written to masked locations. + group_labels = where(mask, group_labels, null_label) return where( - group_labels != null_group_value, + group_labels != null_label, naive_grouped_rowwise_apply( data=data, group_labels=group_labels, From 8b1136d9d5ff4ec5ee59ed6baa4f9a74a9a03763 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 10 May 2016 18:03:21 -0400 Subject: [PATCH 3/3] ENH: Validate missing_values at term construction. Finds bugs in several bad tests that were constructing invalid terms. --- tests/pipeline/test_classifier.py | 4 +-- tests/pipeline/test_factor.py | 2 +- tests/pipeline/test_term.py | 23 +++++++++++++++ zipline/pipeline/term.py | 47 ++++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 4 deletions(-) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index 3eb02020..aba52225 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -8,7 +8,6 @@ from zipline.pipeline import Classifier from zipline.testing import parameter_space from zipline.utils.numpy_utils import ( categorical_dtype, - coerce_to_dtype, int64_dtype, ) @@ -162,7 +161,8 @@ class ClassifierTestCase(BasePipelineTestCase): dtype_=[int64_dtype, categorical_dtype], ) def test_disallow_comparison_to_missing_value(self, missing, dtype_): - missing = coerce_to_dtype(dtype_, missing) + if dtype_ == categorical_dtype: + missing = str(missing) class C(Classifier): dtype = dtype_ diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py index 4eee9f2e..fc4eef69 100644 --- a/tests/pipeline/test_factor.py +++ b/tests/pipeline/test_factor.py @@ -444,7 +444,7 @@ class FactorTestCase(BasePipelineTestCase): f = self.f m = Mask() c = C() - str_c = C(dtype=categorical_dtype) + str_c = C(dtype=categorical_dtype, missing_value=None) factor_data = array( [[1.0, 2.0, 3.0, 4.0], diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index 502d6e48..4b7bf0de 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -602,3 +602,26 @@ class SubDataSetTestCase(TestCase): with self.assertRaises(ValueError) as e: SomeClassifier() self.assertEqual(str(e.exception), expected_error) + + def test_unreasonable_missing_values(self): + + for base_type, dtype_, bad_mv in ((Factor, float64_dtype, 'ayy'), + (Filter, bool_dtype, 'lmao'), + (Classifier, int64_dtype, 'lolwut'), + (Classifier, categorical_dtype, 7)): + class SomeTerm(base_type): + inputs = () + window_length = 0 + missing_value = bad_mv + dtype = dtype_ + + with self.assertRaises(TypeError) as e: + SomeTerm() + + prefix = ( + "^Missing value {mv!r} is not a valid choice " + "for term SomeTerm with dtype {dtype}.\n\n" + "Coercion attempt failed with:" + ).format(mv=bad_mv, dtype=dtype_) + + self.assertRegexpMatches(str(e.exception), prefix) diff --git a/zipline/pipeline/term.py b/zipline/pipeline/term.py index 1bffc3e7..4b04d465 100644 --- a/zipline/pipeline/term.py +++ b/zipline/pipeline/term.py @@ -4,7 +4,7 @@ Base class for Filters, Factors and Classifiers from abc import ABCMeta, abstractproperty from weakref import WeakValueDictionary -from numpy import dtype as dtype_class, ndarray +from numpy import array, dtype as dtype_class, ndarray from six import with_metaclass from zipline.errors import ( DTypeNotSpecified, @@ -16,10 +16,12 @@ from zipline.errors import ( WindowLengthNotSpecified, ) from zipline.lib.adjusted_array import can_represent_dtype +from zipline.lib.labelarray import LabelArray from zipline.utils.input_validation import expect_types from zipline.utils.memoize import lazyval from zipline.utils.numpy_utils import ( bool_dtype, + categorical_dtype, default_missing_value_for_dtype, ) from zipline.utils.sentinel import sentinel @@ -177,6 +179,7 @@ class Term(with_metaclass(ABCMeta, object)): """ if dtype is NotSpecified: raise DTypeNotSpecified(termname=termname) + try: dtype = dtype_class(dtype) except TypeError: @@ -188,6 +191,31 @@ class Term(with_metaclass(ABCMeta, object)): if missing_value is NotSpecified: missing_value = default_missing_value_for_dtype(dtype) + try: + if (dtype == categorical_dtype): + # This check is necessary because we use object dtype for + # categoricals, and numpy will allow us to promote numerical + # values to object even though we don't support them. + _assert_valid_categorical_missing_value(missing_value) + + # For any other type, we can check if the missing_value is safe by + # making an array of that value and trying to safely convert it to + # the desired type. + # 'same_kind' allows casting between things like float32 and + # float64, but not str and int. + array([missing_value]).astype(dtype=dtype, casting='same_kind') + except TypeError as e: + raise TypeError( + "Missing value {value!r} is not a valid choice " + "for term {termname} with dtype {dtype}.\n\n" + "Coercion attempt failed with: {error}".format( + termname=termname, + value=missing_value, + dtype=dtype, + error=e, + ) + ) + return dtype, missing_value def __init__(self, *args, **kwargs): @@ -498,3 +526,20 @@ class ComputableTerm(Term): inputs=self.inputs, window_length=self.window_length, ) + + +def _assert_valid_categorical_missing_value(value): + """ + Check that value is a valid categorical missing_value. + + Raises a TypeError if the value is cannot be used as the missing_value for + a categorical_dtype Term. + """ + label_types = LabelArray.SUPPORTED_SCALAR_TYPES + if not isinstance(value, label_types): + raise TypeError( + "Categorical terms must have missing values of type " + "{types}.".format( + types=' or '.join([t.__name__ for t in label_types]), + ) + )