diff --git a/setup.py b/setup.py index 78063820..e12e0b49 100644 --- a/setup.py +++ b/setup.py @@ -81,11 +81,13 @@ class LazyBuildExtCommandClass(dict): ext_modules = [ Extension('zipline.assets._assets', ['zipline/assets/_assets.pyx']), Extension('zipline.lib.adjustment', ['zipline/lib/adjustment.pyx']), + Extension('zipline.lib._factorize', ['zipline/lib/_factorize.pyx']), Extension( 'zipline.lib._float64window', ['zipline/lib/_float64window.pyx'] ), Extension('zipline.lib._int64window', ['zipline/lib/_int64window.pyx']), Extension('zipline.lib._uint8window', ['zipline/lib/_uint8window.pyx']), + Extension('zipline.lib._labelwindow', ['zipline/lib/_labelwindow.pyx']), Extension('zipline.lib.rank', ['zipline/lib/rank.pyx']), Extension('zipline.data._equities', ['zipline/data/_equities.pyx']), Extension('zipline.data._adjustments', ['zipline/data/_adjustments.pyx']), diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py index a2a8cde0..11fb5130 100644 --- a/tests/pipeline/test_adjusted_array.py +++ b/tests/pipeline/test_adjusted_array.py @@ -9,19 +9,23 @@ from nose_parameterized import parameterized from numpy import ( arange, array, + asarray, + dtype, full, where, ) -from numpy.testing import assert_array_equal from six.moves import zip_longest +from toolz import curry from zipline.errors import WindowLengthNotPositive, WindowLengthTooLong from zipline.lib.adjustment import ( Datetime64Overwrite, Float64Multiply, Float64Overwrite, + ObjectOverwrite, ) from zipline.lib.adjusted_array import AdjustedArray, NOMASK +from zipline.lib.labelarray import LabelArray from zipline.testing import check_arrays, parameter_space from zipline.utils.numpy_utils import ( coerce_to_dtype, @@ -29,6 +33,7 @@ from zipline.utils.numpy_utils import ( default_missing_value_for_dtype, float64_dtype, int64_dtype, + object_dtype, ) @@ -62,12 +67,41 @@ def valid_window_lengths(underlying_buffer_length): return iter(range(1, underlying_buffer_length + 1)) -def _gen_unadjusted_cases(dtype): +@curry +def as_dtype(dtype, data): + """ + Curried wrapper around array.astype for when you have the dtype before you + have the data. + """ + return asarray(data).astype(dtype) + +@curry +def as_labelarray(initial_dtype, missing_value, array): + """ + Curried wrapper around LabelArray, that round-trips the input data through + `initial_dtype` first. + """ + return LabelArray( + array.astype(initial_dtype), + missing_value=initial_dtype.type(''), + ) + + +bytes_dtype = dtype('S3') +unicode_dtype = dtype('U3') + + +def _gen_unadjusted_cases(name, + make_input, + make_expected_output, + missing_value): nrows = 6 ncols = 3 - data = arange(nrows * ncols).astype(dtype).reshape(nrows, ncols) - missing_value = default_missing_value_for_dtype(dtype) + + raw_data = arange(nrows * ncols).reshape(nrows, ncols) + input_array = make_input(raw_data) + expected_output_array = make_expected_output(raw_data) for windowlen in valid_window_lengths(nrows): @@ -76,13 +110,13 @@ def _gen_unadjusted_cases(dtype): ) yield ( - "dtype_%s_length_%d" % (dtype, windowlen), - data, + "%s_length_%d" % (name, windowlen), + input_array, windowlen, {}, missing_value, [ - data[offset:offset + windowlen] + expected_output_array[offset:offset + windowlen] for offset in range(num_legal_windows) ], ) @@ -156,84 +190,125 @@ def _gen_multiplicative_adjustment_cases(dtype): [1, 6, 1], [1, 1, 1]], dtype=dtype) - return _gen_expectations(baseline, adjustments, buffer_as_of, nrows) - - -def _gen_overwrite_adjustment_cases(dtype): - """ - Generate test cases for overwrite adjustments. - - The algorithm used here is the same as the one used above for - multiplicative adjustments. The only difference is the semantics of how - the adjustments are expected to modify the arrays. - """ - adjustment_type = { - float64_dtype: Float64Overwrite, - datetime64ns_dtype: Datetime64Overwrite, - }[dtype] - - nrows, ncols = 6, 3 - adjustments = {} - buffer_as_of = [None] * 6 - baseline = buffer_as_of[0] = full((nrows, ncols), 2, dtype=dtype) - - # Note that row indices are inclusive! - adjustments[1] = [ - adjustment_type(0, 0, 0, 0, coerce_to_dtype(dtype, 1)), - ] - buffer_as_of[1] = array([[1, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]], dtype=dtype) - - # No adjustment at index 2. - buffer_as_of[2] = buffer_as_of[1] - - adjustments[3] = [ - adjustment_type(1, 2, 1, 1, coerce_to_dtype(dtype, 3)), - adjustment_type(0, 1, 0, 0, coerce_to_dtype(dtype, 4)), - ] - buffer_as_of[3] = array([[4, 2, 2], - [4, 3, 2], - [2, 3, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]], dtype=dtype) - - adjustments[4] = [ - adjustment_type(0, 3, 2, 2, coerce_to_dtype(dtype, 5)) - ] - buffer_as_of[4] = array([[4, 2, 5], - [4, 3, 5], - [2, 3, 5], - [2, 2, 5], - [2, 2, 2], - [2, 2, 2]], dtype=dtype) - - adjustments[5] = [ - adjustment_type(0, 4, 1, 1, coerce_to_dtype(dtype, 6)), - adjustment_type(2, 2, 2, 2, coerce_to_dtype(dtype, 7)), - ] - buffer_as_of[5] = array([[4, 6, 5], - [4, 6, 5], - [2, 6, 7], - [2, 6, 5], - [2, 6, 2], - [2, 2, 2]], dtype=dtype) - return _gen_expectations( baseline, + default_missing_value_for_dtype(dtype), adjustments, buffer_as_of, nrows, ) -def _gen_expectations(baseline, adjustments, buffer_as_of, nrows): +def _gen_overwrite_adjustment_cases(name, + make_input, + make_expected_output, + dtype, + missing_value): + """ + Generate test cases for overwrite adjustments. + + The algorithm used here is the same as the one used above for + multiplicative adjustments. The only difference is the semantics of how + the adjustments are expected to modify the arrays. + + This is parameterized on `make_input` and make_expected_output functions, + which take 2-D lists of values and transform them into desired input/output + arrays. We do this so that we can easily test both vanilla numpy ndarrays + and our own LabelArray class for strings. + """ + adjustment_type = { + float64_dtype: Float64Overwrite, + datetime64ns_dtype: Datetime64Overwrite, + bytes_dtype: ObjectOverwrite, + unicode_dtype: ObjectOverwrite, + object_dtype: ObjectOverwrite, + }[dtype] + + if dtype == object_dtype: + # When we're testing object dtypes, we expect to have strings, but + # coerce_to_dtype(object, 3) just gives 3 as a Python integer. + def make_overwrite_value(dtype, value): + return str(value) + else: + make_overwrite_value = coerce_to_dtype + + adjustments = {} + buffer_as_of = [None] * 6 + baseline = make_input([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + buffer_as_of[0] = make_expected_output([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + # Note that row indices are inclusive! + adjustments[1] = [ + adjustment_type(0, 0, 0, 0, make_overwrite_value(dtype, 1)), + ] + buffer_as_of[1] = make_expected_output([[1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + # No adjustment at index 2. + buffer_as_of[2] = buffer_as_of[1] + + adjustments[3] = [ + adjustment_type(1, 2, 1, 1, make_overwrite_value(dtype, 3)), + adjustment_type(0, 1, 0, 0, make_overwrite_value(dtype, 4)), + ] + buffer_as_of[3] = make_expected_output([[4, 2, 2], + [4, 3, 2], + [2, 3, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + adjustments[4] = [ + adjustment_type(0, 3, 2, 2, make_overwrite_value(dtype, 5)) + ] + buffer_as_of[4] = make_expected_output([[4, 2, 5], + [4, 3, 5], + [2, 3, 5], + [2, 2, 5], + [2, 2, 2], + [2, 2, 2]]) + + adjustments[5] = [ + adjustment_type(0, 4, 1, 1, make_overwrite_value(dtype, 6)), + adjustment_type(2, 2, 2, 2, make_overwrite_value(dtype, 7)), + ] + buffer_as_of[5] = make_expected_output([[4, 6, 5], + [4, 6, 5], + [2, 6, 7], + [2, 6, 5], + [2, 6, 2], + [2, 2, 2]]) + + return _gen_expectations( + baseline, + missing_value, + adjustments, + buffer_as_of, + nrows=6, + ) + + +def _gen_expectations(baseline, + missing_value, + adjustments, + buffer_as_of, + nrows): - missing_value = default_missing_value_for_dtype(baseline.dtype) for windowlen in valid_window_lengths(nrows): num_legal_windows = num_windows_of_length_M_on_buffers_of_length_N( @@ -263,8 +338,60 @@ class AdjustedArrayTestCase(TestCase): @parameterized.expand( chain( - _gen_unadjusted_cases(float64_dtype), - _gen_unadjusted_cases(datetime64ns_dtype), + _gen_unadjusted_cases( + 'float', + make_input=as_dtype(float64_dtype), + make_expected_output=as_dtype(float64_dtype), + missing_value=default_missing_value_for_dtype(float64_dtype), + ), + _gen_unadjusted_cases( + 'datetime', + make_input=as_dtype(datetime64ns_dtype), + make_expected_output=as_dtype(datetime64ns_dtype), + missing_value=default_missing_value_for_dtype( + datetime64ns_dtype + ), + ), + # Test passing an array of strings to AdjustedArray. + _gen_unadjusted_cases( + 'bytes_ndarray', + make_input=as_dtype(bytes_dtype), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_ndarray', + make_input=as_dtype(unicode_dtype), + make_expected_output=as_labelarray(unicode_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_ndarray', + make_input=lambda a: a.astype(str).astype(object), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + # Test passing a LabelArray directly to AdjustedArray. + _gen_unadjusted_cases( + 'bytes_labelarray', + make_input=as_labelarray(bytes_dtype, b''), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_labelarray', + make_input=as_labelarray(unicode_dtype, u''), + make_expected_output=as_labelarray(bytes_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_labelarray', + make_input=( + lambda a: LabelArray(a.astype(str).astype(object), b'') + ), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), ) ) def test_no_adjustments(self, @@ -273,14 +400,13 @@ class AdjustedArrayTestCase(TestCase): lookback, adjustments, missing_value, - expected): + expected_output): array = AdjustedArray(data, NOMASK, adjustments, missing_value) for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. - window_iter = array.traverse(lookback) - for yielded, expected_yield in zip_longest(window_iter, expected): - self.assertEqual(yielded.dtype, data.dtype) - assert_array_equal(yielded, expected_yield) + in_out = zip(array.traverse(lookback), expected_output) + for yielded, expected_yield in in_out: + check_arrays(yielded, expected_yield) @parameterized.expand(_gen_multiplicative_adjustment_cases(float64_dtype)) def test_multiplicative_adjustments(self, @@ -295,12 +421,70 @@ class AdjustedArrayTestCase(TestCase): for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): - assert_array_equal(yielded, expected_yield) + check_arrays(yielded, expected_yield) @parameterized.expand( chain( - _gen_overwrite_adjustment_cases(float64_dtype), - _gen_overwrite_adjustment_cases(datetime64ns_dtype), + _gen_overwrite_adjustment_cases( + 'float', + make_input=as_dtype(float64_dtype), + make_expected_output=as_dtype(float64_dtype), + dtype=float64_dtype, + missing_value=default_missing_value_for_dtype(float64_dtype), + ), + _gen_overwrite_adjustment_cases( + 'datetime', + make_input=as_dtype(datetime64ns_dtype), + make_expected_output=as_dtype(datetime64ns_dtype), + dtype=datetime64ns_dtype, + missing_value=default_missing_value_for_dtype( + datetime64ns_dtype, + ), + ), + # There are six cases here: + # Using np.bytes/np.unicode/python string arrays as inputs. + # Passing np.bytes/np.unicode/python string arrays to LabelArray, + # and using those as input. + # + # The outputs should always be LabelArrays. + _gen_unadjusted_cases( + 'bytes_ndarray', + make_input=as_dtype(bytes_dtype), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_ndarray', + make_input=as_dtype(unicode_dtype), + make_expected_output=as_labelarray(unicode_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_ndarray', + make_input=lambda a: a.astype(str).astype(object), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'bytes_labelarray', + make_input=as_labelarray(bytes_dtype, b''), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_labelarray', + make_input=as_labelarray(unicode_dtype, u''), + make_expected_output=as_labelarray(bytes_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_labelarray', + make_input=( + lambda a: LabelArray(a.astype(str).astype(object), b'') + ), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), ) ) def test_overwrite_adjustment_cases(self, @@ -314,11 +498,15 @@ class AdjustedArrayTestCase(TestCase): for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): - self.assertEqual(yielded.dtype, data.dtype) - assert_array_equal(yielded, expected_yield) + check_arrays(yielded, expected_yield) @parameter_space( - dtype=[float64_dtype, int64_dtype, datetime64ns_dtype], + __fail_fast=True, + dtype=[ + float64_dtype, + int64_dtype, + datetime64ns_dtype, + ], missing_value=[0, 10000], window_length=[2, 3], ) @@ -341,6 +529,37 @@ class AdjustedArrayTestCase(TestCase): for expected, actual in zip(gen_expected, gen_actual): check_arrays(expected, actual) + @parameter_space( + __fail_fast=True, + dtype=[bytes_dtype, unicode_dtype, object_dtype], + missing_value=["0", "-1", ""], + window_length=[2, 3], + ) + def test_masking_with_strings(self, dtype, missing_value, window_length): + missing_value = coerce_to_dtype(dtype, missing_value) + baseline_ints = arange(15).reshape(5, 3) + + # Coerce to string first so that coercion to object gets us an array of + # string objects. + baseline = baseline_ints.astype(str).astype(dtype) + mask = (baseline_ints % 2).astype(bool) + + masked_baseline = LabelArray(baseline, missing_value=missing_value) + masked_baseline[~mask] = missing_value + + array = AdjustedArray( + baseline, + mask, + adjustments={}, + missing_value=missing_value, + ) + + gen_expected = moving_window(masked_baseline, window_length) + gen_actual = array.traverse(window_length=window_length) + + for expected, actual in zip(gen_expected, gen_actual): + check_arrays(expected, actual) + def test_invalid_lookback(self): data = arange(30, dtype=float).reshape(6, 5) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index a4b25178..77f76079 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -1,12 +1,21 @@ import numpy as np +from zipline.lib.labelarray import LabelArray from zipline.pipeline import Classifier from zipline.testing import parameter_space -from zipline.utils.numpy_utils import int64_dtype +from zipline.utils.numpy_utils import ( + categorical_dtype, + coerce_to_dtype, + int64_dtype, +) from .base import BasePipelineTestCase +bytes_dtype = np.dtype('S3') +unicode_dtype = np.dtype('U3') + + class ClassifierTestCase(BasePipelineTestCase): @parameter_space(mv=[-1, 0, 1, 999]) @@ -69,10 +78,56 @@ class ClassifierTestCase(BasePipelineTestCase): mask=self.build_mask(self.ones_mask(shape=data.shape)), ) - @parameter_space(missing=[-1, 0, 1]) - def test_disallow_comparison_to_missing_value(self, missing): + @parameter_space( + __fail_fast=True, + compval=['a', 'ab', 'not in the array'], + labelarray_dtype=(bytes_dtype, categorical_dtype, unicode_dtype), + ) + def test_string_eq(self, compval, labelarray_dtype): + + compval = labelarray_dtype.type(compval) + class C(Classifier): - dtype = int64_dtype + dtype = categorical_dtype + missing_value = '' + inputs = () + window_length = 0 + + c = C() + + # There's no significance to the values here other than that they + # contain a mix of the comparison value and other values. + data = LabelArray( + np.asarray( + [['', 'a', 'ab', 'ba'], + ['z', 'ab', 'a', 'ab'], + ['aa', 'ab', '', 'ab'], + ['aa', 'a', 'ba', 'ba']], + dtype=labelarray_dtype, + ), + missing_value='', + ) + + self.check_terms( + terms={ + 'eq': c.eq(compval), + }, + expected={ + 'eq': (data == compval), + }, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + + @parameter_space( + missing=[-1, 0, 1], + dtype_=[int64_dtype, categorical_dtype], + ) + def test_disallow_comparison_to_missing_value(self, missing, dtype_): + missing = coerce_to_dtype(dtype_, missing) + + class C(Classifier): + dtype = dtype_ missing_value = missing inputs = () window_length = 0 @@ -82,7 +137,7 @@ class ClassifierTestCase(BasePipelineTestCase): errmsg = str(e.exception) self.assertEqual( errmsg, - "Comparison against self.missing_value ({v}) in C.eq().\n" + "Comparison against self.missing_value ({v!r}) in C.eq().\n" "Missing values have NaN semantics, so the requested comparison" " would always produce False.\n" "Use the isnull() method to check for missing values.".format( @@ -118,3 +173,100 @@ class ClassifierTestCase(BasePipelineTestCase): initial_workspace={c: data}, mask=self.build_mask(self.ones_mask(shape=data.shape)), ) + + @parameter_space( + __fail_fast=True, + compval=['a', 'ab', '', 'not in the array'], + missing=['a', 'ab', '', 'not in the array'], + labelarray_dtype=(bytes_dtype, unicode_dtype, categorical_dtype), + ) + def test_string_not_equal(self, compval, missing, labelarray_dtype): + + compval = labelarray_dtype.type(compval) + + class C(Classifier): + dtype = categorical_dtype + missing_value = missing + inputs = () + window_length = 0 + + c = C() + + # There's no significance to the values here other than that they + # contain a mix of the comparison value and other values. + data = LabelArray( + np.asarray( + [['', 'a', 'ab', 'ba'], + ['z', 'ab', 'a', 'ab'], + ['aa', 'ab', '', 'ab'], + ['aa', 'a', 'ba', 'ba']], + dtype=labelarray_dtype, + ), + missing_value=missing, + ) + + expected = ( + (data.as_int_array() != data.reverse_categories.get(compval, -1)) & + (data.as_int_array() != data.reverse_categories[C.missing_value]) + ) + + self.check_terms( + terms={ + 'ne': c != compval, + }, + expected={ + 'ne': expected, + }, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + + @parameter_space( + __fail_fast=True, + compval=['a', 'b', 'ab', 'not in the array'], + missing=['a', 'ab', '', 'not in the array'], + labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype), + ) + def test_string_elementwise_predicates(self, + compval, + missing, + labelarray_dtype): + + missing = labelarray_dtype.type(missing) + compval = labelarray_dtype.type(compval) + + class C(Classifier): + dtype = categorical_dtype + missing_value = missing + inputs = () + window_length = 0 + + c = C() + + # There's no significance to the values here other than that they + # contain a mix of the comparison value and other values. + data = LabelArray( + np.asarray( + [['', 'a', 'ab', 'ba'], + ['z', 'ab', 'a', 'ab'], + ['aa', 'ab', '', 'ab'], + ['aa', 'a', 'ba', 'ba']], + dtype=labelarray_dtype, + ), + missing_value=missing, + ) + + self.check_terms( + terms={ + 'startswith': c.startswith(compval), + 'endswith': c.endswith(compval), + 'contains': c.contains(compval), + }, + expected={ + 'startswith': (data.startswith(compval) & (data != missing)), + 'endswith': (data.endswith(compval) & (data != missing)), + 'contains': (data.contains(compval) & (data != missing)), + }, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) diff --git a/tests/pipeline/test_column.py b/tests/pipeline/test_column.py index d270dc3b..717fc53d 100644 --- a/tests/pipeline/test_column.py +++ b/tests/pipeline/test_column.py @@ -7,6 +7,7 @@ from unittest import TestCase from pandas import date_range, DataFrame from pandas.util.testing import assert_frame_equal +from zipline.lib.labelarray import LabelArray from zipline.pipeline import Pipeline from zipline.pipeline.data.testing import TestingDataSet as TDS from zipline.testing import chrange, temp_pipeline_engine @@ -35,6 +36,21 @@ class LatestTestCase(TestCase): def expected_latest(self, column, slice_): loader = self.engine.get_loader(column) + + index = self.calendar[slice_] + columns = self.assets + values = loader.values(column.dtype, self.calendar, self.sids)[slice_] + + if column.dtype.kind in ('O', 'S', 'U'): + # For string columns, we expect a categorical in the output. + return LabelArray( + values, + missing_value=column.missing_value, + ).as_categorical_frame( + index=index, + columns=columns, + ) + return DataFrame( loader.values(column.dtype, self.calendar, self.sids)[slice_], index=self.calendar[slice_], @@ -55,6 +71,6 @@ class LatestTestCase(TestCase): dates_to_test[-1], ) for column in columns: - float_result = result[column.name].unstack() - expected_float_result = self.expected_latest(column, cal_slice) - assert_frame_equal(float_result, expected_float_result) + col_result = result[column.name].unstack() + expected_col_result = self.expected_latest(column, cal_slice) + assert_frame_equal(col_result, expected_col_result) diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index 04f35ed6..408cb689 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -471,7 +471,8 @@ class ObjectIdentityTestCase(TestCase): for column in TestingDataSet.columns: if column.dtype == bool_dtype: self.assertIsInstance(column.latest, Filter) - elif column.dtype == int64_dtype: + elif (column.dtype == int64_dtype + or column.dtype.kind in ('O', 'S', 'U')): self.assertIsInstance(column.latest, Classifier) elif column.dtype in factor_dtypes: self.assertIsInstance(column.latest, Factor) diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py new file mode 100644 index 00000000..5b24659f --- /dev/null +++ b/tests/test_labelarray.py @@ -0,0 +1,151 @@ +from itertools import product +from operator import eq, ne +import numpy as np + +from zipline.lib.labelarray import LabelArray +from zipline.testing import check_arrays, parameter_space, ZiplineTestCase + + +def rotN(l, N): + """ + Rotate a list of elements. + + Pulls N elements off the end of the list and appends them to the front. + + >>> rotN(['a', 'b', 'c', 'd'], 2) + ['c', 'd', 'a', 'b'] + >>> rotN(['a', 'b', 'c', 'd'], 3) + ['d', 'a', 'b', 'c'] + """ + assert len(l) >= N, "Can't rotate list by longer than its length." + return l[N:] + l[:N] + + +class LabelArrayTestCase(ZiplineTestCase): + + @classmethod + def init_class_fixtures(cls): + super(LabelArrayTestCase, cls).init_class_fixtures() + + cls.rowvalues = row = ['', 'a', 'b', 'ab', 'a', '', 'b', 'ab', 'z'] + cls.strs = np.array([rotN(row, i) for i in range(3)]) + + def test_fail_on_direct_construction(self): + # See http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray # noqa + + with self.assertRaises(TypeError) as e: + np.ndarray.__new__(LabelArray, (5, 5)) + + self.assertEqual( + str(e.exception), + "Direct construction of LabelArrays is not supported." + ) + + @parameter_space( + __fail_fast=True, + s=['', 'a', 'z', 'aa', 'not in the array'], + shape=[(27,), (9, 3), (3, 9), (3, 3, 3)], + array_astype=(bytes, unicode, object), + scalar_astype=(bytes, unicode, object), + ) + def test_compare_to_str(self, s, shape, array_astype, scalar_astype): + strs = self.strs.reshape(shape).astype(array_astype) + arr = LabelArray(strs, missing_value='') + check_arrays(strs == s, arr == s) + check_arrays(strs != s, arr != s) + + np_startswith = np.vectorize(lambda elem: elem.startswith(s)) + check_arrays(arr.startswith(s), np_startswith(strs)) + + np_endswith = np.vectorize(lambda elem: elem.endswith(s)) + check_arrays(arr.endswith(s), np_endswith(strs)) + + np_contains = np.vectorize(lambda elem: s in elem) + check_arrays(arr.contains(s), np_contains(strs)) + + def test_compare_to_str_array(self): + strs = self.strs + shape = strs.shape + arr = LabelArray(strs, missing_value='') + check_arrays(strs == arr, np.full_like(strs, True, dtype=bool)) + check_arrays(strs != arr, np.full_like(strs, False, dtype=bool)) + + def broadcastable_row(value, dtype): + return np.full((shape[0], 1), value, dtype=strs.dtype) + + def broadcastable_col(value, dtype): + return np.full((1, shape[1]), value, dtype=strs.dtype) + + for comparator, dtype, value in product((eq, ne), + (bytes, unicode, object), + set(self.rowvalues)): + check_arrays( + comparator(arr, np.full_like(strs, value)), + comparator(strs, value), + ) + check_arrays( + comparator(arr, broadcastable_row(value, dtype=dtype)), + comparator(strs, value), + ) + check_arrays( + comparator(arr, broadcastable_col(value, dtype=dtype)), + comparator(strs, value), + ) + + @parameter_space( + __fail_fast=True, + slice_=[ + 0, 1, -1, + slice(None), + slice(0, 0), + slice(0, 3), + slice(1, 4), + slice(0), + slice(None, 1), + slice(0, 4, 2), + (slice(None), 1), + (slice(None), slice(None)), + (slice(None), slice(1, 2)), + ] + ) + def test_slicing_preserves_attributes(self, slice_): + arr = LabelArray(self.strs.reshape((9, 3)), missing_value='') + sliced = arr[slice_] + self.assertIsInstance(sliced, LabelArray) + self.assertIs(sliced.categories, arr.categories) + self.assertIs(sliced.reverse_categories, arr.reverse_categories) + self.assertIs(sliced.missing_value, arr.missing_value) + + def test_infer_categories(self): + arr1d = LabelArray(self.strs, missing_value='') + codes1d = arr1d.as_int_array() + self.assertEqual(arr1d.shape, self.strs.shape) + self.assertEqual(arr1d.shape, codes1d.shape) + + categories = arr1d.categories + unique_rowvalues = set(self.rowvalues) + + # There should be an entry in categories for each unique row value, and + # each integer stored in the data array should be an index into + # categories. + self.assertEqual(list(categories), sorted(set(self.rowvalues))) + self.assertEqual( + set(codes1d.ravel()), + set(range(len(unique_rowvalues))) + ) + for idx, value in enumerate(arr1d.categories): + check_arrays( + self.strs == value, + arr1d.view(type=np.ndarray) == idx, + ) + + for shape in (9, 3), (3, 9), (3, 3, 3): + strs2d = self.strs.reshape(shape) + arr2d = LabelArray(strs2d, missing_value='') + codes2d = arr2d.as_int_array() + + self.assertEqual(arr2d.shape, shape) + check_arrays(arr2d.categories, categories) + + for idx, value in enumerate(arr2d.categories): + check_arrays(strs2d == value, codes2d == idx) diff --git a/zipline/algorithm.py b/zipline/algorithm.py index 496a6a56..9b0026ce 100644 --- a/zipline/algorithm.py +++ b/zipline/algorithm.py @@ -665,6 +665,7 @@ class TradingAlgorithm(object): new_assets = tuple(new_assets) new_sids = tuple(new_sids) new_symbols = tuple(new_symbols) + number_of_kinds_of_new_things = ( sum((bool(new_assets), bool(new_sids), bool(new_symbols))) ) diff --git a/zipline/data/us_equity_loader.py b/zipline/data/us_equity_loader.py index 5385b203..ffb3d7e2 100644 --- a/zipline/data/us_equity_loader.py +++ b/zipline/data/us_equity_loader.py @@ -19,7 +19,7 @@ from abc import ( ) from cachetools import LRUCache -from numpy import dtype, around, hstack +from numpy import around, hstack from pandas.tslib import normalize_date from six import with_metaclass @@ -28,6 +28,7 @@ from zipline.lib._float64window import AdjustedArrayWindow as Float64Window from zipline.lib.adjustment import Float64Multiply from zipline.utils.cache import ExpiringCache from zipline.utils.memoize import lazyval +from zipline.utils.numpy_utils import float64_dtype class SlidingWindow(object): @@ -237,9 +238,9 @@ class USEquityHistoryLoader(with_metaclass(ABCMeta)): prefetch_dts = cal[start_ix:prefetch_end_ix + 1] prefetch_len = len(prefetch_dts) array = self._array(prefetch_dts, needed_assets, field) + view_kwargs = {} if field == 'volume': - array = array.astype('float64') - dtype_ = dtype('float64') + array = array.astype(float64_dtype) for i, asset in enumerate(needed_assets): if self._adjustments_reader: @@ -249,7 +250,7 @@ class USEquityHistoryLoader(with_metaclass(ABCMeta)): adjs = {} window = Float64Window( array[:, i].reshape(prefetch_len, 1), - dtype_, + view_kwargs, adjs, offset, size diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx new file mode 100644 index 00000000..d3a98d18 --- /dev/null +++ b/zipline/lib/_factorize.pyx @@ -0,0 +1,108 @@ +""" +Factorization algorithms. +""" +from numpy cimport ndarray, int64_t, PyArray_Check, import_array +from numpy import arange, asarray, empty, int64, isnan, ndarray + +import_array() + + +cpdef factorize_strings_known_categories(ndarray[object] values, + list categories, + object missing_value, + int sort): + """ + Factorize an array whose categories are already known. + + Any entries not in the specified categories will be given the code for + `missing_value`. + """ + if missing_value not in categories: + categories.append(missing_value) + + if sort: + categories = sorted(categories) + + cdef: + Py_ssize_t nvalues = len(values) + dict reverse_categories = dict( + zip(categories, range(len(categories))) + ) + + if not nvalues: + return ( + asarray([], dtype=int64), + asarray(categories, dtype=object), + reverse_categories, + ) + + cdef: + Py_ssize_t i + Py_ssize_t missing_code = reverse_categories[missing_value] + ndarray[int64_t] codes = empty(nvalues, dtype=int64) + + for i in range(nvalues): + codes[i] = reverse_categories.get(values[i], missing_code) + + return codes, asarray(categories, dtype=object), reverse_categories + +cpdef factorize_strings(ndarray[object] values, + object missing_value, + int sort): + """ + Factorize an array of (possibly duplicated) labels into an array of indices + into a unique array of labels. + + This is ~30% faster than pandas.factorize, at the cost of not having + special treatment for NaN, which we don't care about because we only + support arrays of strings. + + (Though it's faster even if you throw in the nan checks that pandas does, + because we're using dict and list instead of PyObjectHashTable and + ObjectVector. Python's builtin data structures are **really** + well-optimized.) + """ + cdef: + Py_ssize_t nvalues = len(values) + list categories = [missing_value] + dict reverse_categories = {missing_value: 0} + + # Short circuit on empty array. + if not nvalues: + return ( + asarray([], dtype=int64), + asarray(categories, dtype=object), + reverse_categories, + ) + + cdef: + Py_ssize_t i, code + object key = None + ndarray[int64_t] codes = empty(nvalues, dtype=int64) + + for i in range(nvalues): + key = values[i] + code = reverse_categories.get(key, -1) + if code == -1: + # Assign new code. + code = len(reverse_categories) + reverse_categories[key] = code + categories.append(key) + codes[i] = code + + cdef ndarray[int64_t, ndim=1] sorter + cdef ndarray[int64_t, ndim=1] reverse_indexer + cdef int ncategories + cdef ndarray[object] categories_array = asarray(categories, dtype=object) + if sort: + # This is all taken from pandas.core.algorithms.factorize. + ncategories = len(categories_array) + sorter = categories_array.argsort() + reverse_indexer = empty(ncategories, dtype=int64) + reverse_indexer.put(sorter, arange(ncategories)) + + codes = reverse_indexer.take(codes) + categories_array = categories_array.take(sorter) + reverse_categories = dict(zip(categories_array, range(ncategories))) + + return codes, categories_array, reverse_categories diff --git a/zipline/lib/_float64window.pyx b/zipline/lib/_float64window.pyx index e3df0225..1747b014 100644 --- a/zipline/lib/_float64window.pyx +++ b/zipline/lib/_float64window.pyx @@ -1,5 +1,7 @@ """ float specialization of AdjustedArrayWindow """ -from numpy cimport float64_t as ctype +from numpy cimport float64_t +ctypedef float64_t[:, :] databuffer + include "_windowtemplate.pxi" diff --git a/zipline/lib/_int64window.pyx b/zipline/lib/_int64window.pyx index 9951bd8c..d30b97ca 100644 --- a/zipline/lib/_int64window.pyx +++ b/zipline/lib/_int64window.pyx @@ -1,5 +1,8 @@ """ datetime specialization of AdjustedArrayWindow """ -from numpy cimport int64_t as ctype +from numpy cimport int64_t + +ctypedef int64_t[:, :] databuffer + include "_windowtemplate.pxi" diff --git a/zipline/lib/_labelwindow.pyx b/zipline/lib/_labelwindow.pyx new file mode 100644 index 00000000..702f9d79 --- /dev/null +++ b/zipline/lib/_labelwindow.pyx @@ -0,0 +1,6 @@ +""" +AdjustedArrayWindow type used for LabelArrays. +""" +ctypedef object databuffer + +include "_windowtemplate.pxi" diff --git a/zipline/lib/_uint8window.pyx b/zipline/lib/_uint8window.pyx index 99dad45f..8b8ce98b 100644 --- a/zipline/lib/_uint8window.pyx +++ b/zipline/lib/_uint8window.pyx @@ -1,5 +1,8 @@ """ bool specialization of AdjustedArrayWindow """ -from numpy cimport uint8_t as ctype +from numpy cimport uint8_t + +ctypedef uint8_t[:, :] databuffer + include "_windowtemplate.pxi" diff --git a/zipline/lib/_windowtemplate.pxi b/zipline/lib/_windowtemplate.pxi index 2a1ae38b..fb50736d 100644 --- a/zipline/lib/_windowtemplate.pxi +++ b/zipline/lib/_windowtemplate.pxi @@ -2,8 +2,8 @@ Template for AdjustedArray windowed iterators. This file is intended to be used by inserting it via a Cython include into a -file that's define a type symbol named `ctype` and string constant named -`dtype`. +file that's defined a type symbol named `databuffer` that can be used like a +2-D numpy array. See Also -------- @@ -12,9 +12,7 @@ zipline.lib._intwindow zipline.lib._datewindow """ from numpy cimport ndarray -from numpy import asarray - -ctypedef ctype[:, :] databuffer +from numpy import asanyarray cdef class AdjustedArrayWindow: @@ -33,8 +31,8 @@ cdef class AdjustedArrayWindow: """ cdef: # ctype must be defined by the file into which this is being copied. - databuffer data - object viewtype + readonly databuffer data + readonly dict view_kwargs readonly Py_ssize_t window_length Py_ssize_t anchor, next_anchor, max_anchor, next_adj dict adjustments @@ -43,13 +41,13 @@ cdef class AdjustedArrayWindow: def __cinit__(self, databuffer data not None, - object viewtype not None, + dict view_kwargs not None, dict adjustments not None, Py_ssize_t offset, Py_ssize_t window_length): self.data = data - self.viewtype = viewtype + self.view_kwargs = view_kwargs self.adjustments = adjustments self.adjustment_indices = sorted(adjustments, reverse=True) self.window_length = window_length @@ -74,9 +72,10 @@ cdef class AdjustedArrayWindow: def __next__(self): cdef: - ndarray out object adjustment + ndarray out Py_ssize_t start, anchor + dict view_kwargs anchor = self.anchor = self.next_anchor if anchor > self.max_anchor: @@ -93,7 +92,13 @@ cdef class AdjustedArrayWindow: self.next_adj = self.pop_next_adj() start = anchor - self.window_length - out = asarray(self.data[start:self.anchor]).view(self.viewtype) + + # If our data is a custom subclass of ndarray, preserve that subclass + # by using asanyarray instead of asarray. + out = asanyarray(self.data[start:self.anchor]) + view_kwargs = self.view_kwargs + if view_kwargs: + out = out.view(**view_kwargs) out.setflags(write=False) self.next_anchor = self.anchor + 1 @@ -101,8 +106,7 @@ cdef class AdjustedArrayWindow: return out def seek(self, target_anchor): - cdef: - ndarray out + cdef ndarray out = None if target_anchor < self.anchor: raise Exception('Can not access data after window has passed.') @@ -122,5 +126,5 @@ cdef class AdjustedArrayWindow: self.window_length, self.anchor, self.max_anchor, - self.viewtype, + self.view_kwargs.get('dtype'), ) diff --git a/zipline/lib/adjusted_array.py b/zipline/lib/adjusted_array.py index 4c0ee5b8..b00f3459 100644 --- a/zipline/lib/adjusted_array.py +++ b/zipline/lib/adjusted_array.py @@ -17,6 +17,8 @@ from zipline.errors import ( WindowLengthNotPositive, WindowLengthTooLong, ) +from zipline.lib.labelarray import LabelArray +from zipline.utils.compat import unicode from zipline.utils.numpy_utils import ( datetime64ns_dtype, float64_dtype, @@ -28,8 +30,10 @@ from zipline.utils.memoize import lazyval # These class names are all the same because of our bootleg templating system. from ._float64window import AdjustedArrayWindow as Float64Window from ._int64window import AdjustedArrayWindow as Int64Window +from ._labelwindow import AdjustedArrayWindow as LabelWindow from ._uint8window import AdjustedArrayWindow as UInt8Window + NOMASK = None BOOL_DTYPES = frozenset( map(dtype, [bool_]), @@ -44,16 +48,20 @@ INT_DTYPES = frozenset( DATETIME_DTYPES = frozenset( map(dtype, ['datetime64[ns]', 'datetime64[D]']), ) +# We use object arrays for strings. +CATEGORICAL_DTYPES = frozenset(map(dtype, ['O'])) + REPRESENTABLE_DTYPES = BOOL_DTYPES.union( FLOAT_DTYPES, INT_DTYPES, - DATETIME_DTYPES + DATETIME_DTYPES, + CATEGORICAL_DTYPES, ) def can_represent_dtype(dtype): """ - Can we build an AdjustedArray for a baseline of dtype ``dtype``? + Can we build an AdjustedArray for a baseline of `dtype``? """ return dtype in REPRESENTABLE_DTYPES @@ -65,11 +73,11 @@ CONCRETE_WINDOW_TYPES = { } -def _normalize_array(data): +def _normalize_array(data, missing_value): """ Coerce buffer data for an AdjustedArray into a standard scalar - representation, returning the coerced array and a numpy dtype object to use - as a view type when providing public view into the data. + representation, returning the coerced array and a dict of argument to pass + to np.view to use when providing a user-facing view of the underlying data. - float* data is coerced to float64 with viewtype float64. - int32, int64, and uint32 are converted to int64 with viewtype int64. @@ -82,19 +90,29 @@ def _normalize_array(data): Returns ------- - coerced, viewtype : (np.ndarray, np.dtype) + coerced, view_kwargs : (np.ndarray, np.dtype) """ + if isinstance(data, LabelArray): + return data, {} + data_dtype = data.dtype if data_dtype == bool_: - return data.astype(uint8), dtype(bool_) + return data.astype(uint8), {'dtype': dtype(bool_)} elif data_dtype in FLOAT_DTYPES: - return data.astype(float64), dtype(float64) + return data.astype(float64), {'dtype': dtype(float64)} elif data_dtype in INT_DTYPES: - return data.astype(int64), dtype(int64) - elif data_dtype.name.startswith('datetime'): + return data.astype(int64), {'dtype': dtype(int64)} + elif data_dtype in CATEGORICAL_DTYPES: + if not isinstance(missing_value, (bytes, unicode)): + raise TypeError( + "Invalid missing_value for categorical array.\n" + "Expected a string, got %r" % missing_value, + ) + return LabelArray(data, missing_value), {} + elif data_dtype.kind == 'M': try: outarray = data.astype('datetime64[ns]').view('int64') - return outarray, datetime64ns_dtype + return outarray, {'dtype': datetime64ns_dtype} except OverflowError: raise ValueError( "AdjustedArray received a datetime array " @@ -127,18 +145,17 @@ class AdjustedArray(object): missing_value : object A value to use to fill missing data in yielded windows. Should be a value coercible to `data.dtype`. - """ __slots__ = ( '_data', - '_viewtype', + '_view_kwargs', 'adjustments', 'missing_value', '__weakref__', ) def __init__(self, data, mask, adjustments, missing_value): - self._data, self._viewtype = _normalize_array(data) + self._data, self._view_kwargs = _normalize_array(data, missing_value) self.adjustments = adjustments self.missing_value = missing_value @@ -158,20 +175,22 @@ class AdjustedArray(object): """ The data stored in this array. """ - return self._data.view(self._viewtype) + return self._data.view(**self._view_kwargs) @lazyval def dtype(self): """ The dtype of the data stored in this array. """ - return self._viewtype + return self._view_kwargs.get('dtype') or self._data.dtype @lazyval def _iterator_type(self): """ The iterator produced when `traverse` is called on this Array. """ + if isinstance(self._data, LabelArray): + return LabelWindow return CONCRETE_WINDOW_TYPES[self._data.dtype] def traverse(self, window_length, offset=0): @@ -190,7 +209,7 @@ class AdjustedArray(object): _check_window_params(data, window_length) return self._iterator_type( data, - self._viewtype, + self._view_kwargs, self.adjustments, offset, window_length, diff --git a/zipline/lib/adjustment.pyx b/zipline/lib/adjustment.pyx index 748c3025..8708b2e6 100644 --- a/zipline/lib/adjustment.pyx +++ b/zipline/lib/adjustment.pyx @@ -316,12 +316,13 @@ cdef class Float64Multiply(Float64Adjustment): cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t row, col + cdef float64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] *= self.value + data[row, col] *= value cdef class Float64Overwrite(Float64Adjustment): @@ -354,12 +355,13 @@ cdef class Float64Overwrite(Float64Adjustment): cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t row, col + cdef float64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] = self.value + data[row, col] = value cdef class Float64Add(Float64Adjustment): @@ -392,12 +394,13 @@ cdef class Float64Add(Float64Adjustment): cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t row, col + cdef float64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] += self.value + data[row, col] += value cdef class _Int64Adjustment(Adjustment): @@ -530,9 +533,62 @@ cdef class Datetime64Overwrite(Datetime64Adjustment): """ cpdef mutate(self, int64_t[:, :] data): cdef Py_ssize_t row, col + cdef int64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] = self.value + data[row, col] = value + + +cdef class _ObjectAdjustment(Adjustment): + """ + Base class for adjustments that operate on arbitrary objects. + + We use only this for categorical data, where our data buffer is an array of + indices into an array of unique Python string objects. + """ + cdef: + readonly object value + + def __init__(self, + Py_ssize_t first_row, + Py_ssize_t last_row, + Py_ssize_t first_col, + Py_ssize_t last_col, + object value): + super(Adjustment, self).__init__( + first_row=first_row, + last_row=last_row, + first_col=first_col, + last_col=last_col, + ) + self.value = value + + def __repr__(self): + return ( + "%s(first_row=%d, last_row=%d," + " first_col=%d, last_col=%d, value=%r)" % ( + type(self).__name__, + self.first_row, + self.last_row, + self.first_col, + self.last_col, + self.value, + ) + ) + + +cdef class ObjectOverwrite(_ObjectAdjustment): + + cpdef mutate(self, object data): + # data is an object here because this is intended to be used with a + # `zipline.lib.LabelArray`. + cdef Py_ssize_t row, col + cdef object value = self.value + + # We don't do this in a loop because we only want to look up the label + # code in the array's categories once. + data[self.first_row:self.last_row + 1, + self.first_col:self.last_col + 1] = self.value diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py new file mode 100644 index 00000000..837a39c2 --- /dev/null +++ b/zipline/lib/labelarray.py @@ -0,0 +1,482 @@ +""" +An ndarray subclass for working with arrays of strings. +""" +from functools import partial +from numbers import Number +from operator import eq, ne +import re + +import numpy as np +from numpy import ndarray +import pandas as pd +from toolz import compose + +from zipline.utils.preprocess import preprocess +from zipline.utils.sentinel import sentinel +from zipline.utils.input_validation import ( + coerce, + expect_kinds, + expect_types, + optional, +) +from zipline.utils.numpy_utils import is_object, int64_dtype + +from ._factorize import ( + factorize_strings, + factorize_strings_known_categories, +) + + +def compare_arrays(left, right): + "Eq check with a short-circuit for identical objects." + return ( + left is right + or ((left.shape == right.shape) and (left == right).all()) + ) + + +def _make_unsupported_method(name): + def method(*args, **kwargs): + raise NotImplementedError( + "Method %s is not supported on LabelArrays." % name + ) + method.__name__ = name + method.__doc__ = "Unsupported LabelArray Method: %s" % name + return method + + +class CategoryMismatch(ValueError): + """ + Error raised on attempt to perform operations between LabelArrays with + mismatched category arrays. + """ + def __init__(self, left, right): + (mismatches,) = np.where(left != right) + assert len(mismatches), "Not actually a mismatch!" + super(CategoryMismatch, self).__init__( + "LabelArray categories don't match:\n" + "Mismatched Indices: {mismatches}\n" + "Left: {left}\n" + "Right: {right}".format( + mismatches=mismatches, + left=left[mismatches], + right=right[mismatches], + ) + ) + +_NotPassed = sentinel('_NotPassed') + + +class LabelArray(ndarray): + """ + An ndarray subclass for working with arrays of strings. + + Factorizes the input array into integers, but overloads equality on strings + to check against the factor label. + + See Also + -------- + http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html + """ + @preprocess( + values=coerce(list, partial(np.asarray, dtype=object)), + ) + @expect_types( + values=np.ndarray, + categories=optional(list), + ) + @expect_kinds(values=("O", "S", "U")) + def __new__(cls, + values, + missing_value, + categories=None, + sort=True): + + # Numpy's fixed-width string types aren't very efficient. Working with + # object arrays is faster than bytes or unicode arrays in almost all + # cases. + if not is_object(values): + values = values.astype(object) + + if categories is None: + codes, categories, reverse_categories = factorize_strings( + values.ravel(), + missing_value=missing_value, + sort=sort, + ) + else: + codes, categories, reverse_categories = ( + factorize_strings_known_categories( + values.ravel(), + categories=categories, + missing_value=missing_value, + sort=sort, + ) + ) + categories.setflags(write=False) + + ret = codes.reshape(values.shape).view(type=cls) + ret._categories = categories + ret._reverse_categories = reverse_categories + ret._missing_value = missing_value + return ret + + @property + def categories(self): + # This is a property because it should be immutable. + return self._categories + + @property + def reverse_categories(self): + # This is a property because it should be immutable. + return self._reverse_categories + + @property + def missing_value(self): + # This is a property because it should be immutable. + return self._missing_value + + def __array_finalize__(self, obj): + """ + Called by Numpy after array construction. + + There are three cases where this can happen: + + 1. Someone tries to directly construct a new array by doing:: + + >>> ndarray.__new__(LabelArray, ...) + + In this case, obj will be None. We treat this as an error case and + fail. + + 2. Someone (most likely our own __new__) calls + other_array.view(type=LabelArray). + + In this case, `self` will be the new LabelArray instance, and + ``obj` will be the array on which ``view`` is being called. + + The caller of ``obj.view`` is responsible for copying setting + category metadata on ``self`` after we exit. + + 3. Someone creates a new LabelArray by slicing an existing one. + + In this case, ``obj`` will be the original LabelArray. We're + responsible for copying over the parent array's category metadata. + """ + if obj is None: + raise TypeError( + "Direct construction of LabelArrays is not supported." + ) + + # See docstring for an explanation of when these will or will not be + # set. + self._categories = getattr(obj, 'categories', None) + self._reverse_categories = getattr(obj, 'reverse_categories', None) + self._missing_value = getattr(obj, 'missing_value', None) + + def __array_wrap__(self, obj, context=None): + """ + Called by numpy after completion of a ufunc. + + We coerce back into a vanilla ndarray if our dtype changed, since that + indicates that our categories are no longer meaningful. + """ + if obj.dtype != self.dtype: + return obj.view(type=np.ndarray) + return obj + + def as_int_array(self): + """ + Convert self into a regular ndarray of ints. + + This is an O(1) operation. It does not copy the underlying data. + """ + return self.view(type=ndarray) + + def as_string_array(self): + """ + Convert self back into an array of strings. + + This is an O(N) operation. + """ + return self.categories[self] + + def as_categorical(self, name=None): + """ + Coerce self into a pandas categorical. + + This is only defined on 1D arrays, since that's all pandas supports. + """ + if len(self.shape) > 1: + raise ValueError("Can't convert a 2D array to a categorical.") + return pd.Categorical.from_codes( + self.as_int_array(), + self.categories, + ordered=False, + name=name, + ) + + def as_categorical_frame(self, index, columns, name=None): + """ + Coerce self into a pandas DataFrame of Categoricals. + """ + if len(self.shape) != 2: + raise ValueError( + "Can't convert a non-2D LabelArray into a DataFrame." + ) + + expected_shape = (len(index), len(columns)) + if expected_shape != self.shape: + raise ValueError( + "Can't construct a DataFrame with provided indices:\n\n" + "LabelArray shape is {actual}, but index and columns imply " + "that shape should be {expected}.".format( + actual=self.shape, + expected=expected_shape, + ) + ) + + return pd.Series( + index=pd.MultiIndex.from_product([index, columns]), + data=self.ravel().as_categorical(name=name), + ).unstack() + + def __setitem__(self, indexer, value): + self_categories = self.categories + + if isinstance(value, LabelArray): + value_categories = value.categories + if compare_arrays(self_categories, value_categories): + return super(LabelArray, self).__setitem__(indexer, value) + else: + raise CategoryMismatch(self_categories, value_categories) + + elif isinstance(value, (bytes, unicode)): + value_code = self.reverse_categories.get(value, None) + if value_code is None: + raise ValueError("%r is not in LabelArray categories." % value) + return super(LabelArray, self).__setitem__(indexer, value_code) + + else: + raise NotImplementedError( + "Setting into a LabelArray with a value of " + "type {type} is not yet supported.".format( + type=type(value).__name__, + ), + ) + + def _equality_check(op): + """ + Shared code for __eq__ and __ne__, parameterized on the actual + comparison operator to use. + """ + # What value should we return if we compare against a value not in our + # categories? + if op is eq: + COMPARE_TO_UNKNOWN = False + elif op is ne: + COMPARE_TO_UNKNOWN = True + else: + raise AssertionError("_make_equality_check called with %s" % op) + + def method(self, other): + self_categories = self.categories + + if isinstance(other, LabelArray): + other_categories = other.categories + if compare_arrays(self_categories, other_categories): + return op(self.as_int_array(), other.as_int_array()) + else: + raise CategoryMismatch(self_categories, other_categories) + + elif isinstance(other, ndarray): + # Compare to ndarrays as though we were an array of strings. + # This is fairly expensive, and should generally be avoided. + return op(self.as_string_array(), other) + + elif isinstance(other, (bytes, unicode)): + i = self._reverse_categories.get(other, None) + if i is None: + # Requested string isn't in our categories. Short circuit. + # This isn't full_like because that would try to return a + # LabelArray. + return np.full(self.shape, COMPARE_TO_UNKNOWN, dtype=bool) + + return op(self.as_int_array(), i) + + elif isinstance(other, Number): + return NotImplemented + + return op(super(LabelArray, self), other) + return method + + __eq__ = _equality_check(eq) + __ne__ = _equality_check(ne) + del _equality_check + + def view(self, dtype=_NotPassed, type=_NotPassed): + if type is _NotPassed and dtype not in (_NotPassed, self.dtype): + raise TypeError("Can't view LabelArray as another dtype.") + + # The text signature on ndarray.view makes it look like the default + # values for dtype and type are `None`, but passing None explicitly has + # different semantics than not passing an arg at all, so we reconstruct + # the kwargs dict here to simulate the args not being passed at all. + kwargs = {} + if dtype is not _NotPassed: + kwargs['dtype'] = dtype + if type is not _NotPassed: + kwargs['type'] = type + return super(LabelArray, self).view(**kwargs) + + # In general, we support resizing, slicing, and reshaping methods, but not + # numeric methods. + SUPPORTED_NDARRAY_METHODS = frozenset([ + 'base', + 'byteswap', + 'compress', + 'copy', + 'data', + 'diagonal', + 'dtype', + 'flat', + 'flatten', + 'item', + 'itemset', + 'itemsize', + 'nbytes', + 'ndim', + 'newbyteorder', + 'ravel', + 'repeat', + 'reshape', + 'resize', + 'setflags', + 'shape', + 'size', + 'squeeze', + 'strides', + 'swapaxes', + 'take', + 'trace', + 'transpose', + 'view' + ]) + PUBLIC_NDARRAY_METHODS = frozenset([ + s for s in dir(ndarray) if not s.startswith('_') + ]) + + # Generate failing wrappers for all unsupported methods. + locals().update( + { + method: _make_unsupported_method(method) + for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS + } + ) + + def __repr__(self): + # This happens if you call a ufunc on a LabelArray that changes the + # dtype. This is generally an indicator that the array has been used + # incorrectly, and it means we're no longer valid for anything. + if self.dtype != int64_dtype: + return "Invalid LabelArray: dtype={}, shape={}".format( + self.dtype, self.shape + ) + repr_lines = repr(self.as_string_array()).splitlines() + repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1) + repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')' + # The extra spaces here account for the difference in length between + # 'array(' and 'LabelArray('. + return '\n '.join(repr_lines) + + def empty_like(self, shape): + """ + Make an empty LabelArray with the same categories as ``self``, filled + with ``self.missing_value``. + """ + out = np.full( + shape, + self.reverse_categories[self.missing_value], + dtype=self.dtype + ).view( + type=type(self) + ) + + out._categories = self.categories + out._reverse_categories = self.reverse_categories + out._missing_value = self.missing_value + + return out + + def apply(self, f, dtype): + """ + Map a function elementwise over entries in ``self``. + + ``f`` will be applied exactly once to each unique value in ``self``. + """ + return np.vectorize(f, otypes=[dtype])(self.categories)[self] + + def startswith(self, prefix): + """ + Element-wise startswith. + + Parameters + ---------- + prefix : str + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self started with ``prefix``. + """ + return self.apply(lambda elem: elem.startswith(prefix), dtype=bool) + + def endswith(self, suffix): + """ + Elementwise endswith. + + Parameters + ---------- + suffix : str + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self ended with ``suffix``.w + """ + return self.apply(lambda elem: elem.endswith(suffix), dtype=bool) + + def contains(self, substring): + """ + Elementwise contains. + + Parameters + ---------- + substring : str + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self ended with ``suffix``. + """ + return self.apply(lambda elem: substring in elem, dtype=bool) + + @preprocess(pattern=re.compile) + def matches(self, pattern): + """ + Elementwise regex match. + + Parameters + ---------- + pattern : str or compiled regex + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self was matched by ``pattern``. + """ + return self.apply(compose(bool, pattern.match)) diff --git a/zipline/pipeline/api_utils.py b/zipline/pipeline/api_utils.py new file mode 100644 index 00000000..1ccf3543 --- /dev/null +++ b/zipline/pipeline/api_utils.py @@ -0,0 +1,49 @@ +""" +Utilities for creating public APIs (e.g. argument validation decorators). +""" +from zipline.utils.input_validation import preprocess + + +def restrict_to_dtype(dtype, message_template): + """ + A factory for decorators that restrict Term methods to only be callable on + Terms with a specific dtype. + + This is conceptually similar to + zipline.utils.input_validation.expect_dtypes, but provides more flexibility + for providing error messages that are specifically targeting Term methods. + + Parameters + ---------- + dtype : numpy.dtype + The dtype on which the decorated method may be called. + message_template : str + A template for the error message to be raised. + `message_template.format` will be called with keyword arguments + `method_name`, `expected_dtype`, and `received_dtype`. + + Usage + ----- + @restrict_to_dtype( + dtype=float64_dtype, + message_template=( + "{method_name}() was called on a factor of dtype {received_dtype}." + "{method_name}() requires factors of dtype{expected_dtype}." + + ), + ) + def some_factor_method(self, ...): + self.stuff_that_requires_being_float64(...) + """ + def processor(term_method, _, term_instance): + term_dtype = term_instance.dtype + if term_dtype != dtype: + raise TypeError( + message_template.format( + method_name=term_method.__name__, + expected_dtype=dtype.name, + received_dtype=term_dtype, + ) + ) + return term_instance + return preprocess(self=processor) diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py index c4d77ec9..e27016eb 100644 --- a/zipline/pipeline/classifiers/classifier.py +++ b/zipline/pipeline/classifiers/classifier.py @@ -1,16 +1,24 @@ """ classifier.py """ +from functools import wraps from numbers import Number +import operator from numpy import where, isnan, nan, zeros +from zipline.lib.labelarray import LabelArray from zipline.lib.quantiles import quantiles -from zipline.pipeline.term import ComputableTerm +from zipline.pipeline.api_utils import restrict_to_dtype +from zipline.pipeline.term import ComputableTerm, NotSpecified +from zipline.utils.compat import unicode from zipline.utils.input_validation import expect_types -from zipline.utils.numpy_utils import int64_dtype +from zipline.utils.numpy_utils import ( + categorical_dtype, + int64_dtype, +) -from ..filters import NullFilter, NumExprFilter +from ..filters import Filter, NullFilter, NumExprFilter from ..mixins import ( CustomTermMixin, LatestMixin, @@ -20,6 +28,15 @@ from ..mixins import ( ) +strings_only = restrict_to_dtype( + dtype=categorical_dtype, + message_template=( + "{method_name}() is only defined on Classifiers producing strings" + " but it was called on a Factor of dtype {received_dtype}." + ) +) + + class Classifier(RestrictedDTypeMixin, ComputableTerm): """ A Pipeline expression computing a categorical output. @@ -30,7 +47,9 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): indicating that means/standard deviations should be computed on assets for which the classifier produced the same label. """ - ALLOWED_DTYPES = (int64_dtype,) # Used by RestrictedDTypeMixin + # Used by RestrictedDTypeMixin + ALLOWED_DTYPES = (int64_dtype, categorical_dtype) + categories = NotSpecified def isnull(self): """ @@ -45,9 +64,8 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): return ~self.isnull() # We explicitly don't support classifier to classifier comparisons, since - # the numbers likely don't mean the same thing. This may be relaxed in the - # future, but for now we're starting conservatively. - @expect_types(other=Number) + # the stored values likely don't mean the same thing. This may be relaxed + # in the future, but for now we're starting conservatively. def eq(self, other): """ Construct a Filter returning True for asset/date pairs where the output @@ -58,7 +76,7 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): # certainly not what the user wants. if other == self.missing_value: raise ValueError( - "Comparison against self.missing_value ({value}) in" + "Comparison against self.missing_value ({value!r}) in" " {typename}.eq().\n" "Missing values have NaN semantics, so the " "requested comparison would always produce False.\n" @@ -67,24 +85,87 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): typename=(type(self).__name__), ) ) - return NumExprFilter.create( - "x_0 == {other}".format(other=int(other)), - binds=(self,), - ) - @expect_types(other=Number) + if isinstance(other, Number) != (self.dtype == int64_dtype): + raise InvalidClassifierComparison(self, other) + + if isinstance(other, Number): + return NumExprFilter.create( + "x_0 == {other}".format(other=int(other)), + binds=(self,), + ) + else: + return ScalarStringPredicate( + classifier=self, + op=operator.eq, + compval=other, + ) + def __ne__(self, other): """ Construct a Filter returning True for asset/date pairs where the output of ``self`` matches ``other. """ - return NumExprFilter.create( - "((x_0 != {other}) & (x_0 != {missing}))".format( - other=int(other), - missing=self.missing_value, - ), - binds=(self,), - ) + if isinstance(other, Number) != (self.dtype == int64_dtype): + raise InvalidClassifierComparison(self, other) + + if isinstance(other, Number): + return NumExprFilter.create( + "((x_0 != {other}) & (x_0 != {missing}))".format( + other=int(other), + missing=self.missing_value, + ), + binds=(self,), + ) + else: + return ScalarStringPredicate( + classifier=self, + op=operator.ne, + compval=other, + ) + + def _string_predicate(f): + """ + Decorator for converting a function from (LabelArray, str) -> bool + into a Classifier method that returns a ScalarStringPredicate filter. + + This mainly exists to avoid replicating shared boilerplate + (e.g. argument type validation). + """ + @wraps(f) + @expect_types(compval=(bytes, unicode)) + @strings_only + def method(self, compval): + return ScalarStringPredicate( + classifier=self, + op=f, + compval=compval, + ) + return method + + @_string_predicate + @expect_types(label_array=LabelArray) + def startswith(label_array, other): + return label_array.startswith(other) + + @_string_predicate + @expect_types(label_array=LabelArray) + def endswith(label_array, other): + return label_array.endswith(other) + + @_string_predicate + @expect_types(label_array=LabelArray) + def contains(label_array, other): + return label_array.contains(other) + + del _string_predicate + + def postprocess(self, data): + if self.dtype == int64_dtype: + return data + if not isinstance(data, LabelArray): + raise AssertionError("Expected a LabelArray, got %s." % type(data)) + return data.as_categorical() class Everything(Classifier): @@ -127,6 +208,50 @@ class Quantiles(SingleInputMixin, Classifier): return type(self).__name__ + '(%d)' % self.params['bins'] +class ScalarStringPredicate(SingleInputMixin, Filter): + """ + A filter that applies a function from (LabelArray, str) -> ndarray[bool]. + + Examples include ``==, !=, startswith, and endswith``. + + This exists because we represent string arrays with + ``zipline.lib.LabelArray``s, which numexpr doesn't know about, so we can't + use the generic NumExprFilter implementation here. + """ + window_length = 0 + + @expect_types(classifier=Classifier, compval=(bytes, unicode)) + def __new__(cls, classifier, op, compval): + return super(ScalarStringPredicate, cls).__new__( + ScalarStringPredicate, + compval=compval, + op=op, + inputs=(classifier,), + mask=classifier.mask, + ) + + def _init(self, op, compval, *args, **kwargs): + self._op = op + self._compval = compval + return super(ScalarStringPredicate, self)._init(*args, **kwargs) + + @classmethod + def static_identity(cls, op, compval, *args, **kwargs): + return ( + super(ScalarStringPredicate, cls).static_identity(*args, **kwargs), + op, + compval, + ) + + def _compute(self, arrays, dates, assets, mask): + data = arrays[0] + return ( + self._op(data, self._compval) + & (data != self.inputs[0].missing_value) + & mask + ) + + class CustomClassifier(PositiveWindowLengthMixin, CustomTermMixin, Classifier): """ Base class for user-defined Classifiers. @@ -149,3 +274,27 @@ class Latest(LatestMixin, CustomClassifier): zipline.pipeline.factors.factor.Latest zipline.pipeline.filters.filter.Latest """ + def _allocate_output(self, windows, shape): + """ + Override the default array allocation to produce a LabelArray when we + have a string-like dtype. + """ + if self.dtype == int64_dtype: + return super(Latest, self)._allocate_output(windows, shape) + + # This is a little bit of a hack. We might not know what the + # categories for a LabelArray are until it's actually been loaded, so + # we need to look at the underlying data. + return windows[0].data.empty_like(shape) + + +class InvalidClassifierComparison(TypeError): + def __init__(self, classifier, compval): + super(InvalidClassifierComparison, self).__init__( + "Can't compare classifier of dtype" + " {dtype} to value {value} of type {type}.".format( + dtype=classifier.dtype, + value=compval, + type=type(compval).__name__, + ) + ) diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index b5980d83..fcd6cfe5 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -14,11 +14,7 @@ from zipline.pipeline.term import ( Term, ) from zipline.utils.input_validation import ensure_dtype -from zipline.utils.numpy_utils import ( - bool_dtype, - int64_dtype, - NoDefaultMissingValue, -) +from zipline.utils.numpy_utils import NoDefaultMissingValue from zipline.utils.preprocess import preprocess @@ -26,7 +22,6 @@ class Column(object): """ An abstract column of data, not yet associated with a dataset. """ - @preprocess(dtype=ensure_dtype) def __init__(self, dtype, missing_value=NotSpecified): self.dtype = dtype @@ -164,15 +159,25 @@ class BoundColumn(LoadableTerm): @property def latest(self): - if self.dtype == bool_dtype: - from zipline.pipeline.filters import Latest - elif self.dtype == int64_dtype: - from zipline.pipeline.classifiers import Latest + from zipline.pipeline.factors import Factor, Latest as LatestFactor + from zipline.pipeline.filters import Filter, Latest as LatestFilter + from zipline.pipeline.classifiers import ( + Classifier, + Latest as LatestClassifier, + ) + + dtype = self.dtype + if dtype in Filter.ALLOWED_DTYPES: + Latest = LatestFilter + elif dtype in Classifier.ALLOWED_DTYPES: + Latest = LatestClassifier else: - from zipline.pipeline.factors import Latest + assert dtype in Factor.ALLOWED_DTYPES, "Unknown dtype %s." % dtype + Latest = LatestFactor + return Latest( inputs=(self,), - dtype=self.dtype, + dtype=dtype, missing_value=self.missing_value, ) diff --git a/zipline/pipeline/data/testing.py b/zipline/pipeline/data/testing.py index 5d452769..30f92642 100644 --- a/zipline/pipeline/data/testing.py +++ b/zipline/pipeline/data/testing.py @@ -7,6 +7,7 @@ zipline.pipeline.data.testing. from .dataset import Column, DataSet from zipline.utils.numpy_utils import ( bool_dtype, + categorical_dtype, float64_dtype, datetime64ns_dtype, int64_dtype, @@ -22,3 +23,9 @@ class TestingDataSet(DataSet): float_col = Column(dtype=float64_dtype) datetime_col = Column(dtype=datetime64ns_dtype) int_col = Column(dtype=int64_dtype, missing_value=0) + + categorical_col = Column(dtype=categorical_dtype, missing_value=u'') + categorical_default_NULL = Column( + dtype=categorical_dtype, + missing_value=u'NULL', + ) diff --git a/zipline/pipeline/engine.py b/zipline/pipeline/engine.py index 8a751bc5..79be34a6 100644 --- a/zipline/pipeline/engine.py +++ b/zipline/pipeline/engine.py @@ -51,7 +51,7 @@ class PipelineEngine(with_metaclass(ABCMeta)): result : pd.DataFrame A frame of computed results. - The columns `result` correspond will be the computed results of + The columns `result` correspond to the entries of `pipeline.columns`, which should be a dictionary mapping strings to instances of `zipline.pipeline.term.Term`. @@ -165,17 +165,20 @@ class SimplePipelineEngine(object): root_mask = self._compute_root_mask(start_date, end_date, extra_rows) dates, assets, root_mask_values = explode(root_mask) - outputs = self.compute_chunk( + results = self.compute_chunk( graph, dates, assets, initial_workspace={self._root_mask_term: root_mask_values}, ) - out_dates = dates[extra_rows:] - screen_values = outputs.pop(screen_name) - - return self._to_narrow(outputs, screen_values, out_dates, assets) + return self._to_narrow( + graph.outputs, + results, + results.pop(screen_name), + dates[extra_rows:], + assets, + ) def _compute_root_mask(self, start_date, end_date, extra_rows): """ @@ -363,14 +366,16 @@ class SimplePipelineEngine(object): out[name] = workspace[term][graph_extra_rows[term]:] return out - def _to_narrow(self, data, mask, dates, assets): + def _to_narrow(self, terms, data, mask, dates, assets): """ Convert raw computed pipeline results into a DataFrame for public APIs. Parameters ---------- + terms : dict[str -> Term] + Dict mapping column names to terms. data : dict[str -> ndarray[ndim=2]] - Dict mapping column names to computed results. + Dict mapping column names to computed results for those names. mask : ndarray[bool, ndim=2] Mask array of values to keep. dates : ndarray[datetime64, ndim=1] @@ -412,8 +417,18 @@ class SimplePipelineEngine(object): resolved_assets = array(self._finder.retrieve_all(assets)) dates_kept = repeat_last_axis(dates.values, len(assets))[mask] assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask] + + final_columns = {} + for name in data: + # Each term that computed an output has its postprocess method + # called on the filtered result. + # + # We use this convert LabelArrays into categoricals, among other + # things. + final_columns[name] = terms[name].postprocess(data[name][mask]) + return DataFrame( - data={name: arr[mask] for name, arr in iteritems(data)}, + data=final_columns, index=MultiIndex.from_arrays([dates_kept, assets_kept]), ).tz_localize('UTC', level=0) @@ -423,6 +438,7 @@ class SimplePipelineEngine(object): """ root = self._root_mask_term clsname = type(self).__name__ + # Writing this out explicitly so this errors in testing if we change # the name without updating this line. compute_chunk_name = self.compute_chunk.__name__ diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py index 79d10f71..a76002a9 100644 --- a/zipline/pipeline/factors/factor.py +++ b/zipline/pipeline/factors/factor.py @@ -6,11 +6,11 @@ from operator import attrgetter from numbers import Number from numpy import inf, where -from toolz import curry from zipline.errors import UnknownRankMethod from zipline.lib.normalize import naive_grouped_rowwise_apply from zipline.lib.rank import masked_rankdata_2d +from zipline.pipeline.api_utils import restrict_to_dtype from zipline.pipeline.classifiers import Classifier, Everything, Quantiles from zipline.pipeline.mixins import ( CustomTermMixin, @@ -42,6 +42,7 @@ from zipline.pipeline.filters import ( PercentileFilter, NullFilter, ) +from zipline.utils.functional import with_doc, with_name from zipline.utils.input_validation import expect_types from zipline.utils.math_utils import nanmean, nanstd from zipline.utils.numpy_utils import ( @@ -51,7 +52,6 @@ from zipline.utils.numpy_utils import ( float64_dtype, int64_dtype, ) -from zipline.utils.preprocess import preprocess _RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal']) @@ -81,37 +81,6 @@ def coerce_numbers_to_my_dtype(f): return method -@curry -def set_attribute(name, value): - """ - Decorator factory for setting attributes on a function. - - Doesn't change the behavior of the wrapped function. - - Usage - ----- - >>> @set_attribute('__name__', 'foo') - ... def bar(): - ... return 3 - ... - >>> bar() - 3 - >>> bar.__name__ - 'foo' - """ - def decorator(f): - setattr(f, name, value) - return f - return decorator - - -# Decorators for setting the __name__ and __doc__ properties of a decorated -# function. -# Example: -with_name = set_attribute('__name__') -with_doc = set_attribute('__doc__') - - def binop_return_type(op): if is_comparison(op): return NumExprFilter @@ -328,51 +297,6 @@ def function_application(func): return mathfunc -def restrict_to_dtype(dtype, message_template): - """ - A factory for decorators that restricting Factor methods to only be - callable on Factors with a specific dtype. - - This is conceptually similar to - zipline.utils.input_validation.expect_dtypes, but provides more flexibility - for providing error messages that are specifically targeting Factor - methods. - - Parameters - ---------- - dtype : numpy.dtype - The dtype on which the decorated method may be called. - message_template : str - A template for the error message to be raised. - `message_template.format` will be called with keyword arguments - `method_name`, `expected_dtype`, and `received_dtype`. - - Usage - ----- - @restrict_to_dtype( - dtype=float64_dtype, - message_template=( - "{method_name}() was called on a factor of dtype {received_dtype}." - "{method_name}() requires factors of dtype{expected_dtype}." - - ), - ) - def some_factor_method(self, ...): - self.stuff_that_requires_being_float64(...) - """ - def processor(factor_method, _, factor_instance): - factor_dtype = factor_instance.dtype - if factor_dtype != dtype: - raise TypeError( - message_template.format( - method_name=factor_method.__name__, - expected_dtype=dtype.name, - received_dtype=factor_dtype, - ) - ) - return factor_instance - return preprocess(self=processor) - # Decorators for Factor methods. if_not_float64_tell_caller_to_use_isnull = restrict_to_dtype( dtype=float64_dtype, diff --git a/zipline/pipeline/loaders/synthetic.py b/zipline/pipeline/loaders/synthetic.py index f9957ae9..dfea5829 100644 --- a/zipline/pipeline/loaders/synthetic.py +++ b/zipline/pipeline/loaders/synthetic.py @@ -27,6 +27,7 @@ from zipline.utils.numpy_utils import ( datetime64ns_dtype, float64_dtype, int64_dtype, + object_dtype, ) @@ -148,6 +149,7 @@ class SeededRandomLoader(PrecomputedLoader): float64_dtype: self._float_values, int64_dtype: self._int_values, bool_dtype: self._bool_values, + object_dtype: self._object_values, }[dtype](shape) @property @@ -191,6 +193,9 @@ class SeededRandomLoader(PrecomputedLoader): """ return self.state.randn(*shape) < 0 + def _object_values(self, shape): + return self._int_values(shape).astype(str).astype(object) + OHLCV = ('open', 'high', 'low', 'close', 'volume') OHLC = ('open', 'high', 'low', 'close') diff --git a/zipline/pipeline/mixins.py b/zipline/pipeline/mixins.py index d5f0fcb2..497fc145 100644 --- a/zipline/pipeline/mixins.py +++ b/zipline/pipeline/mixins.py @@ -1,7 +1,7 @@ """ Mixins classes for use with Filters and Factors. """ -from numpy import full_like, recarray +from numpy import full, recarray from zipline.utils.control_flow import nullctx from zipline.errors import WindowLengthNotPositive, UnsupportedDataType @@ -51,7 +51,7 @@ class RestrictedDTypeMixin(object): if self.dtype not in self.ALLOWED_DTYPES: raise UnsupportedDataType( - typename=type(self.__name__), + typename=type(self).__name__, dtype=self.dtype, ) @@ -103,27 +103,45 @@ class CustomTermMixin(object): """ raise NotImplementedError() + def _allocate_output(self, windows, shape): + """ + Allocate an output array whose rows should be passed to `self.compute`. + + The resulting array must have a shape of ``shape``. + + If we have standard outputs (i.e. self.outputs is NotSpecified), the + default is an empty ndarray whose dtype is ``self.dtype``. + + If we have an outputs tuple, the default is an empty recarray with + ``self.outputs`` as field names. Each field will have dtype + ``self.dtype``, the default shape is ``self.shape``. + + This can be overridden to control the kind of array constructed + (e.g. to produce a LabelArray instead of an ndarray). + """ + missing_value = self.missing_value + outputs = self.outputs + if outputs is not NotSpecified: + out = recarray( + shape, + formats=[self.dtype.str] * len(outputs), + names=outputs, + ) + out[:] = missing_value + else: + out = full(shape, missing_value, dtype=self.dtype) + return out + def _compute(self, windows, dates, assets, mask): """ Call the user's `compute` function on each window with a pre-built output array. """ compute = self.compute - missing_value = self.missing_value params = self.params - outputs = self.outputs - if outputs is not NotSpecified: - out = recarray( - mask.shape, - formats=[self.dtype.str] * len(outputs), - names=outputs, - ) - out[:] = missing_value - else: - out = full_like(mask, missing_value, dtype=self.dtype) + out = self._allocate_output(windows, mask.shape) + with self.ctx: - # TODO: Consider pre-filtering columns that are all-nan at each - # time-step? for idx, date in enumerate(dates): col_mask = mask[idx] masked_out = out[idx][col_mask] diff --git a/zipline/pipeline/term.py b/zipline/pipeline/term.py index e6b6e9bf..1bffc3e7 100644 --- a/zipline/pipeline/term.py +++ b/zipline/pipeline/term.py @@ -4,7 +4,7 @@ Base class for Filters, Factors and Classifiers from abc import ABCMeta, abstractproperty from weakref import WeakValueDictionary -from numpy import dtype as dtype_class +from numpy import dtype as dtype_class, ndarray from six import with_metaclass from zipline.errors import ( DTypeNotSpecified, @@ -16,6 +16,7 @@ from zipline.errors import ( WindowLengthNotSpecified, ) from zipline.lib.adjusted_array import can_represent_dtype +from zipline.utils.input_validation import expect_types from zipline.utils.memoize import lazyval from zipline.utils.numpy_utils import ( bool_dtype, @@ -476,6 +477,19 @@ class ComputableTerm(Term): out[self.mask] = 0 return out + @expect_types(data=ndarray) + def postprocess(self, data): + """ + Called with an result of ``self``, unravelled (i.e. 1-dimensional) + after any user-defined screens have been applied. + + This is mostly useful for transforming the dtype of an output, e.g., to + convert a LabelArray into a pandas Categorical. + + The default implementation is to just return data unchanged. + """ + return data + def __repr__(self): return ( "{type}({inputs}, window_length={window_length})" diff --git a/zipline/testing/__init__.py b/zipline/testing/__init__.py index a415daa8..c52ff03e 100644 --- a/zipline/testing/__init__.py +++ b/zipline/testing/__init__.py @@ -51,3 +51,4 @@ from .core import ( # noqa write_bcolz_minute_data, write_compressed, ) +from .fixtures import ZiplineTestCase # noqa diff --git a/zipline/testing/core.py b/zipline/testing/core.py index 11430470..ad130a79 100644 --- a/zipline/testing/core.py +++ b/zipline/testing/core.py @@ -1032,7 +1032,7 @@ def temp_pipeline_engine(calendar, sids, random_seed, symbols=None): yield SimplePipelineEngine(get_loader, calendar, finder) -def parameter_space(**params): +def parameter_space(__fail_fast=False, **params): """ Wrapper around subtest that allows passing keywords mapping names to iterables of values. @@ -1083,7 +1083,16 @@ def parameter_space(**params): ) param_sets = product(*(params[name] for name in argnames)) - return subtest(param_sets, *argnames)(f) + + if __fail_fast: + @wraps(f) + def wrapped(self): + for args in param_sets: + f(self, *args) + return wrapped + else: + return subtest(param_sets, *argnames)(f) + return decorator diff --git a/zipline/utils/compat.py b/zipline/utils/compat.py index 62ed44ef..0282cb89 100644 --- a/zipline/utils/compat.py +++ b/zipline/utils/compat.py @@ -11,6 +11,9 @@ if PY2: else: from types import MappingProxyType as mappingproxy + +unicode = type(u'') + __all__ = [ 'mappingproxy', ] diff --git a/zipline/utils/functional.py b/zipline/utils/functional.py index 79569de0..685f0aef 100644 --- a/zipline/utils/functional.py +++ b/zipline/utils/functional.py @@ -56,6 +56,8 @@ def apply(f, *args, **kwargs): # Alias for use as a class decorator. instance = apply +from zipline.utils.sentinel import sentinel + def mapall(funcs, seq): """ @@ -242,3 +244,87 @@ def unzip(seq, elem_len=None): if elem_len is None: raise ValueError("cannot unzip empty sequence without 'elem_len'") return ((),) * elem_len + + +_no_default = sentinel('_no_default') + + +def getattrs(value, attrs, default=_no_default): + """ + Perform a chained application of ``getattr`` on ``value`` with the values + in ``attrs``. + + If ``default`` is supplied, return it if any of the attribute lookups fail. + + Parameters + ---------- + value : object + Root of the lookup chain. + attrs : iterable[str] + Sequence of attributes to look up. + default : object, optional + Value to return if any of the lookups fail. + + Returns + ------- + result : object + Result of the lookup sequence. + + Example + ------- + >>> class EmptyObject(object): + ... pass + ... + >>> obj = EmptyObject() + >>> obj.foo = EmptyObject() + >>> obj.foo.bar = "value" + >>> getattrs(obj, ('foo', 'bar')) + 'value' + + >>> getattrs(obj, ('foo', 'buzz')) + Traceback (most recent call last): + ... + AttributeError: 'EmptyObject' object has no attribute 'buzz' + + >>> getattrs(obj, ('foo', 'buzz'), 'default') + 'default' + """ + try: + for attr in attrs: + value = getattr(value, attr) + except AttributeError: + if default is _no_default: + raise + value = default + return value + + +@curry +def set_attribute(name, value): + """ + Decorator factory for setting attributes on a function. + + Doesn't change the behavior of the wrapped function. + + Usage + ----- + >>> @set_attribute('__name__', 'foo') + ... def bar(): + ... return 3 + ... + >>> bar() + 3 + >>> bar.__name__ + 'foo' + """ + def decorator(f): + setattr(f, name, value) + return f + return decorator + + +# Decorators for setting the __name__ and __doc__ properties of a decorated +# function. +# Example: +with_name = set_attribute('__name__') +with_doc = set_attribute('__doc__') diff --git a/zipline/utils/input_validation.py b/zipline/utils/input_validation.py index 66e8b9fe..0424e6f4 100644 --- a/zipline/utils/input_validation.py +++ b/zipline/utils/input_validation.py @@ -22,7 +22,8 @@ from six import iteritems, string_types, PY3 from toolz import valmap, complement, compose import toolz.curried.operator as op -from zipline.utils.preprocess import preprocess +from zipline.utils.functional import getattrs +from zipline.utils.preprocess import call, preprocess def optionally(preprocessor): @@ -163,7 +164,7 @@ def ensure_timestamp(func, argname, arg): ) -def expect_dtypes(*_pos, **named): +def expect_dtypes(**named): """ Preprocessing decorator that verifies inputs have expected numpy dtypes. @@ -181,9 +182,6 @@ def expect_dtypes(*_pos, **named): ... TypeError: foo() expected an argument with dtype 'int64' for argument 'x', but got dtype 'float64' instead. # noqa """ - if _pos: - raise TypeError("expect_dtypes() only takes keyword arguments.") - for name, type_ in iteritems(named): if not isinstance(type_, (dtype, tuple)): raise TypeError( @@ -193,42 +191,101 @@ def expect_dtypes(*_pos, **named): ) ) - def _expect_dtype(_dtype_or_dtype_tuple): + @preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,))) + def _expect_dtype(dtypes): """ - Factory for dtype-checking functions that work the @preprocess + Factory for dtype-checking functions that work with the @preprocess decorator. """ - # Slightly different messages for dtype and tuple of dtypes. - if isinstance(_dtype_or_dtype_tuple, tuple): - allowed_dtypes = _dtype_or_dtype_tuple - else: - allowed_dtypes = (_dtype_or_dtype_tuple,) - template = ( - "%(funcname)s() expected a value with dtype {dtype_str} " - "for argument '%(argname)s', but got %(actual)r instead." - ).format(dtype_str=' or '.join(repr(d.name) for d in allowed_dtypes)) - - def check_dtype(value): - return getattr(value, 'dtype', None) not in allowed_dtypes - - def display_bad_value(value): + def error_message(func, argname, value): # If the bad value has a dtype, but it's wrong, show the dtype - # name. + # name. Otherwise just show the value. try: - return value.dtype.name + value_to_show = value.dtype.name except AttributeError: - return value + value_to_show = value + return ( + "{funcname}() expected a value with dtype {dtype_str} " + "for argument {argname!r}, but got {value!r} instead." + ).format( + funcname=_qualified_name(func), + dtype_str=' or '.join(repr(d.name) for d in dtypes), + argname=argname, + value=value_to_show, + ) - return make_check( - exc_type=TypeError, - template=template, - pred=check_dtype, - actual=display_bad_value, - ) + def _actual_preprocessor(func, argname, argvalue): + if getattr(argvalue, 'dtype', object()) not in dtypes: + raise TypeError(error_message(func, argname, argvalue)) + return argvalue + + return _actual_preprocessor return preprocess(**valmap(_expect_dtype, named)) +def expect_kinds(**named): + """ + Preprocessing decorator that verifies inputs have expected dtype kinds. + + Usage + ----- + >>> from numpy import int64, int32, float32 + >>> @expect_kinds(x='i') + ... def foo(x): + ... return x + ... + >>> foo(int64(2)) + 2 + >>> foo(int32(2)) + 2 + >>> foo(float32(2)) + Traceback (most recent call last): + ...n + TypeError: foo() expected a numpy object of kind 'i' for argument 'x', but got 'f' instead. # noqa + """ + for name, kind in iteritems(named): + if not isinstance(kind, (str, tuple)): + raise TypeError( + "expect_dtype_kinds() expected a string or tuple of strings" + " for argument {name!r}, but got {kind} instead.".format( + name=name, kind=dtype, + ) + ) + + @preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,))) + def _expect_kind(kinds): + """ + Factory for kind-checking functions that work the @preprocess + decorator. + """ + def error_message(func, argname, value): + # If the bad value has a dtype, but it's wrong, show the dtype + # kind. Otherwise just show the value. + try: + value_to_show = value.dtype.kind + except AttributeError: + value_to_show = value + return ( + "{funcname}() expected a numpy object of kind {kinds} " + "for argument {argname!r}, but got {value!r} instead." + ).format( + funcname=_qualified_name(func), + kinds=' or '.join(map(repr, kinds)), + argname=argname, + value=value_to_show, + ) + + def _actual_preprocessor(func, argname, argvalue): + if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds: + raise TypeError(error_message(func, argname, argvalue)) + return argvalue + + return _actual_preprocessor + + return preprocess(**valmap(_expect_kind, named)) + + def expect_types(*_pos, **named): """ Preprocessing decorator that verifies inputs have expected types. diff --git a/zipline/utils/numpy_utils.py b/zipline/utils/numpy_utils.py index 5df6e213..a254fff8 100644 --- a/zipline/utils/numpy_utils.py +++ b/zipline/utils/numpy_utils.py @@ -32,6 +32,10 @@ complex128_dtype = dtype('complex128') datetime64D_dtype = dtype('datetime64[D]') datetime64ns_dtype = dtype('datetime64[ns]') +object_dtype = dtype('O') +# We use object arrays for strings. +categorical_dtype = object_dtype + make_datetime64ns = flip(datetime64, 'ns') make_datetime64D = flip(datetime64, 'D') @@ -71,6 +75,7 @@ def make_kind_check(python_types, numpy_kind): is_float = make_kind_check(float, 'f') is_int = make_kind_check(int, 'i') is_datetime = make_kind_check(datetime, 'M') +is_object = make_kind_check(object, 'O') def coerce_to_dtype(dtype, value): @@ -263,9 +268,7 @@ def rolling_window(array, length): _notNaT = make_datetime64D(0) -def busday_count_mask_NaT(begindates, - enddates, - out=None): +def busday_count_mask_NaT(begindates, enddates, out=None): """ Simple of numpy.busday_count that returns `float` arrays rather than int arrays, and handles `NaT`s by returning `NaN`s where the inputs were `NaT`.