From 5f190395adfe63b68a5fdf7cbba7837c7e968e9f Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 28 Apr 2016 18:10:48 -0400 Subject: [PATCH] ENH: Add support for strings in Pipeline. - Adds a new class, ``LabelArray``, which is a subclass of np.ndarray. LabelArray is conceptually similar to pandas.Categorical, in that it stores data with many duplicate values as indices into an array of unique values. For string data with many duplicates (e.g. time-series of tickers or or industry classifications), this provides multiple orders of magnitude of improvement when doing string operations, especially string comparison/matching operations. - Adds a new generic object "specialization" for `AdjustedArrayWindow`, and a corresponding ObjectOverwrite adjustment. - Adds a new ``postprocess`` method to ``zipline.pipeline.term.Term``. This method is called on the final result of any pipeline expression after screen filtering has occurred. The default implementation of ``postprocess`` is identity, but Classifier overrides it to coerce string columns into pandas.Categoricals before presenting them to the user. --- setup.py | 2 + tests/pipeline/test_adjusted_array.py | 397 +++++++++++++---- tests/pipeline/test_classifier.py | 162 ++++++- tests/pipeline/test_column.py | 22 +- tests/pipeline/test_term.py | 3 +- tests/test_labelarray.py | 151 +++++++ zipline/algorithm.py | 1 + zipline/data/us_equity_loader.py | 9 +- zipline/lib/_factorize.pyx | 108 +++++ zipline/lib/_float64window.pyx | 4 +- zipline/lib/_int64window.pyx | 5 +- zipline/lib/_labelwindow.pyx | 6 + zipline/lib/_uint8window.pyx | 5 +- zipline/lib/_windowtemplate.pxi | 32 +- zipline/lib/adjusted_array.py | 53 ++- zipline/lib/adjustment.pyx | 64 ++- zipline/lib/labelarray.py | 482 +++++++++++++++++++++ zipline/pipeline/api_utils.py | 49 +++ zipline/pipeline/classifiers/classifier.py | 189 +++++++- zipline/pipeline/data/dataset.py | 29 +- zipline/pipeline/data/testing.py | 7 + zipline/pipeline/engine.py | 34 +- zipline/pipeline/factors/factor.py | 80 +--- zipline/pipeline/loaders/synthetic.py | 5 + zipline/pipeline/mixins.py | 48 +- zipline/pipeline/term.py | 16 +- zipline/testing/__init__.py | 1 + zipline/testing/core.py | 13 +- zipline/utils/compat.py | 3 + zipline/utils/functional.py | 86 ++++ zipline/utils/input_validation.py | 117 +++-- zipline/utils/numpy_utils.py | 9 +- 32 files changed, 1882 insertions(+), 310 deletions(-) create mode 100644 tests/test_labelarray.py create mode 100644 zipline/lib/_factorize.pyx create mode 100644 zipline/lib/_labelwindow.pyx create mode 100644 zipline/lib/labelarray.py create mode 100644 zipline/pipeline/api_utils.py diff --git a/setup.py b/setup.py index 78063820..e12e0b49 100644 --- a/setup.py +++ b/setup.py @@ -81,11 +81,13 @@ class LazyBuildExtCommandClass(dict): ext_modules = [ Extension('zipline.assets._assets', ['zipline/assets/_assets.pyx']), Extension('zipline.lib.adjustment', ['zipline/lib/adjustment.pyx']), + Extension('zipline.lib._factorize', ['zipline/lib/_factorize.pyx']), Extension( 'zipline.lib._float64window', ['zipline/lib/_float64window.pyx'] ), Extension('zipline.lib._int64window', ['zipline/lib/_int64window.pyx']), Extension('zipline.lib._uint8window', ['zipline/lib/_uint8window.pyx']), + Extension('zipline.lib._labelwindow', ['zipline/lib/_labelwindow.pyx']), Extension('zipline.lib.rank', ['zipline/lib/rank.pyx']), Extension('zipline.data._equities', ['zipline/data/_equities.pyx']), Extension('zipline.data._adjustments', ['zipline/data/_adjustments.pyx']), diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py index a2a8cde0..11fb5130 100644 --- a/tests/pipeline/test_adjusted_array.py +++ b/tests/pipeline/test_adjusted_array.py @@ -9,19 +9,23 @@ from nose_parameterized import parameterized from numpy import ( arange, array, + asarray, + dtype, full, where, ) -from numpy.testing import assert_array_equal from six.moves import zip_longest +from toolz import curry from zipline.errors import WindowLengthNotPositive, WindowLengthTooLong from zipline.lib.adjustment import ( Datetime64Overwrite, Float64Multiply, Float64Overwrite, + ObjectOverwrite, ) from zipline.lib.adjusted_array import AdjustedArray, NOMASK +from zipline.lib.labelarray import LabelArray from zipline.testing import check_arrays, parameter_space from zipline.utils.numpy_utils import ( coerce_to_dtype, @@ -29,6 +33,7 @@ from zipline.utils.numpy_utils import ( default_missing_value_for_dtype, float64_dtype, int64_dtype, + object_dtype, ) @@ -62,12 +67,41 @@ def valid_window_lengths(underlying_buffer_length): return iter(range(1, underlying_buffer_length + 1)) -def _gen_unadjusted_cases(dtype): +@curry +def as_dtype(dtype, data): + """ + Curried wrapper around array.astype for when you have the dtype before you + have the data. + """ + return asarray(data).astype(dtype) + +@curry +def as_labelarray(initial_dtype, missing_value, array): + """ + Curried wrapper around LabelArray, that round-trips the input data through + `initial_dtype` first. + """ + return LabelArray( + array.astype(initial_dtype), + missing_value=initial_dtype.type(''), + ) + + +bytes_dtype = dtype('S3') +unicode_dtype = dtype('U3') + + +def _gen_unadjusted_cases(name, + make_input, + make_expected_output, + missing_value): nrows = 6 ncols = 3 - data = arange(nrows * ncols).astype(dtype).reshape(nrows, ncols) - missing_value = default_missing_value_for_dtype(dtype) + + raw_data = arange(nrows * ncols).reshape(nrows, ncols) + input_array = make_input(raw_data) + expected_output_array = make_expected_output(raw_data) for windowlen in valid_window_lengths(nrows): @@ -76,13 +110,13 @@ def _gen_unadjusted_cases(dtype): ) yield ( - "dtype_%s_length_%d" % (dtype, windowlen), - data, + "%s_length_%d" % (name, windowlen), + input_array, windowlen, {}, missing_value, [ - data[offset:offset + windowlen] + expected_output_array[offset:offset + windowlen] for offset in range(num_legal_windows) ], ) @@ -156,84 +190,125 @@ def _gen_multiplicative_adjustment_cases(dtype): [1, 6, 1], [1, 1, 1]], dtype=dtype) - return _gen_expectations(baseline, adjustments, buffer_as_of, nrows) - - -def _gen_overwrite_adjustment_cases(dtype): - """ - Generate test cases for overwrite adjustments. - - The algorithm used here is the same as the one used above for - multiplicative adjustments. The only difference is the semantics of how - the adjustments are expected to modify the arrays. - """ - adjustment_type = { - float64_dtype: Float64Overwrite, - datetime64ns_dtype: Datetime64Overwrite, - }[dtype] - - nrows, ncols = 6, 3 - adjustments = {} - buffer_as_of = [None] * 6 - baseline = buffer_as_of[0] = full((nrows, ncols), 2, dtype=dtype) - - # Note that row indices are inclusive! - adjustments[1] = [ - adjustment_type(0, 0, 0, 0, coerce_to_dtype(dtype, 1)), - ] - buffer_as_of[1] = array([[1, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]], dtype=dtype) - - # No adjustment at index 2. - buffer_as_of[2] = buffer_as_of[1] - - adjustments[3] = [ - adjustment_type(1, 2, 1, 1, coerce_to_dtype(dtype, 3)), - adjustment_type(0, 1, 0, 0, coerce_to_dtype(dtype, 4)), - ] - buffer_as_of[3] = array([[4, 2, 2], - [4, 3, 2], - [2, 3, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]], dtype=dtype) - - adjustments[4] = [ - adjustment_type(0, 3, 2, 2, coerce_to_dtype(dtype, 5)) - ] - buffer_as_of[4] = array([[4, 2, 5], - [4, 3, 5], - [2, 3, 5], - [2, 2, 5], - [2, 2, 2], - [2, 2, 2]], dtype=dtype) - - adjustments[5] = [ - adjustment_type(0, 4, 1, 1, coerce_to_dtype(dtype, 6)), - adjustment_type(2, 2, 2, 2, coerce_to_dtype(dtype, 7)), - ] - buffer_as_of[5] = array([[4, 6, 5], - [4, 6, 5], - [2, 6, 7], - [2, 6, 5], - [2, 6, 2], - [2, 2, 2]], dtype=dtype) - return _gen_expectations( baseline, + default_missing_value_for_dtype(dtype), adjustments, buffer_as_of, nrows, ) -def _gen_expectations(baseline, adjustments, buffer_as_of, nrows): +def _gen_overwrite_adjustment_cases(name, + make_input, + make_expected_output, + dtype, + missing_value): + """ + Generate test cases for overwrite adjustments. + + The algorithm used here is the same as the one used above for + multiplicative adjustments. The only difference is the semantics of how + the adjustments are expected to modify the arrays. + + This is parameterized on `make_input` and make_expected_output functions, + which take 2-D lists of values and transform them into desired input/output + arrays. We do this so that we can easily test both vanilla numpy ndarrays + and our own LabelArray class for strings. + """ + adjustment_type = { + float64_dtype: Float64Overwrite, + datetime64ns_dtype: Datetime64Overwrite, + bytes_dtype: ObjectOverwrite, + unicode_dtype: ObjectOverwrite, + object_dtype: ObjectOverwrite, + }[dtype] + + if dtype == object_dtype: + # When we're testing object dtypes, we expect to have strings, but + # coerce_to_dtype(object, 3) just gives 3 as a Python integer. + def make_overwrite_value(dtype, value): + return str(value) + else: + make_overwrite_value = coerce_to_dtype + + adjustments = {} + buffer_as_of = [None] * 6 + baseline = make_input([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + buffer_as_of[0] = make_expected_output([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + # Note that row indices are inclusive! + adjustments[1] = [ + adjustment_type(0, 0, 0, 0, make_overwrite_value(dtype, 1)), + ] + buffer_as_of[1] = make_expected_output([[1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + # No adjustment at index 2. + buffer_as_of[2] = buffer_as_of[1] + + adjustments[3] = [ + adjustment_type(1, 2, 1, 1, make_overwrite_value(dtype, 3)), + adjustment_type(0, 1, 0, 0, make_overwrite_value(dtype, 4)), + ] + buffer_as_of[3] = make_expected_output([[4, 2, 2], + [4, 3, 2], + [2, 3, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + adjustments[4] = [ + adjustment_type(0, 3, 2, 2, make_overwrite_value(dtype, 5)) + ] + buffer_as_of[4] = make_expected_output([[4, 2, 5], + [4, 3, 5], + [2, 3, 5], + [2, 2, 5], + [2, 2, 2], + [2, 2, 2]]) + + adjustments[5] = [ + adjustment_type(0, 4, 1, 1, make_overwrite_value(dtype, 6)), + adjustment_type(2, 2, 2, 2, make_overwrite_value(dtype, 7)), + ] + buffer_as_of[5] = make_expected_output([[4, 6, 5], + [4, 6, 5], + [2, 6, 7], + [2, 6, 5], + [2, 6, 2], + [2, 2, 2]]) + + return _gen_expectations( + baseline, + missing_value, + adjustments, + buffer_as_of, + nrows=6, + ) + + +def _gen_expectations(baseline, + missing_value, + adjustments, + buffer_as_of, + nrows): - missing_value = default_missing_value_for_dtype(baseline.dtype) for windowlen in valid_window_lengths(nrows): num_legal_windows = num_windows_of_length_M_on_buffers_of_length_N( @@ -263,8 +338,60 @@ class AdjustedArrayTestCase(TestCase): @parameterized.expand( chain( - _gen_unadjusted_cases(float64_dtype), - _gen_unadjusted_cases(datetime64ns_dtype), + _gen_unadjusted_cases( + 'float', + make_input=as_dtype(float64_dtype), + make_expected_output=as_dtype(float64_dtype), + missing_value=default_missing_value_for_dtype(float64_dtype), + ), + _gen_unadjusted_cases( + 'datetime', + make_input=as_dtype(datetime64ns_dtype), + make_expected_output=as_dtype(datetime64ns_dtype), + missing_value=default_missing_value_for_dtype( + datetime64ns_dtype + ), + ), + # Test passing an array of strings to AdjustedArray. + _gen_unadjusted_cases( + 'bytes_ndarray', + make_input=as_dtype(bytes_dtype), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_ndarray', + make_input=as_dtype(unicode_dtype), + make_expected_output=as_labelarray(unicode_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_ndarray', + make_input=lambda a: a.astype(str).astype(object), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + # Test passing a LabelArray directly to AdjustedArray. + _gen_unadjusted_cases( + 'bytes_labelarray', + make_input=as_labelarray(bytes_dtype, b''), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_labelarray', + make_input=as_labelarray(unicode_dtype, u''), + make_expected_output=as_labelarray(bytes_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_labelarray', + make_input=( + lambda a: LabelArray(a.astype(str).astype(object), b'') + ), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), ) ) def test_no_adjustments(self, @@ -273,14 +400,13 @@ class AdjustedArrayTestCase(TestCase): lookback, adjustments, missing_value, - expected): + expected_output): array = AdjustedArray(data, NOMASK, adjustments, missing_value) for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. - window_iter = array.traverse(lookback) - for yielded, expected_yield in zip_longest(window_iter, expected): - self.assertEqual(yielded.dtype, data.dtype) - assert_array_equal(yielded, expected_yield) + in_out = zip(array.traverse(lookback), expected_output) + for yielded, expected_yield in in_out: + check_arrays(yielded, expected_yield) @parameterized.expand(_gen_multiplicative_adjustment_cases(float64_dtype)) def test_multiplicative_adjustments(self, @@ -295,12 +421,70 @@ class AdjustedArrayTestCase(TestCase): for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): - assert_array_equal(yielded, expected_yield) + check_arrays(yielded, expected_yield) @parameterized.expand( chain( - _gen_overwrite_adjustment_cases(float64_dtype), - _gen_overwrite_adjustment_cases(datetime64ns_dtype), + _gen_overwrite_adjustment_cases( + 'float', + make_input=as_dtype(float64_dtype), + make_expected_output=as_dtype(float64_dtype), + dtype=float64_dtype, + missing_value=default_missing_value_for_dtype(float64_dtype), + ), + _gen_overwrite_adjustment_cases( + 'datetime', + make_input=as_dtype(datetime64ns_dtype), + make_expected_output=as_dtype(datetime64ns_dtype), + dtype=datetime64ns_dtype, + missing_value=default_missing_value_for_dtype( + datetime64ns_dtype, + ), + ), + # There are six cases here: + # Using np.bytes/np.unicode/python string arrays as inputs. + # Passing np.bytes/np.unicode/python string arrays to LabelArray, + # and using those as input. + # + # The outputs should always be LabelArrays. + _gen_unadjusted_cases( + 'bytes_ndarray', + make_input=as_dtype(bytes_dtype), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_ndarray', + make_input=as_dtype(unicode_dtype), + make_expected_output=as_labelarray(unicode_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_ndarray', + make_input=lambda a: a.astype(str).astype(object), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'bytes_labelarray', + make_input=as_labelarray(bytes_dtype, b''), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), + _gen_unadjusted_cases( + 'unicode_labelarray', + make_input=as_labelarray(unicode_dtype, u''), + make_expected_output=as_labelarray(bytes_dtype, u''), + missing_value=u'', + ), + _gen_unadjusted_cases( + 'object_labelarray', + make_input=( + lambda a: LabelArray(a.astype(str).astype(object), b'') + ), + make_expected_output=as_labelarray(bytes_dtype, b''), + missing_value=b'', + ), ) ) def test_overwrite_adjustment_cases(self, @@ -314,11 +498,15 @@ class AdjustedArrayTestCase(TestCase): for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): - self.assertEqual(yielded.dtype, data.dtype) - assert_array_equal(yielded, expected_yield) + check_arrays(yielded, expected_yield) @parameter_space( - dtype=[float64_dtype, int64_dtype, datetime64ns_dtype], + __fail_fast=True, + dtype=[ + float64_dtype, + int64_dtype, + datetime64ns_dtype, + ], missing_value=[0, 10000], window_length=[2, 3], ) @@ -341,6 +529,37 @@ class AdjustedArrayTestCase(TestCase): for expected, actual in zip(gen_expected, gen_actual): check_arrays(expected, actual) + @parameter_space( + __fail_fast=True, + dtype=[bytes_dtype, unicode_dtype, object_dtype], + missing_value=["0", "-1", ""], + window_length=[2, 3], + ) + def test_masking_with_strings(self, dtype, missing_value, window_length): + missing_value = coerce_to_dtype(dtype, missing_value) + baseline_ints = arange(15).reshape(5, 3) + + # Coerce to string first so that coercion to object gets us an array of + # string objects. + baseline = baseline_ints.astype(str).astype(dtype) + mask = (baseline_ints % 2).astype(bool) + + masked_baseline = LabelArray(baseline, missing_value=missing_value) + masked_baseline[~mask] = missing_value + + array = AdjustedArray( + baseline, + mask, + adjustments={}, + missing_value=missing_value, + ) + + gen_expected = moving_window(masked_baseline, window_length) + gen_actual = array.traverse(window_length=window_length) + + for expected, actual in zip(gen_expected, gen_actual): + check_arrays(expected, actual) + def test_invalid_lookback(self): data = arange(30, dtype=float).reshape(6, 5) diff --git a/tests/pipeline/test_classifier.py b/tests/pipeline/test_classifier.py index a4b25178..77f76079 100644 --- a/tests/pipeline/test_classifier.py +++ b/tests/pipeline/test_classifier.py @@ -1,12 +1,21 @@ import numpy as np +from zipline.lib.labelarray import LabelArray from zipline.pipeline import Classifier from zipline.testing import parameter_space -from zipline.utils.numpy_utils import int64_dtype +from zipline.utils.numpy_utils import ( + categorical_dtype, + coerce_to_dtype, + int64_dtype, +) from .base import BasePipelineTestCase +bytes_dtype = np.dtype('S3') +unicode_dtype = np.dtype('U3') + + class ClassifierTestCase(BasePipelineTestCase): @parameter_space(mv=[-1, 0, 1, 999]) @@ -69,10 +78,56 @@ class ClassifierTestCase(BasePipelineTestCase): mask=self.build_mask(self.ones_mask(shape=data.shape)), ) - @parameter_space(missing=[-1, 0, 1]) - def test_disallow_comparison_to_missing_value(self, missing): + @parameter_space( + __fail_fast=True, + compval=['a', 'ab', 'not in the array'], + labelarray_dtype=(bytes_dtype, categorical_dtype, unicode_dtype), + ) + def test_string_eq(self, compval, labelarray_dtype): + + compval = labelarray_dtype.type(compval) + class C(Classifier): - dtype = int64_dtype + dtype = categorical_dtype + missing_value = '' + inputs = () + window_length = 0 + + c = C() + + # There's no significance to the values here other than that they + # contain a mix of the comparison value and other values. + data = LabelArray( + np.asarray( + [['', 'a', 'ab', 'ba'], + ['z', 'ab', 'a', 'ab'], + ['aa', 'ab', '', 'ab'], + ['aa', 'a', 'ba', 'ba']], + dtype=labelarray_dtype, + ), + missing_value='', + ) + + self.check_terms( + terms={ + 'eq': c.eq(compval), + }, + expected={ + 'eq': (data == compval), + }, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + + @parameter_space( + missing=[-1, 0, 1], + dtype_=[int64_dtype, categorical_dtype], + ) + def test_disallow_comparison_to_missing_value(self, missing, dtype_): + missing = coerce_to_dtype(dtype_, missing) + + class C(Classifier): + dtype = dtype_ missing_value = missing inputs = () window_length = 0 @@ -82,7 +137,7 @@ class ClassifierTestCase(BasePipelineTestCase): errmsg = str(e.exception) self.assertEqual( errmsg, - "Comparison against self.missing_value ({v}) in C.eq().\n" + "Comparison against self.missing_value ({v!r}) in C.eq().\n" "Missing values have NaN semantics, so the requested comparison" " would always produce False.\n" "Use the isnull() method to check for missing values.".format( @@ -118,3 +173,100 @@ class ClassifierTestCase(BasePipelineTestCase): initial_workspace={c: data}, mask=self.build_mask(self.ones_mask(shape=data.shape)), ) + + @parameter_space( + __fail_fast=True, + compval=['a', 'ab', '', 'not in the array'], + missing=['a', 'ab', '', 'not in the array'], + labelarray_dtype=(bytes_dtype, unicode_dtype, categorical_dtype), + ) + def test_string_not_equal(self, compval, missing, labelarray_dtype): + + compval = labelarray_dtype.type(compval) + + class C(Classifier): + dtype = categorical_dtype + missing_value = missing + inputs = () + window_length = 0 + + c = C() + + # There's no significance to the values here other than that they + # contain a mix of the comparison value and other values. + data = LabelArray( + np.asarray( + [['', 'a', 'ab', 'ba'], + ['z', 'ab', 'a', 'ab'], + ['aa', 'ab', '', 'ab'], + ['aa', 'a', 'ba', 'ba']], + dtype=labelarray_dtype, + ), + missing_value=missing, + ) + + expected = ( + (data.as_int_array() != data.reverse_categories.get(compval, -1)) & + (data.as_int_array() != data.reverse_categories[C.missing_value]) + ) + + self.check_terms( + terms={ + 'ne': c != compval, + }, + expected={ + 'ne': expected, + }, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) + + @parameter_space( + __fail_fast=True, + compval=['a', 'b', 'ab', 'not in the array'], + missing=['a', 'ab', '', 'not in the array'], + labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype), + ) + def test_string_elementwise_predicates(self, + compval, + missing, + labelarray_dtype): + + missing = labelarray_dtype.type(missing) + compval = labelarray_dtype.type(compval) + + class C(Classifier): + dtype = categorical_dtype + missing_value = missing + inputs = () + window_length = 0 + + c = C() + + # There's no significance to the values here other than that they + # contain a mix of the comparison value and other values. + data = LabelArray( + np.asarray( + [['', 'a', 'ab', 'ba'], + ['z', 'ab', 'a', 'ab'], + ['aa', 'ab', '', 'ab'], + ['aa', 'a', 'ba', 'ba']], + dtype=labelarray_dtype, + ), + missing_value=missing, + ) + + self.check_terms( + terms={ + 'startswith': c.startswith(compval), + 'endswith': c.endswith(compval), + 'contains': c.contains(compval), + }, + expected={ + 'startswith': (data.startswith(compval) & (data != missing)), + 'endswith': (data.endswith(compval) & (data != missing)), + 'contains': (data.contains(compval) & (data != missing)), + }, + initial_workspace={c: data}, + mask=self.build_mask(self.ones_mask(shape=data.shape)), + ) diff --git a/tests/pipeline/test_column.py b/tests/pipeline/test_column.py index d270dc3b..717fc53d 100644 --- a/tests/pipeline/test_column.py +++ b/tests/pipeline/test_column.py @@ -7,6 +7,7 @@ from unittest import TestCase from pandas import date_range, DataFrame from pandas.util.testing import assert_frame_equal +from zipline.lib.labelarray import LabelArray from zipline.pipeline import Pipeline from zipline.pipeline.data.testing import TestingDataSet as TDS from zipline.testing import chrange, temp_pipeline_engine @@ -35,6 +36,21 @@ class LatestTestCase(TestCase): def expected_latest(self, column, slice_): loader = self.engine.get_loader(column) + + index = self.calendar[slice_] + columns = self.assets + values = loader.values(column.dtype, self.calendar, self.sids)[slice_] + + if column.dtype.kind in ('O', 'S', 'U'): + # For string columns, we expect a categorical in the output. + return LabelArray( + values, + missing_value=column.missing_value, + ).as_categorical_frame( + index=index, + columns=columns, + ) + return DataFrame( loader.values(column.dtype, self.calendar, self.sids)[slice_], index=self.calendar[slice_], @@ -55,6 +71,6 @@ class LatestTestCase(TestCase): dates_to_test[-1], ) for column in columns: - float_result = result[column.name].unstack() - expected_float_result = self.expected_latest(column, cal_slice) - assert_frame_equal(float_result, expected_float_result) + col_result = result[column.name].unstack() + expected_col_result = self.expected_latest(column, cal_slice) + assert_frame_equal(col_result, expected_col_result) diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index 04f35ed6..408cb689 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -471,7 +471,8 @@ class ObjectIdentityTestCase(TestCase): for column in TestingDataSet.columns: if column.dtype == bool_dtype: self.assertIsInstance(column.latest, Filter) - elif column.dtype == int64_dtype: + elif (column.dtype == int64_dtype + or column.dtype.kind in ('O', 'S', 'U')): self.assertIsInstance(column.latest, Classifier) elif column.dtype in factor_dtypes: self.assertIsInstance(column.latest, Factor) diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py new file mode 100644 index 00000000..5b24659f --- /dev/null +++ b/tests/test_labelarray.py @@ -0,0 +1,151 @@ +from itertools import product +from operator import eq, ne +import numpy as np + +from zipline.lib.labelarray import LabelArray +from zipline.testing import check_arrays, parameter_space, ZiplineTestCase + + +def rotN(l, N): + """ + Rotate a list of elements. + + Pulls N elements off the end of the list and appends them to the front. + + >>> rotN(['a', 'b', 'c', 'd'], 2) + ['c', 'd', 'a', 'b'] + >>> rotN(['a', 'b', 'c', 'd'], 3) + ['d', 'a', 'b', 'c'] + """ + assert len(l) >= N, "Can't rotate list by longer than its length." + return l[N:] + l[:N] + + +class LabelArrayTestCase(ZiplineTestCase): + + @classmethod + def init_class_fixtures(cls): + super(LabelArrayTestCase, cls).init_class_fixtures() + + cls.rowvalues = row = ['', 'a', 'b', 'ab', 'a', '', 'b', 'ab', 'z'] + cls.strs = np.array([rotN(row, i) for i in range(3)]) + + def test_fail_on_direct_construction(self): + # See http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray # noqa + + with self.assertRaises(TypeError) as e: + np.ndarray.__new__(LabelArray, (5, 5)) + + self.assertEqual( + str(e.exception), + "Direct construction of LabelArrays is not supported." + ) + + @parameter_space( + __fail_fast=True, + s=['', 'a', 'z', 'aa', 'not in the array'], + shape=[(27,), (9, 3), (3, 9), (3, 3, 3)], + array_astype=(bytes, unicode, object), + scalar_astype=(bytes, unicode, object), + ) + def test_compare_to_str(self, s, shape, array_astype, scalar_astype): + strs = self.strs.reshape(shape).astype(array_astype) + arr = LabelArray(strs, missing_value='') + check_arrays(strs == s, arr == s) + check_arrays(strs != s, arr != s) + + np_startswith = np.vectorize(lambda elem: elem.startswith(s)) + check_arrays(arr.startswith(s), np_startswith(strs)) + + np_endswith = np.vectorize(lambda elem: elem.endswith(s)) + check_arrays(arr.endswith(s), np_endswith(strs)) + + np_contains = np.vectorize(lambda elem: s in elem) + check_arrays(arr.contains(s), np_contains(strs)) + + def test_compare_to_str_array(self): + strs = self.strs + shape = strs.shape + arr = LabelArray(strs, missing_value='') + check_arrays(strs == arr, np.full_like(strs, True, dtype=bool)) + check_arrays(strs != arr, np.full_like(strs, False, dtype=bool)) + + def broadcastable_row(value, dtype): + return np.full((shape[0], 1), value, dtype=strs.dtype) + + def broadcastable_col(value, dtype): + return np.full((1, shape[1]), value, dtype=strs.dtype) + + for comparator, dtype, value in product((eq, ne), + (bytes, unicode, object), + set(self.rowvalues)): + check_arrays( + comparator(arr, np.full_like(strs, value)), + comparator(strs, value), + ) + check_arrays( + comparator(arr, broadcastable_row(value, dtype=dtype)), + comparator(strs, value), + ) + check_arrays( + comparator(arr, broadcastable_col(value, dtype=dtype)), + comparator(strs, value), + ) + + @parameter_space( + __fail_fast=True, + slice_=[ + 0, 1, -1, + slice(None), + slice(0, 0), + slice(0, 3), + slice(1, 4), + slice(0), + slice(None, 1), + slice(0, 4, 2), + (slice(None), 1), + (slice(None), slice(None)), + (slice(None), slice(1, 2)), + ] + ) + def test_slicing_preserves_attributes(self, slice_): + arr = LabelArray(self.strs.reshape((9, 3)), missing_value='') + sliced = arr[slice_] + self.assertIsInstance(sliced, LabelArray) + self.assertIs(sliced.categories, arr.categories) + self.assertIs(sliced.reverse_categories, arr.reverse_categories) + self.assertIs(sliced.missing_value, arr.missing_value) + + def test_infer_categories(self): + arr1d = LabelArray(self.strs, missing_value='') + codes1d = arr1d.as_int_array() + self.assertEqual(arr1d.shape, self.strs.shape) + self.assertEqual(arr1d.shape, codes1d.shape) + + categories = arr1d.categories + unique_rowvalues = set(self.rowvalues) + + # There should be an entry in categories for each unique row value, and + # each integer stored in the data array should be an index into + # categories. + self.assertEqual(list(categories), sorted(set(self.rowvalues))) + self.assertEqual( + set(codes1d.ravel()), + set(range(len(unique_rowvalues))) + ) + for idx, value in enumerate(arr1d.categories): + check_arrays( + self.strs == value, + arr1d.view(type=np.ndarray) == idx, + ) + + for shape in (9, 3), (3, 9), (3, 3, 3): + strs2d = self.strs.reshape(shape) + arr2d = LabelArray(strs2d, missing_value='') + codes2d = arr2d.as_int_array() + + self.assertEqual(arr2d.shape, shape) + check_arrays(arr2d.categories, categories) + + for idx, value in enumerate(arr2d.categories): + check_arrays(strs2d == value, codes2d == idx) diff --git a/zipline/algorithm.py b/zipline/algorithm.py index 496a6a56..9b0026ce 100644 --- a/zipline/algorithm.py +++ b/zipline/algorithm.py @@ -665,6 +665,7 @@ class TradingAlgorithm(object): new_assets = tuple(new_assets) new_sids = tuple(new_sids) new_symbols = tuple(new_symbols) + number_of_kinds_of_new_things = ( sum((bool(new_assets), bool(new_sids), bool(new_symbols))) ) diff --git a/zipline/data/us_equity_loader.py b/zipline/data/us_equity_loader.py index 5385b203..ffb3d7e2 100644 --- a/zipline/data/us_equity_loader.py +++ b/zipline/data/us_equity_loader.py @@ -19,7 +19,7 @@ from abc import ( ) from cachetools import LRUCache -from numpy import dtype, around, hstack +from numpy import around, hstack from pandas.tslib import normalize_date from six import with_metaclass @@ -28,6 +28,7 @@ from zipline.lib._float64window import AdjustedArrayWindow as Float64Window from zipline.lib.adjustment import Float64Multiply from zipline.utils.cache import ExpiringCache from zipline.utils.memoize import lazyval +from zipline.utils.numpy_utils import float64_dtype class SlidingWindow(object): @@ -237,9 +238,9 @@ class USEquityHistoryLoader(with_metaclass(ABCMeta)): prefetch_dts = cal[start_ix:prefetch_end_ix + 1] prefetch_len = len(prefetch_dts) array = self._array(prefetch_dts, needed_assets, field) + view_kwargs = {} if field == 'volume': - array = array.astype('float64') - dtype_ = dtype('float64') + array = array.astype(float64_dtype) for i, asset in enumerate(needed_assets): if self._adjustments_reader: @@ -249,7 +250,7 @@ class USEquityHistoryLoader(with_metaclass(ABCMeta)): adjs = {} window = Float64Window( array[:, i].reshape(prefetch_len, 1), - dtype_, + view_kwargs, adjs, offset, size diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx new file mode 100644 index 00000000..d3a98d18 --- /dev/null +++ b/zipline/lib/_factorize.pyx @@ -0,0 +1,108 @@ +""" +Factorization algorithms. +""" +from numpy cimport ndarray, int64_t, PyArray_Check, import_array +from numpy import arange, asarray, empty, int64, isnan, ndarray + +import_array() + + +cpdef factorize_strings_known_categories(ndarray[object] values, + list categories, + object missing_value, + int sort): + """ + Factorize an array whose categories are already known. + + Any entries not in the specified categories will be given the code for + `missing_value`. + """ + if missing_value not in categories: + categories.append(missing_value) + + if sort: + categories = sorted(categories) + + cdef: + Py_ssize_t nvalues = len(values) + dict reverse_categories = dict( + zip(categories, range(len(categories))) + ) + + if not nvalues: + return ( + asarray([], dtype=int64), + asarray(categories, dtype=object), + reverse_categories, + ) + + cdef: + Py_ssize_t i + Py_ssize_t missing_code = reverse_categories[missing_value] + ndarray[int64_t] codes = empty(nvalues, dtype=int64) + + for i in range(nvalues): + codes[i] = reverse_categories.get(values[i], missing_code) + + return codes, asarray(categories, dtype=object), reverse_categories + +cpdef factorize_strings(ndarray[object] values, + object missing_value, + int sort): + """ + Factorize an array of (possibly duplicated) labels into an array of indices + into a unique array of labels. + + This is ~30% faster than pandas.factorize, at the cost of not having + special treatment for NaN, which we don't care about because we only + support arrays of strings. + + (Though it's faster even if you throw in the nan checks that pandas does, + because we're using dict and list instead of PyObjectHashTable and + ObjectVector. Python's builtin data structures are **really** + well-optimized.) + """ + cdef: + Py_ssize_t nvalues = len(values) + list categories = [missing_value] + dict reverse_categories = {missing_value: 0} + + # Short circuit on empty array. + if not nvalues: + return ( + asarray([], dtype=int64), + asarray(categories, dtype=object), + reverse_categories, + ) + + cdef: + Py_ssize_t i, code + object key = None + ndarray[int64_t] codes = empty(nvalues, dtype=int64) + + for i in range(nvalues): + key = values[i] + code = reverse_categories.get(key, -1) + if code == -1: + # Assign new code. + code = len(reverse_categories) + reverse_categories[key] = code + categories.append(key) + codes[i] = code + + cdef ndarray[int64_t, ndim=1] sorter + cdef ndarray[int64_t, ndim=1] reverse_indexer + cdef int ncategories + cdef ndarray[object] categories_array = asarray(categories, dtype=object) + if sort: + # This is all taken from pandas.core.algorithms.factorize. + ncategories = len(categories_array) + sorter = categories_array.argsort() + reverse_indexer = empty(ncategories, dtype=int64) + reverse_indexer.put(sorter, arange(ncategories)) + + codes = reverse_indexer.take(codes) + categories_array = categories_array.take(sorter) + reverse_categories = dict(zip(categories_array, range(ncategories))) + + return codes, categories_array, reverse_categories diff --git a/zipline/lib/_float64window.pyx b/zipline/lib/_float64window.pyx index e3df0225..1747b014 100644 --- a/zipline/lib/_float64window.pyx +++ b/zipline/lib/_float64window.pyx @@ -1,5 +1,7 @@ """ float specialization of AdjustedArrayWindow """ -from numpy cimport float64_t as ctype +from numpy cimport float64_t +ctypedef float64_t[:, :] databuffer + include "_windowtemplate.pxi" diff --git a/zipline/lib/_int64window.pyx b/zipline/lib/_int64window.pyx index 9951bd8c..d30b97ca 100644 --- a/zipline/lib/_int64window.pyx +++ b/zipline/lib/_int64window.pyx @@ -1,5 +1,8 @@ """ datetime specialization of AdjustedArrayWindow """ -from numpy cimport int64_t as ctype +from numpy cimport int64_t + +ctypedef int64_t[:, :] databuffer + include "_windowtemplate.pxi" diff --git a/zipline/lib/_labelwindow.pyx b/zipline/lib/_labelwindow.pyx new file mode 100644 index 00000000..702f9d79 --- /dev/null +++ b/zipline/lib/_labelwindow.pyx @@ -0,0 +1,6 @@ +""" +AdjustedArrayWindow type used for LabelArrays. +""" +ctypedef object databuffer + +include "_windowtemplate.pxi" diff --git a/zipline/lib/_uint8window.pyx b/zipline/lib/_uint8window.pyx index 99dad45f..8b8ce98b 100644 --- a/zipline/lib/_uint8window.pyx +++ b/zipline/lib/_uint8window.pyx @@ -1,5 +1,8 @@ """ bool specialization of AdjustedArrayWindow """ -from numpy cimport uint8_t as ctype +from numpy cimport uint8_t + +ctypedef uint8_t[:, :] databuffer + include "_windowtemplate.pxi" diff --git a/zipline/lib/_windowtemplate.pxi b/zipline/lib/_windowtemplate.pxi index 2a1ae38b..fb50736d 100644 --- a/zipline/lib/_windowtemplate.pxi +++ b/zipline/lib/_windowtemplate.pxi @@ -2,8 +2,8 @@ Template for AdjustedArray windowed iterators. This file is intended to be used by inserting it via a Cython include into a -file that's define a type symbol named `ctype` and string constant named -`dtype`. +file that's defined a type symbol named `databuffer` that can be used like a +2-D numpy array. See Also -------- @@ -12,9 +12,7 @@ zipline.lib._intwindow zipline.lib._datewindow """ from numpy cimport ndarray -from numpy import asarray - -ctypedef ctype[:, :] databuffer +from numpy import asanyarray cdef class AdjustedArrayWindow: @@ -33,8 +31,8 @@ cdef class AdjustedArrayWindow: """ cdef: # ctype must be defined by the file into which this is being copied. - databuffer data - object viewtype + readonly databuffer data + readonly dict view_kwargs readonly Py_ssize_t window_length Py_ssize_t anchor, next_anchor, max_anchor, next_adj dict adjustments @@ -43,13 +41,13 @@ cdef class AdjustedArrayWindow: def __cinit__(self, databuffer data not None, - object viewtype not None, + dict view_kwargs not None, dict adjustments not None, Py_ssize_t offset, Py_ssize_t window_length): self.data = data - self.viewtype = viewtype + self.view_kwargs = view_kwargs self.adjustments = adjustments self.adjustment_indices = sorted(adjustments, reverse=True) self.window_length = window_length @@ -74,9 +72,10 @@ cdef class AdjustedArrayWindow: def __next__(self): cdef: - ndarray out object adjustment + ndarray out Py_ssize_t start, anchor + dict view_kwargs anchor = self.anchor = self.next_anchor if anchor > self.max_anchor: @@ -93,7 +92,13 @@ cdef class AdjustedArrayWindow: self.next_adj = self.pop_next_adj() start = anchor - self.window_length - out = asarray(self.data[start:self.anchor]).view(self.viewtype) + + # If our data is a custom subclass of ndarray, preserve that subclass + # by using asanyarray instead of asarray. + out = asanyarray(self.data[start:self.anchor]) + view_kwargs = self.view_kwargs + if view_kwargs: + out = out.view(**view_kwargs) out.setflags(write=False) self.next_anchor = self.anchor + 1 @@ -101,8 +106,7 @@ cdef class AdjustedArrayWindow: return out def seek(self, target_anchor): - cdef: - ndarray out + cdef ndarray out = None if target_anchor < self.anchor: raise Exception('Can not access data after window has passed.') @@ -122,5 +126,5 @@ cdef class AdjustedArrayWindow: self.window_length, self.anchor, self.max_anchor, - self.viewtype, + self.view_kwargs.get('dtype'), ) diff --git a/zipline/lib/adjusted_array.py b/zipline/lib/adjusted_array.py index 4c0ee5b8..b00f3459 100644 --- a/zipline/lib/adjusted_array.py +++ b/zipline/lib/adjusted_array.py @@ -17,6 +17,8 @@ from zipline.errors import ( WindowLengthNotPositive, WindowLengthTooLong, ) +from zipline.lib.labelarray import LabelArray +from zipline.utils.compat import unicode from zipline.utils.numpy_utils import ( datetime64ns_dtype, float64_dtype, @@ -28,8 +30,10 @@ from zipline.utils.memoize import lazyval # These class names are all the same because of our bootleg templating system. from ._float64window import AdjustedArrayWindow as Float64Window from ._int64window import AdjustedArrayWindow as Int64Window +from ._labelwindow import AdjustedArrayWindow as LabelWindow from ._uint8window import AdjustedArrayWindow as UInt8Window + NOMASK = None BOOL_DTYPES = frozenset( map(dtype, [bool_]), @@ -44,16 +48,20 @@ INT_DTYPES = frozenset( DATETIME_DTYPES = frozenset( map(dtype, ['datetime64[ns]', 'datetime64[D]']), ) +# We use object arrays for strings. +CATEGORICAL_DTYPES = frozenset(map(dtype, ['O'])) + REPRESENTABLE_DTYPES = BOOL_DTYPES.union( FLOAT_DTYPES, INT_DTYPES, - DATETIME_DTYPES + DATETIME_DTYPES, + CATEGORICAL_DTYPES, ) def can_represent_dtype(dtype): """ - Can we build an AdjustedArray for a baseline of dtype ``dtype``? + Can we build an AdjustedArray for a baseline of `dtype``? """ return dtype in REPRESENTABLE_DTYPES @@ -65,11 +73,11 @@ CONCRETE_WINDOW_TYPES = { } -def _normalize_array(data): +def _normalize_array(data, missing_value): """ Coerce buffer data for an AdjustedArray into a standard scalar - representation, returning the coerced array and a numpy dtype object to use - as a view type when providing public view into the data. + representation, returning the coerced array and a dict of argument to pass + to np.view to use when providing a user-facing view of the underlying data. - float* data is coerced to float64 with viewtype float64. - int32, int64, and uint32 are converted to int64 with viewtype int64. @@ -82,19 +90,29 @@ def _normalize_array(data): Returns ------- - coerced, viewtype : (np.ndarray, np.dtype) + coerced, view_kwargs : (np.ndarray, np.dtype) """ + if isinstance(data, LabelArray): + return data, {} + data_dtype = data.dtype if data_dtype == bool_: - return data.astype(uint8), dtype(bool_) + return data.astype(uint8), {'dtype': dtype(bool_)} elif data_dtype in FLOAT_DTYPES: - return data.astype(float64), dtype(float64) + return data.astype(float64), {'dtype': dtype(float64)} elif data_dtype in INT_DTYPES: - return data.astype(int64), dtype(int64) - elif data_dtype.name.startswith('datetime'): + return data.astype(int64), {'dtype': dtype(int64)} + elif data_dtype in CATEGORICAL_DTYPES: + if not isinstance(missing_value, (bytes, unicode)): + raise TypeError( + "Invalid missing_value for categorical array.\n" + "Expected a string, got %r" % missing_value, + ) + return LabelArray(data, missing_value), {} + elif data_dtype.kind == 'M': try: outarray = data.astype('datetime64[ns]').view('int64') - return outarray, datetime64ns_dtype + return outarray, {'dtype': datetime64ns_dtype} except OverflowError: raise ValueError( "AdjustedArray received a datetime array " @@ -127,18 +145,17 @@ class AdjustedArray(object): missing_value : object A value to use to fill missing data in yielded windows. Should be a value coercible to `data.dtype`. - """ __slots__ = ( '_data', - '_viewtype', + '_view_kwargs', 'adjustments', 'missing_value', '__weakref__', ) def __init__(self, data, mask, adjustments, missing_value): - self._data, self._viewtype = _normalize_array(data) + self._data, self._view_kwargs = _normalize_array(data, missing_value) self.adjustments = adjustments self.missing_value = missing_value @@ -158,20 +175,22 @@ class AdjustedArray(object): """ The data stored in this array. """ - return self._data.view(self._viewtype) + return self._data.view(**self._view_kwargs) @lazyval def dtype(self): """ The dtype of the data stored in this array. """ - return self._viewtype + return self._view_kwargs.get('dtype') or self._data.dtype @lazyval def _iterator_type(self): """ The iterator produced when `traverse` is called on this Array. """ + if isinstance(self._data, LabelArray): + return LabelWindow return CONCRETE_WINDOW_TYPES[self._data.dtype] def traverse(self, window_length, offset=0): @@ -190,7 +209,7 @@ class AdjustedArray(object): _check_window_params(data, window_length) return self._iterator_type( data, - self._viewtype, + self._view_kwargs, self.adjustments, offset, window_length, diff --git a/zipline/lib/adjustment.pyx b/zipline/lib/adjustment.pyx index 748c3025..8708b2e6 100644 --- a/zipline/lib/adjustment.pyx +++ b/zipline/lib/adjustment.pyx @@ -316,12 +316,13 @@ cdef class Float64Multiply(Float64Adjustment): cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t row, col + cdef float64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] *= self.value + data[row, col] *= value cdef class Float64Overwrite(Float64Adjustment): @@ -354,12 +355,13 @@ cdef class Float64Overwrite(Float64Adjustment): cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t row, col + cdef float64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] = self.value + data[row, col] = value cdef class Float64Add(Float64Adjustment): @@ -392,12 +394,13 @@ cdef class Float64Add(Float64Adjustment): cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t row, col + cdef float64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] += self.value + data[row, col] += value cdef class _Int64Adjustment(Adjustment): @@ -530,9 +533,62 @@ cdef class Datetime64Overwrite(Datetime64Adjustment): """ cpdef mutate(self, int64_t[:, :] data): cdef Py_ssize_t row, col + cdef int64_t value = self.value # last_col + 1 because last_col should also be affected. for col in range(self.first_col, self.last_col + 1): # last_row + 1 because last_row should also be affected. for row in range(self.first_row, self.last_row + 1): - data[row, col] = self.value + data[row, col] = value + + +cdef class _ObjectAdjustment(Adjustment): + """ + Base class for adjustments that operate on arbitrary objects. + + We use only this for categorical data, where our data buffer is an array of + indices into an array of unique Python string objects. + """ + cdef: + readonly object value + + def __init__(self, + Py_ssize_t first_row, + Py_ssize_t last_row, + Py_ssize_t first_col, + Py_ssize_t last_col, + object value): + super(Adjustment, self).__init__( + first_row=first_row, + last_row=last_row, + first_col=first_col, + last_col=last_col, + ) + self.value = value + + def __repr__(self): + return ( + "%s(first_row=%d, last_row=%d," + " first_col=%d, last_col=%d, value=%r)" % ( + type(self).__name__, + self.first_row, + self.last_row, + self.first_col, + self.last_col, + self.value, + ) + ) + + +cdef class ObjectOverwrite(_ObjectAdjustment): + + cpdef mutate(self, object data): + # data is an object here because this is intended to be used with a + # `zipline.lib.LabelArray`. + cdef Py_ssize_t row, col + cdef object value = self.value + + # We don't do this in a loop because we only want to look up the label + # code in the array's categories once. + data[self.first_row:self.last_row + 1, + self.first_col:self.last_col + 1] = self.value diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py new file mode 100644 index 00000000..837a39c2 --- /dev/null +++ b/zipline/lib/labelarray.py @@ -0,0 +1,482 @@ +""" +An ndarray subclass for working with arrays of strings. +""" +from functools import partial +from numbers import Number +from operator import eq, ne +import re + +import numpy as np +from numpy import ndarray +import pandas as pd +from toolz import compose + +from zipline.utils.preprocess import preprocess +from zipline.utils.sentinel import sentinel +from zipline.utils.input_validation import ( + coerce, + expect_kinds, + expect_types, + optional, +) +from zipline.utils.numpy_utils import is_object, int64_dtype + +from ._factorize import ( + factorize_strings, + factorize_strings_known_categories, +) + + +def compare_arrays(left, right): + "Eq check with a short-circuit for identical objects." + return ( + left is right + or ((left.shape == right.shape) and (left == right).all()) + ) + + +def _make_unsupported_method(name): + def method(*args, **kwargs): + raise NotImplementedError( + "Method %s is not supported on LabelArrays." % name + ) + method.__name__ = name + method.__doc__ = "Unsupported LabelArray Method: %s" % name + return method + + +class CategoryMismatch(ValueError): + """ + Error raised on attempt to perform operations between LabelArrays with + mismatched category arrays. + """ + def __init__(self, left, right): + (mismatches,) = np.where(left != right) + assert len(mismatches), "Not actually a mismatch!" + super(CategoryMismatch, self).__init__( + "LabelArray categories don't match:\n" + "Mismatched Indices: {mismatches}\n" + "Left: {left}\n" + "Right: {right}".format( + mismatches=mismatches, + left=left[mismatches], + right=right[mismatches], + ) + ) + +_NotPassed = sentinel('_NotPassed') + + +class LabelArray(ndarray): + """ + An ndarray subclass for working with arrays of strings. + + Factorizes the input array into integers, but overloads equality on strings + to check against the factor label. + + See Also + -------- + http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html + """ + @preprocess( + values=coerce(list, partial(np.asarray, dtype=object)), + ) + @expect_types( + values=np.ndarray, + categories=optional(list), + ) + @expect_kinds(values=("O", "S", "U")) + def __new__(cls, + values, + missing_value, + categories=None, + sort=True): + + # Numpy's fixed-width string types aren't very efficient. Working with + # object arrays is faster than bytes or unicode arrays in almost all + # cases. + if not is_object(values): + values = values.astype(object) + + if categories is None: + codes, categories, reverse_categories = factorize_strings( + values.ravel(), + missing_value=missing_value, + sort=sort, + ) + else: + codes, categories, reverse_categories = ( + factorize_strings_known_categories( + values.ravel(), + categories=categories, + missing_value=missing_value, + sort=sort, + ) + ) + categories.setflags(write=False) + + ret = codes.reshape(values.shape).view(type=cls) + ret._categories = categories + ret._reverse_categories = reverse_categories + ret._missing_value = missing_value + return ret + + @property + def categories(self): + # This is a property because it should be immutable. + return self._categories + + @property + def reverse_categories(self): + # This is a property because it should be immutable. + return self._reverse_categories + + @property + def missing_value(self): + # This is a property because it should be immutable. + return self._missing_value + + def __array_finalize__(self, obj): + """ + Called by Numpy after array construction. + + There are three cases where this can happen: + + 1. Someone tries to directly construct a new array by doing:: + + >>> ndarray.__new__(LabelArray, ...) + + In this case, obj will be None. We treat this as an error case and + fail. + + 2. Someone (most likely our own __new__) calls + other_array.view(type=LabelArray). + + In this case, `self` will be the new LabelArray instance, and + ``obj` will be the array on which ``view`` is being called. + + The caller of ``obj.view`` is responsible for copying setting + category metadata on ``self`` after we exit. + + 3. Someone creates a new LabelArray by slicing an existing one. + + In this case, ``obj`` will be the original LabelArray. We're + responsible for copying over the parent array's category metadata. + """ + if obj is None: + raise TypeError( + "Direct construction of LabelArrays is not supported." + ) + + # See docstring for an explanation of when these will or will not be + # set. + self._categories = getattr(obj, 'categories', None) + self._reverse_categories = getattr(obj, 'reverse_categories', None) + self._missing_value = getattr(obj, 'missing_value', None) + + def __array_wrap__(self, obj, context=None): + """ + Called by numpy after completion of a ufunc. + + We coerce back into a vanilla ndarray if our dtype changed, since that + indicates that our categories are no longer meaningful. + """ + if obj.dtype != self.dtype: + return obj.view(type=np.ndarray) + return obj + + def as_int_array(self): + """ + Convert self into a regular ndarray of ints. + + This is an O(1) operation. It does not copy the underlying data. + """ + return self.view(type=ndarray) + + def as_string_array(self): + """ + Convert self back into an array of strings. + + This is an O(N) operation. + """ + return self.categories[self] + + def as_categorical(self, name=None): + """ + Coerce self into a pandas categorical. + + This is only defined on 1D arrays, since that's all pandas supports. + """ + if len(self.shape) > 1: + raise ValueError("Can't convert a 2D array to a categorical.") + return pd.Categorical.from_codes( + self.as_int_array(), + self.categories, + ordered=False, + name=name, + ) + + def as_categorical_frame(self, index, columns, name=None): + """ + Coerce self into a pandas DataFrame of Categoricals. + """ + if len(self.shape) != 2: + raise ValueError( + "Can't convert a non-2D LabelArray into a DataFrame." + ) + + expected_shape = (len(index), len(columns)) + if expected_shape != self.shape: + raise ValueError( + "Can't construct a DataFrame with provided indices:\n\n" + "LabelArray shape is {actual}, but index and columns imply " + "that shape should be {expected}.".format( + actual=self.shape, + expected=expected_shape, + ) + ) + + return pd.Series( + index=pd.MultiIndex.from_product([index, columns]), + data=self.ravel().as_categorical(name=name), + ).unstack() + + def __setitem__(self, indexer, value): + self_categories = self.categories + + if isinstance(value, LabelArray): + value_categories = value.categories + if compare_arrays(self_categories, value_categories): + return super(LabelArray, self).__setitem__(indexer, value) + else: + raise CategoryMismatch(self_categories, value_categories) + + elif isinstance(value, (bytes, unicode)): + value_code = self.reverse_categories.get(value, None) + if value_code is None: + raise ValueError("%r is not in LabelArray categories." % value) + return super(LabelArray, self).__setitem__(indexer, value_code) + + else: + raise NotImplementedError( + "Setting into a LabelArray with a value of " + "type {type} is not yet supported.".format( + type=type(value).__name__, + ), + ) + + def _equality_check(op): + """ + Shared code for __eq__ and __ne__, parameterized on the actual + comparison operator to use. + """ + # What value should we return if we compare against a value not in our + # categories? + if op is eq: + COMPARE_TO_UNKNOWN = False + elif op is ne: + COMPARE_TO_UNKNOWN = True + else: + raise AssertionError("_make_equality_check called with %s" % op) + + def method(self, other): + self_categories = self.categories + + if isinstance(other, LabelArray): + other_categories = other.categories + if compare_arrays(self_categories, other_categories): + return op(self.as_int_array(), other.as_int_array()) + else: + raise CategoryMismatch(self_categories, other_categories) + + elif isinstance(other, ndarray): + # Compare to ndarrays as though we were an array of strings. + # This is fairly expensive, and should generally be avoided. + return op(self.as_string_array(), other) + + elif isinstance(other, (bytes, unicode)): + i = self._reverse_categories.get(other, None) + if i is None: + # Requested string isn't in our categories. Short circuit. + # This isn't full_like because that would try to return a + # LabelArray. + return np.full(self.shape, COMPARE_TO_UNKNOWN, dtype=bool) + + return op(self.as_int_array(), i) + + elif isinstance(other, Number): + return NotImplemented + + return op(super(LabelArray, self), other) + return method + + __eq__ = _equality_check(eq) + __ne__ = _equality_check(ne) + del _equality_check + + def view(self, dtype=_NotPassed, type=_NotPassed): + if type is _NotPassed and dtype not in (_NotPassed, self.dtype): + raise TypeError("Can't view LabelArray as another dtype.") + + # The text signature on ndarray.view makes it look like the default + # values for dtype and type are `None`, but passing None explicitly has + # different semantics than not passing an arg at all, so we reconstruct + # the kwargs dict here to simulate the args not being passed at all. + kwargs = {} + if dtype is not _NotPassed: + kwargs['dtype'] = dtype + if type is not _NotPassed: + kwargs['type'] = type + return super(LabelArray, self).view(**kwargs) + + # In general, we support resizing, slicing, and reshaping methods, but not + # numeric methods. + SUPPORTED_NDARRAY_METHODS = frozenset([ + 'base', + 'byteswap', + 'compress', + 'copy', + 'data', + 'diagonal', + 'dtype', + 'flat', + 'flatten', + 'item', + 'itemset', + 'itemsize', + 'nbytes', + 'ndim', + 'newbyteorder', + 'ravel', + 'repeat', + 'reshape', + 'resize', + 'setflags', + 'shape', + 'size', + 'squeeze', + 'strides', + 'swapaxes', + 'take', + 'trace', + 'transpose', + 'view' + ]) + PUBLIC_NDARRAY_METHODS = frozenset([ + s for s in dir(ndarray) if not s.startswith('_') + ]) + + # Generate failing wrappers for all unsupported methods. + locals().update( + { + method: _make_unsupported_method(method) + for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS + } + ) + + def __repr__(self): + # This happens if you call a ufunc on a LabelArray that changes the + # dtype. This is generally an indicator that the array has been used + # incorrectly, and it means we're no longer valid for anything. + if self.dtype != int64_dtype: + return "Invalid LabelArray: dtype={}, shape={}".format( + self.dtype, self.shape + ) + repr_lines = repr(self.as_string_array()).splitlines() + repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1) + repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')' + # The extra spaces here account for the difference in length between + # 'array(' and 'LabelArray('. + return '\n '.join(repr_lines) + + def empty_like(self, shape): + """ + Make an empty LabelArray with the same categories as ``self``, filled + with ``self.missing_value``. + """ + out = np.full( + shape, + self.reverse_categories[self.missing_value], + dtype=self.dtype + ).view( + type=type(self) + ) + + out._categories = self.categories + out._reverse_categories = self.reverse_categories + out._missing_value = self.missing_value + + return out + + def apply(self, f, dtype): + """ + Map a function elementwise over entries in ``self``. + + ``f`` will be applied exactly once to each unique value in ``self``. + """ + return np.vectorize(f, otypes=[dtype])(self.categories)[self] + + def startswith(self, prefix): + """ + Element-wise startswith. + + Parameters + ---------- + prefix : str + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self started with ``prefix``. + """ + return self.apply(lambda elem: elem.startswith(prefix), dtype=bool) + + def endswith(self, suffix): + """ + Elementwise endswith. + + Parameters + ---------- + suffix : str + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self ended with ``suffix``.w + """ + return self.apply(lambda elem: elem.endswith(suffix), dtype=bool) + + def contains(self, substring): + """ + Elementwise contains. + + Parameters + ---------- + substring : str + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self ended with ``suffix``. + """ + return self.apply(lambda elem: substring in elem, dtype=bool) + + @preprocess(pattern=re.compile) + def matches(self, pattern): + """ + Elementwise regex match. + + Parameters + ---------- + pattern : str or compiled regex + + Returns + ------- + matches : np.ndarray[bool] + An array with the same shape as self indicating whether each + element of self was matched by ``pattern``. + """ + return self.apply(compose(bool, pattern.match)) diff --git a/zipline/pipeline/api_utils.py b/zipline/pipeline/api_utils.py new file mode 100644 index 00000000..1ccf3543 --- /dev/null +++ b/zipline/pipeline/api_utils.py @@ -0,0 +1,49 @@ +""" +Utilities for creating public APIs (e.g. argument validation decorators). +""" +from zipline.utils.input_validation import preprocess + + +def restrict_to_dtype(dtype, message_template): + """ + A factory for decorators that restrict Term methods to only be callable on + Terms with a specific dtype. + + This is conceptually similar to + zipline.utils.input_validation.expect_dtypes, but provides more flexibility + for providing error messages that are specifically targeting Term methods. + + Parameters + ---------- + dtype : numpy.dtype + The dtype on which the decorated method may be called. + message_template : str + A template for the error message to be raised. + `message_template.format` will be called with keyword arguments + `method_name`, `expected_dtype`, and `received_dtype`. + + Usage + ----- + @restrict_to_dtype( + dtype=float64_dtype, + message_template=( + "{method_name}() was called on a factor of dtype {received_dtype}." + "{method_name}() requires factors of dtype{expected_dtype}." + + ), + ) + def some_factor_method(self, ...): + self.stuff_that_requires_being_float64(...) + """ + def processor(term_method, _, term_instance): + term_dtype = term_instance.dtype + if term_dtype != dtype: + raise TypeError( + message_template.format( + method_name=term_method.__name__, + expected_dtype=dtype.name, + received_dtype=term_dtype, + ) + ) + return term_instance + return preprocess(self=processor) diff --git a/zipline/pipeline/classifiers/classifier.py b/zipline/pipeline/classifiers/classifier.py index c4d77ec9..e27016eb 100644 --- a/zipline/pipeline/classifiers/classifier.py +++ b/zipline/pipeline/classifiers/classifier.py @@ -1,16 +1,24 @@ """ classifier.py """ +from functools import wraps from numbers import Number +import operator from numpy import where, isnan, nan, zeros +from zipline.lib.labelarray import LabelArray from zipline.lib.quantiles import quantiles -from zipline.pipeline.term import ComputableTerm +from zipline.pipeline.api_utils import restrict_to_dtype +from zipline.pipeline.term import ComputableTerm, NotSpecified +from zipline.utils.compat import unicode from zipline.utils.input_validation import expect_types -from zipline.utils.numpy_utils import int64_dtype +from zipline.utils.numpy_utils import ( + categorical_dtype, + int64_dtype, +) -from ..filters import NullFilter, NumExprFilter +from ..filters import Filter, NullFilter, NumExprFilter from ..mixins import ( CustomTermMixin, LatestMixin, @@ -20,6 +28,15 @@ from ..mixins import ( ) +strings_only = restrict_to_dtype( + dtype=categorical_dtype, + message_template=( + "{method_name}() is only defined on Classifiers producing strings" + " but it was called on a Factor of dtype {received_dtype}." + ) +) + + class Classifier(RestrictedDTypeMixin, ComputableTerm): """ A Pipeline expression computing a categorical output. @@ -30,7 +47,9 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): indicating that means/standard deviations should be computed on assets for which the classifier produced the same label. """ - ALLOWED_DTYPES = (int64_dtype,) # Used by RestrictedDTypeMixin + # Used by RestrictedDTypeMixin + ALLOWED_DTYPES = (int64_dtype, categorical_dtype) + categories = NotSpecified def isnull(self): """ @@ -45,9 +64,8 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): return ~self.isnull() # We explicitly don't support classifier to classifier comparisons, since - # the numbers likely don't mean the same thing. This may be relaxed in the - # future, but for now we're starting conservatively. - @expect_types(other=Number) + # the stored values likely don't mean the same thing. This may be relaxed + # in the future, but for now we're starting conservatively. def eq(self, other): """ Construct a Filter returning True for asset/date pairs where the output @@ -58,7 +76,7 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): # certainly not what the user wants. if other == self.missing_value: raise ValueError( - "Comparison against self.missing_value ({value}) in" + "Comparison against self.missing_value ({value!r}) in" " {typename}.eq().\n" "Missing values have NaN semantics, so the " "requested comparison would always produce False.\n" @@ -67,24 +85,87 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm): typename=(type(self).__name__), ) ) - return NumExprFilter.create( - "x_0 == {other}".format(other=int(other)), - binds=(self,), - ) - @expect_types(other=Number) + if isinstance(other, Number) != (self.dtype == int64_dtype): + raise InvalidClassifierComparison(self, other) + + if isinstance(other, Number): + return NumExprFilter.create( + "x_0 == {other}".format(other=int(other)), + binds=(self,), + ) + else: + return ScalarStringPredicate( + classifier=self, + op=operator.eq, + compval=other, + ) + def __ne__(self, other): """ Construct a Filter returning True for asset/date pairs where the output of ``self`` matches ``other. """ - return NumExprFilter.create( - "((x_0 != {other}) & (x_0 != {missing}))".format( - other=int(other), - missing=self.missing_value, - ), - binds=(self,), - ) + if isinstance(other, Number) != (self.dtype == int64_dtype): + raise InvalidClassifierComparison(self, other) + + if isinstance(other, Number): + return NumExprFilter.create( + "((x_0 != {other}) & (x_0 != {missing}))".format( + other=int(other), + missing=self.missing_value, + ), + binds=(self,), + ) + else: + return ScalarStringPredicate( + classifier=self, + op=operator.ne, + compval=other, + ) + + def _string_predicate(f): + """ + Decorator for converting a function from (LabelArray, str) -> bool + into a Classifier method that returns a ScalarStringPredicate filter. + + This mainly exists to avoid replicating shared boilerplate + (e.g. argument type validation). + """ + @wraps(f) + @expect_types(compval=(bytes, unicode)) + @strings_only + def method(self, compval): + return ScalarStringPredicate( + classifier=self, + op=f, + compval=compval, + ) + return method + + @_string_predicate + @expect_types(label_array=LabelArray) + def startswith(label_array, other): + return label_array.startswith(other) + + @_string_predicate + @expect_types(label_array=LabelArray) + def endswith(label_array, other): + return label_array.endswith(other) + + @_string_predicate + @expect_types(label_array=LabelArray) + def contains(label_array, other): + return label_array.contains(other) + + del _string_predicate + + def postprocess(self, data): + if self.dtype == int64_dtype: + return data + if not isinstance(data, LabelArray): + raise AssertionError("Expected a LabelArray, got %s." % type(data)) + return data.as_categorical() class Everything(Classifier): @@ -127,6 +208,50 @@ class Quantiles(SingleInputMixin, Classifier): return type(self).__name__ + '(%d)' % self.params['bins'] +class ScalarStringPredicate(SingleInputMixin, Filter): + """ + A filter that applies a function from (LabelArray, str) -> ndarray[bool]. + + Examples include ``==, !=, startswith, and endswith``. + + This exists because we represent string arrays with + ``zipline.lib.LabelArray``s, which numexpr doesn't know about, so we can't + use the generic NumExprFilter implementation here. + """ + window_length = 0 + + @expect_types(classifier=Classifier, compval=(bytes, unicode)) + def __new__(cls, classifier, op, compval): + return super(ScalarStringPredicate, cls).__new__( + ScalarStringPredicate, + compval=compval, + op=op, + inputs=(classifier,), + mask=classifier.mask, + ) + + def _init(self, op, compval, *args, **kwargs): + self._op = op + self._compval = compval + return super(ScalarStringPredicate, self)._init(*args, **kwargs) + + @classmethod + def static_identity(cls, op, compval, *args, **kwargs): + return ( + super(ScalarStringPredicate, cls).static_identity(*args, **kwargs), + op, + compval, + ) + + def _compute(self, arrays, dates, assets, mask): + data = arrays[0] + return ( + self._op(data, self._compval) + & (data != self.inputs[0].missing_value) + & mask + ) + + class CustomClassifier(PositiveWindowLengthMixin, CustomTermMixin, Classifier): """ Base class for user-defined Classifiers. @@ -149,3 +274,27 @@ class Latest(LatestMixin, CustomClassifier): zipline.pipeline.factors.factor.Latest zipline.pipeline.filters.filter.Latest """ + def _allocate_output(self, windows, shape): + """ + Override the default array allocation to produce a LabelArray when we + have a string-like dtype. + """ + if self.dtype == int64_dtype: + return super(Latest, self)._allocate_output(windows, shape) + + # This is a little bit of a hack. We might not know what the + # categories for a LabelArray are until it's actually been loaded, so + # we need to look at the underlying data. + return windows[0].data.empty_like(shape) + + +class InvalidClassifierComparison(TypeError): + def __init__(self, classifier, compval): + super(InvalidClassifierComparison, self).__init__( + "Can't compare classifier of dtype" + " {dtype} to value {value} of type {type}.".format( + dtype=classifier.dtype, + value=compval, + type=type(compval).__name__, + ) + ) diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index b5980d83..fcd6cfe5 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -14,11 +14,7 @@ from zipline.pipeline.term import ( Term, ) from zipline.utils.input_validation import ensure_dtype -from zipline.utils.numpy_utils import ( - bool_dtype, - int64_dtype, - NoDefaultMissingValue, -) +from zipline.utils.numpy_utils import NoDefaultMissingValue from zipline.utils.preprocess import preprocess @@ -26,7 +22,6 @@ class Column(object): """ An abstract column of data, not yet associated with a dataset. """ - @preprocess(dtype=ensure_dtype) def __init__(self, dtype, missing_value=NotSpecified): self.dtype = dtype @@ -164,15 +159,25 @@ class BoundColumn(LoadableTerm): @property def latest(self): - if self.dtype == bool_dtype: - from zipline.pipeline.filters import Latest - elif self.dtype == int64_dtype: - from zipline.pipeline.classifiers import Latest + from zipline.pipeline.factors import Factor, Latest as LatestFactor + from zipline.pipeline.filters import Filter, Latest as LatestFilter + from zipline.pipeline.classifiers import ( + Classifier, + Latest as LatestClassifier, + ) + + dtype = self.dtype + if dtype in Filter.ALLOWED_DTYPES: + Latest = LatestFilter + elif dtype in Classifier.ALLOWED_DTYPES: + Latest = LatestClassifier else: - from zipline.pipeline.factors import Latest + assert dtype in Factor.ALLOWED_DTYPES, "Unknown dtype %s." % dtype + Latest = LatestFactor + return Latest( inputs=(self,), - dtype=self.dtype, + dtype=dtype, missing_value=self.missing_value, ) diff --git a/zipline/pipeline/data/testing.py b/zipline/pipeline/data/testing.py index 5d452769..30f92642 100644 --- a/zipline/pipeline/data/testing.py +++ b/zipline/pipeline/data/testing.py @@ -7,6 +7,7 @@ zipline.pipeline.data.testing. from .dataset import Column, DataSet from zipline.utils.numpy_utils import ( bool_dtype, + categorical_dtype, float64_dtype, datetime64ns_dtype, int64_dtype, @@ -22,3 +23,9 @@ class TestingDataSet(DataSet): float_col = Column(dtype=float64_dtype) datetime_col = Column(dtype=datetime64ns_dtype) int_col = Column(dtype=int64_dtype, missing_value=0) + + categorical_col = Column(dtype=categorical_dtype, missing_value=u'') + categorical_default_NULL = Column( + dtype=categorical_dtype, + missing_value=u'NULL', + ) diff --git a/zipline/pipeline/engine.py b/zipline/pipeline/engine.py index 8a751bc5..79be34a6 100644 --- a/zipline/pipeline/engine.py +++ b/zipline/pipeline/engine.py @@ -51,7 +51,7 @@ class PipelineEngine(with_metaclass(ABCMeta)): result : pd.DataFrame A frame of computed results. - The columns `result` correspond will be the computed results of + The columns `result` correspond to the entries of `pipeline.columns`, which should be a dictionary mapping strings to instances of `zipline.pipeline.term.Term`. @@ -165,17 +165,20 @@ class SimplePipelineEngine(object): root_mask = self._compute_root_mask(start_date, end_date, extra_rows) dates, assets, root_mask_values = explode(root_mask) - outputs = self.compute_chunk( + results = self.compute_chunk( graph, dates, assets, initial_workspace={self._root_mask_term: root_mask_values}, ) - out_dates = dates[extra_rows:] - screen_values = outputs.pop(screen_name) - - return self._to_narrow(outputs, screen_values, out_dates, assets) + return self._to_narrow( + graph.outputs, + results, + results.pop(screen_name), + dates[extra_rows:], + assets, + ) def _compute_root_mask(self, start_date, end_date, extra_rows): """ @@ -363,14 +366,16 @@ class SimplePipelineEngine(object): out[name] = workspace[term][graph_extra_rows[term]:] return out - def _to_narrow(self, data, mask, dates, assets): + def _to_narrow(self, terms, data, mask, dates, assets): """ Convert raw computed pipeline results into a DataFrame for public APIs. Parameters ---------- + terms : dict[str -> Term] + Dict mapping column names to terms. data : dict[str -> ndarray[ndim=2]] - Dict mapping column names to computed results. + Dict mapping column names to computed results for those names. mask : ndarray[bool, ndim=2] Mask array of values to keep. dates : ndarray[datetime64, ndim=1] @@ -412,8 +417,18 @@ class SimplePipelineEngine(object): resolved_assets = array(self._finder.retrieve_all(assets)) dates_kept = repeat_last_axis(dates.values, len(assets))[mask] assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask] + + final_columns = {} + for name in data: + # Each term that computed an output has its postprocess method + # called on the filtered result. + # + # We use this convert LabelArrays into categoricals, among other + # things. + final_columns[name] = terms[name].postprocess(data[name][mask]) + return DataFrame( - data={name: arr[mask] for name, arr in iteritems(data)}, + data=final_columns, index=MultiIndex.from_arrays([dates_kept, assets_kept]), ).tz_localize('UTC', level=0) @@ -423,6 +438,7 @@ class SimplePipelineEngine(object): """ root = self._root_mask_term clsname = type(self).__name__ + # Writing this out explicitly so this errors in testing if we change # the name without updating this line. compute_chunk_name = self.compute_chunk.__name__ diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py index 79d10f71..a76002a9 100644 --- a/zipline/pipeline/factors/factor.py +++ b/zipline/pipeline/factors/factor.py @@ -6,11 +6,11 @@ from operator import attrgetter from numbers import Number from numpy import inf, where -from toolz import curry from zipline.errors import UnknownRankMethod from zipline.lib.normalize import naive_grouped_rowwise_apply from zipline.lib.rank import masked_rankdata_2d +from zipline.pipeline.api_utils import restrict_to_dtype from zipline.pipeline.classifiers import Classifier, Everything, Quantiles from zipline.pipeline.mixins import ( CustomTermMixin, @@ -42,6 +42,7 @@ from zipline.pipeline.filters import ( PercentileFilter, NullFilter, ) +from zipline.utils.functional import with_doc, with_name from zipline.utils.input_validation import expect_types from zipline.utils.math_utils import nanmean, nanstd from zipline.utils.numpy_utils import ( @@ -51,7 +52,6 @@ from zipline.utils.numpy_utils import ( float64_dtype, int64_dtype, ) -from zipline.utils.preprocess import preprocess _RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal']) @@ -81,37 +81,6 @@ def coerce_numbers_to_my_dtype(f): return method -@curry -def set_attribute(name, value): - """ - Decorator factory for setting attributes on a function. - - Doesn't change the behavior of the wrapped function. - - Usage - ----- - >>> @set_attribute('__name__', 'foo') - ... def bar(): - ... return 3 - ... - >>> bar() - 3 - >>> bar.__name__ - 'foo' - """ - def decorator(f): - setattr(f, name, value) - return f - return decorator - - -# Decorators for setting the __name__ and __doc__ properties of a decorated -# function. -# Example: -with_name = set_attribute('__name__') -with_doc = set_attribute('__doc__') - - def binop_return_type(op): if is_comparison(op): return NumExprFilter @@ -328,51 +297,6 @@ def function_application(func): return mathfunc -def restrict_to_dtype(dtype, message_template): - """ - A factory for decorators that restricting Factor methods to only be - callable on Factors with a specific dtype. - - This is conceptually similar to - zipline.utils.input_validation.expect_dtypes, but provides more flexibility - for providing error messages that are specifically targeting Factor - methods. - - Parameters - ---------- - dtype : numpy.dtype - The dtype on which the decorated method may be called. - message_template : str - A template for the error message to be raised. - `message_template.format` will be called with keyword arguments - `method_name`, `expected_dtype`, and `received_dtype`. - - Usage - ----- - @restrict_to_dtype( - dtype=float64_dtype, - message_template=( - "{method_name}() was called on a factor of dtype {received_dtype}." - "{method_name}() requires factors of dtype{expected_dtype}." - - ), - ) - def some_factor_method(self, ...): - self.stuff_that_requires_being_float64(...) - """ - def processor(factor_method, _, factor_instance): - factor_dtype = factor_instance.dtype - if factor_dtype != dtype: - raise TypeError( - message_template.format( - method_name=factor_method.__name__, - expected_dtype=dtype.name, - received_dtype=factor_dtype, - ) - ) - return factor_instance - return preprocess(self=processor) - # Decorators for Factor methods. if_not_float64_tell_caller_to_use_isnull = restrict_to_dtype( dtype=float64_dtype, diff --git a/zipline/pipeline/loaders/synthetic.py b/zipline/pipeline/loaders/synthetic.py index f9957ae9..dfea5829 100644 --- a/zipline/pipeline/loaders/synthetic.py +++ b/zipline/pipeline/loaders/synthetic.py @@ -27,6 +27,7 @@ from zipline.utils.numpy_utils import ( datetime64ns_dtype, float64_dtype, int64_dtype, + object_dtype, ) @@ -148,6 +149,7 @@ class SeededRandomLoader(PrecomputedLoader): float64_dtype: self._float_values, int64_dtype: self._int_values, bool_dtype: self._bool_values, + object_dtype: self._object_values, }[dtype](shape) @property @@ -191,6 +193,9 @@ class SeededRandomLoader(PrecomputedLoader): """ return self.state.randn(*shape) < 0 + def _object_values(self, shape): + return self._int_values(shape).astype(str).astype(object) + OHLCV = ('open', 'high', 'low', 'close', 'volume') OHLC = ('open', 'high', 'low', 'close') diff --git a/zipline/pipeline/mixins.py b/zipline/pipeline/mixins.py index d5f0fcb2..497fc145 100644 --- a/zipline/pipeline/mixins.py +++ b/zipline/pipeline/mixins.py @@ -1,7 +1,7 @@ """ Mixins classes for use with Filters and Factors. """ -from numpy import full_like, recarray +from numpy import full, recarray from zipline.utils.control_flow import nullctx from zipline.errors import WindowLengthNotPositive, UnsupportedDataType @@ -51,7 +51,7 @@ class RestrictedDTypeMixin(object): if self.dtype not in self.ALLOWED_DTYPES: raise UnsupportedDataType( - typename=type(self.__name__), + typename=type(self).__name__, dtype=self.dtype, ) @@ -103,27 +103,45 @@ class CustomTermMixin(object): """ raise NotImplementedError() + def _allocate_output(self, windows, shape): + """ + Allocate an output array whose rows should be passed to `self.compute`. + + The resulting array must have a shape of ``shape``. + + If we have standard outputs (i.e. self.outputs is NotSpecified), the + default is an empty ndarray whose dtype is ``self.dtype``. + + If we have an outputs tuple, the default is an empty recarray with + ``self.outputs`` as field names. Each field will have dtype + ``self.dtype``, the default shape is ``self.shape``. + + This can be overridden to control the kind of array constructed + (e.g. to produce a LabelArray instead of an ndarray). + """ + missing_value = self.missing_value + outputs = self.outputs + if outputs is not NotSpecified: + out = recarray( + shape, + formats=[self.dtype.str] * len(outputs), + names=outputs, + ) + out[:] = missing_value + else: + out = full(shape, missing_value, dtype=self.dtype) + return out + def _compute(self, windows, dates, assets, mask): """ Call the user's `compute` function on each window with a pre-built output array. """ compute = self.compute - missing_value = self.missing_value params = self.params - outputs = self.outputs - if outputs is not NotSpecified: - out = recarray( - mask.shape, - formats=[self.dtype.str] * len(outputs), - names=outputs, - ) - out[:] = missing_value - else: - out = full_like(mask, missing_value, dtype=self.dtype) + out = self._allocate_output(windows, mask.shape) + with self.ctx: - # TODO: Consider pre-filtering columns that are all-nan at each - # time-step? for idx, date in enumerate(dates): col_mask = mask[idx] masked_out = out[idx][col_mask] diff --git a/zipline/pipeline/term.py b/zipline/pipeline/term.py index e6b6e9bf..1bffc3e7 100644 --- a/zipline/pipeline/term.py +++ b/zipline/pipeline/term.py @@ -4,7 +4,7 @@ Base class for Filters, Factors and Classifiers from abc import ABCMeta, abstractproperty from weakref import WeakValueDictionary -from numpy import dtype as dtype_class +from numpy import dtype as dtype_class, ndarray from six import with_metaclass from zipline.errors import ( DTypeNotSpecified, @@ -16,6 +16,7 @@ from zipline.errors import ( WindowLengthNotSpecified, ) from zipline.lib.adjusted_array import can_represent_dtype +from zipline.utils.input_validation import expect_types from zipline.utils.memoize import lazyval from zipline.utils.numpy_utils import ( bool_dtype, @@ -476,6 +477,19 @@ class ComputableTerm(Term): out[self.mask] = 0 return out + @expect_types(data=ndarray) + def postprocess(self, data): + """ + Called with an result of ``self``, unravelled (i.e. 1-dimensional) + after any user-defined screens have been applied. + + This is mostly useful for transforming the dtype of an output, e.g., to + convert a LabelArray into a pandas Categorical. + + The default implementation is to just return data unchanged. + """ + return data + def __repr__(self): return ( "{type}({inputs}, window_length={window_length})" diff --git a/zipline/testing/__init__.py b/zipline/testing/__init__.py index a415daa8..c52ff03e 100644 --- a/zipline/testing/__init__.py +++ b/zipline/testing/__init__.py @@ -51,3 +51,4 @@ from .core import ( # noqa write_bcolz_minute_data, write_compressed, ) +from .fixtures import ZiplineTestCase # noqa diff --git a/zipline/testing/core.py b/zipline/testing/core.py index 11430470..ad130a79 100644 --- a/zipline/testing/core.py +++ b/zipline/testing/core.py @@ -1032,7 +1032,7 @@ def temp_pipeline_engine(calendar, sids, random_seed, symbols=None): yield SimplePipelineEngine(get_loader, calendar, finder) -def parameter_space(**params): +def parameter_space(__fail_fast=False, **params): """ Wrapper around subtest that allows passing keywords mapping names to iterables of values. @@ -1083,7 +1083,16 @@ def parameter_space(**params): ) param_sets = product(*(params[name] for name in argnames)) - return subtest(param_sets, *argnames)(f) + + if __fail_fast: + @wraps(f) + def wrapped(self): + for args in param_sets: + f(self, *args) + return wrapped + else: + return subtest(param_sets, *argnames)(f) + return decorator diff --git a/zipline/utils/compat.py b/zipline/utils/compat.py index 62ed44ef..0282cb89 100644 --- a/zipline/utils/compat.py +++ b/zipline/utils/compat.py @@ -11,6 +11,9 @@ if PY2: else: from types import MappingProxyType as mappingproxy + +unicode = type(u'') + __all__ = [ 'mappingproxy', ] diff --git a/zipline/utils/functional.py b/zipline/utils/functional.py index 79569de0..685f0aef 100644 --- a/zipline/utils/functional.py +++ b/zipline/utils/functional.py @@ -56,6 +56,8 @@ def apply(f, *args, **kwargs): # Alias for use as a class decorator. instance = apply +from zipline.utils.sentinel import sentinel + def mapall(funcs, seq): """ @@ -242,3 +244,87 @@ def unzip(seq, elem_len=None): if elem_len is None: raise ValueError("cannot unzip empty sequence without 'elem_len'") return ((),) * elem_len + + +_no_default = sentinel('_no_default') + + +def getattrs(value, attrs, default=_no_default): + """ + Perform a chained application of ``getattr`` on ``value`` with the values + in ``attrs``. + + If ``default`` is supplied, return it if any of the attribute lookups fail. + + Parameters + ---------- + value : object + Root of the lookup chain. + attrs : iterable[str] + Sequence of attributes to look up. + default : object, optional + Value to return if any of the lookups fail. + + Returns + ------- + result : object + Result of the lookup sequence. + + Example + ------- + >>> class EmptyObject(object): + ... pass + ... + >>> obj = EmptyObject() + >>> obj.foo = EmptyObject() + >>> obj.foo.bar = "value" + >>> getattrs(obj, ('foo', 'bar')) + 'value' + + >>> getattrs(obj, ('foo', 'buzz')) + Traceback (most recent call last): + ... + AttributeError: 'EmptyObject' object has no attribute 'buzz' + + >>> getattrs(obj, ('foo', 'buzz'), 'default') + 'default' + """ + try: + for attr in attrs: + value = getattr(value, attr) + except AttributeError: + if default is _no_default: + raise + value = default + return value + + +@curry +def set_attribute(name, value): + """ + Decorator factory for setting attributes on a function. + + Doesn't change the behavior of the wrapped function. + + Usage + ----- + >>> @set_attribute('__name__', 'foo') + ... def bar(): + ... return 3 + ... + >>> bar() + 3 + >>> bar.__name__ + 'foo' + """ + def decorator(f): + setattr(f, name, value) + return f + return decorator + + +# Decorators for setting the __name__ and __doc__ properties of a decorated +# function. +# Example: +with_name = set_attribute('__name__') +with_doc = set_attribute('__doc__') diff --git a/zipline/utils/input_validation.py b/zipline/utils/input_validation.py index 66e8b9fe..0424e6f4 100644 --- a/zipline/utils/input_validation.py +++ b/zipline/utils/input_validation.py @@ -22,7 +22,8 @@ from six import iteritems, string_types, PY3 from toolz import valmap, complement, compose import toolz.curried.operator as op -from zipline.utils.preprocess import preprocess +from zipline.utils.functional import getattrs +from zipline.utils.preprocess import call, preprocess def optionally(preprocessor): @@ -163,7 +164,7 @@ def ensure_timestamp(func, argname, arg): ) -def expect_dtypes(*_pos, **named): +def expect_dtypes(**named): """ Preprocessing decorator that verifies inputs have expected numpy dtypes. @@ -181,9 +182,6 @@ def expect_dtypes(*_pos, **named): ... TypeError: foo() expected an argument with dtype 'int64' for argument 'x', but got dtype 'float64' instead. # noqa """ - if _pos: - raise TypeError("expect_dtypes() only takes keyword arguments.") - for name, type_ in iteritems(named): if not isinstance(type_, (dtype, tuple)): raise TypeError( @@ -193,42 +191,101 @@ def expect_dtypes(*_pos, **named): ) ) - def _expect_dtype(_dtype_or_dtype_tuple): + @preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,))) + def _expect_dtype(dtypes): """ - Factory for dtype-checking functions that work the @preprocess + Factory for dtype-checking functions that work with the @preprocess decorator. """ - # Slightly different messages for dtype and tuple of dtypes. - if isinstance(_dtype_or_dtype_tuple, tuple): - allowed_dtypes = _dtype_or_dtype_tuple - else: - allowed_dtypes = (_dtype_or_dtype_tuple,) - template = ( - "%(funcname)s() expected a value with dtype {dtype_str} " - "for argument '%(argname)s', but got %(actual)r instead." - ).format(dtype_str=' or '.join(repr(d.name) for d in allowed_dtypes)) - - def check_dtype(value): - return getattr(value, 'dtype', None) not in allowed_dtypes - - def display_bad_value(value): + def error_message(func, argname, value): # If the bad value has a dtype, but it's wrong, show the dtype - # name. + # name. Otherwise just show the value. try: - return value.dtype.name + value_to_show = value.dtype.name except AttributeError: - return value + value_to_show = value + return ( + "{funcname}() expected a value with dtype {dtype_str} " + "for argument {argname!r}, but got {value!r} instead." + ).format( + funcname=_qualified_name(func), + dtype_str=' or '.join(repr(d.name) for d in dtypes), + argname=argname, + value=value_to_show, + ) - return make_check( - exc_type=TypeError, - template=template, - pred=check_dtype, - actual=display_bad_value, - ) + def _actual_preprocessor(func, argname, argvalue): + if getattr(argvalue, 'dtype', object()) not in dtypes: + raise TypeError(error_message(func, argname, argvalue)) + return argvalue + + return _actual_preprocessor return preprocess(**valmap(_expect_dtype, named)) +def expect_kinds(**named): + """ + Preprocessing decorator that verifies inputs have expected dtype kinds. + + Usage + ----- + >>> from numpy import int64, int32, float32 + >>> @expect_kinds(x='i') + ... def foo(x): + ... return x + ... + >>> foo(int64(2)) + 2 + >>> foo(int32(2)) + 2 + >>> foo(float32(2)) + Traceback (most recent call last): + ...n + TypeError: foo() expected a numpy object of kind 'i' for argument 'x', but got 'f' instead. # noqa + """ + for name, kind in iteritems(named): + if not isinstance(kind, (str, tuple)): + raise TypeError( + "expect_dtype_kinds() expected a string or tuple of strings" + " for argument {name!r}, but got {kind} instead.".format( + name=name, kind=dtype, + ) + ) + + @preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,))) + def _expect_kind(kinds): + """ + Factory for kind-checking functions that work the @preprocess + decorator. + """ + def error_message(func, argname, value): + # If the bad value has a dtype, but it's wrong, show the dtype + # kind. Otherwise just show the value. + try: + value_to_show = value.dtype.kind + except AttributeError: + value_to_show = value + return ( + "{funcname}() expected a numpy object of kind {kinds} " + "for argument {argname!r}, but got {value!r} instead." + ).format( + funcname=_qualified_name(func), + kinds=' or '.join(map(repr, kinds)), + argname=argname, + value=value_to_show, + ) + + def _actual_preprocessor(func, argname, argvalue): + if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds: + raise TypeError(error_message(func, argname, argvalue)) + return argvalue + + return _actual_preprocessor + + return preprocess(**valmap(_expect_kind, named)) + + def expect_types(*_pos, **named): """ Preprocessing decorator that verifies inputs have expected types. diff --git a/zipline/utils/numpy_utils.py b/zipline/utils/numpy_utils.py index 5df6e213..a254fff8 100644 --- a/zipline/utils/numpy_utils.py +++ b/zipline/utils/numpy_utils.py @@ -32,6 +32,10 @@ complex128_dtype = dtype('complex128') datetime64D_dtype = dtype('datetime64[D]') datetime64ns_dtype = dtype('datetime64[ns]') +object_dtype = dtype('O') +# We use object arrays for strings. +categorical_dtype = object_dtype + make_datetime64ns = flip(datetime64, 'ns') make_datetime64D = flip(datetime64, 'D') @@ -71,6 +75,7 @@ def make_kind_check(python_types, numpy_kind): is_float = make_kind_check(float, 'f') is_int = make_kind_check(int, 'i') is_datetime = make_kind_check(datetime, 'M') +is_object = make_kind_check(object, 'O') def coerce_to_dtype(dtype, value): @@ -263,9 +268,7 @@ def rolling_window(array, length): _notNaT = make_datetime64D(0) -def busday_count_mask_NaT(begindates, - enddates, - out=None): +def busday_count_mask_NaT(begindates, enddates, out=None): """ Simple of numpy.busday_count that returns `float` arrays rather than int arrays, and handles `NaT`s by returning `NaN`s where the inputs were `NaT`.