ENH: Add support for strings in Pipeline.

- Adds a new class, ``LabelArray``, which is a subclass of np.ndarray.
  LabelArray is conceptually similar to pandas.Categorical, in that it
  stores data with many duplicate values as indices into an array of
  unique values.  For string data with many duplicates (e.g. time-series
  of tickers or or industry classifications), this provides multiple
  orders of magnitude of improvement when doing string operations,
  especially string comparison/matching operations.

- Adds a new generic object "specialization" for `AdjustedArrayWindow`,
  and a corresponding ObjectOverwrite adjustment.

- Adds a new ``postprocess`` method to ``zipline.pipeline.term.Term``.
  This method is called on the final result of any pipeline expression
  after screen filtering has occurred. The default implementation of
  ``postprocess`` is identity, but Classifier overrides it to coerce
  string columns into pandas.Categoricals before presenting them to the
  user.
This commit is contained in:
Scott Sanderson
2016-04-28 18:10:48 -04:00
parent 8756bf2c91
commit 5f190395ad
32 changed files with 1882 additions and 310 deletions
+2
View File
@@ -81,11 +81,13 @@ class LazyBuildExtCommandClass(dict):
ext_modules = [
Extension('zipline.assets._assets', ['zipline/assets/_assets.pyx']),
Extension('zipline.lib.adjustment', ['zipline/lib/adjustment.pyx']),
Extension('zipline.lib._factorize', ['zipline/lib/_factorize.pyx']),
Extension(
'zipline.lib._float64window', ['zipline/lib/_float64window.pyx']
),
Extension('zipline.lib._int64window', ['zipline/lib/_int64window.pyx']),
Extension('zipline.lib._uint8window', ['zipline/lib/_uint8window.pyx']),
Extension('zipline.lib._labelwindow', ['zipline/lib/_labelwindow.pyx']),
Extension('zipline.lib.rank', ['zipline/lib/rank.pyx']),
Extension('zipline.data._equities', ['zipline/data/_equities.pyx']),
Extension('zipline.data._adjustments', ['zipline/data/_adjustments.pyx']),
+308 -89
View File
@@ -9,19 +9,23 @@ from nose_parameterized import parameterized
from numpy import (
arange,
array,
asarray,
dtype,
full,
where,
)
from numpy.testing import assert_array_equal
from six.moves import zip_longest
from toolz import curry
from zipline.errors import WindowLengthNotPositive, WindowLengthTooLong
from zipline.lib.adjustment import (
Datetime64Overwrite,
Float64Multiply,
Float64Overwrite,
ObjectOverwrite,
)
from zipline.lib.adjusted_array import AdjustedArray, NOMASK
from zipline.lib.labelarray import LabelArray
from zipline.testing import check_arrays, parameter_space
from zipline.utils.numpy_utils import (
coerce_to_dtype,
@@ -29,6 +33,7 @@ from zipline.utils.numpy_utils import (
default_missing_value_for_dtype,
float64_dtype,
int64_dtype,
object_dtype,
)
@@ -62,12 +67,41 @@ def valid_window_lengths(underlying_buffer_length):
return iter(range(1, underlying_buffer_length + 1))
def _gen_unadjusted_cases(dtype):
@curry
def as_dtype(dtype, data):
"""
Curried wrapper around array.astype for when you have the dtype before you
have the data.
"""
return asarray(data).astype(dtype)
@curry
def as_labelarray(initial_dtype, missing_value, array):
"""
Curried wrapper around LabelArray, that round-trips the input data through
`initial_dtype` first.
"""
return LabelArray(
array.astype(initial_dtype),
missing_value=initial_dtype.type(''),
)
bytes_dtype = dtype('S3')
unicode_dtype = dtype('U3')
def _gen_unadjusted_cases(name,
make_input,
make_expected_output,
missing_value):
nrows = 6
ncols = 3
data = arange(nrows * ncols).astype(dtype).reshape(nrows, ncols)
missing_value = default_missing_value_for_dtype(dtype)
raw_data = arange(nrows * ncols).reshape(nrows, ncols)
input_array = make_input(raw_data)
expected_output_array = make_expected_output(raw_data)
for windowlen in valid_window_lengths(nrows):
@@ -76,13 +110,13 @@ def _gen_unadjusted_cases(dtype):
)
yield (
"dtype_%s_length_%d" % (dtype, windowlen),
data,
"%s_length_%d" % (name, windowlen),
input_array,
windowlen,
{},
missing_value,
[
data[offset:offset + windowlen]
expected_output_array[offset:offset + windowlen]
for offset in range(num_legal_windows)
],
)
@@ -156,84 +190,125 @@ def _gen_multiplicative_adjustment_cases(dtype):
[1, 6, 1],
[1, 1, 1]], dtype=dtype)
return _gen_expectations(baseline, adjustments, buffer_as_of, nrows)
def _gen_overwrite_adjustment_cases(dtype):
"""
Generate test cases for overwrite adjustments.
The algorithm used here is the same as the one used above for
multiplicative adjustments. The only difference is the semantics of how
the adjustments are expected to modify the arrays.
"""
adjustment_type = {
float64_dtype: Float64Overwrite,
datetime64ns_dtype: Datetime64Overwrite,
}[dtype]
nrows, ncols = 6, 3
adjustments = {}
buffer_as_of = [None] * 6
baseline = buffer_as_of[0] = full((nrows, ncols), 2, dtype=dtype)
# Note that row indices are inclusive!
adjustments[1] = [
adjustment_type(0, 0, 0, 0, coerce_to_dtype(dtype, 1)),
]
buffer_as_of[1] = array([[1, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2]], dtype=dtype)
# No adjustment at index 2.
buffer_as_of[2] = buffer_as_of[1]
adjustments[3] = [
adjustment_type(1, 2, 1, 1, coerce_to_dtype(dtype, 3)),
adjustment_type(0, 1, 0, 0, coerce_to_dtype(dtype, 4)),
]
buffer_as_of[3] = array([[4, 2, 2],
[4, 3, 2],
[2, 3, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2]], dtype=dtype)
adjustments[4] = [
adjustment_type(0, 3, 2, 2, coerce_to_dtype(dtype, 5))
]
buffer_as_of[4] = array([[4, 2, 5],
[4, 3, 5],
[2, 3, 5],
[2, 2, 5],
[2, 2, 2],
[2, 2, 2]], dtype=dtype)
adjustments[5] = [
adjustment_type(0, 4, 1, 1, coerce_to_dtype(dtype, 6)),
adjustment_type(2, 2, 2, 2, coerce_to_dtype(dtype, 7)),
]
buffer_as_of[5] = array([[4, 6, 5],
[4, 6, 5],
[2, 6, 7],
[2, 6, 5],
[2, 6, 2],
[2, 2, 2]], dtype=dtype)
return _gen_expectations(
baseline,
default_missing_value_for_dtype(dtype),
adjustments,
buffer_as_of,
nrows,
)
def _gen_expectations(baseline, adjustments, buffer_as_of, nrows):
def _gen_overwrite_adjustment_cases(name,
make_input,
make_expected_output,
dtype,
missing_value):
"""
Generate test cases for overwrite adjustments.
The algorithm used here is the same as the one used above for
multiplicative adjustments. The only difference is the semantics of how
the adjustments are expected to modify the arrays.
This is parameterized on `make_input` and make_expected_output functions,
which take 2-D lists of values and transform them into desired input/output
arrays. We do this so that we can easily test both vanilla numpy ndarrays
and our own LabelArray class for strings.
"""
adjustment_type = {
float64_dtype: Float64Overwrite,
datetime64ns_dtype: Datetime64Overwrite,
bytes_dtype: ObjectOverwrite,
unicode_dtype: ObjectOverwrite,
object_dtype: ObjectOverwrite,
}[dtype]
if dtype == object_dtype:
# When we're testing object dtypes, we expect to have strings, but
# coerce_to_dtype(object, 3) just gives 3 as a Python integer.
def make_overwrite_value(dtype, value):
return str(value)
else:
make_overwrite_value = coerce_to_dtype
adjustments = {}
buffer_as_of = [None] * 6
baseline = make_input([[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2]])
buffer_as_of[0] = make_expected_output([[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2]])
# Note that row indices are inclusive!
adjustments[1] = [
adjustment_type(0, 0, 0, 0, make_overwrite_value(dtype, 1)),
]
buffer_as_of[1] = make_expected_output([[1, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2]])
# No adjustment at index 2.
buffer_as_of[2] = buffer_as_of[1]
adjustments[3] = [
adjustment_type(1, 2, 1, 1, make_overwrite_value(dtype, 3)),
adjustment_type(0, 1, 0, 0, make_overwrite_value(dtype, 4)),
]
buffer_as_of[3] = make_expected_output([[4, 2, 2],
[4, 3, 2],
[2, 3, 2],
[2, 2, 2],
[2, 2, 2],
[2, 2, 2]])
adjustments[4] = [
adjustment_type(0, 3, 2, 2, make_overwrite_value(dtype, 5))
]
buffer_as_of[4] = make_expected_output([[4, 2, 5],
[4, 3, 5],
[2, 3, 5],
[2, 2, 5],
[2, 2, 2],
[2, 2, 2]])
adjustments[5] = [
adjustment_type(0, 4, 1, 1, make_overwrite_value(dtype, 6)),
adjustment_type(2, 2, 2, 2, make_overwrite_value(dtype, 7)),
]
buffer_as_of[5] = make_expected_output([[4, 6, 5],
[4, 6, 5],
[2, 6, 7],
[2, 6, 5],
[2, 6, 2],
[2, 2, 2]])
return _gen_expectations(
baseline,
missing_value,
adjustments,
buffer_as_of,
nrows=6,
)
def _gen_expectations(baseline,
missing_value,
adjustments,
buffer_as_of,
nrows):
missing_value = default_missing_value_for_dtype(baseline.dtype)
for windowlen in valid_window_lengths(nrows):
num_legal_windows = num_windows_of_length_M_on_buffers_of_length_N(
@@ -263,8 +338,60 @@ class AdjustedArrayTestCase(TestCase):
@parameterized.expand(
chain(
_gen_unadjusted_cases(float64_dtype),
_gen_unadjusted_cases(datetime64ns_dtype),
_gen_unadjusted_cases(
'float',
make_input=as_dtype(float64_dtype),
make_expected_output=as_dtype(float64_dtype),
missing_value=default_missing_value_for_dtype(float64_dtype),
),
_gen_unadjusted_cases(
'datetime',
make_input=as_dtype(datetime64ns_dtype),
make_expected_output=as_dtype(datetime64ns_dtype),
missing_value=default_missing_value_for_dtype(
datetime64ns_dtype
),
),
# Test passing an array of strings to AdjustedArray.
_gen_unadjusted_cases(
'bytes_ndarray',
make_input=as_dtype(bytes_dtype),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
_gen_unadjusted_cases(
'unicode_ndarray',
make_input=as_dtype(unicode_dtype),
make_expected_output=as_labelarray(unicode_dtype, u''),
missing_value=u'',
),
_gen_unadjusted_cases(
'object_ndarray',
make_input=lambda a: a.astype(str).astype(object),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
# Test passing a LabelArray directly to AdjustedArray.
_gen_unadjusted_cases(
'bytes_labelarray',
make_input=as_labelarray(bytes_dtype, b''),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
_gen_unadjusted_cases(
'unicode_labelarray',
make_input=as_labelarray(unicode_dtype, u''),
make_expected_output=as_labelarray(bytes_dtype, u''),
missing_value=u'',
),
_gen_unadjusted_cases(
'object_labelarray',
make_input=(
lambda a: LabelArray(a.astype(str).astype(object), b'')
),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
)
)
def test_no_adjustments(self,
@@ -273,14 +400,13 @@ class AdjustedArrayTestCase(TestCase):
lookback,
adjustments,
missing_value,
expected):
expected_output):
array = AdjustedArray(data, NOMASK, adjustments, missing_value)
for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable.
window_iter = array.traverse(lookback)
for yielded, expected_yield in zip_longest(window_iter, expected):
self.assertEqual(yielded.dtype, data.dtype)
assert_array_equal(yielded, expected_yield)
in_out = zip(array.traverse(lookback), expected_output)
for yielded, expected_yield in in_out:
check_arrays(yielded, expected_yield)
@parameterized.expand(_gen_multiplicative_adjustment_cases(float64_dtype))
def test_multiplicative_adjustments(self,
@@ -295,12 +421,70 @@ class AdjustedArrayTestCase(TestCase):
for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable.
window_iter = array.traverse(lookback)
for yielded, expected_yield in zip_longest(window_iter, expected):
assert_array_equal(yielded, expected_yield)
check_arrays(yielded, expected_yield)
@parameterized.expand(
chain(
_gen_overwrite_adjustment_cases(float64_dtype),
_gen_overwrite_adjustment_cases(datetime64ns_dtype),
_gen_overwrite_adjustment_cases(
'float',
make_input=as_dtype(float64_dtype),
make_expected_output=as_dtype(float64_dtype),
dtype=float64_dtype,
missing_value=default_missing_value_for_dtype(float64_dtype),
),
_gen_overwrite_adjustment_cases(
'datetime',
make_input=as_dtype(datetime64ns_dtype),
make_expected_output=as_dtype(datetime64ns_dtype),
dtype=datetime64ns_dtype,
missing_value=default_missing_value_for_dtype(
datetime64ns_dtype,
),
),
# There are six cases here:
# Using np.bytes/np.unicode/python string arrays as inputs.
# Passing np.bytes/np.unicode/python string arrays to LabelArray,
# and using those as input.
#
# The outputs should always be LabelArrays.
_gen_unadjusted_cases(
'bytes_ndarray',
make_input=as_dtype(bytes_dtype),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
_gen_unadjusted_cases(
'unicode_ndarray',
make_input=as_dtype(unicode_dtype),
make_expected_output=as_labelarray(unicode_dtype, u''),
missing_value=u'',
),
_gen_unadjusted_cases(
'object_ndarray',
make_input=lambda a: a.astype(str).astype(object),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
_gen_unadjusted_cases(
'bytes_labelarray',
make_input=as_labelarray(bytes_dtype, b''),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
_gen_unadjusted_cases(
'unicode_labelarray',
make_input=as_labelarray(unicode_dtype, u''),
make_expected_output=as_labelarray(bytes_dtype, u''),
missing_value=u'',
),
_gen_unadjusted_cases(
'object_labelarray',
make_input=(
lambda a: LabelArray(a.astype(str).astype(object), b'')
),
make_expected_output=as_labelarray(bytes_dtype, b''),
missing_value=b'',
),
)
)
def test_overwrite_adjustment_cases(self,
@@ -314,11 +498,15 @@ class AdjustedArrayTestCase(TestCase):
for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable.
window_iter = array.traverse(lookback)
for yielded, expected_yield in zip_longest(window_iter, expected):
self.assertEqual(yielded.dtype, data.dtype)
assert_array_equal(yielded, expected_yield)
check_arrays(yielded, expected_yield)
@parameter_space(
dtype=[float64_dtype, int64_dtype, datetime64ns_dtype],
__fail_fast=True,
dtype=[
float64_dtype,
int64_dtype,
datetime64ns_dtype,
],
missing_value=[0, 10000],
window_length=[2, 3],
)
@@ -341,6 +529,37 @@ class AdjustedArrayTestCase(TestCase):
for expected, actual in zip(gen_expected, gen_actual):
check_arrays(expected, actual)
@parameter_space(
__fail_fast=True,
dtype=[bytes_dtype, unicode_dtype, object_dtype],
missing_value=["0", "-1", ""],
window_length=[2, 3],
)
def test_masking_with_strings(self, dtype, missing_value, window_length):
missing_value = coerce_to_dtype(dtype, missing_value)
baseline_ints = arange(15).reshape(5, 3)
# Coerce to string first so that coercion to object gets us an array of
# string objects.
baseline = baseline_ints.astype(str).astype(dtype)
mask = (baseline_ints % 2).astype(bool)
masked_baseline = LabelArray(baseline, missing_value=missing_value)
masked_baseline[~mask] = missing_value
array = AdjustedArray(
baseline,
mask,
adjustments={},
missing_value=missing_value,
)
gen_expected = moving_window(masked_baseline, window_length)
gen_actual = array.traverse(window_length=window_length)
for expected, actual in zip(gen_expected, gen_actual):
check_arrays(expected, actual)
def test_invalid_lookback(self):
data = arange(30, dtype=float).reshape(6, 5)
+157 -5
View File
@@ -1,12 +1,21 @@
import numpy as np
from zipline.lib.labelarray import LabelArray
from zipline.pipeline import Classifier
from zipline.testing import parameter_space
from zipline.utils.numpy_utils import int64_dtype
from zipline.utils.numpy_utils import (
categorical_dtype,
coerce_to_dtype,
int64_dtype,
)
from .base import BasePipelineTestCase
bytes_dtype = np.dtype('S3')
unicode_dtype = np.dtype('U3')
class ClassifierTestCase(BasePipelineTestCase):
@parameter_space(mv=[-1, 0, 1, 999])
@@ -69,10 +78,56 @@ class ClassifierTestCase(BasePipelineTestCase):
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
@parameter_space(missing=[-1, 0, 1])
def test_disallow_comparison_to_missing_value(self, missing):
@parameter_space(
__fail_fast=True,
compval=['a', 'ab', 'not in the array'],
labelarray_dtype=(bytes_dtype, categorical_dtype, unicode_dtype),
)
def test_string_eq(self, compval, labelarray_dtype):
compval = labelarray_dtype.type(compval)
class C(Classifier):
dtype = int64_dtype
dtype = categorical_dtype
missing_value = ''
inputs = ()
window_length = 0
c = C()
# There's no significance to the values here other than that they
# contain a mix of the comparison value and other values.
data = LabelArray(
np.asarray(
[['', 'a', 'ab', 'ba'],
['z', 'ab', 'a', 'ab'],
['aa', 'ab', '', 'ab'],
['aa', 'a', 'ba', 'ba']],
dtype=labelarray_dtype,
),
missing_value='',
)
self.check_terms(
terms={
'eq': c.eq(compval),
},
expected={
'eq': (data == compval),
},
initial_workspace={c: data},
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
@parameter_space(
missing=[-1, 0, 1],
dtype_=[int64_dtype, categorical_dtype],
)
def test_disallow_comparison_to_missing_value(self, missing, dtype_):
missing = coerce_to_dtype(dtype_, missing)
class C(Classifier):
dtype = dtype_
missing_value = missing
inputs = ()
window_length = 0
@@ -82,7 +137,7 @@ class ClassifierTestCase(BasePipelineTestCase):
errmsg = str(e.exception)
self.assertEqual(
errmsg,
"Comparison against self.missing_value ({v}) in C.eq().\n"
"Comparison against self.missing_value ({v!r}) in C.eq().\n"
"Missing values have NaN semantics, so the requested comparison"
" would always produce False.\n"
"Use the isnull() method to check for missing values.".format(
@@ -118,3 +173,100 @@ class ClassifierTestCase(BasePipelineTestCase):
initial_workspace={c: data},
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
@parameter_space(
__fail_fast=True,
compval=['a', 'ab', '', 'not in the array'],
missing=['a', 'ab', '', 'not in the array'],
labelarray_dtype=(bytes_dtype, unicode_dtype, categorical_dtype),
)
def test_string_not_equal(self, compval, missing, labelarray_dtype):
compval = labelarray_dtype.type(compval)
class C(Classifier):
dtype = categorical_dtype
missing_value = missing
inputs = ()
window_length = 0
c = C()
# There's no significance to the values here other than that they
# contain a mix of the comparison value and other values.
data = LabelArray(
np.asarray(
[['', 'a', 'ab', 'ba'],
['z', 'ab', 'a', 'ab'],
['aa', 'ab', '', 'ab'],
['aa', 'a', 'ba', 'ba']],
dtype=labelarray_dtype,
),
missing_value=missing,
)
expected = (
(data.as_int_array() != data.reverse_categories.get(compval, -1)) &
(data.as_int_array() != data.reverse_categories[C.missing_value])
)
self.check_terms(
terms={
'ne': c != compval,
},
expected={
'ne': expected,
},
initial_workspace={c: data},
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
@parameter_space(
__fail_fast=True,
compval=['a', 'b', 'ab', 'not in the array'],
missing=['a', 'ab', '', 'not in the array'],
labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
)
def test_string_elementwise_predicates(self,
compval,
missing,
labelarray_dtype):
missing = labelarray_dtype.type(missing)
compval = labelarray_dtype.type(compval)
class C(Classifier):
dtype = categorical_dtype
missing_value = missing
inputs = ()
window_length = 0
c = C()
# There's no significance to the values here other than that they
# contain a mix of the comparison value and other values.
data = LabelArray(
np.asarray(
[['', 'a', 'ab', 'ba'],
['z', 'ab', 'a', 'ab'],
['aa', 'ab', '', 'ab'],
['aa', 'a', 'ba', 'ba']],
dtype=labelarray_dtype,
),
missing_value=missing,
)
self.check_terms(
terms={
'startswith': c.startswith(compval),
'endswith': c.endswith(compval),
'contains': c.contains(compval),
},
expected={
'startswith': (data.startswith(compval) & (data != missing)),
'endswith': (data.endswith(compval) & (data != missing)),
'contains': (data.contains(compval) & (data != missing)),
},
initial_workspace={c: data},
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
+19 -3
View File
@@ -7,6 +7,7 @@ from unittest import TestCase
from pandas import date_range, DataFrame
from pandas.util.testing import assert_frame_equal
from zipline.lib.labelarray import LabelArray
from zipline.pipeline import Pipeline
from zipline.pipeline.data.testing import TestingDataSet as TDS
from zipline.testing import chrange, temp_pipeline_engine
@@ -35,6 +36,21 @@ class LatestTestCase(TestCase):
def expected_latest(self, column, slice_):
loader = self.engine.get_loader(column)
index = self.calendar[slice_]
columns = self.assets
values = loader.values(column.dtype, self.calendar, self.sids)[slice_]
if column.dtype.kind in ('O', 'S', 'U'):
# For string columns, we expect a categorical in the output.
return LabelArray(
values,
missing_value=column.missing_value,
).as_categorical_frame(
index=index,
columns=columns,
)
return DataFrame(
loader.values(column.dtype, self.calendar, self.sids)[slice_],
index=self.calendar[slice_],
@@ -55,6 +71,6 @@ class LatestTestCase(TestCase):
dates_to_test[-1],
)
for column in columns:
float_result = result[column.name].unstack()
expected_float_result = self.expected_latest(column, cal_slice)
assert_frame_equal(float_result, expected_float_result)
col_result = result[column.name].unstack()
expected_col_result = self.expected_latest(column, cal_slice)
assert_frame_equal(col_result, expected_col_result)
+2 -1
View File
@@ -471,7 +471,8 @@ class ObjectIdentityTestCase(TestCase):
for column in TestingDataSet.columns:
if column.dtype == bool_dtype:
self.assertIsInstance(column.latest, Filter)
elif column.dtype == int64_dtype:
elif (column.dtype == int64_dtype
or column.dtype.kind in ('O', 'S', 'U')):
self.assertIsInstance(column.latest, Classifier)
elif column.dtype in factor_dtypes:
self.assertIsInstance(column.latest, Factor)
+151
View File
@@ -0,0 +1,151 @@
from itertools import product
from operator import eq, ne
import numpy as np
from zipline.lib.labelarray import LabelArray
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
def rotN(l, N):
"""
Rotate a list of elements.
Pulls N elements off the end of the list and appends them to the front.
>>> rotN(['a', 'b', 'c', 'd'], 2)
['c', 'd', 'a', 'b']
>>> rotN(['a', 'b', 'c', 'd'], 3)
['d', 'a', 'b', 'c']
"""
assert len(l) >= N, "Can't rotate list by longer than its length."
return l[N:] + l[:N]
class LabelArrayTestCase(ZiplineTestCase):
@classmethod
def init_class_fixtures(cls):
super(LabelArrayTestCase, cls).init_class_fixtures()
cls.rowvalues = row = ['', 'a', 'b', 'ab', 'a', '', 'b', 'ab', 'z']
cls.strs = np.array([rotN(row, i) for i in range(3)])
def test_fail_on_direct_construction(self):
# See http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray # noqa
with self.assertRaises(TypeError) as e:
np.ndarray.__new__(LabelArray, (5, 5))
self.assertEqual(
str(e.exception),
"Direct construction of LabelArrays is not supported."
)
@parameter_space(
__fail_fast=True,
s=['', 'a', 'z', 'aa', 'not in the array'],
shape=[(27,), (9, 3), (3, 9), (3, 3, 3)],
array_astype=(bytes, unicode, object),
scalar_astype=(bytes, unicode, object),
)
def test_compare_to_str(self, s, shape, array_astype, scalar_astype):
strs = self.strs.reshape(shape).astype(array_astype)
arr = LabelArray(strs, missing_value='')
check_arrays(strs == s, arr == s)
check_arrays(strs != s, arr != s)
np_startswith = np.vectorize(lambda elem: elem.startswith(s))
check_arrays(arr.startswith(s), np_startswith(strs))
np_endswith = np.vectorize(lambda elem: elem.endswith(s))
check_arrays(arr.endswith(s), np_endswith(strs))
np_contains = np.vectorize(lambda elem: s in elem)
check_arrays(arr.contains(s), np_contains(strs))
def test_compare_to_str_array(self):
strs = self.strs
shape = strs.shape
arr = LabelArray(strs, missing_value='')
check_arrays(strs == arr, np.full_like(strs, True, dtype=bool))
check_arrays(strs != arr, np.full_like(strs, False, dtype=bool))
def broadcastable_row(value, dtype):
return np.full((shape[0], 1), value, dtype=strs.dtype)
def broadcastable_col(value, dtype):
return np.full((1, shape[1]), value, dtype=strs.dtype)
for comparator, dtype, value in product((eq, ne),
(bytes, unicode, object),
set(self.rowvalues)):
check_arrays(
comparator(arr, np.full_like(strs, value)),
comparator(strs, value),
)
check_arrays(
comparator(arr, broadcastable_row(value, dtype=dtype)),
comparator(strs, value),
)
check_arrays(
comparator(arr, broadcastable_col(value, dtype=dtype)),
comparator(strs, value),
)
@parameter_space(
__fail_fast=True,
slice_=[
0, 1, -1,
slice(None),
slice(0, 0),
slice(0, 3),
slice(1, 4),
slice(0),
slice(None, 1),
slice(0, 4, 2),
(slice(None), 1),
(slice(None), slice(None)),
(slice(None), slice(1, 2)),
]
)
def test_slicing_preserves_attributes(self, slice_):
arr = LabelArray(self.strs.reshape((9, 3)), missing_value='')
sliced = arr[slice_]
self.assertIsInstance(sliced, LabelArray)
self.assertIs(sliced.categories, arr.categories)
self.assertIs(sliced.reverse_categories, arr.reverse_categories)
self.assertIs(sliced.missing_value, arr.missing_value)
def test_infer_categories(self):
arr1d = LabelArray(self.strs, missing_value='')
codes1d = arr1d.as_int_array()
self.assertEqual(arr1d.shape, self.strs.shape)
self.assertEqual(arr1d.shape, codes1d.shape)
categories = arr1d.categories
unique_rowvalues = set(self.rowvalues)
# There should be an entry in categories for each unique row value, and
# each integer stored in the data array should be an index into
# categories.
self.assertEqual(list(categories), sorted(set(self.rowvalues)))
self.assertEqual(
set(codes1d.ravel()),
set(range(len(unique_rowvalues)))
)
for idx, value in enumerate(arr1d.categories):
check_arrays(
self.strs == value,
arr1d.view(type=np.ndarray) == idx,
)
for shape in (9, 3), (3, 9), (3, 3, 3):
strs2d = self.strs.reshape(shape)
arr2d = LabelArray(strs2d, missing_value='')
codes2d = arr2d.as_int_array()
self.assertEqual(arr2d.shape, shape)
check_arrays(arr2d.categories, categories)
for idx, value in enumerate(arr2d.categories):
check_arrays(strs2d == value, codes2d == idx)
+1
View File
@@ -665,6 +665,7 @@ class TradingAlgorithm(object):
new_assets = tuple(new_assets)
new_sids = tuple(new_sids)
new_symbols = tuple(new_symbols)
number_of_kinds_of_new_things = (
sum((bool(new_assets), bool(new_sids), bool(new_symbols)))
)
+5 -4
View File
@@ -19,7 +19,7 @@ from abc import (
)
from cachetools import LRUCache
from numpy import dtype, around, hstack
from numpy import around, hstack
from pandas.tslib import normalize_date
from six import with_metaclass
@@ -28,6 +28,7 @@ from zipline.lib._float64window import AdjustedArrayWindow as Float64Window
from zipline.lib.adjustment import Float64Multiply
from zipline.utils.cache import ExpiringCache
from zipline.utils.memoize import lazyval
from zipline.utils.numpy_utils import float64_dtype
class SlidingWindow(object):
@@ -237,9 +238,9 @@ class USEquityHistoryLoader(with_metaclass(ABCMeta)):
prefetch_dts = cal[start_ix:prefetch_end_ix + 1]
prefetch_len = len(prefetch_dts)
array = self._array(prefetch_dts, needed_assets, field)
view_kwargs = {}
if field == 'volume':
array = array.astype('float64')
dtype_ = dtype('float64')
array = array.astype(float64_dtype)
for i, asset in enumerate(needed_assets):
if self._adjustments_reader:
@@ -249,7 +250,7 @@ class USEquityHistoryLoader(with_metaclass(ABCMeta)):
adjs = {}
window = Float64Window(
array[:, i].reshape(prefetch_len, 1),
dtype_,
view_kwargs,
adjs,
offset,
size
+108
View File
@@ -0,0 +1,108 @@
"""
Factorization algorithms.
"""
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
from numpy import arange, asarray, empty, int64, isnan, ndarray
import_array()
cpdef factorize_strings_known_categories(ndarray[object] values,
list categories,
object missing_value,
int sort):
"""
Factorize an array whose categories are already known.
Any entries not in the specified categories will be given the code for
`missing_value`.
"""
if missing_value not in categories:
categories.append(missing_value)
if sort:
categories = sorted(categories)
cdef:
Py_ssize_t nvalues = len(values)
dict reverse_categories = dict(
zip(categories, range(len(categories)))
)
if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
)
cdef:
Py_ssize_t i
Py_ssize_t missing_code = reverse_categories[missing_value]
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
for i in range(nvalues):
codes[i] = reverse_categories.get(values[i], missing_code)
return codes, asarray(categories, dtype=object), reverse_categories
cpdef factorize_strings(ndarray[object] values,
object missing_value,
int sort):
"""
Factorize an array of (possibly duplicated) labels into an array of indices
into a unique array of labels.
This is ~30% faster than pandas.factorize, at the cost of not having
special treatment for NaN, which we don't care about because we only
support arrays of strings.
(Though it's faster even if you throw in the nan checks that pandas does,
because we're using dict and list instead of PyObjectHashTable and
ObjectVector. Python's builtin data structures are **really**
well-optimized.)
"""
cdef:
Py_ssize_t nvalues = len(values)
list categories = [missing_value]
dict reverse_categories = {missing_value: 0}
# Short circuit on empty array.
if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
)
cdef:
Py_ssize_t i, code
object key = None
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
for i in range(nvalues):
key = values[i]
code = reverse_categories.get(key, -1)
if code == -1:
# Assign new code.
code = len(reverse_categories)
reverse_categories[key] = code
categories.append(key)
codes[i] = code
cdef ndarray[int64_t, ndim=1] sorter
cdef ndarray[int64_t, ndim=1] reverse_indexer
cdef int ncategories
cdef ndarray[object] categories_array = asarray(categories, dtype=object)
if sort:
# This is all taken from pandas.core.algorithms.factorize.
ncategories = len(categories_array)
sorter = categories_array.argsort()
reverse_indexer = empty(ncategories, dtype=int64)
reverse_indexer.put(sorter, arange(ncategories))
codes = reverse_indexer.take(codes)
categories_array = categories_array.take(sorter)
reverse_categories = dict(zip(categories_array, range(ncategories)))
return codes, categories_array, reverse_categories
+3 -1
View File
@@ -1,5 +1,7 @@
"""
float specialization of AdjustedArrayWindow
"""
from numpy cimport float64_t as ctype
from numpy cimport float64_t
ctypedef float64_t[:, :] databuffer
include "_windowtemplate.pxi"
+4 -1
View File
@@ -1,5 +1,8 @@
"""
datetime specialization of AdjustedArrayWindow
"""
from numpy cimport int64_t as ctype
from numpy cimport int64_t
ctypedef int64_t[:, :] databuffer
include "_windowtemplate.pxi"
+6
View File
@@ -0,0 +1,6 @@
"""
AdjustedArrayWindow type used for LabelArrays.
"""
ctypedef object databuffer
include "_windowtemplate.pxi"
+4 -1
View File
@@ -1,5 +1,8 @@
"""
bool specialization of AdjustedArrayWindow
"""
from numpy cimport uint8_t as ctype
from numpy cimport uint8_t
ctypedef uint8_t[:, :] databuffer
include "_windowtemplate.pxi"
+18 -14
View File
@@ -2,8 +2,8 @@
Template for AdjustedArray windowed iterators.
This file is intended to be used by inserting it via a Cython include into a
file that's define a type symbol named `ctype` and string constant named
`dtype`.
file that's defined a type symbol named `databuffer` that can be used like a
2-D numpy array.
See Also
--------
@@ -12,9 +12,7 @@ zipline.lib._intwindow
zipline.lib._datewindow
"""
from numpy cimport ndarray
from numpy import asarray
ctypedef ctype[:, :] databuffer
from numpy import asanyarray
cdef class AdjustedArrayWindow:
@@ -33,8 +31,8 @@ cdef class AdjustedArrayWindow:
"""
cdef:
# ctype must be defined by the file into which this is being copied.
databuffer data
object viewtype
readonly databuffer data
readonly dict view_kwargs
readonly Py_ssize_t window_length
Py_ssize_t anchor, next_anchor, max_anchor, next_adj
dict adjustments
@@ -43,13 +41,13 @@ cdef class AdjustedArrayWindow:
def __cinit__(self,
databuffer data not None,
object viewtype not None,
dict view_kwargs not None,
dict adjustments not None,
Py_ssize_t offset,
Py_ssize_t window_length):
self.data = data
self.viewtype = viewtype
self.view_kwargs = view_kwargs
self.adjustments = adjustments
self.adjustment_indices = sorted(adjustments, reverse=True)
self.window_length = window_length
@@ -74,9 +72,10 @@ cdef class AdjustedArrayWindow:
def __next__(self):
cdef:
ndarray out
object adjustment
ndarray out
Py_ssize_t start, anchor
dict view_kwargs
anchor = self.anchor = self.next_anchor
if anchor > self.max_anchor:
@@ -93,7 +92,13 @@ cdef class AdjustedArrayWindow:
self.next_adj = self.pop_next_adj()
start = anchor - self.window_length
out = asarray(self.data[start:self.anchor]).view(self.viewtype)
# If our data is a custom subclass of ndarray, preserve that subclass
# by using asanyarray instead of asarray.
out = asanyarray(self.data[start:self.anchor])
view_kwargs = self.view_kwargs
if view_kwargs:
out = out.view(**view_kwargs)
out.setflags(write=False)
self.next_anchor = self.anchor + 1
@@ -101,8 +106,7 @@ cdef class AdjustedArrayWindow:
return out
def seek(self, target_anchor):
cdef:
ndarray out
cdef ndarray out = None
if target_anchor < self.anchor:
raise Exception('Can not access data after window has passed.')
@@ -122,5 +126,5 @@ cdef class AdjustedArrayWindow:
self.window_length,
self.anchor,
self.max_anchor,
self.viewtype,
self.view_kwargs.get('dtype'),
)
+36 -17
View File
@@ -17,6 +17,8 @@ from zipline.errors import (
WindowLengthNotPositive,
WindowLengthTooLong,
)
from zipline.lib.labelarray import LabelArray
from zipline.utils.compat import unicode
from zipline.utils.numpy_utils import (
datetime64ns_dtype,
float64_dtype,
@@ -28,8 +30,10 @@ from zipline.utils.memoize import lazyval
# These class names are all the same because of our bootleg templating system.
from ._float64window import AdjustedArrayWindow as Float64Window
from ._int64window import AdjustedArrayWindow as Int64Window
from ._labelwindow import AdjustedArrayWindow as LabelWindow
from ._uint8window import AdjustedArrayWindow as UInt8Window
NOMASK = None
BOOL_DTYPES = frozenset(
map(dtype, [bool_]),
@@ -44,16 +48,20 @@ INT_DTYPES = frozenset(
DATETIME_DTYPES = frozenset(
map(dtype, ['datetime64[ns]', 'datetime64[D]']),
)
# We use object arrays for strings.
CATEGORICAL_DTYPES = frozenset(map(dtype, ['O']))
REPRESENTABLE_DTYPES = BOOL_DTYPES.union(
FLOAT_DTYPES,
INT_DTYPES,
DATETIME_DTYPES
DATETIME_DTYPES,
CATEGORICAL_DTYPES,
)
def can_represent_dtype(dtype):
"""
Can we build an AdjustedArray for a baseline of dtype ``dtype``?
Can we build an AdjustedArray for a baseline of `dtype``?
"""
return dtype in REPRESENTABLE_DTYPES
@@ -65,11 +73,11 @@ CONCRETE_WINDOW_TYPES = {
}
def _normalize_array(data):
def _normalize_array(data, missing_value):
"""
Coerce buffer data for an AdjustedArray into a standard scalar
representation, returning the coerced array and a numpy dtype object to use
as a view type when providing public view into the data.
representation, returning the coerced array and a dict of argument to pass
to np.view to use when providing a user-facing view of the underlying data.
- float* data is coerced to float64 with viewtype float64.
- int32, int64, and uint32 are converted to int64 with viewtype int64.
@@ -82,19 +90,29 @@ def _normalize_array(data):
Returns
-------
coerced, viewtype : (np.ndarray, np.dtype)
coerced, view_kwargs : (np.ndarray, np.dtype)
"""
if isinstance(data, LabelArray):
return data, {}
data_dtype = data.dtype
if data_dtype == bool_:
return data.astype(uint8), dtype(bool_)
return data.astype(uint8), {'dtype': dtype(bool_)}
elif data_dtype in FLOAT_DTYPES:
return data.astype(float64), dtype(float64)
return data.astype(float64), {'dtype': dtype(float64)}
elif data_dtype in INT_DTYPES:
return data.astype(int64), dtype(int64)
elif data_dtype.name.startswith('datetime'):
return data.astype(int64), {'dtype': dtype(int64)}
elif data_dtype in CATEGORICAL_DTYPES:
if not isinstance(missing_value, (bytes, unicode)):
raise TypeError(
"Invalid missing_value for categorical array.\n"
"Expected a string, got %r" % missing_value,
)
return LabelArray(data, missing_value), {}
elif data_dtype.kind == 'M':
try:
outarray = data.astype('datetime64[ns]').view('int64')
return outarray, datetime64ns_dtype
return outarray, {'dtype': datetime64ns_dtype}
except OverflowError:
raise ValueError(
"AdjustedArray received a datetime array "
@@ -127,18 +145,17 @@ class AdjustedArray(object):
missing_value : object
A value to use to fill missing data in yielded windows.
Should be a value coercible to `data.dtype`.
"""
__slots__ = (
'_data',
'_viewtype',
'_view_kwargs',
'adjustments',
'missing_value',
'__weakref__',
)
def __init__(self, data, mask, adjustments, missing_value):
self._data, self._viewtype = _normalize_array(data)
self._data, self._view_kwargs = _normalize_array(data, missing_value)
self.adjustments = adjustments
self.missing_value = missing_value
@@ -158,20 +175,22 @@ class AdjustedArray(object):
"""
The data stored in this array.
"""
return self._data.view(self._viewtype)
return self._data.view(**self._view_kwargs)
@lazyval
def dtype(self):
"""
The dtype of the data stored in this array.
"""
return self._viewtype
return self._view_kwargs.get('dtype') or self._data.dtype
@lazyval
def _iterator_type(self):
"""
The iterator produced when `traverse` is called on this Array.
"""
if isinstance(self._data, LabelArray):
return LabelWindow
return CONCRETE_WINDOW_TYPES[self._data.dtype]
def traverse(self, window_length, offset=0):
@@ -190,7 +209,7 @@ class AdjustedArray(object):
_check_window_params(data, window_length)
return self._iterator_type(
data,
self._viewtype,
self._view_kwargs,
self.adjustments,
offset,
window_length,
+60 -4
View File
@@ -316,12 +316,13 @@ cdef class Float64Multiply(Float64Adjustment):
cpdef mutate(self, float64_t[:, :] data):
cdef Py_ssize_t row, col
cdef float64_t value = self.value
# last_col + 1 because last_col should also be affected.
for col in range(self.first_col, self.last_col + 1):
# last_row + 1 because last_row should also be affected.
for row in range(self.first_row, self.last_row + 1):
data[row, col] *= self.value
data[row, col] *= value
cdef class Float64Overwrite(Float64Adjustment):
@@ -354,12 +355,13 @@ cdef class Float64Overwrite(Float64Adjustment):
cpdef mutate(self, float64_t[:, :] data):
cdef Py_ssize_t row, col
cdef float64_t value = self.value
# last_col + 1 because last_col should also be affected.
for col in range(self.first_col, self.last_col + 1):
# last_row + 1 because last_row should also be affected.
for row in range(self.first_row, self.last_row + 1):
data[row, col] = self.value
data[row, col] = value
cdef class Float64Add(Float64Adjustment):
@@ -392,12 +394,13 @@ cdef class Float64Add(Float64Adjustment):
cpdef mutate(self, float64_t[:, :] data):
cdef Py_ssize_t row, col
cdef float64_t value = self.value
# last_col + 1 because last_col should also be affected.
for col in range(self.first_col, self.last_col + 1):
# last_row + 1 because last_row should also be affected.
for row in range(self.first_row, self.last_row + 1):
data[row, col] += self.value
data[row, col] += value
cdef class _Int64Adjustment(Adjustment):
@@ -530,9 +533,62 @@ cdef class Datetime64Overwrite(Datetime64Adjustment):
"""
cpdef mutate(self, int64_t[:, :] data):
cdef Py_ssize_t row, col
cdef int64_t value = self.value
# last_col + 1 because last_col should also be affected.
for col in range(self.first_col, self.last_col + 1):
# last_row + 1 because last_row should also be affected.
for row in range(self.first_row, self.last_row + 1):
data[row, col] = self.value
data[row, col] = value
cdef class _ObjectAdjustment(Adjustment):
"""
Base class for adjustments that operate on arbitrary objects.
We use only this for categorical data, where our data buffer is an array of
indices into an array of unique Python string objects.
"""
cdef:
readonly object value
def __init__(self,
Py_ssize_t first_row,
Py_ssize_t last_row,
Py_ssize_t first_col,
Py_ssize_t last_col,
object value):
super(Adjustment, self).__init__(
first_row=first_row,
last_row=last_row,
first_col=first_col,
last_col=last_col,
)
self.value = value
def __repr__(self):
return (
"%s(first_row=%d, last_row=%d,"
" first_col=%d, last_col=%d, value=%r)" % (
type(self).__name__,
self.first_row,
self.last_row,
self.first_col,
self.last_col,
self.value,
)
)
cdef class ObjectOverwrite(_ObjectAdjustment):
cpdef mutate(self, object data):
# data is an object here because this is intended to be used with a
# `zipline.lib.LabelArray`.
cdef Py_ssize_t row, col
cdef object value = self.value
# We don't do this in a loop because we only want to look up the label
# code in the array's categories once.
data[self.first_row:self.last_row + 1,
self.first_col:self.last_col + 1] = self.value
+482
View File
@@ -0,0 +1,482 @@
"""
An ndarray subclass for working with arrays of strings.
"""
from functools import partial
from numbers import Number
from operator import eq, ne
import re
import numpy as np
from numpy import ndarray
import pandas as pd
from toolz import compose
from zipline.utils.preprocess import preprocess
from zipline.utils.sentinel import sentinel
from zipline.utils.input_validation import (
coerce,
expect_kinds,
expect_types,
optional,
)
from zipline.utils.numpy_utils import is_object, int64_dtype
from ._factorize import (
factorize_strings,
factorize_strings_known_categories,
)
def compare_arrays(left, right):
"Eq check with a short-circuit for identical objects."
return (
left is right
or ((left.shape == right.shape) and (left == right).all())
)
def _make_unsupported_method(name):
def method(*args, **kwargs):
raise NotImplementedError(
"Method %s is not supported on LabelArrays." % name
)
method.__name__ = name
method.__doc__ = "Unsupported LabelArray Method: %s" % name
return method
class CategoryMismatch(ValueError):
"""
Error raised on attempt to perform operations between LabelArrays with
mismatched category arrays.
"""
def __init__(self, left, right):
(mismatches,) = np.where(left != right)
assert len(mismatches), "Not actually a mismatch!"
super(CategoryMismatch, self).__init__(
"LabelArray categories don't match:\n"
"Mismatched Indices: {mismatches}\n"
"Left: {left}\n"
"Right: {right}".format(
mismatches=mismatches,
left=left[mismatches],
right=right[mismatches],
)
)
_NotPassed = sentinel('_NotPassed')
class LabelArray(ndarray):
"""
An ndarray subclass for working with arrays of strings.
Factorizes the input array into integers, but overloads equality on strings
to check against the factor label.
See Also
--------
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
"""
@preprocess(
values=coerce(list, partial(np.asarray, dtype=object)),
)
@expect_types(
values=np.ndarray,
categories=optional(list),
)
@expect_kinds(values=("O", "S", "U"))
def __new__(cls,
values,
missing_value,
categories=None,
sort=True):
# Numpy's fixed-width string types aren't very efficient. Working with
# object arrays is faster than bytes or unicode arrays in almost all
# cases.
if not is_object(values):
values = values.astype(object)
if categories is None:
codes, categories, reverse_categories = factorize_strings(
values.ravel(),
missing_value=missing_value,
sort=sort,
)
else:
codes, categories, reverse_categories = (
factorize_strings_known_categories(
values.ravel(),
categories=categories,
missing_value=missing_value,
sort=sort,
)
)
categories.setflags(write=False)
ret = codes.reshape(values.shape).view(type=cls)
ret._categories = categories
ret._reverse_categories = reverse_categories
ret._missing_value = missing_value
return ret
@property
def categories(self):
# This is a property because it should be immutable.
return self._categories
@property
def reverse_categories(self):
# This is a property because it should be immutable.
return self._reverse_categories
@property
def missing_value(self):
# This is a property because it should be immutable.
return self._missing_value
def __array_finalize__(self, obj):
"""
Called by Numpy after array construction.
There are three cases where this can happen:
1. Someone tries to directly construct a new array by doing::
>>> ndarray.__new__(LabelArray, ...)
In this case, obj will be None. We treat this as an error case and
fail.
2. Someone (most likely our own __new__) calls
other_array.view(type=LabelArray).
In this case, `self` will be the new LabelArray instance, and
``obj` will be the array on which ``view`` is being called.
The caller of ``obj.view`` is responsible for copying setting
category metadata on ``self`` after we exit.
3. Someone creates a new LabelArray by slicing an existing one.
In this case, ``obj`` will be the original LabelArray. We're
responsible for copying over the parent array's category metadata.
"""
if obj is None:
raise TypeError(
"Direct construction of LabelArrays is not supported."
)
# See docstring for an explanation of when these will or will not be
# set.
self._categories = getattr(obj, 'categories', None)
self._reverse_categories = getattr(obj, 'reverse_categories', None)
self._missing_value = getattr(obj, 'missing_value', None)
def __array_wrap__(self, obj, context=None):
"""
Called by numpy after completion of a ufunc.
We coerce back into a vanilla ndarray if our dtype changed, since that
indicates that our categories are no longer meaningful.
"""
if obj.dtype != self.dtype:
return obj.view(type=np.ndarray)
return obj
def as_int_array(self):
"""
Convert self into a regular ndarray of ints.
This is an O(1) operation. It does not copy the underlying data.
"""
return self.view(type=ndarray)
def as_string_array(self):
"""
Convert self back into an array of strings.
This is an O(N) operation.
"""
return self.categories[self]
def as_categorical(self, name=None):
"""
Coerce self into a pandas categorical.
This is only defined on 1D arrays, since that's all pandas supports.
"""
if len(self.shape) > 1:
raise ValueError("Can't convert a 2D array to a categorical.")
return pd.Categorical.from_codes(
self.as_int_array(),
self.categories,
ordered=False,
name=name,
)
def as_categorical_frame(self, index, columns, name=None):
"""
Coerce self into a pandas DataFrame of Categoricals.
"""
if len(self.shape) != 2:
raise ValueError(
"Can't convert a non-2D LabelArray into a DataFrame."
)
expected_shape = (len(index), len(columns))
if expected_shape != self.shape:
raise ValueError(
"Can't construct a DataFrame with provided indices:\n\n"
"LabelArray shape is {actual}, but index and columns imply "
"that shape should be {expected}.".format(
actual=self.shape,
expected=expected_shape,
)
)
return pd.Series(
index=pd.MultiIndex.from_product([index, columns]),
data=self.ravel().as_categorical(name=name),
).unstack()
def __setitem__(self, indexer, value):
self_categories = self.categories
if isinstance(value, LabelArray):
value_categories = value.categories
if compare_arrays(self_categories, value_categories):
return super(LabelArray, self).__setitem__(indexer, value)
else:
raise CategoryMismatch(self_categories, value_categories)
elif isinstance(value, (bytes, unicode)):
value_code = self.reverse_categories.get(value, None)
if value_code is None:
raise ValueError("%r is not in LabelArray categories." % value)
return super(LabelArray, self).__setitem__(indexer, value_code)
else:
raise NotImplementedError(
"Setting into a LabelArray with a value of "
"type {type} is not yet supported.".format(
type=type(value).__name__,
),
)
def _equality_check(op):
"""
Shared code for __eq__ and __ne__, parameterized on the actual
comparison operator to use.
"""
# What value should we return if we compare against a value not in our
# categories?
if op is eq:
COMPARE_TO_UNKNOWN = False
elif op is ne:
COMPARE_TO_UNKNOWN = True
else:
raise AssertionError("_make_equality_check called with %s" % op)
def method(self, other):
self_categories = self.categories
if isinstance(other, LabelArray):
other_categories = other.categories
if compare_arrays(self_categories, other_categories):
return op(self.as_int_array(), other.as_int_array())
else:
raise CategoryMismatch(self_categories, other_categories)
elif isinstance(other, ndarray):
# Compare to ndarrays as though we were an array of strings.
# This is fairly expensive, and should generally be avoided.
return op(self.as_string_array(), other)
elif isinstance(other, (bytes, unicode)):
i = self._reverse_categories.get(other, None)
if i is None:
# Requested string isn't in our categories. Short circuit.
# This isn't full_like because that would try to return a
# LabelArray.
return np.full(self.shape, COMPARE_TO_UNKNOWN, dtype=bool)
return op(self.as_int_array(), i)
elif isinstance(other, Number):
return NotImplemented
return op(super(LabelArray, self), other)
return method
__eq__ = _equality_check(eq)
__ne__ = _equality_check(ne)
del _equality_check
def view(self, dtype=_NotPassed, type=_NotPassed):
if type is _NotPassed and dtype not in (_NotPassed, self.dtype):
raise TypeError("Can't view LabelArray as another dtype.")
# The text signature on ndarray.view makes it look like the default
# values for dtype and type are `None`, but passing None explicitly has
# different semantics than not passing an arg at all, so we reconstruct
# the kwargs dict here to simulate the args not being passed at all.
kwargs = {}
if dtype is not _NotPassed:
kwargs['dtype'] = dtype
if type is not _NotPassed:
kwargs['type'] = type
return super(LabelArray, self).view(**kwargs)
# In general, we support resizing, slicing, and reshaping methods, but not
# numeric methods.
SUPPORTED_NDARRAY_METHODS = frozenset([
'base',
'byteswap',
'compress',
'copy',
'data',
'diagonal',
'dtype',
'flat',
'flatten',
'item',
'itemset',
'itemsize',
'nbytes',
'ndim',
'newbyteorder',
'ravel',
'repeat',
'reshape',
'resize',
'setflags',
'shape',
'size',
'squeeze',
'strides',
'swapaxes',
'take',
'trace',
'transpose',
'view'
])
PUBLIC_NDARRAY_METHODS = frozenset([
s for s in dir(ndarray) if not s.startswith('_')
])
# Generate failing wrappers for all unsupported methods.
locals().update(
{
method: _make_unsupported_method(method)
for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS
}
)
def __repr__(self):
# This happens if you call a ufunc on a LabelArray that changes the
# dtype. This is generally an indicator that the array has been used
# incorrectly, and it means we're no longer valid for anything.
if self.dtype != int64_dtype:
return "Invalid LabelArray: dtype={}, shape={}".format(
self.dtype, self.shape
)
repr_lines = repr(self.as_string_array()).splitlines()
repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1)
repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')'
# The extra spaces here account for the difference in length between
# 'array(' and 'LabelArray('.
return '\n '.join(repr_lines)
def empty_like(self, shape):
"""
Make an empty LabelArray with the same categories as ``self``, filled
with ``self.missing_value``.
"""
out = np.full(
shape,
self.reverse_categories[self.missing_value],
dtype=self.dtype
).view(
type=type(self)
)
out._categories = self.categories
out._reverse_categories = self.reverse_categories
out._missing_value = self.missing_value
return out
def apply(self, f, dtype):
"""
Map a function elementwise over entries in ``self``.
``f`` will be applied exactly once to each unique value in ``self``.
"""
return np.vectorize(f, otypes=[dtype])(self.categories)[self]
def startswith(self, prefix):
"""
Element-wise startswith.
Parameters
----------
prefix : str
Returns
-------
matches : np.ndarray[bool]
An array with the same shape as self indicating whether each
element of self started with ``prefix``.
"""
return self.apply(lambda elem: elem.startswith(prefix), dtype=bool)
def endswith(self, suffix):
"""
Elementwise endswith.
Parameters
----------
suffix : str
Returns
-------
matches : np.ndarray[bool]
An array with the same shape as self indicating whether each
element of self ended with ``suffix``.w
"""
return self.apply(lambda elem: elem.endswith(suffix), dtype=bool)
def contains(self, substring):
"""
Elementwise contains.
Parameters
----------
substring : str
Returns
-------
matches : np.ndarray[bool]
An array with the same shape as self indicating whether each
element of self ended with ``suffix``.
"""
return self.apply(lambda elem: substring in elem, dtype=bool)
@preprocess(pattern=re.compile)
def matches(self, pattern):
"""
Elementwise regex match.
Parameters
----------
pattern : str or compiled regex
Returns
-------
matches : np.ndarray[bool]
An array with the same shape as self indicating whether each
element of self was matched by ``pattern``.
"""
return self.apply(compose(bool, pattern.match))
+49
View File
@@ -0,0 +1,49 @@
"""
Utilities for creating public APIs (e.g. argument validation decorators).
"""
from zipline.utils.input_validation import preprocess
def restrict_to_dtype(dtype, message_template):
"""
A factory for decorators that restrict Term methods to only be callable on
Terms with a specific dtype.
This is conceptually similar to
zipline.utils.input_validation.expect_dtypes, but provides more flexibility
for providing error messages that are specifically targeting Term methods.
Parameters
----------
dtype : numpy.dtype
The dtype on which the decorated method may be called.
message_template : str
A template for the error message to be raised.
`message_template.format` will be called with keyword arguments
`method_name`, `expected_dtype`, and `received_dtype`.
Usage
-----
@restrict_to_dtype(
dtype=float64_dtype,
message_template=(
"{method_name}() was called on a factor of dtype {received_dtype}."
"{method_name}() requires factors of dtype{expected_dtype}."
),
)
def some_factor_method(self, ...):
self.stuff_that_requires_being_float64(...)
"""
def processor(term_method, _, term_instance):
term_dtype = term_instance.dtype
if term_dtype != dtype:
raise TypeError(
message_template.format(
method_name=term_method.__name__,
expected_dtype=dtype.name,
received_dtype=term_dtype,
)
)
return term_instance
return preprocess(self=processor)
+169 -20
View File
@@ -1,16 +1,24 @@
"""
classifier.py
"""
from functools import wraps
from numbers import Number
import operator
from numpy import where, isnan, nan, zeros
from zipline.lib.labelarray import LabelArray
from zipline.lib.quantiles import quantiles
from zipline.pipeline.term import ComputableTerm
from zipline.pipeline.api_utils import restrict_to_dtype
from zipline.pipeline.term import ComputableTerm, NotSpecified
from zipline.utils.compat import unicode
from zipline.utils.input_validation import expect_types
from zipline.utils.numpy_utils import int64_dtype
from zipline.utils.numpy_utils import (
categorical_dtype,
int64_dtype,
)
from ..filters import NullFilter, NumExprFilter
from ..filters import Filter, NullFilter, NumExprFilter
from ..mixins import (
CustomTermMixin,
LatestMixin,
@@ -20,6 +28,15 @@ from ..mixins import (
)
strings_only = restrict_to_dtype(
dtype=categorical_dtype,
message_template=(
"{method_name}() is only defined on Classifiers producing strings"
" but it was called on a Factor of dtype {received_dtype}."
)
)
class Classifier(RestrictedDTypeMixin, ComputableTerm):
"""
A Pipeline expression computing a categorical output.
@@ -30,7 +47,9 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
indicating that means/standard deviations should be computed on assets for
which the classifier produced the same label.
"""
ALLOWED_DTYPES = (int64_dtype,) # Used by RestrictedDTypeMixin
# Used by RestrictedDTypeMixin
ALLOWED_DTYPES = (int64_dtype, categorical_dtype)
categories = NotSpecified
def isnull(self):
"""
@@ -45,9 +64,8 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
return ~self.isnull()
# We explicitly don't support classifier to classifier comparisons, since
# the numbers likely don't mean the same thing. This may be relaxed in the
# future, but for now we're starting conservatively.
@expect_types(other=Number)
# the stored values likely don't mean the same thing. This may be relaxed
# in the future, but for now we're starting conservatively.
def eq(self, other):
"""
Construct a Filter returning True for asset/date pairs where the output
@@ -58,7 +76,7 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
# certainly not what the user wants.
if other == self.missing_value:
raise ValueError(
"Comparison against self.missing_value ({value}) in"
"Comparison against self.missing_value ({value!r}) in"
" {typename}.eq().\n"
"Missing values have NaN semantics, so the "
"requested comparison would always produce False.\n"
@@ -67,24 +85,87 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
typename=(type(self).__name__),
)
)
return NumExprFilter.create(
"x_0 == {other}".format(other=int(other)),
binds=(self,),
)
@expect_types(other=Number)
if isinstance(other, Number) != (self.dtype == int64_dtype):
raise InvalidClassifierComparison(self, other)
if isinstance(other, Number):
return NumExprFilter.create(
"x_0 == {other}".format(other=int(other)),
binds=(self,),
)
else:
return ScalarStringPredicate(
classifier=self,
op=operator.eq,
compval=other,
)
def __ne__(self, other):
"""
Construct a Filter returning True for asset/date pairs where the output
of ``self`` matches ``other.
"""
return NumExprFilter.create(
"((x_0 != {other}) & (x_0 != {missing}))".format(
other=int(other),
missing=self.missing_value,
),
binds=(self,),
)
if isinstance(other, Number) != (self.dtype == int64_dtype):
raise InvalidClassifierComparison(self, other)
if isinstance(other, Number):
return NumExprFilter.create(
"((x_0 != {other}) & (x_0 != {missing}))".format(
other=int(other),
missing=self.missing_value,
),
binds=(self,),
)
else:
return ScalarStringPredicate(
classifier=self,
op=operator.ne,
compval=other,
)
def _string_predicate(f):
"""
Decorator for converting a function from (LabelArray, str) -> bool
into a Classifier method that returns a ScalarStringPredicate filter.
This mainly exists to avoid replicating shared boilerplate
(e.g. argument type validation).
"""
@wraps(f)
@expect_types(compval=(bytes, unicode))
@strings_only
def method(self, compval):
return ScalarStringPredicate(
classifier=self,
op=f,
compval=compval,
)
return method
@_string_predicate
@expect_types(label_array=LabelArray)
def startswith(label_array, other):
return label_array.startswith(other)
@_string_predicate
@expect_types(label_array=LabelArray)
def endswith(label_array, other):
return label_array.endswith(other)
@_string_predicate
@expect_types(label_array=LabelArray)
def contains(label_array, other):
return label_array.contains(other)
del _string_predicate
def postprocess(self, data):
if self.dtype == int64_dtype:
return data
if not isinstance(data, LabelArray):
raise AssertionError("Expected a LabelArray, got %s." % type(data))
return data.as_categorical()
class Everything(Classifier):
@@ -127,6 +208,50 @@ class Quantiles(SingleInputMixin, Classifier):
return type(self).__name__ + '(%d)' % self.params['bins']
class ScalarStringPredicate(SingleInputMixin, Filter):
"""
A filter that applies a function from (LabelArray, str) -> ndarray[bool].
Examples include ``==, !=, startswith, and endswith``.
This exists because we represent string arrays with
``zipline.lib.LabelArray``s, which numexpr doesn't know about, so we can't
use the generic NumExprFilter implementation here.
"""
window_length = 0
@expect_types(classifier=Classifier, compval=(bytes, unicode))
def __new__(cls, classifier, op, compval):
return super(ScalarStringPredicate, cls).__new__(
ScalarStringPredicate,
compval=compval,
op=op,
inputs=(classifier,),
mask=classifier.mask,
)
def _init(self, op, compval, *args, **kwargs):
self._op = op
self._compval = compval
return super(ScalarStringPredicate, self)._init(*args, **kwargs)
@classmethod
def static_identity(cls, op, compval, *args, **kwargs):
return (
super(ScalarStringPredicate, cls).static_identity(*args, **kwargs),
op,
compval,
)
def _compute(self, arrays, dates, assets, mask):
data = arrays[0]
return (
self._op(data, self._compval)
& (data != self.inputs[0].missing_value)
& mask
)
class CustomClassifier(PositiveWindowLengthMixin, CustomTermMixin, Classifier):
"""
Base class for user-defined Classifiers.
@@ -149,3 +274,27 @@ class Latest(LatestMixin, CustomClassifier):
zipline.pipeline.factors.factor.Latest
zipline.pipeline.filters.filter.Latest
"""
def _allocate_output(self, windows, shape):
"""
Override the default array allocation to produce a LabelArray when we
have a string-like dtype.
"""
if self.dtype == int64_dtype:
return super(Latest, self)._allocate_output(windows, shape)
# This is a little bit of a hack. We might not know what the
# categories for a LabelArray are until it's actually been loaded, so
# we need to look at the underlying data.
return windows[0].data.empty_like(shape)
class InvalidClassifierComparison(TypeError):
def __init__(self, classifier, compval):
super(InvalidClassifierComparison, self).__init__(
"Can't compare classifier of dtype"
" {dtype} to value {value} of type {type}.".format(
dtype=classifier.dtype,
value=compval,
type=type(compval).__name__,
)
)
+17 -12
View File
@@ -14,11 +14,7 @@ from zipline.pipeline.term import (
Term,
)
from zipline.utils.input_validation import ensure_dtype
from zipline.utils.numpy_utils import (
bool_dtype,
int64_dtype,
NoDefaultMissingValue,
)
from zipline.utils.numpy_utils import NoDefaultMissingValue
from zipline.utils.preprocess import preprocess
@@ -26,7 +22,6 @@ class Column(object):
"""
An abstract column of data, not yet associated with a dataset.
"""
@preprocess(dtype=ensure_dtype)
def __init__(self, dtype, missing_value=NotSpecified):
self.dtype = dtype
@@ -164,15 +159,25 @@ class BoundColumn(LoadableTerm):
@property
def latest(self):
if self.dtype == bool_dtype:
from zipline.pipeline.filters import Latest
elif self.dtype == int64_dtype:
from zipline.pipeline.classifiers import Latest
from zipline.pipeline.factors import Factor, Latest as LatestFactor
from zipline.pipeline.filters import Filter, Latest as LatestFilter
from zipline.pipeline.classifiers import (
Classifier,
Latest as LatestClassifier,
)
dtype = self.dtype
if dtype in Filter.ALLOWED_DTYPES:
Latest = LatestFilter
elif dtype in Classifier.ALLOWED_DTYPES:
Latest = LatestClassifier
else:
from zipline.pipeline.factors import Latest
assert dtype in Factor.ALLOWED_DTYPES, "Unknown dtype %s." % dtype
Latest = LatestFactor
return Latest(
inputs=(self,),
dtype=self.dtype,
dtype=dtype,
missing_value=self.missing_value,
)
+7
View File
@@ -7,6 +7,7 @@ zipline.pipeline.data.testing.
from .dataset import Column, DataSet
from zipline.utils.numpy_utils import (
bool_dtype,
categorical_dtype,
float64_dtype,
datetime64ns_dtype,
int64_dtype,
@@ -22,3 +23,9 @@ class TestingDataSet(DataSet):
float_col = Column(dtype=float64_dtype)
datetime_col = Column(dtype=datetime64ns_dtype)
int_col = Column(dtype=int64_dtype, missing_value=0)
categorical_col = Column(dtype=categorical_dtype, missing_value=u'')
categorical_default_NULL = Column(
dtype=categorical_dtype,
missing_value=u'NULL',
)
+25 -9
View File
@@ -51,7 +51,7 @@ class PipelineEngine(with_metaclass(ABCMeta)):
result : pd.DataFrame
A frame of computed results.
The columns `result` correspond will be the computed results of
The columns `result` correspond to the entries of
`pipeline.columns`, which should be a dictionary mapping strings to
instances of `zipline.pipeline.term.Term`.
@@ -165,17 +165,20 @@ class SimplePipelineEngine(object):
root_mask = self._compute_root_mask(start_date, end_date, extra_rows)
dates, assets, root_mask_values = explode(root_mask)
outputs = self.compute_chunk(
results = self.compute_chunk(
graph,
dates,
assets,
initial_workspace={self._root_mask_term: root_mask_values},
)
out_dates = dates[extra_rows:]
screen_values = outputs.pop(screen_name)
return self._to_narrow(outputs, screen_values, out_dates, assets)
return self._to_narrow(
graph.outputs,
results,
results.pop(screen_name),
dates[extra_rows:],
assets,
)
def _compute_root_mask(self, start_date, end_date, extra_rows):
"""
@@ -363,14 +366,16 @@ class SimplePipelineEngine(object):
out[name] = workspace[term][graph_extra_rows[term]:]
return out
def _to_narrow(self, data, mask, dates, assets):
def _to_narrow(self, terms, data, mask, dates, assets):
"""
Convert raw computed pipeline results into a DataFrame for public APIs.
Parameters
----------
terms : dict[str -> Term]
Dict mapping column names to terms.
data : dict[str -> ndarray[ndim=2]]
Dict mapping column names to computed results.
Dict mapping column names to computed results for those names.
mask : ndarray[bool, ndim=2]
Mask array of values to keep.
dates : ndarray[datetime64, ndim=1]
@@ -412,8 +417,18 @@ class SimplePipelineEngine(object):
resolved_assets = array(self._finder.retrieve_all(assets))
dates_kept = repeat_last_axis(dates.values, len(assets))[mask]
assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask]
final_columns = {}
for name in data:
# Each term that computed an output has its postprocess method
# called on the filtered result.
#
# We use this convert LabelArrays into categoricals, among other
# things.
final_columns[name] = terms[name].postprocess(data[name][mask])
return DataFrame(
data={name: arr[mask] for name, arr in iteritems(data)},
data=final_columns,
index=MultiIndex.from_arrays([dates_kept, assets_kept]),
).tz_localize('UTC', level=0)
@@ -423,6 +438,7 @@ class SimplePipelineEngine(object):
"""
root = self._root_mask_term
clsname = type(self).__name__
# Writing this out explicitly so this errors in testing if we change
# the name without updating this line.
compute_chunk_name = self.compute_chunk.__name__
+2 -78
View File
@@ -6,11 +6,11 @@ from operator import attrgetter
from numbers import Number
from numpy import inf, where
from toolz import curry
from zipline.errors import UnknownRankMethod
from zipline.lib.normalize import naive_grouped_rowwise_apply
from zipline.lib.rank import masked_rankdata_2d
from zipline.pipeline.api_utils import restrict_to_dtype
from zipline.pipeline.classifiers import Classifier, Everything, Quantiles
from zipline.pipeline.mixins import (
CustomTermMixin,
@@ -42,6 +42,7 @@ from zipline.pipeline.filters import (
PercentileFilter,
NullFilter,
)
from zipline.utils.functional import with_doc, with_name
from zipline.utils.input_validation import expect_types
from zipline.utils.math_utils import nanmean, nanstd
from zipline.utils.numpy_utils import (
@@ -51,7 +52,6 @@ from zipline.utils.numpy_utils import (
float64_dtype,
int64_dtype,
)
from zipline.utils.preprocess import preprocess
_RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal'])
@@ -81,37 +81,6 @@ def coerce_numbers_to_my_dtype(f):
return method
@curry
def set_attribute(name, value):
"""
Decorator factory for setting attributes on a function.
Doesn't change the behavior of the wrapped function.
Usage
-----
>>> @set_attribute('__name__', 'foo')
... def bar():
... return 3
...
>>> bar()
3
>>> bar.__name__
'foo'
"""
def decorator(f):
setattr(f, name, value)
return f
return decorator
# Decorators for setting the __name__ and __doc__ properties of a decorated
# function.
# Example:
with_name = set_attribute('__name__')
with_doc = set_attribute('__doc__')
def binop_return_type(op):
if is_comparison(op):
return NumExprFilter
@@ -328,51 +297,6 @@ def function_application(func):
return mathfunc
def restrict_to_dtype(dtype, message_template):
"""
A factory for decorators that restricting Factor methods to only be
callable on Factors with a specific dtype.
This is conceptually similar to
zipline.utils.input_validation.expect_dtypes, but provides more flexibility
for providing error messages that are specifically targeting Factor
methods.
Parameters
----------
dtype : numpy.dtype
The dtype on which the decorated method may be called.
message_template : str
A template for the error message to be raised.
`message_template.format` will be called with keyword arguments
`method_name`, `expected_dtype`, and `received_dtype`.
Usage
-----
@restrict_to_dtype(
dtype=float64_dtype,
message_template=(
"{method_name}() was called on a factor of dtype {received_dtype}."
"{method_name}() requires factors of dtype{expected_dtype}."
),
)
def some_factor_method(self, ...):
self.stuff_that_requires_being_float64(...)
"""
def processor(factor_method, _, factor_instance):
factor_dtype = factor_instance.dtype
if factor_dtype != dtype:
raise TypeError(
message_template.format(
method_name=factor_method.__name__,
expected_dtype=dtype.name,
received_dtype=factor_dtype,
)
)
return factor_instance
return preprocess(self=processor)
# Decorators for Factor methods.
if_not_float64_tell_caller_to_use_isnull = restrict_to_dtype(
dtype=float64_dtype,
+5
View File
@@ -27,6 +27,7 @@ from zipline.utils.numpy_utils import (
datetime64ns_dtype,
float64_dtype,
int64_dtype,
object_dtype,
)
@@ -148,6 +149,7 @@ class SeededRandomLoader(PrecomputedLoader):
float64_dtype: self._float_values,
int64_dtype: self._int_values,
bool_dtype: self._bool_values,
object_dtype: self._object_values,
}[dtype](shape)
@property
@@ -191,6 +193,9 @@ class SeededRandomLoader(PrecomputedLoader):
"""
return self.state.randn(*shape) < 0
def _object_values(self, shape):
return self._int_values(shape).astype(str).astype(object)
OHLCV = ('open', 'high', 'low', 'close', 'volume')
OHLC = ('open', 'high', 'low', 'close')
+33 -15
View File
@@ -1,7 +1,7 @@
"""
Mixins classes for use with Filters and Factors.
"""
from numpy import full_like, recarray
from numpy import full, recarray
from zipline.utils.control_flow import nullctx
from zipline.errors import WindowLengthNotPositive, UnsupportedDataType
@@ -51,7 +51,7 @@ class RestrictedDTypeMixin(object):
if self.dtype not in self.ALLOWED_DTYPES:
raise UnsupportedDataType(
typename=type(self.__name__),
typename=type(self).__name__,
dtype=self.dtype,
)
@@ -103,27 +103,45 @@ class CustomTermMixin(object):
"""
raise NotImplementedError()
def _allocate_output(self, windows, shape):
"""
Allocate an output array whose rows should be passed to `self.compute`.
The resulting array must have a shape of ``shape``.
If we have standard outputs (i.e. self.outputs is NotSpecified), the
default is an empty ndarray whose dtype is ``self.dtype``.
If we have an outputs tuple, the default is an empty recarray with
``self.outputs`` as field names. Each field will have dtype
``self.dtype``, the default shape is ``self.shape``.
This can be overridden to control the kind of array constructed
(e.g. to produce a LabelArray instead of an ndarray).
"""
missing_value = self.missing_value
outputs = self.outputs
if outputs is not NotSpecified:
out = recarray(
shape,
formats=[self.dtype.str] * len(outputs),
names=outputs,
)
out[:] = missing_value
else:
out = full(shape, missing_value, dtype=self.dtype)
return out
def _compute(self, windows, dates, assets, mask):
"""
Call the user's `compute` function on each window with a pre-built
output array.
"""
compute = self.compute
missing_value = self.missing_value
params = self.params
outputs = self.outputs
if outputs is not NotSpecified:
out = recarray(
mask.shape,
formats=[self.dtype.str] * len(outputs),
names=outputs,
)
out[:] = missing_value
else:
out = full_like(mask, missing_value, dtype=self.dtype)
out = self._allocate_output(windows, mask.shape)
with self.ctx:
# TODO: Consider pre-filtering columns that are all-nan at each
# time-step?
for idx, date in enumerate(dates):
col_mask = mask[idx]
masked_out = out[idx][col_mask]
+15 -1
View File
@@ -4,7 +4,7 @@ Base class for Filters, Factors and Classifiers
from abc import ABCMeta, abstractproperty
from weakref import WeakValueDictionary
from numpy import dtype as dtype_class
from numpy import dtype as dtype_class, ndarray
from six import with_metaclass
from zipline.errors import (
DTypeNotSpecified,
@@ -16,6 +16,7 @@ from zipline.errors import (
WindowLengthNotSpecified,
)
from zipline.lib.adjusted_array import can_represent_dtype
from zipline.utils.input_validation import expect_types
from zipline.utils.memoize import lazyval
from zipline.utils.numpy_utils import (
bool_dtype,
@@ -476,6 +477,19 @@ class ComputableTerm(Term):
out[self.mask] = 0
return out
@expect_types(data=ndarray)
def postprocess(self, data):
"""
Called with an result of ``self``, unravelled (i.e. 1-dimensional)
after any user-defined screens have been applied.
This is mostly useful for transforming the dtype of an output, e.g., to
convert a LabelArray into a pandas Categorical.
The default implementation is to just return data unchanged.
"""
return data
def __repr__(self):
return (
"{type}({inputs}, window_length={window_length})"
+1
View File
@@ -51,3 +51,4 @@ from .core import ( # noqa
write_bcolz_minute_data,
write_compressed,
)
from .fixtures import ZiplineTestCase # noqa
+11 -2
View File
@@ -1032,7 +1032,7 @@ def temp_pipeline_engine(calendar, sids, random_seed, symbols=None):
yield SimplePipelineEngine(get_loader, calendar, finder)
def parameter_space(**params):
def parameter_space(__fail_fast=False, **params):
"""
Wrapper around subtest that allows passing keywords mapping names to
iterables of values.
@@ -1083,7 +1083,16 @@ def parameter_space(**params):
)
param_sets = product(*(params[name] for name in argnames))
return subtest(param_sets, *argnames)(f)
if __fail_fast:
@wraps(f)
def wrapped(self):
for args in param_sets:
f(self, *args)
return wrapped
else:
return subtest(param_sets, *argnames)(f)
return decorator
+3
View File
@@ -11,6 +11,9 @@ if PY2:
else:
from types import MappingProxyType as mappingproxy
unicode = type(u'')
__all__ = [
'mappingproxy',
]
+86
View File
@@ -56,6 +56,8 @@ def apply(f, *args, **kwargs):
# Alias for use as a class decorator.
instance = apply
from zipline.utils.sentinel import sentinel
def mapall(funcs, seq):
"""
@@ -242,3 +244,87 @@ def unzip(seq, elem_len=None):
if elem_len is None:
raise ValueError("cannot unzip empty sequence without 'elem_len'")
return ((),) * elem_len
_no_default = sentinel('_no_default')
def getattrs(value, attrs, default=_no_default):
"""
Perform a chained application of ``getattr`` on ``value`` with the values
in ``attrs``.
If ``default`` is supplied, return it if any of the attribute lookups fail.
Parameters
----------
value : object
Root of the lookup chain.
attrs : iterable[str]
Sequence of attributes to look up.
default : object, optional
Value to return if any of the lookups fail.
Returns
-------
result : object
Result of the lookup sequence.
Example
-------
>>> class EmptyObject(object):
... pass
...
>>> obj = EmptyObject()
>>> obj.foo = EmptyObject()
>>> obj.foo.bar = "value"
>>> getattrs(obj, ('foo', 'bar'))
'value'
>>> getattrs(obj, ('foo', 'buzz'))
Traceback (most recent call last):
...
AttributeError: 'EmptyObject' object has no attribute 'buzz'
>>> getattrs(obj, ('foo', 'buzz'), 'default')
'default'
"""
try:
for attr in attrs:
value = getattr(value, attr)
except AttributeError:
if default is _no_default:
raise
value = default
return value
@curry
def set_attribute(name, value):
"""
Decorator factory for setting attributes on a function.
Doesn't change the behavior of the wrapped function.
Usage
-----
>>> @set_attribute('__name__', 'foo')
... def bar():
... return 3
...
>>> bar()
3
>>> bar.__name__
'foo'
"""
def decorator(f):
setattr(f, name, value)
return f
return decorator
# Decorators for setting the __name__ and __doc__ properties of a decorated
# function.
# Example:
with_name = set_attribute('__name__')
with_doc = set_attribute('__doc__')
+87 -30
View File
@@ -22,7 +22,8 @@ from six import iteritems, string_types, PY3
from toolz import valmap, complement, compose
import toolz.curried.operator as op
from zipline.utils.preprocess import preprocess
from zipline.utils.functional import getattrs
from zipline.utils.preprocess import call, preprocess
def optionally(preprocessor):
@@ -163,7 +164,7 @@ def ensure_timestamp(func, argname, arg):
)
def expect_dtypes(*_pos, **named):
def expect_dtypes(**named):
"""
Preprocessing decorator that verifies inputs have expected numpy dtypes.
@@ -181,9 +182,6 @@ def expect_dtypes(*_pos, **named):
...
TypeError: foo() expected an argument with dtype 'int64' for argument 'x', but got dtype 'float64' instead. # noqa
"""
if _pos:
raise TypeError("expect_dtypes() only takes keyword arguments.")
for name, type_ in iteritems(named):
if not isinstance(type_, (dtype, tuple)):
raise TypeError(
@@ -193,42 +191,101 @@ def expect_dtypes(*_pos, **named):
)
)
def _expect_dtype(_dtype_or_dtype_tuple):
@preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,)))
def _expect_dtype(dtypes):
"""
Factory for dtype-checking functions that work the @preprocess
Factory for dtype-checking functions that work with the @preprocess
decorator.
"""
# Slightly different messages for dtype and tuple of dtypes.
if isinstance(_dtype_or_dtype_tuple, tuple):
allowed_dtypes = _dtype_or_dtype_tuple
else:
allowed_dtypes = (_dtype_or_dtype_tuple,)
template = (
"%(funcname)s() expected a value with dtype {dtype_str} "
"for argument '%(argname)s', but got %(actual)r instead."
).format(dtype_str=' or '.join(repr(d.name) for d in allowed_dtypes))
def check_dtype(value):
return getattr(value, 'dtype', None) not in allowed_dtypes
def display_bad_value(value):
def error_message(func, argname, value):
# If the bad value has a dtype, but it's wrong, show the dtype
# name.
# name. Otherwise just show the value.
try:
return value.dtype.name
value_to_show = value.dtype.name
except AttributeError:
return value
value_to_show = value
return (
"{funcname}() expected a value with dtype {dtype_str} "
"for argument {argname!r}, but got {value!r} instead."
).format(
funcname=_qualified_name(func),
dtype_str=' or '.join(repr(d.name) for d in dtypes),
argname=argname,
value=value_to_show,
)
return make_check(
exc_type=TypeError,
template=template,
pred=check_dtype,
actual=display_bad_value,
)
def _actual_preprocessor(func, argname, argvalue):
if getattr(argvalue, 'dtype', object()) not in dtypes:
raise TypeError(error_message(func, argname, argvalue))
return argvalue
return _actual_preprocessor
return preprocess(**valmap(_expect_dtype, named))
def expect_kinds(**named):
"""
Preprocessing decorator that verifies inputs have expected dtype kinds.
Usage
-----
>>> from numpy import int64, int32, float32
>>> @expect_kinds(x='i')
... def foo(x):
... return x
...
>>> foo(int64(2))
2
>>> foo(int32(2))
2
>>> foo(float32(2))
Traceback (most recent call last):
...n
TypeError: foo() expected a numpy object of kind 'i' for argument 'x', but got 'f' instead. # noqa
"""
for name, kind in iteritems(named):
if not isinstance(kind, (str, tuple)):
raise TypeError(
"expect_dtype_kinds() expected a string or tuple of strings"
" for argument {name!r}, but got {kind} instead.".format(
name=name, kind=dtype,
)
)
@preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,)))
def _expect_kind(kinds):
"""
Factory for kind-checking functions that work the @preprocess
decorator.
"""
def error_message(func, argname, value):
# If the bad value has a dtype, but it's wrong, show the dtype
# kind. Otherwise just show the value.
try:
value_to_show = value.dtype.kind
except AttributeError:
value_to_show = value
return (
"{funcname}() expected a numpy object of kind {kinds} "
"for argument {argname!r}, but got {value!r} instead."
).format(
funcname=_qualified_name(func),
kinds=' or '.join(map(repr, kinds)),
argname=argname,
value=value_to_show,
)
def _actual_preprocessor(func, argname, argvalue):
if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds:
raise TypeError(error_message(func, argname, argvalue))
return argvalue
return _actual_preprocessor
return preprocess(**valmap(_expect_kind, named))
def expect_types(*_pos, **named):
"""
Preprocessing decorator that verifies inputs have expected types.
+6 -3
View File
@@ -32,6 +32,10 @@ complex128_dtype = dtype('complex128')
datetime64D_dtype = dtype('datetime64[D]')
datetime64ns_dtype = dtype('datetime64[ns]')
object_dtype = dtype('O')
# We use object arrays for strings.
categorical_dtype = object_dtype
make_datetime64ns = flip(datetime64, 'ns')
make_datetime64D = flip(datetime64, 'D')
@@ -71,6 +75,7 @@ def make_kind_check(python_types, numpy_kind):
is_float = make_kind_check(float, 'f')
is_int = make_kind_check(int, 'i')
is_datetime = make_kind_check(datetime, 'M')
is_object = make_kind_check(object, 'O')
def coerce_to_dtype(dtype, value):
@@ -263,9 +268,7 @@ def rolling_window(array, length):
_notNaT = make_datetime64D(0)
def busday_count_mask_NaT(begindates,
enddates,
out=None):
def busday_count_mask_NaT(begindates, enddates, out=None):
"""
Simple of numpy.busday_count that returns `float` arrays rather than int
arrays, and handles `NaT`s by returning `NaN`s where the inputs were `NaT`.