From 28fdecc98bcdfa5ba5c22e7eb1e01ff815eb1426 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 18 Jan 2016 15:19:48 -0500 Subject: [PATCH 01/20] ENH: Make .latest return a Filter on bool columns. --- tests/pipeline/test_term.py | 14 +++++++- zipline/pipeline/data/dataset.py | 6 +++- zipline/pipeline/filters/__init__.py | 2 ++ zipline/pipeline/filters/filter.py | 48 +++++++++++++++++++++++++++- zipline/pipeline/filters/latest.py | 29 +++++++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 zipline/pipeline/filters/latest.py diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index c657811a..a16a9129 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -12,11 +12,12 @@ from zipline.errors import ( TermInputsNotSpecified, WindowLengthNotSpecified, ) -from zipline.pipeline import Factor, TermGraph +from zipline.pipeline import Factor, Filter, TermGraph from zipline.pipeline.data import Column, DataSet from zipline.pipeline.term import AssetExists, NotSpecified from zipline.pipeline.expression import NUMEXPR_MATH_FUNCS from zipline.utils.numpy_utils import ( + bool_dtype, datetime64ns_dtype, float64_dtype, ) @@ -331,6 +332,17 @@ class ObjectIdentityTestCase(TestCase): with self.assertRaises(InvalidDType): SomeFactor(dtype=1) + def test_latest_on_different_dtypes(self): + + class D(DataSet): + bool_col = Column(dtype=bool_dtype) + float_col = Column(dtype=float64_dtype) + datetime_col = Column(dtype=datetime64ns_dtype) + + self.assertIsInstance(D.bool_col.latest, Filter) + self.assertIsInstance(D.float_col.latest, Factor) + self.assertIsInstance(D.datetime_col.latest, Factor) + class SubDataSetTestCase(TestCase): def test_subdataset(self): diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index e65dca39..c420ea81 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -9,6 +9,7 @@ from six import ( from zipline.pipeline.term import Term, AssetExists from zipline.utils.input_validation import ensure_dtype +from zipline.utils.numpy_utils import bool_dtype from zipline.utils.preprocess import preprocess @@ -92,7 +93,10 @@ class BoundColumn(Term): @property def latest(self): - from zipline.pipeline.factors import Latest + if self.dtype == bool_dtype: + from zipline.pipeline.filters import Latest + else: + from zipline.pipeline.factors import Latest return Latest(inputs=(self,), dtype=self.dtype) def __repr__(self): diff --git a/zipline/pipeline/filters/__init__.py b/zipline/pipeline/filters/__init__.py index 3184bb9a..9ff7a0eb 100644 --- a/zipline/pipeline/filters/__init__.py +++ b/zipline/pipeline/filters/__init__.py @@ -1,7 +1,9 @@ from .filter import Filter, NumExprFilter, PercentileFilter +from .latest import Latest __all__ = [ 'Filter', + 'Latest', 'NumExprFilter', 'PercentileFilter', ] diff --git a/zipline/pipeline/filters/filter.py b/zipline/pipeline/filters/filter.py index f3c02924..2212f507 100644 --- a/zipline/pipeline/filters/filter.py +++ b/zipline/pipeline/filters/filter.py @@ -252,6 +252,52 @@ class PercentileFilter(SingleInputMixin, Filter): class CustomFilter(PositiveWindowLengthMixin, CustomTermMixin, Filter): """ - Filter analog to ``CustomFactor``. + Base class for user-defined Filters. + + Parameters + ---------- + inputs : iterable, optional + An iterable of `BoundColumn` instances (e.g. USEquityPricing.close), + describing the data to load and pass to `self.compute`. If this + argument is passed to the CustomFilter constructor, we look for a + class-level attribute named `inputs`. + window_length : int, optional + Number of rows to pass for each input. If this argument is not passed + to the CustomFilter constructor, we look for a class-level attribute + named `window_length`. + + Notes + ----- + Users implementing their own Filters should subclass CustomFilter and + implement a method named `compute` with the following signature: + + .. code-block:: python + + def compute(self, today, assets, out, *inputs): + ... + + On each simulation date, ``compute`` will be called with the current date, + an array of sids, an output array, and an input array for each expression + passed as inputs to the CustomFilter constructor. + + The specific types of the values passed to `compute` are as follows:: + + today : np.datetime64[ns] + Row label for the last row of all arrays passed as `inputs`. + assets : np.array[int64, ndim=1] + Column labels for `out` and`inputs`. + out : np.array[bool, ndim=1] + Output array of the same shape as `assets`. `compute` should write + its desired return values into `out`. + *inputs : tuple of np.array + Raw data arrays corresponding to the values of `self.inputs`. + + See the documentation for + :class:`~zipline.pipeline.factors.factor.CustomFactor` for more details on + implementing a custom ``compute`` method. + + See Also + -------- + zipline.pipeline.factors.factor.CustomFactor """ ctx = nullctx() diff --git a/zipline/pipeline/filters/latest.py b/zipline/pipeline/filters/latest.py new file mode 100644 index 00000000..f9588804 --- /dev/null +++ b/zipline/pipeline/filters/latest.py @@ -0,0 +1,29 @@ +""" +Filter that produces the most most recently-known value of a boolean-valued +Column. +""" +from zipline.utils.numpy_utils import bool_dtype + +from .filter import CustomFilter +from ..mixins import SingleInputMixin + + +class Latest(SingleInputMixin, CustomFilter): + """ + Filter producing the most recently-known value of `inputs[0]` on each day. + """ + window_length = 1 + + def compute(self, today, assets, out, data): + out[:] = data[-1] + + def _validate(self): + if self.inputs[0].dtype != bool_dtype: + raise TypeError( + "{name} expected an input of dtype bool, " + "but got {not_bool} instead.".format( + name=type(self).__name__, + not_bool=self.inputs[0].dtype, + ) + ) + super(Latest, self)._validate() From 0c15f502313e3ceb508a7cf8247355b12d04d405 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 18 Jan 2016 20:10:16 -0500 Subject: [PATCH 02/20] TEST: Add dedicated testing dataset. --- tests/pipeline/test_term.py | 16 +++++++++------- zipline/pipeline/data/testing.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 zipline/pipeline/data/testing.py diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index a16a9129..231439f5 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -11,9 +11,11 @@ from zipline.errors import ( InvalidDType, TermInputsNotSpecified, WindowLengthNotSpecified, + UnsupportedDataType, ) from zipline.pipeline import Factor, Filter, TermGraph from zipline.pipeline.data import Column, DataSet +from zipline.pipeline.data.testing import TestingDataSet from zipline.pipeline.term import AssetExists, NotSpecified from zipline.pipeline.expression import NUMEXPR_MATH_FUNCS from zipline.utils.numpy_utils import ( @@ -334,14 +336,14 @@ class ObjectIdentityTestCase(TestCase): def test_latest_on_different_dtypes(self): - class D(DataSet): - bool_col = Column(dtype=bool_dtype) - float_col = Column(dtype=float64_dtype) - datetime_col = Column(dtype=datetime64ns_dtype) + self.assertIsInstance(TestingDataSet.bool_col.latest, Filter) + self.assertIsInstance(TestingDataSet.float_col.latest, Factor) + self.assertIsInstance(TestingDataSet.datetime_col.latest, Factor) - self.assertIsInstance(D.bool_col.latest, Filter) - self.assertIsInstance(D.float_col.latest, Factor) - self.assertIsInstance(D.datetime_col.latest, Factor) + # TODO: Support this by allowing users to provide a missing value on + # columns. + with self.assertRaises(UnsupportedDataType): + self.assertIsInstance(TestingDataSet.int_col.latest, Factor) class SubDataSetTestCase(TestCase): diff --git a/zipline/pipeline/data/testing.py b/zipline/pipeline/data/testing.py new file mode 100644 index 00000000..8d734e36 --- /dev/null +++ b/zipline/pipeline/data/testing.py @@ -0,0 +1,21 @@ +""" +Datasets for testing use. + +Loaders for datasets in this file can be found in +zipline.pipeline.data.testing. +""" +from .dataset import Column, DataSet +from zipline.utils.numpy_utils import ( + bool_dtype, + float64_dtype, + datetime64ns_dtype, + int64_dtype, +) + + +class TestingDataSet(DataSet): + + bool_col = Column(dtype=bool_dtype) + float_col = Column(dtype=float64_dtype) + datetime_col = Column(dtype=datetime64ns_dtype) + int_col = Column(dtype=int64_dtype) From a96dd7063499985df42317f9e472660e7e105203 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 18 Jan 2016 20:30:54 -0500 Subject: [PATCH 03/20] MAINT: Rename ConstantLoader to PrecomputedLoader. --- tests/pipeline/test_engine.py | 25 +++++++++++++------------ tests/pipeline/test_term.py | 1 - zipline/pipeline/loaders/synthetic.py | 25 ++++++++++++++----------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/tests/pipeline/test_engine.py b/tests/pipeline/test_engine.py index 4dc6e606..cbe0a305 100644 --- a/tests/pipeline/test_engine.py +++ b/tests/pipeline/test_engine.py @@ -41,7 +41,7 @@ from zipline.data.us_equity_pricing import BcolzDailyBarReader from zipline.finance.trading import TradingEnvironment from zipline.lib.adjustment import MULTIPLY from zipline.pipeline.loaders.synthetic import ( - ConstantLoader, + PrecomputedLoader, NullAdjustmentReader, SyntheticDailyBarWriter, ) @@ -126,16 +126,16 @@ class ColumnArgs(tuple): return hash(frozenset(self)) -class RecordingConstantLoader(ConstantLoader): +class RecordingPrecomputedLoader(PrecomputedLoader): def __init__(self, *args, **kwargs): - super(RecordingConstantLoader, self).__init__(*args, **kwargs) + super(RecordingPrecomputedLoader, self).__init__(*args, **kwargs) self.load_calls = [] def load_adjusted_array(self, columns, dates, assets, mask): self.load_calls.append(ColumnArgs(*columns)) - return super(RecordingConstantLoader, self).load_adjusted_array( + return super(RecordingPrecomputedLoader, self).load_adjusted_array( columns, dates, assets, mask, ) @@ -159,7 +159,7 @@ class ConstantInputTestCase(TestCase): } self.asset_ids = [1, 2, 3] self.dates = date_range('2014-01', '2014-03', freq='D', tz='UTC') - self.loader = ConstantLoader( + self.loader = PrecomputedLoader( constants=self.constants, dates=self.dates, assets=self.asset_ids, @@ -364,7 +364,7 @@ class ConstantInputTestCase(TestCase): dates_to_test = self.dates[-30:] constants = {open_: 1, close: 2, volume: 3} - loader = ConstantLoader( + loader = PrecomputedLoader( constants=constants, dates=self.dates, assets=self.asset_ids, @@ -430,14 +430,15 @@ class ConstantInputTestCase(TestCase): Loader1DataSet1.col2: 2, Loader1DataSet2.col1: 3, Loader1DataSet2.col2: 4} - loader1 = RecordingConstantLoader(constants=constants1, - dates=self.dates, - assets=self.asset_ids) + + loader1 = RecordingPrecomputedLoader(constants=constants1, + dates=self.dates, + assets=self.assets) constants2 = {Loader2DataSet.col1: 5, Loader2DataSet.col2: 6} - loader2 = RecordingConstantLoader(constants=constants2, - dates=self.dates, - assets=self.asset_ids) + loader2 = RecordingPrecomputedLoader(constants=constants2, + dates=self.dates, + assets=self.assets) engine = SimplePipelineEngine( lambda column: diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index 231439f5..f642d2b5 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -19,7 +19,6 @@ from zipline.pipeline.data.testing import TestingDataSet from zipline.pipeline.term import AssetExists, NotSpecified from zipline.pipeline.expression import NUMEXPR_MATH_FUNCS from zipline.utils.numpy_utils import ( - bool_dtype, datetime64ns_dtype, float64_dtype, ) diff --git a/zipline/pipeline/loaders/synthetic.py b/zipline/pipeline/loaders/synthetic.py index bd2e75af..f1783596 100644 --- a/zipline/pipeline/loaders/synthetic.py +++ b/zipline/pipeline/loaders/synthetic.py @@ -32,31 +32,34 @@ def nanos_to_seconds(nanos): return nanos / (1000 * 1000 * 1000) -class ConstantLoader(PipelineLoader): +class PrecomputedLoader(PipelineLoader): """ - Synthetic PipelineLoader that returns a constant value for each column. + Synthetic PipelineLoader that uses a pre-computed array for each column. Parameters ---------- - constants : dict - Map from column to value(s) to use for that column. + values : dict + Map from column to values to use for that column. Values can be anything that can be passed as the first positional - argument to a DataFrame of the same shape as `mask`. - mask : pandas.DataFrame - Mask indicating when assets existed. - Indices of this frame are used to align input queries. + argument to a DataFrame whose indices are ``dates`` and ``sids`` + dates : iterable[datetime-like] + Row labels for input data. Can be anything that pd.DataFrame will + coerce to a DatetimeIndex. + sids : iterable[int-like] + Column labels for input data. Can be anything that pd.DataFrame will + coerce to an Int64Index. Notes ----- - Adjustments are unsupported with ConstantLoader. + Adjustments are unsupported by this loader. """ - def __init__(self, constants, dates, assets): + def __init__(self, constants, dates, sids): loaders = {} for column, const in iteritems(constants): frame = DataFrame( const, index=dates, - columns=assets, + columns=sids, dtype=column.dtype, ) loaders[column] = DataFrameLoader( From 22fdc910e414ab231b9e42afbd88dd9091b99bcf Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 18 Jan 2016 20:50:11 -0500 Subject: [PATCH 04/20] DEV: Add EyeLoader. --- zipline/pipeline/loaders/synthetic.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/zipline/pipeline/loaders/synthetic.py b/zipline/pipeline/loaders/synthetic.py index f1783596..8c50d869 100644 --- a/zipline/pipeline/loaders/synthetic.py +++ b/zipline/pipeline/loaders/synthetic.py @@ -6,6 +6,7 @@ from bcolz import ctable from numpy import ( arange, array, + eye, float64, full, iinfo, @@ -86,6 +87,29 @@ class PrecomputedLoader(PipelineLoader): return out +class EyeLoader(PrecomputedLoader): + """ + A PrecomputedLoader that emits arrays containing 1s on the diagonal and 0s + elsewhere. + + Parameters + ---------- + columns : list[BoundColumn] + Columns that this loader should know about. + dates : iterable[datetime-like] + Same as PrecomputedLoader. + sids : iterable[int-like] + Same as PrecomputedLoader + """ + def __init__(self, columns, dates, sids): + shape = (len(dates), len(sids)) + super(EyeLoader, self).__init__( + {column: eye(shape, dtype=column.dtype) for column in columns}, + dates, + sids, + ) + + class SyntheticDailyBarWriter(BcolzDailyBarWriter): """ Bcolz writer that creates synthetic data based on asset lifetime metadata. From f440cb73b25e04f0b8313c77dbcf0d3d05795a8d Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 19 Jan 2016 12:28:03 -0500 Subject: [PATCH 05/20] MAINT: Make __slots__ a tuple. --- zipline/pipeline/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zipline/pipeline/engine.py b/zipline/pipeline/engine.py index 3f290b1f..35809c0a 100644 --- a/zipline/pipeline/engine.py +++ b/zipline/pipeline/engine.py @@ -92,13 +92,13 @@ class SimplePipelineEngine(object): An AssetFinder instance. We depend on the AssetFinder to determine which assets are in the top-level universe at any point in time. """ - __slots__ = [ + __slots__ = ( '_get_loader', '_calendar', '_finder', '_root_mask_term', '__weakref__', - ] + ) def __init__(self, get_loader, calendar, asset_finder): self._get_loader = get_loader From 1bf33f9ee0677bb52678d3f51a7e790d79cf2668 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 19 Jan 2016 12:58:51 -0500 Subject: [PATCH 06/20] TEST: Add isolated tests for .latest. --- tests/pipeline/test_column.py | 63 ++++++++++++++++++++ zipline/pipeline/loaders/synthetic.py | 84 +++++++++++++++++++++++++++ zipline/pipeline/loaders/testing.py | 21 +++++++ zipline/utils/test_utils.py | 60 ++++++++++++++++++- 4 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 tests/pipeline/test_column.py create mode 100644 zipline/pipeline/loaders/testing.py diff --git a/tests/pipeline/test_column.py b/tests/pipeline/test_column.py new file mode 100644 index 00000000..142e97a6 --- /dev/null +++ b/tests/pipeline/test_column.py @@ -0,0 +1,63 @@ +""" +Tests BoundColumn attributes and methods. +""" +from contextlib2 import ExitStack +from unittest import TestCase + +from pandas import date_range, DataFrame +from pandas.util.testing import assert_frame_equal + +from zipline.pipeline import Pipeline +from zipline.pipeline.data.testing import TestingDataSet as TDS +from zipline.utils.test_utils import chrange, temp_pipeline_engine + + +class LatestTestCase(TestCase): + + @classmethod + def setUpClass(cls): + cls._stack = stack = ExitStack() + cls.calendar = cal = date_range('2014', '2015', freq='D', tz='UTC') + cls.sids = list(range(5)) + cls.engine = stack.enter_context( + temp_pipeline_engine( + cal, + cls.sids, + random_seed=100, + symbols=chrange('A', 'E'), + ), + ) + cls.assets = cls.engine._finder.retrieve_all(cls.sids) + + @classmethod + def tearDownClass(cls): + cls._stack.close() + + def expected_latest(self, column, slice_): + loader = self.engine.get_loader(column) + return DataFrame( + loader.values(column.dtype, self.calendar, self.sids)[slice_], + index=self.calendar[slice_], + columns=self.sids, + ) + + def test_latest(self): + pipe = Pipeline( + columns={ + name: getattr(TDS, name + '_col').latest + # Intentionally not including int and bool because they're not + # yet supported. + for name in ('float', 'datetime') + } + ) + + cal_slice = slice(20, 40) + dates_to_test = self.calendar[cal_slice] + result = self.engine.run_pipeline( + pipe, + dates_to_test[0], + dates_to_test[-1], + ) + float_result = result.float.unstack() + expected_float_result = self.expected_latest(TDS.float_col, cal_slice) + assert_frame_equal(float_result, expected_float_result) diff --git a/zipline/pipeline/loaders/synthetic.py b/zipline/pipeline/loaders/synthetic.py index 8c50d869..1d983a53 100644 --- a/zipline/pipeline/loaders/synthetic.py +++ b/zipline/pipeline/loaders/synthetic.py @@ -12,6 +12,7 @@ from numpy import ( iinfo, uint32, ) +from numpy.random import RandomState from pandas import DataFrame, Timestamp from six import iteritems from sqlite3 import connect as sqlite3_connect @@ -24,6 +25,12 @@ from zipline.data.us_equity_pricing import ( SQLiteAdjustmentWriter, US_EQUITY_PRICING_BCOLZ_COLUMNS, ) +from zipline.utils.numpy_utils import ( + bool_dtype, + datetime64ns_dtype, + float64_dtype, + int64_dtype, +) UINT_32_MAX = iinfo(uint32).max @@ -110,6 +117,83 @@ class EyeLoader(PrecomputedLoader): ) +class SeededRandomLoader(PrecomputedLoader): + """ + A PrecomputedLoader that emits arrays randomly-generated with a given seed. + + Parameters + ---------- + seed : int + Seed for numpy.random.RandomState. + columns : list[BoundColumn] + Columns that this loader should know about. + dates : iterable[datetime-like] + Same as PrecomputedLoader. + sids : iterable[int-like] + Same as PrecomputedLoader + """ + + def __init__(self, seed, columns, dates, sids): + self._seed = seed + super(SeededRandomLoader, self).__init__( + {c: self.values(c.dtype, dates, sids) for c in columns}, + dates, + sids, + ) + + def values(self, dtype, dates, sids): + """ + Make a random array of shape (len(dates), len(sids)) with ``dtype``. + """ + shape = (len(dates), len(sids)) + return { + datetime64ns_dtype: self._datetime_values, + float64_dtype: self._float_values, + int64_dtype: self._int_values, + bool_dtype: self._bool_values, + }[dtype](shape) + + @property + def state(self): + """ + Make a new RandomState from our seed. + + This ensures that every call to _*_values produces the same output + every time for a given SeededRandomLoader instance. + """ + return RandomState(self._seed) + + def _float_values(self, shape): + """ + Return uniformly-distributed floats between -0.0 and 100.0. + """ + return self.state.uniform(low=0.0, high=100.0, size=shape) + + def _int_values(self, shape): + """ + Return uniformly-distributed integers between 0 and 100. + """ + return self.state.random_integers(low=0, high=100, size=shape) + + def _datetime_values(self, shape): + """ + Return uniformly-distributed dates in 2014. + """ + start = Timestamp('2014', tz='UTC').asm8 + offsets = self.state.random_integers( + low=0, + high=364, + size=shape, + ).astype('timedelta64[D]') + return start + offsets + + def _bool_values(self, shape): + """ + Return uniformly-distributed True/False values. + """ + return self.state.randn(*shape) < 0 + + class SyntheticDailyBarWriter(BcolzDailyBarWriter): """ Bcolz writer that creates synthetic data based on asset lifetime metadata. diff --git a/zipline/pipeline/loaders/testing.py b/zipline/pipeline/loaders/testing.py new file mode 100644 index 00000000..cd4cd62f --- /dev/null +++ b/zipline/pipeline/loaders/testing.py @@ -0,0 +1,21 @@ +""" +Loaders for zipline.pipeline.data.testing datasets. +""" +from .synthetic import EyeLoader, SeededRandomLoader +from ..data.testing import TestingDataSet + + +def make_eye_loader(dates, sids): + """ + Make a PipelineLoader that emits np.eye arrays for the columns in + ``TestingDataSet``. + """ + return EyeLoader(TestingDataSet.columns, dates, sids) + + +def make_seeded_random_loader(seed, dates, sids): + """ + Make a PipelineLoader that emits random arrays seeded with `seed` for the + columns in ``TestingDataSet``. + """ + return SeededRandomLoader(seed, TestingDataSet.columns, dates, sids) diff --git a/zipline/utils/test_utils.py b/zipline/utils/test_utils.py index 8ea3aa00..641f2316 100644 --- a/zipline/utils/test_utils.py +++ b/zipline/utils/test_utils.py @@ -25,6 +25,8 @@ from zipline.assets import AssetFinder from zipline.assets.asset_writer import AssetDBWriterFromDataFrame from zipline.assets.futures import CME_CODE_TO_MONTH from zipline.finance.order import ORDER_STATUS +from zipline.pipeline.engine import SimplePipelineEngine +from zipline.pipeline.loaders.testing import make_seeded_random_loader from zipline.utils import security_list from zipline.utils.tradingcalendar import trading_days @@ -238,6 +240,31 @@ def all_subindices(index): ) +def chrange(start, stop): + """ + Construct an iterable of length-1 strings beginning with `start` and ending + with `stop`. + + Parameters + ---------- + start : str + The first character. + stop : str + The last character. + + Returns + ------- + chars: iterable[str] + Iterable of strings beginning with start and ending with stop. + + Example + ------- + >>> list(chrange('A', 'C')) + ['A', 'B', 'C'] + """ + return map(chr, range(ord(start), ord(stop) + 1)) + + def make_rotating_equity_info(num_assets, first_start, frequency, @@ -296,7 +323,7 @@ def make_simple_equity_info(sids, start_date, end_date, symbols=None): sids : array-like of int start_date : pd.Timestamp end_date : pd.Timestamp - symbols : list, optional + symbols : list, optionaln Symbols to use for the assets. If not provided, symbols are generated from the sequence 'A', 'B', ... @@ -664,3 +691,34 @@ def gen_calendars(start, stop, critical_dates): # Also test with the trading calendar. yield (trading_days[trading_days.slice_indexer(start, stop)],) + + +@contextmanager +def temp_pipeline_engine(calendar, sids, random_seed, symbols=None): + """ + A contextManager that yields a SimplePipelineEngine holding a reference to + an AssetFinder generated via tmp_asset_finder. + + Parameters + ---------- + calendar : pd.DatetimeIndex + Calendar to pass to the constructed PipelineEngine. + sids : iterable[int] + Sids to use for the temp asset finder. + random_seed : int + Integer used to seed instances of SeededRandomLoader. + symbols : iterable[str], optional + Symbols for constructed assets. Forwarded to make_simple_equity_info. + """ + equity_info = make_simple_equity_info( + sids=sids, + start_date=calendar[0], + end_date=calendar[-1], + symbols=symbols, + ) + + loader = make_seeded_random_loader(random_seed, calendar, sids) + get_loader = lambda column: loader + + with tmp_asset_finder(equities=equity_info) as finder: + yield SimplePipelineEngine(get_loader, calendar, finder) From 9c448b5238d462bc1d9e0add22e8b26ad7dc6585 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 19 Jan 2016 13:09:05 -0500 Subject: [PATCH 07/20] DOC: Explain why _BoundColumnDescr exists. We don't want to bind to parent DataSets when we bind to names because we want to dynamically create new BoundColumns in subclasses. --- zipline/pipeline/data/dataset.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index c420ea81..11ad415a 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -33,12 +33,21 @@ class _BoundColumnDescr(object): """ Intermediate class that sits on `DataSet` objects and returns memoized `BoundColumn` objects when requested. + + This exists so that subclasses of DataSets don't share columns with their + parent classes. """ def __init__(self, dtype, name): self.dtype = dtype self.name = name def __get__(self, instance, owner): + """ + Produce a concrete BoundColumn object when accessed. + + We don't bind to datasets at class creation time so that subclasses of + DataSets produce different BoundColumns. + """ return BoundColumn( dtype=self.dtype, dataset=owner, From 94c02c710bc36d79879deee970317e642095758c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 20 Jan 2016 22:03:35 -0500 Subject: [PATCH 08/20] DEV: Add parameter_space test decorator. --- tests/test_test_utils.py | 31 ++++++++++++++++++++++ zipline/utils/test_utils.py | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 tests/test_test_utils.py diff --git a/tests/test_test_utils.py b/tests/test_test_utils.py new file mode 100644 index 00000000..861d10bc --- /dev/null +++ b/tests/test_test_utils.py @@ -0,0 +1,31 @@ +""" +Tests for our testing utilities. +""" +from itertools import product +from unittest import TestCase +from zipline.utils.test_utils import parameter_space + + +class TestParameterSpace(TestCase): + + x_args = [1, 2] + y_args = [3, 4] + + @classmethod + def setUpClass(cls): + cls.xy_invocations = [] + + @classmethod + def tearDownClass(cls): + # This is the only actual test here. + assert cls.xy_invocations == list(product(cls.x_args, cls.y_args)) + + @parameter_space(x=x_args, y=y_args) + def test_xy(self, x, y): + self.xy_invocations.append((x, y)) + + def test_nothing(self): + # Ensure that there's at least one "real" test in the class, or else + # our {setUp,tearDown}Class won't be called if, for example, + # `parameter_space` returns None. + pass diff --git a/zipline/utils/test_utils.py b/zipline/utils/test_utils.py index 641f2316..f989e8eb 100644 --- a/zipline/utils/test_utils.py +++ b/zipline/utils/test_utils.py @@ -1,5 +1,6 @@ from contextlib import contextmanager from functools import wraps +from inspect import getargspec from itertools import ( combinations, count, @@ -722,3 +723,53 @@ def temp_pipeline_engine(calendar, sids, random_seed, symbols=None): with tmp_asset_finder(equities=equity_info) as finder: yield SimplePipelineEngine(get_loader, calendar, finder) + + +def parameter_space(**params): + """ + Wrapper around subtest that allows passing keywords mapping names to + iterables of values. + + The decorated test function will be called with the cross-product of all + possible inputs + + Usage + ----- + >>> class SomeTestCase(TestCase): + ... @parameter_space(x=[1, 2], y=[2, 3]) + ... def test_some_func(self, x, y): + ... # Will be called with every possible combination of x and y. + ... self.assertEqual(somefunc(x, y), expected_result(x, y)) + """ + def decorator(f): + + argspec = getargspec(f) + if argspec.varargs: + raise AssertionError("parameter_space() doesn't support *args") + if argspec.keywords: + raise AssertionError("parameter_space() doesn't support **kwargs") + if argspec.defaults: + raise AssertionError("parameter_space() doesn't support defaults.") + + # Skip over implicit self. + argnames = argspec.args + if argnames[0] == 'self': + argnames = argnames[1:] + + extra = set(params) - set(argnames) + if extra: + raise AssertionError( + "Keywords %s supplied to parameter_space() are " + "not in function signature." % extra + ) + + unspecified = set(argnames) - set(params) + if unspecified: + raise AssertionError( + "Function arguments %s were not " + "supplied to parameter_space()." % extra + ) + + param_sets = product(*(params[name] for name in argnames)) + return subtest(param_sets, *argnames)(f) + return decorator From c1057355743788589e408894c4482bf404581e61 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 20 Jan 2016 22:45:51 -0500 Subject: [PATCH 09/20] DEV: Add support for specifying missing_value. Consequently, enable support for `int`-dtyped Factors and BoundColumns. --- tests/pipeline/test_adjusted_array.py | 57 +++++++++++++++++-- tests/pipeline/test_blaze.py | 54 ++++++++++++++---- tests/pipeline/test_column.py | 15 ++--- tests/pipeline/test_engine.py | 10 ++-- tests/pipeline/test_term.py | 25 ++++++-- zipline/lib/adjusted_array.py | 53 +++++++++-------- zipline/pipeline/data/dataset.py | 48 +++++++++++++--- zipline/pipeline/data/testing.py | 7 ++- zipline/pipeline/factors/factor.py | 3 +- zipline/pipeline/loaders/blaze/core.py | 39 ++++++++++--- .../pipeline/loaders/equity_pricing_loader.py | 14 +++-- zipline/pipeline/loaders/frame.py | 3 +- zipline/pipeline/mixins.py | 2 + zipline/pipeline/term.py | 43 ++++++++------ zipline/utils/numpy_utils.py | 18 +++++- 15 files changed, 287 insertions(+), 104 deletions(-) diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py index f2da52eb..e2a96e53 100644 --- a/tests/pipeline/test_adjusted_array.py +++ b/tests/pipeline/test_adjusted_array.py @@ -10,6 +10,7 @@ from numpy import ( arange, array, full, + where, ) from numpy.testing import assert_array_equal from six.moves import zip_longest @@ -23,9 +24,21 @@ from zipline.lib.adjustment import ( from zipline.lib.adjusted_array import AdjustedArray, NOMASK from zipline.utils.numpy_utils import ( datetime64ns_dtype, + default_missing_value_for_dtype, float64_dtype, + int64_dtype, make_datetime64ns, ) +from zipline.utils.test_utils import check_arrays, parameter_space + + +def moving_window(array, nrows): + """ + Simple moving window generator over a 2D numpy array. + """ + count = num_windows_of_length_M_on_buffers_of_length_N(nrows, len(array)) + for i in range(count): + yield array[i:i + nrows] def num_windows_of_length_M_on_buffers_of_length_N(M, N): @@ -66,6 +79,7 @@ def _gen_unadjusted_cases(dtype): nrows = 6 ncols = 3 data = arange(nrows * ncols).astype(dtype).reshape(nrows, ncols) + missing_value = default_missing_value_for_dtype(dtype) for windowlen in valid_window_lengths(nrows): @@ -78,6 +92,7 @@ def _gen_unadjusted_cases(dtype): data, windowlen, {}, + missing_value, [ data[offset:offset + windowlen] for offset in range(num_legal_windows) @@ -230,6 +245,7 @@ def _gen_overwrite_adjustment_cases(dtype): def _gen_expectations(baseline, adjustments, buffer_as_of, nrows): + missing_value = default_missing_value_for_dtype(baseline.dtype) for windowlen in valid_window_lengths(nrows): num_legal_windows = num_windows_of_length_M_on_buffers_of_length_N( @@ -241,6 +257,7 @@ def _gen_expectations(baseline, adjustments, buffer_as_of, nrows): baseline, windowlen, adjustments, + missing_value, [ # This is a nasty expression... # @@ -267,9 +284,10 @@ class AdjustedArrayTestCase(TestCase): data, lookback, adjustments, + missing_value, expected): - array = AdjustedArray(data, NOMASK, adjustments) + array = AdjustedArray(data, NOMASK, adjustments, missing_value) for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): @@ -282,9 +300,10 @@ class AdjustedArrayTestCase(TestCase): data, lookback, adjustments, + missing_value, expected): - array = AdjustedArray(data, NOMASK, adjustments) + array = AdjustedArray(data, NOMASK, adjustments, missing_value) for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): @@ -301,18 +320,43 @@ class AdjustedArrayTestCase(TestCase): data, lookback, adjustments, + missing_value, expected): - array = AdjustedArray(data, NOMASK, adjustments) + array = AdjustedArray(data, NOMASK, adjustments, missing_value) for _ in range(2): # Iterate 2x ensure adjusted_arrays are re-usable. window_iter = array.traverse(lookback) for yielded, expected_yield in zip_longest(window_iter, expected): self.assertEqual(yielded.dtype, data.dtype) assert_array_equal(yielded, expected_yield) + @parameter_space( + dtype=[float64_dtype, int64_dtype, datetime64ns_dtype], + missing_value=[0, 10000], + window_length=[2, 3], + ) + def test_masking(self, dtype, missing_value, window_length): + missing_value = value_with_dtype(dtype, missing_value) + baseline_ints = arange(15).reshape(5, 3) + baseline = baseline_ints.astype(dtype) + mask = (baseline_ints % 2).astype(bool) + masked_baseline = where(mask, baseline, missing_value) + + array = AdjustedArray( + baseline, + mask, + adjustments={}, + missing_value=missing_value, + ) + + gen_expected = moving_window(masked_baseline, window_length) + gen_actual = array.traverse(window_length) + for expected, actual in zip(gen_expected, gen_actual): + check_arrays(expected, actual) + def test_invalid_lookback(self): data = arange(30, dtype=float).reshape(6, 5) - adj_array = AdjustedArray(data, NOMASK, {}) + adj_array = AdjustedArray(data, NOMASK, {}, float('nan')) with self.assertRaises(WindowLengthTooLong): adj_array.traverse(7) @@ -326,7 +370,7 @@ class AdjustedArrayTestCase(TestCase): def test_array_views_arent_writable(self): data = arange(30, dtype=float).reshape(6, 5) - adj_array = AdjustedArray(data, NOMASK, {}) + adj_array = AdjustedArray(data, NOMASK, {}, float('nan')) for frame in adj_array.traverse(3): with self.assertRaises(ValueError): @@ -338,7 +382,7 @@ class AdjustedArrayTestCase(TestCase): bad_mask = array([[0, 1, 1], [0, 0, 1]], dtype=bool) with self.assertRaisesRegexp(ValueError, msg): - AdjustedArray(data, bad_mask, {}) + AdjustedArray(data, bad_mask, {}, missing_value=-1) def test_inspect(self): data = arange(15, dtype=float).reshape(5, 3) @@ -346,6 +390,7 @@ class AdjustedArrayTestCase(TestCase): data, NOMASK, {4: [Float64Multiply(2, 3, 0, 0, 4.0)]}, + float('nan'), ) expected = dedent( diff --git a/tests/pipeline/test_blaze.py b/tests/pipeline/test_blaze.py index 202342e0..5527b963 100644 --- a/tests/pipeline/test_blaze.py +++ b/tests/pipeline/test_blaze.py @@ -31,7 +31,11 @@ from zipline.pipeline.loaders.blaze.core import ( NonPipelineField, no_deltas_rules, ) -from zipline.utils.numpy_utils import repeat_last_axis +from zipline.utils.numpy_utils import ( + float64_dtype, + int64_dtype, + repeat_last_axis, +) from zipline.utils.test_utils import tmp_asset_finder, make_simple_equity_info @@ -73,7 +77,8 @@ class BlazeToPipelineTestCase(TestCase): cls.sids = sids = ord('A'), ord('B'), ord('C') cls.df = df = pd.DataFrame({ 'sid': sids * 3, - 'value': (0, 1, 2, 1, 2, 3, 2, 3, 4), + 'value': (0., 1., 2., 1., 2., 3., 2., 3., 4.), + 'int_value': (0, 1, 2, 1, 2, 3, 2, 3, 4), 'asof_date': dates, 'timestamp': dates, }) @@ -81,6 +86,7 @@ class BlazeToPipelineTestCase(TestCase): var * { sid: ?int64, value: ?float64, + int_value: ?int64, asof_date: datetime, timestamp: datetime } @@ -91,6 +97,7 @@ class BlazeToPipelineTestCase(TestCase): cls.macro_dshape = var * Record(dshape_) cls.garbage_loader = BlazeLoader() + cls.missing_values = {'int_value': 0} def test_tabular(self): name = 'expr' @@ -99,15 +106,20 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=self.garbage_loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ) self.assertEqual(ds.__name__, name) self.assertTrue(issubclass(ds, DataSet)) - self.assertEqual( - {c.name: c.dtype for c in ds.columns}, - {'sid': np.int64, 'value': np.float64}, - ) - for field in ('timestamp', 'asof_date'): + self.assertIs(ds.value.dtype, float64_dtype) + self.assertIs(ds.int_value.dtype, int64_dtype) + + self.assertTrue(np.isnan(ds.value.missing_value)) + self.assertEqual(ds.int_value.missing_value, 0) + + invalid_type_fields = ('asof_date',) + + for field in invalid_type_fields: with self.assertRaises(AttributeError) as e: getattr(ds, field) self.assertIn("'%s'" % field, str(e.exception)) @@ -119,6 +131,7 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=self.garbage_loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ), ds, ) @@ -130,10 +143,11 @@ class BlazeToPipelineTestCase(TestCase): expr.value, loader=self.garbage_loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ) self.assertEqual(value.name, 'value') self.assertIsInstance(value, BoundColumn) - self.assertEqual(value.dtype, np.float64) + self.assertIs(value.dtype, float64_dtype) # test memoization self.assertIs( @@ -141,6 +155,7 @@ class BlazeToPipelineTestCase(TestCase): expr.value, loader=self.garbage_loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ), value, ) @@ -149,6 +164,7 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=self.garbage_loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ).value, value, ) @@ -159,6 +175,7 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=self.garbage_loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ), value.dataset, ) @@ -195,7 +212,11 @@ class BlazeToPipelineTestCase(TestCase): )), ) loader = BlazeLoader() - ds = from_blaze(expr.ds, loader=loader) + ds = from_blaze( + expr.ds, + loader=loader, + missing_values=self.missing_values, + ) self.assertEqual(len(loader), 1) exprdata = loader[ds] self.assertTrue(exprdata.expr.isidentical(expr.ds)) @@ -210,6 +231,7 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=loader, no_deltas_rule=no_deltas_rules.warn, + missing_values=self.missing_values, ) self.assertEqual(len(ws), 1) w = ws[0].message @@ -281,6 +303,7 @@ class BlazeToPipelineTestCase(TestCase): expr_with_add, deltas=None, loader=self.garbage_loader, + missing_values=self.missing_values, ) with self.assertRaises(TypeError): @@ -288,6 +311,7 @@ class BlazeToPipelineTestCase(TestCase): expr.value + 1, # put an Add in the column deltas=None, loader=self.garbage_loader, + missing_values=self.missing_values, ) deltas = bz.Data( @@ -299,6 +323,7 @@ class BlazeToPipelineTestCase(TestCase): expr_with_add, deltas=deltas, loader=self.garbage_loader, + missing_values=self.missing_values, ) with self.assertRaises(TypeError): @@ -306,6 +331,7 @@ class BlazeToPipelineTestCase(TestCase): expr.value + 1, deltas=deltas, loader=self.garbage_loader, + missing_values=self.missing_values, ) def _test_id(self, df, dshape, expected, finder, add): @@ -315,6 +341,7 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ) p = Pipeline() for a in add: @@ -347,9 +374,11 @@ class BlazeToPipelineTestCase(TestCase): expr, loader=loader, no_deltas_rule=no_deltas_rules.ignore, + missing_values=self.missing_values, ) p = Pipeline() p.add(ds.value.latest, 'value') + p.add(ds.int_value.latest, 'int_value') dates = self.dates with tmp_asset_finder() as finder: @@ -405,7 +434,9 @@ class BlazeToPipelineTestCase(TestCase): expected.index.levels[0], finder.retrieve_all(expected.index.levels[1]), )) - self._test_id(self.df, self.dshape, expected, finder, ('value',)) + self._test_id( + self.df, self.dshape, expected, finder, ('int_value', 'value',) + ) def test_id_ffill_out_of_window(self): """ @@ -512,7 +543,7 @@ class BlazeToPipelineTestCase(TestCase): var * Record(fields), expected, finder, - ('value', 'other'), + ('value', 'int_value', 'other'), ) def test_id_macro_dataset(self): @@ -782,6 +813,7 @@ class BlazeToPipelineTestCase(TestCase): deltas, loader=loader, no_deltas_rule=no_deltas_rules.raise_, + missing_values=self.missing_values, ) p = Pipeline() diff --git a/tests/pipeline/test_column.py b/tests/pipeline/test_column.py index 142e97a6..26d5e608 100644 --- a/tests/pipeline/test_column.py +++ b/tests/pipeline/test_column.py @@ -42,13 +42,9 @@ class LatestTestCase(TestCase): ) def test_latest(self): + columns = TDS.columns pipe = Pipeline( - columns={ - name: getattr(TDS, name + '_col').latest - # Intentionally not including int and bool because they're not - # yet supported. - for name in ('float', 'datetime') - } + columns={c.name: c.latest for c in columns}, ) cal_slice = slice(20, 40) @@ -58,6 +54,7 @@ class LatestTestCase(TestCase): dates_to_test[0], dates_to_test[-1], ) - float_result = result.float.unstack() - expected_float_result = self.expected_latest(TDS.float_col, cal_slice) - assert_frame_equal(float_result, expected_float_result) + for column in columns: + float_result = result[column.name].unstack() + expected_float_result = self.expected_latest(column, cal_slice) + assert_frame_equal(float_result, expected_float_result) diff --git a/tests/pipeline/test_engine.py b/tests/pipeline/test_engine.py index cbe0a305..bde0fcdd 100644 --- a/tests/pipeline/test_engine.py +++ b/tests/pipeline/test_engine.py @@ -162,7 +162,7 @@ class ConstantInputTestCase(TestCase): self.loader = PrecomputedLoader( constants=self.constants, dates=self.dates, - assets=self.asset_ids, + sids=self.asset_ids, ) self.asset_info = make_simple_equity_info( @@ -367,7 +367,7 @@ class ConstantInputTestCase(TestCase): loader = PrecomputedLoader( constants=constants, dates=self.dates, - assets=self.asset_ids, + sids=self.asset_ids, ) engine = SimplePipelineEngine( lambda column: loader, self.dates, self.asset_finder, @@ -415,7 +415,7 @@ class ConstantInputTestCase(TestCase): def test_loader_given_multiple_columns(self): class Loader1DataSet1(DataSet): - col1 = Column(float32) + col1 = Column(float) col2 = Column(float32) class Loader1DataSet2(DataSet): @@ -433,12 +433,12 @@ class ConstantInputTestCase(TestCase): loader1 = RecordingPrecomputedLoader(constants=constants1, dates=self.dates, - assets=self.assets) + sids=self.assets) constants2 = {Loader2DataSet.col1: 5, Loader2DataSet.col2: 6} loader2 = RecordingPrecomputedLoader(constants=constants2, dates=self.dates, - assets=self.assets) + sids=self.assets) engine = SimplePipelineEngine( lambda column: diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index f642d2b5..eebcbd41 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -11,7 +11,6 @@ from zipline.errors import ( InvalidDType, TermInputsNotSpecified, WindowLengthNotSpecified, - UnsupportedDataType, ) from zipline.pipeline import Factor, Filter, TermGraph from zipline.pipeline.data import Column, DataSet @@ -21,6 +20,8 @@ from zipline.pipeline.expression import NUMEXPR_MATH_FUNCS from zipline.utils.numpy_utils import ( datetime64ns_dtype, float64_dtype, + int64_dtype, + NoDefaultMissingValue, ) @@ -334,15 +335,27 @@ class ObjectIdentityTestCase(TestCase): SomeFactor(dtype=1) def test_latest_on_different_dtypes(self): - self.assertIsInstance(TestingDataSet.bool_col.latest, Filter) self.assertIsInstance(TestingDataSet.float_col.latest, Factor) self.assertIsInstance(TestingDataSet.datetime_col.latest, Factor) + self.assertIsInstance(TestingDataSet.int_col.latest, Factor) - # TODO: Support this by allowing users to provide a missing value on - # columns. - with self.assertRaises(UnsupportedDataType): - self.assertIsInstance(TestingDataSet.int_col.latest, Factor) + def test_failure_timing_on_bad_missing_values(self): + + # Just constructing a bad column shouldn't fail. + Column(dtype=int64_dtype) + + with self.assertRaises(NoDefaultMissingValue) as e: + class BadDataSet(DataSet): + bad_column = Column(dtype=int64_dtype) + float_column = Column(dtype=float64_dtype) + int_column = Column(dtype=int64_dtype, missing_value=3) + + self.assertTrue( + str(e.exception.message).startswith( + "Failed to create Column with name 'bad_column'" + ) + ) class SubDataSetTestCase(TestCase): diff --git a/zipline/lib/adjusted_array.py b/zipline/lib/adjusted_array.py index 1611300d..03556528 100644 --- a/zipline/lib/adjusted_array.py +++ b/zipline/lib/adjusted_array.py @@ -17,26 +17,24 @@ from zipline.errors import ( ) from zipline.utils.numpy_utils import ( datetime64ns_dtype, - default_fillvalue_for_dtype, float64_dtype, int64_dtype, uint8_dtype, ) from zipline.utils.memoize import lazyval -from zipline.utils.sentinel import sentinel # These class names are all the same because of our bootleg templating system. from ._float64window import AdjustedArrayWindow as Float64Window from ._int64window import AdjustedArrayWindow as Int64Window from ._uint8window import AdjustedArrayWindow as UInt8Window -Infer = sentinel( - 'Infer', - "Sentinel used to say 'infer missing_value from data type.'" -) NOMASK = None -SUPPORTED_NUMERIC_DTYPES = frozenset( - map(dtype, [float32, float64, int32, int64, uint32]) +FLOAT_DTYPES = frozenset( + map(dtype, [float32, float64, int32]), +) +INT_DTYPES = frozenset( + # NOTE: uint64 not supported because it can't be safely cast to int64. + map(dtype, [int32, int64, uint32]), ) CONCRETE_WINDOW_TYPES = { float64_dtype: Float64Window, @@ -51,13 +49,10 @@ def _normalize_array(data): representation, returning the coerced array and a numpy dtype object to use as a view type when providing public view into the data. - Semantically numerical data (float*, int*, uint*) is coerced to float64 and - viewed as float64. We coerce integral data to float so that we can use NaN - as a missing value. - - datetime[*] data is coerced to int64 with a viewtype of ``datetime64[ns]``. - - ``bool_`` data is coerced to uint8 with a viewtype of ``bool_`` + - float* data is coerced to float64 with viewtype float64. + - int32, int64, and uint32 are converted to int64 with viewtype int64. + - datetime[*] data is coerced to int64 with a viewtype of datetime64[ns]. + - bool_ data is coerced to uint8 with a viewtype of bool_. Parameters ---------- @@ -70,8 +65,10 @@ def _normalize_array(data): data_dtype = data.dtype if data_dtype == bool_: return data.astype(uint8), dtype(bool_) - elif data_dtype in SUPPORTED_NUMERIC_DTYPES: + elif data_dtype in FLOAT_DTYPES: return data.astype(float64), dtype(float64) + elif data_dtype in INT_DTYPES: + return data.astype(int64), dtype(int64) elif data_dtype.name.startswith('datetime'): try: outarray = data.astype('datetime64[ns]').view('int64') @@ -105,18 +102,24 @@ class AdjustedArray(object): adjustments : dict[int -> list[Adjustment]] A dict mapping row indices to lists of adjustments to apply when we reach that row. - fillvalue : object, optional + missing_value : object A value to use to fill missing data in yielded windows. - Default behavior is to infer a value based on the dtype of `data`. - `NaN` is used for numeric data, and `NaT` is used for datetime data. - """ - __slots__ = ('_data', '_viewtype', 'adjustments', '__weakref__') + Should be a value coercible to `data.dtype`. - def __init__(self, data, mask, adjustments, fillvalue=Infer): + """ + __slots__ = ( + '_data', + '_viewtype', + 'adjustments', + 'missing_value', + '__weakref__', + ) + + def __init__(self, data, mask, adjustments, missing_value): self._data, self._viewtype = _normalize_array(data) + self.adjustments = adjustments - if fillvalue is Infer: - fillvalue = default_fillvalue_for_dtype(self.data.dtype) + self.missing_value = missing_value if mask is not NOMASK: if mask.dtype != bool_: @@ -126,7 +129,7 @@ class AdjustedArray(object): "Mask shape %s != data shape %s." % (mask.shape, data.shape), ) - self._data[~mask] = fillvalue + self._data[~mask] = self.missing_value @lazyval def data(self): diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index 11ad415a..7183dea8 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -7,9 +7,13 @@ from six import ( with_metaclass, ) -from zipline.pipeline.term import Term, AssetExists +from zipline.pipeline.term import Term, AssetExists, NotSpecified from zipline.utils.input_validation import ensure_dtype -from zipline.utils.numpy_utils import bool_dtype +from zipline.utils.numpy_utils import ( + bool_dtype, + default_missing_value_for_dtype, + NoDefaultMissingValue, +) from zipline.utils.preprocess import preprocess @@ -19,14 +23,19 @@ class Column(object): """ @preprocess(dtype=ensure_dtype) - def __init__(self, dtype): + def __init__(self, dtype, missing_value=NotSpecified): self.dtype = dtype + self.missing_value = missing_value def bind(self, name): """ Bind a `Column` object to its name. """ - return _BoundColumnDescr(dtype=self.dtype, name=name) + return _BoundColumnDescr( + dtype=self.dtype, + missing_value=self.missing_value, + name=name, + ) class _BoundColumnDescr(object): @@ -37,8 +46,27 @@ class _BoundColumnDescr(object): This exists so that subclasses of DataSets don't share columns with their parent classes. """ - def __init__(self, dtype, name): + def __init__(self, dtype, missing_value, name): self.dtype = dtype + + # Calculating missing values here guarantees that we fail quickly if + # the user fails to provide a missing value for a dtype that requires + # one (e.g. int64), but still enables us to provide an error message + # that points to the name of the failing column. + if missing_value is NotSpecified: + try: + missing_value = default_missing_value_for_dtype(dtype) + except NoDefaultMissingValue: + # Re-raise with a better message. + raise NoDefaultMissingValue( + "Failed to create Column with name {name!r} and" + " dtype {dtype} because no missing_value was provided\n\n" + "Columns with dtype {dtype} require a missing_value.\n" + "Please pass missing_value to Column() or use a different" + " dtype.".format(dtype=dtype, name=name) + ) + + self.missing_value = missing_value self.name = name def __get__(self, instance, owner): @@ -50,6 +78,7 @@ class _BoundColumnDescr(object): """ return BoundColumn( dtype=self.dtype, + missing_value=self.missing_value, dataset=owner, name=self.name, ) @@ -63,11 +92,12 @@ class BoundColumn(Term): extra_input_rows = 0 inputs = () - def __new__(cls, dtype, dataset, name): + def __new__(cls, dtype, missing_value, dataset, name): return super(BoundColumn, cls).__new__( cls, domain=dataset.domain, dtype=dtype, + missing_value=missing_value, dataset=dataset, name=name, ) @@ -106,7 +136,11 @@ class BoundColumn(Term): from zipline.pipeline.filters import Latest else: from zipline.pipeline.factors import Latest - return Latest(inputs=(self,), dtype=self.dtype) + return Latest( + inputs=(self,), + dtype=self.dtype, + missing_value=self.missing_value, + ) def __repr__(self): return "{qualname}::{dtype}".format( diff --git a/zipline/pipeline/data/testing.py b/zipline/pipeline/data/testing.py index 8d734e36..5d452769 100644 --- a/zipline/pipeline/data/testing.py +++ b/zipline/pipeline/data/testing.py @@ -14,8 +14,11 @@ from zipline.utils.numpy_utils import ( class TestingDataSet(DataSet): + # Tell nose this isn't a test case. + __test__ = False - bool_col = Column(dtype=bool_dtype) + bool_col = Column(dtype=bool_dtype, missing_value=False) + bool_col_default_True = Column(dtype=bool_dtype, missing_value=True) float_col = Column(dtype=float64_dtype) datetime_col = Column(dtype=datetime64ns_dtype) - int_col = Column(dtype=int64_dtype) + int_col = Column(dtype=int64_dtype, missing_value=0) diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py index b75192a3..2fa8875e 100644 --- a/zipline/pipeline/factors/factor.py +++ b/zipline/pipeline/factors/factor.py @@ -38,6 +38,7 @@ from zipline.utils.numpy_utils import ( bool_dtype, datetime64ns_dtype, float64_dtype, + int64_dtype, ) from zipline.utils.preprocess import preprocess @@ -303,7 +304,7 @@ def function_application(func): return mathfunc -FACTOR_DTYPES = frozenset([datetime64ns_dtype, float64_dtype]) +FACTOR_DTYPES = frozenset([datetime64ns_dtype, float64_dtype, int64_dtype]) class Factor(CompositeTerm): diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 859adcec..c7ee00c4 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -165,6 +165,7 @@ from zipline.pipeline.loaders.utils import ( normalize_data_query_bounds, normalize_timestamp_to_query_time, ) +from zipline.pipeline.term import NotSpecified from zipline.lib.adjusted_array import AdjustedArray from zipline.lib.adjustment import Float64Overwrite from zipline.utils.enum import enum @@ -275,15 +276,21 @@ _new_names = ('BlazeDataSet_%d' % n for n in count()) @memoize -def new_dataset(expr, deltas): - """Creates or returns a dataset from a pair of blaze expressions. +def new_dataset(expr, deltas, missing_values): + """ + Creates or returns a dataset from a pair of blaze expressions. Parameters ---------- expr : Expr - The blaze expression representing the first known values. + The blaze expression representing the first known values. deltas : Expr - The blaze expression representing the deltas to the data. + The blaze expression representing the deltas to the data. + missing_values : frozenset((name, value) pairs + Association pairs column name and missing_value for that column. + + This needs to be a frozenset rather than a dict or tuple of tuples + because we want a collection that's unordered but still hashable. Returns ------- @@ -295,9 +302,16 @@ def new_dataset(expr, deltas): This function is memoized. repeated calls with the same inputs will return the same type. """ + missing_values = dict(missing_values) columns = {} for name, type_ in expr.dshape.measure.fields: + # Don't generate a column for sid or timestamp, since they're + # implicitly the labels if the arrays that will be passed to pipeline + # Terms. + if name in (SID_FIELD_NAME, TS_FIELD_NAME): + continue try: + # TODO: This should support datetime and bool columns. if promote(type_, float64, promote_option=False) != float64: raise NotPipelineCompatible() if isinstance(type_, Option): @@ -307,7 +321,10 @@ def new_dataset(expr, deltas): except TypeError: col = NonNumpyField(name, type_) else: - col = Column(type_.to_numpy_dtype()) + col = Column( + type_.to_numpy_dtype(), + missing_values.get(name, NotSpecified), + ) columns[name] = col @@ -473,6 +490,7 @@ def from_blaze(expr, loader=None, resources=None, odo_kwargs=None, + missing_values=None, no_deltas_rule=no_deltas_rules.warn): """Create a Pipeline API object from a blaze expression. @@ -494,6 +512,9 @@ def from_blaze(expr, scope for ``bz.compute``. odo_kwargs : dict, optional The keyword arguments to pass to odo when evaluating the expressions. + missing_values : dict[str -> any], optional + A dict mapping column names to missing values for those columns. + Missing values are required for integral columns. no_deltas_rule : no_deltas_rule What should happen if ``deltas='auto'`` but no deltas can be found. 'warn' says to raise a warning but continue. @@ -583,7 +604,10 @@ def from_blaze(expr, _check_resources('deltas', deltas, resources) # Create or retrieve the Pipeline API dataset. - ds = new_dataset(dataset_expr, deltas) + if missing_values is None: + missing_values = {} + ds = new_dataset(dataset_expr, deltas, frozenset(missing_values.items())) + # Register our new dataset with the loader. (loader if loader is not None else global_loader)[ds] = ExprData( bind_expression_to_resources(dataset_expr, resources), @@ -1018,7 +1042,8 @@ class BlazeLoader(dict): column_name, asset_idx, sparse_deltas, - ) + ), + column.missing_value, ) global_loader = BlazeLoader.global_instance() diff --git a/zipline/pipeline/loaders/equity_pricing_loader.py b/zipline/pipeline/loaders/equity_pricing_loader.py index 1181d89d..64448903 100644 --- a/zipline/pipeline/loaders/equity_pricing_loader.py +++ b/zipline/pipeline/loaders/equity_pricing_loader.py @@ -81,12 +81,16 @@ class USEquityPricingLoader(PipelineLoader): dates, assets, ) - adjusted_arrays = [ - AdjustedArray(raw_array, mask, col_adjustments) - for raw_array, col_adjustments in zip(raw_arrays, adjustments) - ] - return dict(zip(columns, adjusted_arrays)) + out = {} + for c, c_raw, c_adjs in zip(columns, raw_arrays, adjustments): + out[c] = AdjustedArray( + c_raw.astype(c.dtype), + mask, + c_adjs, + c.missing_value, + ) + return out def _shift_dates(dates, start_date, end_date, shift): diff --git a/zipline/pipeline/loaders/frame.py b/zipline/pipeline/loaders/frame.py index 4466ed04..77d4bd91 100644 --- a/zipline/pipeline/loaders/frame.py +++ b/zipline/pipeline/loaders/frame.py @@ -60,7 +60,7 @@ class DataFrameLoader(PipelineLoader): def __init__(self, column, baseline, adjustments=None): self.column = column - self.baseline = baseline.values + self.baseline = baseline.values.astype(self.column.dtype) self.dates = baseline.index self.assets = baseline.columns @@ -171,5 +171,6 @@ class DataFrameLoader(PipelineLoader): # Mask out requested columns/rows that didnt match. mask=(good_assets & good_dates[:, None]) & mask, adjustments=self.format_adjustments(dates, assets), + missing_value=column.missing_value, ), } diff --git a/zipline/pipeline/mixins.py b/zipline/pipeline/mixins.py index e3ebf815..12491acd 100644 --- a/zipline/pipeline/mixins.py +++ b/zipline/pipeline/mixins.py @@ -47,6 +47,7 @@ class CustomTermMixin(object): inputs=NotSpecified, window_length=NotSpecified, dtype=NotSpecified, + missing_value=NotSpecified, **kwargs): unexpected_keys = set(kwargs) - set(cls.params) @@ -64,6 +65,7 @@ class CustomTermMixin(object): inputs=inputs, window_length=window_length, dtype=dtype, + missing_value=missing_value, **kwargs ) diff --git a/zipline/pipeline/term.py b/zipline/pipeline/term.py index 5f427bc7..5b730232 100644 --- a/zipline/pipeline/term.py +++ b/zipline/pipeline/term.py @@ -15,7 +15,10 @@ from zipline.errors import ( WindowLengthNotSpecified, ) from zipline.utils.memoize import lazyval -from zipline.utils.numpy_utils import bool_dtype, default_fillvalue_for_dtype +from zipline.utils.numpy_utils import ( + bool_dtype, + default_missing_value_for_dtype, +) from zipline.utils.sentinel import sentinel @@ -32,6 +35,7 @@ class Term(with_metaclass(ABCMeta, object)): # These are NotSpecified because a subclass is required to provide them. dtype = NotSpecified domain = NotSpecified + missing_value = NotSpecified # Subclasses aren't required to provide `params`. The default behavior is # no params. @@ -42,6 +46,7 @@ class Term(with_metaclass(ABCMeta, object)): def __new__(cls, domain=domain, dtype=dtype, + missing_value=missing_value, # params is explicitly not allowed to be passed to an instance. *args, **kwargs): @@ -55,18 +60,22 @@ class Term(with_metaclass(ABCMeta, object)): Caching previously-constructed Terms is **sane** because terms and their inputs are both conceptually immutable. """ - # Class-level attributes can be used to provide defaults for Term - # subclasses. - + # Subclasses can set override these class-level attributes to provide + # default values. if domain is NotSpecified: domain = cls.domain + if dtype is NotSpecified: + dtype = cls.dtype + if missing_value is NotSpecified: + missing_value = cls.missing_value - dtype = cls._validate_dtype(dtype) + dtype, missing_value = cls._validate_dtype(dtype, missing_value) params = cls._pop_params(kwargs) identity = cls.static_identity( domain=domain, dtype=dtype, + missing_value=missing_value, params=params, *args, **kwargs ) @@ -78,6 +87,7 @@ class Term(with_metaclass(ABCMeta, object)): super(Term, cls).__new__(cls)._init( domain=domain, dtype=dtype, + missing_value=missing_value, params=params, *args, **kwargs ) @@ -132,9 +142,9 @@ class Term(with_metaclass(ABCMeta, object)): return tuple(zip(cls.params, param_values)) @classmethod - def _validate_dtype(cls, passed_dtype): + def _validate_dtype(cls, passed_dtype, missing_value): """ - Validate a `dtype` passed to Term.__new__. + Validate `dtype` passed to Term.__new__. If passed_dtype is NotSpecified, then we try to fall back to a class-level attribute. If a value is found at that point, we pass it @@ -156,15 +166,17 @@ class Term(with_metaclass(ABCMeta, object)): coercible to a numpy dtype. """ dtype = passed_dtype - if dtype is NotSpecified: - dtype = cls.dtype if dtype is NotSpecified: raise DTypeNotSpecified(termname=cls.__name__) try: dtype = dtype_class(dtype) except TypeError: raise InvalidDType(dtype=dtype, termname=cls.__name__) - return dtype + + if missing_value is NotSpecified: + missing_value = default_missing_value_for_dtype(dtype) + + return dtype, missing_value def __init__(self, *args, **kwargs): """ @@ -183,7 +195,7 @@ class Term(with_metaclass(ABCMeta, object)): pass @classmethod - def static_identity(cls, domain, dtype, params): + def static_identity(cls, domain, dtype, missing_value, params): """ Return the identity of the Term that would be constructed from the given arguments. @@ -195,9 +207,9 @@ class Term(with_metaclass(ABCMeta, object)): This is a classmethod so that it can be called from Term.__new__ to determine whether to produce a new instance. """ - return (cls, domain, dtype, params) + return (cls, domain, dtype, missing_value, params) - def _init(self, domain, dtype, params): + def _init(self, domain, dtype, missing_value, params): """ Parameters ---------- @@ -210,6 +222,7 @@ class Term(with_metaclass(ABCMeta, object)): """ self.domain = domain self.dtype = dtype + self.missing_value = missing_value for name, value in params: if hasattr(self, name): @@ -268,10 +281,6 @@ class Term(with_metaclass(ABCMeta, object)): return not any(dep for dep in self.dependencies if dep is not AssetExists()) - @lazyval - def missing_value(self): - return default_fillvalue_for_dtype(self.dtype) - class AssetExists(Term): """ diff --git a/zipline/utils/numpy_utils.py b/zipline/utils/numpy_utils.py index aba56b26..4cc37aff 100644 --- a/zipline/utils/numpy_utils.py +++ b/zipline/utils/numpy_utils.py @@ -16,7 +16,10 @@ from toolz import flip uint8_dtype = dtype('uint8') bool_dtype = dtype('bool') int64_dtype = dtype('int64') + +float32_dtype = dtype('float32') float64_dtype = dtype('float64') + datetime64D_dtype = dtype('datetime64[D]') datetime64ns_dtype = dtype('datetime64[ns]') @@ -33,16 +36,27 @@ NaTD = NaT_for_dtype(datetime64D_dtype) _FILLVALUE_DEFAULTS = { + bool_dtype: False, + float32_dtype: nan, float64_dtype: nan, datetime64ns_dtype: NaTns, } -def default_fillvalue_for_dtype(dtype): +class NoDefaultMissingValue(Exception): + pass + + +def default_missing_value_for_dtype(dtype): """ Get the default fill value for `dtype`. """ - return _FILLVALUE_DEFAULTS[dtype] + try: + return _FILLVALUE_DEFAULTS[dtype] + except KeyError: + raise NoDefaultMissingValue( + "No default value registered for dtype %s." % dtype + ) def repeat_first_axis(array, count): From 09be7acaa8bf6d7a23e58890bca9063be187a2ea Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 20 Jan 2016 22:57:02 -0500 Subject: [PATCH 10/20] TEST: Test forwarding of missing_value. --- tests/pipeline/test_term.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index eebcbd41..ce86b24a 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -18,6 +18,7 @@ from zipline.pipeline.data.testing import TestingDataSet from zipline.pipeline.term import AssetExists, NotSpecified from zipline.pipeline.expression import NUMEXPR_MATH_FUNCS from zipline.utils.numpy_utils import ( + bool_dtype, datetime64ns_dtype, float64_dtype, int64_dtype, @@ -335,10 +336,19 @@ class ObjectIdentityTestCase(TestCase): SomeFactor(dtype=1) def test_latest_on_different_dtypes(self): - self.assertIsInstance(TestingDataSet.bool_col.latest, Filter) - self.assertIsInstance(TestingDataSet.float_col.latest, Factor) - self.assertIsInstance(TestingDataSet.datetime_col.latest, Factor) - self.assertIsInstance(TestingDataSet.int_col.latest, Factor) + factor_dtypes = (int64_dtype, float64_dtype, datetime64ns_dtype) + for column in TestingDataSet.columns: + if column.dtype == bool_dtype: + self.assertIsInstance(column.latest, Filter) + elif column.dtype in factor_dtypes: + self.assertIsInstance(column.latest, Factor) + else: + self.fail( + "Unknown dtype %s for column %s" % (column.dtype, column) + ) + # These should be the same value, plus this has the convenient + # property of correctly handling `NaN`. + self.assertIs(column.missing_value, column.latest.missing_value) def test_failure_timing_on_bad_missing_values(self): From 3cfc22ed77c0ceb4b22f1667f89d053d7f99fc59 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 20 Jan 2016 23:21:00 -0500 Subject: [PATCH 11/20] DOC: Add whatsnew. --- docs/source/whatsnew/0.8.4.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/source/whatsnew/0.8.4.txt b/docs/source/whatsnew/0.8.4.txt index 2b86d8f1..9531fead 100644 --- a/docs/source/whatsnew/0.8.4.txt +++ b/docs/source/whatsnew/0.8.4.txt @@ -17,6 +17,11 @@ Highlights * :class:`~zipline.assets.assets.AssetFinder` speedups (:issue:`830` and :issue:`817`). +* Improved support for non-float dtypes in Pipeline. Most notably, we now + support ``datetime64`` and ``int64`` dtypes for ``Factor``, and + ``BoundColumn.latest`` now returns a proper ``Filter`` object when the column + is of dtype ``bool``. + Enhancements ~~~~~~~~~~~~ @@ -83,6 +88,17 @@ Enhancements data that is timestamped on or after ``8:45`` will not seen on that day in the simulation. The data will be made available on the next day (:issue:`947`). +* ``BoundColumn.latest`` now returns a + :class:`~zipline.pipeline.filters.Filter` for columns of dtype + ``bool`` (:issue:`962`). + +* Added support for :class:`~zipline.pipeline.factors.Factor` instances with + ``int64`` dtype. :class:`~zipline.pipeline.data.dataset.Column` now requires + a ``missing_value`` when dtype is integral. (:issue:`962`) + +* It is also now possible to specify custom ``missing_value`` values for + ``float``, ``datetime``, and ``bool`` Pipeline terms. (:issue:`962`) + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ From 0115cdc46c48e86286659bcf3f79f99ea24d3fca Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 21 Jan 2016 17:55:21 -0500 Subject: [PATCH 12/20] MAINT: Fail fast on unsupported dtypes. --- tests/pipeline/test_term.py | 22 ++++++++++++--- zipline/errors.py | 13 ++++++++- zipline/lib/adjusted_array.py | 26 ++++++++++++++++-- zipline/pipeline/data/dataset.py | 46 +++++++++++++++++--------------- zipline/pipeline/term.py | 40 ++++++++++++++++----------- zipline/utils/numpy_utils.py | 3 +++ 6 files changed, 106 insertions(+), 44 deletions(-) diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index ce86b24a..3f386907 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -8,8 +8,9 @@ from unittest import TestCase from zipline.errors import ( DTypeNotSpecified, InputTermNotAtomic, - InvalidDType, + NotDType, TermInputsNotSpecified, + UnsupportedDType, WindowLengthNotSpecified, ) from zipline.pipeline import Factor, Filter, TermGraph @@ -19,6 +20,7 @@ from zipline.pipeline.term import AssetExists, NotSpecified from zipline.pipeline.expression import NUMEXPR_MATH_FUNCS from zipline.utils.numpy_utils import ( bool_dtype, + complex128_dtype, datetime64ns_dtype, float64_dtype, int64_dtype, @@ -332,9 +334,15 @@ class ObjectIdentityTestCase(TestCase): with self.assertRaises(DTypeNotSpecified): SomeFactorNoDType() - with self.assertRaises(InvalidDType): + with self.assertRaises(NotDType): SomeFactor(dtype=1) + with self.assertRaises(NoDefaultMissingValue): + SomeFactor(dtype=int64_dtype) + + with self.assertRaises(UnsupportedDType): + SomeFactor(dtype=complex128_dtype) + def test_latest_on_different_dtypes(self): factor_dtypes = (int64_dtype, float64_dtype, datetime64ns_dtype) for column in TestingDataSet.columns: @@ -350,11 +358,10 @@ class ObjectIdentityTestCase(TestCase): # property of correctly handling `NaN`. self.assertIs(column.missing_value, column.latest.missing_value) - def test_failure_timing_on_bad_missing_values(self): + def test_failure_timing_on_bad_dtypes(self): # Just constructing a bad column shouldn't fail. Column(dtype=int64_dtype) - with self.assertRaises(NoDefaultMissingValue) as e: class BadDataSet(DataSet): bad_column = Column(dtype=int64_dtype) @@ -367,6 +374,13 @@ class ObjectIdentityTestCase(TestCase): ) ) + Column(dtype=complex128_dtype) + with self.assertRaises(UnsupportedDType): + class BadDataSetComplex(DataSet): + bad_column = Column(dtype=complex128_dtype) + float_column = Column(dtype=float64_dtype) + int_column = Column(dtype=int64_dtype, missing_value=3) + class SubDataSetTestCase(TestCase): def test_subdataset(self): diff --git a/zipline/errors.py b/zipline/errors.py index 57e2df7d..95feb581 100644 --- a/zipline/errors.py +++ b/zipline/errors.py @@ -396,7 +396,7 @@ class DTypeNotSpecified(ZiplineError): ) -class InvalidDType(ZiplineError): +class NotDType(ZiplineError): """ Raised when a pipeline Term is constructed with a dtype that isn't a numpy dtype object. @@ -407,6 +407,17 @@ class InvalidDType(ZiplineError): ) +class UnsupportedDType(ZiplineError): + """ + Raised when a pipeline Term is constructed with a dtype that's not + supported. + """ + msg = ( + "Failed to construct {termname}.\n" + "Pipeline terms of dtype {dtype} are not yet supported." + ) + + class BadPercentileBounds(ZiplineError): """ Raised by API functions accepting percentile bounds when the passed bounds diff --git a/zipline/lib/adjusted_array.py b/zipline/lib/adjusted_array.py index 03556528..4c0ee5b8 100644 --- a/zipline/lib/adjusted_array.py +++ b/zipline/lib/adjusted_array.py @@ -7,6 +7,8 @@ from numpy import ( float64, int32, int64, + int16, + uint16, ndarray, uint32, uint8, @@ -29,13 +31,33 @@ from ._int64window import AdjustedArrayWindow as Int64Window from ._uint8window import AdjustedArrayWindow as UInt8Window NOMASK = None +BOOL_DTYPES = frozenset( + map(dtype, [bool_]), +) FLOAT_DTYPES = frozenset( - map(dtype, [float32, float64, int32]), + map(dtype, [float32, float64]), ) INT_DTYPES = frozenset( # NOTE: uint64 not supported because it can't be safely cast to int64. - map(dtype, [int32, int64, uint32]), + map(dtype, [int16, uint16, int32, int64, uint32]), ) +DATETIME_DTYPES = frozenset( + map(dtype, ['datetime64[ns]', 'datetime64[D]']), +) +REPRESENTABLE_DTYPES = BOOL_DTYPES.union( + FLOAT_DTYPES, + INT_DTYPES, + DATETIME_DTYPES +) + + +def can_represent_dtype(dtype): + """ + Can we build an AdjustedArray for a baseline of dtype ``dtype``? + """ + return dtype in REPRESENTABLE_DTYPES + + CONCRETE_WINDOW_TYPES = { float64_dtype: Float64Window, int64_dtype: Int64Window, diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index 7183dea8..6ded1f13 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -7,7 +7,11 @@ from six import ( with_metaclass, ) -from zipline.pipeline.term import Term, AssetExists, NotSpecified +from zipline.pipeline.term import ( + Term, + AssetExists, + NotSpecified, +) from zipline.utils.input_validation import ensure_dtype from zipline.utils.numpy_utils import ( bool_dtype, @@ -47,26 +51,26 @@ class _BoundColumnDescr(object): parent classes. """ def __init__(self, dtype, missing_value, name): - self.dtype = dtype - - # Calculating missing values here guarantees that we fail quickly if - # the user fails to provide a missing value for a dtype that requires - # one (e.g. int64), but still enables us to provide an error message - # that points to the name of the failing column. - if missing_value is NotSpecified: - try: - missing_value = default_missing_value_for_dtype(dtype) - except NoDefaultMissingValue: - # Re-raise with a better message. - raise NoDefaultMissingValue( - "Failed to create Column with name {name!r} and" - " dtype {dtype} because no missing_value was provided\n\n" - "Columns with dtype {dtype} require a missing_value.\n" - "Please pass missing_value to Column() or use a different" - " dtype.".format(dtype=dtype, name=name) - ) - - self.missing_value = missing_value + # Validating and calculating default missing values here guarantees + # that we fail quickly if the user passes an unsupporte dtype or fails + # to provide a missing value for a dtype that requires one + # (e.g. int64), but still enables us to provide an error message that + # points to the name of the failing column. + try: + self.dtype, self.missing_value = Term.validate_dtype( + termname="Column(name={name!r})".format(name=name), + dtype=dtype, + missing_value=missing_value, + ) + except NoDefaultMissingValue: + # Re-raise with a more specific message. + raise NoDefaultMissingValue( + "Failed to create Column with name {name!r} and" + " dtype {dtype} because no missing_value was provided\n\n" + "Columns with dtype {dtype} require a missing_value.\n" + "Please pass missing_value to Column() or use a different" + " dtype.".format(dtype=dtype, name=name) + ) self.name = name def __get__(self, instance, owner): diff --git a/zipline/pipeline/term.py b/zipline/pipeline/term.py index 5b730232..3cb9cc63 100644 --- a/zipline/pipeline/term.py +++ b/zipline/pipeline/term.py @@ -6,14 +6,15 @@ from weakref import WeakValueDictionary from numpy import dtype as dtype_class from six import with_metaclass - from zipline.errors import ( DTypeNotSpecified, InputTermNotAtomic, - InvalidDType, + NotDType, TermInputsNotSpecified, + UnsupportedDType, WindowLengthNotSpecified, ) +from zipline.lib.adjusted_array import can_represent_dtype from zipline.utils.memoize import lazyval from zipline.utils.numpy_utils import ( bool_dtype, @@ -69,7 +70,11 @@ class Term(with_metaclass(ABCMeta, object)): if missing_value is NotSpecified: missing_value = cls.missing_value - dtype, missing_value = cls._validate_dtype(dtype, missing_value) + dtype, missing_value = cls.validate_dtype( + cls.__name__, + dtype, + missing_value, + ) params = cls._pop_params(kwargs) identity = cls.static_identity( @@ -141,37 +146,40 @@ class Term(with_metaclass(ABCMeta, object)): ) return tuple(zip(cls.params, param_values)) - @classmethod - def _validate_dtype(cls, passed_dtype, missing_value): + @staticmethod + def validate_dtype(termname, dtype, missing_value): """ - Validate `dtype` passed to Term.__new__. + Validate a `dtype` and `missing_value` passed to Term.__new__. - If passed_dtype is NotSpecified, then we try to fall back to a - class-level attribute. If a value is found at that point, we pass it - to np.dtype so that users can pass `float` or `bool` and have them - coerce to the appropriate numpy types. + Ensures that we know how to represent ``dtype``, and that missing_value + is specified for types without default missing values. Returns ------- - validated : np.dtype - The dtype to use for the new term. + validated_dtype, validated_missing_value : np.dtype, any + The dtype and missing_value to use for the new term. Raises ------ DTypeNotSpecified When no dtype was passed to the instance, and the class doesn't provide a default. - InvalidDType + NotDType When either the class or the instance provides a value not coercible to a numpy dtype. + NoDefaultMissingValue + When dtype requires an explicit missing_value, but + ``missing_value`` is NotSpecified. """ - dtype = passed_dtype if dtype is NotSpecified: - raise DTypeNotSpecified(termname=cls.__name__) + raise DTypeNotSpecified(termname=termname) try: dtype = dtype_class(dtype) except TypeError: - raise InvalidDType(dtype=dtype, termname=cls.__name__) + raise NotDType(dtype=dtype, termname=termname) + + if not can_represent_dtype(dtype): + raise UnsupportedDType(dtype=dtype, termname=termname) if missing_value is NotSpecified: missing_value = default_missing_value_for_dtype(dtype) diff --git a/zipline/utils/numpy_utils.py b/zipline/utils/numpy_utils.py index 4cc37aff..56315cf7 100644 --- a/zipline/utils/numpy_utils.py +++ b/zipline/utils/numpy_utils.py @@ -15,11 +15,14 @@ from toolz import flip uint8_dtype = dtype('uint8') bool_dtype = dtype('bool') + int64_dtype = dtype('int64') float32_dtype = dtype('float32') float64_dtype = dtype('float64') +complex128_dtype = dtype('complex128') + datetime64D_dtype = dtype('datetime64[D]') datetime64ns_dtype = dtype('datetime64[ns]') From 3739df0da6afa6c6c5751f1e05fe2b20dfb1101e Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 21 Jan 2016 20:54:29 -0500 Subject: [PATCH 13/20] STY: Flake8. --- zipline/pipeline/data/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zipline/pipeline/data/dataset.py b/zipline/pipeline/data/dataset.py index 6ded1f13..0362a183 100644 --- a/zipline/pipeline/data/dataset.py +++ b/zipline/pipeline/data/dataset.py @@ -15,7 +15,6 @@ from zipline.pipeline.term import ( from zipline.utils.input_validation import ensure_dtype from zipline.utils.numpy_utils import ( bool_dtype, - default_missing_value_for_dtype, NoDefaultMissingValue, ) from zipline.utils.preprocess import preprocess From 58b594a6e5632678e366566794ba26b4dceb15e4 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 21 Jan 2016 20:59:32 -0500 Subject: [PATCH 14/20] STY: Typo. --- zipline/utils/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zipline/utils/test_utils.py b/zipline/utils/test_utils.py index f989e8eb..3f9a5895 100644 --- a/zipline/utils/test_utils.py +++ b/zipline/utils/test_utils.py @@ -324,7 +324,7 @@ def make_simple_equity_info(sids, start_date, end_date, symbols=None): sids : array-like of int start_date : pd.Timestamp end_date : pd.Timestamp - symbols : list, optionaln + symbols : list, optional Symbols to use for the assets. If not provided, symbols are generated from the sequence 'A', 'B', ... From cbb59747af48ae60473f27b6de976a08a741ab54 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 21 Jan 2016 21:03:32 -0500 Subject: [PATCH 15/20] TEST: Add test for parameter_space ordering. --- tests/test_test_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_test_utils.py b/tests/test_test_utils.py index 861d10bc..e62dc7a3 100644 --- a/tests/test_test_utils.py +++ b/tests/test_test_utils.py @@ -14,16 +14,24 @@ class TestParameterSpace(TestCase): @classmethod def setUpClass(cls): cls.xy_invocations = [] + cls.yx_invocations = [] @classmethod def tearDownClass(cls): # This is the only actual test here. assert cls.xy_invocations == list(product(cls.x_args, cls.y_args)) + assert cls.yx_invocations == list(product(cls.y_args, cls.x_args)) @parameter_space(x=x_args, y=y_args) def test_xy(self, x, y): self.xy_invocations.append((x, y)) + @parameter_space(x=x_args, y=y_args) + def test_yx(self, y, x): + # Ensure that product is called with args in the order that they appear + # in the function's parameter list. + self.yx_invocations.append((y, x)) + def test_nothing(self): # Ensure that there's at least one "real" test in the class, or else # our {setUp,tearDown}Class won't be called if, for example, From f27b415c8366137250353dab62e549e45d07d14f Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 21 Jan 2016 21:09:20 -0500 Subject: [PATCH 16/20] DOC: Docs updates for subtest and parameter_space. --- zipline/utils/test_utils.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/zipline/utils/test_utils.py b/zipline/utils/test_utils.py index 3f9a5895..ad88c4fc 100644 --- a/zipline/utils/test_utils.py +++ b/zipline/utils/test_utils.py @@ -567,13 +567,18 @@ class SubTestFailures(AssertionError): def subtest(iterator, *_names): - """Construct a subtest in a unittest. + """ + Construct a subtest in a unittest. - This works by decorating a function as a subtest. The test will be run - by iterating over the ``iterator`` and *unpacking the values into the - function. If any of the runs fail, the result will be put into a set and - the rest of the tests will be run. Finally, if any failed, all of the - results will be dumped as one failure. + Consider using ``zipline.utils.test_utils.parameter_space`` when subtests + are constructed over a single input or over the cross-product of multiple + inputs. + + ``subtest`` works by decorating a function as a subtest. The decorated + function will be run by iterating over the ``iterator`` and *unpacking the + values into the function. If any of the runs fail, the result will be put + into a set and the rest of the tests will be run. Finally, if any failed, + all of the results will be dumped as one failure. Parameters ---------- @@ -615,6 +620,10 @@ def subtest(iterator, *_names): We cannot use ``unittest2.TestCase.subTest`` because nose, pytest, and nose2 do not support ``addSubTest``. + + See Also + -------- + zipline.utils.test_utils.parameter_space """ def dec(f): @wraps(f) @@ -735,11 +744,16 @@ def parameter_space(**params): Usage ----- + >>> from unittest import TestCase >>> class SomeTestCase(TestCase): ... @parameter_space(x=[1, 2], y=[2, 3]) ... def test_some_func(self, x, y): ... # Will be called with every possible combination of x and y. ... self.assertEqual(somefunc(x, y), expected_result(x, y)) + + See Also + -------- + zipline.utils.test_utils.subtest """ def decorator(f): From b26a40b2980789019571129cae7c5b27e42f7e37 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 16 Feb 2016 11:50:10 -0500 Subject: [PATCH 17/20] BUG: Check against assets, not sids in pipeline output. --- tests/pipeline/test_column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/test_column.py b/tests/pipeline/test_column.py index 26d5e608..3cf08b48 100644 --- a/tests/pipeline/test_column.py +++ b/tests/pipeline/test_column.py @@ -38,7 +38,7 @@ class LatestTestCase(TestCase): return DataFrame( loader.values(column.dtype, self.calendar, self.sids)[slice_], index=self.calendar[slice_], - columns=self.sids, + columns=self.assets, ) def test_latest(self): From 4bc42d5658cce958450555d8e0121c99c07291f0 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 16 Feb 2016 12:35:49 -0500 Subject: [PATCH 18/20] BUG: Don't return a map from chrange. --- zipline/utils/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zipline/utils/test_utils.py b/zipline/utils/test_utils.py index ad88c4fc..18e949e5 100644 --- a/zipline/utils/test_utils.py +++ b/zipline/utils/test_utils.py @@ -18,7 +18,7 @@ from numpy.testing import assert_allclose, assert_array_equal import pandas as pd from pandas.tseries.offsets import MonthBegin from six import iteritems, itervalues -from six.moves import filter +from six.moves import filter, map from sqlalchemy import create_engine from toolz import concat @@ -260,10 +260,10 @@ def chrange(start, stop): Example ------- - >>> list(chrange('A', 'C')) + >>> chrange('A', 'C') ['A', 'B', 'C'] """ - return map(chr, range(ord(start), ord(stop) + 1)) + return list(map(chr, range(ord(start), ord(stop) + 1))) def make_rotating_equity_info(num_assets, From d889f8b08bf942009f2a80918f8ea0e70f8ff71e Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 16 Feb 2016 13:43:25 -0500 Subject: [PATCH 19/20] BUG: Don't use deprecated attribute of exception. --- tests/pipeline/test_term.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/test_term.py b/tests/pipeline/test_term.py index 3f386907..7d630a01 100644 --- a/tests/pipeline/test_term.py +++ b/tests/pipeline/test_term.py @@ -369,7 +369,7 @@ class ObjectIdentityTestCase(TestCase): int_column = Column(dtype=int64_dtype, missing_value=3) self.assertTrue( - str(e.exception.message).startswith( + str(e.exception.args[0]).startswith( "Failed to create Column with name 'bad_column'" ) ) From 6287987c0bc776b9077f13cf97587f7601f2d494 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 16 Feb 2016 13:43:56 -0500 Subject: [PATCH 20/20] BUG: Work around scipy >= 0.17 changing dtype of rankdata. --- zipline/lib/rank.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zipline/lib/rank.pyx b/zipline/lib/rank.pyx index 52b36ed0..84a57d07 100644 --- a/zipline/lib/rank.pyx +++ b/zipline/lib/rank.pyx @@ -58,6 +58,11 @@ def masked_rankdata_2d(ndarray data, # the extra work that apply_along_axis does. result = apply_along_axis(rankdata, 1, data, method=method) + # On SciPy >= 0.17, rankdata returns integers for any method except + # average. + if result.dtype.name != 'float64': + result = result.astype('float64') + # rankdata will sort missing values into last place, but we want our nans # to propagate, so explicitly re-apply. result[missing_locations] = nan