Files
catalyst/tests/pipeline/test_column.py
T
Scott Sanderson 5f190395ad ENH: Add support for strings in Pipeline.
- Adds a new class, ``LabelArray``, which is a subclass of np.ndarray.
  LabelArray is conceptually similar to pandas.Categorical, in that it
  stores data with many duplicate values as indices into an array of
  unique values.  For string data with many duplicates (e.g. time-series
  of tickers or industry classifications), this provides multiple
  orders of magnitude of improvement when doing string operations,
  especially string comparison/matching operations.

- Adds a new generic object "specialization" for `AdjustedArrayWindow`,
  and a corresponding ObjectOverwrite adjustment.

- Adds a new ``postprocess`` method to ``zipline.pipeline.term.Term``.
  This method is called on the final result of any pipeline expression
  after screen filtering has occurred. The default implementation of
  ``postprocess`` is identity, but Classifier overrides it to coerce
  string columns into pandas.Categoricals before presenting them to the
  user.
2016-05-04 15:50:52 -04:00

77 lines
2.3 KiB
Python

"""
Tests BoundColumn attributes and methods.
"""
from contextlib2 import ExitStack
from unittest import TestCase
from pandas import date_range, DataFrame
from pandas.util.testing import assert_frame_equal
from zipline.lib.labelarray import LabelArray
from zipline.pipeline import Pipeline
from zipline.pipeline.data.testing import TestingDataSet as TDS
from zipline.testing import chrange, temp_pipeline_engine
class LatestTestCase(TestCase):
    """
    Checks that ``column.latest`` for every column of ``TestingDataSet``
    matches the values produced directly by the engine's loader.

    One pipeline engine is entered as a context manager in ``setUpClass``
    and shared by all tests; it is released in ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        """
        Create the shared calendar, sids, pipeline engine and assets.
        """
        cls.calendar = cal = date_range('2014', '2015', freq='D', tz='UTC')
        cls.sids = list(range(5))

        # unittest does not call tearDownClass when setUpClass raises, so
        # run the fallible part of setup inside a ``with`` block: if
        # anything below fails, every context entered so far is exited
        # instead of being leaked.
        with ExitStack() as stack:
            cls.engine = stack.enter_context(
                temp_pipeline_engine(
                    cal,
                    cls.sids,
                    random_seed=100,
                    symbols=chrange('A', 'E'),
                ),
            )
            cls.assets = cls.engine._finder.retrieve_all(cls.sids)

            # Setup succeeded: move the registered cleanups onto a fresh
            # stack so that leaving this ``with`` block is a no-op and
            # tearDownClass becomes responsible for closing them.
            cls._stack = stack.pop_all()

    @classmethod
    def tearDownClass(cls):
        """
        Exit every context manager entered during ``setUpClass``.
        """
        cls._stack.close()

    def expected_latest(self, column, slice_):
        """
        Build the frame that ``column.latest`` is expected to produce.

        Parameters
        ----------
        column : object
            Column under test.  Must expose ``dtype`` and, for string-like
            dtypes, ``missing_value``; it is also used to look up the
            loader on the shared engine.
        slice_ : slice
            Slice of ``self.calendar`` selecting the dates under test.

        Returns
        -------
        expected : pd.DataFrame
            Loader values for the selected dates, indexed by those dates
            with ``self.assets`` as columns.  For string-like dtypes this
            is whatever ``LabelArray.as_categorical_frame`` returns
            (presumably a frame of categoricals).
        """
        loader = self.engine.get_loader(column)

        # Compute each piece once and share it between both branches.
        index = self.calendar[slice_]
        columns = self.assets
        values = loader.values(column.dtype, self.calendar, self.sids)[slice_]

        if column.dtype.kind in ('O', 'S', 'U'):
            # For string columns, we expect a categorical in the output.
            return LabelArray(
                values,
                missing_value=column.missing_value,
            ).as_categorical_frame(
                index=index,
                columns=columns,
            )

        return DataFrame(values, index=index, columns=columns)

    def test_latest(self):
        """
        Run a pipeline containing ``latest`` for every ``TDS`` column and
        compare each output column against ``expected_latest``.
        """
        columns = TDS.columns
        pipe = Pipeline(
            columns={c.name: c.latest for c in columns},
        )

        cal_slice = slice(20, 40)
        dates_to_test = self.calendar[cal_slice]
        result = self.engine.run_pipeline(
            pipe,
            dates_to_test[0],
            dates_to_test[-1],
        )

        for column in columns:
            # unstack() pivots the inner index level into columns so the
            # pipeline output has the same layout as the expected frame
            # (presumably dates x assets).
            col_result = result[column.name].unstack()
            expected_col_result = self.expected_latest(column, cal_slice)
            assert_frame_equal(col_result, expected_col_result)