From 8022a47f7476090213aafc1196c4d8c4c3ee56c8 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 9 Mar 2016 10:29:10 -0500 Subject: [PATCH] BUG: fix previous frame logic and buyback auth tests WIP: earnings test case separated works WIP: fix/reorganize buyback auth. MAINT: clean up MAINT: fix merge conflicts --- tests/pipeline/base.py | 138 ++------------------- tests/pipeline/test_buyback_auth.py | 181 ++++++++++++++-------------- tests/pipeline/test_earnings.py | 140 ++++++++++++++++++--- zipline/pipeline/loaders/events.py | 20 +-- zipline/pipeline/loaders/utils.py | 96 ++++++--------- 5 files changed, 278 insertions(+), 297 deletions(-) diff --git a/tests/pipeline/base.py b/tests/pipeline/base.py index 8da70e5f..f07c00b7 100644 --- a/tests/pipeline/base.py +++ b/tests/pipeline/base.py @@ -1,7 +1,8 @@ """ Base class for Pipeline API unittests. """ -from functools import wraps, partial +import abc +from functools import wraps from unittest import TestCase from nose_parameterized import parameterized @@ -13,7 +14,6 @@ from pandas.util.testing import assert_series_equal from six import iteritems from zipline.pipeline import Pipeline -from zipline.pipeline.common import TS_FIELD_NAME from zipline.pipeline.engine import SimplePipelineEngine from zipline.pipeline.term import AssetExists from zipline.testing import ( @@ -148,43 +148,18 @@ class BasePipelineTestCase(TestCase): return arange(prod(shape), dtype=dtype).reshape(shape) -DATE_FIELD_NAME = "event_date" - - class EventLoaderCommonMixin(object): - sids = A, B, C, D, E = range(5) - equity_info = make_simple_equity_info( - sids, - start_date=pd.Timestamp('2013-01-01', tz='UTC'), - end_date=pd.Timestamp('2015-01-01', tz='UTC'), - ) + @abc.abstractproperty + def get_sids(cls): + raise NotImplementedError('get_sids') - event_dates_cases = [ - # K1--K2--E1--E2. 
- pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - DATE_FIELD_NAME: pd.to_datetime(['2014-01-15', '2014-01-20']) - }), - # K1--K2--E2--E1. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - DATE_FIELD_NAME: pd.to_datetime(['2014-01-20', '2014-01-15']) - }), - # K1--E1--K2--E2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), - DATE_FIELD_NAME: pd.to_datetime(['2014-01-10', '2014-01-20']) - }), - # K1 == K2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), - DATE_FIELD_NAME: pd.to_datetime(['2014-01-10', '2014-01-15']) - }), - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime([]), - DATE_FIELD_NAME: pd.to_datetime([]) - }) - ] + @classmethod + def get_equity_info(cls): + return make_simple_equity_info( + cls.get_sids(), + start_date=pd.Timestamp('2013-01-01', tz='UTC'), + end_date=pd.Timestamp('2015-01-01', tz='UTC'), + ) def zip_with_floats(self, dates, flts): return pd.Series(flts, index=dates).astype('float') @@ -217,93 +192,6 @@ class EventLoaderCommonMixin(object): loader = self.loader_type(*self.loader_args(dates)) return SimplePipelineEngine(lambda _: loader, dates, self.finder) - def get_expected_next_event_dates(self, dates): - num_days_between_for_dates = partial(self.num_days_between, dates) - zip_with_dates_for_dates = partial(self.zip_with_dates, dates) - return pd.DataFrame({ - 0: zip_with_dates_for_dates( - ['NaT'] * - num_days_between_for_dates(None, '2014-01-04') + - ['2014-01-15'] * - num_days_between_for_dates('2014-01-05', '2014-01-15') + - ['2014-01-20'] * - num_days_between_for_dates('2014-01-16', '2014-01-20') + - ['NaT'] * - num_days_between_for_dates('2014-01-21', None) - ), - 1: zip_with_dates_for_dates( - ['NaT'] * - num_days_between_for_dates(None, '2014-01-04') + - ['2014-01-20'] * - num_days_between_for_dates('2014-01-05', '2014-01-09') + - ['2014-01-15'] * - num_days_between_for_dates('2014-01-10', '2014-01-15') + - 
['2014-01-20'] * - num_days_between_for_dates('2014-01-16', '2014-01-20') + - ['NaT'] * - num_days_between_for_dates('2014-01-21', None) - ), - 2: zip_with_dates_for_dates( - ['NaT'] * - num_days_between_for_dates(None, '2014-01-04') + - ['2014-01-10'] * - num_days_between_for_dates('2014-01-05', '2014-01-10') + - ['NaT'] * - num_days_between_for_dates('2014-01-11', '2014-01-14') + - ['2014-01-20'] * - num_days_between_for_dates('2014-01-15', '2014-01-20') + - ['NaT'] * - num_days_between_for_dates('2014-01-21', None) - ), - 3: zip_with_dates_for_dates( - ['NaT'] * - num_days_between_for_dates(None, '2014-01-04') + - ['2014-01-10'] * - num_days_between_for_dates('2014-01-05', '2014-01-10') + - ['2014-01-15'] * - num_days_between_for_dates('2014-01-11', '2014-01-15') + - ['NaT'] * - num_days_between_for_dates('2014-01-16', None) - ), - 4: zip_with_dates_for_dates(['NaT'] * - len(dates)), - }, index=dates) - - def get_expected_previous_event_dates(self, dates): - num_days_between_for_dates = partial(self.num_days_between, dates) - zip_with_dates_for_dates = partial(self.zip_with_dates, dates) - return pd.DataFrame({ - 0: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + - ['2014-01-15'] * num_days_between_for_dates('2014-01-15', - '2014-01-19') + - ['2014-01-20'] * num_days_between_for_dates('2014-01-20', - None), - ), - 1: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + - ['2014-01-15'] * num_days_between_for_dates('2014-01-15', - '2014-01-19') + - ['2014-01-20'] * num_days_between_for_dates('2014-01-20', - None), - ), - 2: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + - ['2014-01-10'] * num_days_between_for_dates('2014-01-10', - '2014-01-19') + - ['2014-01-20'] * num_days_between_for_dates('2014-01-20', - None), - ), - 3: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + - ['2014-01-10'] * 
num_days_between_for_dates('2014-01-10', - '2014-01-14') + - ['2014-01-15'] * num_days_between_for_dates('2014-01-15', - None), - ), - 4: zip_with_dates_for_dates(['NaT'] * len(dates)), - }, index=dates) - @staticmethod def _compute_busday_offsets(announcement_dates): """ @@ -364,7 +252,7 @@ class EventLoaderCommonMixin(object): end_date=dates[-1], ) - for sid in self.sids: + for sid in self.get_sids(): for col_name in self.cols.keys(): assert_series_equal(result[col_name].xs(sid, level=1), self.cols[col_name][sid], diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index f97aa049..af3d35a1 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -7,8 +7,10 @@ from unittest import TestCase import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack +import itertools import pandas as pd from six import iteritems +from tests.pipeline.base import EventLoaderCommonMixin from zipline.pipeline.common import( BUYBACK_ANNOUNCEMENT_FIELD_NAME, @@ -38,51 +40,67 @@ from zipline.pipeline.loaders.blaze import ( BlazeShareBuybackAuthorizationsLoader, ) from zipline.testing import tmp_asset_finder -from .base import EventLoaderCommonMixin, DATE_FIELD_NAME +date_intervals = [[None, '2014-01-04'], ['2014-01-05', '2014-01-09'], + ['2014-01-10', None]] -buyback_authorizations = [ - # K1--K2--A1--A2. +buyback_authorizations_cases = [ pd.DataFrame({ SHARE_COUNT_FIELD_NAME: [1, 15], - CASH_FIELD_NAME: [10, 20] - }), - # K1--K2--A2--A1. - pd.DataFrame({ - SHARE_COUNT_FIELD_NAME: [7, 13], - CASH_FIELD_NAME: [10, 22] - }), - # K1--A1--K2--A2. - pd.DataFrame({ - SHARE_COUNT_FIELD_NAME: [3, 1], - CASH_FIELD_NAME: [4, 7] - }), - # K1 == K2. 
- pd.DataFrame({ - SHARE_COUNT_FIELD_NAME: [6, 23], - CASH_FIELD_NAME: [1, 2] + CASH_FIELD_NAME: [10, 20], + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-04', + '2014-01-09']) }), pd.DataFrame( columns=[SHARE_COUNT_FIELD_NAME, - CASH_FIELD_NAME], + CASH_FIELD_NAME, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + TS_FIELD_NAME], dtype='datetime64[ns]' ), ] -def create_buyback_auth_tst_frame(cases, field_to_drop): - buyback_auth_df = { - sid: - pd.concat([df, buyback_authorizations[sid]], axis=1).drop( - field_to_drop, 1) - for sid, df - in enumerate(case.rename(columns={DATE_FIELD_NAME: - BUYBACK_ANNOUNCEMENT_FIELD_NAME} - ) - for case in cases - ) - } - return buyback_auth_df +def get_values_for_date_ranges(zip_with_floats_dates, + num_days_between_dates, + vals_for_date_intervals): + # Fill in given values for given date ranges. + return zip_with_floats_dates( + list( + itertools.chain(*[ + [val] * num_days_between_dates(*date_intervals[i]) + for i, val in enumerate(vals_for_date_intervals) + ]) + ) + ) + + +def get_expected_previous_values(zip_with_floats_dates, + num_days_between_dates, + dates, + vals_for_date_intervals): + return pd.DataFrame({ + 0: get_values_for_date_ranges(zip_with_floats_dates, + num_days_between_dates, + vals_for_date_intervals), + 1: zip_with_floats_dates(['NaN'] * len(dates)), + }, index=dates) + + +def get_expected_previous_dates(zip_with_dates_for_dates, + num_days_between_for_dates, + dates): + return pd.DataFrame({ + 0: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-04'] * num_days_between_for_dates('2014-01-05', + '2014-01-09') + + ['2014-01-09'] * num_days_between_for_dates('2014-01-10', + None), + ), + 1: zip_with_dates_for_dates(['NaT'] * len(dates)) + }) class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonMixin): @@ -98,15 +116,21 @@ class CashBuybackAuthLoaderTestCase(TestCase, 
EventLoaderCommonMixin): BusinessDaysSinceCashBuybackAuth(), } + @classmethod + def get_sids(cls): + return range(2) + @classmethod def setUpClass(cls): cls._cleanup_stack = stack = ExitStack() cls.finder = stack.enter_context( - tmp_asset_finder(equities=cls.equity_info), + tmp_asset_finder(equities=cls.get_equity_info()), ) cls.cols = {} - cls.dataset = create_buyback_auth_tst_frame(cls.event_dates_cases, - SHARE_COUNT_FIELD_NAME) + cls.dataset = {sid: + frame.drop(SHARE_COUNT_FIELD_NAME, axis=1) + for sid, frame + in enumerate(buyback_authorizations_cases)} cls.loader_type = CashBuybackAuthorizationsLoader @classmethod @@ -116,31 +140,17 @@ class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonMixin): def setup(self, dates): zip_with_floats_dates = partial(self.zip_with_floats, dates) num_days_between_dates = partial(self.num_days_between, dates) - _expected_previous_cash = pd.DataFrame({ - 0: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-14') + - [10] * num_days_between_dates('2014-01-15', '2014-01-19') + - [20] * num_days_between_dates('2014-01-20', None) - ), - 1: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-14') + - [22] * num_days_between_dates('2014-01-15', '2014-01-19') + - [10] * num_days_between_dates('2014-01-20', None) - ), - 2: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-09') + - [4] * num_days_between_dates('2014-01-10', '2014-01-19') + - [7] * num_days_between_dates('2014-01-20', None) - ), - 3: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-09') + - [1] * num_days_between_dates('2014-01-10', '2014-01-14') + - [2] * num_days_between_dates('2014-01-15', None) - ), - 4: zip_with_floats_dates(['NaN'] * len(dates)), - }, index=dates) - self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = \ - self.get_expected_previous_event_dates(dates) + num_days_between_for_dates = partial(self.num_days_between, dates) + zip_with_dates_for_dates = 
partial(self.zip_with_dates, dates) + _expected_previous_cash = get_expected_previous_values( + zip_with_floats_dates, num_days_between_dates, dates, + ['NaN', 10, 20] + ) + self.cols[ + PREVIOUS_BUYBACK_ANNOUNCEMENT + ] = get_expected_previous_dates(zip_with_dates_for_dates, + num_days_between_for_dates, + dates) self.cols[PREVIOUS_BUYBACK_CASH] = _expected_previous_cash self.cols[DAYS_SINCE_PREV] = self._compute_busday_offsets( self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] @@ -160,15 +170,21 @@ class ShareBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonMixin): BusinessDaysSinceShareBuybackAuth(), } + @classmethod + def get_sids(cls): + return range(2) + @classmethod def setUpClass(cls): cls._cleanup_stack = stack = ExitStack() cls.finder = stack.enter_context( - tmp_asset_finder(equities=cls.equity_info), + tmp_asset_finder(equities=cls.get_equity_info()), ) cls.cols = {} - cls.dataset = create_buyback_auth_tst_frame(cls.event_dates_cases, - CASH_FIELD_NAME) + cls.dataset = {sid: + frame.drop(CASH_FIELD_NAME, axis=1) + for sid, frame + in enumerate(buyback_authorizations_cases)} cls.loader_type = ShareBuybackAuthorizationsLoader @classmethod @@ -178,34 +194,19 @@ class ShareBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonMixin): def setup(self, dates): zip_with_floats_dates = partial(self.zip_with_floats, dates) num_days_between_dates = partial(self.num_days_between, dates) - _expected_previous_buyback_share_count = pd.DataFrame({ - 0: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-14') + - [1] * num_days_between_dates('2014-01-15', '2014-01-19') + - [15] * num_days_between_dates('2014-01-20', None) - ), - 1: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-14') + - [13] * num_days_between_dates('2014-01-15', '2014-01-19') + - [7] * num_days_between_dates('2014-01-20', None) - ), - 2: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-09') + - [3] * num_days_between_dates('2014-01-10', 
'2014-01-19') + - [1] * num_days_between_dates('2014-01-20', None) - ), - 3: zip_with_floats_dates( - ['NaN'] * num_days_between_dates(None, '2014-01-09') + - [6] * num_days_between_dates('2014-01-10', '2014-01-14') + - [23] * num_days_between_dates('2014-01-15', None) - ), - 4: zip_with_floats_dates(['NaN'] * len(dates)), - }, index=dates) + num_days_between_for_dates = partial(self.num_days_between, dates) + zip_with_dates_for_dates = partial(self.zip_with_dates, dates) + self.cols[ PREVIOUS_BUYBACK_SHARE_COUNT - ] = _expected_previous_buyback_share_count - self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = \ - self.get_expected_previous_event_dates(dates) + ] = get_expected_previous_values(zip_with_floats_dates, + num_days_between_dates, dates, + ['NaN', 1, 15]) + self.cols[ + PREVIOUS_BUYBACK_ANNOUNCEMENT + ] = get_expected_previous_dates(zip_with_dates_for_dates, + num_days_between_for_dates, + dates) self.cols[DAYS_SINCE_PREV] = self._compute_busday_offsets( self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] ) diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index cad01757..4e9dd8b1 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -1,6 +1,7 @@ """ Tests for the reference loader for EarningsCalendar. 
""" +from functools import partial from unittest import TestCase import blaze as bz @@ -8,6 +9,7 @@ from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack import pandas as pd from six import iteritems +from tests.pipeline.base import EventLoaderCommonMixin from zipline.pipeline.common import ( ANNOUNCEMENT_FIELD_NAME, @@ -27,11 +29,36 @@ from zipline.pipeline.loaders.earnings import EarningsCalendarLoader from zipline.pipeline.loaders.blaze import ( BlazeEarningsCalendarLoader, ) -from zipline.testing import ( - make_simple_equity_info, - tmp_asset_finder, -) -from .base import EventLoaderCommonMixin, DATE_FIELD_NAME + +from zipline.testing import tmp_asset_finder + +earnings_cases = [ + # K1--K2--A1--A2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', '2014-01-20']) + }), + # K1--K2--A2--A1. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', '2014-01-15']) + }), + # K1--A1--K2--A2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', '2014-01-20']) + }), + # K1 == K2. 
+ pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', '2014-01-15']) + }), + pd.DataFrame( + columns=[ANNOUNCEMENT_FIELD_NAME, + TS_FIELD_NAME], + dtype='datetime64[ns]' + ), +] class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonMixin): @@ -45,25 +72,108 @@ class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonMixin): DAYS_TO_NEXT: BusinessDaysUntilNextEarnings(), } + @classmethod + def get_sids(cls): + return range(5) + @classmethod def setUpClass(cls): cls._cleanup_stack = stack = ExitStack() - equity_info = make_simple_equity_info( - cls.sids, - start_date=pd.Timestamp('2013-01-01', tz='UTC'), - end_date=pd.Timestamp('2015-01-01', tz='UTC'), - ) cls.cols = {} - cls.dataset = {sid: df for sid, df in enumerate( - case.rename( - columns={DATE_FIELD_NAME: ANNOUNCEMENT_FIELD_NAME} - ) for case in cls.event_dates_cases)} + cls.dataset = {sid: df for sid, df in enumerate(earnings_cases)} cls.finder = stack.enter_context( - tmp_asset_finder(equities=equity_info), + tmp_asset_finder(equities=cls.get_equity_info()), ) cls.loader_type = EarningsCalendarLoader + def get_expected_next_event_dates(self, dates): + num_days_between_for_dates = partial(self.num_days_between, dates) + zip_with_dates_for_dates = partial(self.zip_with_dates, dates) + return pd.DataFrame({ + 0: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-15'] * + num_days_between_for_dates('2014-01-05', '2014-01-15') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-16', '2014-01-20') + + ['NaT'] * + num_days_between_for_dates('2014-01-21', None) + ), + 1: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-05', '2014-01-09') + + ['2014-01-15'] * + num_days_between_for_dates('2014-01-10', '2014-01-15') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-16', 
'2014-01-20') + + ['NaT'] * + num_days_between_for_dates('2014-01-21', None) + ), + 2: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-10'] * + num_days_between_for_dates('2014-01-05', '2014-01-10') + + ['NaT'] * + num_days_between_for_dates('2014-01-11', '2014-01-14') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-15', '2014-01-20') + + ['NaT'] * + num_days_between_for_dates('2014-01-21', None) + ), + 3: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-10'] * + num_days_between_for_dates('2014-01-05', '2014-01-10') + + ['2014-01-15'] * + num_days_between_for_dates('2014-01-11', '2014-01-15') + + ['NaT'] * + num_days_between_for_dates('2014-01-16', None) + ), + 4: zip_with_dates_for_dates(['NaT'] * + len(dates)), + }, index=dates) + + def get_expected_previous_event_dates(self, dates): + num_days_between_for_dates = partial(self.num_days_between, dates) + zip_with_dates_for_dates = partial(self.zip_with_dates, dates) + return pd.DataFrame({ + 0: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), + ), + 1: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), + ), + 2: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + + ['2014-01-10'] * num_days_between_for_dates('2014-01-10', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), + ), + 3: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + + ['2014-01-10'] * num_days_between_for_dates('2014-01-10', + '2014-01-14') + + ['2014-01-15'] * 
num_days_between_for_dates('2014-01-15', + None), + ), + 4: zip_with_dates_for_dates(['NaT'] * len(dates)), + }, index=dates) + @classmethod def tearDownClass(cls): cls._cleanup_stack.close() diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 84d0735c..27cebd50 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -5,8 +5,9 @@ from toolz import merge from .base import PipelineLoader from .frame import DataFrameLoader -from .utils import next_date_frame, previous_date_frame, previous_value +from .utils import previous_event_frame, next_date_frame from zipline.pipeline.common import TS_FIELD_NAME +from zipline.utils.numpy_utils import NaTD WRONG_COLS_ERROR = "Expected columns {expected_columns} for sid {sid} but " \ "got columns {resulting_columns}." @@ -179,10 +180,13 @@ class EventsLoader(PipelineLoader): event_date_field_name): return DataFrameLoader( prev_date_field, - previous_date_frame( - self.all_dates, + previous_event_frame( self.events_by_sid, + self.all_dates, + NaTD, + 'datetime64[ns]', event_date_field_name, + event_date_field_name ), adjustments=None, ) @@ -193,13 +197,13 @@ class EventsLoader(PipelineLoader): value_field_name): return DataFrameLoader( previous_value_field, - previous_value( - self.all_dates, + previous_event_frame( self.events_by_sid, - event_date_field_name, - value_field_name, + self.all_dates, + previous_value_field.missing_value, previous_value_field.dtype, - previous_value_field.missing_value + event_date_field_name, + value_field_name ), adjustments=None, ) diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 73d0ad3f..8b2cc9a2 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -5,7 +5,7 @@ import pandas as pd from six import iteritems from six.moves import zip -from zipline.utils.numpy_utils import NaTns, NaTD +from zipline.utils.numpy_utils import NaTns def 
next_date_frame(dates, events_by_sid, event_date_field_name): @@ -60,70 +60,42 @@ def next_date_frame(dates, events_by_sid, event_date_field_name): return pd.DataFrame(index=dates, data=cols) -def previous_date_frame(date_index, events_by_sid, event_date_field_name): +def previous_event_frame(events_by_sid, + date_index, + missing_value, + field_dtype, + event_date_field, + previous_return_field): """ - Make a DataFrame representing simulated next earnings date_index. + Make a DataFrame representing simulated previous dates or values for an + event. Parameters ---------- - date_index : DatetimeIndex. - The index of the returned DataFrame. - events_by_sid : dict[int -> pd.DataFrame] - Dict mapping sids to a DataFrame. The index of the DataFrame - represents the date we learned of the event mapping to the event - data. - event_date_field_name : str - The name of the date field that marks when the event occurred. - - Returns - ------- - previous_events: pd.DataFrame - A DataFrame where each column is a security from `events_by_sid` where - the values are the dates of the previous event that occurred on the - date of the index. Entries falling before the first date will have - `NaT` as the result in the output. - - See Also - -------- - next_date_frame - """ - sids = list(events_by_sid) - out = np.full((len(date_index), len(sids)), NaTD, dtype='datetime64[ns]') - d_n = date_index[-1].asm8 - for col_idx, sid in enumerate(sids): - # events_by_sid[sid] is Series mapping knowledge_date to actual - # event_date. We don't care about the knowledge date for - # computing previous earnings. 
- values = events_by_sid[sid][event_date_field_name].values - values = values[values <= d_n] - out[date_index.searchsorted(values), col_idx] = values - - frame = pd.DataFrame(out, index=date_index, columns=sids) - frame.ffill(inplace=True) - return frame - - -def previous_value(date_index, events_by_sid, event_date_field, value_field, - value_field_dtype, missing_value): - """ - Make a DataFrame representing simulated next earnings date_index. - - Parameters - ---------- - date_index : DatetimeIndex. - The index of the returned DataFrame. events_by_sid : dict[int -> DatetimeIndex] Dict mapping sids to a series of dates. Each k:v pair of the series represents the date we learned of the event mapping to the date the event will occur. + date_index : DatetimeIndex. + The index of the returned DataFrame. + missing_value : any + Data which missing values should be filled with. + field_dtype: any + The dtype of the field for which the previous values are being + retrieved. + event_date_field: str + The name of the date field that marks when the event occurred. + previous_return_field: str + The name of the field for which the previous values are being + retrieved. Returns ------- previous_events: pd.DataFrame - A DataFrame where each column is a security from `events_by_sid` where - the values are the dates of the previous event that occured on the date - of the index. Entries falling before the first date will have `NaT` as - the result in the output. + A DataFrame where each column is a security from `events_by_sid` and + the values are the values for the previous event that occurred on the + date of the index. Entries falling before the first date will have + `missing_value` filled in as the result in the output. 
See Also -------- @@ -133,18 +105,24 @@ def previous_value(date_index, events_by_sid, event_date_field, value_field, out = np.full( (len(date_index), len(sids)), missing_value, - dtype=value_field_dtype + dtype=field_dtype ) d_n = date_index[-1].asm8 for col_idx, sid in enumerate(sids): - # events_by_sid[sid] is DataFrame mapping knowledge_date to event - # date and value. We don't care about the knowledge date for computing - # previous values. + # events_by_sid[sid] is a DataFrame mapping knowledge_date to event + # date and values. df = events_by_sid[sid] df = df[df[event_date_field] <= d_n] + event_date_vals = df[event_date_field].values + # Get knowledge dates corresponding to the values in which we are + # interested + kd_vals = df[df[event_date_field] <= d_n].index.values + # The date at which a previous event is first known is the max of the + # kd and the event date. + index_dates = np.maximum(kd_vals, event_date_vals) out[ - date_index.searchsorted(df[event_date_field].values), col_idx - ] = df[value_field] + date_index.searchsorted(index_dates), col_idx + ] = df[previous_return_field] frame = pd.DataFrame(out, index=date_index, columns=sids) frame.ffill(inplace=True)